// blob: a6a4996dc17c58207c48015d8dc6394e38e6d834 [file] [log] [blame]
/*
* Copyright 2010 Google Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// Author: sligocki@google.com (Shawn Ligocki)
#ifndef NET_INSTAWEB_REWRITER_PUBLIC_COMMON_FILTER_H_
#define NET_INSTAWEB_REWRITER_PUBLIC_COMMON_FILTER_H_
#include "net/instaweb/rewriter/public/resource.h"
#include "net/instaweb/rewriter/public/rewrite_driver.h"
#include "net/instaweb/rewriter/public/rewrite_options.h"
#include "net/instaweb/rewriter/public/server_context.h"
#include "pagespeed/kernel/base/basictypes.h"
#include "pagespeed/kernel/base/string.h"
#include "pagespeed/kernel/base/string_util.h"
#include "pagespeed/kernel/html/empty_html_filter.h"
#include "pagespeed/kernel/html/html_element.h"
#include "pagespeed/kernel/html/html_node.h"
#include "pagespeed/kernel/http/google_url.h"
#include "pagespeed/kernel/http/response_headers.h"
namespace net_instaweb {
// CommonFilter encapsulates useful functionality that many filters will want.
// All filters that want this functionality should inherit from CommonFilter
// and define the Impl methods (StartDocumentImpl, StartElementImpl,
// EndElementImpl) rather than the main methods.
//
// Currently, it stores whether we are in a <noscript> element (in
// which case, we should be careful about moving things out of this
// element).
//
// The base-tag is maintained in the RewriteDriver, although it can be
// accessed via a convenience method here for historical reasons.
class CommonFilter : public EmptyHtmlFilter {
 public:
  // Debug message to be inserted when resource creation fails.
  static const char kCreateResourceFailedDebugMsg[];

  explicit CommonFilter(RewriteDriver* driver);
  virtual ~CommonFilter();

  // Getters

  // URL of the requested HTML or resource.
  const GoogleUrl& base_url() const;

  // For rewritten resources, decoded_base_url() is the base of the original
  // (un-rewritten) resource's URL.
  const GoogleUrl& decoded_base_url() const;

  RewriteDriver* driver() const { return driver_; }
  HtmlElement* noscript_element() const { return noscript_element_; }

  // Insert a node at the best available location in or near the closing body
  // tag during EndDocument.  This is useful for filters that want to insert
  // scripts or summary data at the end of body, but need to wait until
  // EndDocument to do so.
  //
  // Tries to inject just before </body> if nothing else intervenes; otherwise
  // tries to inject before </html> or, failing that, at the end of all content.
  // This latter case still works in browsers, but breaks HTML validation (and
  // is incredibly ugly).  It can be necessitated by other post-</html> content,
  // or by flushes in the body.
  //
  // Note that if a subclass overrides the Characters function, it needs to
  // call the parent implementation for this function to be correct.
  void InsertNodeAtBodyEnd(HtmlNode* data);

  // Note: Don't override these methods in subclasses; override the
  // implementers (StartDocumentImpl, StartElementImpl, EndElementImpl)
  // instead!
  virtual void StartDocument();
  virtual void StartElement(HtmlElement* element);
  virtual void EndElement(HtmlElement* element);

  // If a subclass overrides this function and wishes to use
  // InsertNodeAtBodyEnd(), it needs to make an upcall to this implementation
  // for InsertNodeAtBodyEnd() to work correctly.
  virtual void Characters(HtmlCharactersNode* characters);

  // Creates an input resource with the url evaluated based on input_url
  // which may need to be absolutified relative to base_url().  Returns NULL
  // if the input resource url isn't valid, or can't legally be rewritten in
  // the context of this page.  *is_authorized will be set to false if the
  // domain of input_url is not authorized, which could be true or false
  // regardless of the return value: for example if we are allowing inlining
  // of resources from unauthorized domains we will return non-NULL but
  // *is_authorized will be false; converse cases are possible too (e.g.
  // input_url is a data URI).
  ResourcePtr CreateInputResource(StringPiece input_url, bool* is_authorized);

  // Similar to CreateInputResource except that if the input_url is not
  // authorized we insert a debug comment after the given element if possible
  // (debug is enabled and the element is writable).  The returned ResourcePtr
  // is guaranteed to be non-NULL iff the input_url is authorized.
  ResourcePtr CreateInputResourceOrInsertDebugComment(StringPiece input_url,
                                                      HtmlElement* element);

  // Resolves input_url based on the driver's location and any base tag into
  // out_url.  If resolution fails, the resulting URL may be invalid.
  void ResolveUrl(StringPiece input_url, GoogleUrl* out_url);

  // Returns whether or not the base url is valid.  This value will change
  // as a filter processes the document.  E.g. if there are url refs before
  // the base tag is reached, it will return false until the filter sees the
  // base tag.  After the filter sees the base tag, it will return true.
  bool BaseUrlIsValid() const;

  // Returns whether the current options specify the "debug" filter.
  // If set, then other filters can annotate output HTML with HTML
  // comments indicating why they did or did not do an optimization,
  // using HtmlParse::InsertComment.
  bool DebugMode() const { return driver_->DebugMode(); }

  // Utility function to extract the mime type and/or charset from a meta tag,
  // either the HTML4 http-equiv form or the HTML5 charset form:
  // element is the meta tag element to process.
  // headers is optional: if provided it is checked to see if it already has
  //         a content type with the tag's value; if so, returns false.
  // content is set to the content attribute's value, http-equiv form only.
  // mime_type is set to the extracted mime type, if any.
  // charset is set to the extracted charset, if any.
  // returns true if the details were extracted, false if not.  If true is
  // returned then content will be empty for the HTML5 charset form and
  // non-empty for the HTML4 http-equiv form; also an http-equiv attribute
  // with a blank mime type returns false as it's not a valid format.
  static bool ExtractMetaTagDetails(const HtmlElement& element,
                                    const ResponseHeaders* headers,
                                    GoogleString* content,
                                    GoogleString* mime_type,
                                    GoogleString* charset);

  // Returns true if the image element is not in a <noscript> block and it has
  // a) no onload attribute or
  // b) an onload attribute exists with the value being equal to the
  //    CriticalImagesBeaconFilter::kImageOnloadCode.
  bool CanAddPagespeedOnloadToImage(const HtmlElement& element);

  // Add this filter to the logged list of applied rewriters.  The intended
  // semantics of this are that it should only include filters that modified
  // the content of the response to the request being processed.
  // This class logs using Name(); subclasses may do otherwise.
  virtual void LogFilterModifiedContent();

  // Returns the policy describing whether this filter allows domains not
  // authorized by any pagespeed directive to be optimized.  Filters that end
  // up inlining content onto the HTML are almost the only ones that can
  // safely do this.  The default is kInlineOnlyAuthorizedResources.
  virtual RewriteDriver::InlineAuthorizationPolicy AllowUnauthorizedDomain()
      const { return RewriteDriver::kInlineOnlyAuthorizedResources; }

  // Returns true if the filter intends to inline the resource it fetches.
  // This is to support AllowWhenInlining.  Unlike AllowUnauthorizedDomain()
  // this doesn't have security implications and is just used for performance
  // tuning.  The default is false.
  virtual bool IntendedForInlining() const { return false; }

  // Add JavaScript code to an HtmlElement*.  Requires MimeTypeXhtmlStatus(),
  // preventing this from going into HtmlParse.
  void AddJsToElement(StringPiece js, HtmlElement* script);

 protected:
  ServerContext* server_context() const { return server_context_; }

  // Const-qualified, like server_context(), so that it can also be called
  // from const member functions of subclasses.
  const RewriteOptions* rewrite_options() const { return rewrite_options_; }

  // Override these implementer methods:
  // Intentionally left abstract so that implementers don't forget to change
  // the name from Blah to BlahImpl.
  virtual void StartDocumentImpl() = 0;
  virtual void StartElementImpl(HtmlElement* element) = 0;
  virtual void EndElementImpl(HtmlElement* element) = 0;

  // ID string used in logging.  Inheritors should supply whatever short ID
  // string they use.  Defaults to Name().
  virtual const char* LoggingId() { return Name(); }

 private:
  // These fields are gettable by inheritors.
  RewriteDriver* driver_;
  ServerContext* server_context_;
  const RewriteOptions* rewrite_options_;
  HtmlElement* noscript_element_;

  // These are private.
  HtmlElement* end_body_point_;
  bool seen_base_;

  DISALLOW_COPY_AND_ASSIGN(CommonFilter);
};
} // namespace net_instaweb
#endif // NET_INSTAWEB_REWRITER_PUBLIC_COMMON_FILTER_H_