// src/net/instaweb/rewriter/public/img_rewrite_filter.h - incubator-pagespeed-mod - Git at Google

 /**
  * Copyright 2010 Google Inc.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
  * You may obtain a copy of the License at
  *
  *      http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 // Author: jmaessen@google.com (Jan Maessen)

 #ifndef NET_INSTAWEB_REWRITER_PUBLIC_IMG_REWRITE_FILTER_H_
 #define NET_INSTAWEB_REWRITER_PUBLIC_IMG_REWRITE_FILTER_H_

 #include "net/instaweb/rewriter/public/rewrite_filter.h"

 #include "base/basictypes.h"
 #include "base/scoped_ptr.h"
 #include "net/instaweb/htmlparse/public/html_element.h"
 #include "net/instaweb/rewriter/public/img_tag_scanner.h"
 #include "net/instaweb/rewriter/public/resource.h"
 #include "net/instaweb/util/public/atom.h"
 #include <string>
 #include "net/instaweb/util/public/url_segment_encoder.h"

 namespace net_instaweb {

 // Forward declarations.  Within this header these types are only named
 // through pointers or references (parameters, return types and data
 // members), so their full definitions are not required here.
 class ContentType;
 class FileSystem;
 class HtmlParse;
 class Image;
 class ImageDim;
 class OutputResource;
 class ResourceManager;
 class UrlEscaper;
 class Variable;

 // This class supports the encoding of image urls with optional
 // additional dimension metadata.  The passed-in stored_dim is used as
 // the source and/or destination of this metadata during encode/decode
 // respectively.
 //
 // NOTE(review): url_escaper and stored_dim are kept as raw pointers
 // (url_escaper_, stored_dim_).  Ownership and required lifetime are not
 // visible from this header -- presumably both are borrowed and must outlive
 // the encoder; confirm against the implementation.
 class ImageUrlEncoder : public UrlSegmentEncoder {
  public:
   // url_escaper: escaper handed to the encoder (how it is used is defined in
   //     the implementation file).
   // stored_dim: dimension metadata; read when encoding and written when
   //     decoding, per the class comment above.
   ImageUrlEncoder(UrlEscaper* url_escaper, ImageDim* stored_dim);
   virtual ~ImageUrlEncoder();

   // Encode an origin_url and stored_dim from origin page to a rewritten_url.
   // The result is delivered through the rewritten_url out-parameter; there is
   // no return value, so no failure can be reported to the caller.
   virtual void EncodeToUrlSegment(
       const StringPiece& origin_url, std::string* rewritten_url);

   // Decode an origin_url and stored_dim from a rewritten_url, returning false
   // on parse failure (invalidating output vars).
   // On success the decoded url is delivered through origin_url.
   virtual bool DecodeFromUrlSegment(const StringPiece& rewritten_url,
                                     std::string* origin_url);

  private:
   // Escaper supplied at construction.
   UrlEscaper* url_escaper_;
   // Dimension metadata supplied at construction (encode source / decode
   // destination).
   ImageDim* stored_dim_;
 };

 // Identify img tags in html and optimize them.
 //
 // The filter derives from RewriteFilter and declares the parse hooks
 // (StartDocumentImpl / StartElementImpl / EndElementImpl / Flush), a Fetch
 // entry point for serving rewritten resources, and a static CanInline helper
 // for turning small images into data urls.  Only EndElementImpl does work at
 // the element level: the start-document and start-element hooks are empty.
 // Copying and assignment are disabled (DISALLOW_COPY_AND_ASSIGN below).
 //
 // TODO(jmaessen): See which ones have immediately-obvious size info.
 // TODO(jmaessen): Provide alternate resources at rewritten urls
 //     asynchronously somehow.
 // TODO(jmaessen): Big open question: how best to link pulled-in resources to
 //     rewritten urls, when in general those urls will be in a different domain.
 class ImgRewriteFilter : public RewriteFilter {
  public:
   // driver: the rewrite driver this filter is attached to.
   // log_image_elements: see log_image_elements_ below.
   // insert_image_dimensions: see insert_image_dimensions_ below.
   // path_prefix: NOTE(review): presumably the url path prefix used for
   //     rewritten resources -- its use is not visible in this header.
   // img_inline_max_bytes: see img_inline_max_bytes_ below.
   ImgRewriteFilter(RewriteDriver* driver,
                    bool log_image_elements,
                    bool insert_image_dimensions,
                    StringPiece path_prefix,
                    size_t img_inline_max_bytes);
   // Static set-up hook taking the Statistics object.
   // NOTE(review): presumably registers the statistics variables behind
   // rewrite_count_, inline_count_ and rewrite_saved_bytes_ -- confirm in the
   // implementation.
   static void Initialize(Statistics* statistics);
   // Nothing to do at the start of a document.
   virtual void StartDocumentImpl() {}
   // Nothing to do when an element opens; elements are handled when they close.
   virtual void StartElementImpl(HtmlElement* element) {}
   // Handles an element once it has been closed (defined in the implementation
   // file).
   virtual void EndElementImpl(HtmlElement* element);
   virtual void Flush();
   // Fetch entry point for a resource produced by this filter.  The meaning of
   // the bool result and of callback is defined by the implementation and the
   // base-class contract, neither of which is visible here.
   virtual bool Fetch(OutputResource* resource,
                      Writer* writer,
                      const MetaData& request_header,
                      MetaData* response_headers,
                      UrlAsyncFetcher* fetcher,
                      MessageHandler* message_handler,
                      UrlAsyncFetcher::Callback* callback);
   // Fixed filter name: always "ImgRewrite".
   virtual const char* Name() const { return "ImgRewrite"; }

   // Can we inline resource?  If so, encode its contents into the data_url,
   // otherwise leave data_url alone.
   // NOTE(review): img_inline_max_bytes is an int here, whereas the
   // constructor parameter and the img_inline_max_bytes_ member are size_t;
   // confirm the conversion at call sites is intended.
   static bool CanInline(
       int img_inline_max_bytes, const StringPiece& contents,
       const ContentType* content_type, std::string* data_url);

  private:
   // Helper methods.  All are defined in the implementation file; the notes
   // below restate only what the signatures show.
   // Produces an Image for origin_url from img_resource.
   // NOTE(review): ownership of the returned pointer is not visible here.
   Image* GetImage(const StringPiece& origin_url, Resource* img_resource);
   // Produces the OutputResource associated with url_string and image.
   // NOTE(review): ownership of the returned pointer is not visible here.
   OutputResource* ImageOutputResource(const std::string& url_string,
                                       Image* image);
   // Maps origin_url / image to a ContentType.
   const ContentType* ImageToContentType(const std::string& origin_url,
                                         Image* image);
   // Optimizes image (read from input_resource, with the page's requested
   // dimensions in page_dim), delivering output through result.
   void OptimizeImage(const Resource& input_resource, const ImageDim& page_dim,
                      Image* image, OutputResource* result);
   // Bool-returning variant working from origin_url and img_resource, with
   // output delivered through output.
   // NOTE(review): what true/false mean is not visible here.
   bool OptimizedImageFor(
       const StringPiece& origin_url, const ImageDim& page_dim,
       Resource* img_resource, OutputResource* output);
   // Rewrites the url held in the src attribute of element.
   void RewriteImageUrl(HtmlElement* element, HtmlElement::Attribute* src);
   // Updates element / src given the input and output resources plus the
   // page-specified (page_dim) and actual (actual_dim) image dimensions.
   void UpdateTargetElement(const Resource& input_resource,
                            const OutputResource& output_resource,
                            const ImageDim& page_dim, const ImageDim& actual_dim,
                            HtmlElement* element, HtmlElement::Attribute* src);

   // NOTE(review): the raw pointers below are presumably borrowed from the
   // driver / resource manager rather than owned -- confirm in the
   // implementation.
   FileSystem* file_system_;
   HtmlParse* html_parse_;
   // Tag scanner held through scoped_ptr.
   scoped_ptr<ImgTagScanner> img_filter_;
   ResourceManager* resource_manager_;
   // Threshold size (in bytes) below which we should just inline images
   // encountered.
   // TODO(jmaessen): Heuristic must be more sophisticated.  Does this image
   // touch a fresh domain?  Require opening a new connection?  If so we can
   // afford to inline quite large images (basically anything we could transmit
   // in the resulting RTTs)---but of course we don't know about RTT here.  In
   // the absence of such information, we ought to inline if header length + url
   // size can be saved by inlining image, without increasing the size in packets
   // of the html.  Otherwise we end up loading the image in favor of the html,
   // which might be a lose.  More work is needed here to figure out the exact
   // tradeoffs involved, especially as we also undermine image cacheability.
   size_t img_inline_max_bytes_;
   // Should we log each image element as we encounter it?  Handy for debug.
   bool log_image_elements_;
   // Should we insert image dimensions into html if they are absent?
   bool insert_image_dimensions_;
   // NOTE(review): presumably the interned "width" / "height" attribute
   // names -- confirm in the constructor.
   const Atom s_width_;
   const Atom s_height_;
   // NOTE(review): presumably statistics counters for images rewritten, images
   // inlined, and bytes saved by rewriting -- confirm against Initialize().
   Variable* rewrite_count_;
   Variable* inline_count_;
   Variable* rewrite_saved_bytes_;

   DISALLOW_COPY_AND_ASSIGN(ImgRewriteFilter);
 };

 }  // namespace net_instaweb

 #endif  // NET_INSTAWEB_REWRITER_PUBLIC_IMG_REWRITE_FILTER_H_
	/**
	* Copyright 2010 Google Inc.
	*
	* Licensed under the Apache License, Version 2.0 (the "License");
	* you may not use this file except in compliance with the License.
	* You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	// Author: jmaessen@google.com (Jan Maessen)

	#ifndef NET_INSTAWEB_REWRITER_PUBLIC_IMG_REWRITE_FILTER_H_
	#define NET_INSTAWEB_REWRITER_PUBLIC_IMG_REWRITE_FILTER_H_

	#include "net/instaweb/rewriter/public/rewrite_filter.h"

	#include "base/basictypes.h"
	#include "base/scoped_ptr.h"
	#include "net/instaweb/htmlparse/public/html_element.h"
	#include "net/instaweb/rewriter/public/img_tag_scanner.h"
	#include "net/instaweb/rewriter/public/resource.h"
	#include "net/instaweb/util/public/atom.h"
	#include <string>
	#include "net/instaweb/util/public/url_segment_encoder.h"

	namespace net_instaweb {

	// Forward declarations. Within this header these types are only named
	// through pointers or references (parameters, return types and data
	// members), so their full definitions are not required here.
	// NOTE(review): this repeats the forward declarations made earlier in this
	// file under the same include guard; confirm whether the duplicate copy of
	// the header is intentional.
	class ContentType;
	class FileSystem;
	class HtmlParse;
	class Image;
	class ImageDim;
	class OutputResource;
	class ResourceManager;
	class UrlEscaper;
	class Variable;

	// This class supports the encoding of image urls with optional
	// additional dimension metadata. The passed-in stored_dim is used as
	// the source and/or destination of this metadata during encode/decode
	// respectively.
	//
	// NOTE(review): this declaration repeats the ImageUrlEncoder declared
	// earlier in this file under the same include guard; confirm whether the
	// duplicate is intentional.
	// NOTE(review): url_escaper and stored_dim are kept as raw pointers
	// (url_escaper_, stored_dim_). Ownership and required lifetime are not
	// visible from this header -- presumably both are borrowed and must outlive
	// the encoder; confirm against the implementation.
	class ImageUrlEncoder : public UrlSegmentEncoder {
	public:
	// url_escaper: escaper handed to the encoder (how it is used is defined in
	// the implementation file).
	// stored_dim: dimension metadata; read when encoding and written when
	// decoding, per the class comment above.
	ImageUrlEncoder(UrlEscaper* url_escaper, ImageDim* stored_dim);
	virtual ~ImageUrlEncoder();

	// Encode an origin_url and stored_dim from origin page to a rewritten_url.
	// The result is delivered through the rewritten_url out-parameter; there is
	// no return value, so no failure can be reported to the caller.
	virtual void EncodeToUrlSegment(
	const StringPiece& origin_url, std::string* rewritten_url);

	// Decode an origin_url and stored_dim from a rewritten_url, returning false
	// on parse failure (invalidating output vars).
	// On success the decoded url is delivered through origin_url.
	virtual bool DecodeFromUrlSegment(const StringPiece& rewritten_url,
	std::string* origin_url);

	private:
	// Escaper supplied at construction.
	UrlEscaper* url_escaper_;
	// Dimension metadata supplied at construction (encode source / decode
	// destination).
	ImageDim* stored_dim_;
	};

	// Identify img tags in html and optimize them.
	//
	// The filter derives from RewriteFilter and declares the parse hooks
	// (StartDocumentImpl / StartElementImpl / EndElementImpl / Flush), a Fetch
	// entry point for serving rewritten resources, and a static CanInline helper
	// for turning small images into data urls. Only EndElementImpl does work at
	// the element level: the start-document and start-element hooks are empty.
	// Copying and assignment are disabled (DISALLOW_COPY_AND_ASSIGN below).
	//
	// NOTE(review): this declaration repeats the ImgRewriteFilter declared
	// earlier in this file under the same include guard; confirm whether the
	// duplicate is intentional.
	//
	// TODO(jmaessen): See which ones have immediately-obvious size info.
	// TODO(jmaessen): Provide alternate resources at rewritten urls
	// asynchronously somehow.
	// TODO(jmaessen): Big open question: how best to link pulled-in resources to
	// rewritten urls, when in general those urls will be in a different domain.
	class ImgRewriteFilter : public RewriteFilter {
	public:
	// driver: the rewrite driver this filter is attached to.
	// log_image_elements: see log_image_elements_ below.
	// insert_image_dimensions: see insert_image_dimensions_ below.
	// path_prefix: NOTE(review): presumably the url path prefix used for
	// rewritten resources -- its use is not visible in this header.
	// img_inline_max_bytes: see img_inline_max_bytes_ below.
	ImgRewriteFilter(RewriteDriver* driver,
	bool log_image_elements,
	bool insert_image_dimensions,
	StringPiece path_prefix,
	size_t img_inline_max_bytes);
	// Static set-up hook taking the Statistics object.
	// NOTE(review): presumably registers the statistics variables behind
	// rewrite_count_, inline_count_ and rewrite_saved_bytes_ -- confirm in the
	// implementation.
	static void Initialize(Statistics* statistics);
	// Nothing to do at the start of a document.
	virtual void StartDocumentImpl() {}
	// Nothing to do when an element opens; elements are handled when they close.
	virtual void StartElementImpl(HtmlElement* element) {}
	// Handles an element once it has been closed (defined in the implementation
	// file).
	virtual void EndElementImpl(HtmlElement* element);
	virtual void Flush();
	// Fetch entry point for a resource produced by this filter. The meaning of
	// the bool result and of callback is defined by the implementation and the
	// base-class contract, neither of which is visible here.
	virtual bool Fetch(OutputResource* resource,
	Writer* writer,
	const MetaData& request_header,
	MetaData* response_headers,
	UrlAsyncFetcher* fetcher,
	MessageHandler* message_handler,
	UrlAsyncFetcher::Callback* callback);
	// Fixed filter name: always "ImgRewrite".
	virtual const char* Name() const { return "ImgRewrite"; }

	// Can we inline resource? If so, encode its contents into the data_url,
	// otherwise leave data_url alone.
	// NOTE(review): img_inline_max_bytes is an int here, whereas the
	// constructor parameter and the img_inline_max_bytes_ member are size_t;
	// confirm the conversion at call sites is intended.
	static bool CanInline(
	int img_inline_max_bytes, const StringPiece& contents,
	const ContentType* content_type, std::string* data_url);

	private:
	// Helper methods. All are defined in the implementation file; the notes
	// below restate only what the signatures show.
	// Produces an Image for origin_url from img_resource.
	// NOTE(review): ownership of the returned pointer is not visible here.
	Image* GetImage(const StringPiece& origin_url, Resource* img_resource);
	// Produces the OutputResource associated with url_string and image.
	// NOTE(review): ownership of the returned pointer is not visible here.
	OutputResource* ImageOutputResource(const std::string& url_string,
	Image* image);
	// Maps origin_url / image to a ContentType.
	const ContentType* ImageToContentType(const std::string& origin_url,
	Image* image);
	// Optimizes image (read from input_resource, with the page's requested
	// dimensions in page_dim), delivering output through result.
	void OptimizeImage(const Resource& input_resource, const ImageDim& page_dim,
	Image* image, OutputResource* result);
	// Bool-returning variant working from origin_url and img_resource, with
	// output delivered through output.
	// NOTE(review): what true/false mean is not visible here.
	bool OptimizedImageFor(
	const StringPiece& origin_url, const ImageDim& page_dim,
	Resource* img_resource, OutputResource* output);
	// Rewrites the url held in the src attribute of element.
	void RewriteImageUrl(HtmlElement* element, HtmlElement::Attribute* src);
	// Updates element / src given the input and output resources plus the
	// page-specified (page_dim) and actual (actual_dim) image dimensions.
	void UpdateTargetElement(const Resource& input_resource,
	const OutputResource& output_resource,
	const ImageDim& page_dim, const ImageDim& actual_dim,
	HtmlElement* element, HtmlElement::Attribute* src);

	// NOTE(review): the raw pointers below are presumably borrowed from the
	// driver / resource manager rather than owned -- confirm in the
	// implementation.
	FileSystem* file_system_;
	HtmlParse* html_parse_;
	// Tag scanner held through scoped_ptr.
	scoped_ptr<ImgTagScanner> img_filter_;
	ResourceManager* resource_manager_;
	// Threshold size (in bytes) below which we should just inline images
	// encountered.
	// TODO(jmaessen): Heuristic must be more sophisticated. Does this image
	// touch a fresh domain? Require opening a new connection? If so we can
	// afford to inline quite large images (basically anything we could transmit
	// in the resulting RTTs)---but of course we don't know about RTT here. In
	// the absence of such information, we ought to inline if header length + url
	// size can be saved by inlining image, without increasing the size in packets
	// of the html. Otherwise we end up loading the image in favor of the html,
	// which might be a lose. More work is needed here to figure out the exact
	// tradeoffs involved, especially as we also undermine image cacheability.
	size_t img_inline_max_bytes_;
	// Should we log each image element as we encounter it? Handy for debug.
	bool log_image_elements_;
	// Should we insert image dimensions into html if they are absent?
	bool insert_image_dimensions_;
	// NOTE(review): presumably the interned "width" / "height" attribute
	// names -- confirm in the constructor.
	const Atom s_width_;
	const Atom s_height_;
	// NOTE(review): presumably statistics counters for images rewritten, images
	// inlined, and bytes saved by rewriting -- confirm against Initialize().
	Variable* rewrite_count_;
	Variable* inline_count_;
	Variable* rewrite_saved_bytes_;

	DISALLOW_COPY_AND_ASSIGN(ImgRewriteFilter);
	};

	} // namespace net_instaweb

	#endif // NET_INSTAWEB_REWRITER_PUBLIC_IMG_REWRITE_FILTER_H_