src/net/instaweb/rewriter/collect_flush_early_content_filter.cc - incubator-pagespeed-debian - Git at Google

 /*
  * Copyright 2012 Google Inc.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
  * You may obtain a copy of the License at
  *
  *      http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 // Author: nikhilmadan@google.com (Nikhil Madan)

 #include "net/instaweb/rewriter/public/collect_flush_early_content_filter.h"

 #include <memory>

 #include "base/logging.h"
 #include "net/instaweb/rewriter/cached_result.pb.h"
 #include "net/instaweb/rewriter/flush_early.pb.h"
 #include "net/instaweb/rewriter/public/critical_selector_filter.h"
 #include "net/instaweb/rewriter/public/flush_early_info_finder.h"
 #include "net/instaweb/rewriter/public/output_resource_kind.h"
 #include "net/instaweb/rewriter/public/resource.h"
 #include "net/instaweb/rewriter/public/resource_slot.h"
 #include "net/instaweb/rewriter/public/resource_tag_scanner.h"
 #include "net/instaweb/rewriter/public/rewrite_driver.h"
 #include "net/instaweb/rewriter/public/rewrite_result.h"
 #include "net/instaweb/rewriter/public/server_context.h"
 #include "net/instaweb/rewriter/public/single_rewrite_context.h"
 #include "pagespeed/kernel/base/string_util.h"
 #include "pagespeed/kernel/html/html_element.h"
 #include "pagespeed/kernel/html/html_keywords.h"
 #include "pagespeed/kernel/html/html_name.h"
 #include "pagespeed/kernel/http/data_url.h"
 #include "pagespeed/kernel/http/google_url.h"
 #include "pagespeed/kernel/http/semantic_type.h"

 namespace net_instaweb {

 // Rewrite context attached to a single resource slot while flushing early.
 // It stores the resource's uncompressed size in the cached output partition
 // and, at render time, copies that size onto the element as a
 // HtmlName::kDataPagespeedSize attribute.
 class CollectFlushEarlyContentFilter::Context : public SingleRewriteContext {
  public:
   explicit Context(RewriteDriver* driver)
       : SingleRewriteContext(driver, NULL, NULL) {}

  protected:
   // Records the size of the slot's resource in output partition 0, but only
   // when this context has exactly one slot and that slot's element is not
   // marked for deletion (resources that were inlined or combined are
   // skipped).  The rewrite is always finished with kRewriteFailed.
   // |input| and |output| are not used.
   virtual void RewriteSingle(const ResourcePtr& input,
                              const OutputResourcePtr& output) {
     const bool has_single_live_slot =
         num_slots() == 1 && !slot(0)->should_delete_element();
     if (has_single_live_slot) {
       // Update the cache with resource size.
       ResourcePtr slot_resource = slot(0)->resource();
       CachedResult* cached = output_partition(0);
       cached->set_size(slot_resource->UncompressedContentsSize());
     }
     RewriteDone(kRewriteFailed, 0);
   }

   // Adds the cached size as an attribute on the slot's element, provided a
   // size was recorded and the driver still considers the element rewritable.
   virtual void Render() {
     if (num_output_partitions() <= 0 || !output_partition(0)->has_size()) {
       return;
     }
     HtmlElement* target =
         static_cast<HtmlResourceSlot*>(slot(0).get())->element();
     // TODO(pulkitg): Can IsRewritable be false here (see comment to
     // Propagate in rewrite_context.h)?
     if (!Driver()->IsRewritable(target)) {
       return;
     }
     Driver()->AddAttribute(target, HtmlName::kDataPagespeedSize,
                            Integer64ToString(output_partition(0)->size()));
   }

   virtual OutputResourceKind kind() const { return kOnTheFlyResource; }

   // Identifier string for this rewrite context.
   virtual const char* id() const { return "rscc"; }

  private:
   DISALLOW_COPY_AND_ASSIGN(Context);
 };

 // Constructs the filter for |driver| and puts all collection state
 // (collected html, found-resource flag, critical-CSS flag) into its initial
 // empty state via Clear().
 CollectFlushEarlyContentFilter::CollectFlushEarlyContentFilter(
     RewriteDriver* driver)
     : RewriteFilter(driver) {
   Clear();
 }

 void CollectFlushEarlyContentFilter::StartDocumentImpl() {
   Clear();
   FlushEarlyInfoFinder* finder =
       driver()->server_context()->flush_early_info_finder();
   if (finder != NULL && finder->IsMeaningful(driver())) {
     finder->UpdateFlushEarlyInfoInDriver(driver());
   }
 }

 // Publishes the collected resource html into the driver's flush-early info.
 // Nothing is published while the driver is itself flushing early, nor when
 // no resource was found (the buffer may then hold only <body> markers and is
 // discarded).
 void CollectFlushEarlyContentFilter::EndDocument() {
   if (driver()->flushing_early()) {
     return;
   }
   if (!found_resource_) {
     // Empty the resource_html_ if no resource is found.
     resource_html_.clear();
     return;
   }
   if (resource_html_.empty()) {
     return;
   }
   driver()->flush_early_info()->set_resource_html(resource_html_);
 }

 // Handles the start of |element|.  Two modes, selected by the driver:
 //  - flushing early with flush_more_resources_early_if_time_permits enabled:
 //    starts a Context rewrite for the element's single stylesheet, script or
 //    image url so its size can be attached to the element;
 //  - otherwise: appends a minimal <link>/<script> tag for the element's
 //    stylesheet/script url to resource_html_.
 // Elements inside a <noscript> are ignored unless that <noscript> carries
 // CriticalSelectorFilter::kNoscriptStylesClass and
 // enable_flush_early_critical_css is on.
 void CollectFlushEarlyContentFilter::StartElementImpl(HtmlElement* element) {
   // Collect the link stylesheet tags inside the noscript element only if
   // they are added by the Critical CSS filter. In this case, the link tags
   // thus collected will be parsed by a subsequent run of the Critical CSS
   // filter in flush early phase. In this phase, Critical CSS filter replaces
   // link tags with style elements with critical CSS rules inlined and a
   // special attribute added (kDataPagespeedFlushStyle). Flush early content
   // filter in turn looks for the special attribute in the style tag and flush
   // the content early as inlined CSS link tags.
   // Note that this may cause the order of CSS elements stored in resource html
   // to be different from the order in which elements are parsed in HTML. This
   // can cause downloads to be in a different order too.
   //
   // FlushEarlyContentWriterFilter depends on us not flushing multiple resources
   // for the same element for two reasons:
   //  - The data-pagespeed-size attribute doesn't specify which url-valued
   //    attribute it refers to.
   //  - If there are multiple such attributes at least one is unlikely to be
   //    used and so not worth flushing.
   if (element == noscript_element()) {
     if (driver()->options()->enable_flush_early_critical_css()) {
       const char* cls = noscript_element()->AttributeValue(HtmlName::kClass);
       if (cls != NULL &&
           StringCaseEqual(cls, CriticalSelectorFilter::kNoscriptStylesClass)) {
         should_collect_critical_css_ = true;
       }
     }
     return;
   }

   if (noscript_element() != NULL && !should_collect_critical_css_) {
     // Inside a <noscript> whose contents we are not collecting.
     return;
   }

   if (element->keyword() == HtmlName::kBody) {
     StrAppend(&resource_html_, "<body>");
     return;
   }

   if (driver()->flushing_early() &&
       driver()->options()->flush_more_resources_early_if_time_permits()) {
     resource_tag_scanner::UrlCategoryVector attributes;
     resource_tag_scanner::ScanElement(
         element, driver()->options(), &attributes);
     // We only want to flush early if there is a single flushable resource.
     HtmlElement::Attribute* resource_url = NULL;
     for (int i = 0, n = attributes.size(); i < n; ++i) {
       if (attributes[i].category == semantic_type::kStylesheet ||
           attributes[i].category == semantic_type::kScript ||
           attributes[i].category == semantic_type::kImage) {
         if (resource_url != NULL) {
           // This should never happen.  When StartElementImpl is called with
           // driver()->flushing_early() being true we're parsing the content
           // which we want to flush early.  That content was already filtered to
           // contain only elements with single resources to be flushed early.
           DCHECK(false);
           return;
         }
         resource_url = attributes[i].url;
       }
     }
     if (resource_url != NULL) {
       // We found a single resource to flush early.
       StringPiece url(resource_url->DecodedValueOrNull());
       if (url.empty() || IsDataUrl(url)) {
         return;
       }
       ResourcePtr resource(CreateInputResourceOrInsertDebugComment(url,
                                                                    element));
       if (resource.get() == NULL) {
         return;
       }
       ResourceSlotPtr slot(driver()->GetSlot(resource, element, resource_url));
       // The context is handed to the driver via InitiateRewrite().
       Context* context = new Context(driver());
       context->AddSlot(slot);
       driver()->InitiateRewrite(context);
     }
   } else {
     // Find javascript elements in the head, and css elements in the entire
     // page.  Only look at standard link-href/script-src tags because those are
     // the only ones we can handle with AppendToHtml() and because we're only
     // able to flush one resource early per element.
     // NOTE(review): nothing in this function restricts scripts to <head>;
     // confirm whether that restriction is enforced elsewhere.
     HtmlName::Keyword attribute_name;
     if (element->keyword() == HtmlName::kScript) {
       attribute_name = HtmlName::kSrc;
     } else if (element->keyword() == HtmlName::kLink) {
       attribute_name = HtmlName::kHref;
     } else {
       return;
     }
     HtmlElement::Attribute* resource_url =
         element->FindAttribute(attribute_name);
     if (resource_url == NULL) {
       // Inline <script> or <link> without href: there is no url to flush, and
       // the attribute is dereferenced below, so bail out explicitly instead
       // of relying on CategorizeAttribute() to reject a NULL attribute.
       return;
     }
     semantic_type::Category category =
         resource_tag_scanner::CategorizeAttribute(
             element, resource_url, driver()->options());
     if (element->keyword() == HtmlName::kScript &&
         category != semantic_type::kScript) {
       return;
     }
     if (element->keyword() == HtmlName::kLink &&
         category != semantic_type::kStylesheet) {
       return;
     }

     StringPiece url(resource_url->DecodedValueOrNull());
     if (url.empty() || IsDataUrl(url)) {
       return;
     }
     ResourcePtr resource(CreateInputResourceOrInsertDebugComment(url, element));
     if (resource.get() == NULL) {
       return;
     }
     // We need to always use the absolutified urls while flushing, else we
     // might end up flushing wrong resources. Use the absolutified url that is
     // computed in CreateInputResource call.
     GoogleUrl gurl(resource->url());
     if (gurl.IsWebValid()) {
       StringVector decoded_url;
       // Decode the url if it is encoded.
       if (driver()->DecodeUrl(gurl, &decoded_url)) {
         // TODO(pulkitg): Detect cases where rewritten resources are already
         // present in the original html.
         if (decoded_url.size() == 1) {
           // There will be only 1 url as combiners are off and this should be
           // modified once they are enabled.
           AppendToHtml(decoded_url.at(0), category, element);
         }
       } else {
         AppendToHtml(gurl.Spec(), category, element);
       }
     }
   }
 }

 // Appends a minimal tag referencing |url| to resource_html_ and marks that a
 // resource was found.  Stylesheets become <link type=.. rel=.. href=../>,
 // scripts become <script type=.. src=..></script>; the type/rel attributes
 // are copied from |element| when present.  Other categories append nothing.
 void CollectFlushEarlyContentFilter::AppendToHtml(
     StringPiece url, semantic_type::Category category, HtmlElement* element) {
   found_resource_ = true;
   GoogleString url_for_html;
   HtmlKeywords::Escape(url, &url_for_html);
   switch (category) {
     case semantic_type::kStylesheet:
       StrAppend(&resource_html_, "<link ");
       AppendAttribute(HtmlName::kType, element);
       AppendAttribute(HtmlName::kRel, element);
       StrAppend(&resource_html_, "href=\"", url_for_html, "\"/>");
       break;
     case semantic_type::kScript:
       StrAppend(&resource_html_, "<script ");
       AppendAttribute(HtmlName::kType, element);
       StrAppend(&resource_html_, "src=\"", url_for_html, "\"></script>");
       break;
     default:
       break;
   }
 }

 // Copies the attribute named by |keyword| from |element| into
 // resource_html_ as name="escaped-value" followed by a space.  Does nothing
 // when the attribute is absent or its decoded value is empty.
 void CollectFlushEarlyContentFilter::AppendAttribute(
     HtmlName::Keyword keyword, HtmlElement* element) {
   HtmlElement::Attribute* source_attr = element->FindAttribute(keyword);
   if (source_attr == NULL) {
     return;
   }
   StringPiece raw_value(source_attr->DecodedValueOrNull());
   if (raw_value.empty()) {
     return;
   }
   GoogleString value_for_html;
   HtmlKeywords::Escape(raw_value, &value_for_html);
   StrAppend(&resource_html_, source_attr->name_str(), "=\"", value_for_html,
             "\" ");
 }

 // Handles the end of |element|.  Outside any <noscript>, a closing body tag
 // is mirrored into resource_html_.  When the tracked <noscript> itself
 // closes, critical-CSS collection is switched off again.
 void CollectFlushEarlyContentFilter::EndElementImpl(HtmlElement* element) {
   const HtmlElement* enclosing_noscript = noscript_element();
   if (enclosing_noscript == NULL) {
     if (element->keyword() == HtmlName::kBody) {
       StrAppend(&resource_html_, "</body>");
     }
     return;
   }
   if (enclosing_noscript == element) {
     should_collect_critical_css_ = false;
   }
 }

 // Returns the filter to its initial state: both flags off and no collected
 // html.
 void CollectFlushEarlyContentFilter::Clear() {
   should_collect_critical_css_ = false;
   found_resource_ = false;
   resource_html_.clear();
 }

 }  // namespace net_instaweb
	/*
	* Copyright 2012 Google Inc.
	*
	* Licensed under the Apache License, Version 2.0 (the "License");
	* you may not use this file except in compliance with the License.
	* You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	// Author: nikhilmadan@google.com (Nikhil Madan)

	#include "net/instaweb/rewriter/public/collect_flush_early_content_filter.h"

	#include <memory>

	#include "base/logging.h"
	#include "net/instaweb/rewriter/cached_result.pb.h"
	#include "net/instaweb/rewriter/flush_early.pb.h"
	#include "net/instaweb/rewriter/public/critical_selector_filter.h"
	#include "net/instaweb/rewriter/public/flush_early_info_finder.h"
	#include "net/instaweb/rewriter/public/output_resource_kind.h"
	#include "net/instaweb/rewriter/public/resource.h"
	#include "net/instaweb/rewriter/public/resource_slot.h"
	#include "net/instaweb/rewriter/public/resource_tag_scanner.h"
	#include "net/instaweb/rewriter/public/rewrite_driver.h"
	#include "net/instaweb/rewriter/public/rewrite_result.h"
	#include "net/instaweb/rewriter/public/server_context.h"
	#include "net/instaweb/rewriter/public/single_rewrite_context.h"
	#include "pagespeed/kernel/base/string_util.h"
	#include "pagespeed/kernel/html/html_element.h"
	#include "pagespeed/kernel/html/html_keywords.h"
	#include "pagespeed/kernel/html/html_name.h"
	#include "pagespeed/kernel/http/data_url.h"
	#include "pagespeed/kernel/http/google_url.h"
	#include "pagespeed/kernel/http/semantic_type.h"

	namespace net_instaweb {

	// Rewrite context attached to a single resource slot while flushing early.
	// It stores the resource's uncompressed size in the cached output partition
	// and, at render time, copies that size onto the element as a
	// HtmlName::kDataPagespeedSize attribute.
	class CollectFlushEarlyContentFilter::Context : public SingleRewriteContext {
	 public:
	  explicit Context(RewriteDriver* driver)
	      : SingleRewriteContext(driver, NULL, NULL) {}

	 protected:
	  // Records the size of the slot's resource in output partition 0 unless
	  // the context does not have exactly one slot or the slot's element is
	  // marked for deletion.  The rewrite is always finished with
	  // kRewriteFailed.  |input| and |output| are not used.
	  virtual void RewriteSingle(const ResourcePtr& input,
	                             const OutputResourcePtr& output) {
	    // Do not add resources which are inlined or combined.
	    if (num_slots() != 1 || slot(0)->should_delete_element()) {
	      // Do nothing.
	    } else {
	      // Update the cache with resource size.
	      ResourceSlot* resource_slot = slot(0).get();
	      ResourcePtr resource = resource_slot->resource();
	      CachedResult* partition = output_partition(0);
	      partition->set_size(resource->UncompressedContentsSize());
	    }
	    RewriteDone(kRewriteFailed, 0);
	  }

	  // Adds the cached size as an attribute on the slot's element, provided a
	  // size was recorded and the driver still considers the element
	  // rewritable.
	  virtual void Render() {
	    if (num_output_partitions() > 0 && output_partition(0)->has_size()) {
	      HtmlResourceSlot* html_slot = static_cast<HtmlResourceSlot*>(
	          slot(0).get());
	      HtmlElement* element = html_slot->element();
	      if (Driver()->IsRewritable(element)) {
	        // TODO(pulkitg): Can IsRewritable be false here (see comment to
	        // Propagate in rewrite_context.h)?
	        Driver()->AddAttribute(element, HtmlName::kDataPagespeedSize,
	                               Integer64ToString(output_partition(0)->size()));
	      }
	    }
	  }

	  virtual OutputResourceKind kind() const { return kOnTheFlyResource; }

	  // Identifier string for this rewrite context.
	  virtual const char* id() const {
	    return "rscc";
	  }

	 private:
	  DISALLOW_COPY_AND_ASSIGN(Context);
	};

	// Constructs the filter for |driver| and puts all collection state
	// (collected html, found-resource flag, critical-CSS flag) into its initial
	// empty state via Clear().
	CollectFlushEarlyContentFilter::CollectFlushEarlyContentFilter(
	RewriteDriver* driver)
	: RewriteFilter(driver) {
	Clear();
	}

	void CollectFlushEarlyContentFilter::StartDocumentImpl() {
	Clear();
	FlushEarlyInfoFinder* finder =
	driver()->server_context()->flush_early_info_finder();
	if (finder != NULL && finder->IsMeaningful(driver())) {
	finder->UpdateFlushEarlyInfoInDriver(driver());
	}
	}

	// Publishes the collected resource html into the driver's flush-early info.
	// Nothing is published while the driver is itself flushing early, nor when
	// no resource was found (the buffer may then hold only <body> markers and is
	// discarded).
	void CollectFlushEarlyContentFilter::EndDocument() {
	  if (driver()->flushing_early()) {
	    return;
	  }
	  if (!found_resource_) {
	    // Empty the resource_html_ if no resource is found.
	    resource_html_.clear();
	    return;
	  }
	  if (resource_html_.empty()) {
	    return;
	  }
	  driver()->flush_early_info()->set_resource_html(resource_html_);
	}

	// Handles the start of |element|.  Two modes, selected by the driver:
	//  - flushing early with flush_more_resources_early_if_time_permits enabled:
	//    starts a Context rewrite for the element's single stylesheet, script or
	//    image url so its size can be attached to the element;
	//  - otherwise: appends a minimal <link>/<script> tag for the element's
	//    stylesheet/script url to resource_html_.
	// Elements inside a <noscript> are ignored unless that <noscript> carries
	// CriticalSelectorFilter::kNoscriptStylesClass and
	// enable_flush_early_critical_css is on.
	void CollectFlushEarlyContentFilter::StartElementImpl(HtmlElement* element) {
	  // Collect the link stylesheet tags inside the noscript element only if
	  // they are added by the Critical CSS filter. In this case, the link tags
	  // thus collected will be parsed by a subsequent run of the Critical CSS
	  // filter in flush early phase. In this phase, Critical CSS filter replaces
	  // link tags with style elements with critical CSS rules inlined and a
	  // special attribute added (kDataPagespeedFlushStyle). Flush early content
	  // filter in turn looks for the special attribute in the style tag and flush
	  // the content early as inlined CSS link tags.
	  // Note that this may cause the order of CSS elements stored in resource html
	  // to be different from the order in which elements are parsed in HTML. This
	  // can cause downloads to be in a different order too.
	  //
	  // FlushEarlyContentWriterFilter depends on us not flushing multiple resources
	  // for the same element for two reasons:
	  //  - The data-pagespeed-size attribute doesn't specify which url-valued
	  //    attribute it refers to.
	  //  - If there are multiple such attributes at least one is unlikely to be
	  //    used and so not worth flushing.
	  if (element == noscript_element()) {
	    if (driver()->options()->enable_flush_early_critical_css()) {
	      const char* cls = noscript_element()->AttributeValue(HtmlName::kClass);
	      if (cls != NULL &&
	          StringCaseEqual(cls, CriticalSelectorFilter::kNoscriptStylesClass)) {
	        should_collect_critical_css_ = true;
	      }
	    }
	    return;
	  }

	  if (noscript_element() != NULL && !should_collect_critical_css_) {
	    // Inside a <noscript> whose contents we are not collecting.
	    return;
	  }

	  if (element->keyword() == HtmlName::kBody) {
	    StrAppend(&resource_html_, "<body>");
	    return;
	  }

	  if (driver()->flushing_early() &&
	      driver()->options()->flush_more_resources_early_if_time_permits()) {
	    resource_tag_scanner::UrlCategoryVector attributes;
	    resource_tag_scanner::ScanElement(
	        element, driver()->options(), &attributes);
	    // We only want to flush early if there is a single flushable resource.
	    HtmlElement::Attribute* resource_url = NULL;
	    for (int i = 0, n = attributes.size(); i < n; ++i) {
	      if (attributes[i].category == semantic_type::kStylesheet ||
	          attributes[i].category == semantic_type::kScript ||
	          attributes[i].category == semantic_type::kImage) {
	        if (resource_url != NULL) {
	          // This should never happen. When StartElementImpl is called with
	          // driver()->flushing_early() being true we're parsing the content
	          // which we want to flush early. That content was already filtered
	          // to contain only elements with single resources to be flushed
	          // early.
	          DCHECK(false);
	          return;
	        }
	        resource_url = attributes[i].url;
	      }
	    }
	    if (resource_url != NULL) {
	      // We found a single resource to flush early.
	      StringPiece url(resource_url->DecodedValueOrNull());
	      if (url.empty() || IsDataUrl(url)) {
	        return;
	      }
	      ResourcePtr resource(CreateInputResourceOrInsertDebugComment(url,
	                                                                   element));
	      if (resource.get() == NULL) {
	        return;
	      }
	      ResourceSlotPtr slot(driver()->GetSlot(resource, element, resource_url));
	      // The context is handed to the driver via InitiateRewrite().
	      Context* context = new Context(driver());
	      context->AddSlot(slot);
	      driver()->InitiateRewrite(context);
	    }
	  } else {
	    // Find javascript elements in the head, and css elements in the entire
	    // page. Only look at standard link-href/script-src tags because those are
	    // the only ones we can handle with AppendToHtml() and because we're only
	    // able to flush one resource early per element.
	    // NOTE(review): nothing in this function restricts scripts to <head>;
	    // confirm whether that restriction is enforced elsewhere.
	    HtmlName::Keyword attribute_name;
	    if (element->keyword() == HtmlName::kScript) {
	      attribute_name = HtmlName::kSrc;
	    } else if (element->keyword() == HtmlName::kLink) {
	      attribute_name = HtmlName::kHref;
	    } else {
	      return;
	    }
	    HtmlElement::Attribute* resource_url =
	        element->FindAttribute(attribute_name);
	    if (resource_url == NULL) {
	      // Inline <script> or <link> without href: there is no url to flush,
	      // and the attribute is dereferenced below, so bail out explicitly
	      // instead of relying on CategorizeAttribute() to reject a NULL
	      // attribute.
	      return;
	    }
	    semantic_type::Category category =
	        resource_tag_scanner::CategorizeAttribute(
	            element, resource_url, driver()->options());
	    if (element->keyword() == HtmlName::kScript &&
	        category != semantic_type::kScript) {
	      return;
	    }
	    if (element->keyword() == HtmlName::kLink &&
	        category != semantic_type::kStylesheet) {
	      return;
	    }

	    StringPiece url(resource_url->DecodedValueOrNull());
	    if (url.empty() || IsDataUrl(url)) {
	      return;
	    }
	    ResourcePtr resource(CreateInputResourceOrInsertDebugComment(url, element));
	    if (resource.get() == NULL) {
	      return;
	    }
	    // We need to always use the absolutified urls while flushing, else we
	    // might end up flushing wrong resources. Use the absolutified url that is
	    // computed in CreateInputResource call.
	    GoogleUrl gurl(resource->url());
	    if (gurl.IsWebValid()) {
	      StringVector decoded_url;
	      // Decode the url if it is encoded.
	      if (driver()->DecodeUrl(gurl, &decoded_url)) {
	        // TODO(pulkitg): Detect cases where rewritten resources are already
	        // present in the original html.
	        if (decoded_url.size() == 1) {
	          // There will be only 1 url as combiners are off and this should be
	          // modified once they are enabled.
	          AppendToHtml(decoded_url.at(0), category, element);
	        }
	      } else {
	        AppendToHtml(gurl.Spec(), category, element);
	      }
	    }
	  }
	}

	// Appends a minimal tag referencing |url| to resource_html_ and marks that a
	// resource was found.  Stylesheets become <link type=.. rel=.. href=../>,
	// scripts become <script type=.. src=..></script>; the type/rel attributes
	// are copied from |element| when present.  Other categories append nothing.
	void CollectFlushEarlyContentFilter::AppendToHtml(
	    StringPiece url, semantic_type::Category category, HtmlElement* element) {
	  found_resource_ = true;
	  GoogleString url_for_html;
	  HtmlKeywords::Escape(url, &url_for_html);
	  switch (category) {
	    case semantic_type::kStylesheet:
	      StrAppend(&resource_html_, "<link ");
	      AppendAttribute(HtmlName::kType, element);
	      AppendAttribute(HtmlName::kRel, element);
	      StrAppend(&resource_html_, "href=\"", url_for_html, "\"/>");
	      break;
	    case semantic_type::kScript:
	      StrAppend(&resource_html_, "<script ");
	      AppendAttribute(HtmlName::kType, element);
	      StrAppend(&resource_html_, "src=\"", url_for_html, "\"></script>");
	      break;
	    default:
	      break;
	  }
	}

	// Copies the attribute named by |keyword| from |element| into
	// resource_html_ as name="escaped-value" followed by a space.  Does nothing
	// when the attribute is absent or its decoded value is empty.
	void CollectFlushEarlyContentFilter::AppendAttribute(
	    HtmlName::Keyword keyword, HtmlElement* element) {
	  HtmlElement::Attribute* source_attr = element->FindAttribute(keyword);
	  if (source_attr == NULL) {
	    return;
	  }
	  StringPiece raw_value(source_attr->DecodedValueOrNull());
	  if (raw_value.empty()) {
	    return;
	  }
	  GoogleString value_for_html;
	  HtmlKeywords::Escape(raw_value, &value_for_html);
	  StrAppend(&resource_html_, source_attr->name_str(), "=\"", value_for_html,
	            "\" ");
	}

	// Handles the end of |element|.  Outside any <noscript>, a closing body tag
	// is mirrored into resource_html_.  When the tracked <noscript> itself
	// closes, critical-CSS collection is switched off again.
	void CollectFlushEarlyContentFilter::EndElementImpl(HtmlElement* element) {
	  const HtmlElement* enclosing_noscript = noscript_element();
	  if (enclosing_noscript == NULL) {
	    if (element->keyword() == HtmlName::kBody) {
	      StrAppend(&resource_html_, "</body>");
	    }
	    return;
	  }
	  if (enclosing_noscript == element) {
	    should_collect_critical_css_ = false;
	  }
	}

	// Returns the filter to its initial state: both flags off and no collected
	// html.
	void CollectFlushEarlyContentFilter::Clear() {
	  should_collect_critical_css_ = false;
	  found_resource_ = false;
	  resource_html_.clear();
	}

	} // namespace net_instaweb