// blob: 9d2e70a678fb828d8e50d88492dca086304cbefc [file] [log] [blame]
/*
* Copyright 2016 Google Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// Author: morlovich@google.com (Maksim Orlovich)
#include "net/instaweb/rewriter/public/collect_dependencies_filter.h"
#include <memory>
#include "base/logging.h"
#include "net/instaweb/rewriter/cached_result.pb.h"
#include "net/instaweb/rewriter/dependencies.pb.h"
#include "net/instaweb/rewriter/input_info.pb.h"
#include "net/instaweb/rewriter/public/css_util.h"
#include "net/instaweb/rewriter/public/dependency_tracker.h"
#include "net/instaweb/rewriter/public/output_resource_kind.h"
#include "net/instaweb/rewriter/public/resource.h"
#include "net/instaweb/rewriter/public/resource_slot.h"
#include "net/instaweb/rewriter/public/resource_tag_scanner.h"
#include "net/instaweb/rewriter/public/rewrite_context.h"
#include "net/instaweb/rewriter/public/rewrite_driver.h"
#include "net/instaweb/rewriter/public/rewrite_result.h"
#include "net/instaweb/rewriter/public/server_context.h"
#include "pagespeed/kernel/base/abstract_mutex.h"
#include "pagespeed/kernel/base/string.h"
#include "pagespeed/kernel/base/string_util.h"
#include "pagespeed/kernel/base/thread_annotations.h"
#include "pagespeed/kernel/base/thread_system.h"
#include "pagespeed/kernel/html/html_element.h"
#include "pagespeed/kernel/html/html_name.h"
#include "pagespeed/kernel/http/data_url.h"
#include "pagespeed/kernel/http/google_url.h"
#include "pagespeed/kernel/http/semantic_type.h"
#include "util/utf8/public/unicodetext.h"
#include "webutil/css/parser.h"
namespace net_instaweb {
// Rewrite context that never produces an output resource.  For the single
// slot attached to it, it records a Dependency entry describing the slot's
// resource (plus one entry per renderable @import found inside a CSS
// resource) and hands those entries to the driver's DependencyTracker.
class CollectDependenciesFilter::Context : public RewriteContext {
 public:
  // |type| is the dependency type recorded for the slot's resource.  The
  // dependency id stays at -1 until Initiated() is called.
  Context(DependencyType type, RewriteDriver* driver)
      : RewriteContext(driver, nullptr, nullptr),
        mutex_(driver->server_context()->thread_system()->NewMutex()),
        reported_(false),
        dep_type_(type),
        dep_id_(-1) {
  }

  // Reserves an id for this context with the dependency tracker.  The filter
  // calls this only when RewriteDriver::InitiateRewrite() returned true.
  void Initiated() {
    dep_id_ = Driver()->dependency_tracker()->RegisterDependencyCandidate();
  }

  // A context that obtained an id must have reported on it before it goes
  // away.
  ~Context() override {
    CHECK(reported_ || dep_id_ == -1);
  }

  // Always creates exactly one partition with a null output resource; if the
  // slot's resource is loaded, its input info (including the content hash)
  // is attached to that partition.
  bool Partition(OutputPartitions* partitions,
                 OutputResourceVector* outputs) override {
    // We will never produce output, but always want to do stuff.
    outputs->push_back(OutputResourcePtr(nullptr));
    partitions->add_partition();
    ResourcePtr resource(slot(0)->resource());
    if (resource->loaded()) {
      resource->AddInputInfoToPartition(
          Resource::kIncludeInputHash, 0, partitions->mutable_partition(0));
    }
    return true;
  }

  // Returns true if |import|'s media queries convert to a plain list of media
  // types and that list passes the StringVector overload below.
  static bool DefinitelyNeededToRender(
      const std::unique_ptr<Css::Import>& import) {
    StringVector media_types;
    if (!css_util::ConvertMediaQueriesToStringVector(
            import->media_queries(), &media_types)) {
      // Something we don't understand. This includes things specifying
      // media queries, which we can't evaluate, and therefore conservatively
      // assume to be potentially unneeded.
      return false;
    }
    return DefinitelyNeededToRender(media_types);
  }

  // Returns true if |media_types| is empty or names "all" or "screen"
  // (compared case-insensitively).
  static bool DefinitelyNeededToRender(const StringVector& media_types) {
    if (media_types.empty()) {
      return true;  // @import "foo", without media specified.
    }
    for (const auto& medium : media_types) {
      if (StringCaseEqual(medium, "all") ||
          StringCaseEqual(medium, "screen")) {
        return true;
      }
    }
    return false;
  }

 protected:
  // Scans the CSS in |resource| for @import rules and appends one DEP_CSS
  // dependency to |partition| for each import that is definitely needed to
  // render and resolves to a web-valid URL.  Each such entry gets a copy of
  // |parent_dep|'s validity info.  Scanning stops at the first import that
  // cannot be parsed or once the parser has recorded any error.
  void ExtractNestedCssDependencies(const Dependency* parent_dep,
                                    const ResourcePtr& resource,
                                    CachedResult* partition) {
    // TODO(morlovich): We should probably look inside <style> blocks like
    // this, too?

    // Don't crash out on resources without anything loaded, and don't try to
    // parse error pages for CSS imports.
    if (!resource->HttpStatusOk()) {
      return;
    }

    Css::Parser parser(resource->ExtractUncompressedContents());
    parser.set_preservation_mode(true);
    // We avoid quirks-mode so that we do not "fix" something we shouldn't
    // have.
    parser.set_quirks_mode(false);

    while (true) {
      std::unique_ptr<Css::Import> import(parser.ParseNextImport());
      if (import == nullptr ||
          parser.errors_seen_mask() != Css::Parser::kNoError) {
        break;
      }
      if (!DefinitelyNeededToRender(import)) {
        continue;
      }

      GoogleString rel_url(
          import->link().utf8_data(), import->link().utf8_length());
      // Resolve the import relative to the URL of the stylesheet itself.
      GoogleUrl full_url(GoogleUrl(resource->url()), rel_url);
      if (!full_url.IsWebValid()) {
        continue;
      }

      Dependency* dep = partition->add_collected_dependency();
      dep->set_url(full_url.Spec().as_string());
      dep->set_content_type(DEP_CSS);
      *dep->mutable_validity_info() = parent_dep->validity_info();
    }
  }

  // Records the slot's resource as collected dependency 0 of |partition|,
  // copies the slot's aggregated input info into it as validity info, and,
  // for CSS, appends nested @import dependencies.  Always finishes with
  // kRewriteFailed since no output is ever produced.
  void Rewrite(int partition_index,
               CachedResult* partition,
               const OutputResourcePtr& output_resource) override {
    Dependency* dep = partition->add_collected_dependency();
    dep->set_url(slot(0)->resource()->url());
    dep->set_content_type(dep_type_);

    // The framework collected input info from any filter that ran before
    // us, but not us (since it will do it after we finish work) --- which
    // matters if our input is an unoptimized result, so add in our input
    // info.
    for (int i = 0; i < partition->input_size(); ++i) {
      slot(0)->ReportInput(partition->input(i));
    }

    if (slot(0)->inputs() != nullptr) {
      for (const InputInfo& input : *slot(0)->inputs()) {
        InputInfo* stored_copy = dep->add_validity_info();
        *stored_copy = input;
        // Drop the parts of the info we can't use for checking validity
        // of push.
        stored_copy->clear_input_content_hash();
        stored_copy->clear_disable_further_processing();
        stored_copy->clear_index();
      }
    }

    // Note: this needs to happen after the above since we need to propagate
    // validity_info.
    if (dep_type_ == DEP_CSS) {
      ExtractNestedCssDependencies(dep, slot(0)->resource(), partition);
    }

    // TODO(morlovich): is_pagespeed_resource is not currently set, but I am
    // not sure I actually want that: validity_info may be useful for
    // non-optimized resources as well, and we set that already.

    CHECK(output_resource.get() == nullptr);
    CHECK_EQ(0, partition_index);
    RewriteDone(kRewriteFailed, 0);
  }

  OutputResourceKind kind() const override { return kOnTheFlyResource; }

  const char* id() const override {
    return "cdf";
  }

  // NOTE(review): `override` added for consistency with the sibling
  // overrides; assumes the RewriteContext declaration has this exact
  // signature (not visible in this file) --- confirm.
  bool PolicyPermitsRendering() const override {
    return true;  // We don't alter the doc...
  }

  void Render() override {
    Report();
  }

  // Reports a null candidate for dep_id_ (unless something was already
  // reported).
  void WillNotRender() override {
    if (!ClaimReporting()) {
      return;
    }
    // We don't have results in time (and if we did, we wouldn't be able to
    // access them from this thread), so give up on propagating to pcache for
    // this time. This is somewhat conservative: if this is actually an early
    // flush window we could deliver the result to dependency_tracker safely,
    // but then if it's after document end it would have us miss the cache
    // commit entirely...
    Driver()->dependency_tracker()->ReportDependencyCandidate(dep_id_, nullptr);
  }

  void Cancel() override {
    Report();
  }

 private:
  // Marks this context as having reported.  Returns true for the first
  // caller only; every later call returns false.  The flag is read and
  // updated while holding mutex_.
  bool ClaimReporting() {
    ScopedMutex hold(mutex_.get());
    if (reported_) {
      return false;
    }
    reported_ = true;
    return true;
  }

  // Delivers the collected dependencies (or nullptr if there are none) to
  // the dependency tracker.  Does nothing if a report was already made.
  void Report() {
    if (!ClaimReporting()) {
      return;
    }

    DependencyTracker* dep_tracker = Driver()->dependency_tracker();
    // We already allocated dep_id_, so we should report on it, with either
    // the first dependency we collected, or nullptr.
    if (num_output_partitions() == 1 &&
        output_partition(0)->collected_dependency_size() > 0) {
      // Deep copy here because output_partition is already written, and it
      // makes no sense to mutate it.
      CachedResult result = *output_partition(0);

      // Top-level stuff just gets its dep_id_ as the sorting key.
      result.mutable_collected_dependency(0)->add_order_key(dep_id_);
      dep_tracker->ReportDependencyCandidate(dep_id_,
                                             &result.collected_dependency(0));

      // Any other dependencies stored in result->collected_dependency >= 1
      // are things we discovered *inside* whatever is described by
      // result->collected_dependency(0)
      //
      // We grab a brand new ID for each one's storage inside
      // dependency_tracker, and give them sorting keys based on the parent's
      // dep_id_: (dep_id_, 1), (dep_id_, 2), etc., and so on, to make them
      // get sorted after their parent (whose sorting key will be (dep_id_))
      // and before the next top-level resource, which will be something like
      // (dep_id_ + 1) or some larger number. Note that we produce order keys
      // at most 2 deep because we (for now?) only collect dependencies that
      // deep.
      for (int c = 1; c < result.collected_dependency_size(); ++c) {
        int additional_dep_id = dep_tracker->RegisterDependencyCandidate();
        Dependency* child_dep = result.mutable_collected_dependency(c);
        child_dep->add_order_key(dep_id_);
        child_dep->add_order_key(c);
        dep_tracker->ReportDependencyCandidate(additional_dep_id, child_dep);
      }
    } else {
      dep_tracker->ReportDependencyCandidate(dep_id_, nullptr);
    }
  }

  std::unique_ptr<AbstractMutex> mutex_;
  // True once a candidate (possibly null) has been reported for dep_id_.
  bool reported_ GUARDED_BY(mutex_);
  DependencyType dep_type_;
  // Id handed out by the dependency tracker; -1 until Initiated() runs.
  int dep_id_;
  DISALLOW_COPY_AND_ASSIGN(Context);
};
// The constructor only forwards |driver| to the CommonFilter base.
CollectDependenciesFilter::CollectDependenciesFilter(RewriteDriver* driver)
    : CommonFilter(driver) {}
// No per-document setup is performed by this filter.
void CollectDependenciesFilter::StartDocumentImpl() {}
// Scans the URL-bearing attributes of |element| and, for each stylesheet or
// script reference that passes the checks below, attaches a Context to the
// corresponding slot and asks the driver to initiate it.
void CollectDependenciesFilter::StartElementImpl(HtmlElement* element) {
  // Stuff on the noscript path is not usually used, so we skip it.
  if (noscript_element() != nullptr) {
    return;
  }

  resource_tag_scanner::UrlCategoryVector attributes;
  resource_tag_scanner::ScanElement(
      element, driver()->options(), &attributes);

  for (int i = 0, n = attributes.size(); i < n; ++i) {
    const bool is_stylesheet =
        attributes[i].category == semantic_type::kStylesheet;
    const bool is_script = attributes[i].category == semantic_type::kScript;
    // Only scripts and CSS are collected.
    if (!is_stylesheet && !is_script) {
      continue;
    }

    HtmlElement::Attribute* attr = attributes[i].url;
    StringPiece url(attr->DecodedValueOrNull());
    if (url.empty() || IsDataUrl(url)) {
      continue;
    }

    // For <link href=...> stylesheets, honor the media attribute if present.
    if (is_stylesheet &&
        element->keyword() == HtmlName::kLink &&
        attr->keyword() == HtmlName::kHref) {
      HtmlElement::Attribute* media =
          element->FindAttribute(HtmlName::kMedia);
      if (media != nullptr) {
        const char* media_value = media->DecodedValueOrNull();
        if (media_value == nullptr) {
          // Encoding weirdness with media attribute -> don't push
          continue;
        }
        StringVector media_vector;
        css_util::VectorizeMediaAttribute(media_value, &media_vector);
        if (!Context::DefinitelyNeededToRender(media_vector)) {
          continue;
        }
      }
    }

    // The role / dependency-type selection below is only safe because the
    // category is one of these two values.
    CHECK(attributes[i].category == semantic_type::kStylesheet ||
          attributes[i].category == semantic_type::kScript);
    RewriteDriver::InputRole role = RewriteDriver::InputRole::kScript;
    DependencyType dep_type = DEP_JAVASCRIPT;
    if (is_stylesheet) {
      role = RewriteDriver::InputRole::kStyle;
      dep_type = DEP_CSS;
    }

    ResourcePtr resource(
        CreateInputResourceOrInsertDebugComment(url, role, element));
    if (resource.get() == nullptr) {
      // TODO(morlovich): This may mean a valid 3rd party resource;
      // we also probably don't want a warning in that case.
      continue;
    }

    ResourceSlotPtr slot(driver()->GetSlot(resource, element, attr));
    slot->set_need_aggregate_input_info(true);

    Context* context = new Context(dep_type, driver());
    context->AddSlot(slot);
    // The context only reserves its dependency id when the driver accepts
    // the rewrite.
    if (driver()->InitiateRewrite(context)) {
      context->Initiated();
    }
  }
}
void CollectDependenciesFilter::EndElementImpl(HtmlElement* element) {
}
} // namespace net_instaweb