blob: 103e271640193367c760c444d173359af5f66709 [file] [log] [blame]
/*
* Copyright 2010 Google Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// Authors: jmarantz@google.com (Joshua Marantz)
// jefftk@google.com (Jeff Kaufman)
#ifndef NET_INSTAWEB_REWRITER_PUBLIC_RESOURCE_TAG_SCANNER_H_
#define NET_INSTAWEB_REWRITER_PUBLIC_RESOURCE_TAG_SCANNER_H_
#include <cstddef> // for NULL
#include <vector>
#include "pagespeed/kernel/html/html_element.h"
#include "pagespeed/kernel/http/semantic_type.h"
namespace net_instaweb {
class RewriteOptions;
namespace resource_tag_scanner {
// Pairs a url-valued attribute of an HTML element with the semantic category
// that was determined for it.
struct UrlCategoryPair {
  // Starts out "empty": no attribute, and an undefined category.
  UrlCategoryPair() : url(NULL), category(semantic_type::kUndefined) {}

  // The attribute holding the url; NULL until one is assigned.
  HtmlElement::Attribute* url;
  // The semantic category of |url|; kUndefined until one is assigned.
  semantic_type::Category category;
};
// A sequence of (attribute, category) pairs; this is the type of the output
// parameter of ScanElement.
typedef std::vector<UrlCategoryPair> UrlCategoryVector;
// If the attribute is url-valued, determine its semantic category. Return
// kUndefined otherwise.
//
// Supported patterns are:
// Scripts:
// <script src=...>
// Stylesheets:
// <link rel="stylesheet" href=...>
// Images:
// <img src=...>,
// <link rel="icon" href=...>
// <link rel="apple-touch-icon" href=...>
// <link rel="apple-touch-icon-precomposed" href=...>
// <link rel="apple-touch-startup-image" href=...>
// <body background=...>
// <td background=...>
// <th background=...>
// <table background=...>
// <thead background=...>
// <tbody background=...>
// <tfoot background=...>
// <input type="image" src=...>
// <command icon=...>
// <video poster=...>
// Hyperlink:
// <a href=...>
// <area href=...>
// <form action=...>
// <blockquote cite=...>
// <body cite=...>
// <q cite=...>
// <ins cite=...>
// <del cite=...>
// <button formaction=...>
// <input formaction=...>
// <frame longdesc=...>
// <iframe longdesc=...>
// <img longdesc=...>
// OtherResource:
// <audio src=...>
// <video src=...>
// <source src=...>
// <track src=...>
// <html manifest=...>
// <embed src=...>
// <frame src=...>
// <iframe src=...>
//
// Exceptions:
//
// 1. Because we don't parse the codebase attribute of applet or object
// elements, it's not safe for us to manipulate any of their other
// url-valued attributes.
//
// 2. The base tag is dealt with elsewhere, so we skip it here. It's not safe
// to make any changes to it, so skipping it is safe. It's also not safe to
// make changes to <head profile=...> because one use of a profile is as a
// globally unique name which a browser or other agent recognizes and
// interprets without accessing it, so we skip it too.
//
// 3. While usemap, an attribute of img, input, and object, is technically a
// url, it always has the form #name or #id, which means there's nothing we
// can do to improve it and there's no harm in leaving it out.
//
// These exceptions aside, we should support all url-valued attributes in
// html4 and html5:
// http://dev.w3.org/html5/spec/section-index.html#attributes-1
// http://www.w3.org/TR/REC-html40/index/attributes.html
//
// Returns the semantic category of |attribute|, or kUndefined when the
// attribute is not url-valued.
// NOTE(review): |attribute| is presumably one of |element|'s own attributes,
// and |options| presumably influences which attributes are treated as
// url-valued; neither is visible from this declaration -- confirm against the
// implementation.
semantic_type::Category CategorizeAttribute(
const HtmlElement* element,
const HtmlElement::Attribute* attribute,
const RewriteOptions* options);
// Examines an HTML element to determine if it's a link to any sort of resource,
// extracting out the HREF, SRC, or other URL attributes and identifying their
// categories. Because, for example, a LINK tag can be an image, stylesheet, or
// nearly anything else, it's not sufficient for callers to assume they can
// figure out the category from the element's name.
//
// See CategorizeAttribute for the meaning of "url-valued attribute".
//
// Attributes that we can't decode, such as non-ascii urls, will be skipped.
//
// The results are delivered through |attributes|, as (attribute, category)
// pairs in left-to-right order.
// NOTE(review): whether |attributes| is cleared first or appended to is not
// visible from this declaration -- confirm against the implementation.
void ScanElement(HtmlElement* element, const RewriteOptions* options,
UrlCategoryVector* attributes);
} // namespace resource_tag_scanner
} // namespace net_instaweb
#endif // NET_INSTAWEB_REWRITER_PUBLIC_RESOURCE_TAG_SCANNER_H_