// blob: e097eeb4ce282e9bc3337f6b73b95ebfdb5d0dfc [file] [log] [blame]
/**
* Copyright 2010 Google Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// Copyright 2006 Google Inc. All Rights Reserved.
// Author: dpeng@google.com (Daniel Peng)
#ifndef WEBUTIL_CSS_PARSER_H__
#define WEBUTIL_CSS_PARSER_H__
#include <memory>
#include "base/scoped_ptr.h"
#include <string>
#include <vector>
#include "base/scoped_ptr.h"
#include "strings/stringpiece.h"
#include "testing/production_stub/public/gunit_prod.h"
#include "util/utf8/public/unicodetext.h"
#include "webutil/css/media.h"
#include "webutil/css/property.h" // while these CSS includes can be
#include "webutil/css/selector.h" // forward-declared, who is really
#include "webutil/css/string.h"
#include "webutil/css/value.h" // going to want parser.h but not values.h?
#include "webutil/html/htmlcolor.h"
namespace Css {
// Forward declarations of the AST types that Parser's interface refers to
// by pointer or reference, so Parser can be declared before them.
// These are defined below Parser.
class Declaration;
class Declarations;
class Import;
class FontFace;
class Stylesheet;
class Ruleset;
// Recursive descent parser for CSS.
// Based on: http://www.w3.org/TR/CSS21/syndata.html
// http://www.w3.org/TR/CSS21/grammar.html
//
// Say you want to parse a fragment of CSS. Then construct a new
// Parser object (this is very lightweight) and pass in the fragment to parse.
// Then, call the top-level ParseXXX() function for what you want to parse.
// This parses the fragment and returns a pointer to the abstract syntax tree.
// You own this pointer and must delete it when you're done.
//
// The data structures comprising the abstract syntax tree are described in
// cssvalue.h, cssparser-inl.h, csscondition.h, and cssproperty.h.
//
// Essentially, each stylesheet is a collection of rulesets.
// Each ruleset has some selectors to describe what HTML elements it
// applies to and some declarations to describe how the HTML elements
// should be styled.
// The ruleset may apply to multiple comma-separated selectors, which
// means they apply to an element matching any of the selectors.
// Ex: h1, h2 > p, h3 { color: red; }
//
// Each selector consists of a chain of simple selectors, separated by
// combinators.
// Ex: h2 > p selects a P element that is a child of an H2 element.
//
// Each simple selector may have conditions which impose further
// restrictions, such as [foo], #id, .class, or :hover. We represent
// these as a list, which are semantically AND-ed together.
//
// Each declaration consists of a property and a list of values.
//
// Say, for example, you wish to parse a CSS declaration:
// Parser a("background: white; color: #333; line-height: 1.3;
// text-align: justify; font-family: \"Gill Sans MT\",
// \"Gill Sans\", GillSans, Arial, Helvetica, sans-serif");
// scoped_ptr<Declarations> t(a.ParseDeclarations());
// See the 'declarations' unit test case for more details.
//
// We've made most of the recursive-descent ParseXXX functions private
// to shrink the size of the public interface. We expose
// ParseStylesheet() and ParseDeclarations() because those are the
// top-level functions necessary to parse stylesheets in HTML
// documents. And ParseSelectors is exposed to parse selectors. If it's
// useful to expose more of the functions, please just send a CL for approval,
// so we know what people depend on.
//
// The CSS parser runs in either quirks mode (the default) or standards-
// compliant mode. The latter is stricter in many aspects. Currently, it
// affects color parsing (see below for details). Please refer to:
// http://developer.mozilla.org/en/docs/Mozilla_Quirks_Mode_Behavior
// for the difference in Mozilla browsers.
class Parser {
public:
Parser(const char* utf8text, const char* textend);
explicit Parser(const char* utf8text);
explicit Parser(StringPiece s);
// ParseRawSytlesheet and ParseStylesheet consume the entire document and
// return a Stylesheet* containing all the imports and rulesets that it
// found. You must delete the return pointer.
// ParseRawStylesheet simply parses the document into an abstract syntax tree.
Stylesheet* ParseRawStylesheet();
// ParseStylesheet also runs a second pass to convert shorthand
// declarations such as background, font and font-family into sets of
// declarations that they represent.
Stylesheet* ParseStylesheet();
// ParserRawDeclarations and ParseDeclarations parse declarations like
// "background: white; color: #333; line-height: 1.3;", consuming until
// (but not including) the closing '}' or EOF. You must delete the return
// pointer.
// ParseRawDeclarations simply parses the declarations into an AST.
Declarations* ParseRawDeclarations();
// ParseDeclarations also runs a second pass to convert *some* syntactic
// sugar declarations such as background, font and font-family.
// Currently, both the expanded properties (such as background-color) and the
// original property (background) are stored because the impl. is incomplete.
// For details, see parser.cc.
Declarations* ParseDeclarations();
// Starting at whitespace or the first media query, ParseMediaQueries
// parses a media query list and returns it. Never returns NULL. Returns
// all MediaQueries it can successfully parse.
MediaQueries* ParseMediaQueries();
// Expand the values of shorthand declarations. Currently expands background
// and font. Clears (but does not delete) input orig_declarartions in the
// process. orig_declarations should be a std::vector of NULLs on exit.
Declarations* ExpandDeclarations(Declarations* orig_declarations);
// Starting at the first simple selector or whitespace, ParseSelectors parses
// a sequence of selectors. Return NULL if the parsing fails. The parser would
// consume anything up to the declaration starting '{' or the end of document.
Selectors* ParseSelectors();
// Parse the next @import statement from the document. If it's not an @import
// or if there's a syntax error, NULL is returned. Added for mod_pagespeed's
// conversion to a link of @imports inside a style element.
// If the next statement is not an @import rule, in_ is left at the
// beginning of that statement.
Import* ParseNextImport();
// Parse the document as a single @import statement. If it's not exactly
// one of these, or there's a syntax error, NULL is returned. Added for
// mod_pagespeed's conversion to a link of this inside a style element.
Import* ParseAsSingleImport();
// Extract the leading @charset from the document. The return value is
// valid iff it is not empty -and- errors_seen_mask() is zero. Added so
// that mod_pagespeed can determine the charset of a CSS file without
// duplicating a ton of our code.
UnicodeText ExtractCharset();
// current position in the parse.
const char* getpos() const { return in_; }
// Current position in document (bytes from beginning).
int CurrentOffset() const { return in_ - begin_; }
// Done with the parse?
bool Done() const {
DCHECK(in_ <= end_) << "in_ is out of bounds, buffer overflow.";
return in_ >= end_;
}
// Whether quirks mode (the default) is used in parsing. Standard compliant
// (non-quirks) mode is stricter in color parsing, where a form of "rrgbbb"
// without a leading # is not allowed.
bool quirks_mode() const { return quirks_mode_; }
void set_quirks_mode(bool quirks_mode) { quirks_mode_ = quirks_mode; }
// In preservation mode (default off) we attempt to parse and store as much
// info as possible from the stylesheet. We avoid value validation and allow
// all parseable values. In addition for some constructs that cannot be
// parsed, we store verbatim bytes which can be re-serialized back out.
bool preservation_mode() const { return preservation_mode_; }
void set_preservation_mode(bool x) { preservation_mode_ = x; }
// Maximum recursive function depth. How deeply should the parser parse
// functions inside of functions. It is important to limit this to avoid
// unbounded stack-frame depth on untrusted input. See b/17628553
int max_function_depth() const { return max_function_depth_; }
void set_max_function_depth(int x) { max_function_depth_ = x; }
static const int kDefaultMaxFunctionDepth = 10;
// This is a bitmask of errors seen during the parse. This is decidedly
// incomplete --- there are definitely many errors that are not reported here.
static const uint64 kNoError = 0;
static const uint64 kUtf8Error = 1ULL << 0; // 1
static const uint64 kDeclarationError = 1ULL << 1; // 2
static const uint64 kSelectorError = 1ULL << 2; // 4
static const uint64 kFunctionError = 1ULL << 3; // 8
static const uint64 kMediaError = 1ULL << 4; // 16
static const uint64 kCounterError = 1ULL << 5; // 32
static const uint64 kHtmlCommentError = 1ULL << 6; // 64
static const uint64 kValueError = 1ULL << 7; // 128
static const uint64 kRulesetError = 1ULL << 8; // 256
static const uint64 kSkippedTokenError = 1ULL << 9; // 512
static const uint64 kCharsetError = 1ULL << 10; // 1024
static const uint64 kBlockError = 1ULL << 11; // 2048
static const uint64 kNumberError = 1ULL << 12; // 4096
static const uint64 kImportError = 1ULL << 13; // 8192
static const uint64 kAtRuleError = 1ULL << 14; // 16384
static const uint64 kCssCommentError = 1ULL << 15; // 32768
uint64 errors_seen_mask() const { return errors_seen_mask_; }
uint64 unparseable_sections_seen_mask() const {
return unparseable_sections_seen_mask_;
}
static const int kMaxErrorsRemembered = 16;
struct ErrorInfo {
int error_num;
int byte_offset;
string message;
};
// A vector of first kNumErrorsRemembered errors seen.
const std::vector<ErrorInfo> errors_seen() const { return errors_seen_; }
// Returns the error number based on the error flag.
// Ex: ErrorNumber(kUtf8Error) == 0,
// ErrorNumber(kDeclarationError) == 1, etc.
static int ErrorNumber(uint64 error_flag);
private:
//
// Syntactic methods
//
// SkipSpace() skips whitespace ([ \t\r\n\f]) and comments
// (/* .... */) until we reach a non-whitespace, non-comment
// character, or the end of the document.
void SkipSpace();
// Starting at /*, SkipComment() skips past the matching */ or to
// the end of the document.
void SkipComment();
// Helper method for the other Skip* methods. Skips over the next bit of text.
// Note: It does not yet lex all tokens, only strings, comments and escape
// sequences. These are specifically lexed to avoid naively interpreting:
// "}", /*]*/ or identifier\)foo as closing brackets.
// Note: We do not use ParseAny() for this to avoid excessive recursion.
void SkipNextToken();
// Starting at '{', '[' or '(', SkipMatching consumes to the closing '}',
// ']' or ')' respecting nested blocks. We discard the result.
// Returns true if matching '}' was found, false if EOF was reached first.
bool SkipMatching();
// Skips following tokens until delimiter delim or end is seen, delim is
// consumed if found. Smart enough to skip over matches inside comments,
// quoted strings or balanced parentheses ()[]{}.
// For example, if in_ = "foo(a, b), 1, bar"
// SkipPastDelimiter(',') will result in in_ = " 1, bar".
// Returns true if it found delim before end of file.
bool SkipPastDelimiter(char delim);
// Skip until next "any" token (value which can be parsed by ParseAny).
//
// Skips whitespace, comments, blocks ({..}), and @tokens, and returns true
// unless we are at the end of the document or the next character is a token
// ending delimiter ([;}!]).
bool SkipToNextAny();
// Skip past the end of the at-rule. Used for at-rules that we do not
// recognize. Return value is whether or not the at-rule was closed correctly.
// Returns true if at-rule is correctly closed (by ; or end of block),
// false if EOF was reached first.
// Ending ; or {}-block are consumed. However, closing } are not consumed.
//
// From http://www.w3.org/TR/CSS2/syndata.html#parsing-errors:
//
// At-rules with unknown at-keywords. User agents must ignore an invalid
// at-keyword together with everything following it, up to the end of the
// block that contains the invalid at-keyword, or up to and including the
// next semicolon (;), or up to and including the next block ({...}),
// whichever comes first.
bool SkipToAtRuleEnd();
// Skip until the end of a single media query. @media statements may have
// multiple comma-separated media queries. If one cannot be parsed, the others
// are still valid, so we need to skip just the one.
// Does not consume the tokens marking the end of the media query.
void SkipToMediaQueryEnd();
// Parse functions.
//
// When the comment reads 'starting at foo', it's a dchecked runtime
// error to call the function if the input does not start with
// 'foo'.
//
// If a ParseXXX method returns a pointer, you own it and must
// delete it.
//
// 'leaves' of the parse tree: strings, urls, identifiers, numbers,
// etc
//
// ParseIdent() consumes the identifier and returns its unescaped
// representation. If we are at the end of the document, or if no
// identifier is found, ParseIdent() returns the empty string.
//
// In CSS2, identifiers (including element names, classes, and IDs in
// selectors) can contain only the characters [A-Za-z0-9] and ISO
// 10646 characters 161 and higher, plus the hyphen (-); they cannot
// start with a hyphen or a digit. They can also contain escaped
// characters and any ISO 10646 character as a numeric code (see next
// item). For instance, the identifier "B&W?" may be written as
// "B\&W\?" or "B\26 W\3F".
// http://www.w3.org/TR/REC-CSS2/syndata.html#value-def-identifier
//
// We're a little more forgiving than the standard and permit hyphens
// and digits to start identifiers.
// This method does not skip spaces like most other methods do, because it
// may be used to identify things like "import" in "@import", which is
// different from "@ import".
UnicodeText ParseIdent();
// Starting at \, parse the escape and return the corresponding
// unicode codepoint. If the \ is the last character in the
// document, we return '\'; there is no other malformed input. This
// implements the second and third types of character escapes at
// http://www.w3.org/TR/REC-CSS2/syndata.html#escaped-characters
//
// 2) It cancels the meaning of special CSS characters. Any
// character (except a hexadecimal digit) can be escaped with a
// backslash to remove its special meaning. For example,
// ParseEscape() returns 0x6240 for \所 and 71 for \G (but \C is a
// hex escape, treated below:)
//
// 3) Backslash escapes allow authors to refer to characters
// they can't easily put in a document. In this case, the backslash
// is followed by at most six hexadecimal digits (0..9A..Fa..f), which
// stand for the ISO 10646 ([ISO10646]) character with that
// number. If a digit or letter follows the hexadecimal number, the
// end of the number needs to be made clear. There are two ways to
// do that:
// 1. with a space (or other whitespace character): "\26 B" ("&B")
// 2. by providing exactly 6 hexadecimal digits: "\000026B" ("&B")
//
// So, if the escape sequence is a hex escape and the character following
// the last hex digit is a space, then ParseEscape() consumes it.
//
// Only interchange valid Unicode characters will be returned.
// all other characters will be replaced with space (" ") and
// a kUtf8Error will be recorded in errors_seen_mask_.
char32 ParseEscape(); // return the codepoint for the current escape \12a76f
// Starting at delim, ParseString<char delim>() consumes the string,
// including the matching end-delim, and returns its unescaped
// representation, without the delimiters. If we fail to find the
// matching delimiter, we consume the rest of the document and
// return it.
//
// Strings can either be written with double quotes or with single
// quotes. Double quotes cannot occur inside double quotes, unless
// escaped (as '\"' or as '\22'). Analogously for single quotes
// ("\'" or "\27"). A string cannot directly contain a newline,
// unless hex-escaped as "\A".
//
// It is possible to break strings over several lines, for aesthetic
// or other reasons, but in such a case the newline itself has to be
// escaped with a backslash (\). For instance, the following two
// selectors are exactly the same:
// http://www.w3.org/TR/REC-CSS2/syndata.html#strings
template<char delim> UnicodeText ParseString();
// If the current character is a string-delimiter (' or "),
// ParseStringOrIdent() parses a string and returns the contents.
// Otherwise, it tries to parse an identifier. We must not be at
// the end of the document.
UnicodeText ParseStringOrIdent();
// Same as ParseString, but returns a Value object containing that string,
// which has bytes_in_original_buffer set.
template<char delim> Value* ParseStringValue();
// ParseNumber parses a number and an optional unit, consuming to
// the end of the number or unit and returning a Value*.
// Real numbers and integers are specified in decimal notation
// only. An <integer> consists of one or more digits "0" to "9". A
// <number> can either be an <integer>, or it can be zero or more
// digits followed by a dot (.) followed by one or more digits. Both
// integers and real numbers may be preceded by a "-" or "+" to
// indicate the sign.
//
// If no number is found, ParseNumber returns NULL.
Value* ParseNumber();
// ParseColor parses several different representations of colors:
// 1) rgb
// 2) #rgb
// 3) rrggbb
// 4) #rrggbb
// 5) The 16 HTML4 color names (aqua, black, blue,
// fuchsia, gray, green, lime, maroon, navy, olive, purple, red,
// silver, teal, white, and yellow), with or without quotes (' or ").
// It's designed to handle all the ill-formed CSS color values out there.
// It consumes the color if it finds a valid color. Otherwise, it returns
// an undefined HtmlColor (HtmlColor::IsDefined()) and does not consume
// anything.
//
// However, if quirks_mode_ is false (standard compliant mode), forms 1 and 3
// (without #) would not be accepted.
HtmlColor ParseColor(); // parse a hex or named
// color like #fff, #bcdefa
// or black
//
// FUNCTIONS and FUNCTION-like objects: rgb(), url(), rect()
//
// Parse a generic list of function parameters.
//
// Specifically, starting after the opening '(', repeatedly ParseAny() as
// values either comma or space separated until we reach the closing ')'.
//
// ParseFunction() does not consume closing ')' and returns a vector of
// values if successful, and NULL if the contents were mal-formed.
//
// We limit the max depth of nested functions to avoid unbounded stack depth.
// See b/17628553
FunctionParameters* ParseFunction(int max_function_depth);
// Converts a Value number or percentage to an RGB value.
static unsigned char ValueToRGB(Value* v);
// ParseRgbColor parsers the part between the parentheses of rgb( )
// according to http://www.w3.org/TR/REC-CSS2/syndata.html#color-units .
//
// The format of an RGB value in the functional notation is 'rgb('
// followed by a comma-separated list of three numerical values
// (either three integer values or three percentage values)
// followed by ')'. The integer value 255 corresponds to 100%, and
// to F or FF in the hexadecimal notation: rgb(255,255,255) =
// rgb(100%,100%,100%) = #FFF. Whitespace characters are allowed
// around the numerical values.
//
// Starting just past 'rgb(', ParseRgbColor() consumes up to (but not
// including) the closing ) and returns the color it finds.
// Returns NULL if mal-formed.
Value* ParseRgbColor(); // parse an rgbcolor like 125, 25, 12
// or 12%, 57%, 89%
// ParseUrl parses the part between the parentheses of url( )
// according to http://www.w3.org/TR/REC-CSS2/syndata.html#uri .
//
// The format of a URI value is 'url(' followed by optional
// whitespace followed by an optional single quote (') or double
// quote (") character followed by the URI itself, followed by an
// optional single quote (') or double quote (") character followed
// by optional whitespace followed by ')'. The two quote characters
// must be the same.
//
// Starting just past 'url(', ParseUrl() consumes the url as well as
// the optional whitespace. If the url is well-formed, the next
// character must be ')'.
// Returns NULL for mal-formed URLs.
Value* ParseUrl(); // parse a url like yellow.png or 'blah.png'
//
// Value and Values
//
// Parses a value which is expected to be color values. It can be
// different from ParseAny, for example, for black or ccddff, both
// are translated into color values here but are returned as idents
// in the latter case. We call this instead of ParseAny() after
// color, background-color, and background properties to accomodate bad CSS.
// If no value is found, ParseAnyExpectingColor returns NULL.
Value* ParseAnyExpectingColor();
// ParseAny() parses a css value and consumes it. It does not skip
// leading or trailing whitespace.
// If no value is found, ParseAny returns NULL and make sure at least one
// character is consumed (to make progress).
Value* ParseAny();
// Helper function which limits the levels of recursion.
Value* ParseAnyWithFunctionDepth(int max_function_depth);
// Parse a list of values for the given property.
// We parse until we see a !, ;, or } delimiter. However, if there are any
// malformed values, stop parsing and return NULL immediately.
// For special shortcut properties, use the following specialized methods
// instead.
Values* ParseValues(Property::Prop prop);
// Expand a background property into all the sub-properties (background-color,
// background-image, etc.). Return false on malformed original_declaration.
static bool ExpandBackground(const Declaration& original_declaration,
Declarations* new_declarations);
// Parses FONT. Returnss NULL if malformed. Otherwise, the output is a tuple
// in the following order
// "font-style font-variant font-weight font-size line-height font-family+"
Values* ParseFont();
// Parses FONT-FAMILY and the tailing part in FONT and appends the results in
// values. Returns false if there are any malformed values.
// This interface is different from the others because it is also used by
// ParseFont(), where family names are appended to other CSS values.
bool ParseFontFamily(Values* values);
//
// Selectors and Rulesets
//
// ParseAttributeSelector() starts at [ and parses an attribute
// selector like [ foo ~= bar], consuming the final ]. Returns NULL
// on error but still consumes to the matching ].
// This method does not skip spaces like most other methods do.
// Whitespace is syntactically significant here, because a sequence of simple
// selectors contains no whitespace. 'div[align=center]' is a sequence of
// simple selectors, but 'div [align=center]' is a syntax error (though we
// will parse it as a selector, i.e., two simple selector sequences separated
// by a whitespace combinator).
SimpleSelector* ParseAttributeSelector();
// ParseSimpleSelector() parses one simple sector. Starts from
// anything and returns NULL if no simple selector found or parse error.
// This method does not skip spaces like most other methods do.
// See comment above.
SimpleSelector* ParseSimpleSelector();
// Checks if the parser stops at a character (or characters) that will
// legally terminate a SimpleSelectors. The checked characters are not eaten.
// Valid terminators are whitespaces, comments, combinators ('>', '+'), ','
// and '{'. A stop at the end is also considered valid.
bool AtValidSimpleSelectorsTerminator() const;
// Starting at whitespace, a combinator, or the first simple
// selector, ParseSimpleSelectors parses a sequence of simple
// selectors, i.e., a chain of simple selectors that are not
// separated by a combinator. The chain itself may be preceeded by
// a combinator, in which case you should pass true for
// expecting_combinator, and we will parse the combinator.
// Typically, when you're parsing a selector (i.e., a chain of
// sequences of simple selectors separated by combinators), you pass
// false on the first simple selector and true on the subsequent
// ones.
SimpleSelectors* ParseSimpleSelectors(bool expecting_combinator);
// Parse an at-rule or ruleset.
//
// This may be nested inside of an @media rule if media_queries != NULL.
// If media_queries == NULL, this is not nested.
//
// Although @media rules are allowed to be nested inside other @media rules
// in CSS3, we do not parse such nested rules, and therefore avoid unbounded
// recursive depth.
void ParseStatement(const MediaQueries* media_queries,
Stylesheet* stylesheet);
// ParseRuleset() starts from the first character of the first
// selector (note: it does not skip whitespace) and consumes the
// ruleset, including the closing '}'. Return NULL if the parsing fails.
// However, the parser would consume anything up to the closing '}', if any,
// even if it fails somehow in the middle, per CSS spec.
//
// Note: In preservation mode, a ruleset may be returned even if selectors
// could not be parsed. If this happens the selectors.is_dummy() will be true.
Ruleset* ParseRuleset();
//
// Miscellaneous
//
// Starting at whitespace or the start of a media query, parses and returns
// the entire query. Returns NULL if the media query is invalid.
MediaQuery* ParseMediaQuery();
// ParseImport starts just after @import and consumes the import
// declaration, but not the closing ;. It returns a Import*
// containing the imported name and the media.
Import* ParseImport();
// Parse the charset after an @charset rule.
UnicodeText ParseCharset();
// Parse an @font-face statement.
FontFace* ParseFontFace();
static const int kErrorContext;
// error_flag should be one of the static const k*Error's above.
void ReportParsingError(uint64 error_flag, const StringPiece& message);
const char *begin_; // The beginning of the doc (used to report offset).
const char *in_; // The current point in the parse.
const char *end_; // The end of the document to parse.
bool quirks_mode_; // Whether we are in quirks mode.
// In preservation mode, we attempt to save all information from the
// stylesheet (including unparseable constructs such as proprietary CSS
// and CSS hacks) so that they can be re-serialized precisely.
bool preservation_mode_;
int max_function_depth_;
// errors_seen_mask_ is non-zero iff we failed to parse part of the CSS
// and could not recover and so we have lost information.
uint64 errors_seen_mask_;
// Only set in preservation_mode_. unparseable_sections_seen_mask_ is non-zero
// iff we failed to parse a section of CSS, but saved the text verbatim or
// in some other way preserved the information from the original document.
uint64 unparseable_sections_seen_mask_;
// Vector of all errors { error_type_number, location, message }.
std::vector<ErrorInfo> errors_seen_;
friend class Tracer;
friend class ParserTest; // we need to unit test private Parse functions.
FRIEND_TEST(ParserTest, color);
FRIEND_TEST(ParserTest, url);
FRIEND_TEST(ParserTest, rect);
FRIEND_TEST(ParserTest, background);
FRIEND_TEST(ParserTest, font_family);
FRIEND_TEST(ParserTest, ParseBlock);
FRIEND_TEST(ParserTest, font);
FRIEND_TEST(ParserTest, numbers);
FRIEND_TEST(ParserTest, values);
FRIEND_TEST(ParserTest, declarations);
FRIEND_TEST(ParserTest, universalselector);
FRIEND_TEST(ParserTest, universalselectorcondition);
FRIEND_TEST(ParserTest, comment_breaking_descendant_combinator);
FRIEND_TEST(ParserTest, comment_breaking_child_combinator);
FRIEND_TEST(ParserTest, simple_selectors);
FRIEND_TEST(ParserTest, bad_simple_selectors);
FRIEND_TEST(ParserTest, rulesets);
FRIEND_TEST(ParserTest, ruleset_starts_with_combinator);
FRIEND_TEST(ParserTest, atrules);
FRIEND_TEST(ParserTest, percentage_colors);
FRIEND_TEST(ParserTest, SkipCornerCases);
FRIEND_TEST(ParserTest, SkipMatching);
FRIEND_TEST(ParserTest, SkippedTokenError);
FRIEND_TEST(ParserTest, ValueError);
FRIEND_TEST(ParserTest, ParseAnyParens);
friend void ParseFontFamily(Parser* parser);
friend class MediaAppliesToScreenTest;
DISALLOW_COPY_AND_ASSIGN(Parser);
};
// Definitions of various data structures returned by the parser.
// More in selector.h and value.h.
// A single declaration such as font: 12pt Arial.
// A declaration consists of a property name (Property) and a list
// of values (Values*).
// It could also be important (font: 12pt Arial !important).
class Declaration {
public:
// constructor. We take ownership of v.
Declaration(Property p, Values* v, bool important)
: property_(p), values_(v), important_(important) {}
// constructor with a single Value. We make a copy of the value.
Declaration(Property p, const Value& v, bool important)
: property_(p), values_(new Values), important_(important) {
values_->push_back(new Value(v));
}
// Constructor for dummy declaration used to pass through unparseable
// declaration text.
explicit Declaration(const StringPiece& bytes_in_original_buffer)
: property_(Property::UNPARSEABLE), important_(false),
bytes_in_original_buffer_(bytes_in_original_buffer.data(),
bytes_in_original_buffer.length()) {}
// accessors
Property property() const { return property_; }
const Values* values() const { return values_.get(); }
bool IsImportant() const { return important_; }
// Note: May be invalid UTF8.
StringPiece bytes_in_original_buffer() const {
return bytes_in_original_buffer_;
}
void set_bytes_in_original_buffer(const StringPiece& new_bytes) {
bytes_in_original_buffer_ = string(new_bytes.data(), new_bytes.length());
}
// convenience accessors
Property::Prop prop() const { return property_.prop(); }
string prop_text() const { return property_.prop_text(); }
Values* mutable_values() { return values_.get(); }
void set_property(Property property) { property_ = property; }
// Takes ownership of values.
void set_values(Values* values) { values_.reset(values); }
void set_important(bool important) { important_ = important; }
string ToString() const;
private:
Property property_;
scoped_ptr<Values> values_;
bool important_; // Whether !important is declared on this declaration.
// Verbatim bytes parsed for the declaration. Currently this is only stored
// for unparseable declarations (stored with property_ == UNPARSEABLE).
// TODO(sligocki): We may want to store verbatim text for all declarations
// to preserve the details of the original text.
string bytes_in_original_buffer_;
DISALLOW_COPY_AND_ASSIGN(Declaration);
};
// An owning list of declarations: a vector of Declaration* whose elements
// we own and delete on destruction. If you remove an element from a
// Declarations yourself, deleting it becomes your responsibility.
// Careful: std::vector has no virtual destructor, so an object of this type
// must always be deleted as a Declarations, never through the base class.
class Declarations : public std::vector<Declaration*> {
 public:
  // Starts out empty (the vector base is default-constructed).
  Declarations() {}
  ~Declarations();

  // Readability helper for element access:
  // declarations->get(i) looks better than (*declarations)[i].
  // Like operator[], this does no bounds checking.
  const Declaration* get(int i) const { return operator[](i); }

  string ToString() const;

 private:
  DISALLOW_COPY_AND_ASSIGN(Declarations);
};
// A stretch of the CSS file that was not parsed. For example, unexpected
// @-rules cannot be parsed, so we simply collect the verbatim bytes from
// start to finish and keep them in an UnparsedRegion, which lets them be
// re-emitted in preservation mode.
class UnparsedRegion {
 public:
  // Stores a private copy of the given bytes.
  explicit UnparsedRegion(const StringPiece& bytes_in_original_buffer)
      : bytes_in_original_buffer_(bytes_in_original_buffer.data(),
                                  bytes_in_original_buffer.size()) {}

  // The stored verbatim bytes, as a view onto this object's own copy.
  StringPiece bytes_in_original_buffer() const {
    return StringPiece(bytes_in_original_buffer_);
  }

  // Replaces the stored bytes with a copy of the given ones.
  void set_bytes_in_original_buffer(const StringPiece& bytes) {
    bytes.CopyToString(&bytes_in_original_buffer_);
  }

  string ToString() const;

 private:
  string bytes_in_original_buffer_;

  DISALLOW_COPY_AND_ASSIGN(UnparsedRegion);
};
// A ruleset consists of a list of selectors followed by a declaration block.
// It can also optionally include a list of media queries.
//
// Unparsed regions between Rulesets can also be stored here in preservation
// mode. For example, at-rules can be interspersed with Rulesets, for those
// that we don't parse, they are stored in dummy Rulesets.
class Ruleset {
 public:
  // Tells a real ruleset apart from a placeholder wrapping unparsed text.
  // TODO(sligocki): Allow other parsed at-rules, like @page.
  enum Type { RULESET, UNPARSED_REGION, };

  // Ordinary ruleset with default-constructed media queries, selectors and
  // declarations.
  Ruleset()
      : type_(RULESET),
        media_queries_(new MediaQueries),
        selectors_(new Selectors),
        declarations_(new Declarations) {}

  // Ordinary ruleset assembled from existing parts.
  // Takes ownership of selectors, media_queries and declarations.
  Ruleset(Selectors* selectors, MediaQueries* media_queries,
          Declarations* declarations)
      : type_(RULESET),
        media_queries_(media_queries),
        selectors_(selectors),
        declarations_(declarations) {}

  // Dummy Ruleset. Used for unparsed statements, for example unknown at-rules.
  // unparsed_region is stored in a scoped_ptr member; selectors_ and
  // declarations_ are left unset for this type.
  explicit Ruleset(UnparsedRegion* unparsed_region)
      : type_(UNPARSED_REGION),
        media_queries_(new MediaQueries),
        unparsed_region_(unparsed_region) {}

  ~Ruleset() {}

  // Is this actually a Ruleset or some sort of at-rule? For historical reasons
  // at-rules are also stored as Rulesets.
  Type type() const { return type_; }

  // --- Media queries: available for every type(). ---

  const MediaQueries& media_queries() const { return *media_queries_; }
  const MediaQuery& media_query(int i) const { return *media_queries().at(i); }
  MediaQueries& mutable_media_queries() { return *media_queries_; }
  // Takes ownership of parameter.
  void set_media_queries(MediaQueries* media_queries) {
    media_queries_.reset(media_queries);
  }

  // --- Selectors and declarations: require type() == RULESET. ---
  //
  // NOTE: Only call these if you know that type() == RULESET; each one
  // CHECK-fails otherwise. type() always == RULESET if
  // Css::Parser::preservation_mode() is false, so they should all be valid
  // if preservation mode is off (default).

  const Selectors& selectors() const {
    CHECK_EQ(RULESET, type());
    return *selectors_;
  }
  // The type check happens inside selectors().
  const Selector& selector(int i) const { return *selectors().at(i); }
  Selectors& mutable_selectors() {
    CHECK_EQ(RULESET, type());
    return *selectors_;
  }
  // Takes ownership of selectors.
  void set_selectors(Selectors* selectors) {
    CHECK_EQ(RULESET, type());
    selectors_.reset(selectors);
  }

  const Declarations& declarations() const {
    CHECK_EQ(RULESET, type());
    return *declarations_;
  }
  // The type check happens inside declarations().
  const Declaration& declaration(int i) const {
    return *declarations().at(i);
  }
  Declarations& mutable_declarations() {
    CHECK_EQ(RULESET, type());
    return *declarations_;
  }
  // Takes ownership of decls.
  void set_declarations(Declarations* decls) {
    CHECK_EQ(RULESET, type());
    declarations_.reset(decls);
  }

  // --- Unparsed region: requires type() == UNPARSED_REGION. ---

  // The wrapped region; CHECK-fails for any other type().
  const UnparsedRegion* unparsed_region() const {
    CHECK_EQ(UNPARSED_REGION, type());
    return unparsed_region_.get();
  }
  UnparsedRegion* mutable_unparsed_region() {
    CHECK_EQ(UNPARSED_REGION, type());
    return unparsed_region_.get();
  }

  string ToString() const;

 private:
  Type type_;

  // All types have media_queries_.
  scoped_ptr<MediaQueries> media_queries_;

  // Only defined for type_ == RULESET.
  scoped_ptr<Selectors> selectors_;
  scoped_ptr<Declarations> declarations_;

  // Only defined for type_ == UNPARSED_REGION.
  scoped_ptr<UnparsedRegion> unparsed_region_;

  DISALLOW_COPY_AND_ASSIGN(Ruleset);
};
// Vector of Ruleset pointers.
// NOTE(review): the out-of-line destructor presumably deletes the elements,
// as Declarations is documented to do -- confirm against the .cc file.
class Rulesets : public std::vector<Css::Ruleset*> {
 public:
  // Starts out as an empty vector.
  Rulesets() {}
  // Defined out of line.
  ~Rulesets();
};
// Vector of charset names, held by value as UnicodeText.
class Charsets : public std::vector<UnicodeText> {
 public:
  // Defined out of line.
  ~Charsets();

  string ToString() const;
};
class Import {
public:
Import() {}
~Import() {}
const MediaQueries& media_queries() const { return *media_queries_; }
const UnicodeText& link() const { return link_; }
// Takes ownership of media_queries.
void set_media_queries(MediaQueries* media_queries) {
media_queries_.reset(media_queries);
}
void set_link(const UnicodeText& link) { link_ = link; }
string ToString() const;
private:
scoped_ptr<MediaQueries> media_queries_;
UnicodeText link_;
DISALLOW_COPY_AND_ASSIGN(Import);
};
// Vector of Import pointers.
// NOTE(review): the out-of-line destructor presumably deletes the elements,
// as Declarations is documented to do -- confirm against the .cc file.
class Imports : public std::vector<Css::Import*> {
 public:
  // Starts out as an empty vector.
  Imports() {}
  // Defined out of line.
  ~Imports();
};
class FontFace {
public:
FontFace() {}
~FontFace() {}
const MediaQueries& media_queries() const { return *media_queries_; }
// Stores all font-face properties as Declarations.
// TODO(sligocki): Provide accessors for individual properties, like src?
const Declarations& declarations() const { return *declarations_; }
void set_media_queries(MediaQueries* media_queries) {
media_queries_.reset(media_queries);
}
void set_declarations(Declarations* declarations) {
declarations_.reset(declarations);
}
MediaQueries& mutable_media_queries() { return *media_queries_; }
Declarations& mutable_declarations() { return *declarations_; }
string ToString() const;
private:
scoped_ptr<MediaQueries> media_queries_;
scoped_ptr<Declarations> declarations_;
DISALLOW_COPY_AND_ASSIGN(FontFace);
};
// Vector of FontFace pointers.
// NOTE(review): the out-of-line destructor presumably deletes the elements,
// as Declarations is documented to do -- confirm against the .cc file.
class FontFaces : public std::vector<Css::FontFace*> {
 public:
  // Starts out as an empty vector.
  FontFaces() {}
  // Defined out of line.
  ~FontFaces();
};
// A stylesheet consists of a list of import information and a list of
// rulesets.
class Stylesheet {
 public:
  // USER is currently unused.
  enum StylesheetType { AUTHOR, USER, SYSTEM };

  // A new stylesheet has type AUTHOR; its collections are
  // default-constructed.
  Stylesheet() : type_(AUTHOR) {}

  StylesheetType type() const { return type_; }
  void set_type(StylesheetType type) { type_ = type; }

  // Read-only views of the whole collections.
  const Charsets& charsets() const { return charsets_; }
  const Imports& imports() const { return imports_; }
  const FontFaces& font_faces() const { return font_faces_; }
  const Rulesets& rulesets() const { return rulesets_; }

  // Read-only access to the i-th element of each collection. Indexing uses
  // operator[], so no bounds checking is performed.
  const UnicodeText& charset(int i) const { return charsets()[i]; }
  const Import& import(int i) const { return *imports()[i]; }
  const FontFace& font_face(int i) const { return *font_faces()[i]; }
  const Ruleset& ruleset(int i) const { return *rulesets()[i]; }

  // Mutable access to the collections.
  // TODO(sligocki): Return pointer instead of ref as per Google-style for
  // non-const return values.
  Charsets& mutable_charsets() { return charsets_; }
  Imports& mutable_imports() { return imports_; }
  FontFaces& mutable_font_faces() { return font_faces_; }
  Rulesets& mutable_rulesets() { return rulesets_; }

  string ToString() const;

 private:
  StylesheetType type_;
  Charsets charsets_;
  Imports imports_;
  FontFaces font_faces_;
  // Note: CSS spec specifies that a stylesheet is a list of statements each
  // of which is either a ruleset or at-rule. Since we want to support the
  // legacy rulesets() interface and most at-rules are not parsed, unparsed
  // at-rules are currently being stored as dummy rulesets.
  Rulesets rulesets_;

  DISALLOW_COPY_AND_ASSIGN(Stylesheet);
};
} // namespace Css
#endif // WEBUTIL_CSS_PARSER_H__