src/net/instaweb/rewriter/css_tag_scanner.cc - incubator-pagespeed-debian - Git at Google

 /*
  * Copyright 2010 Google Inc.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
  * You may obtain a copy of the License at
  *
  *      http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 // Author: jmarantz@google.com (Joshua Marantz)

 #include "net/instaweb/rewriter/public/css_tag_scanner.h"

 #include <cstddef>

 #include "base/logging.h"
 #include "net/instaweb/rewriter/public/domain_rewrite_filter.h"
 #include "net/instaweb/rewriter/public/rewrite_options.h"
 #include "net/instaweb/rewriter/public/server_context.h"
 #include "net/instaweb/rewriter/public/url_left_trim_filter.h"
 #include "pagespeed/kernel/base/message_handler.h"
 #include "pagespeed/kernel/base/string.h"
 #include "pagespeed/kernel/base/string_util.h"
 #include "pagespeed/kernel/base/writer.h"
 #include "pagespeed/kernel/html/html_element.h"
 #include "pagespeed/kernel/html/html_name.h"
 #include "pagespeed/kernel/http/google_url.h"
 #include "webutil/css/tostring.h"

 namespace {
 // Value that a <link> element's type= attribute must have (compared
 // case-insensitively, after trimming whitespace) for ParseCssElement() to
 // keep treating the element as a stylesheet link.
 const char kTextCss[] = "text/css";
 }

 namespace net_instaweb {

 // Out-of-line definition of the Transformer destructor; the body is empty.
 CssTagScanner::Transformer::~Transformer() {}

 // Attribute-value tokens matched case-insensitively by ParseCssElement(),
 // IsStylesheetOrAlternate() and IsAlternateStylesheet().
 const char CssTagScanner::kStylesheet[] = "stylesheet";
 const char CssTagScanner::kAlternate[] = "alternate";
 // Literal substring that HasUrl() searches for.
 const char CssTagScanner::kUriValue[] = "url(";

 // Remembers the transformer that will be asked to rewrite each URL and the
 // message handler passed along to writers and used for warnings.
 CssTagScanner::CssTagScanner(Transformer* transformer,
                              MessageHandler* handler)
     : transformer_(transformer),
       handler_(handler) {}

 // Determines whether |element| is a <link> that references a plain (non
 // alternate) stylesheet, collecting the attributes of interest on the way.
 //
 // *href and *media are always reset first (to NULL and "" respectively).
 // Attributes may appear in any order. The element is accepted only when it
 // has exactly one decodable href= and a rel= whose trimmed value equals
 // "stylesheet" (case-insensitively). It is rejected as soon as we meet:
 //   - a second href= or one with a decoding error,
 //   - a rel= with any other value,
 //   - a media= without a decodable value (*media is left NULL in that case),
 //   - a type= other than text/css,
 //   - title=, data-pagespeed-no-transform= or pagespeed-no-transform=.
 // Names of all other attributes are appended to |nonstandard_attributes| when
 // that pointer is non-NULL.
 bool CssTagScanner::ParseCssElement(
     HtmlElement* element,
     HtmlElement::Attribute** href,
     const char** media,
     StringPieceVector* nonstandard_attributes) {
   *href = NULL;
   *media = "";
   if (element->keyword() != HtmlName::kLink) {
     return false;
   }

   bool saw_href = false;
   bool saw_stylesheet_rel = false;
   HtmlElement::AttributeList* attributes = element->mutable_attributes();
   for (HtmlElement::AttributeIterator it(attributes->begin());
        it != attributes->end(); ++it) {
     HtmlElement::Attribute& attribute = *it;
     switch (attribute.keyword()) {
       case HtmlName::kTitle:
       case HtmlName::kDataPagespeedNoTransform:
       case HtmlName::kPagespeedNoTransform:
         // title= marks a default stylesheet among alternatives, see
         // http://www.w3.org/TR/REC-html40/present/styles.html#h-14.3.1
         // and links flagged with (data-)pagespeed-no-transform must be left
         // alone, so none of these qualify.
         return false;

       case HtmlName::kHref:
         // A repeated href or one we could not decode disqualifies the link.
         if (saw_href || attribute.decoding_error()) {
           return false;
         }
         saw_href = true;
         *href = &attribute;
         break;

       case HtmlName::kRel: {
         StringPiece rel_value(attribute.DecodedValueOrNull());
         TrimWhitespace(&rel_value);
         if (StringCaseEqual(rel_value, kStylesheet)) {
           saw_stylesheet_rel = true;
           break;
         }
         // Any other rel (this includes alternate stylesheets) is refused.
         return false;
       }

       case HtmlName::kMedia: {
         const char* media_value = attribute.DecodedValueOrNull();
         *media = media_value;
         if (media_value == NULL) {
           // Bare "media" without a value, or a value we could not decode.
           return false;
         }
         break;
       }

       case HtmlName::kType: {
         // type= is optional, but when present it has to be text/css.
         StringPiece type_value(attribute.DecodedValueOrNull());
         TrimWhitespace(&type_value);
         if (StringCaseEqual(type_value, kTextCss)) {
           break;
         }
         return false;
       }

       default:
         // Anything else is treated as harmless here; filters that care
         // (css_combine_filter rejects extras, css_inline_filter preserves
         // them) inspect the collected names themselves.
         if (nonstandard_attributes != NULL) {
           nonstandard_attributes->push_back(attribute.name_str());
         }
         break;
     }
   }

   // Both href=... and rel=stylesheet are mandatory.
   return saw_href && saw_stylesheet_rel;
 }

 namespace {

 // Pops the leading character off *in and stores it in *c.
 // Returns false, leaving both arguments untouched, when *in is empty;
 // returns true otherwise.
 inline bool PopFirst(StringPiece* in, char* c) {
   if (in->empty()) {
     return false;
   }
   *c = in->data()[0];
   in->remove_prefix(1);
   return true;
 }

 // Outcome of trying to lex one construct. Since we handle incomplete input,
 // in some cases we may not have enough of it available to accept or reject a
 // construct --- in which case the routines will return kLexInterrupted.
 enum LexResult {
   kLexNo,          // The construct is not present at the current position.
   kLexYes,         // The construct was recognized.
   kLexInterrupted  // More input is needed before we can decide.
 };

 // Tries to consume the literal |expected| from the front of *in.
 //
 // Returns:
 //   kLexYes          *in started with |expected|; it has been removed.
 //   kLexInterrupted  input is streaming (input_kind is not kInputIncludesEnd),
 //                    *in is shorter than |expected|, and everything available
 //                    so far agrees with |expected| --- later bytes could still
 //                    complete the match. *in is left untouched.
 //   kLexNo           *in cannot start with |expected|. *in is left untouched.
 inline LexResult EatLiteral(CssTagScanner::InputPortion input_kind,
                             StringPiece expected, StringPiece* in) {
   if (in->starts_with(expected)) {
     in->remove_prefix(expected.size());
     return kLexYes;
   }

   if (input_kind == CssTagScanner::kInputIncludesEnd) {
     return kLexNo;
   }

   // Streaming input: waiting for more bytes only makes sense if what we have
   // is a proper prefix of |expected|. If a mismatch is already visible we
   // answer kLexNo right away instead of making the caller buffer and re-lex.
   if (in->size() < expected.size() && expected.starts_with(*in)) {
     return kLexInterrupted;
   }
   return kLexNo;
 }

 // Extracts string- or identifier-like content from CSS until reaching the
 // given terminator (which is consumed but not included in the output),
 // evaluating simple escapes along the way.
 //
 // Arguments:
 //   is_string   true for quoted strings: an escaped line break is a line
 //               continuation and an unescaped line break ends the token.
 //               false for an unquoted url() payload: whitespace may only
 //               appear right before the terminator.
 //   input_kind  whether *in is known to reach the end of the input.
 //   term        terminating character.
 //   in          input; advanced past whatever was consumed.
 //   out         receives the unescaped content (cleared first).
 //   found_term  set to whether |term| itself was seen.
 //
 // Returns:
 //   kLexYes          a token was lexed. If *found_term is false it was cut
 //                    short by an unescaped line break (strings), by
 //                    whitespace followed by something other than |term|
 //                    (url payloads) --- in both cases that character is left
 //                    in *in --- or by the end of complete input.
 //   kLexNo           an escape we do not handle was met, or a backslash is
 //                    the very last byte of complete input.
 //   kLexInterrupted  streaming input ran out before the token was finished;
 //                    *in is restored to its value on entry.
 LexResult CssExtractUntil(bool is_string,
                           CssTagScanner::InputPortion input_kind,
                           char term, StringPiece* in,
                           GoogleString* out, bool* found_term) {
   *found_term = false;

   StringPiece original_input = *in;

   char c;
   out->clear();
   while (PopFirst(in, &c)) {
     if (c == term) {
       *found_term = true;
       return kLexYes;
     } else if (c == '\\') {
       // See if it's an escape we recognize. We need to evaluate the
       // escape since they will get escaped again on output.
       // TODO(morlovich): handle hex escapes here as well. For now we just match
       // the non-whitespace stuff we ourselves produce.
       char escape_val;
       if (PopFirst(in, &escape_val)) {
         switch (escape_val) {
           case ',':
           case '\"':
           case '\'':
           case '\\':
           case '(':
           case ')':
             out->push_back(escape_val);
             break;
           case '\n':
           case '\r':
           case '\f':
             // \ before newline in strings simply disappears; for everything
             // else we fallthrough to below.
             if (is_string) {
               if (escape_val == '\r') {
                 // CR+LF. The result is deliberately ignored: the LF is
                 // optional.
                 EatLiteral(input_kind, "\n", in);
               }
               break;
             }
             FALLTHROUGH_INTENDED;
           default:
             // We are in more than a bit of trouble here: we can't accurately
             // parse everything (we don't have good enough encoding handling
             // here to represent unicode, at least), and we can't just pass it
             // through since GoogleUrl will turn \ into /, so we fail to match.
             return kLexNo;
         }
       } else {
         // We have \ but not what's afterwards.
         if (input_kind == CssTagScanner::kInputIncludesEnd) {
           // end of input -> this is messed up, not what we expect.
           return kLexNo;
         } else {
           // \ may be continued on in next chunk. Will need to retry
           // once it's available.
           *in = original_input;
           return kLexInterrupted;
         }
       }
     } else {
       if (!is_string && IsHtmlSpace(c)) {
         // Whitespace is not generally permitted in url() payload, but can
         // come before closing ).
         for (int i = 0, n = in->size(); i < n; ++i) {
           char ahead = (*in)[i];

           // IsHtmlSpace is, in a pleasant surprise, also appropriate for CSS.
           // (Don't worry, JS has a totally different idea of what's whitespace
           //  to keep things interesting).
           if (IsHtmlSpace(ahead)) {
             continue;
           }
           if (ahead == term) {
             // Got closing character --- skip ahead to it, and accumulate
             // whitespace.
             StrAppend(out, in->substr(0, i));
             in->remove_prefix(i);
             break;
           } else {
             // Some other character. Put the whitespace back and stop: the
             // token ends here without a terminator.
             *in = StringPiece(in->data() - 1, in->size() + 1);
             return kLexYes;
           }
         }
       } else if (c == '\n' || c == '\r' || c == '\f') {
         // String tokens can't have unescaped newlines, so the token ends
         // here, unterminated. Put the line terminator back for the caller.
         // (Newlines in URL tokens are handled in the case above, with other
         //  whitespace).
         //
         // This is a definite answer regardless of input_kind: no later chunk
         // can change it, so we must not fall into the "ran out of input"
         // path below, which would report kLexInterrupted for streaming
         // input and force everything from here on to be re-buffered.
         *in = StringPiece(in->data() - 1, in->size() + 1);
         return kLexYes;
       } else {
         // Normal character.
         out->push_back(c);
       }
     }
   }

   // We got to the end of *in without seeing a closing terminator.
   if (input_kind == CssTagScanner::kInputDoesNotIncludeEnd) {
     // This is a streaming parse and there may be more bytes coming in
     // ==> one of them may be the closing terminator, so we don't know.
     *in = original_input;
     return kLexInterrupted;
   }

   // Lex as an unclosed literal, serialization will retain that, and we will
   // let the browser's CSS parser's error recovery figure out what to do.
   return kLexYes;
 }

 // Tries to lex a quoted CSS string starting at the current position of *in.
 //
 // When *in begins with ' or ", the quote is consumed, *quote_out is set to
 // that delimiter, and the rest is delegated to CssExtractUntil(), whose
 // result is returned (*found_term then says whether the closing quote was
 // present). Otherwise nothing is consumed and the result is kLexNo --- except
 // for an empty *in on streaming input, where we cannot tell yet and return
 // kLexInterrupted.
 LexResult CssExtractString(
     CssTagScanner::InputPortion input_kind,
     StringPiece* in, GoogleString* out,
     char* quote_out, bool* found_term) {
   if (in->empty()) {
     // Empty chunk of streaming input -> can't tell if string or not.
     return (input_kind == CssTagScanner::kInputDoesNotIncludeEnd)
         ? kLexInterrupted
         : kLexNo;
   }

   const char delimiter = (*in)[0];
   if (delimiter != '\'' && delimiter != '"') {
     return kLexNo;
   }

   in->remove_prefix(1);
   *quote_out = delimiter;
   return CssExtractUntil(true, input_kind, delimiter, in, out, found_term);
 }

 bool WriteRange(const char* out_begin, const char* out_end,
                 Writer* writer, MessageHandler* handler) {
   if (out_end > out_begin) {
     return writer->Write(StringPiece(out_begin, out_end - out_begin), handler);
   } else {
     return true;
   }
 }

 }  // namespace


 // Writes a URL reference back out in the syntax it was found in:
 //   @import <quote>url<quote>      for kImport
 //   url(<quote>url<quote>)         otherwise
 // The URL is escaped with Css::EscapeUrl(). The opening quote is emitted
 // when is_quoted is set; the closing quote and closing paren are emitted only
 // if have_term_quote / have_term_paren say they were present in the input.
 //
 // *ok is both input and output: once it is false nothing more is written,
 // and it becomes false as soon as a write fails.
 void CssTagScanner::SerializeUrlUse(
     UrlKind kind, const GoogleString& url,
     bool is_quoted, bool have_term_quote, char quote,
     bool have_term_paren,
     Writer* writer, bool* ok) {
   DCHECK(kind != kNone);

   const StringPiece opener = (kind == kImport) ? "@import " : "url(";
   const StringPiece quote_mark(&quote, 1);

   if (*ok) {
     *ok = writer->Write(opener, handler_);
   }
   if (is_quoted && *ok) {
     *ok = writer->Write(quote_mark, handler_);
   }
   if (*ok) {
     *ok = writer->Write(Css::EscapeUrl(url), handler_);
   }
   if (have_term_quote && *ok) {
     *ok = writer->Write(quote_mark, handler_);
   }
   if (have_term_paren && *ok) {
     *ok = writer->Write(")", handler_);
   }
 }

 // Scans |contents| for @import "..." / @import '...' and url(...) references,
 // hands each URL to transformer_->Transform(), and writes the result to
 // |writer|: bytes that were not changed are copied through as-is, and every
 // URL the transformer changed is re-serialized with SerializeUrlUse().
 //
 // Text saved in reparse_ by an earlier interrupted call is prepended to
 // |contents| before scanning. If a construct cannot be decided with the bytes
 // available (a lexer returned kLexInterrupted), the not-yet-understood tail
 // is stored in reparse_ and the function returns after writing what was
 // understood so far.
 //
 // Returns false if a write fails or the transformer reports kFailure (in the
 // latter case we return immediately without flushing buffered output).
 bool CssTagScanner::TransformUrlsStreaming(
     StringPiece contents, CssTagScanner::InputPortion input_portion,
     Writer* writer) {
   bool ok = true;

   // Glue leftovers from the previous chunk in front of the new data.
   // concat_buffer owns the joined bytes for the rest of this call.
   GoogleString concat_buffer;
   if (!reparse_.empty()) {
     concat_buffer = StrCat(reparse_, contents);
     contents = concat_buffer;
     reparse_.clear();
   }

   // Keeps track of which portion of input we should write out in
   // the next output batch. This an iterator-style interval, i.e.
   // [out_begin, out_end)
   const char* out_begin = contents.data();
   const char* out_end = contents.data();

   char c;
   GoogleString url;
   // remaining is advanced while a construct is still being lexed, whereas
   // reparse_candidate only advances once a whole loop iteration has been
   // understood. When a streaming parse is interrupted, everything from
   // reparse_candidate onwards is saved into reparse_ so that it can be lexed
   // again together with the next chunk.
   StringPiece remaining = contents;
   StringPiece reparse_candidate = remaining;
   while (PopFirst(&remaining, &c)) {
     UrlKind have_url = kNone;
     bool is_quoted = false;
     bool have_term_quote = false;
     bool have_term_paren = false;
     char quote = '?';

     if (c == '@') {
       // See if we are at an @import. At this point out_end still excludes
       // the @ (it is only advanced at the bottom of the loop), so if we
       // write out with transformed URL, we should start with
       // @import.
       switch (EatLiteral(input_portion, "import", &remaining)) {
         case kLexYes: {
           TrimLeadingWhitespace(&remaining);
           // The code here handles @import "foo" and @import 'foo';
           // for @import url(... we simply pass the @import through and let
           // the code that handles url( below take care of it.
           LexResult url_argument =
               CssExtractString(input_portion, &remaining, &url,
                                &quote, &have_term_quote);
           if (url_argument == kLexYes) {
             have_url = kImport;
             is_quoted = true;
           } else if (url_argument == kLexInterrupted) {
             reparse_candidate.CopyToString(&reparse_);
             return ok && WriteRange(out_begin, out_end, writer, handler_);
           }
           break;
         }
         case kLexInterrupted:
           reparse_candidate.CopyToString(&reparse_);
           return ok && WriteRange(out_begin, out_end, writer, handler_);
         case kLexNo:
           break;
       }
     } else if (c == 'u') {
       // See if we are at url(. As above, out_end still excludes the u, so
       // if we write out with transformed URL, we should start with
       // url(
       GoogleString wrapped_url;
       switch (EatLiteral(input_portion, "rl(", &remaining)) {
         case kLexYes: {
           TrimLeadingWhitespace(&remaining);
           // Note if we have a quoted URL inside url(), it needs to be
           // parsed as such.
           LexResult quoted_url_argument =
               CssExtractString(input_portion, &remaining, &url,
                                &quote, &have_term_quote);
           if (quoted_url_argument == kLexYes) {
             // A quoted argument only counts as a URL use if the closing
             // paren follows (optionally after whitespace).
             TrimLeadingWhitespace(&remaining);
             switch (EatLiteral(input_portion, ")", &remaining)) {
               case kLexYes:
                 have_url = kUrl;
                 is_quoted = true;
                 have_term_paren = true;
                 break;
               case kLexInterrupted:
                 reparse_candidate.CopyToString(&reparse_);
                 return ok && WriteRange(out_begin, out_end, writer, handler_);
               case kLexNo:
                 break;
             }
           } else if (quoted_url_argument == kLexInterrupted) {
             reparse_candidate.CopyToString(&reparse_);
             return ok && WriteRange(out_begin, out_end, writer, handler_);
           } else {
             // No quoted argument: lex an unquoted payload up to ')' and
             // trim the whitespace around it.
             LexResult unquoted_url_argument =
               CssExtractUntil(false, input_portion, ')', &remaining,
                               &wrapped_url, &have_term_paren);
             if (unquoted_url_argument == kLexYes) {
               TrimWhitespace(wrapped_url, &url);
               have_url = kUrl;
             } else if (unquoted_url_argument == kLexInterrupted) {
               reparse_candidate.CopyToString(&reparse_);
               return ok && WriteRange(out_begin, out_end, writer, handler_);
             }
           }
           break;
         }
         case kLexInterrupted:
           reparse_candidate.CopyToString(&reparse_);
           return ok && WriteRange(out_begin, out_end, writer, handler_);
         case kLexNo:
           break;
       }
     }

     if (have_url != kNone) {
       // See if we actually have to do something. If the transformer
       // wants to leave the URL alone, we will just pass the bytes through.
       switch (transformer_->Transform(&url)) {
         case Transformer::kSuccess: {
           // Write out the buffered up part of input.
           ok = ok && WriteRange(out_begin, out_end, writer, handler_);

           SerializeUrlUse(have_url, url,
                           is_quoted, have_term_quote, quote,
                           have_term_paren,
                           writer, &ok);

           // Begin accumulating input again starting from next byte.
           out_begin = remaining.data();
           break;
         }
         case Transformer::kFailure: {
           // We could not transform URL, fail fast.
           handler_->Message(kWarning,
                             "Transform failed for url %s", url.c_str());
           return false;
         }
         case Transformer::kNoChange: {
           break;
         }
       }
     }

     // remaining.data() points to the next byte to read, which is exactly
     // right after the last byte we want to output.
     out_end = remaining.data();
     reparse_candidate = remaining;
   }

   // Write out whatever got buffered at the end.
   ok = ok && WriteRange(out_begin, out_end, writer, handler_);

   return ok;
 }

 // One-shot convenience wrapper: runs a temporary scanner over |contents|,
 // treating it as complete input (kInputIncludesEnd), and returns the result
 // of TransformUrlsStreaming().
 bool CssTagScanner::TransformUrls(
     StringPiece contents, Writer* writer, Transformer* transformer,
     MessageHandler* handler) {
   CssTagScanner one_shot_scanner(transformer, handler);
   const bool succeeded = one_shot_scanner.TransformUrlsStreaming(
       contents, kInputIncludesEnd, writer);
   return succeeded;
 }

 bool CssTagScanner::HasImport(const StringPiece& contents,
                               MessageHandler* handler) {
   // Search for case insensitive @import.
   size_t pos = -1;  // So that pos + 1 == 0 below.
   const StringPiece kImport("import");
   while ((pos = contents.find("@", pos + 1)) != StringPiece::npos) {
     // Rest is everything past the @ (non-inclusive).
     StringPiece rest = contents.substr(pos + 1);
     if (StringCaseStartsWith(rest, kImport)) {
       return true;
     }
   }
   return false;
 }

 bool CssTagScanner::HasUrl(const StringPiece& contents) {
   return (contents.find(CssTagScanner::kUriValue) != StringPiece::npos);
 }

 bool CssTagScanner::IsStylesheetOrAlternate(
     const StringPiece& attribute_value) {
   StringPieceVector values;
   SplitStringPieceToVector(attribute_value, " ", &values, true);
   for (int i = 0, n = values.size(); i < n; ++i) {
     if (StringCaseEqual(values[i], kStylesheet)) {
       return true;
     }
   }
   return false;
 }

 bool CssTagScanner::IsAlternateStylesheet(const StringPiece& attribute_value) {
   bool has_stylesheet = false;
   bool has_alternate = false;
   StringPieceVector values;
   SplitStringPieceToVector(attribute_value, " ", &values, true);
   for (int i = 0, n = values.size(); i < n; ++i) {
     if (StringCaseEqual(values[i], kStylesheet)) {
       has_stylesheet = true;
     } else if (StringCaseEqual(values[i], kAlternate)) {
       has_alternate = true;
     }
   }

   return has_stylesheet && has_alternate;
 }

 // Stores the supplied pointers for use by Transform(); URL trimming starts
 // out enabled.
 RewriteDomainTransformer::RewriteDomainTransformer(
     const GoogleUrl* old_base_url,
     const GoogleUrl* new_base_url,
     const ServerContext* server_context,
     const RewriteOptions* options,
     MessageHandler* handler)
     : old_base_url_(old_base_url),
       new_base_url_(new_base_url),
       server_context_(server_context),
       options_(options),
       handler_(handler),
       trim_urls_(true) {}

 // Nothing to release here; the body is empty.
 RewriteDomainTransformer::~RewriteDomainTransformer() {}

 // Rewrites the URL in *str in two steps: a domain rewrite relative to the old
 // base URL (with sharding and domain-suffix handling enabled), followed ---
 // when trim_urls_ is set --- by an attempt to trim the result against the new
 // base URL.
 //
 // Returns kFailure if the domain rewrite fails, kNoChange if the final text
 // equals the input (*str is untouched), and kSuccess after storing the new
 // text into *str.
 CssTagScanner::Transformer::TransformStatus RewriteDomainTransformer::Transform(
     GoogleString* str) {
   GoogleString domain_rewritten;  // Output of the domain rewrite.
   const bool rewrite_failed =
       DomainRewriteFilter::Rewrite(
           *str, *old_base_url_, server_context_,
           options_,
           true, /* apply_sharding */
           true, /* apply_domain_suffix */
           &domain_rewritten)
       == DomainRewriteFilter::kFail;
   if (rewrite_failed) {
     return kFailure;
   }
   // Note: Even if Rewrite() returned kDomainUnchanged, it will still
   // absolutify the URL into domain_rewritten. We may return kSuccess if that
   // URL does not get re-trimmed to the original string.

   // Note: Because of complications with sharding, we cannot trim
   // sharded resources against the final sharded domain of the CSS file.
   // Specifically, that final domain depends upon the precise text that
   // we are altering here.
   GoogleString final_url;  // Output after the optional trimming step.
   const bool trimmed =
       trim_urls_ &&
       UrlLeftTrimFilter::Trim(*new_base_url_, domain_rewritten, &final_url,
                               handler_);
   if (!trimmed) {
     // Trimming is disabled or did not apply: use the rewritten URL as-is
     // (swap rather than copy, as an optimization).
     final_url.swap(domain_rewritten);
   }

   if (final_url == *str) {
     return kNoChange;
   }
   str->swap(final_url);
   return kSuccess;
 }

 }  // namespace net_instaweb
	/*
	* Copyright 2010 Google Inc.
	*
	* Licensed under the Apache License, Version 2.0 (the "License");
	* you may not use this file except in compliance with the License.
	* You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	// Author: jmarantz@google.com (Joshua Marantz)

	#include "net/instaweb/rewriter/public/css_tag_scanner.h"

	#include <cstddef>

	#include "base/logging.h"
	#include "net/instaweb/rewriter/public/domain_rewrite_filter.h"
	#include "net/instaweb/rewriter/public/rewrite_options.h"
	#include "net/instaweb/rewriter/public/server_context.h"
	#include "net/instaweb/rewriter/public/url_left_trim_filter.h"
	#include "pagespeed/kernel/base/message_handler.h"
	#include "pagespeed/kernel/base/string.h"
	#include "pagespeed/kernel/base/string_util.h"
	#include "pagespeed/kernel/base/writer.h"
	#include "pagespeed/kernel/html/html_element.h"
	#include "pagespeed/kernel/html/html_name.h"
	#include "pagespeed/kernel/http/google_url.h"
	#include "webutil/css/tostring.h"

	namespace {
	// Value that a <link> element's type= attribute must have (compared
	// case-insensitively, after trimming whitespace) for ParseCssElement() to
	// keep treating the element as a stylesheet link.
	const char kTextCss[] = "text/css";
	}

	namespace net_instaweb {

	// Out-of-line definition of the Transformer destructor; the body is empty.
	CssTagScanner::Transformer::~Transformer() {}

	// Attribute-value tokens matched case-insensitively by ParseCssElement().
	const char CssTagScanner::kStylesheet[] = "stylesheet";
	const char CssTagScanner::kAlternate[] = "alternate";
	// Literal "url(" prefix of a CSS URL value.
	const char CssTagScanner::kUriValue[] = "url(";

	// Remembers the transformer that will be asked to rewrite each URL and the
	// message handler passed along to writers.
	CssTagScanner::CssTagScanner(Transformer* transformer,
	                             MessageHandler* handler)
	    : transformer_(transformer),
	      handler_(handler) {}

	// Determines whether |element| is a <link> that references a plain (non
	// alternate) stylesheet, collecting the attributes of interest on the way.
	//
	// *href and *media are always reset first (to NULL and "" respectively).
	// Attributes may appear in any order. The element is accepted only when it
	// has exactly one decodable href= and a rel= whose trimmed value equals
	// "stylesheet" (case-insensitively). It is rejected as soon as we meet:
	//   - a second href= or one with a decoding error,
	//   - a rel= with any other value,
	//   - a media= without a decodable value (*media is left NULL in that case),
	//   - a type= other than text/css,
	//   - title=, data-pagespeed-no-transform= or pagespeed-no-transform=.
	// Names of all other attributes are appended to |nonstandard_attributes| when
	// that pointer is non-NULL.
	bool CssTagScanner::ParseCssElement(
	    HtmlElement* element,
	    HtmlElement::Attribute** href,
	    const char** media,
	    StringPieceVector* nonstandard_attributes) {
	  *media = "";
	  *href = NULL;
	  if (element->keyword() != HtmlName::kLink) {
	    return false;
	  }
	  // We must have all attributes rel='stylesheet' href='name.css'; and if
	  // there is a type, it must be type='text/css'. These can be in any order.
	  HtmlElement::AttributeList* attrs = element->mutable_attributes();
	  bool has_href = false, has_rel_stylesheet = false;
	  for (HtmlElement::AttributeIterator i(attrs->begin());
	       i != attrs->end(); ++i) {
	    HtmlElement::Attribute& attr = *i;
	    switch (attr.keyword()) {
	      case HtmlName::kHref:
	        // The logical-or here had been garbled into "\|\|", which is not
	        // valid C++.
	        if (has_href || attr.decoding_error()) {
	          // Duplicate or undecipherable href.
	          return false;
	        }
	        *href = &attr;
	        has_href = true;
	        break;
	      case HtmlName::kRel: {
	        StringPiece rel(attr.DecodedValueOrNull());
	        TrimWhitespace(&rel);
	        if (!StringCaseEqual(rel, kStylesheet)) {
	          // rel=something_else. Abort. Includes alternate stylesheets.
	          return false;
	        }
	        has_rel_stylesheet = true;
	        break;
	      }
	      case HtmlName::kMedia:
	        *media = attr.DecodedValueOrNull();
	        if (*media == NULL) {
	          // No value (media rather than media=), or decoding error
	          return false;
	        }
	        break;
	      case HtmlName::kType: {
	        // If we see this, it must be type=text/css. This attribute is not
	        // required.
	        StringPiece type(attr.DecodedValueOrNull());
	        TrimWhitespace(&type);
	        if (!StringCaseEqual(type, kTextCss)) {
	          return false;
	        }
	        break;
	      }
	      case HtmlName::kTitle:
	      case HtmlName::kDataPagespeedNoTransform:
	      case HtmlName::kPagespeedNoTransform:
	        // title= is here because it indicates a default stylesheet among
	        // alternatives. See:
	        // http://www.w3.org/TR/REC-html40/present/styles.html#h-14.3.1
	        // We don't alter a link for which data-pagespeed-no-transform is set.
	        return false;
	      default:
	        // Other tags are assumed to be harmless noise; if that is not the case
	        // for a particular filter, it should be detected within that filter
	        // (examples: extra tags are rejected in css_combine_filter, but they're
	        // preserved by css_inline_filter).
	        if (nonstandard_attributes != NULL) {
	          nonstandard_attributes->push_back(attr.name_str());
	        }
	        break;
	    }
	  }

	  // we require both 'href=...' and 'rel=stylesheet'.
	  return (has_rel_stylesheet && has_href);
	}

	namespace {

	// Removes the first character from *in, and puts it into *c.
	// Returns true if successful; returns false and leaves both arguments
	// untouched when *in is empty.
	inline bool PopFirst(StringPiece* in, char* c) {
	  if (in->empty()) {
	    return false;
	  }
	  // Both pointers have to be dereferenced: we store the first character of
	  // the pointed-to piece into the caller's char.
	  *c = (*in)[0];
	  in->remove_prefix(1);
	  return true;
	}

	// Outcome of trying to lex one construct. Since we handle incomplete input,
	// in some cases we may not have enough of it available to accept or reject a
	// construct --- in which case the routines will return kLexInterrupted.
	enum LexResult {
	kLexNo,  // The construct is not present at the current position.
	kLexYes,  // The construct was recognized.
	kLexInterrupted  // More input is needed before we can decide.
	};

	// Tries to consume the literal |expected| from the front of *in.
	//
	// Returns:
	//   kLexYes          *in started with |expected|; it has been removed.
	//   kLexInterrupted  input is streaming (input_kind is not kInputIncludesEnd),
	//                    *in is shorter than |expected|, and everything available
	//                    so far agrees with |expected| --- later bytes could still
	//                    complete the match. *in is left untouched.
	//   kLexNo           *in cannot start with |expected|. *in is left untouched.
	inline LexResult EatLiteral(CssTagScanner::InputPortion input_kind,
	                            StringPiece expected, StringPiece* in) {
	  if (in->starts_with(expected)) {
	    in->remove_prefix(expected.size());
	    return kLexYes;
	  }

	  if (input_kind == CssTagScanner::kInputIncludesEnd) {
	    return kLexNo;
	  }

	  // Streaming input: waiting for more bytes only makes sense if what we have
	  // is a proper prefix of |expected|. If a mismatch is already visible we
	  // answer kLexNo right away instead of making the caller buffer and re-lex.
	  if (in->size() < expected.size() && expected.starts_with(*in)) {
	    return kLexInterrupted;
	  }
	  return kLexNo;
	}

	// Extracts string- or identifier-like content from CSS until reaching the
	// given terminator (which is consumed but not included in the output),
	// evaluating simple escapes along the way.
	//
	// Arguments:
	//   is_string   true for quoted strings: an escaped line break is a line
	//               continuation and an unescaped line break ends the token.
	//               false for an unquoted url() payload: whitespace may only
	//               appear right before the terminator.
	//   input_kind  whether *in is known to reach the end of the input.
	//   term        terminating character.
	//   in          input; advanced past whatever was consumed.
	//   out         receives the unescaped content (cleared first).
	//   found_term  set to whether |term| itself was seen.
	//
	// Returns:
	//   kLexYes          a token was lexed. If *found_term is false it was cut
	//                    short by an unescaped line break (strings), by
	//                    whitespace followed by something other than |term|
	//                    (url payloads) --- in both cases that character is left
	//                    in *in --- or by the end of complete input.
	//   kLexNo           an escape we do not handle was met, or a backslash is
	//                    the very last byte of complete input.
	//   kLexInterrupted  streaming input ran out before the token was finished;
	//                    *in is restored to its value on entry.
	LexResult CssExtractUntil(bool is_string,
	                          CssTagScanner::InputPortion input_kind,
	                          char term, StringPiece* in,
	                          GoogleString* out, bool* found_term) {
	  *found_term = false;

	  StringPiece original_input = *in;

	  char c;
	  out->clear();
	  while (PopFirst(in, &c)) {
	    if (c == term) {
	      *found_term = true;
	      return kLexYes;
	    } else if (c == '\\') {
	      // See if it's an escape we recognize. We need to evaluate the
	      // escape since they will get escaped again on output.
	      // TODO(morlovich): handle hex escapes here as well. For now we just match
	      // the non-whitespace stuff we ourselves produce.
	      char escape_val;
	      if (PopFirst(in, &escape_val)) {
	        switch (escape_val) {
	          case ',':
	          case '\"':
	          case '\'':
	          case '\\':
	          case '(':
	          case ')':
	            out->push_back(escape_val);
	            break;
	          case '\n':
	          case '\r':
	          case '\f':
	            // \ before newline in strings simply disappears; for everything
	            // else we fallthrough to below.
	            if (is_string) {
	              if (escape_val == '\r') {
	                // CR+LF. The result is deliberately ignored: the LF is
	                // optional.
	                EatLiteral(input_kind, "\n", in);
	              }
	              break;
	            }
	            FALLTHROUGH_INTENDED;
	          default:
	            // We are in more than a bit of trouble here: we can't accurately
	            // parse everything (we don't have good enough encoding handling
	            // here to represent unicode, at least), and we can't just pass it
	            // through since GoogleUrl will turn \ into /, so we fail to match.
	            return kLexNo;
	        }
	      } else {
	        // We have \ but not what's afterwards.
	        if (input_kind == CssTagScanner::kInputIncludesEnd) {
	          // end of input -> this is messed up, not what we expect.
	          return kLexNo;
	        } else {
	          // \ may be continued on in next chunk. Will need to retry
	          // once it's available.
	          *in = original_input;
	          return kLexInterrupted;
	        }
	      }
	    } else {
	      if (!is_string && IsHtmlSpace(c)) {
	        // Whitespace is not generally permitted in url() payload, but can
	        // come before closing ).
	        for (int i = 0, n = in->size(); i < n; ++i) {
	          char ahead = (*in)[i];

	          // IsHtmlSpace is, in a pleasant surprise, also appropriate for CSS.
	          // (Don't worry, JS has a totally different idea of what's whitespace
	          //  to keep things interesting).
	          if (IsHtmlSpace(ahead)) {
	            continue;
	          }
	          if (ahead == term) {
	            // Got closing character --- skip ahead to it, and accumulate
	            // whitespace.
	            StrAppend(out, in->substr(0, i));
	            in->remove_prefix(i);
	            break;
	          } else {
	            // Some other character. Put the whitespace back and stop: the
	            // token ends here without a terminator.
	            *in = StringPiece(in->data() - 1, in->size() + 1);
	            return kLexYes;
	          }
	        }
	      } else if (c == '\n' || c == '\r' || c == '\f') {
	        // String tokens can't have unescaped newlines, so the token ends
	        // here, unterminated. Put the line terminator back for the caller.
	        // (Newlines in URL tokens are handled in the case above, with other
	        //  whitespace).
	        //
	        // This is a definite answer regardless of input_kind: no later chunk
	        // can change it, so we must not fall into the "ran out of input"
	        // path below, which would report kLexInterrupted for streaming
	        // input and force everything from here on to be re-buffered.
	        *in = StringPiece(in->data() - 1, in->size() + 1);
	        return kLexYes;
	      } else {
	        // Normal character.
	        out->push_back(c);
	      }
	    }
	  }

	  // We got to the end of *in without seeing a closing terminator.
	  if (input_kind == CssTagScanner::kInputDoesNotIncludeEnd) {
	    // This is a streaming parse and there may be more bytes coming in
	    // ==> one of them may be the closing terminator, so we don't know.
	    *in = original_input;
	    return kLexInterrupted;
	  }

	  // Lex as an unclosed literal, serialization will retain that, and we will
	  // let the browser's CSS parser's error recovery figure out what to do.
	  return kLexYes;
	}

	// Tries to lex a quoted CSS string starting at the current position of *in.
	//
	// When *in begins with ' or ", the quote is consumed, *quote_out is set to
	// that delimiter, and the rest is delegated to CssExtractUntil(), whose
	// result is returned (*found_term then says whether the closing quote was
	// present). Otherwise nothing is consumed and the result is kLexNo --- except
	// for an empty *in on streaming input, where we cannot tell yet and return
	// kLexInterrupted.
	LexResult CssExtractString(
	    CssTagScanner::InputPortion input_kind,
	    StringPiece* in, GoogleString* out,
	    char* quote_out, bool* found_term) {
	  if (in->empty()) {
	    // Empty chunk of streaming input -> can't tell if string or not.
	    return (input_kind == CssTagScanner::kInputDoesNotIncludeEnd)
	        ? kLexInterrupted
	        : kLexNo;
	  }

	  const char delimiter = (*in)[0];
	  if (delimiter != '\'' && delimiter != '"') {
	    return kLexNo;
	  }

	  in->remove_prefix(1);
	  *quote_out = delimiter;
	  return CssExtractUntil(true, input_kind, delimiter, in, out, found_term);
	}

	bool WriteRange(const char* out_begin, const char* out_end,
	Writer* writer, MessageHandler* handler) {
	if (out_end > out_begin) {
	return writer->Write(StringPiece(out_begin, out_end - out_begin), handler);
	} else {
	return true;
	}
	}

	} // namespace


	// Writes a URL reference back out in the syntax it was found in:
	//   @import <quote>url<quote>      for kImport
	//   url(<quote>url<quote>)         otherwise
	// The URL is escaped with Css::EscapeUrl(). The opening quote is emitted
	// when is_quoted is set; the closing quote and closing paren are emitted only
	// if have_term_quote / have_term_paren say they were present in the input.
	//
	// *ok is both input and output: once it is false nothing more is written,
	// and it becomes false as soon as a write fails. The flag must be read and
	// updated through the pointer; assigning to |ok| itself would neither
	// compile nor report anything to the caller.
	void CssTagScanner::SerializeUrlUse(
	    UrlKind kind, const GoogleString& url,
	    bool is_quoted, bool have_term_quote, char quote,
	    bool have_term_paren,
	    Writer* writer, bool* ok) {
	  DCHECK(kind != kNone);

	  if (kind == kImport) {
	    *ok = *ok && writer->Write("@import ", handler_);
	  } else {
	    *ok = *ok && writer->Write("url(", handler_);
	  }

	  if (is_quoted) {
	    *ok = *ok && writer->Write(StringPiece(&quote, 1), handler_);
	  }
	  *ok = *ok && writer->Write(Css::EscapeUrl(url), handler_);
	  if (have_term_quote) {
	    *ok = *ok && writer->Write(StringPiece(&quote, 1), handler_);
	  }

	  if (have_term_paren) {
	    *ok = *ok && writer->Write(")", handler_);
	  }
	}

	// Scans one chunk of CSS for `url(...)` and `@import "..."` references, hands
	// each URL found to transformer_, and writes the resulting CSS to writer.
	//
	// Bytes that are not part of a rewritten URL reference are passed through
	// unchanged, in batches. When a lexing step reports kLexInterrupted (the
	// chunk ended in the middle of something that might be a URL reference and
	// input_portion allows for more input), the bytes written so far are flushed
	// and the unresolved tail is saved in reparse_; the next call prepends it to
	// its own contents.
	//
	// Returns false if the transformer reports kFailure for a URL or if any write
	// returned false; returns true otherwise.
	bool CssTagScanner::TransformUrlsStreaming(
	StringPiece contents, CssTagScanner::InputPortion input_portion,
	Writer* writer) {
	bool ok = true;

	// If an earlier call left unparsed bytes behind, parse them again together
	// with the new chunk. concat_buffer owns the joined text for the rest of
	// this call, since contents is only a view.
	GoogleString concat_buffer;
	if (!reparse_.empty()) {
	concat_buffer = StrCat(reparse_, contents);
	contents = concat_buffer;
	reparse_.clear();
	}

	// Keeps track of which portion of input we should write out in
	// the next output batch. This is an iterator-style interval, i.e.
	// [out_begin, out_end)
	const char* out_begin = contents.data();
	const char* out_end = contents.data();

	char c;
	GoogleString url;
	// The difference between remaining and reparse_candidate is that remaining
	// is advanced in the middle of processing, while reparse_candidate is only
	// moved forward at the end of a loop iteration, once everything up to that
	// point has been understood. When a streaming parse is interrupted,
	// reparse_candidate is what gets copied into reparse_ so that it can be
	// retained until the next chunk.
	StringPiece remaining = contents;
	StringPiece reparse_candidate = remaining;
	while (PopFirst(&remaining, &c)) {
	// Per-iteration description of the URL reference found, if any.
	UrlKind have_url = kNone;
	bool is_quoted = false;
	bool have_term_quote = false;
	bool have_term_paren = false;
	char quote = '?';

	if (c == '@') {
	// See if we are at an @import. We provisionally set an
	// end point for batch write to exclude the @, so if we
	// write out with transformed URL, we should start with
	// @import.
	switch (EatLiteral(input_portion, "import", &remaining)) {
	case kLexYes: {
	TrimLeadingWhitespace(&remaining);
	// The code here handles @import "foo" and @import 'foo';
	// for @import url(... we simply pass the @import through and let
	// the code that handles url( below take care of it.
	LexResult url_argument =
	CssExtractString(input_portion, &remaining, &url,
	&quote, &have_term_quote);
	if (url_argument == kLexYes) {
	have_url = kImport;
	is_quoted = true;
	} else if (url_argument == kLexInterrupted) {
	// Ran out of input mid-string: flush what is known and retry this
	// @import from its start on the next call.
	reparse_candidate.CopyToString(&reparse_);
	return ok && WriteRange(out_begin, out_end, writer, handler_);
	}
	break;
	}
	case kLexInterrupted:
	reparse_candidate.CopyToString(&reparse_);
	return ok && WriteRange(out_begin, out_end, writer, handler_);
	case kLexNo:
	break;
	}
	} else if (c == 'u') {
	// See if we are at url(. Also provisionally set an
	// end point for batch write to exclude the u, so if we
	// write out with transformed URL, we should start with
	// url(
	GoogleString wrapped_url;
	switch (EatLiteral(input_portion, "rl(", &remaining)) {
	case kLexYes: {
	TrimLeadingWhitespace(&remaining);
	// Note if we have a quoted URL inside url(), it needs to be
	// parsed as such.
	LexResult quoted_url_argument =
	CssExtractString(input_portion, &remaining, &url,
	&quote, &have_term_quote);
	if (quoted_url_argument == kLexYes) {
	TrimLeadingWhitespace(&remaining);
	// A quoted argument only counts as a URL use here when it is
	// followed by the closing parenthesis.
	switch (EatLiteral(input_portion, ")", &remaining)) {
	case kLexYes:
	have_url = kUrl;
	is_quoted = true;
	have_term_paren = true;
	break;
	case kLexInterrupted:
	reparse_candidate.CopyToString(&reparse_);
	return ok && WriteRange(out_begin, out_end, writer, handler_);
	case kLexNo:
	break;
	}
	} else if (quoted_url_argument == kLexInterrupted) {
	reparse_candidate.CopyToString(&reparse_);
	return ok && WriteRange(out_begin, out_end, writer, handler_);
	} else {
	// No quoted argument.
	LexResult unquoted_url_argument =
	CssExtractUntil(false, input_portion, ')', &remaining,
	&wrapped_url, &have_term_paren);
	if (unquoted_url_argument == kLexYes) {
	// The unquoted form may be padded with whitespace inside the
	// parentheses; only the trimmed text is the URL.
	TrimWhitespace(wrapped_url, &url);
	have_url = kUrl;
	} else if (unquoted_url_argument == kLexInterrupted) {
	reparse_candidate.CopyToString(&reparse_);
	return ok && WriteRange(out_begin, out_end, writer, handler_);
	}
	}
	break;
	}
	case kLexInterrupted:
	reparse_candidate.CopyToString(&reparse_);
	return ok && WriteRange(out_begin, out_end, writer, handler_);
	case kLexNo:
	break;
	}
	}

	if (have_url != kNone) {
	// See if we actually have to do something. If the transformer
	// wants to leave the URL alone, we will just pass the bytes through.
	switch (transformer_->Transform(&url)) {
	case Transformer::kSuccess: {
	// Write out the buffered up part of input.
	ok = ok && WriteRange(out_begin, out_end, writer, handler_);

	// Emit the rewritten reference in place of the original bytes.
	SerializeUrlUse(have_url, url,
	is_quoted, have_term_quote, quote,
	have_term_paren,
	writer, &ok);

	// Begin accumulating input again starting from next byte.
	out_begin = remaining.data();
	break;
	}
	case Transformer::kFailure: {
	// We could not transform URL, fail fast.
	handler_->Message(kWarning,
	"Transform failed for url %s", url.c_str());
	return false;
	}
	case Transformer::kNoChange: {
	// Leave the original bytes in the pass-through batch.
	break;
	}
	}
	}

	// remaining.data() points to the next byte to read, which is exactly
	// right after the last byte we want to output.
	out_end = remaining.data();
	// Everything before this point is now fully understood, so a later
	// interruption only needs to re-parse from here.
	reparse_candidate = remaining;
	}

	// Write out whatever got buffered at the end.
	ok = ok && WriteRange(out_begin, out_end, writer, handler_);

	return ok;
	}

	// One-shot wrapper around TransformUrlsStreaming: builds a temporary scanner
	// from transformer and handler and runs it over contents as a complete input
	// (kInputIncludesEnd), writing to writer. Returns the streaming call's result.
	bool CssTagScanner::TransformUrls(
	    StringPiece contents, Writer* writer, Transformer* transformer,
	    MessageHandler* handler) {
	  CssTagScanner one_shot(transformer, handler);
	  const bool all_ok =
	      one_shot.TransformUrlsStreaming(contents, kInputIncludesEnd, writer);
	  return all_ok;
	}

	bool CssTagScanner::HasImport(const StringPiece& contents,
	MessageHandler* handler) {
	// Search for case insensitive @import.
	size_t pos = -1; // So that pos + 1 == 0 below.
	const StringPiece kImport("import");
	while ((pos = contents.find("@", pos + 1)) != StringPiece::npos) {
	// Rest is everything past the @ (non-inclusive).
	StringPiece rest = contents.substr(pos + 1);
	if (StringCaseStartsWith(rest, kImport)) {
	return true;
	}
	}
	return false;
	}

	// Returns true if contents contains the literal text kUriValue ("url(")
	// anywhere. The search is an exact substring match.
	bool CssTagScanner::HasUrl(const StringPiece& contents) {
	  const size_t hit = contents.find(CssTagScanner::kUriValue);
	  return hit != StringPiece::npos;
	}

	bool CssTagScanner::IsStylesheetOrAlternate(
	const StringPiece& attribute_value) {
	StringPieceVector values;
	SplitStringPieceToVector(attribute_value, " ", &values, true);
	for (int i = 0, n = values.size(); i < n; ++i) {
	if (StringCaseEqual(values[i], kStylesheet)) {
	return true;
	}
	}
	return false;
	}

	bool CssTagScanner::IsAlternateStylesheet(const StringPiece& attribute_value) {
	bool has_stylesheet = false;
	bool has_alternate = false;
	StringPieceVector values;
	SplitStringPieceToVector(attribute_value, " ", &values, true);
	for (int i = 0, n = values.size(); i < n; ++i) {
	if (StringCaseEqual(values[i], kStylesheet)) {
	has_stylesheet = true;
	} else if (StringCaseEqual(values[i], kAlternate)) {
	has_alternate = true;
	}
	}

	return has_stylesheet && has_alternate;
	}

	// Stores the given base URLs, server context, options and message handler
	// for later use by Transform(). URL trimming starts out enabled.
	RewriteDomainTransformer::RewriteDomainTransformer(
	    const GoogleUrl* old_base_url, const GoogleUrl* new_base_url,
	    const ServerContext* server_context, const RewriteOptions* options,
	    MessageHandler* handler)
	    : old_base_url_(old_base_url),
	      new_base_url_(new_base_url),
	      server_context_(server_context),
	      options_(options),
	      handler_(handler),
	      trim_urls_(true) {}

	// Empty body: this destructor performs no explicit cleanup of its own.
	RewriteDomainTransformer::~RewriteDomainTransformer() {}

	// Rewrites the domain of the URL in *str and then, if trimming is enabled,
	// shortens the result relative to the new base URL.
	//
	// Returns:
	//   kFailure   if DomainRewriteFilter::Rewrite reports kFail; *str is left
	//              untouched.
	//   kNoChange  if the final URL is identical to the input; *str is left
	//              untouched.
	//   kSuccess   otherwise, with *str replaced by the final URL.
	CssTagScanner::Transformer::TransformStatus RewriteDomainTransformer::Transform(
	    GoogleString* str) {
	  GoogleString rewritten;  // Result of rewriting domain.
	  GoogleString out;        // Result after trimming.
	  // The URL text and the old base URL are passed as objects, not as the
	  // pointers this class holds them by.
	  if (DomainRewriteFilter::Rewrite(
	          *str, *old_base_url_, server_context_,
	          options_,
	          true, /* apply_sharding */
	          true, /* apply_domain_suffix */
	          &rewritten)
	      == DomainRewriteFilter::kFail) {
	    return kFailure;
	  }
	  // Note: Even if Rewrite() returned kDomainUnchanged, it will still absolutify
	  // the URL into rewritten. We may return kSuccess if that URL does not get
	  // re-trimmed to the original string.

	  // Note: Because of complications with sharding, we cannot trim
	  // sharded resources against the final sharded domain of the CSS file.
	  // Specifically, that final domain depends upon the precise text that
	  // we are altering here.
	  if (!trim_urls_ ||
	      !UrlLeftTrimFilter::Trim(*new_base_url_, rewritten, &out, handler_)) {
	    // If we couldn't trim rewritten -> out, just copy it (swap is optimization)
	    out.swap(rewritten);
	  }

	  if (out == *str) {
	    return kNoChange;
	  } else {
	    str->swap(out);
	    return kSuccess;
	  }
	}

	} // namespace net_instaweb