src/pagespeed/kernel/html/collapse_whitespace_filter.cc - incubator-pagespeed-debian - Git at Google

 /*
  * Copyright 2010 Google Inc.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
  * You may obtain a copy of the License at
  *
  *      http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 // Author: mdsteele@google.com (Matthew D. Steele)

 #include "pagespeed/kernel/html/collapse_whitespace_filter.h"

 #include <algorithm>
 #include <cstddef>
 #include <vector>

 #include "base/logging.h"
 #include "pagespeed/kernel/html/html_element.h"
 #include "pagespeed/kernel/html/html_name.h"
 #include "pagespeed/kernel/html/html_node.h"
 #include "pagespeed/kernel/base/basictypes.h"
 #include "pagespeed/kernel/base/string.h"

 namespace net_instaweb {
 class HtmlParse;

 namespace {

 // Tags whose text contents this filter never collapses.  Note that this is
 // not _quite_ the same set as kLiteralTags in html_lexer.cc.
 //
 // IsSensitiveKeyword() binary-searches this array, so its entries must stay
 // in strictly ascending HtmlName::Keyword order.
 const HtmlName::Keyword kSensitiveTags[] = {
   HtmlName::kPre,
   HtmlName::kScript,
   HtmlName::kStyle,
   HtmlName::kTextarea,
 };

 // Returns true iff |keyword| is one of the entries of kSensitiveTags.
 bool IsSensitiveKeyword(HtmlName::Keyword keyword) {
   return std::binary_search(kSensitiveTags,
                             kSensitiveTags + arraysize(kSensitiveTags),
                             keyword);
 }

 }  // namespace

 // Stores |html_parse| and sanity-checks (via DCHECK) that kSensitiveTags is
 // strictly ascending, which the binary search in IsSensitiveKeyword() relies
 // on.
 CollapseWhitespaceFilter::CollapseWhitespaceFilter(HtmlParse* html_parse)
     : html_parse_(html_parse) {
   const size_t tag_count = arraysize(kSensitiveTags);
   // Compare every adjacent pair (prev, prev + 1).
   for (size_t prev = 0; prev + 1 < tag_count; ++prev) {
     DCHECK(kSensitiveTags[prev] < kSensitiveTags[prev + 1]);
   }
 }

 // Empty destructor body: html_parse_ is not deleted here.
 CollapseWhitespaceFilter::~CollapseWhitespaceFilter() {}

 // Resets per-document state: forgets any sensitive tags recorded by
 // StartElement() so that the new document starts with an empty stack.
 void CollapseWhitespaceFilter::StartDocument() {
   keyword_stack_.clear();
 }

 // Records the opening of a whitespace-sensitive element.  While at least one
 // such element is on keyword_stack_, Characters() leaves text untouched.
 void CollapseWhitespaceFilter::StartElement(HtmlElement* element) {
   const HtmlName::Keyword tag = element->keyword();
   if (!IsSensitiveKeyword(tag)) {
     return;  // Ordinary element: nothing to track.
   }
   keyword_stack_.push_back(tag);
 }

 // Pops keyword_stack_ when the element being closed matches the innermost
 // recorded sensitive tag.  Any other closing element is expected (DCHECK) to
 // be a non-sensitive one, and leaves the stack unchanged.
 void CollapseWhitespaceFilter::EndElement(HtmlElement* element) {
   const HtmlName::Keyword tag = element->keyword();
   const bool closes_innermost =
       !keyword_stack_.empty() && (keyword_stack_.back() == tag);
   if (!closes_innermost) {
     DCHECK(!IsSensitiveKeyword(tag));
     return;
   }
   keyword_stack_.pop_back();
 }

 // Collapses every run of whitespace in |characters| down to one character,
 // unless we are currently inside a sensitive element (keyword_stack_ is
 // non-empty), in which case the text is left alone.
 //
 // The surviving character of a run is '\n' if the run contains a newline
 // anywhere, and otherwise the first whitespace character of the run.
 // See http://www.w3.org/TR/html401/struct/text.html#h-9.1 for the set of
 // characters treated as whitespace.
 void CollapseWhitespaceFilter::Characters(HtmlCharactersNode* characters) {
   if (!keyword_stack_.empty()) {
     return;
   }

   // The string is rewritten in place for speed.  That is safe because the
   // output cursor never gets ahead of the input cursor: the text can only
   // shrink, never grow.
   GoogleString* text = characters->mutable_contents();
   char* const begin = &(*text)[0];
   const char* const limit = begin + text->size();
   char* out = begin;

   // 1 while the most recently emitted character is whitespace, else 0.  Kept
   // as an int so it can be subtracted straight from the output cursor.
   int pending_ws = 0;

   for (const char* in = begin; in != limit; ++in) {
     const char c = *in;
     if (c == '\n') {
       // Newlines dominate: if a whitespace character was just emitted, step
       // back over it (branch-free) so this newline replaces it.
       out -= pending_ws;
       *out++ = c;
       pending_ws = 1;
     } else if (c == ' ' || c == '\t' || c == '\r' || c == '\f') {
       // Emit only the first whitespace character of a run; a later newline
       // in the same run may still overwrite it.
       if (pending_ws == 0) {
         *out++ = c;
         pending_ws = 1;
       }
     } else {
       *out++ = c;
       pending_ws = 0;
     }
   }

   // Drop the now-unused tail of the buffer.
   text->resize(out - begin);
 }

 }  // namespace net_instaweb
	/*
	* Copyright 2010 Google Inc.
	*
	* Licensed under the Apache License, Version 2.0 (the "License");
	* you may not use this file except in compliance with the License.
	* You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	// Author: mdsteele@google.com (Matthew D. Steele)

	#include "pagespeed/kernel/html/collapse_whitespace_filter.h"

	#include <algorithm>
	#include <cstddef>
	#include <vector>

	#include "base/logging.h"
	#include "pagespeed/kernel/html/html_element.h"
	#include "pagespeed/kernel/html/html_name.h"
	#include "pagespeed/kernel/html/html_node.h"
	#include "pagespeed/kernel/base/basictypes.h"
	#include "pagespeed/kernel/base/string.h"

	namespace net_instaweb {
	class HtmlParse;

	namespace {

	// Tags whose text contents this filter never collapses.  Note that this is
	// not _quite_ the same set as kLiteralTags in html_lexer.cc.
	//
	// IsSensitiveKeyword() binary-searches this array, so its entries must stay
	// in strictly ascending HtmlName::Keyword order.
	const HtmlName::Keyword kSensitiveTags[] = {
	  HtmlName::kPre,
	  HtmlName::kScript,
	  HtmlName::kStyle,
	  HtmlName::kTextarea,
	};

	// Returns true iff |keyword| is one of the entries of kSensitiveTags.
	bool IsSensitiveKeyword(HtmlName::Keyword keyword) {
	  return std::binary_search(kSensitiveTags,
	                            kSensitiveTags + arraysize(kSensitiveTags),
	                            keyword);
	}

	}  // namespace

	// Stores |html_parse| and sanity-checks (via DCHECK) that kSensitiveTags is
	// strictly ascending, which the binary search in IsSensitiveKeyword() relies
	// on.
	CollapseWhitespaceFilter::CollapseWhitespaceFilter(HtmlParse* html_parse)
	    : html_parse_(html_parse) {
	  const size_t tag_count = arraysize(kSensitiveTags);
	  // Compare every adjacent pair (prev, prev + 1).
	  for (size_t prev = 0; prev + 1 < tag_count; ++prev) {
	    DCHECK(kSensitiveTags[prev] < kSensitiveTags[prev + 1]);
	  }
	}

	// Empty destructor body: html_parse_ is not deleted here.
	CollapseWhitespaceFilter::~CollapseWhitespaceFilter() {}

	// Resets per-document state: forgets any sensitive tags recorded by
	// StartElement() so that the new document starts with an empty stack.
	void CollapseWhitespaceFilter::StartDocument() {
	keyword_stack_.clear();
	}

	// Records the opening of a whitespace-sensitive element.  While at least one
	// such element is on keyword_stack_, Characters() leaves text untouched.
	void CollapseWhitespaceFilter::StartElement(HtmlElement* element) {
	  const HtmlName::Keyword tag = element->keyword();
	  if (!IsSensitiveKeyword(tag)) {
	    return;  // Ordinary element: nothing to track.
	  }
	  keyword_stack_.push_back(tag);
	}

	// Pops keyword_stack_ when the element being closed matches the innermost
	// recorded sensitive tag.  Any other closing element is expected (DCHECK) to
	// be a non-sensitive one, and leaves the stack unchanged.
	void CollapseWhitespaceFilter::EndElement(HtmlElement* element) {
	  const HtmlName::Keyword tag = element->keyword();
	  const bool closes_innermost =
	      !keyword_stack_.empty() && (keyword_stack_.back() == tag);
	  if (!closes_innermost) {
	    DCHECK(!IsSensitiveKeyword(tag));
	    return;
	  }
	  keyword_stack_.pop_back();
	}

	// Collapses every run of whitespace in |characters| down to one character,
	// unless we are currently inside a sensitive element (keyword_stack_ is
	// non-empty), in which case the text is left alone.
	//
	// The surviving character of a run is '\n' if the run contains a newline
	// anywhere, and otherwise the first whitespace character of the run.
	// See http://www.w3.org/TR/html401/struct/text.html#h-9.1 for the set of
	// characters treated as whitespace.
	void CollapseWhitespaceFilter::Characters(HtmlCharactersNode* characters) {
	  if (!keyword_stack_.empty()) {
	    return;
	  }

	  // The string is rewritten in place for speed.  That is safe because the
	  // output cursor never gets ahead of the input cursor: the text can only
	  // shrink, never grow.
	  GoogleString* text = characters->mutable_contents();
	  char* const begin = &(*text)[0];
	  const char* const limit = begin + text->size();
	  char* out = begin;

	  // 1 while the most recently emitted character is whitespace, else 0.  Kept
	  // as an int so it can be subtracted straight from the output cursor.
	  int pending_ws = 0;

	  for (const char* in = begin; in != limit; ++in) {
	    const char c = *in;
	    if (c == '\n') {
	      // Newlines dominate: if a whitespace character was just emitted, step
	      // back over it (branch-free) so this newline replaces it.
	      out -= pending_ws;
	      *out++ = c;
	      pending_ws = 1;
	    } else if (c == ' ' || c == '\t' || c == '\r' || c == '\f') {
	      // Emit only the first whitespace character of a run; a later newline
	      // in the same run may still overwrite it.
	      if (pending_ws == 0) {
	        *out++ = c;
	        pending_ws = 1;
	      }
	    } else {
	      *out++ = c;
	      pending_ws = 0;
	    }
	  }

	  // Drop the now-unused tail of the buffer.
	  text->resize(out - begin);
	}

	} // namespace net_instaweb