blob: a92cd68afb816f0dd4459cf2ed6cc69b8b1f5c23 [file] [log] [blame]
/**
* Copyright 2010 Google Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// Author: jmaessen@google.com (Jan Maessen)
#ifndef NET_INSTAWEB_REWRITER_PUBLIC_URL_LEFT_TRIM_FILTER_H_
#define NET_INSTAWEB_REWRITER_PUBLIC_URL_LEFT_TRIM_FILTER_H_
#include "base/basictypes.h"
#include "net/instaweb/htmlparse/public/empty_html_filter.h"
#include "net/instaweb/htmlparse/public/html_element.h"
#include "net/instaweb/htmlparse/public/html_parse.h"
#include "net/instaweb/rewriter/public/resource_tag_scanner.h"
#include "net/instaweb/util/public/string_util.h"
namespace net_instaweb {
// Relatively simple filter that trims redundant information from the left end
// of each url. In particular, we can drop http: from any url that is on a page
// served via http (etc. for other protocols). By the same token, a page often
// contains fully-qualified urls that can be made base-relative (especially as
// we may do this as a result of rewriting itself), or root-relative. We should
// strip the leading portions of these urls.
//
// We actually register the base url of a page. This in turn registers
// individual trimmings for the protocol, host, and path in that order. These
// portions of the url are then trimmed off in order by the Trim(...) operation.
//
// TODO(jmaessen): url references in css / outside src= and href= properties
// TODO(jmaessen): do we need a generic filter base class that just finds urls
// and calls a class method? Or do we need context information for any
// transform other than the sort of thing you see here?
// TODO(jmaessen): Do we care to introduce ../ in order to relativize more urls?
// Do we have a library solution to do so with minimal effort?
class Statistics;
class Variable;
class UrlLeftTrimFilter : public EmptyHtmlFilter {
public:
UrlLeftTrimFilter(HtmlParse* html_parse, Statistics* resource_manager);
static void Initialize(Statistics* statistics);
virtual void StartElement(HtmlElement* element);
// TODO(sligocki): This is broken and only adds base_urls. We need to be able
// to ResetBaseUrl to a new value. There is only one base_url at a time.
virtual void AddBaseUrl(const StringPiece& base_url);
virtual const char* Name() const { return "UrlLeftTrim"; }
protected:
friend class UrlLeftTrimFilterTest;
bool Trim(StringPiece* url);
void AddTrimming(const StringPiece& trimming);
private:
HtmlParse* html_parse_;
StringVector left_trim_strings_;
const Atom s_base_;
const Atom s_href_;
const Atom s_src_;
Variable* trim_count_;
Variable* trim_saved_bytes_;
void TrimAttribute(HtmlElement::Attribute* attr);
DISALLOW_COPY_AND_ASSIGN(UrlLeftTrimFilter);
};
} // namespace net_instaweb
#endif // NET_INSTAWEB_REWRITER_PUBLIC_URL_LEFT_TRIM_FILTER_H_