blob: dc9484c5f8a7e3996c495b7f0bbb9b49fe468f9d [file] [log] [blame]
/*
* Copyright 2011 Google Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// Author: morlovich@google.com (Maksim Orlovich),
// sligocki@google.com (Shawn Ligocki)
//
// This contains HtmlDetector, which tries to heuristically guess whether
// content a server claims to be HTML actually is HTML (it sometimes isn't).
#include "pagespeed/automatic/html_detector.h"
#include "pagespeed/kernel/base/string.h"
#include "pagespeed/kernel/base/string_util.h"
namespace net_instaweb {
HtmlDetector::HtmlDetector() : already_decided_(false), probable_html_(false) {
}
HtmlDetector::~HtmlDetector() {
}
bool HtmlDetector::ConsiderInput(const StringPiece& data) {
DCHECK(!already_decided_);
for (int i = 0, n = data.size(); i < n; ++i) {
unsigned char c = static_cast<unsigned char>(data[i]);
switch (c) {
// Ignore all leading whitespace and byte order markers.
// See http://en.wikipedia.org/wiki/Byte_order_mark
// Note: This test allows arbitrary orderings and combinations of the
// byte order markers, but we do not expect many false positives.
case ' ':
case '\t':
case '\n':
case '\r':
case 0xef:
case 0xbb:
case 0xbf: {
break;
}
// If the first non-whitespace, non-BOM char is <, we are content that
// this is HTML.
case '<': {
already_decided_ = true;
probable_html_ = true;
return true;
}
// Similarly, if it's something else, it probably isn't.
default: {
already_decided_ = true;
probable_html_ = false;
return true;
}
}
}
// Looks like we managed to get entirely whitespace --- buffer it up.
StrAppend(&buffer_, data);
return false;
}
void HtmlDetector::ReleaseBuffered(GoogleString* out_buffer) {
buffer_.swap(*out_buffer);
buffer_.clear();
}
void HtmlDetector::ForceDecision(bool is_html) {
DCHECK(!already_decided_);
already_decided_ = true;
probable_html_ = is_html;
}
} // namespace net_instaweb