// blob: 3919181ab4adaafbdeb68a05a5207703ca601523 [file] [log] [blame]
/*
* Copyright 2011 Google Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// Author: morlovich@google.com (Maksim Orlovich),
// sligocki@google.com (Shawn Ligocki)
//
// This contains HtmlDetector, which tries to heuristically detect whether
// content a server claims to be HTML actually is HTML (it sometimes isn't).
#ifndef PAGESPEED_AUTOMATIC_HTML_DETECTOR_H_
#define PAGESPEED_AUTOMATIC_HTML_DETECTOR_H_
#include "base/logging.h"
#include "pagespeed/kernel/base/basictypes.h"
#include "pagespeed/kernel/base/string.h"
#include "pagespeed/kernel/base/string_util.h"
namespace net_instaweb {
// This class tries to heuristically detect whether something that claims to
// be HTML is likely to actually be HTML. For now, it merely looks at whether
// the first non-whitespace/non-BOM character is <.
//
// Typical usage:
// HtmlDetector detect_html_;
//
// if (!detect_html_.already_decided() &&
// detect_html_.ConsiderInput(data)) {
// GoogleString buffered;
// detect_html_.ReleaseBuffered(&buffered);
// if (detect_html_.probable_html()) {
// do html-specific bits with buffered
// } else {
// do non-html things with buffered
// }
// }
//
// if (detect_html_.already_decided()) {
// do appropriate things with data based on detect_html_.probable_html()
// }
// Heuristic detector that decides whether a stream of input looks like HTML.
// Callers feed input through ConsiderInput() until a decision is reached (or
// force one with ForceDecision()), then query probable_html().
class HtmlDetector {
public:
// Constructor and destructor are declared here and defined out of line.
// NOTE(review): the usage example calls already_decided() right after
// construction, so a fresh detector presumably starts undecided -- confirm
// against the implementation.
HtmlDetector();
~HtmlDetector();
// Processes the data, trying to determine if it's HTML or not. If there is
// enough evidence to make a decision, returns true.
//
// If true is returned, already_decided() will be true as well, and hence
// probable_html() will be accessible. The internally buffered data will not
// be changed by that call.
//
// If false is returned, data will be accumulated in the internal buffer,
// from which it can later be retrieved with ReleaseBuffered().
//
// Precondition: !already_decided()
bool ConsiderInput(const StringPiece& data);
// Returns true if we have seen enough input to make a guess as to whether
// it's HTML or not (or if a decision was forced via ForceDecision()).
bool already_decided() const { return already_decided_; }
// Returns the decision: true if the input is believed to be HTML.
//
// Precondition: already_decided() true (or ConsiderInput returning true).
// The precondition is checked with DCHECK.
bool probable_html() const {
DCHECK(already_decided_);
return probable_html_;
}
// Transfers any data that was buffered by ConsiderInput calls that returned
// false into *out_buffer. The old value of out_buffer is overwritten, and
// HtmlDetector's internal buffers are cleared.
void ReleaseBuffered(GoogleString* out_buffer);
// Forces already_decided() to true, and probable_html() to match is_html.
//
// Precondition: !already_decided()
void ForceDecision(bool is_html);
private:
// Input accumulated by ConsiderInput() calls that could not yet decide;
// handed to the caller by ReleaseBuffered().
GoogleString buffer_;
// Value reported by already_decided().
bool already_decided_;
bool probable_html_; // valid only if already_decided_.
// Copying and assignment are disabled through this macro.
DISALLOW_COPY_AND_ASSIGN(HtmlDetector);
};
} // namespace net_instaweb
#endif // PAGESPEED_AUTOMATIC_HTML_DETECTOR_H_