| %{ |
| // bot_checker.cc is automatically generated from bot_checker.gperf. |
| // Author: fangfei@google.com |
| |
| #include "pagespeed/kernel/http/bot_checker.h" |
| |
| #include "pagespeed/kernel/base/string_util.h" |
| |
| namespace net_instaweb { |
| %} |
| %compare-strncmp |
| %define class-name RobotDetect |
| %define lookup-function-name Lookup |
| %global-table |
| %includes |
| %language=C++ |
| %readonly-tables |
| |
| %% |
| ### These names are either application names or domain names of BOTs. |
| ### This is not a complete list of BOT names -- just the ones we want |
| ### to scan for now. |
| about.ask.com |
| A6-Indexer |
| Apache-HttpClient |
| Arachni |
| AsyncHttpClient |
| BackRub |
| BingPreview |
| expo9 |
| facebookexternalhit |
| facebookplatform |
| Feedfetcher-Google |
| Genieo |
| Google-StructuredDataTestingTool |
| HttpClient |
| Lycos |
| Mediapartners-Google |
| PHP |
| proximic |
| Python |
| Python-urllib |
| RPT-HTTPClient |
| Ruby |
| WordPress |
| Yahoo! |
| %% |
| |
| bool BotChecker::Lookup(const StringPiece& user_agent) { |
| // Check if the user agent is empty, or contains bot / crawl / spider. |
| if (user_agent.empty() || |
| FindIgnoreCase(user_agent, "bot") != StringPiece::npos || |
| FindIgnoreCase(user_agent, "crawl") != StringPiece::npos || |
| FindIgnoreCase(user_agent, "spider") != StringPiece::npos) { |
| return true; |
| } |
| // check whether the whole string is in database |
| if (RobotDetect::Lookup(user_agent.data(), user_agent.size()) != NULL) { |
| return true; |
| } |
| // get the application_name/domain_name/email |
| const char separator[] = " /,;+"; |
| StringPieceVector names; |
| SplitStringPieceToVector(user_agent, separator, &names, true); |
| for (int i = 0, n = names.size(); i < n; ++i) { |
| if (RobotDetect::Lookup(names[i].data(), |
| names[i].size()) != NULL) { |
| return true; |
| } |
| } |
| return false; |
| } |
| |
| } // namespace net_instaweb |