blob: 4ad3b9c0a593f1632418a3876b4d39a9bb16649d [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. The ASF licenses this file to You
* under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License. For additional information regarding
* copyright in this work, please see the NOTICE file in the top level
* directory of this distribution.
*/
/* Created on Nov 11, 2003 */
package org.apache.roller.weblogger.util;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.File;
import java.io.FileOutputStream;
import java.net.HttpURLConnection;
import java.net.URL;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.LinkedList;
import java.util.List;
import java.util.StringTokenizer;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;
import org.apache.roller.util.RollerConstants;
import org.apache.roller.weblogger.config.WebloggerConfig;
import org.apache.commons.lang3.StringUtils;
import org.apache.roller.util.DateUtil;
/**
* Loads MT-Bannedwordslist style bannedwordslist from disk and allows callers to test
* strings against the bannedwordslist and (optionally) addition bannedwordslists.
* <br />
* First looks for bannedwordslist.txt in uploads directory, than in classpath
* as /bannedwordslist.txt. Download from web feature disabled.
* <br />
* Bannedwordslist is formatted one entry per line.
* Any line that begins with # is considered to be a comment.
* Any line that begins with ( is considered to be a regex expression.
* <br />
* For more information on the (discontinued) MT-Bannedwordslist service:
* http://www.jayallen.org/projects/mt-bannedwordslist.
*
* @author Lance Lavandowska
* @author Allen Gilliland
*/
public final class Bannedwordslist {
private static Log mLogger = LogFactory.getLog(Bannedwordslist.class);
private static Bannedwordslist bannedwordslist;
private static final String BANNEDWORDSLIST_FILE = "bannedwordslist.txt";
private static final String LAST_UPDATE_STR = "Last update:";
/** We no longer have a bannedwordslist update URL */
private static final String BANNEDWORDSLIST_URL = null;
private Date lastModified = null;
private List<String> bannedwordslistStr = new LinkedList<>();
private List<Pattern> bannedwordslistRegex = new LinkedList<>();
// setup our singleton at class loading time
static {
mLogger.info("Initializing MT Bannedwordslist");
bannedwordslist = new Bannedwordslist();
bannedwordslist.loadBannedwordslistFromFile(null);
}
/** Hide constructor */
private Bannedwordslist() {
}
/** Singleton factory method. */
public static Bannedwordslist getBannedwordslist() {
return bannedwordslist;
}
/** Non-Static update method. */
public void update() {
if (BANNEDWORDSLIST_URL != null) {
boolean bannedwordslist_updated = this.downloadBannedwordslist();
if (bannedwordslist_updated) {
this.loadBannedwordslistFromFile(null);
}
}
}
/** Download the MT bannedwordslist from the web to our uploads directory. */
private boolean downloadBannedwordslist() {
boolean bannedwordslistUpdated = false;
try {
mLogger.debug("Attempting to download MT bannedwordslist");
URL url = new URL(BANNEDWORDSLIST_URL);
HttpURLConnection connection =
(HttpURLConnection) url.openConnection();
// after spending way too much time debugging i've discovered
// that the bannedwordslist server is selective based on the User-Agent
// header. without this header set i always get a 403 response :(
connection.setRequestProperty("User-Agent", "Mozilla/5.0");
if (this.lastModified != null) {
connection.setRequestProperty("If-Modified-Since",
DateUtil.formatRfc822(this.lastModified));
}
int responseCode = connection.getResponseCode();
mLogger.debug("HttpConnection response = "+responseCode);
// did the connection return NotModified? If so, no need to parse
if (responseCode == HttpURLConnection.HTTP_NOT_MODIFIED) {
mLogger.debug("MT bannedwordslist site says we are current");
return false;
}
// did the connection return a LastModified header?
long lastModifiedLong =
connection.getHeaderFieldDate("Last-Modified", -1);
// if the file is newer than our current then we need do update it
if (responseCode == HttpURLConnection.HTTP_OK &&
(this.lastModified == null ||
this.lastModified.getTime() < lastModifiedLong)) {
mLogger.debug("my last modified = " + (this.lastModified == null ? "(null)" :
this.lastModified.getTime()));
mLogger.debug("MT last modified = " + lastModifiedLong);
// save the new bannedwordslist
InputStream instream = connection.getInputStream();
String uploadDir = WebloggerConfig.getProperty("uploads.dir");
String path = uploadDir + File.separator + BANNEDWORDSLIST_FILE;
FileOutputStream outstream = new FileOutputStream(path);
mLogger.debug("writing updated MT bannedwordslist to "+path);
// read from url and write to file
byte[] buf = new byte[RollerConstants.FOUR_KB_IN_BYTES];
int length;
while((length = instream.read(buf)) > 0) {
outstream.write(buf, 0, length);
}
outstream.close();
instream.close();
bannedwordslistUpdated = true;
mLogger.debug("MT bannedwordslist download completed.");
} else {
mLogger.debug("bannedwordslist *NOT* saved, assuming we are current");
}
} catch (Exception e) {
mLogger.error("error downloading bannedwordslist", e);
}
return bannedwordslistUpdated;
}
/**
* Load the MT bannedwordslist from the file system.
* We look for a previously downloaded version of the bannedwordslist first and
* if it's not found then we load the default bannedwordslist packed with Roller.
* Only public for purposes of unit testing.
*/
public void loadBannedwordslistFromFile(String bannedwordslistFilePath) {
InputStream txtStream;
try {
String path = bannedwordslistFilePath;
if (path == null) {
String uploadDir = WebloggerConfig.getProperty("uploads.dir");
path = uploadDir + File.separator + BANNEDWORDSLIST_FILE;
}
File bannedwordslistFile = new File(path);
// check our lastModified date to see if we need to re-read the file
if (this.lastModified != null &&
this.lastModified.getTime() >= bannedwordslistFile.lastModified()) {
mLogger.debug("Bannedwordslist is current, no need to load again");
return;
} else {
this.lastModified = new Date(bannedwordslistFile.lastModified());
}
txtStream = new FileInputStream(bannedwordslistFile);
mLogger.info("Loading bannedwordslist from "+path);
} catch (Exception e) {
// Roller keeps a copy in the webapp just in case
txtStream = getClass().getResourceAsStream("/bannedwordslist.txt");
mLogger.warn(
"Couldn't find downloaded bannedwordslist, loaded bannedwordslist.txt from classpath instead");
}
if (txtStream != null) {
readFromStream(txtStream, false);
} else {
mLogger.error("Couldn't load a bannedwordslist file from anywhere, "
+ "this means bannedwordslist checking is disabled for now.");
}
mLogger.info("Number of bannedwordslist string rules: "+bannedwordslistStr.size());
mLogger.info("Number of bannedwordslist regex rules: "+bannedwordslistRegex.size());
}
/**
* Read in the InputStream for rules.
* @param txtStream stream to read from
*/
private String readFromStream(InputStream txtStream, boolean saveStream) {
String line;
StringBuilder buf = new StringBuilder();
BufferedReader in = null;
try {
in = new BufferedReader(
new InputStreamReader( txtStream, "UTF-8" ) );
while ((line = in.readLine()) != null) {
if (line.startsWith("#")) {
readComment(line);
} else {
readRule(line);
}
if (saveStream) {
buf.append(line).append("\n");
}
}
} catch (Exception e) {
mLogger.error(e);
} finally {
try {
if (in != null) {
in.close();
}
} catch (IOException e1) {
mLogger.error(e1);
}
}
return buf.toString();
}
private void readRule(String str) {
// check for bad condition
if (StringUtils.isEmpty(str)) {
return;
}
String rule = str.trim();
// line has a comment?
if (str.indexOf('#') > 0) {
int commentLoc = str.indexOf('#');
// strip comment
rule = str.substring(0, commentLoc-1).trim();
}
// regex rule?
if (rule.indexOf( '(' ) > -1) {
// pre-compile patterns since they will be frequently used
bannedwordslistRegex.add(Pattern.compile(rule));
} else if (StringUtils.isNotEmpty(rule)) {
bannedwordslistStr.add(rule);
}
}
/** Read comment and try to parse out "Last update" value */
private void readComment(String str) {
int lastUpdatePos = str.indexOf(LAST_UPDATE_STR);
if (lastUpdatePos > -1) {
str = str.substring(lastUpdatePos + LAST_UPDATE_STR.length());
str = str.trim();
try {
SimpleDateFormat sdf = new SimpleDateFormat("yyyy/MM/dd HH:mm:ss");
lastModified = DateUtil.parse(str, sdf);
} catch (ParseException e) {
mLogger.debug("ParseException reading " + str);
}
}
}
/**
* Does the String argument match any of the rules in the built-in bannedwordslist?
*/
public boolean isBannedwordslisted(String str) {
return isBannedwordslisted(str, null, null);
}
/**
* Does the String argument match any of the rules in the built-in bannedwordslist
* plus additional bannedwordslists provided by caller?
* @param str String to be checked against bannedwordslist
* @param moreStringRules Additional string rules to consider
* @param moreRegexRules Additional regex rules to consider
*/
public boolean isBannedwordslisted(
String str, List<String> moreStringRules, List<Pattern> moreRegexRules) {
if (str == null || StringUtils.isEmpty(str)) {
return false;
}
// First iterate over bannedwordslist, doing indexOf.
// Then iterate over bannedwordslistRegex and test.
// As soon as there is a hit in either case return true
// test plain String.indexOf
List<String> stringRules = bannedwordslistStr;
if (moreStringRules != null && !moreStringRules.isEmpty()) {
stringRules = new ArrayList<>();
stringRules.addAll(moreStringRules);
stringRules.addAll(bannedwordslistStr);
}
if (testStringRules(str, stringRules)) {
return true;
}
// test regex bannedwordslisted
List<Pattern> regexRules = bannedwordslistRegex;
if (moreRegexRules != null && !moreRegexRules.isEmpty()) {
regexRules = new ArrayList<>();
regexRules.addAll(moreRegexRules);
regexRules.addAll(bannedwordslistRegex);
}
return testRegExRules(str, regexRules);
}
/**
* Test string only against rules provided by caller, NOT against built-in bannedwordslist.
* @param str String to be checked against rules
* @param stringRules String rules to consider
* @param regexRules Regex rules to consider
*/
public static boolean matchesRulesOnly(
String str, List<String> stringRules, List<Pattern> regexRules) {
return testStringRules(str, stringRules) || testRegExRules(str, regexRules);
}
/** Test String against the RegularExpression rules. */
private static boolean testRegExRules(String str, List<Pattern> regexRules) {
for (Pattern testPattern : regexRules) {
// want to see what it is matching on, but only in debug mode
if (mLogger.isDebugEnabled()) {
Matcher matcher = testPattern.matcher(str);
if (matcher.find()) {
mLogger.debug(matcher.group()
+ " matched by " + testPattern.pattern());
return true;
}
} else {
if (testPattern.matcher(str).find()) {
return true;
}
}
}
return false;
}
/**
* Tests the source text against the String rules. Each String rule is
* first treated as a word-boundary, case insensitive regular expression.
* If a PatternSyntaxException is encountered, a simple contains test
* is performed.
*
* @param source The text in which to apply the matching rules.
* @param rules A list a simple matching rules.
*
* @return true if a match was found, otherwise false
*/
private static boolean testStringRules(String source, List rules) {
boolean matches = false;
for (Object ruleObj : rules) {
String rule;
rule = (String) ruleObj;
try {
StringBuilder patternBuilder;
patternBuilder = new StringBuilder();
patternBuilder.append("\\b(");
patternBuilder.append(rule);
patternBuilder.append(")\\b");
Pattern pattern;
pattern = Pattern.compile(patternBuilder.toString(),
Pattern.CASE_INSENSITIVE);
Matcher matcher;
matcher = pattern.matcher(source);
matches = matcher.find();
if (matches) {
break;
}
}
catch (PatternSyntaxException e) {
matches = source.contains(rule);
if (matches) {
break;
}
}
finally {
if (matches && mLogger.isDebugEnabled()) {
// Log the matched rule in debug mode
mLogger.debug("matched:" + rule + ":");
}
}
}
return matches;
}
/** Utility method to populate lists based a bannedwordslist in string form */
public static void populateSpamRules(
String bannedwordslist, List<String> stringRules, List<Pattern> regexRules, String addendum) {
String weblogWords = bannedwordslist;
weblogWords = (weblogWords == null) ? "" : weblogWords;
String siteWords = (addendum != null) ? addendum : "";
StringTokenizer toker = new StringTokenizer(siteWords + "\n" + weblogWords, "\n");
while (toker.hasMoreTokens()) {
String token = toker.nextToken().trim();
if (token.startsWith("#")) {
continue;
}
if (token.startsWith("(")) {
regexRules.add(Pattern.compile(token));
} else {
stringRules.add(token);
}
}
}
/** Return pretty list of String and RegEx rules. */
@Override
public String toString() {
String val = "bannedwordslist " + bannedwordslistStr;
val += "\nRegex bannedwordslist " + bannedwordslistRegex;
return val;
}
}