Merge branch 'derhecht-patch-2', closes #545
Includes the solution for #544 (closes #544)
NUTCH-2813 MoreIndexingFilter - can't parse erroneous date - 2019-07-03T10:28:14
diff --git a/conf/date-styles.txt.template b/conf/date-styles.txt.template
new file mode 100644
index 0000000..61ee0ac
--- /dev/null
+++ b/conf/date-styles.txt.template
@@ -0,0 +1,52 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Set of date/time formats used by the plugin index-more to parse dates
+# when filling the index field `lastModified'.
+#
+# Format: one date/time pattern per line, following the syntax defined in
+# https://docs.oracle.com/javase/7/docs/api/java/text/SimpleDateFormat.html.
+# Blank lines are ignored and comment lines start with `#':
+#
+# <date format> <CR>
+# <date format2> [<CR> <date format3> ...]
+#
+# Examples and currently used formats:
+#
+
+EEE MMM dd HH:mm:ss yyyy
+EEE MMM dd HH:mm:ss yyyy zzz
+EEE MMM dd HH:mm:ss zzz yyyy
+EEE, MMM dd HH:mm:ss yyyy zzz
+EEE, dd MMM yyyy HH:mm:ss zzz
+EEE,dd MMM yyyy HH:mm:ss zzz
+EEE, dd MMM yyyy HH:mm:sszzz
+EEE, dd MMM yyyy HH:mm:ss
+EEE, dd-MMM-yy HH:mm:ss zzz
+yyyy/MM/dd HH:mm:ss.SSS zzz
+yyyy/MM/dd HH:mm:ss.SSS
+yyyy/MM/dd HH:mm:ss zzz
+yyyy/MM/dd
+yyyy.MM.dd HH:mm:ss
+yyyy-MM-dd HH:mm
+MMM dd yyyy HH:mm:ss. zzz
+MMM dd yyyy HH:mm:ss zzz
+dd.MM.yyyy HH:mm:ss zzz
+dd MM yyyy HH:mm:ss zzz
+dd.MM.yyyy; HH:mm:ss
+dd.MM.yyyy HH:mm:ss
+dd.MM.yyyy zzz
+yyyy-MM-dd'T'HH:mm:ssXXX
+yyyy-MM-dd'T'HH:mm:ss
diff --git a/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java b/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
index 45b79b7..2a475c5 100644
--- a/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
+++ b/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
@@ -54,19 +54,26 @@
import org.apache.commons.lang.StringUtils;
import org.apache.commons.lang.time.DateUtils;
+import java.io.File;
+import java.net.URL;
+import java.util.List;
+import java.util.ArrayList;
+import org.apache.commons.io.FileUtils;
+import java.nio.charset.StandardCharsets;
+
/**
* Add (or reset) a few metaData properties as respective fields (if they are
* available), so that they can be accurately used within the search index.
- *
+ *
* 'lastModifed' is indexed to support query by date, 'contentLength' obtains
* content length from the HTTP header, 'type' field is indexed to support query
* by type and finally the 'title' field is an attempt to reset the title if a
* content-disposition hint exists. The logic is that such a presence is
* indicative that the content provider wants the filename therein to be used as
* the title.
- *
+ *
* Still need to make content-length searchable!
- *
+ *
* @author John Xing
*/
@@ -83,15 +90,30 @@
private boolean mapMimes = false;
private String mapFieldName;
+ /** Built-in date styles, used when date-styles.txt is not found; keep in sync with conf/date-styles.txt.template. */
+ private String[] defaultDateStyles = new String[] {
+ "EEE MMM dd HH:mm:ss yyyy", "EEE MMM dd HH:mm:ss yyyy zzz",
+ "EEE MMM dd HH:mm:ss zzz yyyy", "EEE, MMM dd HH:mm:ss yyyy zzz",
+ "EEE, dd MMM yyyy HH:mm:ss zzz", "EEE,dd MMM yyyy HH:mm:ss zzz",
+ "EEE, dd MMM yyyy HH:mm:sszzz", "EEE, dd MMM yyyy HH:mm:ss",
+ "EEE, dd-MMM-yy HH:mm:ss zzz", "yyyy/MM/dd HH:mm:ss.SSS zzz",
+ "yyyy/MM/dd HH:mm:ss.SSS", "yyyy/MM/dd HH:mm:ss zzz", "yyyy/MM/dd",
+ "yyyy.MM.dd HH:mm:ss", "yyyy-MM-dd HH:mm",
+ "MMM dd yyyy HH:mm:ss. zzz", "MMM dd yyyy HH:mm:ss zzz",
+ "dd.MM.yyyy HH:mm:ss zzz", "dd MM yyyy HH:mm:ss zzz",
+ "dd.MM.yyyy; HH:mm:ss", "dd.MM.yyyy HH:mm:ss", "dd.MM.yyyy zzz",
+ "yyyy-MM-dd'T'HH:mm:ssXXX", "yyyy-MM-dd'T'HH:mm:ss" };
+ private String[] dateStyles = null;
+
public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
CrawlDatum datum, Inlinks inlinks) throws IndexingException {
String url_s = url.toString();
addTime(doc, parse.getData(), url_s, datum);
- addLength(doc, parse.getData(), url_s);
+ addLength(doc, parse.getData());
addType(doc, parse.getData(), url_s, datum);
- resetTitle(doc, parse.getData(), url_s);
+ resetTitle(doc, parse.getData());
return doc;
}
@@ -126,43 +148,32 @@
private long getTime(String date, String url) {
long time = -1;
+
try {
time = HttpDateFormat.toLong(date);
} catch (ParseException e) {
// try to parse it as date in alternative format
try {
- Date parsedDate = DateUtils.parseDate(date, new String[] {
- "EEE MMM dd HH:mm:ss yyyy", "EEE MMM dd HH:mm:ss yyyy zzz",
- "EEE MMM dd HH:mm:ss zzz yyyy", "EEE, MMM dd HH:mm:ss yyyy zzz",
- "EEE, dd MMM yyyy HH:mm:ss zzz", "EEE,dd MMM yyyy HH:mm:ss zzz",
- "EEE, dd MMM yyyy HH:mm:sszzz", "EEE, dd MMM yyyy HH:mm:ss",
- "EEE, dd-MMM-yy HH:mm:ss zzz", "yyyy/MM/dd HH:mm:ss.SSS zzz",
- "yyyy/MM/dd HH:mm:ss.SSS", "yyyy/MM/dd HH:mm:ss zzz", "yyyy/MM/dd",
- "yyyy.MM.dd HH:mm:ss", "yyyy-MM-dd HH:mm",
- "MMM dd yyyy HH:mm:ss. zzz", "MMM dd yyyy HH:mm:ss zzz",
- "dd.MM.yyyy HH:mm:ss zzz", "dd MM yyyy HH:mm:ss zzz",
- "dd.MM.yyyy; HH:mm:ss", "dd.MM.yyyy HH:mm:ss", "dd.MM.yyyy zzz",
- "yyyy-MM-dd'T'HH:mm:ssXXX" });
+ Date parsedDate = DateUtils.parseDate(date, dateStyles);
time = parsedDate.getTime();
- // if (LOG.isWarnEnabled()) {
- // LOG.warn(url + ": parsed date: " + date +" to:"+time);
- // }
+ LOG.info(url + ": parsed date: " + date +" to: " + time);
} catch (Exception e2) {
if (LOG.isWarnEnabled()) {
LOG.warn(url + ": can't parse erroneous date: " + date);
}
}
}
+
return time;
}
// Add Content-Length
- private NutchDocument addLength(NutchDocument doc, ParseData data, String url) {
+ private NutchDocument addLength(NutchDocument doc, ParseData data) {
String contentLength = data.getMeta(Response.CONTENT_LENGTH);
if (contentLength != null) {
// NUTCH-1010 ContentLength not trimmed
- String trimmed = contentLength.toString().trim();
+ String trimmed = contentLength.trim();
if (!trimmed.isEmpty())
doc.add("contentLength", trimmed);
}
@@ -183,7 +194,7 @@
* all case insensitive. The query filter is implemented in
* {@link TypeQueryFilter}.
* </p>
- *
+ *
* @param doc
* @param data
* @param url
@@ -196,10 +207,13 @@
Writable tcontentType = datum.getMetaData().get(
new Text(Response.CONTENT_TYPE));
+
if (tcontentType != null) {
contentType = tcontentType.toString();
- } else
+ } else {
contentType = data.getMeta(Response.CONTENT_TYPE);
+ }
+
if (contentType == null) {
// Note by Jerome Charron on 20050415:
// Content Type not solved by a previous plugin
@@ -224,14 +238,11 @@
}
// Check if we have to map mime types
- if (mapMimes) {
- // Check if the current mime is mapped
- if (mimeMap.containsKey(mimeType)) {
- if (mapFieldName != null) {
- doc.add(mapFieldName, mimeMap.get(mimeType));
- } else {
- mimeType = mimeMap.get(mimeType);
- }
+ if (mapMimes && mimeMap.containsKey(mimeType)) {
+ if (mapFieldName != null) {
+ doc.add(mapFieldName, mimeMap.get(mimeType));
+ } else {
+ mimeType = mimeMap.get(mimeType);
}
}
@@ -255,7 +266,7 @@
/**
* Utility method for splitting mime type into type and subtype.
- *
+ *
* @param mimeType
* @return
*/
@@ -272,7 +283,7 @@
// Content-Disposition: inline; filename="foo.ppt"
private Configuration conf;
- static Pattern patterns[] = { null, null };
+ static Pattern[] patterns = { null, null };
static {
try {
@@ -284,7 +295,7 @@
}
}
- private NutchDocument resetTitle(NutchDocument doc, ParseData data, String url) {
+ private NutchDocument resetTitle(NutchDocument doc, ParseData data) {
String contentDisposition = data.getMeta(Metadata.CONTENT_DISPOSITION);
if (contentDisposition == null || doc.getFieldValue("title") != null)
return doc;
@@ -316,6 +327,29 @@
LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
}
}
+
+ // Use the built-in styles unless date-styles.txt is found and read successfully.
+ dateStyles = defaultDateStyles;
+ URL dateStylesResource = conf.getResource("date-styles.txt");
+ if (dateStylesResource == null) {
+ LOG.warn("Can't find resource: date-styles.txt - Defaults will be used.");
+ } else {
+ try {
+ List<String> usedLines = new ArrayList<String>();
+ for (String dateStyle: FileUtils.readLines(new File(dateStylesResource.getFile()),
+ StandardCharsets.US_ASCII)) {
+ // Trim before testing so indented comment lines are skipped as well.
+ String trimmed = StringUtils.trim(dateStyle);
+ if (StringUtils.isBlank(trimmed) || trimmed.startsWith("#")) {
+ continue;
+ }
+ usedLines.add(trimmed);
+ }
+ dateStyles = usedLines.toArray(new String[0]);
+ } catch (IOException e) {
+ LOG.error("Failed to load resource: date-styles.txt - Defaults will be used.", e);
+ }
+ }
}
public Configuration getConf() {
@@ -324,16 +358,19 @@
private void readConfiguration() throws IOException {
LOG.info("Reading content type mappings from file contenttype-mapping.txt");
- BufferedReader reader = new BufferedReader(
- conf.getConfResourceAsReader("contenttype-mapping.txt"));
- String line;
- String parts[];
- boolean formatWarningShown = false;
+ try (BufferedReader reader = new BufferedReader(
+ conf.getConfResourceAsReader("contenttype-mapping.txt"))) {
+ String line;
+ String[] parts;
+ boolean formatWarningShown = false;
- mimeMap = new HashMap<String, String>();
+ mimeMap = new HashMap<String, String>();
- while ((line = reader.readLine()) != null) {
- if (StringUtils.isNotBlank(line) && !line.startsWith("#")) {
+ while ((line = reader.readLine()) != null) {
+ if (StringUtils.isBlank(line) || line.startsWith("#")) {
+ continue;
+ }
+
line = line.trim();
parts = line.split("\t");