Merge branch 'derhecht-patch-2', closes #545

Includes solution for (closes #544):
NUTCH-2813 MoreIndexingFilter - can't parse erroneous date - 2019-07-03T10:28:14

commit: ae844b6cfd5ca4b1c3a28b93a9ceccb1ff393531 [log] [tgz]
author: Sebastian Nagel <snagel@apache.org> Sun Aug 16 20:59:33 2020 +0200
committer: Sebastian Nagel <snagel@apache.org> Sun Aug 16 20:59:33 2020 +0200
tree: d8e83dfc7ffec872fc315c9d3054f0f45f4c922a
parent: 466cac5ddaee7e23827274a59d2cc3eec48ebcff [diff]
parent: 69deffa67d76eb61ddabe29d54575c5b6635a4e2 [diff]
diff --git a/conf/date-styles.txt.template b/conf/date-styles.txt.template
new file mode 100644
index 0000000..61ee0ac
--- /dev/null
+++ b/conf/date-styles.txt.template

@@ -0,0 +1,52 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Set of date time formats
+# used by the plugin index-more when filling the index field `lastModified'.
+#
+# Format (one date-time pattern per line, using the pattern syntax defined in
+# https://docs.oracle.com/javase/7/docs/api/java/text/SimpleDateFormat.html;
+# blank lines and lines starting with `#' are ignored):
+#
+#  <date format> <newline>
+#  <date format2> [<newline> <date format3> ...]
+#
+# Examples and currently used formats:
+#
+
+EEE MMM dd HH:mm:ss yyyy
+EEE MMM dd HH:mm:ss yyyy zzz
+EEE MMM dd HH:mm:ss zzz yyyy
+EEE, MMM dd HH:mm:ss yyyy zzz
+EEE, dd MMM yyyy HH:mm:ss zzz
+EEE,dd MMM yyyy HH:mm:ss zzz
+EEE, dd MMM yyyy HH:mm:sszzz
+EEE, dd MMM yyyy HH:mm:ss
+EEE, dd-MMM-yy HH:mm:ss zzz
+yyyy/MM/dd HH:mm:ss.SSS zzz
+yyyy/MM/dd HH:mm:ss.SSS
+yyyy/MM/dd HH:mm:ss zzz
+yyyy/MM/dd
+yyyy.MM.dd HH:mm:ss
+yyyy-MM-dd HH:mm
+MMM dd yyyy HH:mm:ss. zzz
+MMM dd yyyy HH:mm:ss zzz
+dd.MM.yyyy HH:mm:ss zzz
+dd MM yyyy HH:mm:ss zzz
+dd.MM.yyyy; HH:mm:ss
+dd.MM.yyyy HH:mm:ss
+dd.MM.yyyy zzz
+yyyy-MM-dd'T'HH:mm:ssXXX
+yyyy-MM-dd'T'HH:mm:ss

diff --git a/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java b/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
index 45b79b7..2a475c5 100644
--- a/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
+++ b/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java

@@ -54,19 +54,26 @@
 import org.apache.commons.lang.StringUtils;
 import org.apache.commons.lang.time.DateUtils;
 
+import java.io.File;
+import java.net.URL;
+import java.util.List;
+import java.util.ArrayList;
+import org.apache.commons.io.FileUtils;
+import java.nio.charset.StandardCharsets;
+
 /**
  * Add (or reset) a few metaData properties as respective fields (if they are
  * available), so that they can be accurately used within the search index.
- * 
+ *
  * 'lastModifed' is indexed to support query by date, 'contentLength' obtains
  * content length from the HTTP header, 'type' field is indexed to support query
  * by type and finally the 'title' field is an attempt to reset the title if a
  * content-disposition hint exists. The logic is that such a presence is
  * indicative that the content provider wants the filename therein to be used as
  * the title.
- * 
+ *
  * Still need to make content-length searchable!
- * 
+ *
  * @author John Xing
  */
 
@@ -83,15 +90,30 @@
   private boolean mapMimes = false;
   private String mapFieldName;
 
+  /** Built-in date patterns, used when resource date-styles.txt is not found. */
+  private String[] defaultDateStyles = new String[] {
+            "EEE MMM dd HH:mm:ss yyyy", "EEE MMM dd HH:mm:ss yyyy zzz",
+            "EEE MMM dd HH:mm:ss zzz yyyy", "EEE, MMM dd HH:mm:ss yyyy zzz",
+            "EEE, dd MMM yyyy HH:mm:ss zzz", "EEE,dd MMM yyyy HH:mm:ss zzz",
+            "EEE, dd MMM yyyy HH:mm:sszzz", "EEE, dd MMM yyyy HH:mm:ss",
+            "EEE, dd-MMM-yy HH:mm:ss zzz", "yyyy/MM/dd HH:mm:ss.SSS zzz",
+            "yyyy/MM/dd HH:mm:ss.SSS", "yyyy/MM/dd HH:mm:ss zzz", "yyyy/MM/dd",
+            "yyyy.MM.dd HH:mm:ss", "yyyy-MM-dd HH:mm",
+            "MMM dd yyyy HH:mm:ss. zzz", "MMM dd yyyy HH:mm:ss zzz",
+            "dd.MM.yyyy HH:mm:ss zzz", "dd MM yyyy HH:mm:ss zzz",
+            "dd.MM.yyyy; HH:mm:ss", "dd.MM.yyyy HH:mm:ss", "dd.MM.yyyy zzz",
+            "yyyy-MM-dd'T'HH:mm:ssXXX", "yyyy-MM-dd'T'HH:mm:ss" };
+  private String[] dateStyles = null;
+
   public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
       CrawlDatum datum, Inlinks inlinks) throws IndexingException {
 
     String url_s = url.toString();
 
     addTime(doc, parse.getData(), url_s, datum);
-    addLength(doc, parse.getData(), url_s);
+    addLength(doc, parse.getData());
     addType(doc, parse.getData(), url_s, datum);
-    resetTitle(doc, parse.getData(), url_s);
+    resetTitle(doc, parse.getData());
 
     return doc;
   }
@@ -126,43 +148,32 @@
 
   private long getTime(String date, String url) {
     long time = -1;
+
     try {
       time = HttpDateFormat.toLong(date);
     } catch (ParseException e) {
       // try to parse it as date in alternative format
       try {
-        Date parsedDate = DateUtils.parseDate(date, new String[] {
-            "EEE MMM dd HH:mm:ss yyyy", "EEE MMM dd HH:mm:ss yyyy zzz",
-            "EEE MMM dd HH:mm:ss zzz yyyy", "EEE, MMM dd HH:mm:ss yyyy zzz",
-            "EEE, dd MMM yyyy HH:mm:ss zzz", "EEE,dd MMM yyyy HH:mm:ss zzz",
-            "EEE, dd MMM yyyy HH:mm:sszzz", "EEE, dd MMM yyyy HH:mm:ss",
-            "EEE, dd-MMM-yy HH:mm:ss zzz", "yyyy/MM/dd HH:mm:ss.SSS zzz",
-            "yyyy/MM/dd HH:mm:ss.SSS", "yyyy/MM/dd HH:mm:ss zzz", "yyyy/MM/dd",
-            "yyyy.MM.dd HH:mm:ss", "yyyy-MM-dd HH:mm",
-            "MMM dd yyyy HH:mm:ss. zzz", "MMM dd yyyy HH:mm:ss zzz",
-            "dd.MM.yyyy HH:mm:ss zzz", "dd MM yyyy HH:mm:ss zzz",
-            "dd.MM.yyyy; HH:mm:ss", "dd.MM.yyyy HH:mm:ss", "dd.MM.yyyy zzz",
-            "yyyy-MM-dd'T'HH:mm:ssXXX" });
+        Date parsedDate = DateUtils.parseDate(date, dateStyles);
         time = parsedDate.getTime();
-        // if (LOG.isWarnEnabled()) {
-        // LOG.warn(url + ": parsed date: " + date +" to:"+time);
-        // }
+        LOG.info(url + ": parsed date: " + date +" to: " + time);
       } catch (Exception e2) {
         if (LOG.isWarnEnabled()) {
           LOG.warn(url + ": can't parse erroneous date: " + date);
         }
       }
     }
+
     return time;
   }
 
   // Add Content-Length
-  private NutchDocument addLength(NutchDocument doc, ParseData data, String url) {
+  private NutchDocument addLength(NutchDocument doc, ParseData data) {
     String contentLength = data.getMeta(Response.CONTENT_LENGTH);
 
     if (contentLength != null) {
       // NUTCH-1010 ContentLength not trimmed
-      String trimmed = contentLength.toString().trim();
+      String trimmed = contentLength.trim();
       if (!trimmed.isEmpty())
         doc.add("contentLength", trimmed);
     }
@@ -183,7 +194,7 @@
    * all case insensitive. The query filter is implemented in
    * {@link TypeQueryFilter}.
    * </p>
-   * 
+   *
    * @param doc
    * @param data
    * @param url
@@ -196,10 +207,13 @@
 
     Writable tcontentType = datum.getMetaData().get(
         new Text(Response.CONTENT_TYPE));
+
     if (tcontentType != null) {
       contentType = tcontentType.toString();
-    } else
+    } else {
       contentType = data.getMeta(Response.CONTENT_TYPE);
+    }
+
     if (contentType == null) {
       // Note by Jerome Charron on 20050415:
       // Content Type not solved by a previous plugin
@@ -224,14 +238,11 @@
     }
 
     // Check if we have to map mime types
-    if (mapMimes) {
-      // Check if the current mime is mapped
-      if (mimeMap.containsKey(mimeType)) {
-        if (mapFieldName != null) {
-          doc.add(mapFieldName, mimeMap.get(mimeType));
-        } else {
-          mimeType = mimeMap.get(mimeType);
-        }
+    if (mapMimes && mimeMap.containsKey(mimeType)) {
+      if (mapFieldName != null) {
+        doc.add(mapFieldName, mimeMap.get(mimeType));
+      } else {
+        mimeType = mimeMap.get(mimeType);
       }
     }
 
@@ -255,7 +266,7 @@
 
   /**
    * Utility method for splitting mime type into type and subtype.
-   * 
+   *
    * @param mimeType
    * @return
    */
@@ -272,7 +283,7 @@
   // Content-Disposition: inline; filename="foo.ppt"
   private Configuration conf;
 
-  static Pattern patterns[] = { null, null };
+  static Pattern[] patterns = { null, null };
 
   static {
     try {
@@ -284,7 +295,7 @@
     }
   }
 
-  private NutchDocument resetTitle(NutchDocument doc, ParseData data, String url) {
+  private NutchDocument resetTitle(NutchDocument doc, ParseData data) {
     String contentDisposition = data.getMeta(Metadata.CONTENT_DISPOSITION);
     if (contentDisposition == null || doc.getFieldValue("title") != null)
       return doc;
@@ -316,6 +327,29 @@
         LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
       }
     }
+
+    URL dateStylesResource = conf.getResource("date-styles.txt");
+    if (dateStylesResource == null) {
+      dateStyles = defaultDateStyles;
+      LOG.warn("Can't find resource: date-styles.txt - Defaults will be used.");
+    } else {
+      try {
+        List<String> usedLines = new ArrayList<String>();
+        for (String dateStyle: FileUtils.readLines(new File(dateStylesResource.getFile()),
+            StandardCharsets.US_ASCII)) {
+          if (StringUtils.isBlank(dateStyle) || dateStyle.startsWith("#")) {
+            continue;
+          }
+          usedLines.add(StringUtils.trim(dateStyle));
+        }
+
+        dateStyles = usedLines.toArray(new String[0]);
+      } catch (IOException e) {
+        // Fall back to the defaults: a null pattern array makes every parse in getTime() fail.
+        dateStyles = defaultDateStyles;
+        LOG.error("Failed to load resource: date-styles.txt", e);
+      }
+    }
   }
 
   public Configuration getConf() {
@@ -324,16 +358,19 @@
 
   private void readConfiguration() throws IOException {
     LOG.info("Reading content type mappings from file contenttype-mapping.txt");
-    BufferedReader reader = new BufferedReader(
-        conf.getConfResourceAsReader("contenttype-mapping.txt"));
-    String line;
-    String parts[];
-    boolean formatWarningShown = false;
+    try (BufferedReader reader = new BufferedReader(
+        conf.getConfResourceAsReader("contenttype-mapping.txt"))) {
+      String line;
+      String[] parts;
+      boolean formatWarningShown = false;
 
-    mimeMap = new HashMap<String, String>();
+      mimeMap = new HashMap<String, String>();
 
-    while ((line = reader.readLine()) != null) {
-      if (StringUtils.isNotBlank(line) && !line.startsWith("#")) {
+      while ((line = reader.readLine()) != null) {
+        if (StringUtils.isBlank(line) || line.startsWith("#")) {
+          continue;
+        }
+
         line = line.trim();
         parts = line.split("\t");
commit	ae844b6cfd5ca4b1c3a28b93a9ceccb1ff393531	[log] [tgz]
author	Sebastian Nagel <snagel@apache.org>	Sun Aug 16 20:59:33 2020 +0200
committer	Sebastian Nagel <snagel@apache.org>	Sun Aug 16 20:59:33 2020 +0200
tree	d8e83dfc7ffec872fc315c9d3054f0f45f4c922a
parent	466cac5ddaee7e23827274a59d2cc3eec48ebcff [diff]
parent	69deffa67d76eb61ddabe29d54575c5b6635a4e2 [diff]