Merge pull request #551 from sebastian-nagel/NUTCH-2823

NUTCH-2823 IllegalStateException in IndexWriters.describe() when vali…
diff --git a/build.xml b/build.xml
index bbe4aaf..67a3aa2 100644
--- a/build.xml
+++ b/build.xml
@@ -15,7 +15,11 @@
  See the License for the specific language governing permissions and
  limitations under the License.
 -->
-<project name="${name}" default="runtime" xmlns:ivy="antlib:org.apache.ivy.ant" xmlns:artifact="antlib:org.apache.maven.artifact.ant">
+<project name="${name}" default="runtime"
+         xmlns:ivy="antlib:org.apache.ivy.ant"
+         xmlns:artifact="antlib:org.apache.maven.artifact.ant"
+         xmlns:rat="antlib:org.apache.rat.anttasks"
+         xmlns="antlib:org.apache.tools.ant">
 
   <!-- Load all the default properties, and any the user wants    -->
   <!-- to contribute (without having to type -D or edit this file -->
@@ -33,10 +37,16 @@
   <property name="maven-javadoc-jar" value="${release.dir}/${artifactId}-${version}-javadoc.jar" />
   <property name="maven-sources-jar" value="${release.dir}/${artifactId}-${version}-sources.jar" />
 
+  <property name="dependency-check.home" value="${ivy.dir}/dependency-check-ant/"/>
+
   <property name="spotbugs.version" value="4.1.1" />
   <property name="spotbugs.home" value="${basedir}/lib/spotbugs-${spotbugs.version}" />
   <property name="spotbugs.jar" value="${spotbugs.home}/lib/spotbugs-ant.jar" />
 
+  <property name="apache-rat.version" value="0.13" />
+  <property name="apache-rat.home" value="${ivy.dir}/apache-rat-${apache-rat.version}" />
+  <property name="apache-rat.jar" value="${apache-rat.home}/apache-rat-${apache-rat.version}.jar" />
+
   <!-- the normal classpath -->
   <path id="classpath">
     <pathelement location="${build.classes}"/>
@@ -633,7 +643,6 @@
   <!--   https://jeremylong.github.io/DependencyCheck/dependency-check-ant/index.html     -->
   <!-- get http://dl.bintray.com/jeremy-long/owasp/dependency-check-ant-3.3.2-release.zip -->
   <!-- and unzip in directory ./ivy/                                                      -->
-  <property name="dependency-check.home" value="${ivy.dir}/dependency-check-ant/"/>
   <path id="dependency-check.path">
     <pathelement location="${dependency-check.home}/dependency-check-ant.jar"/>
     <fileset dir="${dependency-check.home}/lib" erroronmissingdir="false">
@@ -1003,17 +1012,35 @@
   <!-- ================================================================== -->
   <!-- RAT targets                                                        -->
   <!-- ================================================================== -->
-  <target name="rat-sources-typedef" description="--> run RAT antlib task">
-    <typedef resource="org/apache/rat/anttasks/antlib.xml" >
-      <classpath>
-        <fileset dir="." includes="rat*.jar"/>
-      </classpath>
-    </typedef>
+  <target name="apache-rat-download" description="--> download Apache Rat jar">
+    <available file="${apache-rat.jar}" property="apache-rat.jar.found"/>
+    <antcall target="apache-rat-download-unchecked"/>
   </target>
 
-  <target name="rat-sources" depends="rat-sources-typedef"
+  <target name="apache-rat-download-unchecked" unless="apache-rat.jar.found"
+          description="--> downloads the Apache Rat jar">
+    <get src="https://www.apache.org/dist/creadur/apache-rat-${apache-rat.version}/apache-rat-${apache-rat.version}-bin.tar.gz"
+         dest="${ivy.dir}/apache-rat-${apache-rat.version}-bin.tar.gz" usetimestamp="false" />
+
+    <untar src="${ivy.dir}/apache-rat-${apache-rat.version}-bin.tar.gz"
+           dest="${ivy.dir}/" compression="gzip">
+    </untar>
+
+    <delete file="${ivy.dir}/apache-rat-${apache-rat.version}-bin.tar.gz" />
+  </target>
+
+  <taskdef
+      uri="antlib:org.apache.rat.anttasks"
+      resource="org/apache/rat/anttasks/antlib.xml">
+    <classpath>
+      <pathelement location="${apache-rat.jar}" />
+    </classpath>
+  </taskdef>
+
+  <target name="rat-sources" depends="init, apache-rat-download"
     description="--> runs RAT tasks over src/java">
-    <rat:report xmlns:rat="antlib:org.apache.rat.anttasks">
+    <rat:report
+        reportFile="${build.dir}/apache-rat-report.txt">
       <fileset dir="src">
         <include name="java/**/*"/>
         <include name="plugin/**/src/**/*"/>
diff --git a/src/java/org/apache/nutch/net/protocols/HttpDateFormat.java b/src/java/org/apache/nutch/net/protocols/HttpDateFormat.java
index fbd45a2..f30fb20 100644
--- a/src/java/org/apache/nutch/net/protocols/HttpDateFormat.java
+++ b/src/java/org/apache/nutch/net/protocols/HttpDateFormat.java
@@ -24,48 +24,35 @@
 import java.text.ParseException;
 
 /**
- * class to handle HTTP dates.
+ * Parse and format HTTP dates in HTTP headers, e.g., used to fill the
+ * &quot;If-Modified-Since&quot; request header field.
  * 
- * Modified from FastHttpDateFormat.java in jakarta-tomcat.
+ * HTTP dates use Greenwich Mean Time (GMT) as time zone and a date format like:
  * 
- * @author John Xing
+ * <pre>
+ * Sun, 06 Nov 1994 08:49:37 GMT
+ * </pre>
+ * 
+ * See <a href=
+ * "https://www.w3.org/Protocols/rfc2616/rfc2616-sec3.html#sec3.3.1">sec. 3.3.1
+ * in RFC 2616</a> and
+ * <a href="https://tools.ietf.org/html/rfc7231#section-7.1.1.1">sec. 7.1.1.1 in
+ * RFC 7231</a>.
  */
 public class HttpDateFormat {
 
   protected static SimpleDateFormat format = new SimpleDateFormat(
       "EEE, dd MMM yyyy HH:mm:ss zzz", Locale.US);
 
+  protected static TimeZone gmt = TimeZone.getTimeZone("GMT");
+
   /**
    * HTTP date uses TimeZone GMT
    */
   static {
-    format.setTimeZone(TimeZone.getTimeZone("GMT"));
+    format.setTimeZone(gmt);
   }
 
-  // HttpDate (long t) {
-  // }
-
-  // HttpDate (String s) {
-  // }
-
-  // /**
-  // * Get the current date in HTTP format.
-  // */
-  // public static String getCurrentDate() {
-  //
-  // long now = System.currentTimeMillis();
-  // if ((now - currentDateGenerated) > 1000) {
-  // synchronized (format) {
-  // if ((now - currentDateGenerated) > 1000) {
-  // currentDateGenerated = now;
-  // currentDate = format.format(new Date(now));
-  // }
-  // }
-  // }
-  // return currentDate;
-  //
-  // }
-
   /**
    * Get the HTTP format of the specified date.
    */
@@ -97,6 +84,7 @@
     Date date;
     synchronized (format) {
       date = format.parse(dateString);
+      format.setTimeZone(gmt);
     }
     return date;
   }
@@ -105,6 +93,7 @@
     long time;
     synchronized (format) {
       time = format.parse(dateString).getTime();
+      format.setTimeZone(gmt);
     }
     return time;
   }
diff --git a/src/test/org/apache/nutch/net/protocols/TestHttpDateFormat.java b/src/test/org/apache/nutch/net/protocols/TestHttpDateFormat.java
new file mode 100644
index 0000000..94f30c3
--- /dev/null
+++ b/src/test/org/apache/nutch/net/protocols/TestHttpDateFormat.java
@@ -0,0 +1,65 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.net.protocols;
+
+import java.text.ParseException;
+import java.util.Date;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+public class TestHttpDateFormat {
+
+  /**
+   * Test date as string and epoch milliseconds:
+   * 
+   * <pre>
+   *   $> date --date "Sun, 06 Nov 1994 08:49:37 GMT" '+%s'
+   *   784111777
+   * </pre>
+   */
+  private final String dateString = "Sun, 06 Nov 1994 08:49:37 GMT";
+  private long dateMillis = 784111777000L;
+
+  @Test
+  public void testHttpDateFormat() throws ParseException {
+
+    Assert.assertEquals(dateMillis, HttpDateFormat.toLong(dateString));
+    Assert.assertEquals(dateString, HttpDateFormat.toString(dateMillis));
+    Assert.assertEquals(new Date(dateMillis), HttpDateFormat.toDate(dateString));
+
+    String ds2 = "Sun, 6 Nov 1994 08:49:37 GMT";
+    Assert.assertEquals(dateMillis, HttpDateFormat.toLong(ds2));
+  }
+
+  @Test(expected = ParseException.class)
+  public void testHttpDateFormatException() throws ParseException {
+    String ds = "this is not a valid date";
+    HttpDateFormat.toLong(ds);
+  }
+
+  /**
+   * NUTCH-2814 - HttpDateFormat's internal time zone must not change when
+   * parsing a date using a different time zone
+   */
+  @Test
+  public void testHttpDateFormatTimeZone() throws ParseException {
+    String dateStringPDT = "Mon, 21 Oct 2019 03:18:16 PDT";
+    HttpDateFormat.toLong(dateStringPDT); // must not affect internal time zone
+    Assert.assertEquals(dateString, HttpDateFormat.toString(dateMillis));
+  }
+}