TENTACLES-9: Add patch to have a retr strategy during crawl phase (patch)



git-svn-id: https://svn.apache.org/repos/asf/creadur/tentacles/trunk@1714849 13f79535-47bb-0310-9956-ffa450edef68
diff --git a/RELEASE_NOTES.txt b/RELEASE_NOTES.txt
index d7f800d..d5b69b5 100644
--- a/RELEASE_NOTES.txt
+++ b/RELEASE_NOTES.txt
@@ -11,3 +11,4 @@
     * [TENTACLES-3] - provide help text if runtime parameters are missing
     * [TENTACLES-2] - use proper escaping in Velocity template files.
     * [TENTACLES-1] - allow filtering of directories in LicenseFilter
+    * [TENTACLES-9] - adding retry during crawl (thanks to Andy Gumbrecht)
diff --git a/pom.xml b/pom.xml
index 3aa01de..18a9f84 100644
--- a/pom.xml
+++ b/pom.xml
@@ -20,7 +20,7 @@
   <parent>
     <groupId>org.apache</groupId>
     <artifactId>apache</artifactId>
-    <version>14</version>
+    <version>17</version>
   </parent>
   <groupId>org.apache.creadur.tentacles</groupId>
   <artifactId>apache-tentacles</artifactId>
@@ -74,7 +74,7 @@
   <properties>
     <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
     <javaVersion>1.6</javaVersion>
-    <httpClientVersion>4.3.5</httpClientVersion>
+    <httpClientVersion>4.3.6</httpClientVersion>
     <apacheRatVersion>0.11</apacheRatVersion>
   </properties>
   <issueManagement>
diff --git a/src/main/java/org/apache/creadur/tentacles/IOSystem.java b/src/main/java/org/apache/creadur/tentacles/IOSystem.java
index b424537..4232b93 100644
--- a/src/main/java/org/apache/creadur/tentacles/IOSystem.java
+++ b/src/main/java/org/apache/creadur/tentacles/IOSystem.java
@@ -16,26 +16,12 @@
  */
 package org.apache.creadur.tentacles;
 
-import java.io.BufferedInputStream;
-import java.io.BufferedOutputStream;
-import java.io.BufferedWriter;
-import java.io.ByteArrayInputStream;
-import java.io.ByteArrayOutputStream;
-import java.io.Closeable;
-import java.io.File;
-import java.io.FileInputStream;
-import java.io.FileNotFoundException;
-import java.io.FileOutputStream;
-import java.io.FileWriter;
-import java.io.Flushable;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.OutputStream;
+import org.apache.log4j.Logger;
+
+import java.io.*;
 import java.net.URL;
 import java.util.zip.ZipInputStream;
 
-import org.apache.log4j.Logger;
-
 /**
  * @version $Rev$ $Date$
  */
@@ -117,12 +103,12 @@
                 ((Flushable) closeable).flush();
             }
         } catch (final IOException e) {
-        	LOG.error("Error when trying to flush before closing " + closeable, e);
+        	LOG.trace("Error when trying to flush before closing " + closeable, e);
         }
         try {
             closeable.close();
         } catch (final IOException e) {
-        	LOG.error("Error when trying to close " + closeable, e);
+        	LOG.trace("Error when trying to close " + closeable, e);
         }
     }
 
diff --git a/src/main/java/org/apache/creadur/tentacles/NexusClient.java b/src/main/java/org/apache/creadur/tentacles/NexusClient.java
index 8f793de..87683cf 100644
--- a/src/main/java/org/apache/creadur/tentacles/NexusClient.java
+++ b/src/main/java/org/apache/creadur/tentacles/NexusClient.java
@@ -16,6 +16,17 @@
  */
 package org.apache.creadur.tentacles;
 
+import org.apache.http.Header;
+import org.apache.http.HttpHeaders;
+import org.apache.http.client.methods.CloseableHttpResponse;
+import org.apache.http.client.methods.HttpGet;
+import org.apache.http.client.methods.HttpHead;
+import org.apache.http.client.methods.HttpUriRequest;
+import org.apache.http.impl.client.CloseableHttpClient;
+import org.apache.http.impl.client.HttpClientBuilder;
+import org.apache.log4j.Logger;
+import org.codehaus.swizzle.stream.StreamLexer;
+
 import java.io.File;
 import java.io.IOException;
 import java.io.InputStream;
@@ -23,145 +34,153 @@
 import java.util.LinkedHashSet;
 import java.util.Set;
 
-import org.apache.http.Header;
-import org.apache.http.HttpHeaders;
-import org.apache.http.client.methods.CloseableHttpResponse;
-import org.apache.http.client.methods.HttpGet;
-import org.apache.http.client.methods.HttpHead;
-import org.apache.http.impl.client.CloseableHttpClient;
-import org.apache.http.impl.client.HttpClientBuilder;
-import org.apache.log4j.Logger;
-import org.codehaus.swizzle.stream.StreamLexer;
-
 public class NexusClient {
 
-	private static final Logger log = Logger.getLogger(NexusClient.class);
-	private static final String SLASH = "/";
-	private static final String ONE_UP = "../";
-	private static final String USER_AGENT_CONTENTS = "Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.2.13) Gecko/20101206 Ubuntu/10.10 (maverick) Firefox/3.6.13";
+    private static final Logger log = Logger.getLogger(NexusClient.class);
+    private static final String SLASH = "/";
+    private static final String ONE_UP = "../";
+    private static final String USER_AGENT_CONTENTS = "Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.2.13) Gecko/20101206 Ubuntu/10.10 (maverick) Firefox/3.6.13";
 
-	private final CloseableHttpClient client;
-	private final FileSystem fileSystem;
-	private final IOSystem ioSystem;
+    private final CloseableHttpClient client;
+    private final FileSystem fileSystem;
+    private final IOSystem ioSystem;
+    private final int retries;
 
-	public NexusClient(final Platform platform) {
+    public NexusClient(final Platform platform) {
 
-		System.setProperty("http.keepAlive", "false");
-		System.setProperty("http.maxConnections", "50");
+        System.setProperty("http.keepAlive", "false");
+        System.setProperty("http.maxConnections", "50");
 
-		this.client = HttpClientBuilder.create().disableContentCompression()
-				.build();
-		this.fileSystem = platform.getFileSystem();
-		this.ioSystem = platform.getIoSystem();
-	}
+        this.retries = Integer.parseInt(System.getProperty("NexusClient.retries", "5"));
 
-	public File download(final URI uri, final File file) throws IOException {
-		if (file.exists()) {
+        this.client = HttpClientBuilder.create().disableContentCompression()
+                .build();
+        this.fileSystem = platform.getFileSystem();
+        this.ioSystem = platform.getIoSystem();
+    }
 
-			final long length = getContentLength(uri);
+    public File download(final URI uri, final File file) throws IOException {
+        if (file.exists()) {
 
-			if (file.length() == length) {
-				log.info("Exists " + uri);
-				return file;
-			} else {
-				log.info("Incomplete " + uri);
-			}
-		}
+            final long length = getContentLength(uri);
 
-		log.info("Download " + uri);
+            if (file.length() == length) {
+                log.info("Exists " + uri);
+                return file;
+            } else {
+                log.info("Incomplete " + uri);
+            }
+        }
 
-		final CloseableHttpResponse response = get(uri);
+        log.info("Download " + uri);
 
-		InputStream content = null;
-		try {
-			content = response.getEntity().getContent();
+        final CloseableHttpResponse response = get(uri);
 
-			this.fileSystem.mkparent(file);
+        InputStream content = null;
+        try {
+            content = response.getEntity().getContent();
 
-			this.ioSystem.copy(content, file);
-		} finally {
-			if (content != null) {
-				content.close();
-			}
+            this.fileSystem.mkparent(file);
 
-			response.close();
-		}
+            this.ioSystem.copy(content, file);
+        } finally {
+            if (content != null) {
+                content.close();
+            }
 
-		return file;
-	}
+            response.close();
+        }
 
-	private Long getContentLength(final URI uri) throws IOException {
-		final CloseableHttpResponse head = head(uri);
-		final Header[] headers = head.getHeaders(HttpHeaders.CONTENT_LENGTH);
+        return file;
+    }
 
-		if (headers != null && headers.length >= 1) {
-			return Long.valueOf(headers[0].getValue());
-		}
+    private Long getContentLength(final URI uri) throws IOException {
+        final CloseableHttpResponse head = head(uri);
+        final Header[] headers = head.getHeaders(HttpHeaders.CONTENT_LENGTH);
 
-		head.close();
+        if (headers != null && headers.length >= 1) {
+            return Long.valueOf(headers[0].getValue());
+        }
 
-		return Long.valueOf(-1);
-	}
+        head.close();
 
-	private CloseableHttpResponse get(final URI uri) throws IOException {
-		final HttpGet request = new HttpGet(uri);
-		request.setHeader(HttpHeaders.USER_AGENT, USER_AGENT_CONTENTS);
-		return this.client.execute(request);
-	}
+        return (long) -1;
+    }
 
-	private CloseableHttpResponse head(final URI uri) throws IOException {
-		final HttpHead request = new HttpHead(uri);
-		request.setHeader(HttpHeaders.USER_AGENT, USER_AGENT_CONTENTS);
-		return this.client.execute(request);
-	}
+    private CloseableHttpResponse get(final URI uri) throws IOException {
+        return get(new HttpGet(uri), this.retries);
+    }
 
-	public Set<URI> crawl(final URI index) throws IOException {
-		log.info("Crawl " + index);
-		final Set<URI> resources = new LinkedHashSet<URI>();
+    private CloseableHttpResponse head(final URI uri) throws IOException {
+        return get(new HttpHead(uri), this.retries);
+    }
 
-		final CloseableHttpResponse response = get(index);
+    private CloseableHttpResponse get(final HttpUriRequest request, int tries) throws IOException {
+        try {
+            request.setHeader(HttpHeaders.USER_AGENT, USER_AGENT_CONTENTS);
+            return this.client.execute(request);
+        } catch (final IOException e) {
+            if (tries > 0) {
+                try {
+                    Thread.sleep(250);
+                } catch (final InterruptedException ie) {
+                    Thread.interrupted();
+                    throw new IOException("Interrupted", ie);
+                }
+                return get(request, tries--);
+            } else {
+                throw e;
+            }
+        }
+    }
 
-		final InputStream content = response.getEntity().getContent();
-		final StreamLexer lexer = new StreamLexer(content);
+    public Set<URI> crawl(final URI index) throws IOException {
+        log.info("Crawl " + index);
+        final Set<URI> resources = new LinkedHashSet<URI>();
 
-		final Set<URI> crawl = new LinkedHashSet<URI>();
+        final CloseableHttpResponse response = get(index);
 
-		// <a
-		// href="https://repository.apache.org/content/repositories/orgapacheopenejb-094/archetype-catalog.xml">archetype-catalog.xml</a>
-		while (lexer.readAndMark("<a ", "/a>")) {
+        final InputStream content = response.getEntity().getContent();
+        final StreamLexer lexer = new StreamLexer(content);
 
-			try {
-				final String link = lexer.peek("href=\"", "\"");
-				final String name = lexer.peek(">", "<");
+        final Set<URI> crawl = new LinkedHashSet<URI>();
 
-				final URI uri = index.resolve(link);
+        // <a
+        // href="https://repository.apache.org/content/repositories/orgapacheopenejb-094/archetype-catalog.xml">archetype-catalog.xml</a>
+        while (lexer.readAndMark("<a ", "/a>")) {
 
-				if (name.equals(ONE_UP)) {
-					continue;
-				}
-				if (link.equals(ONE_UP)) {
-					continue;
-				}
+            try {
+                final String link = lexer.peek("href=\"", "\"");
+                final String name = lexer.peek(">", "<");
 
-				if (name.endsWith(SLASH)) {
-					crawl.add(uri);
-					continue;
-				}
+                final URI uri = index.resolve(link);
 
-				resources.add(uri);
+                if (name.equals(ONE_UP)) {
+                    continue;
+                }
+                if (link.equals(ONE_UP)) {
+                    continue;
+                }
 
-			} finally {
-				lexer.unmark();
-			}
-		}
+                if (name.endsWith(SLASH)) {
+                    crawl.add(uri);
+                    continue;
+                }
 
-		content.close();
-		response.close();
+                resources.add(uri);
 
-		for (final URI uri : crawl) {
-			resources.addAll(crawl(uri));
-		}
+            } finally {
+                lexer.unmark();
+            }
+        }
 
-		return resources;
-	}
+        content.close();
+        response.close();
+
+        for (final URI uri : crawl) {
+            resources.addAll(crawl(uri));
+        }
+
+        return resources;
+    }
 }
\ No newline at end of file