collects a full list of binaries in sorted order

git-svn-id: https://svn.apache.org/repos/asf/openejb/trunk/sandbox/legal@1176650 13f79535-47bb-0310-9956-ffa450edef68
diff --git a/src/main/java/org/apache/openejb/tools/legal/Main.java b/src/main/java/org/apache/openejb/tools/legal/Main.java
index 90c4a1f..923d8ce 100644
--- a/src/main/java/org/apache/openejb/tools/legal/Main.java
+++ b/src/main/java/org/apache/openejb/tools/legal/Main.java
@@ -22,8 +22,9 @@
 import org.codehaus.swizzle.stream.StreamLexer;
 
 import java.io.IOException;
+import java.io.InputStream;
 import java.net.URI;
-import java.util.HashSet;
+import java.util.LinkedHashSet;
 import java.util.Set;
 
 /**
@@ -55,26 +56,48 @@
     }
 
     private Set<URI> crawl(URI index) throws IOException {
-        final Set<URI> resources = new HashSet<URI>();
+        final Set<URI> resources = new LinkedHashSet<URI>();
 
-        HttpGet request = new HttpGet(index);
+        final HttpGet request = new HttpGet(index);
         request.setHeader("User-Agent", "Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.2.13) Gecko/20101206 Ubuntu/10.10 (maverick) Firefox/3.6.13");
-        HttpResponse response = client.execute(request);
-        StreamLexer lexer = new StreamLexer(response.getEntity().getContent());
+        final HttpResponse response = client.execute(request);
+
+        final InputStream content = response.getEntity().getContent();
+        final StreamLexer lexer = new StreamLexer(content);
+
+        final Set<URI> crawl = new LinkedHashSet<URI>();
 
         //<a href="https://repository.apache.org/content/repositories/orgapacheopenejb-094/archetype-catalog.xml">archetype-catalog.xml</a>
-        while (lexer.readAndMark("<a ","/a>")) {
-            final String link = lexer.peek("href=\"", "\"");
-            final String name = lexer.peek(">", "<");
+        while (lexer.readAndMark("<a ", "/a>")) {
 
-            final URI uri = index.resolve(link);
+            try {
+                final String link = lexer.peek("href=\"", "\"");
+                final String name = lexer.peek(">", "<");
 
-            if (name.endsWith("/")) {
-                resources.addAll(crawl(uri));
-            } else {
+                final URI uri = index.resolve(link);
+
+                if (name.equals("../")) continue;
+                if (link.equals("../")) continue;
+
+                if (name.endsWith("/")) {
+                    crawl.add(uri);
+                    continue;
+                }
+
+                if (!uri.getPath().matches(".*(jar|zip|tar.gz)")) continue;
+
                 resources.add(uri);
+
+            } finally {
+                lexer.unmark();
             }
         }
+
+        content.close();
+
+        for (URI uri : crawl) {
+            resources.addAll(crawl(uri));
+        }
         return resources;
     }