[MINDEXER-185] Filter in index reader and use update request for configuration (#302)

This PR does two things:
 - uses `IndexUpdateRequest` as configuration for `IndexDataReader` (temp folder, directory factory, etc.)
 - moves the filtering from post extraction to the read phase
   - makes filtering really fast (and multi-threaded too); it is no longer an extra step
   - actually has an effect on the on-disk index size (I've learned that Lucene doesn't really remove things on delete, because all of its index files are immutable)
 
I do realize that this is not quite the same behavior as before. To retain the exact same behavior, we could add this as an additional filter, so that there are two: one applied during read (new) and one applied after extraction (old). (Edit: done, see second commit.)

example filter:
```java
      final Instant cutoff = ZonedDateTime.now().minusYears(2).toInstant();
      iur.setExtractionFilter((doc) -> {
          IndexableField field = doc.getField("m"); // usually never null
          return field != null && Instant.ofEpochMilli(Long.parseLong(field.stringValue())).isAfter(cutoff);
      });
```
results (single-threaded, since multi-threaded reading has an index size penalty due to merge overhead):
```
full: 5.6 GB
2y: 2.6 GB
1y: 1.4 GB
```

---

https://issues.apache.org/jira/browse/MINDEXER-185
diff --git a/indexer-core/src/main/java/org/apache/maven/index/updater/DefaultIndexUpdater.java b/indexer-core/src/main/java/org/apache/maven/index/updater/DefaultIndexUpdater.java
index 2a8a17c..d6c6e38 100644
--- a/indexer-core/src/main/java/org/apache/maven/index/updater/DefaultIndexUpdater.java
+++ b/indexer-core/src/main/java/org/apache/maven/index/updater/DefaultIndexUpdater.java
@@ -181,7 +181,7 @@
             Set<String> allGroups;
             if (remoteIndexFile.endsWith(".gz")) {
                 IndexDataReadResult result =
-                        unpackIndexData(is, updateRequest.getThreads(), directory, updateRequest.getIndexingContext());
+                        unpackIndexData(is, updateRequest, directory, updateRequest.getIndexingContext());
                 timestamp = result.getTimestamp();
                 rootGroups = result.getRootGroups();
                 allGroups = result.getAllGroups();
@@ -233,7 +233,7 @@
                     continue;
                 }
 
-                Document d = r.document(i);
+                Document d = r.storedFields().document(i);
 
                 if (!filter.accept(d)) {
                     boolean success = w.tryDeleteDocument(r, i) != -1;
@@ -318,15 +318,27 @@
     public static IndexDataReadResult unpackIndexData(
             final InputStream is, final int threads, final Directory d, final IndexingContext context)
             throws IOException {
+        return unpackIndexData(d, new IndexDataReader(is, threads), context);
+    }
+
+    /**
+     * @param is an input stream to unpack index data from
+     * @param request IndexUpdateRequest for configuration
+     * @param d
+     * @param context
+     */
+    public static IndexDataReadResult unpackIndexData(
+            final InputStream is, final IndexUpdateRequest request, final Directory d, final IndexingContext context)
+            throws IOException {
+        return unpackIndexData(d, new IndexDataReader(is, request), context);
+    }
+
+    private static IndexDataReadResult unpackIndexData(
+            final Directory d, IndexDataReader dr, final IndexingContext context) throws IOException {
         IndexWriterConfig config = new IndexWriterConfig(new NexusAnalyzer());
         config.setUseCompoundFile(false);
-        NexusIndexWriter w = new NexusIndexWriter(d, config);
-        try {
-            IndexDataReader dr = new IndexDataReader(is, threads);
-
+        try (NexusIndexWriter w = new NexusIndexWriter(d, config)) {
             return dr.readIndex(w, context);
-        } finally {
-            IndexUtils.close(w);
         }
     }
 
diff --git a/indexer-core/src/main/java/org/apache/maven/index/updater/IndexDataReader.java b/indexer-core/src/main/java/org/apache/maven/index/updater/IndexDataReader.java
index a29203f..1b8c2a5 100644
--- a/indexer-core/src/main/java/org/apache/maven/index/updater/IndexDataReader.java
+++ b/indexer-core/src/main/java/org/apache/maven/index/updater/IndexDataReader.java
@@ -27,11 +27,13 @@
 import java.io.InputStream;
 import java.io.UTFDataFormatException;
 import java.nio.file.Files;
+import java.nio.file.Path;
 import java.time.Duration;
 import java.time.Instant;
 import java.util.ArrayList;
 import java.util.Date;
 import java.util.HashSet;
+import java.util.Objects;
 import java.util.Set;
 import java.util.concurrent.ArrayBlockingQueue;
 import java.util.concurrent.ConcurrentHashMap;
@@ -49,6 +51,7 @@
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.FSDirectory;
 import org.apache.maven.index.ArtifactInfo;
+import org.apache.maven.index.context.DocumentFilter;
 import org.apache.maven.index.context.IndexUtils;
 import org.apache.maven.index.context.IndexingContext;
 import org.apache.maven.index.context.NexusAnalyzer;
@@ -65,7 +68,9 @@
     private static final Logger LOGGER = LoggerFactory.getLogger(IndexDataReader.class);
 
     private final DataInputStream dis;
-
+    private final Path tempStorage;
+    private final DocumentFilter filter;
+    private final FSDirectoryFactory factory;
     private final int threads;
 
     public IndexDataReader(final InputStream is) throws IOException {
@@ -73,10 +78,33 @@
     }
 
     public IndexDataReader(final InputStream is, final int threads) throws IOException {
+        this(is, null, null, null, threads);
+    }
+
+    public IndexDataReader(final InputStream is, final IndexUpdateRequest request) throws IOException {
+        this(
+                is,
+                request.getIndexTempDir() != null ? request.getIndexTempDir().toPath() : null,
+                request.getExtractionFilter(),
+                request.getFSDirectoryFactory(),
+                request.getThreads());
+    }
+
+    public IndexDataReader(
+            final InputStream is,
+            final Path tempStorage,
+            final DocumentFilter filter,
+            final FSDirectoryFactory factory,
+            final int threads)
+            throws IOException {
         if (threads < 1) {
             throw new IllegalArgumentException("Reader threads must be greater than zero: " + threads);
         }
+        this.tempStorage = Objects.requireNonNullElse(tempStorage, Path.of(System.getProperty("java.io.tmpdir")));
+        this.factory = Objects.requireNonNullElse(factory, FSDirectoryFactory.DEFAULT);
+        this.filter = filter;
         this.threads = threads;
+
         // MINDEXER-13
         // LightweightHttpWagon may have performed automatic decompression
         // Handle it transparently
@@ -248,7 +276,8 @@
     }
 
     private FSDirectory tempDirectory(final String name) throws IOException {
-        return FSDirectory.open(Files.createTempDirectory(name + ".dir"));
+        return factory.open(
+                Files.createTempDirectory(tempStorage, name + ".dir").toFile());
     }
 
     private IndexWriter tempWriter(final FSDirectory directory) throws IOException {
@@ -266,10 +295,11 @@
             throws IOException {
         ArtifactInfo ai = IndexUtils.constructArtifactInfo(doc, context);
         if (ai != null) {
-            indexWriter.addDocument(IndexUtils.updateDocument(doc, context, false, ai));
-
-            rootGroups.add(ai.getRootGroup());
-            allGroups.add(ai.getGroupId());
+            if (filter == null || filter.accept(doc)) {
+                indexWriter.addDocument(IndexUtils.updateDocument(doc, context, false, ai));
+                rootGroups.add(ai.getRootGroup());
+                allGroups.add(ai.getGroupId());
+            }
         } else {
             // these two fields are automatically handled in code above
             if (doc.getField(ArtifactInfo.ALL_GROUPS) == null && doc.getField(ArtifactInfo.ROOT_GROUPS) == null) {
diff --git a/indexer-core/src/main/java/org/apache/maven/index/updater/IndexUpdateRequest.java b/indexer-core/src/main/java/org/apache/maven/index/updater/IndexUpdateRequest.java
index d832452..38a3c97 100644
--- a/indexer-core/src/main/java/org/apache/maven/index/updater/IndexUpdateRequest.java
+++ b/indexer-core/src/main/java/org/apache/maven/index/updater/IndexUpdateRequest.java
@@ -37,6 +37,8 @@
 
     private DocumentFilter documentFilter;
 
+    private DocumentFilter extractionFilter;
+
     private boolean forceFullUpdate;
 
     private boolean incrementalOnly;
@@ -82,6 +84,14 @@
         this.documentFilter = documentFilter;
     }
 
+    public DocumentFilter getExtractionFilter() {
+        return extractionFilter;
+    }
+
+    public void setExtractionFilter(DocumentFilter extractionFilter) {
+        this.extractionFilter = extractionFilter;
+    }
+
     public void setForceFullUpdate(boolean forceFullUpdate) {
         this.forceFullUpdate = forceFullUpdate;
     }