OAK-10671: [Indexing Job] Improve Mongo regex query: remove condition on non-indexed _path field to speed up traversal (#1331)

* Change the Mongo query filter to apply conditions only on the _modified and _id fields, so that the filter can be evaluated using only the contents of an index on (_modified, _id).
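
For illustration, a minimal sketch of the filter shape that computeMongoQueryFilter now produces (hedged: the include/exclude paths below are made up, and the exact value of LONG_PATH_ID_PATTERN is not shown in this diff, so the "^[0-9]{1,3}:h.*$" regex is an assumption based on the "<n>:h<hash>" id format described in the code comments):

    import java.util.regex.Pattern;
    import org.bson.conversions.Bson;
    import com.mongodb.client.model.Filters;

    public class MongoFilterShapeSketch {
        public static void main(String[] args) {
            // Include descendants of /content/dam by _id prefix, plus all long-path
            // documents, whose _id has the form "<depth>:h<hash>". Exclusions are also
            // expressed on _id only, so the whole filter can be evaluated from an index
            // on (_modified, _id) without fetching documents from the column store.
            Pattern included = Pattern.compile("^[0-9]{1,3}:" + Pattern.quote("/content/dam/") + ".*$");
            Pattern longPathIds = Pattern.compile("^[0-9]{1,3}:h.*$"); // assumed shape of LONG_PATH_ID_PATTERN
            Pattern excluded = Pattern.compile("^[0-9]{1,3}:" + Pattern.quote("/content/dam/archive/") + ".*$");
            Bson filter = Filters.and(
                    Filters.in("_id", included, longPathIds),
                    Filters.nin("_id", excluded));
            System.out.println(filter); // prints the composed $and/$in/$nin document
        }
    }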
diff --git a/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/pipelined/PipelinedMongoDownloadTask.java b/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/pipelined/PipelinedMongoDownloadTask.java
index aeac8ba..d9276d9 100644
--- a/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/pipelined/PipelinedMongoDownloadTask.java
+++ b/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/pipelined/PipelinedMongoDownloadTask.java
@@ -147,15 +147,29 @@
      * @param mongoFilterPaths          The paths to be included/excluded in the filter. These define subtrees to be included or excluded.
      *                                  (see {@link MongoFilterPaths} for details)
      * @param customExcludeEntriesRegex Documents with paths matching this regex are excluded from download
-     * @param queryUsesIndexTraversal   Whether the query will use an index to traverse the documents.
      * @return The filter to be used in the Mongo query, or null if no filter is required
      */
-    static Bson computeMongoQueryFilter(@NotNull MongoFilterPaths mongoFilterPaths, String customExcludeEntriesRegex, boolean queryUsesIndexTraversal) {
+    static Bson computeMongoQueryFilter(@NotNull MongoFilterPaths mongoFilterPaths, String customExcludeEntriesRegex) {
         var filters = new ArrayList<Bson>();
 
-        Bson includedFilter = descendantsFilter(mongoFilterPaths.included, queryUsesIndexTraversal);
-        if (includedFilter != null) {
-            filters.add(includedFilter);
+        List<Pattern> includedPatterns = toFilterPatterns(mongoFilterPaths.included);
+        if (!includedPatterns.isEmpty()) {
+            // The conditions above on the _id field are not enough to match all JCR nodes in the given paths, because
+            // nodes with paths longer than a certain threshold are represented by Mongo documents where the _id field
+            // is replaced by a hash and the full path is stored in an additional field _path. To retrieve these long
+            // path documents, we could add a condition on the _path field, but that would substantially slow down the
+            // scan of the DB, because the _path field is not part of the index used by this query (an index on
+            // (_modified, _id)). Mongo would then have to retrieve every document from the column store to evaluate
+            // the filter condition. So instead we add a condition below to download all the long path documents.
+            // These documents can be identified by the format of the _id field (<n>:h<hash>), so it is possible to
+            // identify them using only the index. This may download documents for nodes that are not in the included
+            // paths, but those documents will be filtered out in the transform stage anyway. In most repositories the
+            // number of long path documents is very small, often zero, so the extra documents will not slow down the
+            // download by much. However, the performance gains of evaluating the query filter using only the index
+            // are very significant, especially when the index being built requires only a small subset of the nodes.
+            var patternsWithLongPathInclude = new ArrayList<>(includedPatterns);
+            patternsWithLongPathInclude.add(LONG_PATH_ID_PATTERN);
+            filters.add(Filters.in(NodeDocument.ID, patternsWithLongPathInclude));
         }
 
         // The Mongo filter returned here will download the top level path of each excluded subtree, which in theory
@@ -164,15 +178,13 @@
         // This is done because excluding also the top level path would add extra complexity to the filter and
         // would not have any measurable impact on performance because it only downloads a few extra documents, one
         // for each excluded subtree. The transform stage will anyway filter out these paths.
-        Bson excludedFilter = descendantsFilter(mongoFilterPaths.excluded, queryUsesIndexTraversal);
-        if (excludedFilter != null) {
-            filters.add(Filters.nor(excludedFilter));
-        }
-
+        ArrayList<Pattern> excludedPatterns = new ArrayList<>();
+        excludedPatterns.addAll(toFilterPatterns(mongoFilterPaths.excluded));
         // Custom regex filter to exclude paths
-        Bson customExcludedPathsFilter = createCustomExcludedEntriesFilter(customExcludeEntriesRegex, queryUsesIndexTraversal);
-        if (customExcludedPathsFilter != null) {
-            filters.add(customExcludedPathsFilter);
+        excludedPatterns.addAll(customExcludedPatterns(customExcludeEntriesRegex));
+
+        if (!excludedPatterns.isEmpty()) {
+            filters.add(Filters.nin(NodeDocument.ID, excludedPatterns));
         }
 
         if (filters.isEmpty()) {
@@ -184,65 +196,31 @@
         }
     }
 
-    static Bson createCustomExcludedEntriesFilter(String customRegexPattern, boolean queryUsesIndexTraversal) {
-        if (customRegexPattern == null || customRegexPattern.trim().isEmpty()) {
-            LOG.info("Mongo custom regex is disabled");
-            return null;
-        } else {
-            LOG.info("Excluding nodes with paths matching regex: {}", customRegexPattern);
-            var pattern = Pattern.compile(customRegexPattern);
-            Bson pathFilter = createPathFilter(List.of(pattern), queryUsesIndexTraversal);
-            return Filters.nor(Filters.regex(NodeDocument.ID, pattern), pathFilter);
-        }
-    }
-
-    private static Bson descendantsFilter(List<String> paths, boolean queryUsesIndexTraversal) {
+    private static List<Pattern> toFilterPatterns(List<String> paths) {
         if (paths.isEmpty()) {
-            return null;
+            return List.of();
         }
         if (paths.size() == 1 && paths.get(0).equals("/")) {
-            return null;
+            return List.of();
         }
-
-        // The filter for descendants of a list of paths is a series of or conditions. For each path, we have to build
-        // two conditions in two different fields of the documents:
-        // _ _id   - for non-long paths - In this case, the _id is of the form "2:/foo/bar"
-        // _ _path - for long paths - In this case, the _id is a hash and the document contains an additional _path
-        //      field with the path of the document.
-        // We use the $in operator with a regular expression to match the paths.
-        //  https://www.mongodb.com/docs/manual/reference/operator/query/in/#use-the--in-operator-with-a-regular-expression
-        ArrayList<Pattern> pathPatterns = new ArrayList<>();
-        ArrayList<Pattern> idPatterns = new ArrayList<>();
-
+        ArrayList<Pattern> patterns = new ArrayList<>();
         for (String path : paths) {
             if (!path.endsWith("/")) {
                 path = path + "/";
             }
             String quotedPath = Pattern.quote(path);
-            idPatterns.add(Pattern.compile("^[0-9]{1,3}:" + quotedPath + ".*$"));
-            pathPatterns.add(Pattern.compile("^" + quotedPath + ".*$"));
+            patterns.add(Pattern.compile("^[0-9]{1,3}:" + quotedPath + ".*$"));
         }
-
-        Bson pathFilter = createPathFilter(pathPatterns, queryUsesIndexTraversal);
-        return Filters.or(Filters.in(NodeDocument.ID, idPatterns), pathFilter);
+        return patterns;
     }
 
-    private static Bson createPathFilter(List<Pattern> pattern, boolean queryUsesIndexTraversal) {
-        // If a document has a long path, the _id is replaced by a hash and the path is stored in an additional _path field.
-        // When doing an index scan, it may be more efficient to check that the _id is in the format of a long path id
-        // (that is, numeric prefix followed by ":h") first, before checking the _path field. The _id
-        // is available from the index while the _path field is only available on the document itself, so checking the
-        // _path will force an expensive retrieval of the full document. It is not guaranteed that Mongo will implement
-        // this optimization, but it is adding this additional check to allow MongoDB to apply this optimization.
-        // If the query does a column scan, then Mongo retrieves the full document from the column store, so we can
-        // check the _path directly, which simplifies a bit the query.
-        if (queryUsesIndexTraversal) {
-            return Filters.and(
-                    Filters.regex(NodeDocument.ID, LONG_PATH_ID_PATTERN),
-                    Filters.in(NodeDocument.PATH, pattern)
-            );
+    static List<Pattern> customExcludedPatterns(String customRegexPattern) {
+        if (customRegexPattern == null || customRegexPattern.trim().isEmpty()) {
+            LOG.info("Mongo custom regex is disabled");
+            return List.of();
         } else {
-            return Filters.in(NodeDocument.PATH, pattern);
+            LOG.info("Excluding nodes with paths matching regex: {}", customRegexPattern);
+            return List.of(Pattern.compile(customRegexPattern));
         }
     }
 
@@ -387,7 +365,7 @@
                     .build();
             MetricsUtils.addMetric(statisticsProvider, reporter, PipelinedMetrics.OAK_INDEXER_PIPELINED_MONGO_DOWNLOAD_DURATION_SECONDS, durationMillis / 1000);
             MetricsUtils.addMetric(statisticsProvider, reporter, PipelinedMetrics.OAK_INDEXER_PIPELINED_DOCUMENTS_DOWNLOADED_TOTAL, documentsDownloadedTotal);
-            MetricsUtils.addMetric(statisticsProvider, reporter,  PipelinedMetrics.OAK_INDEXER_PIPELINED_MONGO_DOWNLOAD_ENQUEUE_DELAY_PERCENTAGE,
+            MetricsUtils.addMetric(statisticsProvider, reporter, PipelinedMetrics.OAK_INDEXER_PIPELINED_MONGO_DOWNLOAD_ENQUEUE_DELAY_PERCENTAGE,
                     PipelinedUtils.toPercentage(totalEnqueueWaitTimeMillis, durationMillis)
             );
             MetricsUtils.addMetricByteSize(statisticsProvider, reporter, PipelinedMetrics.OAK_INDEXER_PIPELINED_DOCUMENTS_DOWNLOADED_TOTAL_BYTES,
@@ -421,7 +399,7 @@
         // That is, download "/", "/content", "/content/dam" for a base path of "/content/dam". These nodes will not be
         // matched by the regex used in the Mongo query, which assumes a prefix of "???:/content/dam"
         MongoFilterPaths mongoFilterPaths = getPathsForRegexFiltering();
-        Bson mongoFilter = computeMongoQueryFilter(mongoFilterPaths, customExcludeEntriesRegex, true);
+        Bson mongoFilter = computeMongoQueryFilter(mongoFilterPaths, customExcludeEntriesRegex);
         if (mongoFilter == null) {
             LOG.info("Downloading full repository");
         } else {
@@ -516,7 +494,7 @@
         // We are downloading potentially a large fraction of the repository, so using an index scan will be
         // inefficient. So we pass the natural hint to force MongoDB to use natural ordering, that is, column scan
         MongoFilterPaths mongoFilterPaths = getPathsForRegexFiltering();
-        Bson mongoFilter = computeMongoQueryFilter(mongoFilterPaths, customExcludeEntriesRegex, false);
+        Bson mongoFilter = computeMongoQueryFilter(mongoFilterPaths, customExcludeEntriesRegex);
         if (mongoFilter == null) {
             LOG.info("Downloading full repository from Mongo with natural order");
             FindIterable<NodeDocument> mongoIterable = dbCollection
diff --git a/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/pipelined/PipelinedIT.java b/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/pipelined/PipelinedIT.java
index b7f0f78..30ab0a3 100644
--- a/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/pipelined/PipelinedIT.java
+++ b/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/pipelined/PipelinedIT.java
@@ -32,6 +32,7 @@
 import org.apache.jackrabbit.oak.plugins.document.RevisionVector;
 import org.apache.jackrabbit.oak.plugins.document.mongo.MongoDocumentStore;
 import org.apache.jackrabbit.oak.plugins.document.util.MongoConnection;
+import org.apache.jackrabbit.oak.plugins.document.util.Utils;
 import org.apache.jackrabbit.oak.plugins.index.ConsoleIndexingReporter;
 import org.apache.jackrabbit.oak.plugins.metric.MetricStatisticsProvider;
 import org.apache.jackrabbit.oak.spi.blob.MemoryBlobStore;
@@ -186,7 +187,7 @@
                 "/content/dam/2023|{\"p2\":\"v2023\"}",
                 "/content/dam/2023/01|{\"p1\":\"v202301\"}",
                 "/content/dam/2023/02|{}"
-        ));
+        ), true);
     }
 
     @Test
@@ -214,7 +215,7 @@
                 "/content/dam/2022/02|{\"p1\":\"v202202\"}",
                 "/content/dam/2022/03|{\"p1\":\"v202203\"}",
                 "/content/dam/2022/04|{\"p1\":\"v202204\"}"
-        ));
+        ), true);
     }
 
 
@@ -234,7 +235,7 @@
                 "/etc|{}",
                 "/home|{}",
                 "/jcr:system|{}"
-        ));
+        ), true);
     }
 
     @Test
@@ -253,7 +254,7 @@
                 "/etc|{}",
                 "/home|{}",
                 "/jcr:system|{}"
-        ));
+        ), true);
     }
 
     @Test
@@ -283,8 +284,7 @@
                 "/content/dam/2022/02/04|{\"p1\":\"v20220204\"}",
                 "/content/dam/2022/03|{\"p1\":\"v202203\"}",
                 "/content/dam/2022/04|{\"p1\":\"v202204\"}"
-
-        ));
+        ), true);
     }
 
     @Test
@@ -305,7 +305,7 @@
                 "/content/dam/2023/01|{\"p1\":\"v202301\"}",
                 "/content/dam/2023/02|{}",
                 "/content/dam/2023/02/28|{\"p1\":\"v20230228\"}"
-        ));
+        ), true);
     }
 
     @Test
@@ -344,7 +344,7 @@
         // The list above has the longest paths first, reverse it to match the order in the FFS
         Collections.reverse(expected);
 
-        testSuccessfulDownload(pathPredicate, pathFilters, expected);
+        testSuccessfulDownload(pathPredicate, pathFilters, expected, false);
     }
 
 
@@ -454,10 +454,10 @@
 
     private void testSuccessfulDownload(Predicate<String> pathPredicate, List<PathFilter> pathFilters)
             throws CommitFailedException, IOException {
-        testSuccessfulDownload(pathPredicate, pathFilters, EXPECTED_FFS);
+        testSuccessfulDownload(pathPredicate, pathFilters, EXPECTED_FFS, false);
     }
 
-    private void testSuccessfulDownload(Predicate<String> pathPredicate, List<PathFilter> pathFilters, List<String> expected)
+    private void testSuccessfulDownload(Predicate<String> pathPredicate, List<PathFilter> pathFilters, List<String> expected, boolean ignoreLongPaths)
             throws CommitFailedException, IOException {
         Backend rwStore = createNodeStore(false);
         createContent(rwStore.documentNodeStore);
@@ -468,7 +468,19 @@
 
         File file = pipelinedStrategy.createSortedStoreFile();
         assertTrue(file.exists());
-        assertEquals(expected, Files.readAllLines(file.toPath()));
+        List<String> result = Files.readAllLines(file.toPath());
+        if (ignoreLongPaths) {
+            // Remove the long paths from the result. The Mongo filter is best-effort: it may download long path
+            // documents even if they do not match the includedPaths.
+            result = result.stream()
+                    .filter(s -> {
+                        var name = s.split("\\|")[0];
+                        return name.length() < Utils.PATH_LONG;
+                    })
+                    .collect(Collectors.toList());
+
+        }
+        assertEquals(expected, result);
         assertMetrics();
     }
 
diff --git a/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/pipelined/PipelinedMongoDownloadTaskTest.java b/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/pipelined/PipelinedMongoDownloadTaskTest.java
index 46060b7..3e87830 100644
--- a/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/pipelined/PipelinedMongoDownloadTaskTest.java
+++ b/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/pipelined/PipelinedMongoDownloadTaskTest.java
@@ -51,8 +51,10 @@
 import java.util.stream.Collectors;
 import java.util.stream.IntStream;
 
+import static org.apache.jackrabbit.oak.index.indexer.document.flatfile.pipelined.PipelinedMongoDownloadTask.LONG_PATH_ID_PATTERN;
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertNull;
+import static org.junit.Assert.assertTrue;
 import static org.mockito.ArgumentMatchers.any;
 import static org.mockito.ArgumentMatchers.eq;
 import static org.mockito.Mockito.mock;
@@ -297,22 +299,14 @@
 
     @Test
     public void createCustomExcludeEntriesFilter() {
-        assertNull(PipelinedMongoDownloadTask.createCustomExcludedEntriesFilter(null, true));
-        assertNull(PipelinedMongoDownloadTask.createCustomExcludedEntriesFilter("", true));
+        assertTrue(PipelinedMongoDownloadTask.customExcludedPatterns(null).isEmpty());
+        assertTrue(PipelinedMongoDownloadTask.customExcludedPatterns("").isEmpty());
 
         Pattern p = Pattern.compile("^[0-9]{1,3}:/a/b.*$");
-        var expectedBson = Filters.nor(
-                Filters.regex(NodeDocument.ID, p),
-                Filters.and(
-                        Filters.regex(NodeDocument.ID, PipelinedMongoDownloadTask.LONG_PATH_ID_PATTERN),
-                        Filters.in(NodeDocument.PATH, p)
-                )
-        );
+        var actualListOfPatterns = PipelinedMongoDownloadTask.customExcludedPatterns("^[0-9]{1,3}:/a/b.*$");
+        assertEquals(1, actualListOfPatterns.size());
 
-
-        var actualBson = PipelinedMongoDownloadTask.createCustomExcludedEntriesFilter("^[0-9]{1,3}:/a/b.*$", true);
-
-        assertBsonEquals(expectedBson, actualBson);
+        assertEquals(p.toString(), actualListOfPatterns.get(0).toString());
     }
 
     @Test
@@ -321,8 +315,7 @@
         assertNull(
                 PipelinedMongoDownloadTask.computeMongoQueryFilter(
                         MongoFilterPaths.DOWNLOAD_ALL,
-                        null,
-                        true
+                        null
                 )
         );
     }
@@ -332,23 +325,14 @@
         // Path filter but no exclude filter
         var actual = PipelinedMongoDownloadTask.computeMongoQueryFilter(
                 new MongoFilterPaths(List.of("/"), List.of("/excluded1", "/content/excluded2")),
-                null,
-                true
+                null
         );
         // The generated filter should not include any condition to include the descendants of /
-        var expected = Filters.nor(
-                Filters.or(
-                        Filters.in(NodeDocument.ID,
-                                Pattern.compile("^[0-9]{1,3}:" + Pattern.quote("/excluded1/") + ".*$"),
-                                Pattern.compile("^[0-9]{1,3}:" + Pattern.quote("/content/excluded2/") + ".*$")),
-                        Filters.and(
-                                Filters.regex(NodeDocument.ID, PipelinedMongoDownloadTask.LONG_PATH_ID_PATTERN),
-                                Filters.in(NodeDocument.PATH,
-                                        Pattern.compile("^" + Pattern.quote("/excluded1/") + ".*$"),
-                                        Pattern.compile("^" + Pattern.quote("/content/excluded2/") + ".*$"))
-                        )
-                )
-        );
+        var expected =
+                Filters.nin(NodeDocument.ID,
+                        Pattern.compile("^[0-9]{1,3}:" + Pattern.quote("/excluded1/") + ".*$"),
+                        Pattern.compile("^[0-9]{1,3}:" + Pattern.quote("/content/excluded2/") + ".*$")
+                );
         assertBsonEquals(expected, actual);
     }
 
@@ -358,15 +342,11 @@
         // Path filter but no exclude filter
         var actual = PipelinedMongoDownloadTask.computeMongoQueryFilter(
                 new MongoFilterPaths(List.of("/parent"), List.of()),
-                null,
-                true
+                null
         );
-        var expected = Filters.or(
-                Filters.in(NodeDocument.ID, Pattern.compile("^[0-9]{1,3}:" + Pattern.quote("/parent/") + ".*$")),
-                Filters.and(
-                        Filters.in(NodeDocument.PATH, Pattern.compile("^" + Pattern.quote("/parent/") + ".*$")),
-                        Filters.regex(NodeDocument.ID, PipelinedMongoDownloadTask.LONG_PATH_ID_PATTERN)
-                )
+        var expected = Filters.in(NodeDocument.ID,
+                Pattern.compile("^[0-9]{1,3}:" + Pattern.quote("/parent/") + ".*$"),
+                LONG_PATH_ID_PATTERN
         );
         assertBsonEquals(expected, actual);
     }
@@ -375,47 +355,24 @@
     public void computeMongoQueryFilterNoPathFilterWithExcludeFilter() {
         var actual = PipelinedMongoDownloadTask.computeMongoQueryFilter(
                 MongoFilterPaths.DOWNLOAD_ALL,
-                "^[0-9]{1,3}:/a/b.*$",
-                true
+                "^[0-9]{1,3}:/a/b.*$"
         );
-        Pattern excludePattern = Pattern.compile("^[0-9]{1,3}:/a/b.*$");
-        assertBsonEquals(
-                Filters.nor(
-                        Filters.regex(NodeDocument.ID, excludePattern),
-                        Filters.and(
-                                Filters.regex(NodeDocument.ID, PipelinedMongoDownloadTask.LONG_PATH_ID_PATTERN),
-                                Filters.in(NodeDocument.PATH, excludePattern)
-                        )
-                ),
-                actual
-        );
+        Bson expectedFilter = Filters.nin(NodeDocument.ID, Pattern.compile("^[0-9]{1,3}:/a/b.*$"));
+        assertBsonEquals(expectedFilter, actual);
     }
 
     @Test
     public void computeMongoQueryFilterWithPathFilterWithExcludeFilter() {
         var actual = PipelinedMongoDownloadTask.computeMongoQueryFilter(
                 new MongoFilterPaths(List.of("/parent"), List.of()),
-                "^[0-9]{1,3}:/a/b.*$",
-                true
+                "^[0-9]{1,3}:/a/b.*$"
         );
 
         Pattern excludesPattern = Pattern.compile("^[0-9]{1,3}:/a/b.*$");
         var expected =
                 Filters.and(
-                        Filters.or(
-                                Filters.in(NodeDocument.ID, Pattern.compile("^[0-9]{1,3}:" + Pattern.quote("/parent/") + ".*$")),
-                                Filters.and(
-                                        Filters.regex(NodeDocument.ID, PipelinedMongoDownloadTask.LONG_PATH_ID_PATTERN),
-                                        Filters.in(NodeDocument.PATH, Pattern.compile("^" + Pattern.quote("/parent/") + ".*$"))
-                                )
-                        ),
-                        Filters.nor(
-                                Filters.regex(NodeDocument.ID, excludesPattern),
-                                Filters.and(
-                                        Filters.regex(NodeDocument.ID, PipelinedMongoDownloadTask.LONG_PATH_ID_PATTERN),
-                                        Filters.in(NodeDocument.PATH, excludesPattern)
-                                )
-                        )
+                        Filters.in(NodeDocument.ID, Pattern.compile("^[0-9]{1,3}:" + Pattern.quote("/parent/") + ".*$"), LONG_PATH_ID_PATTERN),
+                        Filters.nin(NodeDocument.ID, excludesPattern)
                 );
         assertBsonEquals(expected, actual);
     }
@@ -424,21 +381,17 @@
     public void computeMongoQueryFilterWithPathFilterWithExcludeFilterAndNaturalOrderTraversal() {
         var actual = PipelinedMongoDownloadTask.computeMongoQueryFilter(
                 new MongoFilterPaths(List.of("/parent"), List.of()),
-                "^[0-9]{1,3}:/a/b.*$",
-                false
+                "^[0-9]{1,3}:/a/b.*$"
         );
 
         Pattern excludePattern = Pattern.compile("^[0-9]{1,3}:/a/b.*$");
         var expected =
                 Filters.and(
-                        Filters.or(
-                                Filters.in(NodeDocument.ID, Pattern.compile("^[0-9]{1,3}:" + Pattern.quote("/parent/") + ".*$")),
-                                Filters.in(NodeDocument.PATH, Pattern.compile("^" + Pattern.quote("/parent/") + ".*$"))
+                        Filters.in(NodeDocument.ID,
+                                Pattern.compile("^[0-9]{1,3}:" + Pattern.quote("/parent/") + ".*$"),
+                                LONG_PATH_ID_PATTERN
                         ),
-                        Filters.nor(
-                                Filters.regex(NodeDocument.ID, excludePattern),
-                                Filters.in(NodeDocument.PATH, excludePattern)
-                        )
+                        Filters.nin(NodeDocument.ID, excludePattern)
                 );
         assertBsonEquals(expected, actual);
     }
@@ -447,24 +400,12 @@
     public void computeMongoQueryFilterWithPathFilterWithExcludeFilterAndNaturalColumnTraversal() {
         var actual = PipelinedMongoDownloadTask.computeMongoQueryFilter(
                 new MongoFilterPaths(List.of("/"), List.of("/excluded")),
-                "^[0-9]{1,3}:/a/b.*$",
-                false
+                "^[0-9]{1,3}:/a/b.*$"
         );
 
         Pattern excludePattern = Pattern.compile("^[0-9]{1,3}:/a/b.*$");
         var expected =
-                Filters.and(
-                        Filters.nor(
-                                Filters.or(
-                                        Filters.in(NodeDocument.ID, Pattern.compile("^[0-9]{1,3}:" + Pattern.quote("/excluded/") + ".*$")),
-                                        Filters.in(NodeDocument.PATH, Pattern.compile("^" + Pattern.quote("/excluded/") + ".*$"))
-                                )
-                        ),
-                        Filters.nor(
-                                Filters.regex(NodeDocument.ID, excludePattern),
-                                Filters.in(NodeDocument.PATH, excludePattern)
-                        )
-                );
+                Filters.nin(NodeDocument.ID, Pattern.compile("^[0-9]{1,3}:" + Pattern.quote("/excluded/") + ".*$"), excludePattern);
         assertBsonEquals(expected, actual);
     }
 
@@ -472,30 +413,14 @@
     public void computeMongoQueryFilterWithPathFilterWithExcludeFilterAndNaturalIndexTraversal() {
         var actual = PipelinedMongoDownloadTask.computeMongoQueryFilter(
                 new MongoFilterPaths(List.of("/"), List.of("/excluded")),
-                "^[0-9]{1,3}:/a/b.*$",
-                true
+                "^[0-9]{1,3}:/a/b.*$"
         );
 
         Pattern excludePattern = Pattern.compile("^[0-9]{1,3}:/a/b.*$");
-        var expected =
-                Filters.and(
-                        Filters.nor(
-                                Filters.or(
-                                        Filters.in(NodeDocument.ID, Pattern.compile("^[0-9]{1,3}:" + Pattern.quote("/excluded/") + ".*$")),
-                                        Filters.and(
-                                                Filters.regex(NodeDocument.ID, PipelinedMongoDownloadTask.LONG_PATH_ID_PATTERN),
-                                                Filters.in(NodeDocument.PATH, Pattern.compile("^" + Pattern.quote("/excluded/") + ".*$"))
-                                        )
-                                )
-                        ),
-                        Filters.nor(
-                                Filters.regex(NodeDocument.ID, excludePattern),
-                                Filters.and(
-                                        Filters.regex(NodeDocument.ID, PipelinedMongoDownloadTask.LONG_PATH_ID_PATTERN),
-                                        Filters.in(NodeDocument.PATH, excludePattern)
-                                )
-                        )
-                );
+        var expected = Filters.nin(NodeDocument.ID,
+                Pattern.compile("^[0-9]{1,3}:" + Pattern.quote("/excluded/") + ".*$"),
+                excludePattern
+        );
         assertBsonEquals(expected, actual);
     }
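
A note on the long-path fallback exercised by the tests above: long-path documents are recognizable from the _id alone, which is why the new filter never needs the _path field. A minimal sketch (the pattern below is an assumption based on the "<n>:h<hash>" id format described in the code comments; the real constant is PipelinedMongoDownloadTask.LONG_PATH_ID_PATTERN, whose exact value is not shown in this diff):

    import java.util.regex.Pattern;

    public class LongPathIdSketch {
        public static void main(String[] args) {
            // Assumed shape of LONG_PATH_ID_PATTERN: numeric depth, ":h", then a hash.
            Pattern longPathId = Pattern.compile("^[0-9]{1,3}:h.*$");

            // Long-path documents store a hash in _id instead of "<depth>:/full/path",
            // so they can be selected using only the (_modified, _id) index.
            System.out.println(longPathId.matcher("5:h0a1b2c3d4e5f").matches()); // true  -> always downloaded
            System.out.println(longPathId.matcher("2:/content/dam").matches()); // false -> handled by the path patterns
        }
    }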