| /* |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, |
| * software distributed under the License is distributed on an |
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| * KIND, either express or implied. See the License for the |
| * specific language governing permissions and limitations |
| * under the License. |
| */ |
| package org.apache.jackrabbit.oak.index.indexer.document.flatfile.pipelined; |
| |
| import com.mongodb.MongoClientSettings; |
| import com.mongodb.MongoException; |
| import com.mongodb.MongoIncompatibleDriverException; |
| import com.mongodb.MongoInterruptedException; |
| import com.mongodb.ReadPreference; |
| import com.mongodb.client.FindIterable; |
| import com.mongodb.client.MongoCollection; |
| import com.mongodb.client.MongoCursor; |
| import com.mongodb.client.MongoDatabase; |
| import com.mongodb.client.model.Filters; |
| import org.apache.jackrabbit.guava.common.base.Preconditions; |
| import org.apache.jackrabbit.guava.common.base.Stopwatch; |
| import org.apache.jackrabbit.oak.commons.IOUtils; |
| import org.apache.jackrabbit.oak.commons.PathUtils; |
| import org.apache.jackrabbit.oak.index.indexer.document.flatfile.pipelined.MongoRegexPathFilterFactory.MongoFilterPaths; |
| import org.apache.jackrabbit.oak.plugins.document.Collection; |
| import org.apache.jackrabbit.oak.plugins.document.NodeDocument; |
| import org.apache.jackrabbit.oak.plugins.document.mongo.MongoDocumentStore; |
| import org.apache.jackrabbit.oak.plugins.document.util.Utils; |
| import org.apache.jackrabbit.oak.plugins.index.FormattingUtils; |
| import org.apache.jackrabbit.oak.plugins.index.MetricsFormatter; |
| import org.apache.jackrabbit.oak.plugins.index.IndexingReporter; |
| import org.apache.jackrabbit.oak.plugins.index.MetricsUtils; |
| import org.apache.jackrabbit.oak.spi.filter.PathFilter; |
| import org.apache.jackrabbit.oak.stats.StatisticsProvider; |
| import org.bson.BsonDocument; |
| import org.bson.codecs.configuration.CodecRegistries; |
| import org.bson.codecs.configuration.CodecRegistry; |
| import org.bson.conversions.Bson; |
| import org.jetbrains.annotations.NotNull; |
| import org.slf4j.Logger; |
| import org.slf4j.LoggerFactory; |
| |
| import java.time.Duration; |
| import java.time.Instant; |
| import java.time.temporal.ChronoUnit; |
| import java.util.ArrayList; |
| import java.util.Arrays; |
| import java.util.HashMap; |
| import java.util.List; |
| import java.util.Locale; |
| import java.util.Map; |
| import java.util.TreeSet; |
| import java.util.concurrent.BlockingQueue; |
| import java.util.concurrent.Callable; |
| import java.util.concurrent.TimeUnit; |
| import java.util.concurrent.TimeoutException; |
| import java.util.regex.Pattern; |
| import java.util.stream.Collectors; |
| |
| import static com.mongodb.client.model.Sorts.ascending; |
| |
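| /** |
|  * Downloads the documents of the NODES collection from MongoDB and enqueues them, in batches, into the blocking |
|  * queue consumed by the transform stage of the pipelined indexing job. |
|  * <p> |
|  * A minimal usage sketch (illustrative only; the queue capacity, the batch limits and the {@code executorService}, |
|  * {@code mongoDatabase}, {@code mongoDocStore}, {@code pathFilters}, {@code statisticsProvider} and |
|  * {@code reporter} variables are assumptions, not values prescribed by this class): |
|  * <pre>{@code |
|  * BlockingQueue<NodeDocument[]> queue = new ArrayBlockingQueue<>(128); |
|  * PipelinedMongoDownloadTask task = new PipelinedMongoDownloadTask( |
|  *         mongoDatabase, mongoDocStore, |
|  *         32 * 1024 * 1024,  // maxBatchSizeBytes |
|  *         1024,              // maxBatchNumberOfDocuments |
|  *         queue, pathFilters, statisticsProvider, reporter); |
|  * Future<PipelinedMongoDownloadTask.Result> result = executorService.submit(task); |
|  * }</pre> |
|  */ |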
| public class PipelinedMongoDownloadTask implements Callable<PipelinedMongoDownloadTask.Result> { |
| |
| public static class Result { |
| private final long documentsDownloaded; |
| |
| public Result(long documentsDownloaded) { |
| this.documentsDownloaded = documentsDownloaded; |
| } |
| |
| public long getDocumentsDownloaded() { |
| return documentsDownloaded; |
| } |
| } |
| |
| private static final Logger LOG = LoggerFactory.getLogger(PipelinedMongoDownloadTask.class); |
| |
| /** |
| * Whether to retry on connection errors to MongoDB. |
|  * This property affects the query that is used to download the documents from MongoDB. If set to true, the query |
|  * traverses the results in order of the _modified property (an index scan), which allows it to resume from where |
|  * it left off after a connection failure. If set to false, it uses a potentially more efficient query that does |
|  * not impose any order on the results (a simple collection scan). |
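|  * <p> |
|  * An illustrative sketch of the two query shapes (not the exact code in this class): |
|  * <pre>{@code |
|  * // retryOnConnectionErrors = true: resumable index scan, ordered by (_modified, _id) |
|  * collection.find(filter).sort(Sorts.ascending("_modified", "_id")); |
|  * // retryOnConnectionErrors = false: unordered traversal, forced with the $natural hint |
|  * collection.find(filter).hint(BsonDocument.parse("{ $natural: 1 }")); |
|  * }</pre> |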
| */ |
| public static final String OAK_INDEXER_PIPELINED_RETRY_ON_CONNECTION_ERRORS = "oak.indexer.pipelined.retryOnConnectionErrors"; |
| public static final boolean DEFAULT_OAK_INDEXER_PIPELINED_RETRY_ON_CONNECTION_ERRORS = true; |
| public static final String OAK_INDEXER_PIPELINED_MONGO_CONNECTION_RETRY_SECONDS = "oak.indexer.pipelined.mongoConnectionRetrySeconds"; |
| public static final int DEFAULT_OAK_INDEXER_PIPELINED_MONGO_CONNECTION_RETRY_SECONDS = 300; |
| |
| /** |
| * Whether to do path filtering in the Mongo query instead of doing a full traversal of the document store and |
| * filtering in the indexing job. This feature may significantly reduce the number of documents downloaded from |
| * Mongo. |
|  * The performance gains may not be proportional to the reduction in the number of documents downloaded, because |
|  * Mongo still has to traverse all the documents. This is required because the regex expression used for path |
|  * filtering starts with a wildcard (the _id starts with the depth of the path, so the regex must skip over this |
|  * prefix). Because of the wildcard at the start, Mongo cannot use the index on _id. |
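|  * <p> |
|  * For example (illustrative), filtering on the subtree /content/dam produces a regex on _id of the form |
|  * {@code ^[0-9]{1,3}:\Q/content/dam/\E.*$}, whose variable depth prefix prevents Mongo from using the index on _id. |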
| */ |
| public static final String OAK_INDEXER_PIPELINED_MONGO_REGEX_PATH_FILTERING = "oak.indexer.pipelined.mongoRegexPathFiltering"; |
| public static final boolean DEFAULT_OAK_INDEXER_PIPELINED_MONGO_REGEX_PATH_FILTERING = false; |
| /** |
|  * Any document with a path that matches this regex pattern will not be downloaded. This pattern is included |
|  * in the Mongo query, that is, the filtering is done server-side by Mongo, which avoids downloading the documents |
|  * matching this pattern. This is typically a _suffix_, for example "/metadata.xml$|/renditions/.*.jpg$". |
| * To exclude subtrees such as /content/abc, use mongoFilterPaths instead. |
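|  * <p> |
|  * A sketch of how this property could be set (the value is only an example): |
|  * <pre>{@code |
|  * -Doak.indexer.pipelined.mongoCustomExcludeEntriesRegex="/metadata.xml$|/renditions/.*.jpg$" |
|  * }</pre> |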
| */ |
| public static final String OAK_INDEXER_PIPELINED_MONGO_CUSTOM_EXCLUDE_ENTRIES_REGEX = "oak.indexer.pipelined.mongoCustomExcludeEntriesRegex"; |
| public static final String DEFAULT_OAK_INDEXER_PIPELINED_MONGO_CUSTOM_EXCLUDE_ENTRIES_REGEX = ""; |
| |
| /** |
|  * Maximum number of elements in the included/excluded paths list used for regex path filtering. If, after |
|  * merging and de-duplication of the paths of all the path filters, the number of included or excluded paths |
|  * exceeds this value, then path filtering is disabled to avoid creating Mongo queries with a large number of |
|  * filters. |
| */ |
| public static final String OAK_INDEXER_PIPELINED_MONGO_REGEX_PATH_FILTERING_MAX_PATHS = "oak.indexer.pipelined.mongoRegexPathFilteringMaxPaths"; |
| public static final int DEFAULT_OAK_INDEXER_PIPELINED_MONGO_REGEX_PATH_FILTERING_MAX_PATHS = 20; |
| |
| |
| // Use a short initial retry interval. In most cases, if the connection to a replica fails, other replicas will |
| // be available, so a reconnection attempt will succeed immediately. |
| private final static long retryInitialIntervalMillis = 100; |
| private final static long retryMaxIntervalMillis = 10_000; |
| |
| // TODO: Revise this timeout. It is used to prevent the indexer from blocking forever if the queue is full. |
| private static final Duration MONGO_QUEUE_OFFER_TIMEOUT = Duration.ofMinutes(30); |
| private static final int MIN_INTERVAL_BETWEEN_DELAYED_ENQUEUING_MESSAGES = 10; |
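| // Query hints: "{ $natural: 1 }" forces a full collection scan in natural order, while "{ _id: 1 }" forces |
| // the query to use the default index on _id. |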
| private final static BsonDocument NATURAL_HINT = BsonDocument.parse("{ $natural: 1 }"); |
| private final static BsonDocument ID_INDEX_HINT = BsonDocument.parse("{ _id: 1 }"); |
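| // Matches the _id of documents with a long path: a numeric depth prefix followed by ":h" and the hash of the |
| // path, e.g. "3:h<hash>" (see createPathFilter below). |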
| final static Pattern LONG_PATH_ID_PATTERN = Pattern.compile("^[0-9]{1,3}:h.*$"); |
| |
| private static final String THREAD_NAME = "mongo-dump"; |
| |
| /** |
|  * Creates the filter to be used in the Mongo query. |
| * |
| * @param mongoFilterPaths The paths to be included/excluded in the filter. These define subtrees to be included or excluded. |
| * (see {@link MongoFilterPaths} for details) |
| * @param customExcludeEntriesRegex Documents with paths matching this regex are excluded from download |
| * @param queryUsesIndexTraversal Whether the query will use an index to traverse the documents. |
| * @return The filter to be used in the Mongo query, or null if no filter is required |
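|  * <p> |
|  * A sketch of the shape of the returned filter, assuming one included path, one excluded path and no custom |
|  * regex; the {@code *Patterns} and {@code *PathFilter} variables are placeholders for the patterns and filters |
|  * built by this class: |
|  * <pre>{@code |
|  * Filters.and( |
|  *         Filters.or(Filters.in("_id", includedIdPatterns), includedPathFilter), |
|  *         Filters.nor(Filters.or(Filters.in("_id", excludedIdPatterns), excludedPathFilter))); |
|  * }</pre> |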
| */ |
| static Bson computeMongoQueryFilter(@NotNull MongoFilterPaths mongoFilterPaths, String customExcludeEntriesRegex, boolean queryUsesIndexTraversal) { |
| var filters = new ArrayList<Bson>(); |
| |
| Bson includedFilter = descendantsFilter(mongoFilterPaths.included, queryUsesIndexTraversal); |
| if (includedFilter != null) { |
| filters.add(includedFilter); |
| } |
| |
| // The Mongo filter returned here will download the top-level node of each excluded subtree, which in theory |
| // should be excluded. That is, if the tree /a/b/c is excluded, the filter will download /a/b/c but none of |
| // its descendants. |
| // This is done because also excluding the top-level node would add extra complexity to the filter while having |
| // no measurable impact on performance, as it would only avoid downloading a few extra documents, one for each |
| // excluded subtree. The transform stage will filter out these paths anyway. |
| Bson excludedFilter = descendantsFilter(mongoFilterPaths.excluded, queryUsesIndexTraversal); |
| if (excludedFilter != null) { |
| filters.add(Filters.nor(excludedFilter)); |
| } |
| |
| // Custom regex filter to exclude paths |
| Bson customExcludedPathsFilter = createCustomExcludedEntriesFilter(customExcludeEntriesRegex, queryUsesIndexTraversal); |
| if (customExcludedPathsFilter != null) { |
| filters.add(customExcludedPathsFilter); |
| } |
| |
| if (filters.isEmpty()) { |
| return null; |
| } else if (filters.size() == 1) { |
| return filters.get(0); |
| } else { |
| return Filters.and(filters); |
| } |
| } |
| |
| static Bson createCustomExcludedEntriesFilter(String customRegexPattern, boolean queryUsesIndexTraversal) { |
| if (customRegexPattern == null || customRegexPattern.trim().isEmpty()) { |
| LOG.info("Mongo custom regex is disabled"); |
| return null; |
| } else { |
| LOG.info("Excluding nodes with paths matching regex: {}", customRegexPattern); |
| var pattern = Pattern.compile(customRegexPattern); |
| Bson pathFilter = createPathFilter(List.of(pattern), queryUsesIndexTraversal); |
| return Filters.nor(Filters.regex(NodeDocument.ID, pattern), pathFilter); |
| } |
| } |
| |
| private static Bson descendantsFilter(List<String> paths, boolean queryUsesIndexTraversal) { |
| if (paths.isEmpty()) { |
| return null; |
| } |
| if (paths.size() == 1 && paths.get(0).equals("/")) { |
| return null; |
| } |
| |
| // The filter for descendants of a list of paths is a disjunction ($or) of conditions. For each path, we have |
| // to build two conditions on two different fields of the documents: |
| // - _id - for non-long paths. In this case, the _id is of the form "2:/foo/bar". |
| // - _path - for long paths. In this case, the _id is a hash and the document contains an additional _path |
| //   field with the path of the document. |
| // We use the $in operator with a regular expression to match the paths. |
| // https://www.mongodb.com/docs/manual/reference/operator/query/in/#use-the--in-operator-with-a-regular-expression |
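| // For example, for the path "/a/b" the loop below produces: |
| //   idPatterns:   ^[0-9]{1,3}:\Q/a/b/\E.*$ |
| //   pathPatterns: ^\Q/a/b/\E.*$ |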
| ArrayList<Pattern> pathPatterns = new ArrayList<>(); |
| ArrayList<Pattern> idPatterns = new ArrayList<>(); |
| |
| for (String path : paths) { |
| if (!path.endsWith("/")) { |
| path = path + "/"; |
| } |
| String quotedPath = Pattern.quote(path); |
| idPatterns.add(Pattern.compile("^[0-9]{1,3}:" + quotedPath + ".*$")); |
| pathPatterns.add(Pattern.compile("^" + quotedPath + ".*$")); |
| } |
| |
| Bson pathFilter = createPathFilter(pathPatterns, queryUsesIndexTraversal); |
| return Filters.or(Filters.in(NodeDocument.ID, idPatterns), pathFilter); |
| } |
| |
| private static Bson createPathFilter(List<Pattern> pattern, boolean queryUsesIndexTraversal) { |
| // If a document has a long path, the _id is replaced by a hash and the path is stored in an additional _path field. |
| // When doing an index scan, it may be more efficient to first check that the _id has the format of a long-path |
| // id (that is, a numeric prefix followed by ":h") before checking the _path field. The _id |
| // is available from the index, while the _path field is only available on the document itself, so checking the |
| // _path forces an expensive retrieval of the full document. It is not guaranteed that Mongo will implement |
| // this optimization, but we add this additional check to allow MongoDB to apply it. |
| // If the query does a collection scan, then Mongo retrieves the full document anyway, so we can check the |
| // _path directly, which simplifies the query a bit. |
| if (queryUsesIndexTraversal) { |
| return Filters.and( |
| Filters.regex(NodeDocument.ID, LONG_PATH_ID_PATTERN), |
| Filters.in(NodeDocument.PATH, pattern) |
| ); |
| } else { |
| return Filters.in(NodeDocument.PATH, pattern); |
| } |
| } |
| |
| /** |
|  * Returns all the ancestor paths of the given list of paths. That is, if the list is ["/a/b/c", "/a/b/d"], |
|  * this method returns ["/", "/a", "/a/b", "/a/b/c", "/a/b/d"]. Note that the paths in the input list are also |
|  * returned, even though they are not strictly ancestors of themselves. |
| */ |
| static List<String> getAncestors(List<String> paths) { |
| TreeSet<String> ancestors = new TreeSet<>(); |
| for (String child : paths) { |
| String parent = child; |
| while (true) { |
| ancestors.add(parent); |
| if (PathUtils.denotesRoot(parent)) { |
| break; |
| } |
| parent = PathUtils.getParentPath(parent); |
| } |
| } |
| return new ArrayList<>(ancestors); |
| } |
| |
| |
| private static Bson ancestorsFilter(List<String> paths) { |
| List<String> parentFilters = getAncestors(paths).stream() |
| .map(Utils::getIdFromPath) |
| .collect(Collectors.toList()); |
| return Filters.in(NodeDocument.ID, parentFilters); |
| } |
| |
| private final int maxBatchNumberOfDocuments; |
| private final BlockingQueue<NodeDocument[]> mongoDocQueue; |
| private final List<PathFilter> pathFilters; |
| private final int retryDuringSeconds; |
| private final boolean retryOnConnectionErrors; |
| private final boolean regexPathFiltering; |
| private final Logger traversalLog = LoggerFactory.getLogger(PipelinedMongoDownloadTask.class.getName() + ".traversal"); |
| private final MongoCollection<NodeDocument> dbCollection; |
| private final ReadPreference readPreference; |
| private final Stopwatch downloadStartWatch = Stopwatch.createUnstarted(); |
| private final int maxBatchSizeBytes; |
| private final StatisticsProvider statisticsProvider; |
| private final IndexingReporter reporter; |
| private final MongoRegexPathFilterFactory regexPathFilterFactory; |
| private final String customExcludeEntriesRegex; |
| |
| private long totalEnqueueWaitTimeMillis = 0; |
| private Instant lastDelayedEnqueueWarningMessageLoggedTimestamp = Instant.now(); |
| private long documentsDownloadedTotal = 0; |
| private long documentsDownloadedTotalBytes = 0; |
| private long nextLastModified = 0; |
| private String lastIdDownloaded = null; |
| |
| public PipelinedMongoDownloadTask(MongoDatabase mongoDatabase, |
| MongoDocumentStore mongoDocStore, |
| int maxBatchSizeBytes, |
| int maxBatchNumberOfDocuments, |
| BlockingQueue<NodeDocument[]> queue, |
| List<PathFilter> pathFilters, |
| StatisticsProvider statisticsProvider, |
| IndexingReporter reporter) { |
| this.statisticsProvider = statisticsProvider; |
| this.reporter = reporter; |
| NodeDocumentCodecProvider nodeDocumentCodecProvider = new NodeDocumentCodecProvider(mongoDocStore, Collection.NODES); |
| CodecRegistry nodeDocumentCodecRegistry = CodecRegistries.fromRegistries( |
| CodecRegistries.fromProviders(nodeDocumentCodecProvider), |
| MongoClientSettings.getDefaultCodecRegistry() |
| ); |
| this.dbCollection = mongoDatabase |
| .withCodecRegistry(nodeDocumentCodecRegistry) |
| .getCollection(Collection.NODES.toString(), NodeDocument.class); |
| this.maxBatchSizeBytes = maxBatchSizeBytes; |
| this.maxBatchNumberOfDocuments = maxBatchNumberOfDocuments; |
| this.mongoDocQueue = queue; |
| this.pathFilters = pathFilters; |
| |
| // By default, retry for 5 minutes. |
| this.retryDuringSeconds = ConfigHelper.getSystemPropertyAsInt( |
| OAK_INDEXER_PIPELINED_MONGO_CONNECTION_RETRY_SECONDS, |
| DEFAULT_OAK_INDEXER_PIPELINED_MONGO_CONNECTION_RETRY_SECONDS); |
| Preconditions.checkArgument(retryDuringSeconds > 0, |
| "Property " + OAK_INDEXER_PIPELINED_MONGO_CONNECTION_RETRY_SECONDS + " must be > 0. Was: " + retryDuringSeconds); |
| this.reporter.addConfig(OAK_INDEXER_PIPELINED_MONGO_CONNECTION_RETRY_SECONDS, String.valueOf(retryDuringSeconds)); |
| |
| this.retryOnConnectionErrors = ConfigHelper.getSystemPropertyAsBoolean( |
| OAK_INDEXER_PIPELINED_RETRY_ON_CONNECTION_ERRORS, |
| DEFAULT_OAK_INDEXER_PIPELINED_RETRY_ON_CONNECTION_ERRORS); |
| this.reporter.addConfig(OAK_INDEXER_PIPELINED_RETRY_ON_CONNECTION_ERRORS, String.valueOf(retryOnConnectionErrors)); |
| |
| this.regexPathFiltering = ConfigHelper.getSystemPropertyAsBoolean( |
| OAK_INDEXER_PIPELINED_MONGO_REGEX_PATH_FILTERING, |
| DEFAULT_OAK_INDEXER_PIPELINED_MONGO_REGEX_PATH_FILTERING); |
| this.reporter.addConfig(OAK_INDEXER_PIPELINED_MONGO_REGEX_PATH_FILTERING, String.valueOf(regexPathFiltering)); |
| |
| int regexPathFilteringMaxNumberOfPaths = ConfigHelper.getSystemPropertyAsInt( |
| OAK_INDEXER_PIPELINED_MONGO_REGEX_PATH_FILTERING_MAX_PATHS, |
| DEFAULT_OAK_INDEXER_PIPELINED_MONGO_REGEX_PATH_FILTERING_MAX_PATHS); |
| this.reporter.addConfig(OAK_INDEXER_PIPELINED_MONGO_REGEX_PATH_FILTERING_MAX_PATHS, String.valueOf(regexPathFilteringMaxNumberOfPaths)); |
| this.regexPathFilterFactory = new MongoRegexPathFilterFactory(regexPathFilteringMaxNumberOfPaths); |
| |
| this.customExcludeEntriesRegex = ConfigHelper.getSystemPropertyAsString( |
| OAK_INDEXER_PIPELINED_MONGO_CUSTOM_EXCLUDE_ENTRIES_REGEX, |
| DEFAULT_OAK_INDEXER_PIPELINED_MONGO_CUSTOM_EXCLUDE_ENTRIES_REGEX |
| ); |
| this.reporter.addConfig(OAK_INDEXER_PIPELINED_MONGO_CUSTOM_EXCLUDE_ENTRIES_REGEX, customExcludeEntriesRegex); |
| |
| //TODO This may lead to reads being routed to a secondary, depending on the Mongo URI, |
| // so the caller must ensure that it is safe to read from a secondary. |
| // this.readPreference = MongoDocumentStoreHelper.getConfiguredReadPreference(mongoStore, collection); |
| this.readPreference = ReadPreference.secondaryPreferred(); |
| LOG.info("maxBatchSizeBytes: {}, maxBatchNumberOfDocuments: {}, readPreference: {}", |
| maxBatchSizeBytes, maxBatchNumberOfDocuments, readPreference.getName()); |
| } |
| |
| @Override |
| public Result call() throws Exception { |
| String originalName = Thread.currentThread().getName(); |
| Thread.currentThread().setName(THREAD_NAME); |
| LOG.info("[TASK:{}:START] Starting to download from MongoDB", THREAD_NAME.toUpperCase(Locale.ROOT)); |
| try { |
| this.nextLastModified = 0; |
| this.lastIdDownloaded = null; |
| |
| downloadStartWatch.start(); |
| if (retryOnConnectionErrors) { |
| downloadWithRetryOnConnectionErrors(); |
| } else { |
| downloadWithNaturalOrdering(); |
| } |
| |
| long durationMillis = downloadStartWatch.elapsed(TimeUnit.MILLISECONDS); |
| String enqueueingDelayPercentage = PipelinedUtils.formatAsPercentage(totalEnqueueWaitTimeMillis, durationMillis); |
| String metrics = MetricsFormatter.newBuilder() |
| .add("duration", FormattingUtils.formatToSeconds(downloadStartWatch)) |
| .add("durationSeconds", durationMillis / 1000) |
| .add("documentsDownloaded", documentsDownloadedTotal) |
| .add("documentsDownloadedTotalBytes", documentsDownloadedTotalBytes) |
| .add("dataDownloaded", IOUtils.humanReadableByteCountBin(documentsDownloadedTotalBytes)) |
| .add("enqueueingDelayMillis", totalEnqueueWaitTimeMillis) |
| .add("enqueueingDelayPercentage", enqueueingDelayPercentage) |
| .build(); |
| MetricsUtils.addMetric(statisticsProvider, reporter, PipelinedMetrics.OAK_INDEXER_PIPELINED_MONGO_DOWNLOAD_DURATION_SECONDS, durationMillis / 1000); |
| MetricsUtils.addMetric(statisticsProvider, reporter, PipelinedMetrics.OAK_INDEXER_PIPELINED_DOCUMENTS_DOWNLOADED_TOTAL, documentsDownloadedTotal); |
| MetricsUtils.addMetric(statisticsProvider, reporter, PipelinedMetrics.OAK_INDEXER_PIPELINED_MONGO_DOWNLOAD_ENQUEUE_DELAY_PERCENTAGE, |
| PipelinedUtils.toPercentage(totalEnqueueWaitTimeMillis, durationMillis) |
| ); |
| MetricsUtils.addMetricByteSize(statisticsProvider, reporter, PipelinedMetrics.OAK_INDEXER_PIPELINED_DOCUMENTS_DOWNLOADED_TOTAL_BYTES, |
| documentsDownloadedTotalBytes); |
| LOG.info("[TASK:{}:END] Metrics: {}", THREAD_NAME.toUpperCase(Locale.ROOT), metrics); |
| reporter.addTiming("Mongo dump", FormattingUtils.formatToSeconds(downloadStartWatch)); |
| return new Result(documentsDownloadedTotal); |
| } catch (InterruptedException t) { |
| LOG.warn("Thread interrupted", t); |
| throw t; |
| } catch (Throwable t) { |
| LOG.warn("Thread terminating with exception.", t); |
| throw t; |
| } finally { |
| Thread.currentThread().setName(originalName); |
| } |
| } |
| |
| private void reportProgress(String id) { |
| if (this.documentsDownloadedTotal % 10000 == 0) { |
| double rate = ((double) this.documentsDownloadedTotal) / downloadStartWatch.elapsed(TimeUnit.SECONDS); |
| String formattedRate = String.format(Locale.ROOT, "%1.2f nodes/s, %1.2f nodes/hr", rate, rate * 3600); |
| LOG.info("Dumping from NSET Traversed #{} {} [{}] (Elapsed {})", |
| this.documentsDownloadedTotal, id, formattedRate, FormattingUtils.formatToSeconds(downloadStartWatch)); |
| } |
| traversalLog.trace(id); |
| } |
| |
| private void downloadWithRetryOnConnectionErrors() throws InterruptedException, TimeoutException { |
| // If regex filtering is enabled, start by downloading the ancestors of the paths used for filtering. |
| // That is, for a base path of "/content/dam", download "/", "/content" and "/content/dam". These nodes will not |
| // be matched by the regex used in the Mongo query, which assumes a prefix of "???:/content/dam". |
| MongoFilterPaths mongoFilterPaths = getPathsForRegexFiltering(); |
| Bson mongoFilter = computeMongoQueryFilter(mongoFilterPaths, customExcludeEntriesRegex, true); |
| if (mongoFilter == null) { |
| LOG.info("Downloading full repository"); |
| } else { |
| LOG.info("Downloading from Mongo using filter: {}", mongoFilter); |
| // Regex path filtering is enabled. |
| // Download the ancestors in a separate query. No retries are done on this query, as it takes only a few |
| // seconds and runs at the start of the job, so if it fails, the job can be retried without losing much work. |
| downloadAncestors(mongoFilterPaths.included); |
| } |
| |
| Instant failuresStartTimestamp = null; // When the last series of failures started |
| long retryIntervalMs = retryInitialIntervalMillis; |
| int numberOfFailures = 0; |
| boolean downloadCompleted = false; |
| Map<String, Integer> exceptions = new HashMap<>(); |
| this.nextLastModified = 0; |
| this.lastIdDownloaded = null; |
| while (!downloadCompleted) { |
| try { |
| if (lastIdDownloaded != null) { |
| LOG.info("Recovering from broken connection, finishing downloading documents with _modified={}", nextLastModified); |
| downloadRange(new DownloadRange(nextLastModified, nextLastModified + 1, lastIdDownloaded), mongoFilter); |
| // We have managed to reconnect, reset the failure timestamp |
| failuresStartTimestamp = null; |
| numberOfFailures = 0; |
| // Continue downloading everything starting from the next _modified value |
| downloadRange(new DownloadRange(nextLastModified + 1, Long.MAX_VALUE, null), mongoFilter); |
| } else { |
| downloadRange(new DownloadRange(nextLastModified, Long.MAX_VALUE, null), mongoFilter); |
| } |
| downloadCompleted = true; |
| } catch (MongoException e) { |
| if (e instanceof MongoInterruptedException || e instanceof MongoIncompatibleDriverException) { |
| // Non-recoverable exceptions |
| throw e; |
| } |
| if (failuresStartTimestamp == null) { |
| failuresStartTimestamp = Instant.now().truncatedTo(ChronoUnit.SECONDS); |
| } |
| LOG.warn("Connection error downloading from MongoDB.", e); |
| long secondsSinceStartOfFailures = Duration.between(failuresStartTimestamp, Instant.now()).toSeconds(); |
| if (secondsSinceStartOfFailures > retryDuringSeconds) { |
| // Give up. Get a string of all exceptions that were thrown |
| StringBuilder summary = new StringBuilder(); |
| for (Map.Entry<String, Integer> entry : exceptions.entrySet()) { |
| summary.append("\n\t").append(entry.getValue()).append("x: ").append(entry.getKey()); |
| } |
| throw new RetryException(retryDuringSeconds, summary.toString(), e); |
| } else { |
| numberOfFailures++; |
| LOG.warn("Retrying download in {} ms; number of times failed: {}; current series of failures started at: {} ({} seconds ago)", |
| retryIntervalMs, numberOfFailures, failuresStartTimestamp, secondsSinceStartOfFailures); |
| exceptions.compute(e.getClass().getSimpleName() + " - " + e.getMessage(), |
| (key, val) -> val == null ? 1 : val + 1 |
| ); |
| Thread.sleep(retryIntervalMs); |
| // simple exponential backoff mechanism |
| retryIntervalMs = Math.min(retryMaxIntervalMillis, retryIntervalMs * 2); |
| } |
| } |
| } |
| } |
| |
| private void downloadRange(DownloadRange range, Bson filter) throws InterruptedException, TimeoutException { |
| Bson findQuery = range.getFindQuery(); |
| if (filter != null) { |
| findQuery = Filters.and(findQuery, filter); |
| } |
| LOG.info("Traversing: {}. Query: {}", range, findQuery); |
| FindIterable<NodeDocument> mongoIterable = dbCollection |
| .withReadPreference(readPreference) |
| .find(findQuery) |
| .sort(ascending(NodeDocument.MODIFIED_IN_SECS, NodeDocument.ID)); |
| download(mongoIterable); |
| } |
| |
| private void downloadAncestors(List<String> basePath) throws InterruptedException, TimeoutException { |
| if (basePath.size() == 1 && basePath.get(0).equals("/")) { |
| return; // No need to download the ancestors of the root; the root is downloaded as part of the normal traversal |
| } |
| Bson ancestorQuery = ancestorsFilter(basePath); |
| LOG.info("Downloading ancestors of: {}, Query: {}.", basePath, ancestorQuery); |
| FindIterable<NodeDocument> ancestorsIterable = dbCollection |
| .withReadPreference(readPreference) |
| .find(ancestorQuery) |
| // Use the index on _id: this query returns very few documents and the filter condition is on _id. |
| .hint(ID_INDEX_HINT); |
| download(ancestorsIterable); |
| } |
| |
| private void downloadWithNaturalOrdering() throws InterruptedException, TimeoutException { |
| // We are potentially downloading a large fraction of the repository, so an index scan would be inefficient. |
| // We pass the $natural hint to force MongoDB to use natural ordering, that is, a collection scan. |
| MongoFilterPaths mongoFilterPaths = getPathsForRegexFiltering(); |
| Bson mongoFilter = computeMongoQueryFilter(mongoFilterPaths, customExcludeEntriesRegex, false); |
| if (mongoFilter == null) { |
| LOG.info("Downloading full repository from Mongo with natural order"); |
| FindIterable<NodeDocument> mongoIterable = dbCollection |
| .withReadPreference(readPreference) |
| .find() |
| .hint(NATURAL_HINT); |
| download(mongoIterable); |
| |
| } else { |
| downloadAncestors(mongoFilterPaths.included); |
| |
| LOG.info("Downloading from Mongo with natural order using filter: {}", mongoFilter); |
| FindIterable<NodeDocument> findIterable = dbCollection |
| .withReadPreference(readPreference) |
| .find(mongoFilter) |
| .hint(NATURAL_HINT); |
| download(findIterable); |
| } |
| } |
| |
| private MongoFilterPaths getPathsForRegexFiltering() { |
| if (!regexPathFiltering) { |
| LOG.info("Regex path filtering disabled."); |
| return MongoFilterPaths.DOWNLOAD_ALL; |
| } else { |
| LOG.info("Computing included/excluded paths for Mongo regex path filtering. PathFilters: {}", |
| pathFilters.stream() |
| .map(pf -> "PF{includedPaths=" + pf.getIncludedPaths() + ", excludedPaths=" + pf.getExcludedPaths() + "}") |
| .collect(Collectors.joining(", ")) |
| ); |
| MongoFilterPaths mongoFilterPaths = this.regexPathFilterFactory.buildMongoFilter(pathFilters); |
| LOG.info("Paths used for regex filtering on Mongo: {}", mongoFilterPaths); |
| return mongoFilterPaths; |
| } |
| } |
| |
| private void download(FindIterable<NodeDocument> mongoIterable) throws InterruptedException, TimeoutException { |
| try (MongoCursor<NodeDocument> cursor = mongoIterable.iterator()) { |
| NodeDocument[] batch = new NodeDocument[maxBatchNumberOfDocuments]; |
| int nextIndex = 0; |
| int batchSize = 0; |
| try { |
| while (cursor.hasNext()) { |
| NodeDocument next = cursor.next(); |
| String id = next.getId(); |
| // If we are retrying on connection errors, we need to keep track of the last _modified value |
| if (retryOnConnectionErrors) { |
| this.nextLastModified = next.getModified(); |
| } |
| this.lastIdDownloaded = id; |
| this.documentsDownloadedTotal++; |
| reportProgress(id); |
| |
| batch[nextIndex] = next; |
| nextIndex++; |
| int docSize = (int) next.remove(NodeDocumentCodec.SIZE_FIELD); |
| batchSize += docSize; |
| documentsDownloadedTotalBytes += docSize; |
| if (batchSize >= maxBatchSizeBytes || nextIndex == batch.length) { |
| LOG.trace("Enqueuing block with {} elements, estimated size: {} bytes", nextIndex, batchSize); |
| tryEnqueueCopy(batch, nextIndex); |
| nextIndex = 0; |
| batchSize = 0; |
| } |
| } |
| if (nextIndex > 0) { |
| LOG.info("Enqueueing last block with {} elements, estimated size: {}", |
| nextIndex, IOUtils.humanReadableByteCountBin(batchSize)); |
| tryEnqueueCopy(batch, nextIndex); |
| } |
| } catch (MongoException e) { |
| if (e instanceof MongoInterruptedException || e instanceof MongoIncompatibleDriverException) { |
| // Non-recoverable exceptions |
| throw e; |
| } |
| // There may be some documents in the current batch; enqueue them and rethrow the exception |
| if (nextIndex > 0) { |
| LOG.info("Connection interrupted with recoverable failure. Enqueueing partial block with {} elements, estimated size: {}", |
| nextIndex, IOUtils.humanReadableByteCountBin(batchSize)); |
| tryEnqueueCopy(batch, nextIndex); |
| } |
| throw e; |
| } |
| } |
| } |
| |
| private void tryEnqueueCopy(NodeDocument[] batch, int nextIndex) throws TimeoutException, InterruptedException { |
| NodeDocument[] copyOfBatch = Arrays.copyOfRange(batch, 0, nextIndex); |
| Stopwatch enqueueDelayStopwatch = Stopwatch.createStarted(); |
| if (!mongoDocQueue.offer(copyOfBatch, MONGO_QUEUE_OFFER_TIMEOUT.toMillis(), TimeUnit.MILLISECONDS)) { |
| throw new TimeoutException("Timeout trying to enqueue batch of MongoDB documents. Waited " + MONGO_QUEUE_OFFER_TIMEOUT); |
| } |
| long enqueueDelay = enqueueDelayStopwatch.elapsed(TimeUnit.MILLISECONDS); |
| totalEnqueueWaitTimeMillis += enqueueDelay; |
| if (enqueueDelay > 1) { |
| logWithRateLimit(() -> |
| LOG.info("Enqueuing of Mongo document batch was delayed, took {} ms. mongoDocQueue size {}. " + |
| "Consider increasing the number of Transform threads. " + |
| "(This message is logged at most once every {} seconds)", |
| enqueueDelay, mongoDocQueue.size(), MIN_INTERVAL_BETWEEN_DELAYED_ENQUEUING_MESSAGES) |
| ); |
| } |
| } |
| |
| private void logWithRateLimit(Runnable f) { |
| Instant now = Instant.now(); |
| if (Duration.between(lastDelayedEnqueueWarningMessageLoggedTimestamp, now).toSeconds() > MIN_INTERVAL_BETWEEN_DELAYED_ENQUEUING_MESSAGES) { |
| f.run(); |
| lastDelayedEnqueueWarningMessageLoggedTimestamp = now; |
| } |
| } |
| |
| private static class RetryException extends RuntimeException { |
| |
| private final int retrialDurationSeconds; |
| |
| public RetryException(int retrialDurationSeconds, String message, Throwable cause) { |
| super(message, cause); |
| this.retrialDurationSeconds = retrialDurationSeconds; |
| } |
| |
| @Override |
| public String toString() { |
| return "Tried for " + retrialDurationSeconds + " seconds: \n" + super.toString(); |
| } |
| } |
| } |