| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.tika.batch.fs; |
| |
| |
| import java.io.IOException; |
| import java.nio.file.DirectoryStream; |
| import java.nio.file.Files; |
| import java.nio.file.Path; |
| import java.util.ArrayList; |
| import java.util.Collections; |
| import java.util.Comparator; |
| import java.util.LinkedList; |
| import java.util.List; |
| import java.util.concurrent.ArrayBlockingQueue; |
| |
| import org.apache.tika.batch.FileResource; |
| import org.apache.tika.batch.FileResourceCrawler; |
| |
| public class FSDirectoryCrawler extends FileResourceCrawler { |
| |
| private final Path root; |
| private final Path startDirectory; |
| private final Comparator<Path> pathComparator = new FileNameComparator(); |
| private CRAWL_ORDER crawlOrder; |
| |
| public FSDirectoryCrawler(ArrayBlockingQueue<FileResource> fileQueue, int numConsumers, |
| Path root, CRAWL_ORDER crawlOrder) { |
| super(fileQueue, numConsumers); |
| this.root = root; |
| this.startDirectory = root; |
| this.crawlOrder = crawlOrder; |
| if (!Files.isDirectory(startDirectory)) { |
| throw new RuntimeException( |
| "Crawler couldn't find this directory:" + startDirectory.toAbsolutePath()); |
| } |
| |
| } |
| |
| public FSDirectoryCrawler(ArrayBlockingQueue<FileResource> fileQueue, int numConsumers, |
| Path root, Path startDirectory, CRAWL_ORDER crawlOrder) { |
| super(fileQueue, numConsumers); |
| this.root = root; |
| this.startDirectory = startDirectory; |
| this.crawlOrder = crawlOrder; |
| assert (startDirectory.toAbsolutePath().startsWith(root.toAbsolutePath())); |
| |
| if (!Files.isDirectory(startDirectory)) { |
| throw new RuntimeException( |
| "Crawler couldn't find this directory:" + startDirectory.toAbsolutePath()); |
| } |
| } |
| |
| public void start() throws InterruptedException { |
| addFiles(startDirectory); |
| } |
| |
| private void addFiles(Path directory) throws InterruptedException { |
| |
| if (directory == null) { |
| LOG.warn("FSFileAdder asked to process null directory?!"); |
| return; |
| } |
| |
| List<Path> files = new ArrayList<>(); |
| try (DirectoryStream<Path> ds = Files.newDirectoryStream(directory)) { |
| for (Path p : ds) { |
| files.add(p); |
| } |
| } catch (IOException e) { |
| LOG.warn("FSFileAdder couldn't read {}: {}", directory.toAbsolutePath(), e.getMessage(), |
| e); |
| } |
| if (files.size() == 0) { |
| LOG.info("Empty directory: {}", directory.toAbsolutePath()); |
| return; |
| } |
| |
| |
| if (crawlOrder == CRAWL_ORDER.RANDOM) { |
| Collections.shuffle(files); |
| } else if (crawlOrder == CRAWL_ORDER.SORTED) { |
| files.sort(pathComparator); |
| } |
| |
| int numFiles = 0; |
| List<Path> directories = new LinkedList<>(); |
| for (Path f : files) { |
| if (Thread.currentThread().isInterrupted()) { |
| throw new InterruptedException("file adder interrupted"); |
| } |
| if (!Files.isReadable(f)) { |
| LOG.warn("Skipping -- {} -- file/directory is not readable", f.toAbsolutePath()); |
| continue; |
| } |
| if (Files.isDirectory(f)) { |
| directories.add(f); |
| continue; |
| } |
| numFiles++; |
| if (numFiles == 1) { |
| handleFirstFileInDirectory(f); |
| } |
| int added = tryToAdd(new FSFileResource(root, f)); |
| if (added == FileResourceCrawler.STOP_NOW) { |
| LOG.debug("crawler has hit a limit: {} : {}", f.toAbsolutePath(), added); |
| return; |
| } |
| LOG.debug("trying to add: {} : {}", f.toAbsolutePath(), added); |
| } |
| |
| for (Path f : directories) { |
| addFiles(f); |
| } |
| } |
| |
| /** |
| * Override this if you have any special handling |
| * for the first actual file that the crawler comes across |
| * in a directory. For example, it might be handy to call |
| * mkdirs() on an output directory if your FileResourceConsumers |
| * are writing to a file. |
| * |
| * @param f file to handle |
| */ |
| public void handleFirstFileInDirectory(Path f) { |
| //no-op |
| } |
| |
| public enum CRAWL_ORDER { |
| SORTED, //alphabetical order; necessary for cross-platform unit tests |
| RANDOM, //shuffle |
| OS_ORDER //operating system chooses |
| } |
| |
| //simple lexical order for the file name, we don't really care about localization. |
| //we do want this, though, because file.compareTo behaves differently |
| //on different OS's. |
| private static class FileNameComparator implements Comparator<Path> { |
| |
| @Override |
| public int compare(Path f1, Path f2) { |
| if (f1 == null || f2 == null) { |
| return 0; |
| } |
| return f1.getFileName().toString().compareTo(f2.getFileName().toString()); |
| } |
| } |
| } |