tika-batch/src/main/java/org/apache/tika/batch/fs/FSDirectoryCrawler.java - tika - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.tika.batch.fs;


 import java.io.IOException;
 import java.nio.file.DirectoryStream;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.util.ArrayList;
 import java.util.Collections;
 import java.util.Comparator;
 import java.util.LinkedList;
 import java.util.List;
 import java.util.concurrent.ArrayBlockingQueue;

 import org.apache.tika.batch.FileResource;
 import org.apache.tika.batch.FileResourceCrawler;

 /**
  * Crawls a directory tree on the file system and hands every readable, non-directory
  * entry to {@code tryToAdd} wrapped in an {@link FSFileResource}.
  * <p>
  * Within a directory, files are handled first; subdirectories are descended into
  * afterwards.
  */
 public class FSDirectoryCrawler extends FileResourceCrawler {

     /** Path handed to every {@link FSFileResource} together with the file's own path. */
     private final Path root;
     /** Directory at which the crawl begins. */
     private final Path startDirectory;
     private final Comparator<Path> pathComparator = new FileNameComparator();
     private final CRAWL_ORDER crawlOrder;

     /**
      * Creates a crawler that starts crawling at {@code root}.
      *
      * @param fileQueue    queue passed through to {@link FileResourceCrawler}
      * @param numConsumers consumer count passed through to {@link FileResourceCrawler}
      * @param root         root directory; also used as the start directory
      * @param crawlOrder   order in which the entries of each directory are visited
      * @throws RuntimeException if {@code root} is not a directory
      */
     public FSDirectoryCrawler(ArrayBlockingQueue<FileResource> fileQueue, int numConsumers,
                               Path root, CRAWL_ORDER crawlOrder) {
         this(fileQueue, numConsumers, root, root, crawlOrder);
     }

     /**
      * Creates a crawler that starts crawling at {@code startDirectory}.
      *
      * @param fileQueue      queue passed through to {@link FileResourceCrawler}
      * @param numConsumers   consumer count passed through to {@link FileResourceCrawler}
      * @param root           root directory handed to every {@link FSFileResource}
      * @param startDirectory directory to start crawling at; expected to lie under
      *                       {@code root}
      * @param crawlOrder     order in which the entries of each directory are visited
      * @throws RuntimeException if {@code startDirectory} is not a directory
      */
     public FSDirectoryCrawler(ArrayBlockingQueue<FileResource> fileQueue, int numConsumers,
                               Path root, Path startDirectory, CRAWL_ORDER crawlOrder) {
         super(fileQueue, numConsumers);
         this.root = root;
         this.startDirectory = startDirectory;
         this.crawlOrder = crawlOrder;
         //only verified when the JVM runs with assertions enabled (-ea)
         assert (startDirectory.toAbsolutePath().startsWith(root.toAbsolutePath()));

         if (!Files.isDirectory(startDirectory)) {
             throw new RuntimeException(
                     "Crawler couldn't find this directory:" + startDirectory.toAbsolutePath());
         }
     }

     /**
      * Crawls the start directory and everything beneath it.
      *
      * @throws InterruptedException if the crawling thread is interrupted
      */
     public void start() throws InterruptedException {
         addFiles(startDirectory);
     }

     /**
      * Recursively adds the contents of {@code directory}: its files first, then its
      * subdirectories.
      *
      * @param directory directory to process; {@code null} is logged and ignored
      * @return {@code false} once {@code tryToAdd} has answered {@code STOP_NOW}, so that
      *         every enclosing recursion level stops too; {@code true} otherwise
      * @throws InterruptedException if the crawling thread is interrupted
      */
     private boolean addFiles(Path directory) throws InterruptedException {

         if (directory == null) {
             LOG.warn("FSFileAdder asked to process null directory?!");
             return true;
         }

         List<Path> files = new ArrayList<>();
         boolean readFailed = false;
         try (DirectoryStream<Path> ds = Files.newDirectoryStream(directory)) {
             for (Path p : ds) {
                 files.add(p);
             }
         } catch (IOException | java.nio.file.DirectoryIteratorException e) {
             //I/O errors hit while iterating surface as the unchecked
             //DirectoryIteratorException; treat them like a failure to open the directory
             readFailed = true;
             LOG.warn("FSFileAdder couldn't read {}: {}", directory.toAbsolutePath(),
                     e.getMessage(), e);
         }
         if (files.isEmpty()) {
             //an unreadable directory has already been reported above; don't call it empty
             if (!readFailed) {
                 LOG.info("Empty directory: {}", directory.toAbsolutePath());
             }
             return true;
         }

         if (crawlOrder == CRAWL_ORDER.RANDOM) {
             Collections.shuffle(files);
         } else if (crawlOrder == CRAWL_ORDER.SORTED) {
             files.sort(pathComparator);
         }
         //OS_ORDER: keep whatever order the directory stream delivered

         int numFiles = 0;
         List<Path> directories = new LinkedList<>();
         for (Path f : files) {
             if (Thread.currentThread().isInterrupted()) {
                 throw new InterruptedException("file adder interrupted");
             }
             if (!Files.isReadable(f)) {
                 LOG.warn("Skipping -- {} -- file/directory is not readable", f.toAbsolutePath());
                 continue;
             }
             if (Files.isDirectory(f)) {
                 //subdirectories are visited only after all files of this directory
                 directories.add(f);
                 continue;
             }
             numFiles++;
             if (numFiles == 1) {
                 handleFirstFileInDirectory(f);
             }
             int added = tryToAdd(new FSFileResource(root, f));
             if (added == FileResourceCrawler.STOP_NOW) {
                 LOG.debug("crawler has hit a limit: {} : {}", f.toAbsolutePath(), added);
                 //NOTE(review): assumes STOP_NOW is final for the whole crawl, not just
                 //for this directory -- confirm against FileResourceCrawler
                 return false;
             }
             LOG.debug("trying to add: {} : {}", f.toAbsolutePath(), added);
         }

         for (Path f : directories) {
             if (!addFiles(f)) {
                 return false;
             }
         }
         return true;
     }

     /**
      * Override this if you have any special handling
      * for the first actual file that the crawler comes across
      * in a directory.  For example, it might be handy to call
      * mkdirs() on an output directory if your FileResourceConsumers
      * are writing to a file.
      *
      * @param f file to handle
      */
     public void handleFirstFileInDirectory(Path f) {
         //no-op
     }

     /** Order in which the entries of a single directory are visited. */
     public enum CRAWL_ORDER {
         SORTED, //alphabetical order; necessary for cross-platform unit tests
         RANDOM, //shuffle
         OS_ORDER //operating system chooses
     }

     //simple lexical order for the file name, we don't really care about localization.
     //we do want this, though, because file.compareTo behaves differently
     //on different OS's.
     private static class FileNameComparator implements Comparator<Path> {

         @Override
         public int compare(Path f1, Path f2) {
             return nameOf(f1).compareTo(nameOf(f2));
         }

         //a null path, or a path without a file name (e.g. a root), counts as the empty
         //name; this keeps the ordering consistent instead of answering "equal" for any
         //comparison that involves null
         private static String nameOf(Path p) {
             if (p == null) {
                 return "";
             }
             Path name = p.getFileName();
             return name == null ? "" : name.toString();
         }
     }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package org.apache.tika.batch.fs;


	import java.io.IOException;
	import java.nio.file.DirectoryStream;
	import java.nio.file.Files;
	import java.nio.file.Path;
	import java.util.ArrayList;
	import java.util.Collections;
	import java.util.Comparator;
	import java.util.LinkedList;
	import java.util.List;
	import java.util.concurrent.ArrayBlockingQueue;

	import org.apache.tika.batch.FileResource;
	import org.apache.tika.batch.FileResourceCrawler;

	public class FSDirectoryCrawler extends FileResourceCrawler {

	private final Path root;
	private final Path startDirectory;
	private final Comparator<Path> pathComparator = new FileNameComparator();
	private CRAWL_ORDER crawlOrder;

	public FSDirectoryCrawler(ArrayBlockingQueue<FileResource> fileQueue, int numConsumers,
	Path root, CRAWL_ORDER crawlOrder) {
	super(fileQueue, numConsumers);
	this.root = root;
	this.startDirectory = root;
	this.crawlOrder = crawlOrder;
	if (!Files.isDirectory(startDirectory)) {
	throw new RuntimeException(
	"Crawler couldn't find this directory:" + startDirectory.toAbsolutePath());
	}

	}

	public FSDirectoryCrawler(ArrayBlockingQueue<FileResource> fileQueue, int numConsumers,
	Path root, Path startDirectory, CRAWL_ORDER crawlOrder) {
	super(fileQueue, numConsumers);
	this.root = root;
	this.startDirectory = startDirectory;
	this.crawlOrder = crawlOrder;
	assert (startDirectory.toAbsolutePath().startsWith(root.toAbsolutePath()));

	if (!Files.isDirectory(startDirectory)) {
	throw new RuntimeException(
	"Crawler couldn't find this directory:" + startDirectory.toAbsolutePath());
	}
	}

	public void start() throws InterruptedException {
	addFiles(startDirectory);
	}

	private void addFiles(Path directory) throws InterruptedException {

	if (directory == null) {
	LOG.warn("FSFileAdder asked to process null directory?!");
	return;
	}

	List<Path> files = new ArrayList<>();
	try (DirectoryStream<Path> ds = Files.newDirectoryStream(directory)) {
	for (Path p : ds) {
	files.add(p);
	}
	} catch (IOException e) {
	LOG.warn("FSFileAdder couldn't read {}: {}", directory.toAbsolutePath(), e.getMessage(),
	e);
	}
	if (files.size() == 0) {
	LOG.info("Empty directory: {}", directory.toAbsolutePath());
	return;
	}


	if (crawlOrder == CRAWL_ORDER.RANDOM) {
	Collections.shuffle(files);
	} else if (crawlOrder == CRAWL_ORDER.SORTED) {
	files.sort(pathComparator);
	}

	int numFiles = 0;
	List<Path> directories = new LinkedList<>();
	for (Path f : files) {
	if (Thread.currentThread().isInterrupted()) {
	throw new InterruptedException("file adder interrupted");
	}
	if (!Files.isReadable(f)) {
	LOG.warn("Skipping -- {} -- file/directory is not readable", f.toAbsolutePath());
	continue;
	}
	if (Files.isDirectory(f)) {
	directories.add(f);
	continue;
	}
	numFiles++;
	if (numFiles == 1) {
	handleFirstFileInDirectory(f);
	}
	int added = tryToAdd(new FSFileResource(root, f));
	if (added == FileResourceCrawler.STOP_NOW) {
	LOG.debug("crawler has hit a limit: {} : {}", f.toAbsolutePath(), added);
	return;
	}
	LOG.debug("trying to add: {} : {}", f.toAbsolutePath(), added);
	}

	for (Path f : directories) {
	addFiles(f);
	}
	}

	/**
	* Override this if you have any special handling
	* for the first actual file that the crawler comes across
	* in a directory. For example, it might be handy to call
	* mkdirs() on an output directory if your FileResourceConsumers
	* are writing to a file.
	*
	* @param f file to handle
	*/
	public void handleFirstFileInDirectory(Path f) {
	//no-op
	}

	public enum CRAWL_ORDER {
	SORTED, //alphabetical order; necessary for cross-platform unit tests
	RANDOM, //shuffle
	OS_ORDER //operating system chooses
	}

	//simple lexical order for the file name, we don't really care about localization.
	//we do want this, though, because file.compareTo behaves differently
	//on different OS's.
	private static class FileNameComparator implements Comparator<Path> {

	@Override
	public int compare(Path f1, Path f2) {
	if (f1 == null \|\| f2 == null) {
	return 0;
	}
	return f1.getFileName().toString().compareTo(f2.getFileName().toString());
	}
	}
	}