// blob: a03354f2d2f8fe093e990dd267e8d811c992adb6 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.handler.dataimport;
import java.io.File;
import java.io.FilenameFilter;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.TimeZone;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.solr.util.DateMathParser;
/**
* <p>
* An {@link EntityProcessor} instance which can stream file names found in a given base
* directory matching patterns and returning rows containing file information.
* </p>
* <p>
* It supports querying a given base directory by matching:
* <ul>
* <li>regular expressions to file names</li>
* <li>excluding certain files based on regular expression</li>
* <li>last modification date (newer or older than a given date or time)</li>
* <li>size (bigger or smaller than size given in bytes)</li>
* <li>recursively iterating through sub-directories</li>
* </ul>
* Its output can be used along with {@link FileDataSource} to read from files in file
* systems.
* <p>
* Refer to <a
* href="http://wiki.apache.org/solr/DataImportHandler">http://wiki.apache.org/solr/DataImportHandler</a>
* for more details.
* </p>
* <p>
* <b>This API is experimental and may change in the future.</b>
*
* @since solr 1.3
* @see Pattern
*/
public class FileListEntityProcessor extends EntityProcessorBase {
  /**
   * A regex pattern to identify files given in data-config.xml after resolving any variables
   */
  protected String fileName;

  /**
   * The baseDir given in data-config.xml after resolving any variables
   */
  protected String baseDir;

  /**
   * A Regex pattern of excluded file names as given in data-config.xml after resolving any variables
   */
  protected String excludes;

  /**
   * The newerThan given in data-config as a {@link java.util.Date}
   * <p>
   * <b>Note: </b> This variable is resolved just-in-time in the {@link #nextRow()} method.
   * </p>
   */
  protected Date newerThan;

  /**
   * The olderThan given in data-config as a {@link java.util.Date}
   * <p>
   * <b>Note: </b> This variable is resolved just-in-time in the {@link #nextRow()} method.
   * </p>
   */
  protected Date olderThan;

  /**
   * The biggerThan given in data-config as a long value. {@code -1} means "no lower size bound".
   * <p>
   * <b>Note: </b> This variable is resolved just-in-time in the {@link #nextRow()} method.
   * </p>
   */
  protected long biggerThan = -1;

  /**
   * The smallerThan given in data-config as a long value. {@code -1} means "no upper size bound".
   * <p>
   * <b>Note: </b> This variable is resolved just-in-time in the {@link #nextRow()} method.
   * </p>
   */
  protected long smallerThan = -1;

  /**
   * The recursive given in data-config. Default value is false.
   */
  protected boolean recursive = false;

  /** Compiled forms of {@link #fileName} and {@link #excludes}; {@code null} when not configured. */
  private Pattern fileNamePattern, excludesPattern;

  /**
   * Reads the static entity attributes (fileName, baseDir, recursive, excludes), resolves any
   * tokens in them and compiles the two regular expressions.
   *
   * @param context the context supplying the entity attributes
   * @throws DataImportHandlerException if 'baseDir' is missing or does not denote a directory
   */
  @Override
  public void init(Context context) {
    super.init(context);

    fileName = context.getEntityAttribute(FILE_NAME);
    if (fileName != null) {
      fileName = context.replaceTokens(fileName);
      fileNamePattern = Pattern.compile(fileName);
    }

    baseDir = context.getEntityAttribute(BASE_DIR);
    if (baseDir == null) {
      throw new DataImportHandlerException(DataImportHandlerException.SEVERE,
          "'baseDir' is a required attribute");
    }
    baseDir = context.replaceTokens(baseDir);
    File dir = new File(baseDir);
    if (!dir.isDirectory()) {
      throw new DataImportHandlerException(DataImportHandlerException.SEVERE,
          "'baseDir' value: " + baseDir + " is not a directory");
    }

    String r = context.getEntityAttribute(RECURSIVE);
    if (r != null) {
      recursive = Boolean.parseBoolean(r);
    }

    excludes = context.getEntityAttribute(EXCLUDES);
    if (excludes != null) {
      excludes = context.replaceTokens(excludes);
      excludesPattern = Pattern.compile(excludes);
    }
  }

  /**
   * Get the Date object corresponding to the given string.
   * <p>
   * The string is interpreted in this order: a <code>${...}</code> placeholder that resolves to a
   * {@link Date} is returned as is; a single-quoted expression is handed to
   * {@link DateMathParser} (after stripping a leading <code>NOW</code>); anything else is parsed
   * with the pattern <code>yyyy-MM-dd HH:mm:ss</code>.
   * </p>
   *
   * @param dateStr the date string. It can be a DateMath string or it may have an evaluator function
   * @return a Date instance corresponding to the input string, or null if the input is null
   * @throws DataImportHandlerException if a placeholder cannot be resolved or the value cannot be
   *         parsed as a date
   */
  private Date getDate(String dateStr) {
    if (dateStr == null) {
      return null;
    }

    Matcher m = PLACE_HOLDER_PATTERN.matcher(dateStr);
    if (m.find()) {
      Object o = context.resolve(m.group(1));
      if (o instanceof Date) {
        return (Date) o;
      }
      if (o == null) {
        // Previously this fell through to Pattern.matcher(null) and died with a bare NPE.
        throw new DataImportHandlerException(DataImportHandlerException.SEVERE,
            "Unable to resolve variable '" + m.group(1) + "' in date expression: " + dateStr);
      }
      // toString() instead of a blind (String) cast: non-String values no longer
      // raise ClassCastException, and Strings are unaffected.
      dateStr = o.toString();
    } else {
      dateStr = context.replaceTokens(dateStr);
    }

    m = Evaluator.IN_SINGLE_QUOTES.matcher(dateStr);
    if (m.find()) {
      String expr = m.group(1);
      //TODO refactor DateMathParser.parseMath a bit to have a static method for this logic.
      if (expr.startsWith("NOW")) {
        expr = expr.substring("NOW".length());
      }
      try {
        // DWS TODO: is this TimeZone the right default for us? Deserves explanation if so.
        return new DateMathParser(TimeZone.getDefault()).parseMath(expr);
      } catch (ParseException exp) {
        throw new DataImportHandlerException(DataImportHandlerException.SEVERE,
            "Invalid expression for date", exp);
      }
    }

    try {
      // A new formatter per call: SimpleDateFormat is not thread-safe, so it is not shared.
      return new SimpleDateFormat("yyyy-MM-dd HH:mm:ss", Locale.ROOT).parse(dateStr);
    } catch (ParseException exp) {
      throw new DataImportHandlerException(DataImportHandlerException.SEVERE,
          "Invalid expression for date", exp);
    }
  }

  /**
   * Get the Long value for the given string after resolving any evaluator or variable.
   *
   * @param sizeStr the size as a string
   * @return the Long value corresponding to the given string, or null if the input is null
   * @throws DataImportHandlerException if a placeholder cannot be resolved or the value is not a
   *         valid long
   */
  private Long getSize(String sizeStr) {
    if (sizeStr == null) {
      return null;
    }

    final String original = sizeStr;
    Matcher m = PLACE_HOLDER_PATTERN.matcher(sizeStr);
    if (m.find()) {
      Object o = context.resolve(m.group(1));
      if (o instanceof Number) {
        return ((Number) o).longValue();
      }
      // null stays null and is reported below; other values are converted via toString()
      // rather than a cast that could raise ClassCastException.
      sizeStr = (o == null) ? null : o.toString();
    } else {
      sizeStr = context.replaceTokens(sizeStr);
    }

    if (sizeStr == null) {
      throw new DataImportHandlerException(DataImportHandlerException.SEVERE,
          "Unable to resolve size expression: " + original);
    }
    try {
      return Long.parseLong(sizeStr.trim());
    } catch (NumberFormatException exp) {
      // Report bad sizes the same way getDate reports bad dates.
      throw new DataImportHandlerException(DataImportHandlerException.SEVERE,
          "Invalid value for size: " + sizeStr, exp);
    }
  }

  /**
   * Returns the next file row. On the first call (while {@code rowIterator} is still null) the
   * date and size filters are resolved, the base directory is scanned and the resulting rows are
   * stored in {@code rowIterator}; every call then delegates to {@code getNext()}.
   *
   * @return whatever {@code getNext()} returns for the current iterator
   */
  @Override
  public Map<String, Object> nextRow() {
    if (rowIterator != null) {
      return getNext();
    }

    // The filters are resolved here rather than in init() so that variables
    // are evaluated at the time the listing is actually produced.
    newerThan = getDate(context.getEntityAttribute(NEWER_THAN));
    olderThan = getDate(context.getEntityAttribute(OLDER_THAN));

    String biggerThanStr = context.getEntityAttribute(BIGGER_THAN);
    if (biggerThanStr != null) {
      biggerThan = getSize(biggerThanStr);
    }
    String smallerThanStr = context.getEntityAttribute(SMALLER_THAN);
    if (smallerThanStr != null) {
      smallerThan = getSize(smallerThanStr);
    }

    List<Map<String, Object>> fileDetails = new ArrayList<>();
    getFolderFiles(new File(baseDir), fileDetails);
    rowIterator = fileDetails.iterator();
    return getNext();
  }

  /**
   * Collects the details of every file in {@code dir} that passes the name filters, descending
   * into sub-directories when {@link #recursive} is set.
   *
   * @param dir the directory to scan
   * @param fileDetails the list that receives one row per accepted file
   */
  private void getFolderFiles(File dir, final List<Map<String, Object>> fileDetails) {
    // Fetch an array of file objects that pass the filter, however the
    // returned array is never populated; accept() always returns false.
    // Rather we make use of the fileDetails array which is populated as
    // a side effect of the accept method.
    dir.list(new FilenameFilter() {
      @Override
      public boolean accept(File dir, String name) {
        File fileObj = new File(dir, name);
        if (fileObj.isDirectory()) {
          if (recursive) {
            getFolderFiles(fileObj, fileDetails);
          }
          return false;
        }
        if (fileNamePattern != null && !fileNamePattern.matcher(name).find()) {
          return false;
        }
        // Apply 'excludes' whether or not 'fileName' is configured; previously the
        // exclusion was skipped entirely when no fileName pattern was given.
        if (excludesPattern != null && excludesPattern.matcher(name).find()) {
          return false;
        }
        addDetails(fileDetails, dir, name);
        return false;
      }
    });
  }

  /**
   * Appends a row describing the file {@code name} in {@code dir} to {@code files}, unless the
   * file is a directory or is rejected by the size or modification-date filters.
   *
   * @param files the list that receives the row
   * @param dir the directory containing the file
   * @param name the file name relative to {@code dir}
   */
  private void addDetails(List<Map<String, Object>> files, File dir, String name) {
    File aFile = new File(dir, name);
    if (aFile.isDirectory()) {
      return;
    }

    long sz = aFile.length();
    Date lastModified = new Date(aFile.lastModified());

    // Size bounds are exclusive: a file whose size equals the bound is rejected.
    if (biggerThan != -1 && sz <= biggerThan) {
      return;
    }
    if (smallerThan != -1 && sz >= smallerThan) {
      return;
    }
    // Date bounds are inclusive: a file modified exactly at the bound is kept.
    if (olderThan != null && lastModified.after(olderThan)) {
      return;
    }
    if (newerThan != null && lastModified.before(newerThan)) {
      return;
    }

    // Build the row only once the file is known to pass every filter.
    Map<String, Object> details = new HashMap<>();
    details.put(DIR, dir.getAbsolutePath());
    details.put(FILE, name);
    details.put(ABSOLUTE_FILE, aFile.getAbsolutePath());
    details.put(SIZE, sz);
    details.put(LAST_MODIFIED, lastModified);
    files.add(details);
  }

  /** Matches a <code>${...}</code> placeholder; group 1 is the text between the braces. */
  public static final Pattern PLACE_HOLDER_PATTERN = Pattern
      .compile("\\$\\{(.*?)\\}");

  // Keys of the row maps produced by addDetails.
  public static final String DIR = "fileDir";
  public static final String FILE = "file";
  public static final String ABSOLUTE_FILE = "fileAbsolutePath";
  public static final String SIZE = "fileSize";
  public static final String LAST_MODIFIED = "fileLastModified";

  // Names of the entity attributes read in init() and nextRow().
  public static final String FILE_NAME = "fileName";
  public static final String BASE_DIR = "baseDir";
  public static final String EXCLUDES = "excludes";
  public static final String NEWER_THAN = "newerThan";
  public static final String OLDER_THAN = "olderThan";
  public static final String BIGGER_THAN = "biggerThan";
  public static final String SMALLER_THAN = "smallerThan";
  public static final String RECURSIVE = "recursive";
}