blob: cc5b1cde67aa031a7245a1c481d5c80e502c869a [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.oodt.cas.metadata.extractors;
import org.apache.oodt.cas.metadata.Metadata;
import org.apache.oodt.cas.metadata.exceptions.MetExtractionException;
import org.apache.oodt.cas.metadata.exceptions.MetExtractorConfigReaderException;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import java.io.File;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* Assigns a ProductType based on a filename pattern, while simultaneously assigning values to metadata elements
* embedded in the filename pattern.
* <p/>
* Suppose I have files in the staging area ready to be ingested. These files usually have information encoded into the
* filename in order to distinguish the contents of one file from other files. For example book-1234567890.txt might be
* the contents of a book with ISBN 1234567890. Or page-1234567890-12.txt might be the text on page 12 of book with ISBN
* 1234567890.
* <p/>
* It would be useful to generate metadata from the information encoded in the filename (think: filename => metadata).
* The {@link ProdTypePatternMetExtractor} allows this in a flexible manner using regular expressions. Let's take a look
* at the config file for this met extractor.
* <p/>
* <pre>
* product-type-patterns.xml
*
* {@code
* <config>
* <!-- <element> MUST be defined before <product-type> so their patterns can be resolved -->
* <!-- name MUST be an element defined in elements.xml (also only upper and lower case alpha chars) -->
* <!-- regexp MUST be valid input to java.util.regex.Pattern.compile() -->
* <element name="ISBN" regexp="[0-9]{10}"/>
* <element name="Page" regexp="[0-9]*"/>
*
* <!-- name MUST be a ProductType name defined in product-types.xml -->
* <!-- metadata elements inside brackets MUST be mapped to the ProductType,
* as defined in product-type-element-map.xml -->
* <product-type name="Book" template="book-[ISBN].txt"/>
* <product-type name="BookPage" template="page-[ISBN]-[Page].txt"/>
* </config>
* }
* </pre>
* <p/>
* <p/>
* This file defines a regular expression for the "ISBN" metadata element, in this case, a 10-digit number. Also, the
* "Page" metadata element is defined as a sequence of 0 or more digits.
* <p/>
* Next, the file defines a filename pattern for the "Book" product type. The pattern is compiled into a regular
* expression, substituting the previously defined regexes as capture groups. For example, "book-[ISBN].txt" compiles to
* "book-([0-9]{10})\.txt" (literal dots in the template are escaped), and the ISBN met element is assigned to capture
* group 1. When the filename matches this pattern, 2 metadata assignments occur: (1) the ISBN met element is set to the
* matched regex group, and (2) the ProductType met element is set to "Book".
* <p/>
* Similarly, the second pattern sets ISBN, Page, and ProductType for files matching "page-([0-9]{10})-([0-9]*)\.txt".
* <p/>
* This achieves several things: <ol> <li>assigning met elements based on regular expressions</li> <li>assigning product
* type based on easy-to-understand pattern with met elements clearly indicated</li> <li>reuse of met element regular
* expressions</li> </ol>
* <p/>
* Differences from {@link FilenameTokenMetExtractor}:
* <ol>
* <li>Allows dynamic length metadata (does not rely on offset and length of metadata)</li>
* <li>Assigns ProductType</li>
* </ol>
* <p/>
* Differences from {@link org.apache.oodt.cas.crawl.AutoDetectProductCrawler}:
* <ol>
* <li>Does not require definition of custom MIME type and MIME-type regex. Really, all you want is to assign a
* ProductType, rather than indirectly assigning a custom MIME type that maps to a Product Type.</li>
* </ol>
* <p/>
* Differences from {@link org.apache.oodt.cas.filemgr.metadata.extractors.examples.FilenameRegexMetExtractor}:
* <ol>
* <li>Assigns ProductType. FilenameRegexMetExtractor runs after ProductType is already determined.</li>
* <li>Runs on the client-side (crawler). FilenameRegexMetExtractor runs on the server-side (filemgr).</li>
* <li>Different patterns for different ProductTypes. FilenameRegexMetExtractor config applies the same pattern to
* all files.</li>
* </ol>
* <p/>
* Prerequisites:
* <ol>
* <li>{@code <element>} tag occurs before {@code <product-type>} tag</li>
* <li>{@code <element> @name} attribute <strong>MUST</strong> be defined in FileManager policy elements.xml</li>
* <li>{@code <element> @regexp} attribute <strong>MUST</strong> be valid input to
* {@link java.util.regex.Pattern#compile(String)}</li>
* <li>{@code <product-type> @name} attribute <strong>MUST</strong> be a ProductType name (not ID) defined in
* product-types.xml</li>
* <li>met elements used in {@code <product-type> @template} attribute <strong>MUST</strong> be
* mapped to the ProductType, as defined in product-type-element-map.xml</li>
* </ol>
* <p/>
* <strong>Words of Caution</strong>
* <ul>
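* <li><strong>Does not support nested met elements.</strong></li>
* <li><strong>Element regexps must not contain capturing groups.</strong> Capture groups are assigned to met elements
* by position, so use non-capturing groups such as {@code (?:a|b)} instead.</li>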
* <li><strong>Each pattern should map to one product type.</strong> Watch out for similar patterns. Don't do this:
* <pre>
* {@code
* <element name="Page" regexp="[0-9]*"/>
* <element name="Chapter" regexp="[0-9]*"/>
*
* <product-type name="Page" template="data-[Page].txt"/>
* <product-type name="Chapter" template="data-[Chapter].txt"/>
* }</pre>
* Instead, encode the product type information into the filename, for example:
* <pre>
* {@code
* <element name="Page" regexp="[0-9]*"/>
* <element name="Chapter" regexp="[0-9]*"/>
*
* <product-type name="Page" template="page-[Page].txt"/>
* <product-type name="Chapter" template="chapter-[Chapter].txt"/>
* }</pre>
* </li>
* </ul>
*
* @author rickdn (Ricky Nguyen)
*/
public class ProdTypePatternMetExtractor extends CmdLineMetExtractor {

  /**
   * SAX handler that turns {@code <element>} and {@code <product-type>} tags into compiled filename patterns.
   * <p>
   * {@code <element>} tags must be seen before the {@code <product-type>} tags that reference them, because each
   * template is compiled as soon as its tag is encountered.
   */
  static class ConfigReader extends AbstractSAXConfigReader {
    private static final String ELEMENT_TAG = "element";
    private static final String ELEMENT_NAME_ATTR = "name";
    private static final String ELEMENT_REGEXP_ATTR = "regexp";
    private static final String PRODUCT_TYPE_TAG = "product-type";
    private static final String PRODUCT_TYPE_NAME_ATTR = "name";
    private static final String PRODUCT_TYPE_TEMPLATE_ATTR = "template";

    /** Matches a bracketed met element token such as {@code [ISBN]}; group 1 is the element name. */
    private static final Pattern MET_TOKEN = Pattern.compile("\\[([A-Za-z]*)\\]");

    /*
     * full file name reg exp => prod type
     */
    private final Map<Pattern, String> prodTypePatterns = new HashMap<Pattern, String>();

    /*
     * prod type => list of met elements in the file name, accumulated over every template registered for that
     * product type. Retained for getProdTypeElements(); extraction uses patternElements instead.
     */
    private final Map<String, List<String>> prodTypeElements = new HashMap<String, List<String>>();

    /*
     * full file name reg exp => met elements of that specific template, in capture-group order. Keyed per pattern
     * so that several templates for the same product type do not share (and misalign) one element list.
     */
    private final Map<Pattern, List<String>> patternElements = new HashMap<Pattern, List<String>>();

    /*
     * met elements => element reg exp patterns
     */
    private final Map<String, Pattern> elementPatterns = new HashMap<String, Pattern>();

    /**
     * @return the live map of compiled filename patterns to product type names
     */
    Map<Pattern, String> getProdTypePatterns() {
      return prodTypePatterns;
    }

    /**
     * @return the live map of product type names to the met elements found in all of that type's templates
     */
    Map<String, List<String>> getProdTypeElements() {
      return prodTypeElements;
    }

    /**
     * @return the live map of compiled filename patterns to the met elements of that pattern, in the order of
     *         their capture groups
     */
    Map<Pattern, List<String>> getPatternElements() {
      return patternElements;
    }

    /**
     * Compiles a filename template into a regular expression and registers it for the given product type.
     * <p>
     * Literal dots in the template are escaped, and every {@code [Name]} token is replaced by the regexp of the
     * previously registered element {@code Name}, wrapped in a capture group.
     *
     * @param id       product type name to assign when a filename matches the template
     * @param template filename template, e.g. {@code book-[ISBN].txt}
     * @throws IllegalArgumentException if the template references an element that has not been registered, or if
     *                                  the resulting expression is not a valid regular expression
     */
    void addProductType(String id, String template) {
      String escaped = template.replaceAll("\\.", "\\\\.");

      List<String> tokens = new ArrayList<String>();
      StringBuffer regex = new StringBuffer();
      Matcher m = MET_TOKEN.matcher(escaped);
      while (m.find()) {
        String elem = m.group(1);
        Pattern elemPattern = elementPatterns.get(elem);
        if (elemPattern == null) {
          throw new IllegalArgumentException("Template '" + template + "' of product type '" + id
              + "' references undefined element '" + elem + "'; <" + ELEMENT_TAG + "> must be declared before <"
              + PRODUCT_TYPE_TAG + ">");
        }
        // quoteReplacement keeps '\' and '$' in the element regexp literal; without it "\d+" would be
        // inserted as "d+" and '$' would be read as a group reference.
        m.appendReplacement(regex, Matcher.quoteReplacement("(" + elemPattern.pattern() + ")"));
        tokens.add(elem);
      }
      m.appendTail(regex);

      // Compile before touching any state so a bad template leaves the reader unchanged.
      Pattern compiled = Pattern.compile(regex.toString());

      List<String> typeElems = prodTypeElements.get(id);
      if (typeElems == null) {
        typeElems = new ArrayList<String>();
        prodTypeElements.put(id, typeElems);
      }
      typeElems.addAll(tokens);

      patternElements.put(compiled, tokens);
      prodTypePatterns.put(compiled, id);
    }

    /**
     * Registers the regular expression used for a met element inside templates.
     *
     * @param name   met element name, as used inside {@code [..]} in templates
     * @param regexp regular expression the element value must match
     * @throws java.util.regex.PatternSyntaxException if {@code regexp} is not a valid regular expression
     */
    void addElement(String name, String regexp) {
      elementPatterns.put(name, Pattern.compile(regexp));
    }

    /**
     * Handles {@code <element>} and {@code <product-type>} tags; every other tag is ignored.
     *
     * @throws SAXException if a handled tag lacks one of its required attributes
     */
    @Override
    public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException {
      if (ELEMENT_TAG.equals(qName)) {
        String name = requireAttribute(attributes, ELEMENT_TAG, ELEMENT_NAME_ATTR);
        String regexp = requireAttribute(attributes, ELEMENT_TAG, ELEMENT_REGEXP_ATTR);
        addElement(name, regexp);
      } else if (PRODUCT_TYPE_TAG.equals(qName)) {
        String id = requireAttribute(attributes, PRODUCT_TYPE_TAG, PRODUCT_TYPE_NAME_ATTR);
        String template = requireAttribute(attributes, PRODUCT_TYPE_TAG, PRODUCT_TYPE_TEMPLATE_ATTR);
        addProductType(id, template);
      }
    }

    /**
     * Returns the value of a mandatory attribute, failing with a descriptive error instead of a later
     * {@link NullPointerException} when it is absent.
     */
    private static String requireAttribute(Attributes attributes, String tag, String attr) throws SAXException {
      String value = attributes.getValue(attr);
      if (value == null) {
        throw new SAXException("<" + tag + "> is missing required attribute '" + attr + "'");
      }
      return value;
    }

    /**
     * Clears all previously parsed elements and patterns, then delegates parsing to the superclass.
     */
    @Override
    public AbstractSAXConfigReader parseConfigFile(File configFile) throws MetExtractorConfigReaderException {
      // reset internal state whenever parsing a new config file
      prodTypePatterns.clear();
      prodTypeElements.clear();
      patternElements.clear();
      elementPatterns.clear();
      return super.parseConfigFile(configFile);
    }
  }

  private static final String PRODUCT_TYPE_MET_KEY = "ProductType";

  /**
   * Creates an extractor backed by a fresh {@link ConfigReader}.
   */
  public ProdTypePatternMetExtractor() {
    super(new ConfigReader());
  }

  /**
   * Matches the file's name against every configured pattern. For each pattern that matches the whole name, adds
   * the pattern's product type under the {@code ProductType} key and adds one value per met element taken from the
   * corresponding capture group.
   *
   * @param file file whose name is inspected; its contents are not read here
   * @return the collected metadata, empty when no pattern matches
   * @throws MetExtractionException if a matching pattern's number of capture groups differs from its number of
   *                                met elements (for example because an element regexp contains its own capturing
   *                                group), which would make the group-to-element mapping unreliable
   */
  @Override
  protected Metadata extrMetadata(File file) throws MetExtractionException {
    Metadata met = new Metadata();
    // NOTE(review): 'config' is inherited and not visible in this file; presumably it is the ConfigReader
    // passed to the constructor after parsing - confirm against the superclass.
    ConfigReader mConfig = (ConfigReader) config;
    String fileName = file.getName();
    for (Map.Entry<Pattern, String> entry : mConfig.getProdTypePatterns().entrySet()) {
      Pattern pattern = entry.getKey();
      Matcher m = pattern.matcher(fileName);
      if (!m.matches()) {
        continue;
      }
      String prodType = entry.getValue();
      List<String> elems = mConfig.getPatternElements().get(pattern);
      if (elems == null || elems.size() != m.groupCount()) {
        throw new MetExtractionException("Pattern '" + pattern.pattern() + "' for product type '" + prodType
            + "' has " + m.groupCount() + " capture group(s) but "
            + (elems == null ? 0 : elems.size())
            + " met element(s); element regexps and templates must not add capturing groups");
      }
      met.addMetadata(PRODUCT_TYPE_MET_KEY, prodType);
      for (int i = 0; i < elems.size(); i++) {
        // capture groups are 1-based, element list is 0-based
        met.addMetadata(elems.get(i), m.group(i + 1));
      }
    }
    return met;
  }

  /**
   * Command-line entry point; hands the arguments and a new extractor to {@code processMain}.
   */
  public static void main(String[] args) throws Exception {
    processMain(args, new ProdTypePatternMetExtractor());
  }
}