/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.uima.lucas.indexer;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Index;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.Field.TermVector;
import org.apache.uima.lucas.indexer.analysis.TokenStreamConcatenator;
import org.apache.uima.lucas.indexer.analysis.TokenStreamMerger;
import org.apache.uima.lucas.indexer.mapping.FieldDescription;
import org.apache.uima.lucas.indexer.mapping.FilterDescription;
import org.apache.uima.lucas.indexer.mapping.TermCoverBuilder;
import org.apache.uima.lucas.indexer.mapping.TermCoverBuilderFactory;
import org.apache.uima.lucas.indexer.util.TokenStreamStringConcatenator;
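/**
 * Builds Lucene {@link Field} instances from UIMA-derived {@link TokenStream}s
 * according to a {@link FieldDescription} from the mapping file. Depending on
 * the description, the incoming streams are merged or concatenated, filtered,
 * and turned into stored and/or indexed fields, optionally split into term
 * cover subsets.
 *
 * <p>A minimal usage sketch (the filter builder, token streams, field
 * description and document are assumed to be provided by the surrounding
 * indexing code; the names are illustrative only):</p>
 *
 * <pre>
 * FieldBuilder fieldBuilder = new FieldBuilder(filterBuilder);
 * Collection&lt;Field&gt; fields = fieldBuilder.createFields(tokenStreams, fieldDescription);
 * for (Field field : fields)
 *   document.add(field);
 * </pre>
 */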
public class FieldBuilder {
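// Legal values of the index parameter of a field description, see getFieldIndex().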
public static final String FIELD_INDEX_NO = "no";
public static final String FIELD_INDEX_YES = "yes";
public static final String FIELD_INDEX_NO_NORMS = "no_norms";
public static final String FIELD_INDEX_NO_TF = "no_tf";
public static final String FIELD_INDEX_NO_NORMS_TF = "no_norms_tf";
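// Legal values of the termVector parameter, see getFieldTermVector().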
public static final String FIELD_TERM_VECTOR_NO = "no";
public static final String FIELD_TERM_VECTOR_YES = "yes";
public static final String FIELD_TERM_VECTOR_WITH_OFFSETS = "offsets";
public static final String FIELD_TERM_VECTOR_WITH_POSITIONS = "positions";
public static final String FIELD_TERM_VECTOR_WITH_POSITIONS_OFFSETS = "positions_offsets";
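// Legal values of the stored parameter, see getFieldStore().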
public static final String FIELD_STORE_NO = "no";
public static final String FIELD_STORE_YES = "yes";
public static final String FIELD_STORE_COMPRESS = "compress";
protected TokenStreamStringConcatenator tokenStreamStringConcatenator;
private FilterBuilder filterBuilder;
protected FieldDescription fieldDescription;
private TermCoverBuilderFactory termCoverBuilderFactory;
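/**
 * @param filterBuilder
 *          used to apply the configured token filters before fields are built
 */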
public FieldBuilder(FilterBuilder filterBuilder) {
tokenStreamStringConcatenator = new TokenStreamStringConcatenator();
termCoverBuilderFactory = new TermCoverBuilderFactory();
this.filterBuilder = filterBuilder;
}
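/**
 * Creates the Lucene fields for a single field description. The given token
 * streams are combined into one stream (see {@link #createFieldTokenStream}),
 * filtered, and converted into stored and/or indexed fields. If a term cover
 * description is present, fields are created per cover subset instead.
 *
 * @param tokenStreams
 *          the token streams produced for this field
 * @param fieldDescription
 *          the field description from the mapping file
 * @return the resulting fields, possibly empty
 * @throws FieldBuildingException
 *           if merging, filtering or reading a token stream fails
 */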
public Collection<Field> createFields(Collection<TokenStream> tokenStreams,
FieldDescription fieldDescription) throws FieldBuildingException {
// Remember the description first so that createException() reports the
// correct field if building the token stream already fails.
this.fieldDescription = fieldDescription;
TokenStream tokenStream = createFieldTokenStream(tokenStreams,
fieldDescription);
String fieldName = fieldDescription.getName();
String delimiter = fieldDescription.getDelimiter();
Collection<Field> fields = new ArrayList<Field>();
Collection<FilterDescription> filterDescriptions = fieldDescription
.getFilterDescriptions();
tokenStream = getFilteredTokenStream(fieldName, tokenStream,
filterDescriptions);
// The unique flag means we only want ONE field instance with the
// name fieldName.
Boolean unique = fieldDescription.getUnique();
Boolean coverField = fieldDescription.getTermCoverDescription() != null;
Field.Store fieldStore = getFieldStore(fieldDescription.getStored());
Field.Index fieldIndex = getFieldIndex(fieldDescription.getIndex());
Field.TermVector fieldTermVector = getFieldTermVector(fieldDescription
.getTermVector());
boolean omitTF = fieldDescription.getIndex().equals(FIELD_INDEX_NO_TF)
|| fieldDescription.getIndex().equals(FIELD_INDEX_NO_NORMS_TF);
boolean store = fieldStore == Field.Store.YES
|| fieldStore == Field.Store.COMPRESS;
if (!coverField) {
// Create stored fields. The parameters unique, fieldIndex and
// omitTF are only needed for a field that is stored, indexed and
// unique. In that case a single Field instance is created both
// stored and indexed, so no separate indexed field is required.
// This only works with TokenStreams that contain exactly one token
// (if a TokenStream emits more tokens, several Field instances are
// created).
if (store)
fields.addAll(createStoredFields(fieldName, tokenStream,
fieldStore, delimiter, unique, fieldIndex, omitTF));
// Create indexed fields. If the field is unique and has been
// stored, there already is an instance of the field and we don't
// create another.
if (fieldIndex != Field.Index.NO && (!unique || !store))
fields.add(createIndexedField(fieldName, tokenStream,
fieldIndex, fieldTermVector, omitTF));
} else {
TermCoverBuilder termCoverBuilder = termCoverBuilderFactory
.createTermCoverBuilder(tokenStream, fieldDescription.getTermCoverDescription());
while (termCoverBuilder.increaseCoverSubset()) {
String coverSubsetName = termCoverBuilder.getCoverSubsetName();
TokenStream coverSubsetTokenStream = termCoverBuilder
.getPartitionTokenStream();
if (store)
fields.addAll(createStoredFields(coverSubsetName,
coverSubsetTokenStream, fieldStore, delimiter,
unique, fieldIndex, omitTF));
if (fieldIndex != Field.Index.NO && (!unique || !store))
fields.add(createIndexedField(coverSubsetName,
coverSubsetTokenStream, fieldIndex, fieldTermVector,
omitTF));
}
}
return fields;
}
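/**
 * Combines the token streams of a field into a single stream: merged if the
 * merge flag is set, concatenated if there is more than one stream, or
 * returned as-is for a single stream. Returns null if the collection is empty
 * and the merge flag is not set.
 */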
protected TokenStream createFieldTokenStream(
Collection<TokenStream> tokenStreams,
FieldDescription fieldDescription) throws FieldBuildingException {
TokenStream tokenStream = null;
if (fieldDescription.getMerge())
tokenStream = getTokenStreamMerger(tokenStreams);
else if (tokenStreams.size() > 1)
tokenStream = new TokenStreamConcatenator(tokenStreams);
else if (tokenStreams.size() == 1)
tokenStream = tokenStreams.iterator().next();
return tokenStream;
}
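/**
 * Applies the token filters configured for this field, wrapping any
 * FilterBuildingException into a FieldBuildingException.
 */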
protected TokenStream getFilteredTokenStream(String fieldName,
TokenStream tokenStream,
Collection<FilterDescription> filterDescriptions)
throws FieldBuildingException {
try {
return filterBuilder.filter(tokenStream, filterDescriptions);
} catch (FilterBuildingException e) {
throw createException(e);
}
}
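/**
 * Wraps the given exception into a FieldBuildingException whose message names
 * the current field and its line number in the mapping file.
 */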
protected FieldBuildingException createException(Exception e) {
String message = "Can't build field " + fieldDescription.getName()
+ " at line " + fieldDescription.getLineNumber();
return new FieldBuildingException(message, e);
}
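/**
 * Creates a TokenStreamMerger for the given streams, converting IOExceptions
 * into FieldBuildingExceptions.
 */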
private TokenStream getTokenStreamMerger(
Collection<TokenStream> tokenStreams) throws FieldBuildingException {
try {
return new TokenStreamMerger(tokenStreams);
} catch (IOException e) {
throw createException(e);
}
}
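/**
 * Creates an indexed, unstored field that consumes the given token stream.
 * Norms are omitted for NOT_ANALYZED_NO_NORMS fields, and term frequencies
 * and positions are omitted if omitTF is set.
 */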
protected Field createIndexedField(String fieldName,
TokenStream tokenStream, Index fieldIndex,
TermVector fieldTermVector, boolean omitTF) {
Field field = new Field(fieldName, tokenStream, fieldTermVector);
if (fieldIndex == Field.Index.NOT_ANALYZED_NO_NORMS)
field.setOmitNorms(true);
field.setOmitTermFreqAndPositions(omitTF);
return field;
}
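/**
 * Creates the stored fields for a token stream. If a delimiter is configured,
 * all token texts are joined into a single field value; otherwise one field is
 * created per token. For unique fields the stored field is also indexed with
 * the given index setting, so no separate indexed field is needed. The stream
 * is reset afterwards so it can be consumed again.
 */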
protected Collection<Field> createStoredFields(String fieldName,
TokenStream tokenStream, Store fieldStore, String delimiter,
Boolean unique, Index fieldIndex, Boolean omitTF)
throws FieldBuildingException {
Collection<Field> fields = new ArrayList<Field>();
// The stored field is only indexed as well if the field is unique;
// otherwise a separate indexed field is created in createFields().
Index index = unique ? fieldIndex : Field.Index.NO;
try {
Field field;
if (delimiter != null) {
String value = tokenStreamStringConcatenator
.tokenStreamToStringWithDelimiter(tokenStream,
delimiter);
field = new Field(fieldName, value, fieldStore, index);
if (unique)
field.setOmitTermFreqAndPositions(omitTF);
fields.add(field);
} else {
Token nextToken = tokenStream.next(new Token());
while (nextToken != null) {
String value = new String(nextToken.termBuffer(), 0,
nextToken.termLength());
field = new Field(fieldName, value, fieldStore, index);
if (unique)
field.setOmitTermFreqAndPositions(omitTF);
fields.add(field);
nextToken = tokenStream.next(nextToken);
}
}
tokenStream.reset();
} catch (IOException e) {
throw createException(e);
}
return fields;
}
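/**
 * Maps the index parameter of a field description to a Field.Index constant:
 * "yes" and "no_tf" map to NOT_ANALYZED, "no_norms" and "no_norms_tf" map to
 * NOT_ANALYZED_NO_NORMS (tokenization has already been done by the UIMA token
 * streams). Omitting term frequencies is handled separately via the omitTF
 * flag in createFields().
 */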
protected Field.Index getFieldIndex(String index)
throws FieldBuildingException {
if (index.equals(FIELD_INDEX_NO))
return Field.Index.NO;
else if (index.equals(FIELD_INDEX_YES))
return Field.Index.NOT_ANALYZED;
else if (index.equals(FIELD_INDEX_NO_NORMS))
return Field.Index.NOT_ANALYZED_NO_NORMS;
else if (index.equals(FIELD_INDEX_NO_NORMS_TF))
return Field.Index.NOT_ANALYZED_NO_NORMS;
else if (index.equals(FIELD_INDEX_NO_TF))
return Field.Index.NOT_ANALYZED;
throw createException(new IllegalArgumentException(
"unknown index parameter: " + index));
}
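/**
 * Maps the termVector parameter of a field description to the corresponding
 * Field.TermVector constant.
 */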
protected Field.TermVector getFieldTermVector(String termVector)
throws FieldBuildingException {
if (termVector.equals(FIELD_TERM_VECTOR_NO))
return Field.TermVector.NO;
else if (termVector.equals(FIELD_TERM_VECTOR_YES))
return Field.TermVector.YES;
else if (termVector.equals(FIELD_TERM_VECTOR_WITH_OFFSETS))
return Field.TermVector.WITH_OFFSETS;
else if (termVector.equals(FIELD_TERM_VECTOR_WITH_POSITIONS))
return Field.TermVector.WITH_POSITIONS;
else if (termVector.equals(FIELD_TERM_VECTOR_WITH_POSITIONS_OFFSETS))
return Field.TermVector.WITH_POSITIONS_OFFSETS;
throw createException(new IllegalArgumentException(
"unknown termVector parameter: " + termVector));
}
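/**
 * Maps the stored parameter of a field description to the corresponding
 * Field.Store constant.
 */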
protected Field.Store getFieldStore(String store)
throws FieldBuildingException {
if (store.equals(FIELD_STORE_NO))
return Field.Store.NO;
else if (store.equals(FIELD_STORE_YES))
return Field.Store.YES;
else if (store.equals(FIELD_STORE_COMPRESS))
return Field.Store.COMPRESS;
throw createException(new IllegalArgumentException(
"unknown stored parameter: " + store));
}
}