blob: 3b200f36416c4597dbd4403a18cfb01dce20ca6b [file] [log] [blame]
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
*   http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
package org.apache.uima.lucas.indexer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Index;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.Field.TermVector;
import org.apache.uima.lucas.indexer.analysis.TokenStreamConcatenator;
import org.apache.uima.lucas.indexer.analysis.TokenStreamMerger;
import org.apache.uima.lucas.indexer.mapping.FieldDescription;
import org.apache.uima.lucas.indexer.mapping.FilterDescription;
import org.apache.uima.lucas.indexer.util.TokenStreamStringConcatenator;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
/**
 * Builds Lucene {@link Field} instances from token streams, driven by the
 * settings of a {@link FieldDescription} (name, delimiter, filters, and the
 * store / index / term vector options).
 */
public class FieldBuilder {
// Accepted values of the field description's index option; translated to
// Field.Index by getFieldIndex. The "*_tf" variants additionally set the
// omitTF flag computed in createFields.
public static final String FIELD_INDEX_NO = "no";
public static final String FIELD_INDEX_YES = "yes";
public static final String FIELD_INDEX_NO_NORMS = "no_norms";
public static final String FIELD_INDEX_NO_TF = "no_tf";
public static final String FIELD_INDEX_NO_NORMS_TF = "no_norms_tf";
// Accepted values of the term vector option; translated to Field.TermVector
// by getFieldTermVector.
public static final String FIELD_TERM_VECTOR_NO = "no";
public static final String FIELD_TERM_VECTOR_YES = "yes";
public static final String FIELD_TERM_VECTOR_WITH_OFFSETS = "offsets";
public static final String FIELD_TERM_VECTOR_WITH_POSITIONS = "positions";
public static final String FIELD_TERM_VECTOR_WITH_POSITIONS_OFFSETS = "positions_offsets";
// Accepted values of the stored option; translated to Field.Store by
// getFieldStore.
public static final String FIELD_STORE_NO = "no";
public static final String FIELD_STORE_YES = "yes";
public static final String FIELD_STORE_COMPRESS = "compress";
// Helper referenced by createStoredFields when a delimiter is configured.
protected TokenStreamStringConcatenator tokenStreamStringConcatenator;
// Applies the configured filter descriptions to a token stream
// (see getFilteredTokenStream).
private FilterBuilder filterBuilder;
// Description most recently assigned by createFields; createException reads
// it to name the field and its line number in error messages.
protected FieldDescription fieldDescription;
/**
 * Creates a builder that filters token streams with the given filter builder.
 *
 * @param filterBuilder builder used later by getFilteredTokenStream
 */
public FieldBuilder(FilterBuilder filterBuilder) {
    this.filterBuilder = filterBuilder;
    this.tokenStreamStringConcatenator = new TokenStreamStringConcatenator();
}
/**
 * Creates the Lucene fields for one field description.
 *
 * The token streams are first combined by createFieldTokenStream and passed
 * through getFilteredTokenStream. Stored fields are then added when the store
 * option resolves to YES or COMPRESS, and one indexed field is added when the
 * index option is not NO and the field is not both unique and stored.
 *
 * @param tokenStreams token streams supplying the field's content
 * @param fieldDescription settings of the field to build
 * @return the created fields
 * @throws FieldBuildingException declared by this method and by the helper
 *         methods it calls
 */
public Collection<Field> createFields(Collection<TokenStream> tokenStreams,
FieldDescription fieldDescription) throws FieldBuildingException {
// NOTE(review): the next statement is cut off in this copy of the file (the
// remaining argument and the closing parenthesis are missing) - restore it
// from the upstream source.
TokenStream tokenStream = createFieldTokenStream(tokenStreams,
// NOTE(review): this assignment comes after the createFieldTokenStream call,
// while createException reads this.fieldDescription - consider assigning
// before any call that can fail.
this.fieldDescription = fieldDescription;
String fieldName = fieldDescription.getName();
String delimiter = fieldDescription.getDelimiter();
Collection<Field> fields = new ArrayList<Field>();
// NOTE(review): the next two statements are also cut off (the accessor call
// on fieldDescription and the final argument of getFilteredTokenStream are
// missing).
Collection<FilterDescription> filterDescriptions = fieldDescription
tokenStream = getFilteredTokenStream(fieldName, tokenStream,
// The unique flag means we only want ONE field instance with the
// name fieldName.
Boolean unique = fieldDescription.getUnique();
Field.Store fieldStore = getFieldStore(fieldDescription.getStored());
Field.Index fieldIndex = getFieldIndex(fieldDescription.getIndex());
// NOTE(review): statement cut off - the accessor call on fieldDescription is
// missing.
Field.TermVector fieldTermVector = getFieldTermVector(fieldDescription
// Term frequencies are omitted for the "no_tf" and "no_norms_tf" index options.
boolean omitTF = fieldDescription.getIndex().equals(FIELD_INDEX_NO_TF)
|| fieldDescription.getIndex().equals(FIELD_INDEX_NO_NORMS_TF);
boolean store = fieldStore == Field.Store.YES || fieldStore == Field.Store.COMPRESS;
// Create stored fields. The parameters unique, fieldIndex and omitTF
// are only necessary in case of a stored and indexed unique field. Then,
// the field is instantiated as stored and indexed, thus only one instance
// of the field is necessary. This only works with TokenStreams which
// contain exactly one token (if a TokenStream emits more tokens,
// several fields will be instantiated).
if (store)
fields.addAll(createStoredFields(fieldName, tokenStream,
fieldStore, delimiter, unique, fieldIndex, omitTF));
// Create indexed fields. If the field is unique and has been stored,
// there already is an instance of the field and we don't create another.
if (fieldIndex != Field.Index.NO && (!unique || !store))
fields.add(createIndexedField(fieldName, tokenStream, fieldIndex,
fieldTermVector, omitTF));
return fields;
protected TokenStream createFieldTokenStream(
Collection<TokenStream> tokenStreams,
FieldDescription fieldDescription) throws FieldBuildingException {
TokenStream tokenStream = null;
if (fieldDescription.getMerge())
tokenStream = getTokenStreamMerger(tokenStreams);
else if (tokenStreams.size() > 1)
tokenStream = new TokenStreamConcatenator(tokenStreams);
else if (tokenStreams.size() == 1)
tokenStream = tokenStreams.iterator().next();
return tokenStream;
protected TokenStream getFilteredTokenStream(String fieldName,
TokenStream tokenStream,
Collection<FilterDescription> filterDescriptions)
throws FieldBuildingException {
try {
return filterBuilder.filter(tokenStream, filterDescriptions);
} catch (FilterBuildingException e) {
throw createException(e);
protected FieldBuildingException createException(Exception e) {
String message = "Can't build field " + fieldDescription.getName()
+ " at line " + fieldDescription.getLineNumber();
return new FieldBuildingException(message, e);
private TokenStream getTokenStreamMerger(
Collection<TokenStream> tokenStreams) throws FieldBuildingException {
try {
return new TokenStreamMerger(tokenStreams);
} catch (IOException e) {
throw createException(e);
/**
 * Creates the indexed (non-stored) field backed directly by the token stream.
 *
 * @param fieldName name of the field
 * @param tokenStream stream supplying the field's tokens
 * @param fieldIndex resolved index option
 * @param fieldTermVector resolved term vector option
 * @param omitTF whether term frequencies should be omitted
 * @return the created field
 */
protected Field createIndexedField(String fieldName, TokenStream tokenStream,
Index fieldIndex, TermVector fieldTermVector, boolean omitTF) {
Field field = new Field(fieldName, tokenStream, fieldTermVector);
// NOTE(review): the statement guarded by this condition is missing in this
// copy of the file, and omitTF is never applied to the field. Presumably
// norms are disabled here and omitTF is set before returning - restore
// both from the upstream source.
if (fieldIndex == Field.Index.NOT_ANALYZED_NO_NORMS)
return field;
/**
 * Creates the stored fields for a token stream.
 *
 * With a delimiter, a single field is built from one string value; without
 * one, a field is built per token in a loop over the stream. The fields are
 * indexed as well only when unique is true; otherwise Field.Index.NO is used.
 *
 * @param fieldName name of the field
 * @param tokenStream stream supplying the values
 * @param fieldStore resolved store option
 * @param delimiter delimiter for the single-value case, or null
 * @param unique whether only one field instance is wanted
 * @param fieldIndex resolved index option, used only when unique is true
 * @param omitTF whether term frequencies should be omitted
 * @return the created fields
 * @throws FieldBuildingException wrapping an IOException caught in this method
 */
protected Collection<Field> createStoredFields(String fieldName,
TokenStream tokenStream, Store fieldStore, String delimiter,
Boolean unique, Index fieldIndex, Boolean omitTF)
throws FieldBuildingException {
Collection<Field> fields = new ArrayList<Field>();
// Only do indexing if we need a unique, indexed AND stored field.
Index index = unique ? fieldIndex : Field.Index.NO;
try {
Field field;
if (delimiter != null) {
// NOTE(review): statement cut off - the method invoked on
// tokenStreamStringConcatenator is missing in this copy of the file.
String value = tokenStreamStringConcatenator
field = new Field(fieldName, value, fieldStore, index);
// NOTE(review): the statement guarded by "if (unique)" is missing, and no
// visible line adds the field to "fields". Presumably omitTF is applied
// and the field is added here - confirm against the upstream source.
if (unique)
} else {
// NOTE(review): malformed line - the expression that fetches the first
// token from tokenStream is missing.
Token nextToken = Token());
while (nextToken != null) {
// NOTE(review): statement cut off - the length argument is missing.
String value = new String(nextToken.termBuffer(), 0,
field = new Field(fieldName, value, fieldStore, index);
// NOTE(review): guarded statement and the add to "fields" are missing
// here as well.
if (unique)
// NOTE(review): malformed line - the expression that advances to the
// next token is missing.
nextToken =;
} catch (IOException e) {
throw createException(e);
return fields;
protected Field.Index getFieldIndex(String index)
throws FieldBuildingException {
if (index.equals(FIELD_INDEX_NO))
return Field.Index.NO;
else if (index.equals(FIELD_INDEX_YES))
return Field.Index.NOT_ANALYZED;
else if (index.equals(FIELD_INDEX_NO_NORMS))
return Field.Index.NOT_ANALYZED_NO_NORMS;
else if (index.equals(FIELD_INDEX_NO_NORMS_TF))
return Field.Index.NOT_ANALYZED_NO_NORMS;
else if (index.equals(FIELD_INDEX_NO_TF))
return Field.Index.NOT_ANALYZED;
throw createException(new IllegalArgumentException(
"unknown index parameter: " + index));
protected Field.TermVector getFieldTermVector(String termVector)
throws FieldBuildingException {
if (termVector.equals(FIELD_TERM_VECTOR_NO))
return Field.TermVector.NO;
else if (termVector.equals(FIELD_TERM_VECTOR_YES))
return Field.TermVector.YES;
else if (termVector.equals(FIELD_TERM_VECTOR_WITH_OFFSETS))
return Field.TermVector.WITH_OFFSETS;
else if (termVector.equals(FIELD_TERM_VECTOR_WITH_POSITIONS))
return Field.TermVector.WITH_POSITIONS;
return Field.TermVector.WITH_POSITIONS_OFFSETS;
throw createException(new IllegalArgumentException(
"unknown termVector parameter: " + termVector));
protected Field.Store getFieldStore(String store)
throws FieldBuildingException {
if (store.equals(FIELD_STORE_NO))
return Field.Store.NO;
else if (store.equals(FIELD_STORE_YES))
return Field.Store.YES;
else if (store.equals(FIELD_STORE_COMPRESS))
return Field.Store.COMPRESS;
throw createException(new IllegalArgumentException(
"unknown stored parameter: " + store));