/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.uima.lucas.indexer;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Index;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.Field.TermVector;
import org.apache.uima.lucas.indexer.analysis.TokenStreamConcatenator;
import org.apache.uima.lucas.indexer.analysis.TokenStreamMerger;
import org.apache.uima.lucas.indexer.mapping.FieldDescription;
import org.apache.uima.lucas.indexer.mapping.FilterDescription;
import org.apache.uima.lucas.indexer.mapping.TermCoverBuilder;
import org.apache.uima.lucas.indexer.mapping.TermCoverBuilderFactory;
import org.apache.uima.lucas.indexer.util.TokenStreamStringConcatenator;
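/**
 * Builds Lucene {@link Field} instances from UIMA-derived {@link TokenStream}s
 * according to a {@link FieldDescription} from the mapping file. Depending on
 * the description, the incoming streams are merged or concatenated, filtered,
 * and turned into stored and/or indexed fields, optionally split into term
 * cover subsets.
 *
 * <p>A minimal usage sketch (the filter builder, token streams, field
 * description and document are assumed to be provided by the surrounding
 * indexing code; the names are illustrative only):</p>
 *
 * <pre>
 * FieldBuilder fieldBuilder = new FieldBuilder(filterBuilder);
 * Collection&lt;Field&gt; fields = fieldBuilder.createFields(tokenStreams, fieldDescription);
 * for (Field field : fields)
 *   document.add(field);
 * </pre>
 */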
public class FieldBuilder {
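// Legal values of the index parameter of a field description, see getFieldIndex().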
public static final String FIELD_INDEX_NO = "no";
public static final String FIELD_INDEX_YES = "yes";
public static final String FIELD_INDEX_NO_NORMS = "no_norms";
public static final String FIELD_INDEX_NO_TF = "no_tf";
public static final String FIELD_INDEX_NO_NORMS_TF = "no_norms_tf";
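// Legal values of the termVector parameter, see getFieldTermVector().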
public static final String FIELD_TERM_VECTOR_NO = "no";
public static final String FIELD_TERM_VECTOR_YES = "yes";
public static final String FIELD_TERM_VECTOR_WITH_OFFSETS = "offsets";
public static final String FIELD_TERM_VECTOR_WITH_POSITIONS = "positions";
public static final String FIELD_TERM_VECTOR_WITH_POSITIONS_OFFSETS = "positions_offsets";
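// Legal values of the stored parameter, see getFieldStore().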
public static final String FIELD_STORE_NO = "no";
public static final String FIELD_STORE_YES = "yes";
public static final String FIELD_STORE_COMPRESS = "compress";
protected TokenStreamStringConcatenator tokenStreamStringConcatenator;
private FilterBuilder filterBuilder;
protected FieldDescription fieldDescription;
private TermCoverBuilderFactory termCoverBuilderFactory;
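/**
 * @param filterBuilder
 *          used to apply the configured token filters before fields are built
 */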
public FieldBuilder(FilterBuilder filterBuilder) {
tokenStreamStringConcatenator = new TokenStreamStringConcatenator();
termCoverBuilderFactory = new TermCoverBuilderFactory();
this.filterBuilder = filterBuilder;
}
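/**
 * Creates the Lucene fields for a single field description. The given token
 * streams are combined into one stream (see {@link #createFieldTokenStream}),
 * filtered, and converted into stored and/or indexed fields. If a term cover
 * description is present, fields are created per cover subset instead.
 *
 * @param tokenStreams
 *          the token streams produced for this field
 * @param fieldDescription
 *          the field description from the mapping file
 * @return the resulting fields, possibly empty
 * @throws FieldBuildingException
 *           if merging, filtering or reading a token stream fails
 */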
public Collection<Field> createFields(Collection<TokenStream> tokenStreams,
FieldDescription fieldDescription) throws FieldBuildingException {
// Remember the description first so that createException() reports the
// correct field if building the token stream already fails.
this.fieldDescription = fieldDescription;
TokenStream tokenStream = createFieldTokenStream(tokenStreams,
fieldDescription);
String fieldName = fieldDescription.getName();
String delimiter = fieldDescription.getDelimiter();
Collection<Field> fields = new ArrayList<Field>();
Collection<FilterDescription> filterDescriptions = fieldDescription
.getFilterDescriptions();
tokenStream = getFilteredTokenStream(fieldName, tokenStream,
filterDescriptions);
// The unique flag means we only want ONE field instance with the
// name fieldName.
Boolean unique = fieldDescription.getUnique();
Boolean coverField = fieldDescription.getTermCoverDescription() != null;
Field.Store fieldStore = getFieldStore(fieldDescription.getStored());
Field.Index fieldIndex = getFieldIndex(fieldDescription.getIndex());
Field.TermVector fieldTermVector = getFieldTermVector(fieldDescription
.getTermVector());
boolean omitTF = fieldDescription.getIndex().equals(FIELD_INDEX_NO_TF)
|| fieldDescription.getIndex().equals(FIELD_INDEX_NO_NORMS_TF);
boolean store = fieldStore == Field.Store.YES
|| fieldStore == Field.Store.COMPRESS;
if (!coverField) {
// Create stored fields. The parameters unique, fieldIndex and
// omitTF are only needed for a field that is stored, indexed and
// unique. In that case a single Field instance is created both
// stored and indexed, so no separate indexed field is required.
// This only works with TokenStreams that contain exactly one token
// (if a TokenStream emits more tokens, several Field instances are
// created).
if (store)
fields.addAll(createStoredFields(fieldName, tokenStream,
fieldStore, delimiter, unique, fieldIndex, omitTF));
// Create indexed fields. If the field is unique and has been
// stored, there already is an instance of the field and we don't
// create another.
if (fieldIndex != Field.Index.NO && (!unique || !store))
fields.add(createIndexedField(fieldName, tokenStream,
fieldIndex, fieldTermVector, omitTF));
} else {
TermCoverBuilder termCoverBuilder = termCoverBuilderFactory
.createTermCoverBuilder(tokenStream, fieldDescription.getTermCoverDescription());
while (termCoverBuilder.increaseCoverSubset()) {
String coverSubsetName = termCoverBuilder.getCoverSubsetName();
TokenStream coverSubsetTokenStream = termCoverBuilder
.getPartitionTokenStream();
if (store)
fields.addAll(createStoredFields(coverSubsetName,
coverSubsetTokenStream, fieldStore, delimiter,
unique, fieldIndex, omitTF));
if (fieldIndex != Field.Index.NO && (!unique || !store))
fields.add(createIndexedField(coverSubsetName,
coverSubsetTokenStream, fieldIndex, fieldTermVector,
omitTF));
}
}
return fields;
}
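/**
 * Combines the token streams of a field into a single stream: merged if the
 * merge flag is set, concatenated if there is more than one stream, or
 * returned as-is for a single stream. Returns null if the collection is empty
 * and the merge flag is not set.
 */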
protected TokenStream createFieldTokenStream(
Collection<TokenStream> tokenStreams,
FieldDescription fieldDescription) throws FieldBuildingException {
TokenStream tokenStream = null;
if (fieldDescription.getMerge())
tokenStream = getTokenStreamMerger(tokenStreams);
else if (tokenStreams.size() > 1)
tokenStream = new TokenStreamConcatenator(tokenStreams);
else if (tokenStreams.size() == 1)
tokenStream = tokenStreams.iterator().next();
return tokenStream;
}
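/**
 * Applies the token filters configured for this field, wrapping any
 * FilterBuildingException into a FieldBuildingException.
 */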
protected TokenStream getFilteredTokenStream(String fieldName,
TokenStream tokenStream,
Collection<FilterDescription> filterDescriptions)
throws FieldBuildingException {
try {
return filterBuilder.filter(tokenStream, filterDescriptions);
} catch (FilterBuildingException e) {
throw createException(e);
}
}
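/**
 * Wraps the given exception into a FieldBuildingException whose message names
 * the current field and its line number in the mapping file.
 */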
protected FieldBuildingException createException(Exception e) {
String message = "Can't build field " + fieldDescription.getName()
+ " at line " + fieldDescription.getLineNumber();
return new FieldBuildingException(message, e);
}
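/**
 * Creates a TokenStreamMerger for the given streams, converting IOExceptions
 * into FieldBuildingExceptions.
 */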
private TokenStream getTokenStreamMerger(
Collection<TokenStream> tokenStreams) throws FieldBuildingException {
try {
return new TokenStreamMerger(tokenStreams);
} catch (IOException e) {
throw createException(e);
}
}
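/**
 * Creates an indexed, unstored field that consumes the given token stream.
 * Norms are omitted for NOT_ANALYZED_NO_NORMS fields, and term frequencies
 * and positions are omitted if omitTF is set.
 */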
protected Field createIndexedField(String fieldName,
TokenStream tokenStream, Index fieldIndex,
TermVector fieldTermVector, boolean omitTF) {
Field field = new Field(fieldName, tokenStream, fieldTermVector);
if (fieldIndex == Field.Index.NOT_ANALYZED_NO_NORMS)
field.setOmitNorms(true);
field.setOmitTermFreqAndPositions(omitTF);
return field;
}
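/**
 * Creates the stored fields for a token stream. If a delimiter is configured,
 * all token texts are joined into a single field value; otherwise one field is
 * created per token. For unique fields the stored field is also indexed with
 * the given index setting, so no separate indexed field is needed. The stream
 * is reset afterwards so it can be consumed again.
 */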
protected Collection<Field> createStoredFields(String fieldName,
TokenStream tokenStream, Store fieldStore, String delimiter,
Boolean unique, Index fieldIndex, Boolean omitTF)
throws FieldBuildingException {
Collection<Field> fields = new ArrayList<Field>();
// The stored field is only indexed as well if the field is unique;
// otherwise a separate indexed field is created in createFields().
Index index = unique ? fieldIndex : Field.Index.NO;
try {
Field field;
if (delimiter != null) {
String value = tokenStreamStringConcatenator
.tokenStreamToStringWithDelimiter(tokenStream,
delimiter);
field = new Field(fieldName, value, fieldStore, index);
if (unique)
field.setOmitTermFreqAndPositions(omitTF);
fields.add(field);
} else {
Token nextToken = tokenStream.next(new Token());
while (nextToken != null) {
String value = new String(nextToken.termBuffer(), 0,
nextToken.termLength());
field = new Field(fieldName, value, fieldStore, index);
if (unique)
field.setOmitTermFreqAndPositions(omitTF);
fields.add(field);
nextToken = tokenStream.next(nextToken);
}
}
tokenStream.reset();
} catch (IOException e) {
throw createException(e);
}
return fields;
}
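/**
 * Maps the index parameter of a field description to a Field.Index constant:
 * "yes" and "no_tf" map to NOT_ANALYZED, "no_norms" and "no_norms_tf" map to
 * NOT_ANALYZED_NO_NORMS (tokenization has already been done by the UIMA token
 * streams). Omitting term frequencies is handled separately via the omitTF
 * flag in createFields().
 */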
protected Field.Index getFieldIndex(String index)
throws FieldBuildingException {
if (index.equals(FIELD_INDEX_NO))
return Field.Index.NO;
else if (index.equals(FIELD_INDEX_YES))
return Field.Index.NOT_ANALYZED;
else if (index.equals(FIELD_INDEX_NO_NORMS))
return Field.Index.NOT_ANALYZED_NO_NORMS;
else if (index.equals(FIELD_INDEX_NO_NORMS_TF))
return Field.Index.NOT_ANALYZED_NO_NORMS;
else if (index.equals(FIELD_INDEX_NO_TF))
return Field.Index.NOT_ANALYZED;
throw createException(new IllegalArgumentException(
"unknown index parameter: " + index));
}
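/**
 * Maps the termVector parameter of a field description to the corresponding
 * Field.TermVector constant.
 */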
protected Field.TermVector getFieldTermVector(String termVector)
throws FieldBuildingException {
if (termVector.equals(FIELD_TERM_VECTOR_NO))
return Field.TermVector.NO;
else if (termVector.equals(FIELD_TERM_VECTOR_YES))
return Field.TermVector.YES;
else if (termVector.equals(FIELD_TERM_VECTOR_WITH_OFFSETS))
return Field.TermVector.WITH_OFFSETS;
else if (termVector.equals(FIELD_TERM_VECTOR_WITH_POSITIONS))
return Field.TermVector.WITH_POSITIONS;
else if (termVector.equals(FIELD_TERM_VECTOR_WITH_POSITIONS_OFFSETS))
return Field.TermVector.WITH_POSITIONS_OFFSETS;
throw createException(new IllegalArgumentException(
"unknown termVector parameter: " + termVector));
}
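/**
 * Maps the stored parameter of a field description to the corresponding
 * Field.Store constant.
 */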
protected Field.Store getFieldStore(String store)
throws FieldBuildingException {
if (store.equals(FIELD_STORE_NO))
return Field.Store.NO;
else if (store.equals(FIELD_STORE_YES))
return Field.Store.YES;
else if (store.equals(FIELD_STORE_COMPRESS))
return Field.Store.COMPRESS;
throw createException(new IllegalArgumentException(
"unknown stored parameter: " + store));
}
}