Lucas/src/main/java/org/apache/uima/lucas/indexer/mapping/TermCoverBuilder.java - uima-addons - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
  * regarding copyright ownership.  The ASF licenses this file
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
  *
  *   http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing,
  * software distributed under the License is distributed on an
  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  * KIND, either express or implied.  See the License for the
  * specific language governing permissions and limitations
  * under the License.
  */
 package org.apache.uima.lucas.indexer.mapping;

 import java.util.Iterator;
 import java.util.List;
 import java.util.Map;
 import java.util.Properties;

 import org.apache.lucene.analysis.TokenStream;
 import org.apache.uima.lucas.indexer.analysis.SelectFilter;
 import org.apache.uima.lucas.indexer.analysis.SelectFilterFactory;

 /**
  * <p>
  * Constructs <tt>TokenStream</tt>s according to a cover definition.
  * </p>
  * <p>
  * This class uses {@link SelectFilter}s to build <tt>TokenStream</tt>s which
  * only output terms that pass the <tt>SelectFilter</tt>. Each
  * <tt>TokenStream</tt> corresponds to one cover set and will only output terms
  * included in this particular set. This mechanism can be used to dynamically
  * create multiple fields according to a definition about which term may be included
  * in which field.<br/>
  * To create an instance of this class, the {@link TermCoverBuilderFactory}
  * should be used as it cashes the cover definition. For further explanation of
  * the nature of these covers and their required file format, please refer to
  * the factory class.
  * </p>
  *
  * @see TermCoverBuilderFactory
  *
  */
 public class TermCoverBuilder {

 	private final Map<String, List<String>> vocabularyCover;
 	private Iterator<String> subsetIterator;
 	private String currentCoverSubsetName;
 	private TokenStream currentCoverSubsetTokenStream;
 	private SelectFilterFactory selectFilterFactory;
 	private TokenStream tokenStream;
 	private Properties properties;
 	private final TermCoverDescription fieldDescription;

 	/**
 	 * @param vocabularyCover
 	 * @param selectFilterFactory
 	 */
 	public TermCoverBuilder(Map<String, List<String>> vocabularyCover,
 			TokenStream tokenStream, TermCoverDescription fieldDescription,
 			SelectFilterFactory selectFilterFactory) {
 		this.vocabularyCover = vocabularyCover;
 		this.tokenStream = tokenStream;
 		this.fieldDescription = fieldDescription;
 		this.selectFilterFactory = selectFilterFactory;
 		properties = new Properties();
 		properties.put("enablePositionIncrements", "false");
 		properties.put("ignoreCase",
 				fieldDescription.getIgnoreCaseOfSelectedTerms() ? "true" : "false");
 		subsetIterator = vocabularyCover.keySet().iterator();
 	}

 	public boolean increaseCoverSubset() {
 		if (subsetIterator.hasNext()) {
 			currentCoverSubsetName = subsetIterator.next();
 			properties.put(SelectFilterFactory.COVER_SUBSET_NAME, getCoverSubsetName());
 			currentCoverSubsetTokenStream = selectFilterFactory
 					.createTokenFilter(tokenStream, properties,
 							vocabularyCover.get(currentCoverSubsetName));
 			return true;
 		}
 		return false;
 	}

 	public String getCoverSubsetName() {
 		String fieldName = fieldDescription.getFieldName();
 		String combineFieldName = fieldDescription.getGenerateFieldNameMethod();
 		if (combineFieldName.equals(TermCoverMapper.USE_SUBSET_NAME_APPEND))
 			fieldName = fieldName + currentCoverSubsetName;
 		else if (combineFieldName.equals(TermCoverMapper.USE_SUBSET_NAME_PREPEND))
 			fieldName = currentCoverSubsetName + fieldName;
 		else if (combineFieldName.equals(TermCoverMapper.USE_SUBSET_NAME_REPLACE))
 			fieldName = currentCoverSubsetName;
 		return fieldName;
 	}

 	public TokenStream getPartitionTokenStream() {
 		return currentCoverSubsetTokenStream;
 	}

 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one
	* or more contributor license agreements. See the NOTICE file
	* distributed with this work for additional information
	* regarding copyright ownership. The ASF licenses this file
	* to you under the Apache License, Version 2.0 (the
	* "License"); you may not use this file except in compliance
	* with the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing,
	* software distributed under the License is distributed on an
	* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	* KIND, either express or implied. See the License for the
	* specific language governing permissions and limitations
	* under the License.
	*/
	package org.apache.uima.lucas.indexer.mapping;

	import java.util.Iterator;
	import java.util.List;
	import java.util.Map;
	import java.util.Properties;

	import org.apache.lucene.analysis.TokenStream;
	import org.apache.uima.lucas.indexer.analysis.SelectFilter;
	import org.apache.uima.lucas.indexer.analysis.SelectFilterFactory;

	/**
	* <p>
	* Constructs <tt>TokenStream</tt>s according to a cover definition.
	* </p>
	* <p>
	* This class uses {@link SelectFilter}s to build <tt>TokenStream</tt>s which
	* only output terms that pass the <tt>SelectFilter</tt>. Each
	* <tt>TokenStream</tt> corresponds to one cover set and will only output terms
	* included in this particular set. This mechanism can be used to dynamically
	* create multiple fields according to a definition about which term may be included
	* in which field.<br/>
	* To create an instance of this class, the {@link TermCoverBuilderFactory}
	* should be used as it cashes the cover definition. For further explanation of
	* the nature of these covers and their required file format, please refer to
	* the factory class.
	* </p>
	*
	* @see TermCoverBuilderFactory
	*
	*/
	public class TermCoverBuilder {

	private final Map<String, List<String>> vocabularyCover;
	private Iterator<String> subsetIterator;
	private String currentCoverSubsetName;
	private TokenStream currentCoverSubsetTokenStream;
	private SelectFilterFactory selectFilterFactory;
	private TokenStream tokenStream;
	private Properties properties;
	private final TermCoverDescription fieldDescription;

	/**
	* @param vocabularyCover
	* @param selectFilterFactory
	*/
	public TermCoverBuilder(Map<String, List<String>> vocabularyCover,
	TokenStream tokenStream, TermCoverDescription fieldDescription,
	SelectFilterFactory selectFilterFactory) {
	this.vocabularyCover = vocabularyCover;
	this.tokenStream = tokenStream;
	this.fieldDescription = fieldDescription;
	this.selectFilterFactory = selectFilterFactory;
	properties = new Properties();
	properties.put("enablePositionIncrements", "false");
	properties.put("ignoreCase",
	fieldDescription.getIgnoreCaseOfSelectedTerms() ? "true" : "false");
	subsetIterator = vocabularyCover.keySet().iterator();
	}

	public boolean increaseCoverSubset() {
	if (subsetIterator.hasNext()) {
	currentCoverSubsetName = subsetIterator.next();
	properties.put(SelectFilterFactory.COVER_SUBSET_NAME, getCoverSubsetName());
	currentCoverSubsetTokenStream = selectFilterFactory
	.createTokenFilter(tokenStream, properties,
	vocabularyCover.get(currentCoverSubsetName));
	return true;
	}
	return false;
	}

	public String getCoverSubsetName() {
	String fieldName = fieldDescription.getFieldName();
	String combineFieldName = fieldDescription.getGenerateFieldNameMethod();
	if (combineFieldName.equals(TermCoverMapper.USE_SUBSET_NAME_APPEND))
	fieldName = fieldName + currentCoverSubsetName;
	else if (combineFieldName.equals(TermCoverMapper.USE_SUBSET_NAME_PREPEND))
	fieldName = currentCoverSubsetName + fieldName;
	else if (combineFieldName.equals(TermCoverMapper.USE_SUBSET_NAME_REPLACE))
	fieldName = currentCoverSubsetName;
	return fieldName;
	}

	public TokenStream getPartitionTokenStream() {
	return currentCoverSubsetTokenStream;
	}

	}