/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.jackrabbit.oak.plugins.index.lucene;

import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.util.Version;
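
/**
 * {@link Analyzer} that tokenizes text with {@link StandardTokenizer},
 * lower-cases the tokens and further splits them with a
 * {@link WordDelimiterFilter}, optionally preserving the original term.
 * <p>
 * Illustrative usage sketch (the Lucene version constant shown is only an
 * example):
 * <pre>{@code
 * Analyzer analyzer = new OakAnalyzer(Version.LUCENE_47, true);
 * }</pre>
 */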
public class OakAnalyzer extends Analyzer {
    private final Version matchVersion;

    private final int indexOriginalTerm;

    /**
     * Creates a new {@link OakAnalyzer}.
     *
     * @param matchVersion
     *            Lucene version compatibility to match; see {@link Version}
     */
    public OakAnalyzer(Version matchVersion) {
        this(matchVersion, false);
    }

    /**
     * Creates a new {@link OakAnalyzer} with a configurable flag to also
     * preserve the original term being analyzed.
     *
     * @param matchVersion
     *            Lucene version compatibility to match; see {@link Version}
     * @param indexOriginalTerm
     *            if {@code true}, {@link WordDelimiterFilter#PRESERVE_ORIGINAL}
     *            is added to the word delimiter configuration so that the
     *            original term is indexed as well
     */
    public OakAnalyzer(Version matchVersion, boolean indexOriginalTerm) {
        this.matchVersion = matchVersion;
        this.indexOriginalTerm = indexOriginalTerm
                ? WordDelimiterFilter.PRESERVE_ORIGINAL
                : 0;
    }

    @Override
    protected TokenStreamComponents createComponents(final String fieldName,
            final Reader reader) {
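        // Tokenize the input using Lucene's StandardTokenizer (Unicode text segmentation)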
        StandardTokenizer src = new StandardTokenizer(matchVersion, reader);
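        // Normalize all tokens to lower case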
        TokenStream tok = new LowerCaseFilter(matchVersion, src);
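        // Split tokens on non-alphanumeric delimiters, emitting word and number parts,
        // stripping English possessives and, when configured, preserving the original token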
        tok = new WordDelimiterFilter(tok,
                WordDelimiterFilter.GENERATE_WORD_PARTS
                        | WordDelimiterFilter.STEM_ENGLISH_POSSESSIVE
                        | this.indexOriginalTerm
                        | WordDelimiterFilter.GENERATE_NUMBER_PARTS,
                null);
        return new TokenStreamComponents(src, tok);
    }
}