src/java/org/apache/cassandra/index/sasi/analyzer/StandardTokenizerOptions.java - cassandra - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
  * regarding copyright ownership.  The ASF licenses this file
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.cassandra.index.sasi.analyzer;

 import java.util.Locale;
 import java.util.Map;

 /**
  * Various options for controlling tokenization and enabling
  * or disabling features
  */
 public class StandardTokenizerOptions
 {
     /** Map key read by {@link #buildFromMap(Map)}: boolean flag controlling term stemming. */
     public static final String TOKENIZATION_ENABLE_STEMMING = "tokenization_enable_stemming";
     /** Map key read by {@link #buildFromMap(Map)}: boolean flag controlling stop-term skipping. */
     public static final String TOKENIZATION_SKIP_STOP_WORDS = "tokenization_skip_stop_words";
     /** Map key read by {@link #buildFromMap(Map)}: value is handed to {@code new Locale(String)}. */
     public static final String TOKENIZATION_LOCALE = "tokenization_locale";
     /** Map key read by {@link #buildFromMap(Map)}: boolean flag requesting lower-case normalization. */
     public static final String TOKENIZATION_NORMALIZE_LOWERCASE = "tokenization_normalize_lowercase";
     /** Map key read by {@link #buildFromMap(Map)}: boolean flag requesting upper-case normalization. */
     public static final String TOKENIZATION_NORMALIZE_UPPERCASE = "tokenization_normalize_uppercase";

     /** Max token length an {@link OptionsBuilder} starts with. */
     public static final int DEFAULT_MAX_TOKEN_LENGTH = 255;
     /** Min token length an {@link OptionsBuilder} starts with. */
     public static final int DEFAULT_MIN_TOKEN_LENGTH = 0;

     private boolean stemTerms;
     private boolean ignoreStopTerms;
     private Locale locale;
     private boolean caseSensitive;
     private boolean allTermsToUpperCase;
     private boolean allTermsToLowerCase;
     private int minTokenLength;
     private int maxTokenLength;

     /** @return the stemming flag */
     public boolean shouldStemTerms()
     {
         return stemTerms;
     }

     /** @param stemTerms new value of the stemming flag */
     public void setStemTerms(boolean stemTerms)
     {
         this.stemTerms = stemTerms;
     }

     /** @return the stop-term skipping flag */
     public boolean shouldIgnoreStopTerms()
     {
         return ignoreStopTerms;
     }

     /** @param ignoreStopTerms new value of the stop-term skipping flag */
     public void setIgnoreStopTerms(boolean ignoreStopTerms)
     {
         this.ignoreStopTerms = ignoreStopTerms;
     }

     /** @return the configured locale; {@code null} if none was set */
     public Locale getLocale()
     {
         return locale;
     }

     /** @param locale locale to store (stored as given, no null check) */
     public void setLocale(Locale locale)
     {
         this.locale = locale;
     }

     /** @return the case-sensitivity flag */
     public boolean isCaseSensitive()
     {
         return caseSensitive;
     }

     /** @param caseSensitive new value of the case-sensitivity flag */
     public void setCaseSensitive(boolean caseSensitive)
     {
         this.caseSensitive = caseSensitive;
     }

     /** @return the upper-case normalization flag */
     public boolean shouldUpperCaseTerms()
     {
         return allTermsToUpperCase;
     }

     /**
      * Stores the upper-case normalization flag. Unlike {@link OptionsBuilder#build()},
      * this setter does not check for a conflict with the lower-case flag.
      */
     public void setAllTermsToUpperCase(boolean allTermsToUpperCase)
     {
         this.allTermsToUpperCase = allTermsToUpperCase;
     }

     /** @return the lower-case normalization flag */
     public boolean shouldLowerCaseTerms()
     {
         return allTermsToLowerCase;
     }

     /**
      * Stores the lower-case normalization flag. Unlike {@link OptionsBuilder#build()},
      * this setter does not check for a conflict with the upper-case flag.
      */
     public void setAllTermsToLowerCase(boolean allTermsToLowerCase)
     {
         this.allTermsToLowerCase = allTermsToLowerCase;
     }

     /** @return the min allowed token length */
     public int getMinTokenLength()
     {
         return minTokenLength;
     }

     /** @param minTokenLength min allowed token length (stored without validation) */
     public void setMinTokenLength(int minTokenLength)
     {
         this.minTokenLength = minTokenLength;
     }

     /** @return the max allowed token length */
     public int getMaxTokenLength()
     {
         return maxTokenLength;
     }

     /** @param maxTokenLength max allowed token length (stored without validation) */
     public void setMaxTokenLength(int maxTokenLength)
     {
         this.maxTokenLength = maxTokenLength;
     }

     /**
      * Fluent builder for {@link StandardTokenizerOptions}. Boolean flags start as
      * {@code false}, the locale starts as {@code null}, and the token length bounds
      * start at {@link #DEFAULT_MIN_TOKEN_LENGTH} / {@link #DEFAULT_MAX_TOKEN_LENGTH}.
      */
     public static class OptionsBuilder
     {
         private boolean stemTerms;
         private boolean ignoreStopTerms;
         private Locale locale;
         private boolean caseSensitive;
         private boolean allTermsToUpperCase;
         private boolean allTermsToLowerCase;
         private int minTokenLength = DEFAULT_MIN_TOKEN_LENGTH;
         private int maxTokenLength = DEFAULT_MAX_TOKEN_LENGTH;

         public OptionsBuilder()
         {
         }

         /** Sets the stemming flag and returns this builder. */
         public OptionsBuilder stemTerms(boolean stemTerms)
         {
             this.stemTerms = stemTerms;
             return this;
         }

         /** Sets the stop-term skipping flag and returns this builder. */
         public OptionsBuilder ignoreStopTerms(boolean ignoreStopTerms)
         {
             this.ignoreStopTerms = ignoreStopTerms;
             return this;
         }

         /** Sets the locale (stored as given) and returns this builder. */
         public OptionsBuilder useLocale(Locale locale)
         {
             this.locale = locale;
             return this;
         }

         /** Sets the case-sensitivity flag and returns this builder. */
         public OptionsBuilder caseSensitive(boolean caseSensitive)
         {
             this.caseSensitive = caseSensitive;
             return this;
         }

         /** Sets the upper-case normalization flag; the conflict check happens in {@link #build()}. */
         public OptionsBuilder alwaysUpperCaseTerms(boolean allTermsToUpperCase)
         {
             this.allTermsToUpperCase = allTermsToUpperCase;
             return this;
         }

         /** Sets the lower-case normalization flag; the conflict check happens in {@link #build()}. */
         public OptionsBuilder alwaysLowerCaseTerms(boolean allTermsToLowerCase)
         {
             this.allTermsToLowerCase = allTermsToLowerCase;
             return this;
         }

         /**
          * Set the min allowed token length.  Any token shorter
          * than this is skipped.
          *
          * Note that the builder's initial value ({@link #DEFAULT_MIN_TOKEN_LENGTH}, i.e. 0)
          * is itself rejected by this method, so it cannot be re-applied explicitly.
          *
          * @throws IllegalArgumentException if {@code minTokenLength} is less than 1
          */
         public OptionsBuilder minTokenLength(int minTokenLength)
         {
             if (minTokenLength < 1)
                 throw new IllegalArgumentException("minTokenLength must be greater than zero");
             this.minTokenLength = minTokenLength;
             return this;
         }

         /**
          * Set the max allowed token length.  Any token longer
          * than this is skipped.
          *
          * @throws IllegalArgumentException if {@code maxTokenLength} is less than 1
          */
         public OptionsBuilder maxTokenLength(int maxTokenLength)
         {
             if (maxTokenLength < 1)
                 throw new IllegalArgumentException("maxTokenLength must be greater than zero");
             this.maxTokenLength = maxTokenLength;
             return this;
         }

         /**
          * Creates a new {@link StandardTokenizerOptions} carrying this builder's current values.
          * The only cross-field check is the upper/lower-case conflict; the relative order of
          * the min and max token lengths is not verified.
          *
          * @throws IllegalArgumentException if both upper-case and lower-case normalization are enabled
          */
         public StandardTokenizerOptions build()
         {
             if(allTermsToLowerCase && allTermsToUpperCase)
                 throw new IllegalArgumentException("Options to normalize terms cannot be " +
                         "both uppercase and lowercase at the same time");

             StandardTokenizerOptions options = new StandardTokenizerOptions();
             options.setIgnoreStopTerms(ignoreStopTerms);
             options.setStemTerms(stemTerms);
             options.setLocale(locale);
             options.setCaseSensitive(caseSensitive);
             options.setAllTermsToLowerCase(allTermsToLowerCase);
             options.setAllTermsToUpperCase(allTermsToUpperCase);
             options.setMinTokenLength(minTokenLength);
             options.setMaxTokenLength(maxTokenLength);
             return options;
         }
     }

     /**
      * Builds options from a string-keyed configuration map.
      *
      * Recognised keys are the {@code TOKENIZATION_*} constants of this class. Boolean
      * options are parsed with {@link Boolean#parseBoolean(String)}, so anything other
      * than (case-insensitive) "true" yields {@code false}. Unrecognised keys, including
      * a {@code null} key, are ignored. A {@code null} map is treated like an empty one.
      *
      * @param optionsMap option name to option value; may be {@code null}
      * @return options built from an {@link OptionsBuilder} with the recognised entries applied
      * @throws IllegalArgumentException if both case normalizations end up enabled
      * @throws NullPointerException if the value mapped to {@link #TOKENIZATION_LOCALE} is {@code null}
      */
     public static StandardTokenizerOptions buildFromMap(Map<String, String> optionsMap)
     {
         OptionsBuilder optionsBuilder = new OptionsBuilder();
         if (optionsMap == null)
             return optionsBuilder.build();

         for (Map.Entry<String, String> entry : optionsMap.entrySet())
         {
             String key = entry.getKey();
             // switch on a null String throws NullPointerException; a null key cannot
             // name any option, so skip it like every other unrecognised key.
             if (key == null)
                 continue;

             String value = entry.getValue();
             switch (key)
             {
                 case TOKENIZATION_ENABLE_STEMMING:
                     optionsBuilder.stemTerms(Boolean.parseBoolean(value));
                     break;
                 case TOKENIZATION_SKIP_STOP_WORDS:
                     optionsBuilder.ignoreStopTerms(Boolean.parseBoolean(value));
                     break;
                 case TOKENIZATION_LOCALE:
                     optionsBuilder.useLocale(new Locale(value));
                     break;
                 case TOKENIZATION_NORMALIZE_UPPERCASE:
                     optionsBuilder.alwaysUpperCaseTerms(Boolean.parseBoolean(value));
                     break;
                 case TOKENIZATION_NORMALIZE_LOWERCASE:
                     optionsBuilder.alwaysLowerCaseTerms(Boolean.parseBoolean(value));
                     break;
                 default:
                     // unrecognised option: ignored
                     break;
             }
         }
         return optionsBuilder.build();
     }

     /**
      * @return options with stop-term skipping and lower-case normalization enabled,
      *         stemming disabled, and {@link Locale#ENGLISH} as the locale
      */
     public static StandardTokenizerOptions getDefaultOptions()
     {
         return new OptionsBuilder()
                 .ignoreStopTerms(true).alwaysLowerCaseTerms(true)
                 .stemTerms(false).useLocale(Locale.ENGLISH).build();
     }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one
	* or more contributor license agreements. See the NOTICE file
	* distributed with this work for additional information
	* regarding copyright ownership. The ASF licenses this file
	* to you under the Apache License, Version 2.0 (the
	* "License"); you may not use this file except in compliance
	* with the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package org.apache.cassandra.index.sasi.analyzer;

	import java.util.Locale;
	import java.util.Map;

	/**
	* Various options for controlling tokenization and enabling
	* or disabling features
	*/
	public class StandardTokenizerOptions
	{
	    // Keys understood by buildFromMap(Map); every other key is ignored there.
	    public static final String TOKENIZATION_ENABLE_STEMMING = "tokenization_enable_stemming";
	    public static final String TOKENIZATION_SKIP_STOP_WORDS = "tokenization_skip_stop_words";
	    public static final String TOKENIZATION_LOCALE = "tokenization_locale";
	    public static final String TOKENIZATION_NORMALIZE_LOWERCASE = "tokenization_normalize_lowercase";
	    public static final String TOKENIZATION_NORMALIZE_UPPERCASE = "tokenization_normalize_uppercase";

	    // Token length bounds a fresh OptionsBuilder starts with.
	    public static final int DEFAULT_MAX_TOKEN_LENGTH = 255;
	    public static final int DEFAULT_MIN_TOKEN_LENGTH = 0;

	    private boolean stemTerms;
	    private boolean ignoreStopTerms;
	    private Locale locale;
	    private boolean caseSensitive;
	    private boolean allTermsToUpperCase;
	    private boolean allTermsToLowerCase;
	    private int minTokenLength;
	    private int maxTokenLength;

	    /** @return the stemming flag */
	    public boolean shouldStemTerms()
	    {
	        return stemTerms;
	    }

	    /** Stores the stemming flag. */
	    public void setStemTerms(boolean stemTerms)
	    {
	        this.stemTerms = stemTerms;
	    }

	    /** @return the stop-term skipping flag */
	    public boolean shouldIgnoreStopTerms()
	    {
	        return ignoreStopTerms;
	    }

	    /** Stores the stop-term skipping flag. */
	    public void setIgnoreStopTerms(boolean ignoreStopTerms)
	    {
	        this.ignoreStopTerms = ignoreStopTerms;
	    }

	    /** @return the stored locale, or {@code null} when none has been set */
	    public Locale getLocale()
	    {
	        return locale;
	    }

	    /** Stores the locale exactly as given (no null check). */
	    public void setLocale(Locale locale)
	    {
	        this.locale = locale;
	    }

	    /** @return the case-sensitivity flag */
	    public boolean isCaseSensitive()
	    {
	        return caseSensitive;
	    }

	    /** Stores the case-sensitivity flag. */
	    public void setCaseSensitive(boolean caseSensitive)
	    {
	        this.caseSensitive = caseSensitive;
	    }

	    /** @return the upper-case normalization flag */
	    public boolean shouldUpperCaseTerms()
	    {
	        return allTermsToUpperCase;
	    }

	    /** Stores the upper-case normalization flag; no conflict check is done here. */
	    public void setAllTermsToUpperCase(boolean allTermsToUpperCase)
	    {
	        this.allTermsToUpperCase = allTermsToUpperCase;
	    }

	    /** @return the lower-case normalization flag */
	    public boolean shouldLowerCaseTerms()
	    {
	        return allTermsToLowerCase;
	    }

	    /** Stores the lower-case normalization flag; no conflict check is done here. */
	    public void setAllTermsToLowerCase(boolean allTermsToLowerCase)
	    {
	        this.allTermsToLowerCase = allTermsToLowerCase;
	    }

	    /** @return the min allowed token length */
	    public int getMinTokenLength()
	    {
	        return minTokenLength;
	    }

	    /** Stores the min allowed token length without validating it. */
	    public void setMinTokenLength(int minTokenLength)
	    {
	        this.minTokenLength = minTokenLength;
	    }

	    /** @return the max allowed token length */
	    public int getMaxTokenLength()
	    {
	        return maxTokenLength;
	    }

	    /** Stores the max allowed token length without validating it. */
	    public void setMaxTokenLength(int maxTokenLength)
	    {
	        this.maxTokenLength = maxTokenLength;
	    }

	    /**
	     * Fluent builder for {@link StandardTokenizerOptions}. Every flag starts out
	     * {@code false}, the locale starts out {@code null}, and the length bounds start
	     * at {@link #DEFAULT_MIN_TOKEN_LENGTH} and {@link #DEFAULT_MAX_TOKEN_LENGTH}.
	     */
	    public static class OptionsBuilder
	    {
	        private boolean stemTerms;
	        private boolean ignoreStopTerms;
	        private Locale locale;
	        private boolean caseSensitive;
	        private boolean allTermsToUpperCase;
	        private boolean allTermsToLowerCase;
	        private int minTokenLength = DEFAULT_MIN_TOKEN_LENGTH;
	        private int maxTokenLength = DEFAULT_MAX_TOKEN_LENGTH;

	        public OptionsBuilder()
	        {
	        }

	        /** Records the stemming flag; returns this builder for chaining. */
	        public OptionsBuilder stemTerms(boolean stemTerms)
	        {
	            this.stemTerms = stemTerms;
	            return this;
	        }

	        /** Records the stop-term skipping flag; returns this builder for chaining. */
	        public OptionsBuilder ignoreStopTerms(boolean ignoreStopTerms)
	        {
	            this.ignoreStopTerms = ignoreStopTerms;
	            return this;
	        }

	        /** Records the locale as given; returns this builder for chaining. */
	        public OptionsBuilder useLocale(Locale locale)
	        {
	            this.locale = locale;
	            return this;
	        }

	        /** Records the case-sensitivity flag; returns this builder for chaining. */
	        public OptionsBuilder caseSensitive(boolean caseSensitive)
	        {
	            this.caseSensitive = caseSensitive;
	            return this;
	        }

	        /** Records the upper-case normalization flag; conflicts are detected in {@link #build()}. */
	        public OptionsBuilder alwaysUpperCaseTerms(boolean allTermsToUpperCase)
	        {
	            this.allTermsToUpperCase = allTermsToUpperCase;
	            return this;
	        }

	        /** Records the lower-case normalization flag; conflicts are detected in {@link #build()}. */
	        public OptionsBuilder alwaysLowerCaseTerms(boolean allTermsToLowerCase)
	        {
	            this.allTermsToLowerCase = allTermsToLowerCase;
	            return this;
	        }

	        /**
	         * Set the min allowed token length. Any token shorter
	         * than this is skipped.
	         *
	         * The builder's starting value of 0 is below what this method accepts.
	         *
	         * @throws IllegalArgumentException if {@code minTokenLength} is less than 1
	         */
	        public OptionsBuilder minTokenLength(int minTokenLength)
	        {
	            if (minTokenLength < 1)
	                throw new IllegalArgumentException("minTokenLength must be greater than zero");
	            this.minTokenLength = minTokenLength;
	            return this;
	        }

	        /**
	         * Set the max allowed token length. Any token longer
	         * than this is skipped.
	         *
	         * @throws IllegalArgumentException if {@code maxTokenLength} is less than 1
	         */
	        public OptionsBuilder maxTokenLength(int maxTokenLength)
	        {
	            if (maxTokenLength < 1)
	                throw new IllegalArgumentException("maxTokenLength must be greater than zero");
	            this.maxTokenLength = maxTokenLength;
	            return this;
	        }

	        /**
	         * Produces a new options object from the values recorded so far. Only the
	         * upper/lower-case conflict is checked; min versus max length is not compared.
	         *
	         * @throws IllegalArgumentException if both case normalizations are enabled
	         */
	        public StandardTokenizerOptions build()
	        {
	            if(allTermsToLowerCase && allTermsToUpperCase)
	                throw new IllegalArgumentException("Options to normalize terms cannot be " +
	                        "both uppercase and lowercase at the same time");

	            StandardTokenizerOptions options = new StandardTokenizerOptions();
	            options.setIgnoreStopTerms(ignoreStopTerms);
	            options.setStemTerms(stemTerms);
	            options.setLocale(locale);
	            options.setCaseSensitive(caseSensitive);
	            options.setAllTermsToLowerCase(allTermsToLowerCase);
	            options.setAllTermsToUpperCase(allTermsToUpperCase);
	            options.setMinTokenLength(minTokenLength);
	            options.setMaxTokenLength(maxTokenLength);
	            return options;
	        }
	    }

	    /**
	     * Builds options from a map of option name to option value.
	     *
	     * Entries whose key is one of the {@code TOKENIZATION_*} constants are applied to a
	     * fresh {@link OptionsBuilder}; all other entries, including one with a {@code null}
	     * key, are ignored. A {@code null} map behaves like an empty map. Boolean values go
	     * through {@link Boolean#parseBoolean(String)}.
	     *
	     * @param optionsMap option name to option value; may be {@code null}
	     * @return the options produced by the builder
	     * @throws IllegalArgumentException if both case normalizations end up enabled
	     * @throws NullPointerException if the {@link #TOKENIZATION_LOCALE} entry has a {@code null} value
	     */
	    public static StandardTokenizerOptions buildFromMap(Map<String, String> optionsMap)
	    {
	        OptionsBuilder optionsBuilder = new OptionsBuilder();

	        if (optionsMap != null)
	        {
	            for (Map.Entry<String, String> entry : optionsMap.entrySet())
	                applyOption(optionsBuilder, entry.getKey(), entry.getValue());
	        }
	        return optionsBuilder.build();
	    }

	    /** Applies a single map entry to the builder; keys that name no option are skipped. */
	    private static void applyOption(OptionsBuilder optionsBuilder, String key, String value)
	    {
	        // A null key would make the String switch below throw NullPointerException,
	        // yet it is just another key that names no option.
	        if (key == null)
	            return;

	        switch (key)
	        {
	            case TOKENIZATION_ENABLE_STEMMING:
	                optionsBuilder.stemTerms(Boolean.parseBoolean(value));
	                break;
	            case TOKENIZATION_SKIP_STOP_WORDS:
	                optionsBuilder.ignoreStopTerms(Boolean.parseBoolean(value));
	                break;
	            case TOKENIZATION_LOCALE:
	                optionsBuilder.useLocale(new Locale(value));
	                break;
	            case TOKENIZATION_NORMALIZE_UPPERCASE:
	                optionsBuilder.alwaysUpperCaseTerms(Boolean.parseBoolean(value));
	                break;
	            case TOKENIZATION_NORMALIZE_LOWERCASE:
	                optionsBuilder.alwaysLowerCaseTerms(Boolean.parseBoolean(value));
	                break;
	            default:
	                // not an option this class knows about
	                break;
	        }
	    }

	    /**
	     * @return options with stop-term skipping and lower-case normalization on,
	     *         stemming off, and {@link Locale#ENGLISH} as the locale
	     */
	    public static StandardTokenizerOptions getDefaultOptions()
	    {
	        return new OptionsBuilder()
	                .ignoreStopTerms(true).alwaysLowerCaseTerms(true)
	                .stemTerms(false).useLocale(Locale.ENGLISH).build();
	    }
	}