blob: da44f0ad7bed45146a82ff81b813a4a6ad4b1741 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.cassandra.index.sasi.analyzer;
import java.util.Locale;
import java.util.Map;
/**
 * Various options for controlling tokenization and enabling
 * or disabling features.
 *
 * Instances are normally obtained through {@link OptionsBuilder},
 * {@link #buildFromMap(Map)} or {@link #getDefaultOptions()}. The class itself
 * is a plain mutable holder: its setters perform no validation, all checks
 * live in {@link OptionsBuilder}.
 */
public class StandardTokenizerOptions
{
    /** Map key read by {@link #buildFromMap(Map)} to toggle stemming. */
    public static final String TOKENIZATION_ENABLE_STEMMING = "tokenization_enable_stemming";
    /** Map key read by {@link #buildFromMap(Map)} to toggle ignoring of stop terms. */
    public static final String TOKENIZATION_SKIP_STOP_WORDS = "tokenization_skip_stop_words";
    /** Map key read by {@link #buildFromMap(Map)} to select the locale. */
    public static final String TOKENIZATION_LOCALE = "tokenization_locale";
    /** Map key read by {@link #buildFromMap(Map)} to toggle lower-casing of all terms. */
    public static final String TOKENIZATION_NORMALIZE_LOWERCASE = "tokenization_normalize_lowercase";
    /** Map key read by {@link #buildFromMap(Map)} to toggle upper-casing of all terms. */
    public static final String TOKENIZATION_NORMALIZE_UPPERCASE = "tokenization_normalize_uppercase";

    /** Max token length used by {@link OptionsBuilder} when none is set explicitly. */
    public static final int DEFAULT_MAX_TOKEN_LENGTH = 255;
    /** Min token length used by {@link OptionsBuilder} when none is set explicitly. */
    public static final int DEFAULT_MIN_TOKEN_LENGTH = 0;

    private boolean stemTerms;
    private boolean ignoreStopTerms;
    private Locale locale;
    private boolean caseSensitive;
    private boolean allTermsToUpperCase;
    private boolean allTermsToLowerCase;
    private int minTokenLength;
    private int maxTokenLength;

    /** @return the stemming flag */
    public boolean shouldStemTerms()
    {
        return stemTerms;
    }

    /** @param stemTerms new value of the stemming flag */
    public void setStemTerms(boolean stemTerms)
    {
        this.stemTerms = stemTerms;
    }

    /** @return the ignore-stop-terms flag */
    public boolean shouldIgnoreStopTerms()
    {
        return ignoreStopTerms;
    }

    /** @param ignoreStopTerms new value of the ignore-stop-terms flag */
    public void setIgnoreStopTerms(boolean ignoreStopTerms)
    {
        this.ignoreStopTerms = ignoreStopTerms;
    }

    /** @return the configured locale; {@code null} if none was ever set */
    public Locale getLocale()
    {
        return locale;
    }

    /** @param locale locale to store (not validated, may be {@code null}) */
    public void setLocale(Locale locale)
    {
        this.locale = locale;
    }

    /** @return the case-sensitivity flag */
    public boolean isCaseSensitive()
    {
        return caseSensitive;
    }

    /** @param caseSensitive new value of the case-sensitivity flag */
    public void setCaseSensitive(boolean caseSensitive)
    {
        this.caseSensitive = caseSensitive;
    }

    /** @return the upper-case-all-terms flag */
    public boolean shouldUpperCaseTerms()
    {
        return allTermsToUpperCase;
    }

    /** @param allTermsToUpperCase new value of the upper-case-all-terms flag */
    public void setAllTermsToUpperCase(boolean allTermsToUpperCase)
    {
        this.allTermsToUpperCase = allTermsToUpperCase;
    }

    /** @return the lower-case-all-terms flag */
    public boolean shouldLowerCaseTerms()
    {
        return allTermsToLowerCase;
    }

    /** @param allTermsToLowerCase new value of the lower-case-all-terms flag */
    public void setAllTermsToLowerCase(boolean allTermsToLowerCase)
    {
        this.allTermsToLowerCase = allTermsToLowerCase;
    }

    /** @return the minimum token length */
    public int getMinTokenLength()
    {
        return minTokenLength;
    }

    /** @param minTokenLength minimum token length to store (not validated here) */
    public void setMinTokenLength(int minTokenLength)
    {
        this.minTokenLength = minTokenLength;
    }

    /** @return the maximum token length */
    public int getMaxTokenLength()
    {
        return maxTokenLength;
    }

    /** @param maxTokenLength maximum token length to store (not validated here) */
    public void setMaxTokenLength(int maxTokenLength)
    {
        this.maxTokenLength = maxTokenLength;
    }

    /**
     * Fluent builder for {@link StandardTokenizerOptions}.
     *
     * Boolean flags start out {@code false}, the locale starts out {@code null}
     * and the token length bounds start at {@link #DEFAULT_MIN_TOKEN_LENGTH} and
     * {@link #DEFAULT_MAX_TOKEN_LENGTH}. Every setter returns this builder.
     */
    public static class OptionsBuilder
    {
        private boolean stemTerms;
        private boolean ignoreStopTerms;
        private Locale locale;
        private boolean caseSensitive;
        private boolean allTermsToUpperCase;
        private boolean allTermsToLowerCase;
        private int minTokenLength = DEFAULT_MIN_TOKEN_LENGTH;
        private int maxTokenLength = DEFAULT_MAX_TOKEN_LENGTH;

        public OptionsBuilder()
        {
        }

        /** Sets the stemming flag. */
        public OptionsBuilder stemTerms(boolean stemTerms)
        {
            this.stemTerms = stemTerms;
            return this;
        }

        /** Sets the ignore-stop-terms flag. */
        public OptionsBuilder ignoreStopTerms(boolean ignoreStopTerms)
        {
            this.ignoreStopTerms = ignoreStopTerms;
            return this;
        }

        /** Sets the locale; the value is stored as given, {@code null} included. */
        public OptionsBuilder useLocale(Locale locale)
        {
            this.locale = locale;
            return this;
        }

        /** Sets the case-sensitivity flag. */
        public OptionsBuilder caseSensitive(boolean caseSensitive)
        {
            this.caseSensitive = caseSensitive;
            return this;
        }

        /**
         * Sets the upper-case-all-terms flag. Cannot be combined with
         * {@link #alwaysLowerCaseTerms(boolean)}; see {@link #build()}.
         */
        public OptionsBuilder alwaysUpperCaseTerms(boolean allTermsToUpperCase)
        {
            this.allTermsToUpperCase = allTermsToUpperCase;
            return this;
        }

        /**
         * Sets the lower-case-all-terms flag. Cannot be combined with
         * {@link #alwaysUpperCaseTerms(boolean)}; see {@link #build()}.
         */
        public OptionsBuilder alwaysLowerCaseTerms(boolean allTermsToLowerCase)
        {
            this.allTermsToLowerCase = allTermsToLowerCase;
            return this;
        }

        /**
         * Set the min allowed token length. Any token shorter
         * than this is skipped.
         *
         * @param minTokenLength minimum length, zero or greater
         * @throws IllegalArgumentException if {@code minTokenLength} is negative
         */
        public OptionsBuilder minTokenLength(int minTokenLength)
        {
            // Zero is DEFAULT_MIN_TOKEN_LENGTH, so passing the default back in must
            // be legal; only negative lengths are rejected.
            if (minTokenLength < 0)
                throw new IllegalArgumentException("minTokenLength must not be negative");
            this.minTokenLength = minTokenLength;
            return this;
        }

        /**
         * Set the max allowed token length. Any token longer
         * than this is skipped.
         *
         * @param maxTokenLength maximum length, one or greater
         * @throws IllegalArgumentException if {@code maxTokenLength} is less than one
         */
        public OptionsBuilder maxTokenLength(int maxTokenLength)
        {
            if (maxTokenLength < 1)
                throw new IllegalArgumentException("maxTokenLength must be greater than zero");
            this.maxTokenLength = maxTokenLength;
            return this;
        }

        /**
         * Validates the combination of settings and creates the options object.
         *
         * @return a new {@link StandardTokenizerOptions} carrying this builder's values
         * @throws IllegalArgumentException if both upper-casing and lower-casing are
         *         enabled, or if the min token length exceeds the max token length
         */
        public StandardTokenizerOptions build()
        {
            if(allTermsToLowerCase && allTermsToUpperCase)
                throw new IllegalArgumentException("Options to normalize terms cannot be " +
                        "both uppercase and lowercase at the same time");

            // The two bounds are set independently, so the range can only be
            // checked once both are known.
            if (minTokenLength > maxTokenLength)
                throw new IllegalArgumentException("minTokenLength (" + minTokenLength + ") cannot be greater " +
                        "than maxTokenLength (" + maxTokenLength + ")");

            StandardTokenizerOptions options = new StandardTokenizerOptions();
            options.setIgnoreStopTerms(ignoreStopTerms);
            options.setStemTerms(stemTerms);
            options.setLocale(locale);
            options.setCaseSensitive(caseSensitive);
            options.setAllTermsToLowerCase(allTermsToLowerCase);
            options.setAllTermsToUpperCase(allTermsToUpperCase);
            options.setMinTokenLength(minTokenLength);
            options.setMaxTokenLength(maxTokenLength);
            return options;
        }
    }

    /**
     * Builds options from a string-to-string map.
     *
     * Only the {@code TOKENIZATION_*} keys declared on this class are read; any
     * other key is ignored. Boolean values are parsed with
     * {@link Boolean#parseBoolean(String)}, so anything other than "true"
     * (case-insensitive) counts as {@code false}. The locale value is handed
     * unchanged to {@link Locale#Locale(String)}. Case sensitivity and the token
     * length bounds have no map key and keep the builder defaults.
     *
     * @param optionsMap option names mapped to their textual values
     * @return options reflecting the recognized entries of {@code optionsMap}
     * @throws IllegalArgumentException if the map enables both upper-case and
     *         lower-case normalization
     * @throws NullPointerException if {@code optionsMap} is {@code null}
     */
    public static StandardTokenizerOptions buildFromMap(Map<String, String> optionsMap)
    {
        OptionsBuilder optionsBuilder = new OptionsBuilder();
        for (Map.Entry<String, String> entry : optionsMap.entrySet())
        {
            String value = entry.getValue();
            switch(entry.getKey())
            {
                case TOKENIZATION_ENABLE_STEMMING:
                    optionsBuilder.stemTerms(Boolean.parseBoolean(value));
                    break;
                case TOKENIZATION_SKIP_STOP_WORDS:
                    optionsBuilder.ignoreStopTerms(Boolean.parseBoolean(value));
                    break;
                case TOKENIZATION_LOCALE:
                    optionsBuilder.useLocale(new Locale(value));
                    break;
                case TOKENIZATION_NORMALIZE_UPPERCASE:
                    optionsBuilder.alwaysUpperCaseTerms(Boolean.parseBoolean(value));
                    break;
                case TOKENIZATION_NORMALIZE_LOWERCASE:
                    optionsBuilder.alwaysLowerCaseTerms(Boolean.parseBoolean(value));
                    break;
                default:
                    // keys this class does not know about are skipped
                    break;
            }
        }
        return optionsBuilder.build();
    }

    /**
     * Returns the default options: stop terms ignored, all terms lower-cased,
     * stemming disabled and {@link Locale#ENGLISH} as locale. Everything else
     * keeps the {@link OptionsBuilder} defaults.
     *
     * @return a freshly built options instance
     */
    public static StandardTokenizerOptions getDefaultOptions()
    {
        return new OptionsBuilder()
               .ignoreStopTerms(true).alwaysLowerCaseTerms(true)
               .stemTerms(false).useLocale(Locale.ENGLISH).build();
    }
}