| /* |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.cassandra.index.sasi.analyzer; |
| |
| import java.util.Locale; |
| import java.util.Map; |
| |
/**
 * Options that control tokenization: stemming, stop-word handling, the
 * locale, case normalization and the accepted token-length range.
 *
 * Instances are normally obtained from {@link OptionsBuilder},
 * {@link #buildFromMap(Map)} or {@link #getDefaultOptions()}. The class is a
 * plain mutable holder; its setters perform no validation, all validation
 * lives in {@link OptionsBuilder}.
 */
public class StandardTokenizerOptions
{
    /** Option key: boolean, enables stemming of terms. */
    public static final String TOKENIZATION_ENABLE_STEMMING = "tokenization_enable_stemming";
    /** Option key: boolean, enables skipping of stop words. */
    public static final String TOKENIZATION_SKIP_STOP_WORDS = "tokenization_skip_stop_words";
    /** Option key: language code used to construct the {@link Locale}. */
    public static final String TOKENIZATION_LOCALE = "tokenization_locale";
    /** Option key: boolean, normalize all terms to lower case. */
    public static final String TOKENIZATION_NORMALIZE_LOWERCASE = "tokenization_normalize_lowercase";
    /** Option key: boolean, normalize all terms to upper case. */
    public static final String TOKENIZATION_NORMALIZE_UPPERCASE = "tokenization_normalize_uppercase";

    /** Max token length used when none is configured. */
    public static final int DEFAULT_MAX_TOKEN_LENGTH = 255;
    /**
     * Min token length used when none is configured. Note that
     * {@link OptionsBuilder#minTokenLength(int)} rejects values below 1, so
     * this value can only ever be in effect as the untouched default.
     */
    public static final int DEFAULT_MIN_TOKEN_LENGTH = 0;

    private boolean stemTerms;
    private boolean ignoreStopTerms;
    private Locale locale;
    private boolean caseSensitive;
    private boolean allTermsToUpperCase;
    private boolean allTermsToLowerCase;
    // Start from the documented defaults so that a directly constructed
    // instance agrees with one produced by an untouched OptionsBuilder
    // (otherwise maxTokenLength would silently be 0).
    private int minTokenLength = DEFAULT_MIN_TOKEN_LENGTH;
    private int maxTokenLength = DEFAULT_MAX_TOKEN_LENGTH;

    /**
     * @return whether stemming of terms is enabled
     */
    public boolean shouldStemTerms()
    {
        return stemTerms;
    }

    /**
     * @param stemTerms whether stemming of terms is enabled
     */
    public void setStemTerms(boolean stemTerms)
    {
        this.stemTerms = stemTerms;
    }

    /**
     * @return whether stop terms should be ignored
     */
    public boolean shouldIgnoreStopTerms()
    {
        return ignoreStopTerms;
    }

    /**
     * @param ignoreStopTerms whether stop terms should be ignored
     */
    public void setIgnoreStopTerms(boolean ignoreStopTerms)
    {
        this.ignoreStopTerms = ignoreStopTerms;
    }

    /**
     * @return the configured locale, or {@code null} if none was set
     */
    public Locale getLocale()
    {
        return locale;
    }

    /**
     * @param locale the locale to use; stored as given, {@code null} included
     */
    public void setLocale(Locale locale)
    {
        this.locale = locale;
    }

    /**
     * @return the case-sensitivity flag
     */
    public boolean isCaseSensitive()
    {
        return caseSensitive;
    }

    /**
     * @param caseSensitive the case-sensitivity flag
     */
    public void setCaseSensitive(boolean caseSensitive)
    {
        this.caseSensitive = caseSensitive;
    }

    /**
     * @return whether all terms should be normalized to upper case
     */
    public boolean shouldUpperCaseTerms()
    {
        return allTermsToUpperCase;
    }

    /**
     * @param allTermsToUpperCase whether all terms should be normalized to upper case
     */
    public void setAllTermsToUpperCase(boolean allTermsToUpperCase)
    {
        this.allTermsToUpperCase = allTermsToUpperCase;
    }

    /**
     * @return whether all terms should be normalized to lower case
     */
    public boolean shouldLowerCaseTerms()
    {
        return allTermsToLowerCase;
    }

    /**
     * @param allTermsToLowerCase whether all terms should be normalized to lower case
     */
    public void setAllTermsToLowerCase(boolean allTermsToLowerCase)
    {
        this.allTermsToLowerCase = allTermsToLowerCase;
    }

    /**
     * @return the min allowed token length
     */
    public int getMinTokenLength()
    {
        return minTokenLength;
    }

    /**
     * @param minTokenLength the min allowed token length (not validated here)
     */
    public void setMinTokenLength(int minTokenLength)
    {
        this.minTokenLength = minTokenLength;
    }

    /**
     * @return the max allowed token length
     */
    public int getMaxTokenLength()
    {
        return maxTokenLength;
    }

    /**
     * @param maxTokenLength the max allowed token length (not validated here)
     */
    public void setMaxTokenLength(int maxTokenLength)
    {
        this.maxTokenLength = maxTokenLength;
    }

    /**
     * Fluent builder for {@link StandardTokenizerOptions}. All boolean options
     * start as {@code false}, the locale starts as {@code null}, and the token
     * length range starts as
     * [{@link #DEFAULT_MIN_TOKEN_LENGTH}, {@link #DEFAULT_MAX_TOKEN_LENGTH}].
     */
    public static class OptionsBuilder
    {
        private boolean stemTerms;
        private boolean ignoreStopTerms;
        private Locale locale;
        private boolean caseSensitive;
        private boolean allTermsToUpperCase;
        private boolean allTermsToLowerCase;
        private int minTokenLength = DEFAULT_MIN_TOKEN_LENGTH;
        private int maxTokenLength = DEFAULT_MAX_TOKEN_LENGTH;

        public OptionsBuilder()
        {
        }

        /**
         * @param stemTerms whether stemming of terms is enabled
         * @return this builder
         */
        public OptionsBuilder stemTerms(boolean stemTerms)
        {
            this.stemTerms = stemTerms;
            return this;
        }

        /**
         * @param ignoreStopTerms whether stop terms should be ignored
         * @return this builder
         */
        public OptionsBuilder ignoreStopTerms(boolean ignoreStopTerms)
        {
            this.ignoreStopTerms = ignoreStopTerms;
            return this;
        }

        /**
         * @param locale the locale to use; stored as given
         * @return this builder
         */
        public OptionsBuilder useLocale(Locale locale)
        {
            this.locale = locale;
            return this;
        }

        /**
         * @param caseSensitive the case-sensitivity flag
         * @return this builder
         */
        public OptionsBuilder caseSensitive(boolean caseSensitive)
        {
            this.caseSensitive = caseSensitive;
            return this;
        }

        /**
         * @param allTermsToUpperCase whether all terms should be normalized to
         *                            upper case; mutually exclusive with
         *                            {@link #alwaysLowerCaseTerms(boolean)}
         *                            (enforced in {@link #build()})
         * @return this builder
         */
        public OptionsBuilder alwaysUpperCaseTerms(boolean allTermsToUpperCase)
        {
            this.allTermsToUpperCase = allTermsToUpperCase;
            return this;
        }

        /**
         * @param allTermsToLowerCase whether all terms should be normalized to
         *                            lower case; mutually exclusive with
         *                            {@link #alwaysUpperCaseTerms(boolean)}
         *                            (enforced in {@link #build()})
         * @return this builder
         */
        public OptionsBuilder alwaysLowerCaseTerms(boolean allTermsToLowerCase)
        {
            this.allTermsToLowerCase = allTermsToLowerCase;
            return this;
        }

        /**
         * Set the min allowed token length. Any token shorter
         * than this is skipped.
         *
         * @param minTokenLength the min length, must be at least 1
         * @return this builder
         * @throws IllegalArgumentException if {@code minTokenLength} is less than 1
         */
        public OptionsBuilder minTokenLength(int minTokenLength)
        {
            if (minTokenLength < 1)
                throw new IllegalArgumentException("minTokenLength must be greater than zero");
            this.minTokenLength = minTokenLength;
            return this;
        }

        /**
         * Set the max allowed token length. Any token longer
         * than this is skipped.
         *
         * @param maxTokenLength the max length, must be at least 1
         * @return this builder
         * @throws IllegalArgumentException if {@code maxTokenLength} is less than 1
         */
        public OptionsBuilder maxTokenLength(int maxTokenLength)
        {
            if (maxTokenLength < 1)
                throw new IllegalArgumentException("maxTokenLength must be greater than zero");
            this.maxTokenLength = maxTokenLength;
            return this;
        }

        /**
         * Validates the combination of options and creates the options object.
         *
         * @return a new {@link StandardTokenizerOptions} carrying this builder's values
         * @throws IllegalArgumentException if both upper-case and lower-case
         *         normalization are enabled, or if the min token length
         *         exceeds the max token length
         */
        public StandardTokenizerOptions build()
        {
            if(allTermsToLowerCase && allTermsToUpperCase)
                throw new IllegalArgumentException("Options to normalize terms cannot be " +
                                                   "both uppercase and lowercase at the same time");

            // The two setters only validate each bound in isolation; an
            // inverted range is a range no token length can satisfy.
            if (minTokenLength > maxTokenLength)
                throw new IllegalArgumentException("minTokenLength (" + minTokenLength + ") cannot be " +
                                                   "greater than maxTokenLength (" + maxTokenLength + ")");

            StandardTokenizerOptions options = new StandardTokenizerOptions();
            options.setIgnoreStopTerms(ignoreStopTerms);
            options.setStemTerms(stemTerms);
            options.setLocale(locale);
            options.setCaseSensitive(caseSensitive);
            options.setAllTermsToLowerCase(allTermsToLowerCase);
            options.setAllTermsToUpperCase(allTermsToUpperCase);
            options.setMinTokenLength(minTokenLength);
            options.setMaxTokenLength(maxTokenLength);
            return options;
        }
    }

    /**
     * Builds options from a string-to-string map keyed by the
     * {@code TOKENIZATION_*} constants. Keys that are not recognized are
     * ignored. Options that are absent from the map keep the
     * {@link OptionsBuilder} defaults, which means the locale stays
     * {@code null} unless {@link #TOKENIZATION_LOCALE} is supplied.
     *
     * Boolean values are read with {@link Boolean#parseBoolean(String)}, so
     * anything other than "true" (ignoring case) is treated as {@code false}.
     *
     * @param optionsMap the raw options
     * @return the options described by the map
     * @throws IllegalArgumentException if the map enables both upper-case and
     *         lower-case normalization
     */
    public static StandardTokenizerOptions buildFromMap(Map<String, String> optionsMap)
    {
        OptionsBuilder optionsBuilder = new OptionsBuilder();

        for (Map.Entry<String, String> entry : optionsMap.entrySet())
        {
            switch(entry.getKey())
            {
                case TOKENIZATION_ENABLE_STEMMING:
                {
                    boolean bool = Boolean.parseBoolean(entry.getValue());
                    optionsBuilder = optionsBuilder.stemTerms(bool);
                    break;
                }
                case TOKENIZATION_SKIP_STOP_WORDS:
                {
                    boolean bool = Boolean.parseBoolean(entry.getValue());
                    optionsBuilder = optionsBuilder.ignoreStopTerms(bool);
                    break;
                }
                case TOKENIZATION_LOCALE:
                {
                    // NOTE(review): Locale(String) takes the value as a bare
                    // language code (e.g. "en"); a value such as "en_US" is
                    // not split into language and country.
                    Locale locale = new Locale(entry.getValue());
                    optionsBuilder = optionsBuilder.useLocale(locale);
                    break;
                }
                case TOKENIZATION_NORMALIZE_UPPERCASE:
                {
                    boolean bool = Boolean.parseBoolean(entry.getValue());
                    optionsBuilder = optionsBuilder.alwaysUpperCaseTerms(bool);
                    break;
                }
                case TOKENIZATION_NORMALIZE_LOWERCASE:
                {
                    boolean bool = Boolean.parseBoolean(entry.getValue());
                    optionsBuilder = optionsBuilder.alwaysLowerCaseTerms(bool);
                    break;
                }
                default:
                {
                    // unrecognized option keys are deliberately ignored
                }
            }
        }
        return optionsBuilder.build();
    }

    /**
     * @return options that ignore stop terms, lower-case all terms, do not
     *         stem, and use {@link Locale#ENGLISH}; everything else is left
     *         at the {@link OptionsBuilder} defaults
     */
    public static StandardTokenizerOptions getDefaultOptions()
    {
        return new OptionsBuilder()
               .ignoreStopTerms(true).alwaysLowerCaseTerms(true)
               .stemTerms(false).useLocale(Locale.ENGLISH).build();
    }
}