lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellStemFilterFactory.java - lucene-solr - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.lucene.analysis.hunspell;

 import java.io.IOException;
 import java.io.InputStream;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.text.ParseException;
 import java.util.ArrayList;
 import java.util.List;
 import java.util.Map;
 import org.apache.lucene.analysis.TokenFilterFactory;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.FSDirectory;
 import org.apache.lucene.util.IOUtils;
 import org.apache.lucene.util.ResourceLoader;
 import org.apache.lucene.util.ResourceLoaderAware;

 /**
  * TokenFilterFactory that creates instances of {@link HunspellStemFilter}. Example config for
  * British English:
  *
  * <pre class="prettyprint">
  * &lt;filter class=&quot;solr.HunspellStemFilterFactory&quot;
  *         dictionary=&quot;en_GB.dic,my_custom.dic&quot;
  *         affix=&quot;en_GB.aff&quot;
  *         ignoreCase=&quot;false&quot;
  *         longestOnly=&quot;false&quot; /&gt;</pre>
  *
  * Both parameters dictionary and affix are mandatory. Dictionaries for many languages are available
  * through the OpenOffice project.
  *
  * <p>See <a href="http://wiki.apache.org/solr/Hunspell">http://wiki.apache.org/solr/Hunspell</a>
  *
  * @lucene.experimental
  * @since 3.5.0
  * @lucene.spi {@value #NAME}
  */
 public class HunspellStemFilterFactory extends TokenFilterFactory implements ResourceLoaderAware {

   /** SPI name */
   public static final String NAME = "hunspellStem";

   private static final String PARAM_DICTIONARY = "dictionary";
   private static final String PARAM_AFFIX = "affix";
   // NOTE: this one is currently unused?:
   private static final String PARAM_RECURSION_CAP = "recursionCap";
   private static final String PARAM_IGNORE_CASE = "ignoreCase";
   private static final String PARAM_LONGEST_ONLY = "longestOnly";

   private final String dictionaryFiles;
   private final String affixFile;
   private final boolean ignoreCase;
   private final boolean longestOnly;
   private Dictionary dictionary;

   /** Creates a new HunspellStemFilterFactory */
   public HunspellStemFilterFactory(Map<String, String> args) {
     super(args);
     dictionaryFiles = require(args, PARAM_DICTIONARY);
     affixFile = get(args, PARAM_AFFIX);
     ignoreCase = getBoolean(args, PARAM_IGNORE_CASE, false);
     longestOnly = getBoolean(args, PARAM_LONGEST_ONLY, false);
     // this isnt necessary: we properly load all dictionaries.
     // but recognize and ignore for back compat
     getBoolean(args, "strictAffixParsing", true);
     // this isn't necessary: multi-stage stripping is fixed and
     // flags like COMPLEXPREFIXES in the data itself control this.
     // but recognize and ignore for back compat
     getInt(args, "recursionCap", 0);
     if (!args.isEmpty()) {
       throw new IllegalArgumentException("Unknown parameters: " + args);
     }
   }

   /** Default ctor for compatibility with SPI */
   public HunspellStemFilterFactory() {
     throw defaultCtorException();
   }

   @Override
   public void inform(ResourceLoader loader) throws IOException {
     String dicts[] = dictionaryFiles.split(",");

     InputStream affix = null;
     List<InputStream> dictionaries = new ArrayList<>();

     try {
       dictionaries = new ArrayList<>();
       for (String file : dicts) {
         dictionaries.add(loader.openResource(file));
       }
       affix = loader.openResource(affixFile);

       Path tempPath = Files.createTempDirectory(Dictionary.getDefaultTempDir(), "Hunspell");
       try (Directory tempDir = FSDirectory.open(tempPath)) {
         this.dictionary = new Dictionary(tempDir, "hunspell", affix, dictionaries, ignoreCase);
       } finally {
         IOUtils.rm(tempPath);
       }
     } catch (ParseException e) {
       throw new IOException(
           "Unable to load hunspell data! [dictionary=" + dictionaries + ",affix=" + affixFile + "]",
           e);
     } finally {
       IOUtils.closeWhileHandlingException(affix);
       IOUtils.closeWhileHandlingException(dictionaries);
     }
   }

   @Override
   public TokenStream create(TokenStream tokenStream) {
     return new HunspellStemFilter(tokenStream, dictionary, true, longestOnly);
   }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package org.apache.lucene.analysis.hunspell;

	import java.io.IOException;
	import java.io.InputStream;
	import java.nio.file.Files;
	import java.nio.file.Path;
	import java.text.ParseException;
	import java.util.ArrayList;
	import java.util.List;
	import java.util.Map;
	import org.apache.lucene.analysis.TokenFilterFactory;
	import org.apache.lucene.analysis.TokenStream;
	import org.apache.lucene.store.Directory;
	import org.apache.lucene.store.FSDirectory;
	import org.apache.lucene.util.IOUtils;
	import org.apache.lucene.util.ResourceLoader;
	import org.apache.lucene.util.ResourceLoaderAware;

	/**
	* TokenFilterFactory that creates instances of {@link HunspellStemFilter}. Example config for
	* British English:
	*
	* <pre class="prettyprint">
	* <filter class="solr.HunspellStemFilterFactory"
	* dictionary="en_GB.dic,my_custom.dic"
	* affix="en_GB.aff"
	* ignoreCase="false"
	* longestOnly="false" /></pre>
	*
	* Both parameters dictionary and affix are mandatory. Dictionaries for many languages are available
	* through the OpenOffice project.
	*
	* <p>See <a href="http://wiki.apache.org/solr/Hunspell">http://wiki.apache.org/solr/Hunspell</a>
	*
	* @lucene.experimental
	* @since 3.5.0
	* @lucene.spi {@value #NAME}
	*/
	public class HunspellStemFilterFactory extends TokenFilterFactory implements ResourceLoaderAware {

	/** SPI name */
	public static final String NAME = "hunspellStem";

	private static final String PARAM_DICTIONARY = "dictionary";
	private static final String PARAM_AFFIX = "affix";
	// NOTE: this one is currently unused?:
	private static final String PARAM_RECURSION_CAP = "recursionCap";
	private static final String PARAM_IGNORE_CASE = "ignoreCase";
	private static final String PARAM_LONGEST_ONLY = "longestOnly";

	private final String dictionaryFiles;
	private final String affixFile;
	private final boolean ignoreCase;
	private final boolean longestOnly;
	private Dictionary dictionary;

	/** Creates a new HunspellStemFilterFactory */
	public HunspellStemFilterFactory(Map<String, String> args) {
	super(args);
	dictionaryFiles = require(args, PARAM_DICTIONARY);
	affixFile = get(args, PARAM_AFFIX);
	ignoreCase = getBoolean(args, PARAM_IGNORE_CASE, false);
	longestOnly = getBoolean(args, PARAM_LONGEST_ONLY, false);
	// this isnt necessary: we properly load all dictionaries.
	// but recognize and ignore for back compat
	getBoolean(args, "strictAffixParsing", true);
	// this isn't necessary: multi-stage stripping is fixed and
	// flags like COMPLEXPREFIXES in the data itself control this.
	// but recognize and ignore for back compat
	getInt(args, "recursionCap", 0);
	if (!args.isEmpty()) {
	throw new IllegalArgumentException("Unknown parameters: " + args);
	}
	}

	/** Default ctor for compatibility with SPI */
	public HunspellStemFilterFactory() {
	throw defaultCtorException();
	}

	@Override
	public void inform(ResourceLoader loader) throws IOException {
	String dicts[] = dictionaryFiles.split(",");

	InputStream affix = null;
	List<InputStream> dictionaries = new ArrayList<>();

	try {
	dictionaries = new ArrayList<>();
	for (String file : dicts) {
	dictionaries.add(loader.openResource(file));
	}
	affix = loader.openResource(affixFile);

	Path tempPath = Files.createTempDirectory(Dictionary.getDefaultTempDir(), "Hunspell");
	try (Directory tempDir = FSDirectory.open(tempPath)) {
	this.dictionary = new Dictionary(tempDir, "hunspell", affix, dictionaries, ignoreCase);
	} finally {
	IOUtils.rm(tempPath);
	}
	} catch (ParseException e) {
	throw new IOException(
	"Unable to load hunspell data! [dictionary=" + dictionaries + ",affix=" + affixFile + "]",
	e);
	} finally {
	IOUtils.closeWhileHandlingException(affix);
	IOUtils.closeWhileHandlingException(dictionaries);
	}
	}

	@Override
	public TokenStream create(TokenStream tokenStream) {
	return new HunspellStemFilter(tokenStream, dictionary, true, longestOnly);
	}
	}