lucene/core/src/java/org/apache/lucene/analysis/WordlistLoader.java - lucene-solr - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.lucene.analysis;


 import java.io.BufferedReader;
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.Reader;
 import java.nio.charset.Charset;
 import java.util.ArrayList;
 import java.util.List;

 import org.apache.lucene.util.IOUtils;

 /**
  * Loader for text files that represent a list of stopwords.
  *
  * @see IOUtils to obtain {@link Reader} instances
  * @lucene.internal
  */
 public class WordlistLoader {

   /** First constructor argument of the sets created by the convenience overloads. */
   private static final int INITIAL_CAPACITY = 16;

   /** no instance */
   private WordlistLoader() {}

   /**
    * Reads lines from a Reader and adds every line as an entry to a CharArraySet (omitting
    * leading and trailing whitespace). Every line of the Reader should contain only
    * one word. The words need to be in lowercase if you make use of an
    * Analyzer which uses LowerCaseFilter (like StandardAnalyzer).
    * <p>
    * Blank lines are not filtered out: they are trimmed and handed to {@code result} as the
    * empty string. The reader is closed when this method completes, whether normally or
    * with an exception.
    *
    * @param reader Reader containing the wordlist
    * @param result the {@link CharArraySet} to fill with the reader's words
    * @return the given {@link CharArraySet} with the reader's words
    * @throws IOException If there is a low-level I/O error.
    */
   public static CharArraySet getWordSet(Reader reader, CharArraySet result) throws IOException {
     // try-with-resources: a failure while closing is attached as a suppressed exception
     // instead of replacing an exception thrown while reading.
     try (BufferedReader br = getBufferedReader(reader)) {
       String word;
       while ((word = br.readLine()) != null) {
         result.add(word.trim());
       }
     }
     return result;
   }

   /**
    * Reads lines from a Reader and adds every line as an entry to a newly created CharArraySet
    * (omitting leading and trailing whitespace). Every line of the Reader should contain only
    * one word. The words need to be in lowercase if you make use of an
    * Analyzer which uses LowerCaseFilter (like StandardAnalyzer).
    *
    * @param reader Reader containing the wordlist
    * @return A {@link CharArraySet} with the reader's words
    * @throws IOException If there is a low-level I/O error.
    * @see #getWordSet(Reader, CharArraySet)
    */
   public static CharArraySet getWordSet(Reader reader) throws IOException {
     return getWordSet(reader, new CharArraySet(INITIAL_CAPACITY, false));
   }

   /**
    * Reads lines from a Reader and adds every non-comment line as an entry to a newly created
    * CharArraySet (omitting leading and trailing whitespace). Every line of the Reader should
    * contain only one word. The words need to be in lowercase if you make use of an
    * Analyzer which uses LowerCaseFilter (like StandardAnalyzer).
    *
    * @param reader Reader containing the wordlist
    * @param comment The string representing a comment.
    * @return A CharArraySet with the reader's words
    * @throws IOException If there is a low-level I/O error.
    * @see #getWordSet(Reader, String, CharArraySet)
    */
   public static CharArraySet getWordSet(Reader reader, String comment) throws IOException {
     return getWordSet(reader, comment, new CharArraySet(INITIAL_CAPACITY, false));
   }

   /**
    * Reads lines from a Reader and adds every non-comment line as an entry to a CharArraySet (omitting
    * leading and trailing whitespace). Every line of the Reader should contain only
    * one word. The words need to be in lowercase if you make use of an
    * Analyzer which uses LowerCaseFilter (like StandardAnalyzer).
    * <p>
    * A line counts as a comment only when the raw line starts with {@code comment}; the check
    * is made before trimming, so a comment marker preceded by whitespace is not recognized.
    * Blank lines are not filtered out. The reader is closed when this method completes,
    * whether normally or with an exception.
    *
    * @param reader Reader containing the wordlist
    * @param comment The string representing a comment.
    * @param result the {@link CharArraySet} to fill with the reader's words
    * @return the given {@link CharArraySet} with the reader's words
    * @throws IOException If there is a low-level I/O error.
    */
   public static CharArraySet getWordSet(Reader reader, String comment, CharArraySet result) throws IOException {
     // try-with-resources: a failure while closing is attached as a suppressed exception
     // instead of replacing an exception thrown while reading.
     try (BufferedReader br = getBufferedReader(reader)) {
       String word;
       while ((word = br.readLine()) != null) {
         if (!word.startsWith(comment)) {
           result.add(word.trim());
         }
       }
     }
     return result;
   }


   /**
    * Reads stopwords from a stopword list in Snowball format.
    * <p>
    * The snowball format is the following:
    * <ul>
    * <li>Lines may contain multiple words separated by whitespace.
    * <li>The comment character is the vertical line (&#124;).
    * <li>Lines may contain trailing comments.
    * </ul>
    * <p>
    * The reader is closed when this method completes, whether normally or with an exception.
    *
    * @param reader Reader containing a Snowball stopword list
    * @param result the {@link CharArraySet} to fill with the reader's words
    * @return the given {@link CharArraySet} with the reader's words
    * @throws IOException If there is a low-level I/O error.
    */
   public static CharArraySet getSnowballWordSet(Reader reader, CharArraySet result)
       throws IOException {
     // try-with-resources: a failure while closing is attached as a suppressed exception
     // instead of replacing an exception thrown while reading.
     try (BufferedReader br = getBufferedReader(reader)) {
       String line;
       while ((line = br.readLine()) != null) {
         // Everything from the first '|' to the end of the line is a comment.
         int comment = line.indexOf('|');
         if (comment >= 0) {
           line = line.substring(0, comment);
         }
         // Splitting a line that begins with whitespace yields an empty first token,
         // so empty tokens are skipped rather than added.
         for (String word : line.split("\\s+")) {
           if (!word.isEmpty()) {
             result.add(word);
           }
         }
       }
     }
     return result;
   }

   /**
    * Reads stopwords from a stopword list in Snowball format into a newly created CharArraySet.
    * <p>
    * The snowball format is the following:
    * <ul>
    * <li>Lines may contain multiple words separated by whitespace.
    * <li>The comment character is the vertical line (&#124;).
    * <li>Lines may contain trailing comments.
    * </ul>
    *
    * @param reader Reader containing a Snowball stopword list
    * @return A {@link CharArraySet} with the reader's words
    * @throws IOException If there is a low-level I/O error.
    * @see #getSnowballWordSet(Reader, CharArraySet)
    */
   public static CharArraySet getSnowballWordSet(Reader reader) throws IOException {
     return getSnowballWordSet(reader, new CharArraySet(INITIAL_CAPACITY, false));
   }


   /**
    * Reads a stem dictionary. Each line contains:
    * <pre>word<b>\t</b>stem</pre>
    * (i.e. two tab separated words)
    * <p>
    * Only the first tab on a line separates word from stem; any further tabs remain part of
    * the stem. The reader is closed when this method completes, whether normally or with an
    * exception.
    *
    * @param reader Reader containing the stem dictionary
    * @param result the {@link CharArrayMap} to fill with the word-to-stem entries
    * @return stem dictionary that overrules the stemming algorithm
    * @throws IOException If there is a low-level I/O error.
    * @throws ArrayIndexOutOfBoundsException if a line (including a blank line) contains no tab
    */
   public static CharArrayMap<String> getStemDict(Reader reader, CharArrayMap<String> result) throws IOException {
     // try-with-resources: a failure while closing is attached as a suppressed exception
     // instead of replacing an exception thrown while reading.
     try (BufferedReader br = getBufferedReader(reader)) {
       String line;
       while ((line = br.readLine()) != null) {
         // Limit 2: split at the first tab only.
         String[] wordstem = line.split("\t", 2);
         result.put(wordstem[0], wordstem[1]);
       }
     }
     return result;
   }

   /**
    * Reads the given stream, decoded with the given character encoding, and returns the
    * (non comment) lines containing data.
    *
    * <p>
    * A comment line is any line that starts with the character "#". The check is made before
    * trimming, so a "#" preceded by whitespace does not mark a comment.
    * </p>
    *
    * <p>
    * The stream is closed when this method completes, whether normally or with an exception.
    * </p>
    *
    * @param stream the stream to read the lines from
    * @param charset the character encoding used to decode the stream
    * @return a list of non-blank non-comment lines with whitespace trimmed
    * @throws IOException If there is a low-level I/O error.
    */
   public static List<String> getLines(InputStream stream, Charset charset) throws IOException {
     // try-with-resources keeps the exception thrown while reading as the primary one and
     // still reports a close failure when reading succeeded.
     try (BufferedReader input = getBufferedReader(IOUtils.getDecodingReader(stream, charset))) {
       List<String> lines = new ArrayList<>();
       String word;
       while ((word = input.readLine()) != null) {
         // skip initial bom marker (checked on every line until the first one is accepted)
         if (lines.isEmpty() && word.length() > 0 && word.charAt(0) == '\uFEFF') {
           word = word.substring(1);
         }
         // skip comments
         if (word.startsWith("#")) {
           continue;
         }
         word = word.trim();
         // skip blank lines
         if (word.isEmpty()) {
           continue;
         }
         lines.add(word);
       }
       return lines;
     }
   }

   /**
    * Returns the given reader itself if it already is a {@link BufferedReader}, otherwise
    * wraps it in a new one.
    */
   private static BufferedReader getBufferedReader(Reader reader) {
     return (reader instanceof BufferedReader) ? (BufferedReader) reader
         : new BufferedReader(reader);
   }

 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package org.apache.lucene.analysis;


	import java.io.BufferedReader;
	import java.io.IOException;
	import java.io.InputStream;
	import java.io.Reader;
	import java.nio.charset.Charset;
	import java.util.ArrayList;
	import java.util.List;

	import org.apache.lucene.util.IOUtils;

	/**
	* Loader for text files that represent a list of stopwords.
	*
	* @see IOUtils to obtain {@link Reader} instances
	* @lucene.internal
	*/
	public class WordlistLoader {

	private static final int INITIAL_CAPACITY = 16;

	/** no instance */
	private WordlistLoader() {}

	/**
	* Reads lines from a Reader and adds every line as an entry to a CharArraySet (omitting
	* leading and trailing whitespace). Every line of the Reader should contain only
	* one word. The words need to be in lowercase if you make use of an
	* Analyzer which uses LowerCaseFilter (like StandardAnalyzer).
	*
	* @param reader Reader containing the wordlist
	* @param result the {@link CharArraySet} to fill with the readers words
	* @return the given {@link CharArraySet} with the reader's words
	*/
	public static CharArraySet getWordSet(Reader reader, CharArraySet result) throws IOException {
	BufferedReader br = null;
	try {
	br = getBufferedReader(reader);
	String word = null;
	while ((word = br.readLine()) != null) {
	result.add(word.trim());
	}
	}
	finally {
	IOUtils.close(br);
	}
	return result;
	}

	/**
	* Reads lines from a Reader and adds every line as an entry to a CharArraySet (omitting
	* leading and trailing whitespace). Every line of the Reader should contain only
	* one word. The words need to be in lowercase if you make use of an
	* Analyzer which uses LowerCaseFilter (like StandardAnalyzer).
	*
	* @param reader Reader containing the wordlist
	* @return A {@link CharArraySet} with the reader's words
	*/
	public static CharArraySet getWordSet(Reader reader) throws IOException {
	return getWordSet(reader, new CharArraySet(INITIAL_CAPACITY, false));
	}

	/**
	* Reads lines from a Reader and adds every non-comment line as an entry to a CharArraySet (omitting
	* leading and trailing whitespace). Every line of the Reader should contain only
	* one word. The words need to be in lowercase if you make use of an
	* Analyzer which uses LowerCaseFilter (like StandardAnalyzer).
	*
	* @param reader Reader containing the wordlist
	* @param comment The string representing a comment.
	* @return A CharArraySet with the reader's words
	*/
	public static CharArraySet getWordSet(Reader reader, String comment) throws IOException {
	return getWordSet(reader, comment, new CharArraySet(INITIAL_CAPACITY, false));
	}

	/**
	* Reads lines from a Reader and adds every non-comment line as an entry to a CharArraySet (omitting
	* leading and trailing whitespace). Every line of the Reader should contain only
	* one word. The words need to be in lowercase if you make use of an
	* Analyzer which uses LowerCaseFilter (like StandardAnalyzer).
	*
	* @param reader Reader containing the wordlist
	* @param comment The string representing a comment.
	* @param result the {@link CharArraySet} to fill with the readers words
	* @return the given {@link CharArraySet} with the reader's words
	*/
	public static CharArraySet getWordSet(Reader reader, String comment, CharArraySet result) throws IOException {
	BufferedReader br = null;
	try {
	br = getBufferedReader(reader);
	String word = null;
	while ((word = br.readLine()) != null) {
	if (word.startsWith(comment) == false){
	result.add(word.trim());
	}
	}
	}
	finally {
	IOUtils.close(br);
	}
	return result;
	}


	/**
	* Reads stopwords from a stopword list in Snowball format.
	* <p>
	* The snowball format is the following:
	* <ul>
	* <li>Lines may contain multiple words separated by whitespace.
	* <li>The comment character is the vertical line (\|).
	* <li>Lines may contain trailing comments.
	* </ul>
	*
	* @param reader Reader containing a Snowball stopword list
	* @param result the {@link CharArraySet} to fill with the readers words
	* @return the given {@link CharArraySet} with the reader's words
	*/
	public static CharArraySet getSnowballWordSet(Reader reader, CharArraySet result)
	throws IOException {
	BufferedReader br = null;
	try {
	br = getBufferedReader(reader);
	String line = null;
	while ((line = br.readLine()) != null) {
	int comment = line.indexOf('\|');
	if (comment >= 0) line = line.substring(0, comment);
	String words[] = line.split("\\s+");
	for (int i = 0; i < words.length; i++)
	if (words[i].length() > 0) result.add(words[i]);
	}
	} finally {
	IOUtils.close(br);
	}
	return result;
	}

	/**
	* Reads stopwords from a stopword list in Snowball format.
	* <p>
	* The snowball format is the following:
	* <ul>
	* <li>Lines may contain multiple words separated by whitespace.
	* <li>The comment character is the vertical line (\|).
	* <li>Lines may contain trailing comments.
	* </ul>
	*
	* @param reader Reader containing a Snowball stopword list
	* @return A {@link CharArraySet} with the reader's words
	*/
	public static CharArraySet getSnowballWordSet(Reader reader) throws IOException {
	return getSnowballWordSet(reader, new CharArraySet(INITIAL_CAPACITY, false));
	}


	/**
	* Reads a stem dictionary. Each line contains:
	* <pre>word<b>\t</b>stem</pre>
	* (i.e. two tab separated words)
	*
	* @return stem dictionary that overrules the stemming algorithm
	* @throws IOException If there is a low-level I/O error.
	*/
	public static CharArrayMap<String> getStemDict(Reader reader, CharArrayMap<String> result) throws IOException {
	BufferedReader br = null;
	try {
	br = getBufferedReader(reader);
	String line;
	while ((line = br.readLine()) != null) {
	String[] wordstem = line.split("\t", 2);
	result.put(wordstem[0], wordstem[1]);
	}
	} finally {
	IOUtils.close(br);
	}
	return result;
	}

	/**
	* Accesses a resource by name and returns the (non comment) lines containing
	* data using the given character encoding.
	*
	* <p>
	* A comment line is any line that starts with the character "#"
	* </p>
	*
	* @return a list of non-blank non-comment lines with whitespace trimmed
	* @throws IOException If there is a low-level I/O error.
	*/
	public static List<String> getLines(InputStream stream, Charset charset) throws IOException{
	BufferedReader input = null;
	ArrayList<String> lines;
	boolean success = false;
	try {
	input = getBufferedReader(IOUtils.getDecodingReader(stream, charset));

	lines = new ArrayList<>();
	for (String word=null; (word=input.readLine())!=null;) {
	// skip initial bom marker
	if (lines.isEmpty() && word.length() > 0 && word.charAt(0) == '\uFEFF')
	word = word.substring(1);
	// skip comments
	if (word.startsWith("#")) continue;
	word=word.trim();
	// skip blank lines
	if (word.length()==0) continue;
	lines.add(word);
	}
	success = true;
	return lines;
	} finally {
	if (success) {
	IOUtils.close(input);
	} else {
	IOUtils.closeWhileHandlingException(input);
	}
	}
	}

	private static BufferedReader getBufferedReader(Reader reader) {
	return (reader instanceof BufferedReader) ? (BufferedReader) reader
	: new BufferedReader(reader);
	}

	}