| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.lucene.analysis; |
| |
| |
| import java.io.BufferedReader; |
| import java.io.IOException; |
| import java.io.InputStream; |
| import java.io.Reader; |
| import java.nio.charset.Charset; |
| import java.util.ArrayList; |
| import java.util.List; |
| |
| import org.apache.lucene.util.IOUtils; |
| |
| /** |
| * Loader for text files that represent a list of stopwords. |
| * |
| * @see IOUtils to obtain {@link Reader} instances |
| * @lucene.internal |
| */ |
| public class WordlistLoader { |
| |
| private static final int INITIAL_CAPACITY = 16; |
| |
| /** no instance */ |
| private WordlistLoader() {} |
| |
| /** |
| * Reads lines from a Reader and adds every line as an entry to a CharArraySet (omitting |
| * leading and trailing whitespace). Every line of the Reader should contain only |
| * one word. The words need to be in lowercase if you make use of an |
| * Analyzer which uses LowerCaseFilter (like StandardAnalyzer). |
| * |
| * @param reader Reader containing the wordlist |
| * @param result the {@link CharArraySet} to fill with the readers words |
| * @return the given {@link CharArraySet} with the reader's words |
| */ |
| public static CharArraySet getWordSet(Reader reader, CharArraySet result) throws IOException { |
| BufferedReader br = null; |
| try { |
| br = getBufferedReader(reader); |
| String word = null; |
| while ((word = br.readLine()) != null) { |
| result.add(word.trim()); |
| } |
| } |
| finally { |
| IOUtils.close(br); |
| } |
| return result; |
| } |
| |
| /** |
| * Reads lines from a Reader and adds every line as an entry to a CharArraySet (omitting |
| * leading and trailing whitespace). Every line of the Reader should contain only |
| * one word. The words need to be in lowercase if you make use of an |
| * Analyzer which uses LowerCaseFilter (like StandardAnalyzer). |
| * |
| * @param reader Reader containing the wordlist |
| * @return A {@link CharArraySet} with the reader's words |
| */ |
| public static CharArraySet getWordSet(Reader reader) throws IOException { |
| return getWordSet(reader, new CharArraySet(INITIAL_CAPACITY, false)); |
| } |
| |
| /** |
| * Reads lines from a Reader and adds every non-comment line as an entry to a CharArraySet (omitting |
| * leading and trailing whitespace). Every line of the Reader should contain only |
| * one word. The words need to be in lowercase if you make use of an |
| * Analyzer which uses LowerCaseFilter (like StandardAnalyzer). |
| * |
| * @param reader Reader containing the wordlist |
| * @param comment The string representing a comment. |
| * @return A CharArraySet with the reader's words |
| */ |
| public static CharArraySet getWordSet(Reader reader, String comment) throws IOException { |
| return getWordSet(reader, comment, new CharArraySet(INITIAL_CAPACITY, false)); |
| } |
| |
| /** |
| * Reads lines from a Reader and adds every non-comment line as an entry to a CharArraySet (omitting |
| * leading and trailing whitespace). Every line of the Reader should contain only |
| * one word. The words need to be in lowercase if you make use of an |
| * Analyzer which uses LowerCaseFilter (like StandardAnalyzer). |
| * |
| * @param reader Reader containing the wordlist |
| * @param comment The string representing a comment. |
| * @param result the {@link CharArraySet} to fill with the readers words |
| * @return the given {@link CharArraySet} with the reader's words |
| */ |
| public static CharArraySet getWordSet(Reader reader, String comment, CharArraySet result) throws IOException { |
| BufferedReader br = null; |
| try { |
| br = getBufferedReader(reader); |
| String word = null; |
| while ((word = br.readLine()) != null) { |
| if (word.startsWith(comment) == false){ |
| result.add(word.trim()); |
| } |
| } |
| } |
| finally { |
| IOUtils.close(br); |
| } |
| return result; |
| } |
| |
| |
| /** |
| * Reads stopwords from a stopword list in Snowball format. |
| * <p> |
| * The snowball format is the following: |
| * <ul> |
| * <li>Lines may contain multiple words separated by whitespace. |
| * <li>The comment character is the vertical line (|). |
| * <li>Lines may contain trailing comments. |
| * </ul> |
| * |
| * @param reader Reader containing a Snowball stopword list |
| * @param result the {@link CharArraySet} to fill with the readers words |
| * @return the given {@link CharArraySet} with the reader's words |
| */ |
| public static CharArraySet getSnowballWordSet(Reader reader, CharArraySet result) |
| throws IOException { |
| BufferedReader br = null; |
| try { |
| br = getBufferedReader(reader); |
| String line = null; |
| while ((line = br.readLine()) != null) { |
| int comment = line.indexOf('|'); |
| if (comment >= 0) line = line.substring(0, comment); |
| String words[] = line.split("\\s+"); |
| for (int i = 0; i < words.length; i++) |
| if (words[i].length() > 0) result.add(words[i]); |
| } |
| } finally { |
| IOUtils.close(br); |
| } |
| return result; |
| } |
| |
| /** |
| * Reads stopwords from a stopword list in Snowball format. |
| * <p> |
| * The snowball format is the following: |
| * <ul> |
| * <li>Lines may contain multiple words separated by whitespace. |
| * <li>The comment character is the vertical line (|). |
| * <li>Lines may contain trailing comments. |
| * </ul> |
| * |
| * @param reader Reader containing a Snowball stopword list |
| * @return A {@link CharArraySet} with the reader's words |
| */ |
| public static CharArraySet getSnowballWordSet(Reader reader) throws IOException { |
| return getSnowballWordSet(reader, new CharArraySet(INITIAL_CAPACITY, false)); |
| } |
| |
| |
| /** |
| * Reads a stem dictionary. Each line contains: |
| * <pre>word<b>\t</b>stem</pre> |
| * (i.e. two tab separated words) |
| * |
| * @return stem dictionary that overrules the stemming algorithm |
| * @throws IOException If there is a low-level I/O error. |
| */ |
| public static CharArrayMap<String> getStemDict(Reader reader, CharArrayMap<String> result) throws IOException { |
| BufferedReader br = null; |
| try { |
| br = getBufferedReader(reader); |
| String line; |
| while ((line = br.readLine()) != null) { |
| String[] wordstem = line.split("\t", 2); |
| result.put(wordstem[0], wordstem[1]); |
| } |
| } finally { |
| IOUtils.close(br); |
| } |
| return result; |
| } |
| |
| /** |
| * Accesses a resource by name and returns the (non comment) lines containing |
| * data using the given character encoding. |
| * |
| * <p> |
| * A comment line is any line that starts with the character "#" |
| * </p> |
| * |
| * @return a list of non-blank non-comment lines with whitespace trimmed |
| * @throws IOException If there is a low-level I/O error. |
| */ |
| public static List<String> getLines(InputStream stream, Charset charset) throws IOException{ |
| BufferedReader input = null; |
| ArrayList<String> lines; |
| boolean success = false; |
| try { |
| input = getBufferedReader(IOUtils.getDecodingReader(stream, charset)); |
| |
| lines = new ArrayList<>(); |
| for (String word=null; (word=input.readLine())!=null;) { |
| // skip initial bom marker |
| if (lines.isEmpty() && word.length() > 0 && word.charAt(0) == '\uFEFF') |
| word = word.substring(1); |
| // skip comments |
| if (word.startsWith("#")) continue; |
| word=word.trim(); |
| // skip blank lines |
| if (word.length()==0) continue; |
| lines.add(word); |
| } |
| success = true; |
| return lines; |
| } finally { |
| if (success) { |
| IOUtils.close(input); |
| } else { |
| IOUtils.closeWhileHandlingException(input); |
| } |
| } |
| } |
| |
| private static BufferedReader getBufferedReader(Reader reader) { |
| return (reader instanceof BufferedReader) ? (BufferedReader) reader |
| : new BufferedReader(reader); |
| } |
| |
| } |