blob: b0660a9e763db8eb7f0f08ec5a3ca8df4b8db285 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.suggest;
import java.io.*;
import java.nio.charset.StandardCharsets;
import java.util.Set;
import org.apache.lucene.search.spell.Dictionary;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.IOUtils;
/**
* Dictionary represented by a text file.
*
* <p>Format allowed: 1 entry per line:<br>
* An entry can be: <br>
* <ul>
* <li>suggestion</li>
* <li>suggestion <code>fieldDelimiter</code> weight</li>
* <li>suggestion <code>fieldDelimiter</code> weight <code>fieldDelimiter</code> payload</li>
* </ul>
* where the default <code>fieldDelimiter</code> is {@value #DEFAULT_FIELD_DELIMITER}<br>
* <p>
* <b>NOTE:</b>
* <ul>
* <li>In order to have payload enabled, the first entry has to have a payload</li>
* <li>If the weight for an entry is not specified then a value of 1 is used</li>
* <li>A payload cannot be specified without having the weight specified for an entry</li>
* <li>If the payload for an entry is not specified (assuming payload is enabled)
* then an empty payload is returned</li>
* <li>An entry cannot contain more than two <code>fieldDelimiter</code>s (that is, more than three fields)</li>
* </ul>
* <p>
* <b>Example:</b><br>
* word1 word2 TAB 100 TAB payload1<br>
* word3 TAB 101<br>
* word4 word3 TAB 102<br>
*/
public class FileDictionary implements Dictionary {

  /**
   * Tab-delimited fields are most common thus the default, but one can override this via the
   * constructor
   */
  public static final String DEFAULT_FIELD_DELIMITER = "\t";

  /** Source of the dictionary lines; shared by every iterator and closed once exhausted. */
  private final BufferedReader in;

  /** Set once the end of the input has been reached and {@link #in} has been closed. */
  private boolean done = false;

  /**
   * Separator handed to {@link String#split(String)}, which interprets it as a regular
   * expression.
   */
  private final String fieldDelimiter;

  /**
   * Creates a dictionary based on an inputstream.
   * Using {@link #DEFAULT_FIELD_DELIMITER} as the
   * field separator in a line.
   * <p>
   * NOTE: content is treated as UTF-8
   */
  public FileDictionary(InputStream dictFile) {
    this(dictFile, DEFAULT_FIELD_DELIMITER);
  }

  /**
   * Creates a dictionary based on a reader.
   * Using {@link #DEFAULT_FIELD_DELIMITER} as the
   * field separator in a line.
   */
  public FileDictionary(Reader reader) {
    this(reader, DEFAULT_FIELD_DELIMITER);
  }

  /**
   * Creates a dictionary based on a reader.
   * Using <code>fieldDelimiter</code> to separate out the
   * fields in a line.
   * <p>
   * NOTE: <code>fieldDelimiter</code> is passed to {@link String#split(String)} and is therefore
   * interpreted as a regular expression.
   */
  public FileDictionary(Reader reader, String fieldDelimiter) {
    in = new BufferedReader(reader);
    this.fieldDelimiter = fieldDelimiter;
  }

  /**
   * Creates a dictionary based on an inputstream.
   * Using <code>fieldDelimiter</code> to separate out the
   * fields in a line.
   * <p>
   * NOTE: content is treated as UTF-8, and <code>fieldDelimiter</code> is passed to
   * {@link String#split(String)} and is therefore interpreted as a regular expression.
   */
  public FileDictionary(InputStream dictFile, String fieldDelimiter) {
    in = new BufferedReader(IOUtils.getDecodingReader(dictFile, StandardCharsets.UTF_8));
    this.fieldDelimiter = fieldDelimiter;
  }

  /**
   * Returns an iterator over the entries of this dictionary.
   *
   * <p>The first line is read eagerly while the iterator is constructed. Every iterator reads
   * from the same underlying reader, so entries consumed through one iterator are not seen again
   * by another.
   *
   * @throws RuntimeException wrapping the {@link IOException} raised while reading the first line
   * @throws IllegalArgumentException if the first line has more than three fields
   */
  @Override
  public InputIterator getEntryIterator() {
    try {
      return new FileIterator();
    } catch (IOException e) {
      throw new RuntimeException(e);
    }
  }

  /** Iterates the dictionary line by line, exposing term, weight and (optionally) payload. */
  final class FileIterator implements InputIterator {
    private long curWeight;
    private final BytesRefBuilder spare = new BytesRefBuilder();
    private BytesRefBuilder curPayload = new BytesRefBuilder();
    // true until the entry parsed by the constructor has been handed out by next()
    private boolean isFirstLine = true;
    // decided by the first line only: payloads are enabled iff it has three fields
    private boolean hasPayloads = false;

    /**
     * Reads and parses the first line so that {@link #hasPayloads()} is known before the first
     * call to {@link #next()}. An empty input marks the dictionary as done and closes the reader.
     */
    private FileIterator() throws IOException {
      String line = in.readLine();
      if (line == null) {
        done = true;
        IOUtils.close(in);
      } else {
        parseLine(line, true);
      }
    }

    @Override
    public long weight() {
      return curWeight;
    }

    /**
     * Advances to the next entry.
     *
     * @return the current term, or <code>null</code> once the input is exhausted (at which point
     *     the underlying reader is closed)
     * @throws IllegalArgumentException if the line has more than three fields
     */
    @Override
    public BytesRef next() throws IOException {
      if (done) {
        return null;
      }
      if (isFirstLine) {
        // the first entry was already parsed by the constructor
        isFirstLine = false;
        return spare.get();
      }
      String line = in.readLine();
      if (line == null) {
        done = true;
        IOUtils.close(in);
        return null;
      }
      parseLine(line, false);
      return spare.get();
    }

    @Override
    public BytesRef payload() {
      return (hasPayloads) ? curPayload.get() : null;
    }

    @Override
    public boolean hasPayloads() {
      return hasPayloads;
    }

    /**
     * Splits <code>line</code> on the field delimiter and stores the term, weight and payload of
     * the entry in this iterator.
     *
     * @param line the raw line to parse
     * @param firstLine whether this is the first line of the input; only the first line decides
     *     whether payloads are enabled
     * @throws IllegalArgumentException if the line has more than three fields
     */
    private void parseLine(String line, boolean firstLine) {
      String[] fields = line.split(fieldDelimiter);
      if (fields.length > 3) {
        throw new IllegalArgumentException("More than 3 fields in one line");
      }
      if (firstLine) {
        hasPayloads = fields.length == 3;
      }

      // String.split drops trailing empty strings, so a line made up solely of delimiters yields
      // zero fields. Treat it like an empty line (empty term, default weight) instead of failing
      // with an ArrayIndexOutOfBoundsException.
      spare.copyChars(fields.length > 0 ? fields[0] : "");

      if (fields.length >= 2) {
        readWeight(fields[1]);
      } else {
        curWeight = 1; // weight not specified
      }

      if (hasPayloads) {
        if (fields.length == 3) {
          curPayload.copyChars(fields[2]);
        } else {
          curPayload = new BytesRefBuilder(); // have an empty payload
        }
      }
    }

    /**
     * Parses <code>weight</code> into {@link #curWeight}. A value that is not a valid long is
     * parsed as a double and truncated by the cast to long.
     *
     * @throws NumberFormatException if the value cannot be parsed as a double either
     */
    private void readWeight(String weight) {
      // keep reading floats for bw compat
      try {
        curWeight = Long.parseLong(weight);
      } catch (NumberFormatException e) {
        curWeight = (long) Double.parseDouble(weight);
      }
    }

    /** Contexts are not supported by this dictionary; always returns <code>null</code>. */
    @Override
    public Set<BytesRef> contexts() {
      return null;
    }

    @Override
    public boolean hasContexts() {
      return false;
    }
  }
}