lucene/suggest/src/java/org/apache/lucene/search/suggest/FileDictionary.java - lucene-solr - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.lucene.search.suggest;

 import java.io.*;
 import java.nio.charset.StandardCharsets;
 import java.util.Set;

 import org.apache.lucene.search.spell.Dictionary;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.BytesRefBuilder;
 import org.apache.lucene.util.IOUtils;


 /**
  * Dictionary represented by a text file.
  *
  * <p>Format allowed: 1 entry per line:<br>
  * An entry can be: <br>
  * <ul>
  * <li>suggestion</li>
  * <li>suggestion <code>fieldDelimiter</code> weight</li>
  * <li>suggestion <code>fieldDelimiter</code> weight <code>fieldDelimiter</code> payload</li>
  * </ul>
  * where the default <code>fieldDelimiter</code> is {@value #DEFAULT_FIELD_DELIMITER}<br>
  * <p>
  * <b>NOTE:</b>
  * <ul>
  * <li>In order to have payload enabled, the first entry has to have a payload</li>
  * <li>If the weight for an entry is not specified then a value of 1 is used</li>
  * <li>A payload cannot be specified without having the weight specified for an entry</li>
  * <li>If the payload for an entry is not specified (assuming payload is enabled)
  *  then an empty payload is returned</li>
  * <li>An entry cannot contain more than two occurrences of <code>fieldDelimiter</code> (i.e. at most three fields)</li>
  * </ul>
  * <p>
  * <b>Example:</b><br>
  * word1 word2 TAB 100 TAB payload1<br>
  * word3 TAB 101<br>
  * word4 word3 TAB 102<br>
  */
 public class FileDictionary implements Dictionary {

   /**
    * Tab-delimited fields are most common thus the default, but one can override this via the constructor
    */
   public final static String DEFAULT_FIELD_DELIMITER = "\t";

   /** Buffered source of the dictionary lines; closed once the end of the input is reached. */
   private final BufferedReader in;
   /** Line most recently returned by the reader. */
   private String line;
   /** Set once the end of the input has been reached and the reader has been closed. */
   private boolean done = false;
   /** Separator handed to {@link String#split(String)}, which interprets it as a regular expression. */
   private final String fieldDelimiter;

   /**
    * Creates a dictionary based on an inputstream.
    * Using {@link #DEFAULT_FIELD_DELIMITER} as the
    * field separator in a line.
    * <p>
    * NOTE: content is treated as UTF-8
    */
   public FileDictionary(InputStream dictFile) {
     this(dictFile, DEFAULT_FIELD_DELIMITER);
   }

   /**
    * Creates a dictionary based on a reader.
    * Using {@link #DEFAULT_FIELD_DELIMITER} as the
    * field separator in a line.
    */
   public FileDictionary(Reader reader) {
     this(reader, DEFAULT_FIELD_DELIMITER);
   }

   /**
    * Creates a dictionary based on a reader.
    * Using <code>fieldDelimiter</code> to separate out the
    * fields in a line.
    */
   public FileDictionary(Reader reader, String fieldDelimiter) {
     in = new BufferedReader(reader);
     this.fieldDelimiter = fieldDelimiter;
   }

   /**
    * Creates a dictionary based on an inputstream.
    * Using <code>fieldDelimiter</code> to separate out the
    * fields in a line.
    * <p>
    * NOTE: content is treated as UTF-8
    */
   public FileDictionary(InputStream dictFile, String fieldDelimiter) {
     in = new BufferedReader(IOUtils.getDecodingReader(dictFile, StandardCharsets.UTF_8));
     this.fieldDelimiter = fieldDelimiter;
   }

   /**
    * Reads the first line of the input and returns an iterator positioned on that entry.
    * <p>
    * NOTE: every iterator returned by this method reads from the single reader owned
    * by this dictionary.
    *
    * @throws RuntimeException wrapping any {@link IOException} raised while reading the first line
    * @throws IllegalArgumentException if the first line has more than three fields
    */
   @Override
   public InputIterator getEntryIterator() {
     try {
       return new FileIterator();
     } catch (IOException e) {
       throw new RuntimeException(e);
     }
   }

   /**
    * Iterator over the entries of the enclosing dictionary, one entry per line.
    * The reader is closed when the end of the input is reached; it is left open
    * when a malformed line raises an exception.
    */
   final class FileIterator implements InputIterator {
     private long curWeight;
     private final BytesRefBuilder spare = new BytesRefBuilder();
     private BytesRefBuilder curPayload = new BytesRefBuilder();
     private boolean isFirstLine = true;
     private boolean hasPayloads = false;

     /**
      * Reads and parses the first line eagerly, because whether payloads are
      * enabled is decided by that line alone.
      */
     private FileIterator() throws IOException {
       line = in.readLine();
       if (line == null) {
         done = true;
         IOUtils.close(in);
       } else {
         String[] fields = splitFields(line);
         // payloads are enabled for the whole iteration only if the first entry has one
         hasPayloads = fields.length == 3;
         loadEntry(fields);
       }
     }

     @Override
     public long weight() {
       return curWeight;
     }

     /**
      * Advances to the next entry.
      *
      * @return the term of the next entry, or <code>null</code> once the input is exhausted
      * @throws IllegalArgumentException if the line has more than three fields
      * @throws NumberFormatException if the weight field is not a number
      */
     @Override
     public BytesRef next() throws IOException {
       if (done) {
         return null;
       }
       if (isFirstLine) {
         // the first entry was already parsed by the constructor
         isFirstLine = false;
         return spare.get();
       }
       line = in.readLine();
       if (line == null) {
         done = true;
         IOUtils.close(in);
         return null;
       }
       loadEntry(splitFields(line));
       return spare.get();
     }

     @Override
     public BytesRef payload() {
       return (hasPayloads) ? curPayload.get() : null;
     }

     @Override
     public boolean hasPayloads() {
       return hasPayloads;
     }

     /** Splits a raw line on the field delimiter, rejecting lines with more than three fields. */
     private String[] splitFields(String entry) {
       String[] fields = entry.split(fieldDelimiter);
       if (fields.length > 3) {
         throw new IllegalArgumentException("More than 3 fields in one line");
       }
       return fields;
     }

     /** Stores term, weight and (when payloads are enabled) payload of an already split line. */
     private void loadEntry(String[] fields) {
       // String.split drops trailing empty strings, so a line made up solely of
       // delimiters produces a zero-length array; treat it like an empty line
       // instead of failing with ArrayIndexOutOfBoundsException
       spare.copyChars(fields.length == 0 ? "" : fields[0]);
       if (fields.length >= 2) {
         readWeight(fields[1]);
       } else {
         curWeight = 1; // weight not specified
       }
       if (hasPayloads) {
         if (fields.length == 3) {
           curPayload.copyChars(fields[2]);
         } else {
           // entry without payload: report an empty payload
           curPayload = new BytesRefBuilder();
         }
       }
     }

     /**
      * Parses the weight field into {@link #curWeight}.
      *
      * @throws NumberFormatException if the text is neither a valid long nor a valid double
      */
     private void readWeight(String weight) {
       // keep reading floats for bw compat
       try {
         curWeight = Long.parseLong(weight);
       } catch (NumberFormatException e) {
         curWeight = (long)Double.parseDouble(weight);
       }
     }

     @Override
     public Set<BytesRef> contexts() {
       return null;
     }

     @Override
     public boolean hasContexts() {
       return false;
     }
   }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package org.apache.lucene.search.suggest;

	import java.io.*;
	import java.nio.charset.StandardCharsets;
	import java.util.Set;

	import org.apache.lucene.search.spell.Dictionary;
	import org.apache.lucene.util.BytesRef;
	import org.apache.lucene.util.BytesRefBuilder;
	import org.apache.lucene.util.IOUtils;


	/**
	* Dictionary represented by a text file.
	*
	* <p>Format allowed: 1 entry per line:<br>
	* An entry can be: <br>
	* <ul>
	* <li>suggestion</li>
	* <li>suggestion <code>fieldDelimiter</code> weight</li>
	* <li>suggestion <code>fieldDelimiter</code> weight <code>fieldDelimiter</code> payload</li>
	* </ul>
	* where the default <code>fieldDelimiter</code> is {@value #DEFAULT_FIELD_DELIMITER}<br>
	* <p>
	* <b>NOTE:</b>
	* <ul>
	* <li>In order to have payload enabled, the first entry has to have a payload</li>
	* <li>If the weight for an entry is not specified then a value of 1 is used</li>
	* <li>A payload cannot be specified without having the weight specified for an entry</li>
	* <li>If the payload for an entry is not specified (assuming payload is enabled)
	* then an empty payload is returned</li>
	* <li>An entry cannot contain more than two occurrences of <code>fieldDelimiter</code> (i.e. at most three fields)</li>
	* </ul>
	* <p>
	* <b>Example:</b><br>
	* word1 word2 TAB 100 TAB payload1<br>
	* word3 TAB 101<br>
	* word4 word3 TAB 102<br>
	*/
	public class FileDictionary implements Dictionary {

	  /**
	   * Tab-delimited fields are most common thus the default, but one can override this via the constructor
	   */
	  public final static String DEFAULT_FIELD_DELIMITER = "\t";

	  /** Buffered source of the dictionary lines; closed once the end of the input is reached. */
	  private final BufferedReader in;
	  /** Line most recently returned by the reader. */
	  private String line;
	  /** Set once the end of the input has been reached and the reader has been closed. */
	  private boolean done = false;
	  /** Separator handed to {@link String#split(String)}, which interprets it as a regular expression. */
	  private final String fieldDelimiter;

	  /**
	   * Creates a dictionary based on an inputstream.
	   * Using {@link #DEFAULT_FIELD_DELIMITER} as the
	   * field separator in a line.
	   * <p>
	   * NOTE: content is treated as UTF-8
	   */
	  public FileDictionary(InputStream dictFile) {
	    this(dictFile, DEFAULT_FIELD_DELIMITER);
	  }

	  /**
	   * Creates a dictionary based on a reader.
	   * Using {@link #DEFAULT_FIELD_DELIMITER} as the
	   * field separator in a line.
	   */
	  public FileDictionary(Reader reader) {
	    this(reader, DEFAULT_FIELD_DELIMITER);
	  }

	  /**
	   * Creates a dictionary based on a reader.
	   * Using <code>fieldDelimiter</code> to separate out the
	   * fields in a line.
	   */
	  public FileDictionary(Reader reader, String fieldDelimiter) {
	    in = new BufferedReader(reader);
	    this.fieldDelimiter = fieldDelimiter;
	  }

	  /**
	   * Creates a dictionary based on an inputstream.
	   * Using <code>fieldDelimiter</code> to separate out the
	   * fields in a line.
	   * <p>
	   * NOTE: content is treated as UTF-8
	   */
	  public FileDictionary(InputStream dictFile, String fieldDelimiter) {
	    in = new BufferedReader(IOUtils.getDecodingReader(dictFile, StandardCharsets.UTF_8));
	    this.fieldDelimiter = fieldDelimiter;
	  }

	  /**
	   * Reads the first line of the input and returns an iterator positioned on that entry.
	   * <p>
	   * NOTE: every iterator returned by this method reads from the single reader owned
	   * by this dictionary.
	   *
	   * @throws RuntimeException wrapping any {@link IOException} raised while reading the first line
	   * @throws IllegalArgumentException if the first line has more than three fields
	   */
	  @Override
	  public InputIterator getEntryIterator() {
	    try {
	      return new FileIterator();
	    } catch (IOException e) {
	      throw new RuntimeException(e);
	    }
	  }

	  /**
	   * Iterator over the entries of the enclosing dictionary, one entry per line.
	   * The reader is closed when the end of the input is reached; it is left open
	   * when a malformed line raises an exception.
	   */
	  final class FileIterator implements InputIterator {
	    private long curWeight;
	    private final BytesRefBuilder spare = new BytesRefBuilder();
	    private BytesRefBuilder curPayload = new BytesRefBuilder();
	    private boolean isFirstLine = true;
	    private boolean hasPayloads = false;

	    /**
	     * Reads and parses the first line eagerly, because whether payloads are
	     * enabled is decided by that line alone.
	     */
	    private FileIterator() throws IOException {
	      line = in.readLine();
	      if (line == null) {
	        done = true;
	        IOUtils.close(in);
	      } else {
	        String[] fields = splitFields(line);
	        // payloads are enabled for the whole iteration only if the first entry has one
	        hasPayloads = fields.length == 3;
	        loadEntry(fields);
	      }
	    }

	    @Override
	    public long weight() {
	      return curWeight;
	    }

	    /**
	     * Advances to the next entry.
	     *
	     * @return the term of the next entry, or <code>null</code> once the input is exhausted
	     * @throws IllegalArgumentException if the line has more than three fields
	     * @throws NumberFormatException if the weight field is not a number
	     */
	    @Override
	    public BytesRef next() throws IOException {
	      if (done) {
	        return null;
	      }
	      if (isFirstLine) {
	        // the first entry was already parsed by the constructor
	        isFirstLine = false;
	        return spare.get();
	      }
	      line = in.readLine();
	      if (line == null) {
	        done = true;
	        IOUtils.close(in);
	        return null;
	      }
	      loadEntry(splitFields(line));
	      return spare.get();
	    }

	    @Override
	    public BytesRef payload() {
	      return (hasPayloads) ? curPayload.get() : null;
	    }

	    @Override
	    public boolean hasPayloads() {
	      return hasPayloads;
	    }

	    /** Splits a raw line on the field delimiter, rejecting lines with more than three fields. */
	    private String[] splitFields(String entry) {
	      String[] fields = entry.split(fieldDelimiter);
	      if (fields.length > 3) {
	        throw new IllegalArgumentException("More than 3 fields in one line");
	      }
	      return fields;
	    }

	    /** Stores term, weight and (when payloads are enabled) payload of an already split line. */
	    private void loadEntry(String[] fields) {
	      // String.split drops trailing empty strings, so a line made up solely of
	      // delimiters produces a zero-length array; treat it like an empty line
	      // instead of failing with ArrayIndexOutOfBoundsException
	      spare.copyChars(fields.length == 0 ? "" : fields[0]);
	      if (fields.length >= 2) {
	        readWeight(fields[1]);
	      } else {
	        curWeight = 1; // weight not specified
	      }
	      if (hasPayloads) {
	        if (fields.length == 3) {
	          curPayload.copyChars(fields[2]);
	        } else {
	          // entry without payload: report an empty payload
	          curPayload = new BytesRefBuilder();
	        }
	      }
	    }

	    /**
	     * Parses the weight field into {@link #curWeight}.
	     *
	     * @throws NumberFormatException if the text is neither a valid long nor a valid double
	     */
	    private void readWeight(String weight) {
	      // keep reading floats for bw compat
	      try {
	        curWeight = Long.parseLong(weight);
	      } catch (NumberFormatException e) {
	        curWeight = (long)Double.parseDouble(weight);
	      }
	    }

	    @Override
	    public Set<BytesRef> contexts() {
	      return null;
	    }

	    @Override
	    public boolean hasContexts() {
	      return false;
	    }
	  }
	}