lucene/test-framework/src/java/org/apache/lucene/util/LineFileDocs.java - lucene-solr - Git at Google

 package org.apache.lucene.util;

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 import java.io.BufferedReader;
 import java.io.Closeable;
 import java.io.File;
 import java.io.FileInputStream;
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.InputStreamReader;
 import java.io.RandomAccessFile;
 import java.nio.channels.Channels;
 import java.nio.channels.FileChannel;
 import java.nio.charset.CharsetDecoder;
 import java.nio.charset.CodingErrorAction;
 import java.util.Random;
 import java.util.concurrent.atomic.AtomicInteger;
 import java.util.zip.GZIPInputStream;

 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
 import org.apache.lucene.document.FieldType;
 import org.apache.lucene.document.SortedDocValuesField;
 import org.apache.lucene.document.StringField;
 import org.apache.lucene.document.TextField;

 /** Minimal port of benchmark's LineDocSource +
  * DocMaker, so tests can enum docs from a line file created
  * by benchmark's WriteLineDoc task */
 public class LineFileDocs implements Closeable {

   private static final int BUFFER_SIZE = 1 << 16;     // 64K

   /** Column separator inside a line: title, date and body are tab-delimited. */
   private static final char SEP = '\t';

   /** Reader over the currently open line file; null while closed. */
   private BufferedReader reader;

   /** Source of the sequential "docid" values; set back to 0 by {@link #reset}. */
   private final AtomicInteger id = new AtomicInteger();

   private final String path;
   private final boolean useDocValues;

   /** One reusable {@link DocState} per calling thread. */
   private final ThreadLocal<DocState> threadDocs = new ThreadLocal<DocState>();

   /**
    * Opens the line file, looking it up first as a classpath resource and,
    * if it is not found there, as a filesystem path. At EOF the file is
    * always rewound, so {@link #nextDoc} repeats the docs over and over.
    *
    * @param random if non-null, used to pick a random start offset (see
    *        {@link #randomSeekPos}); if null, reading starts at the beginning
    * @param path classpath resource name or filesystem path; a name ending
    *        in {@code .gz} is read through a {@link GZIPInputStream}
    * @param useDocValues whether documents also carry the "titleDV" field
    * @throws IOException if the file cannot be opened or read
    */
   public LineFileDocs(Random random, String path, boolean useDocValues) throws IOException {
     this.path = path;
     this.useDocValues = useDocValues;
     open(random);
   }

   /** Opens {@code LuceneTestCase.TEST_LINE_DOCS_FILE} with doc values enabled. */
   public LineFileDocs(Random random) throws IOException {
     this(random, LuceneTestCase.TEST_LINE_DOCS_FILE, true);
   }

   /** Opens {@code LuceneTestCase.TEST_LINE_DOCS_FILE}. */
   public LineFileDocs(Random random, boolean useDocValues) throws IOException {
     this(random, LuceneTestCase.TEST_LINE_DOCS_FILE, useDocValues);
   }

   /** Closes the underlying reader, if one is open; calling it again is a no-op. */
   @Override
   public synchronized void close() throws IOException {
     if (reader != null) {
       reader.close();
       reader = null;
     }
   }

   /**
    * Picks the byte offset at which reading should start.
    *
    * @return 0 if {@code random} is null or {@code size <= 3}; otherwise a
    *         non-negative value smaller than {@code size / 3}
    */
   private long randomSeekPos(Random random, long size) {
     if (random == null || size <= 3L) {
       return 0L;
     }
     // Mask off the sign bit so the modulo result cannot be negative.
     return (random.nextLong() & Long.MAX_VALUE) % (size / 3);
   }

   /**
    * Skips {@code count} bytes of {@code in}, stopping early only at end of
    * stream. A single {@link InputStream#skip} call is allowed to skip fewer
    * bytes than requested, so it has to be retried.
    */
   private static void skipFully(InputStream in, long count) throws IOException {
     long remaining = count;
     while (remaining > 0L) {
       long skipped = in.skip(remaining);
       if (skipped <= 0L) {
         // skip() made no progress: consume one byte to tell "try again" from EOF.
         if (in.read() < 0) {
           return;
         }
         skipped = 1L;
       }
       remaining -= skipped;
     }
   }

   /** Closes {@code c} if it is non-null and ignores any failure of the close itself. */
   private static void closeQuietly(Closeable c) {
     if (c != null) {
       try {
         c.close();
       } catch (IOException ignored) {
         // Only called while open() is already failing; that failure is the one to report.
       }
     }
   }

   /**
    * (Re)opens the line file and positions {@link #reader} at the start of a
    * line. On failure every stream opened so far is closed again and
    * {@link #reader} is left untouched.
    *
    * @param random see {@link #randomSeekPos}; null means start at offset 0
    * @throws IOException if the file cannot be opened, skipped or read
    */
   private synchronized void open(Random random) throws IOException {
     InputStream is = getClass().getResourceAsStream(path);
     RandomAccessFile raf = null;
     boolean success = false;
     try {
       final boolean gzipped = path.endsWith(".gz");
       boolean needSkip = true;
       long size = 0L, seekTo = 0L;
       if (is == null) {
         // if its not in classpath, we load it as absolute filesystem path (e.g. Hudson's home dir)
         File file = new File(path);
         size = file.length();
         if (gzipped) {
           // if it is a gzip file, we need to use InputStream and slowly skipTo:
           is = new FileInputStream(file);
         } else {
           // optimized seek using RandomAccessFile:
           seekTo = randomSeekPos(random, size);
           raf = new RandomAccessFile(path, "r");
           final FileChannel channel = raf.getChannel();
           if (LuceneTestCase.VERBOSE) {
             System.out.println("TEST: LineFileDocs: file seek to fp=" + seekTo + " on open");
           }
           channel.position(seekTo);
           is = Channels.newInputStream(channel);
           needSkip = false;
         }
       } else {
         // if the file comes from Classpath:
         size = is.available();
       }

       if (gzipped) {
         is = new GZIPInputStream(is);
         // guestimate of the uncompressed size:
         size = (long) (size * 2.8);
       }

       // If we only have an InputStream, we need to seek now,
       // but this seek is a scan, so very inefficient!!!
       if (needSkip) {
         seekTo = randomSeekPos(random, size);
         if (LuceneTestCase.VERBOSE) {
           System.out.println("TEST: LineFileDocs: stream skip to fp=" + seekTo + " on open");
         }
         skipFully(is, seekTo);
       }

       // if we seeked somewhere, read until newline char
       if (seekTo > 0L) {
         int b;
         do {
           b = is.read();
         } while (b >= 0 && b != '\r' && b != '\n');
       }

       // REPORT makes malformed or unmappable input fail loudly instead of being replaced.
       CharsetDecoder decoder = IOUtils.CHARSET_UTF_8.newDecoder()
           .onMalformedInput(CodingErrorAction.REPORT)
           .onUnmappableCharacter(CodingErrorAction.REPORT);
       final BufferedReader in = new BufferedReader(new InputStreamReader(is, decoder), BUFFER_SIZE);

       if (seekTo > 0L) {
         // read one more line, to make sure we are not inside a Windows linebreak (\r\n):
         in.readLine();
       }

       reader = in;
       success = true;
     } finally {
       if (!success) {
         closeQuietly(is);
         closeQuietly(raf);
       }
     }
   }

   /**
    * Closes and reopens the file (at a random offset if {@code random} is
    * non-null) and restarts the "docid" counter at 0.
    */
   public synchronized void reset(Random random) throws IOException {
     close();
     open(random);
     id.set(0);
   }

   /** The reusable {@link Document} of one thread together with the fields added to it. */
   private static final class DocState {
     final Document doc;
     final Field titleTokenized;
     final Field title;
     final Field titleDV;   // null when doc values are disabled
     final Field body;
     final Field id;
     final Field date;

     public DocState(boolean useDocValues) {
       doc = new Document();

       title = new StringField("title", "", Field.Store.NO);
       doc.add(title);

       // Stored text with term vectors (offsets and positions); shared by titleTokenized and body.
       FieldType ft = new FieldType(TextField.TYPE_STORED);
       ft.setStoreTermVectors(true);
       ft.setStoreTermVectorOffsets(true);
       ft.setStoreTermVectorPositions(true);

       titleTokenized = new Field("titleTokenized", "", ft);
       doc.add(titleTokenized);

       body = new Field("body", "", ft);
       doc.add(body);

       id = new StringField("docid", "", Field.Store.YES);
       doc.add(id);

       date = new StringField("date", "", Field.Store.YES);
       doc.add(date);

       if (useDocValues) {
         titleDV = new SortedDocValuesField("titleDV", new BytesRef());
         doc.add(titleDV);
       } else {
         titleDV = null;
       }
     }
   }

   /**
    * Reads the next line (rewinding to the start of the file at EOF) and
    * loads its title, date and body columns into this thread's document.
    *
    * Note: Document instance is re-used per-thread
    *
    * @return the calling thread's document, with its field values replaced
    * @throws IOException if reading or reopening the file fails
    * @throws RuntimeException if the file has no lines at all, or the line
    *         does not contain two tab separators
    */
   public Document nextDoc() throws IOException {
     String line;
     synchronized(this) {
       line = reader.readLine();
       if (line == null) {
         // Always rewind at end:
         if (LuceneTestCase.VERBOSE) {
           System.out.println("TEST: LineFileDocs: now rewind file...");
         }
         close();
         open(null);
         line = reader.readLine();
         if (line == null) {
           // Still nothing right after a rewind to offset 0: the file is empty.
           throw new RuntimeException("line file [" + path + "] contains no lines");
         }
       }
     }

     DocState docState = threadDocs.get();
     if (docState == null) {
       docState = new DocState(useDocValues);
       threadDocs.set(docState);
     }

     // Line layout: title <TAB> date <TAB> body
     final int spot = line.indexOf(SEP);
     final int spot2 = (spot == -1) ? -1 : line.indexOf(SEP, 1 + spot);
     if (spot2 == -1) {
       throw new RuntimeException("line: [" + line + "] is in an invalid format !");
     }

     docState.body.setStringValue(line.substring(1 + spot2, line.length()));
     final String title = line.substring(0, spot);
     docState.title.setStringValue(title);
     if (docState.titleDV != null) {
       docState.titleDV.setBytesValue(new BytesRef(title));
     }
     docState.titleTokenized.setStringValue(title);
     docState.date.setStringValue(line.substring(1 + spot, spot2));
     docState.id.setStringValue(Integer.toString(id.getAndIncrement()));
     return docState.doc;
   }
 }
	package org.apache.lucene.util;

	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	import java.io.BufferedReader;
	import java.io.Closeable;
	import java.io.File;
	import java.io.FileInputStream;
	import java.io.IOException;
	import java.io.InputStream;
	import java.io.InputStreamReader;
	import java.io.RandomAccessFile;
	import java.nio.channels.Channels;
	import java.nio.channels.FileChannel;
	import java.nio.charset.CharsetDecoder;
	import java.nio.charset.CodingErrorAction;
	import java.util.Random;
	import java.util.concurrent.atomic.AtomicInteger;
	import java.util.zip.GZIPInputStream;

	import org.apache.lucene.document.Document;
	import org.apache.lucene.document.Field;
	import org.apache.lucene.document.FieldType;
	import org.apache.lucene.document.SortedDocValuesField;
	import org.apache.lucene.document.StringField;
	import org.apache.lucene.document.TextField;

	/** Minimal port of benchmark's LineDocSource +
	* DocMaker, so tests can enum docs from a line file created
	* by benchmark's WriteLineDoc task */
	public class LineFileDocs implements Closeable {

	private BufferedReader reader;
	private final static int BUFFER_SIZE = 1 << 16; // 64K
	private final AtomicInteger id = new AtomicInteger();
	private final String path;
	private final boolean useDocValues;

	/** If forever is true, we rewind the file at EOF (repeat
	* the docs over and over) */
	public LineFileDocs(Random random, String path, boolean useDocValues) throws IOException {
	this.path = path;
	this.useDocValues = useDocValues;
	open(random);
	}

	public LineFileDocs(Random random) throws IOException {
	this(random, LuceneTestCase.TEST_LINE_DOCS_FILE, true);
	}

	public LineFileDocs(Random random, boolean useDocValues) throws IOException {
	this(random, LuceneTestCase.TEST_LINE_DOCS_FILE, useDocValues);
	}

	@Override
	public synchronized void close() throws IOException {
	if (reader != null) {
	reader.close();
	reader = null;
	}
	}

	private long randomSeekPos(Random random, long size) {
	if (random == null \|\| size <= 3L)
	return 0L;
	return (random.nextLong()&Long.MAX_VALUE) % (size/3);
	}

	private synchronized void open(Random random) throws IOException {
	InputStream is = getClass().getResourceAsStream(path);
	boolean needSkip = true;
	long size = 0L, seekTo = 0L;
	if (is == null) {
	// if its not in classpath, we load it as absolute filesystem path (e.g. Hudson's home dir)
	File file = new File(path);
	size = file.length();
	if (path.endsWith(".gz")) {
	// if it is a gzip file, we need to use InputStream and slowly skipTo:
	is = new FileInputStream(file);
	} else {
	// optimized seek using RandomAccessFile:
	seekTo = randomSeekPos(random, size);
	final FileChannel channel = new RandomAccessFile(path, "r").getChannel();
	if (LuceneTestCase.VERBOSE) {
	System.out.println("TEST: LineFileDocs: file seek to fp=" + seekTo + " on open");
	}
	channel.position(seekTo);
	is = Channels.newInputStream(channel);
	needSkip = false;
	}
	} else {
	// if the file comes from Classpath:
	size = is.available();
	}

	if (path.endsWith(".gz")) {
	is = new GZIPInputStream(is);
	// guestimate:
	size *= 2.8;
	}

	// If we only have an InputStream, we need to seek now,
	// but this seek is a scan, so very inefficient!!!
	if (needSkip) {
	seekTo = randomSeekPos(random, size);
	if (LuceneTestCase.VERBOSE) {
	System.out.println("TEST: LineFileDocs: stream skip to fp=" + seekTo + " on open");
	}
	is.skip(seekTo);
	}

	// if we seeked somewhere, read until newline char
	if (seekTo > 0L) {
	int b;
	do {
	b = is.read();
	} while (b >= 0 && b != 13 && b != 10);
	}

	CharsetDecoder decoder = IOUtils.CHARSET_UTF_8.newDecoder()
	.onMalformedInput(CodingErrorAction.REPORT)
	.onUnmappableCharacter(CodingErrorAction.REPORT);
	reader = new BufferedReader(new InputStreamReader(is, decoder), BUFFER_SIZE);

	if (seekTo > 0L) {
	// read one more line, to make sure we are not inside a Windows linebreak (\r\n):
	reader.readLine();
	}
	}

	public synchronized void reset(Random random) throws IOException {
	close();
	open(random);
	id.set(0);
	}

	private final static char SEP = '\t';

	private static final class DocState {
	final Document doc;
	final Field titleTokenized;
	final Field title;
	final Field titleDV;
	final Field body;
	final Field id;
	final Field date;

	public DocState(boolean useDocValues) {
	doc = new Document();

	title = new StringField("title", "", Field.Store.NO);
	doc.add(title);

	FieldType ft = new FieldType(TextField.TYPE_STORED);
	ft.setStoreTermVectors(true);
	ft.setStoreTermVectorOffsets(true);
	ft.setStoreTermVectorPositions(true);

	titleTokenized = new Field("titleTokenized", "", ft);
	doc.add(titleTokenized);

	body = new Field("body", "", ft);
	doc.add(body);

	id = new StringField("docid", "", Field.Store.YES);
	doc.add(id);

	date = new StringField("date", "", Field.Store.YES);
	doc.add(date);

	if (useDocValues) {
	titleDV = new SortedDocValuesField("titleDV", new BytesRef());
	doc.add(titleDV);
	} else {
	titleDV = null;
	}
	}
	}

	private final ThreadLocal<DocState> threadDocs = new ThreadLocal<DocState>();

	/** Note: Document instance is re-used per-thread */
	public Document nextDoc() throws IOException {
	String line;
	synchronized(this) {
	line = reader.readLine();
	if (line == null) {
	// Always rewind at end:
	if (LuceneTestCase.VERBOSE) {
	System.out.println("TEST: LineFileDocs: now rewind file...");
	}
	close();
	open(null);
	line = reader.readLine();
	}
	}

	DocState docState = threadDocs.get();
	if (docState == null) {
	docState = new DocState(useDocValues);
	threadDocs.set(docState);
	}

	int spot = line.indexOf(SEP);
	if (spot == -1) {
	throw new RuntimeException("line: [" + line + "] is in an invalid format !");
	}
	int spot2 = line.indexOf(SEP, 1 + spot);
	if (spot2 == -1) {
	throw new RuntimeException("line: [" + line + "] is in an invalid format !");
	}

	docState.body.setStringValue(line.substring(1+spot2, line.length()));
	final String title = line.substring(0, spot);
	docState.title.setStringValue(title);
	if (docState.titleDV != null) {
	docState.titleDV.setBytesValue(new BytesRef(title));
	}
	docState.titleTokenized.setStringValue(title);
	docState.date.setStringValue(line.substring(1+spot, spot2));
	docState.id.setStringValue(Integer.toString(id.getAndIncrement()));
	return docState.doc;
	}
	}