| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.lucene.util; |
| |
| import java.io.BufferedReader; |
| import java.io.Closeable; |
| import java.io.IOException; |
| import java.io.InputStream; |
| import java.io.InputStreamReader; |
| import java.nio.channels.Channels; |
| import java.nio.channels.SeekableByteChannel; |
| import java.nio.charset.CharsetDecoder; |
| import java.nio.charset.CodingErrorAction; |
| import java.nio.charset.StandardCharsets; |
| import java.nio.file.Files; |
| import java.nio.file.Path; |
| import java.nio.file.Paths; |
| import java.util.ArrayList; |
| import java.util.List; |
| import java.util.Random; |
| import java.util.concurrent.atomic.AtomicInteger; |
| import java.util.zip.GZIPInputStream; |
| |
| import org.apache.lucene.document.Document; |
| import org.apache.lucene.document.Field; |
| import org.apache.lucene.document.FieldType; |
| import org.apache.lucene.document.IntPoint; |
| import org.apache.lucene.document.NumericDocValuesField; |
| import org.apache.lucene.document.SortedDocValuesField; |
| import org.apache.lucene.document.StringField; |
| import org.apache.lucene.document.TextField; |
| import org.apache.lucene.index.IndexOptions; |
| import org.apache.lucene.index.IndexableField; |
| |
| /** Minimal port of benchmark's LneDocSource + |
| * DocMaker, so tests can enum docs from a line file created |
| * by benchmark's WriteLineDoc task */ |
| public class LineFileDocs implements Closeable { |
| |
| private BufferedReader reader; |
| private final static int BUFFER_SIZE = 1 << 16; // 64K |
| private final AtomicInteger id = new AtomicInteger(); |
| private final String path; |
| private final Random random; |
| |
| /** If forever is true, we rewind the file at EOF (repeat |
| * the docs over and over) */ |
| public LineFileDocs(Random random, String path) throws IOException { |
| this.path = path; |
| this.random = new Random(random.nextLong()); |
| open(); |
| } |
| |
| public LineFileDocs(Random random) throws IOException { |
| this(random, LuceneTestCase.TEST_LINE_DOCS_FILE); |
| } |
| |
| @Override |
| public synchronized void close() throws IOException { |
| IOUtils.close(reader, threadDocs); |
| reader = null; |
| } |
| |
| private long randomSeekPos(Random random, long size) { |
| if (random == null || size <= 3L) { |
| return 0L; |
| } else { |
| return (random.nextLong()&Long.MAX_VALUE) % (size/3); |
| } |
| } |
| |
| private synchronized void open() throws IOException { |
| InputStream is = getClass().getResourceAsStream(path); |
| |
| // true if the InputStream is not already randomly seek'd after the if/else block below: |
| boolean needSkip; |
| |
| long size = 0L, seekTo = 0L; |
| if (is == null) { |
| // if it's not in classpath, we load it as absolute filesystem path (e.g. Jenkins' home dir) |
| Path file = Paths.get(path); |
| size = Files.size(file); |
| if (path.endsWith(".gz")) { |
| // if it is a gzip file, we need to use InputStream and seek to one of the pre-computed skip points: |
| is = Files.newInputStream(file); |
| needSkip = true; |
| } else { |
| // file is not compressed: optimized seek using SeekableByteChannel |
| seekTo = randomSeekPos(random, size); |
| final SeekableByteChannel channel = Files.newByteChannel(file); |
| if (LuceneTestCase.VERBOSE) { |
| System.out.println("TEST: LineFileDocs: file seek to fp=" + seekTo + " on open"); |
| } |
| channel.position(seekTo); |
| is = Channels.newInputStream(channel); |
| |
| // read until newline char, otherwise we may hit "java.nio.charset.MalformedInputException: Input length = 1" |
| // exception in readline() below, because we seeked part way through a multi-byte (in UTF-8) encoded |
| // unicode character: |
| if (seekTo > 0L) { |
| int b; |
| do { |
| b = is.read(); |
| } while (b >= 0 && b != 13 && b != 10); |
| } |
| |
| needSkip = false; |
| } |
| } else { |
| // if the file comes from Classpath: |
| size = is.available(); |
| needSkip = true; |
| } |
| |
| if (needSkip) { |
| |
| // LUCENE-9191: use the optimized (pre-computed, using dev-tools/scripts/create_line_file_docs.py) |
| // seek file, so we can seek in a gzip'd file |
| |
| int index = path.lastIndexOf('.'); |
| if (index == -1) { |
| throw new IllegalArgumentException("could not determine extension for path \"" + path + "\""); |
| } |
| |
| // e.g. foo.txt --> foo.seek, foo.txt.gz --> foo.txt.seek |
| String seekFilePath = path.substring(0, index) + ".seek"; |
| InputStream seekIS = getClass().getResourceAsStream(seekFilePath); |
| if (seekIS == null) { |
| seekIS = Files.newInputStream(Paths.get(seekFilePath)); |
| } |
| |
| try (BufferedReader reader = new BufferedReader(new InputStreamReader(seekIS, |
| StandardCharsets.UTF_8))) { |
| List<Long> skipPoints = new ArrayList<>(); |
| |
| // explicitly insert implicit 0 as the first skip point: |
| skipPoints.add(0L); |
| |
| while (true) { |
| String line = reader.readLine(); |
| if (line == null) { |
| break; |
| } |
| skipPoints.add(Long.parseLong(line.trim())); |
| } |
| |
| seekTo = skipPoints.get(random.nextInt(skipPoints.size())); |
| |
| // dev-tools/scripts/create_line_file_docs.py ensures this is a "safe" skip point, and we |
| // can begin gunziping from here: |
| is.skip(seekTo); |
| is = new GZIPInputStream(is); |
| |
| if (LuceneTestCase.VERBOSE) { |
| System.out.println("TEST: LineFileDocs: stream skip to fp=" + seekTo + " on open"); |
| } |
| } |
| } |
| |
| CharsetDecoder decoder = StandardCharsets.UTF_8.newDecoder() |
| .onMalformedInput(CodingErrorAction.REPORT) |
| .onUnmappableCharacter(CodingErrorAction.REPORT); |
| reader = new BufferedReader(new InputStreamReader(is, decoder), BUFFER_SIZE); |
| } |
| |
| public synchronized void reset() throws IOException { |
| reader.close(); |
| reader = null; |
| open(); |
| id.set(0); |
| } |
| |
| private final static char SEP = '\t'; |
| |
| private static final class DocState { |
| final Document doc; |
| final Field titleTokenized; |
| final Field title; |
| final Field titleDV; |
| final Field body; |
| final Field id; |
| final Field idNum; |
| final Field idNumDV; |
| final Field date; |
| |
| public DocState() { |
| doc = new Document(); |
| |
| title = new StringField("title", "", Field.Store.NO); |
| doc.add(title); |
| |
| FieldType ft = new FieldType(TextField.TYPE_STORED); |
| ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS); |
| ft.setStoreTermVectors(true); |
| ft.setStoreTermVectorOffsets(true); |
| ft.setStoreTermVectorPositions(true); |
| |
| titleTokenized = new Field("titleTokenized", "", ft); |
| doc.add(titleTokenized); |
| |
| body = new Field("body", "", ft); |
| doc.add(body); |
| |
| id = new StringField("docid", "", Field.Store.YES); |
| doc.add(id); |
| |
| idNum = new IntPoint("docid_int", 0); |
| doc.add(idNum); |
| |
| date = new StringField("date", "", Field.Store.YES); |
| doc.add(date); |
| |
| titleDV = new SortedDocValuesField("titleDV", new BytesRef()); |
| idNumDV = new NumericDocValuesField("docid_intDV", 0); |
| doc.add(titleDV); |
| doc.add(idNumDV); |
| } |
| } |
| |
| private final CloseableThreadLocal<DocState> threadDocs = new CloseableThreadLocal<>(); |
| |
| /** Note: Document instance is re-used per-thread */ |
| public Document nextDoc() throws IOException { |
| String line; |
| synchronized(this) { |
| line = reader.readLine(); |
| if (line == null) { |
| // Always rewind at end: |
| if (LuceneTestCase.VERBOSE) { |
| System.out.println("TEST: LineFileDocs: now rewind file..."); |
| } |
| reader.close(); |
| reader = null; |
| open(); |
| line = reader.readLine(); |
| } |
| } |
| |
| DocState docState = threadDocs.get(); |
| if (docState == null) { |
| docState = new DocState(); |
| threadDocs.set(docState); |
| } |
| |
| int spot = line.indexOf(SEP); |
| if (spot == -1) { |
| throw new RuntimeException("line: [" + line + "] is in an invalid format !"); |
| } |
| int spot2 = line.indexOf(SEP, 1 + spot); |
| if (spot2 == -1) { |
| throw new RuntimeException("line: [" + line + "] is in an invalid format !"); |
| } |
| |
| docState.body.setStringValue(line.substring(1+spot2, line.length())); |
| final String title = line.substring(0, spot); |
| docState.title.setStringValue(title); |
| if (docState.titleDV != null) { |
| docState.titleDV.setBytesValue(new BytesRef(title)); |
| } |
| docState.titleTokenized.setStringValue(title); |
| docState.date.setStringValue(line.substring(1+spot, spot2)); |
| final int i = id.getAndIncrement(); |
| docState.id.setStringValue(Integer.toString(i)); |
| docState.idNum.setIntValue(i); |
| if (docState.idNumDV != null) { |
| docState.idNumDV.setLongValue(i); |
| } |
| |
| if (random.nextInt(5) == 4) { |
| // Make some sparse fields |
| Document doc = new Document(); |
| for(IndexableField field : docState.doc) { |
| doc.add(field); |
| } |
| |
| if (random.nextInt(3) == 1) { |
| int x = random.nextInt(4); |
| doc.add(new IntPoint("docLength" + x, line.length())); |
| } |
| |
| if (random.nextInt(3) == 1) { |
| int x = random.nextInt(4); |
| doc.add(new IntPoint("docTitleLength" + x, title.length())); |
| } |
| |
| if (random.nextInt(3) == 1) { |
| int x = random.nextInt(4); |
| doc.add(new NumericDocValuesField("docLength" + x, line.length())); |
| } |
| |
| // TODO: more random sparse fields here too |
| } |
| |
| return docState.doc; |
| } |
| } |