| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.lucene.benchmark.byTask.feeds; |
| |
| import java.io.Closeable; |
| import java.io.IOException; |
| import java.io.UnsupportedEncodingException; |
| import java.nio.charset.StandardCharsets; |
| import java.text.ParsePosition; |
| import java.text.SimpleDateFormat; |
| import java.util.Calendar; |
| import java.util.Date; |
| import java.util.HashMap; |
| import java.util.Locale; |
| import java.util.Map; |
| import java.util.Properties; |
| import java.util.Random; |
| import java.util.TimeZone; |
| import java.util.concurrent.atomic.AtomicInteger; |
| import org.apache.lucene.benchmark.byTask.utils.Config; |
| import org.apache.lucene.document.Document; |
| import org.apache.lucene.document.DoublePoint; |
| import org.apache.lucene.document.Field; |
| import org.apache.lucene.document.FieldType; |
| import org.apache.lucene.document.FloatPoint; |
| import org.apache.lucene.document.IntPoint; |
| import org.apache.lucene.document.LongPoint; |
| import org.apache.lucene.document.StringField; |
| import org.apache.lucene.document.TextField; |
| import org.apache.lucene.index.IndexOptions; |
| |
| /** |
| * Creates {@link Document} objects. Uses a {@link ContentSource} to generate {@link DocData} |
| * objects. Supports the following parameters: |
| * |
| * <ul> |
| * <li><b>content.source</b> - specifies the {@link ContentSource} class to use (default |
| * <b>SingleDocSource</b>). |
| * <li><b>doc.stored</b> - specifies whether fields should be stored (default <b>false</b>). |
| * <li><b>doc.body.stored</b> - specifies whether the body field should be stored (default = |
| * <b>doc.stored</b>). |
| * <li><b>doc.tokenized</b> - specifies whether fields should be tokenized (default <b>true</b>). |
| * <li><b>doc.body.tokenized</b> - specifies whether the body field should be tokenized (default = |
| * <b>doc.tokenized</b>). |
| * <li><b>doc.body.offsets</b> - specifies whether to add offsets into the postings index for the |
| * body field. It is useful for highlighting. (default <b>false</b>) |
| * <li><b>doc.tokenized.norms</b> - specifies whether norms should be stored in the index or not. |
| * (default <b>false</b>). |
| * <li><b>doc.body.tokenized.norms</b> - specifies whether norms should be stored in the index for |
| * the body field. This can be set to true, while <code>doc.tokenized.norms</code> is set to |
| * false, to allow norms storing just for the body field. (default <b>true</b>). |
| * <li><b>doc.term.vector</b> - specifies whether term vectors should be stored for fields |
| * (default <b>false</b>). |
| * <li><b>doc.term.vector.positions</b> - specifies whether term vectors should be stored with |
| * positions (default <b>false</b>). |
| * <li><b>doc.term.vector.offsets</b> - specifies whether term vectors should be stored with |
| * offsets (default <b>false</b>). |
| * <li><b>doc.store.body.bytes</b> - specifies whether to store the raw bytes of the document's |
| * content in the document (default <b>false</b>). |
| * <li><b>doc.reuse.fields</b> - specifies whether Field and Document objects should be reused |
| * (default <b>true</b>). |
| * <li><b>doc.index.props</b> - specifies whether the properties returned by {@link |
| * DocData#getProps()} will be indexed. (default <b>false</b>). |
| * <li><b>doc.random.id.limit</b> - if specified, docs will be assigned random IDs from 0 |
| * (inclusive) to this limit (exclusive). This is useful with UpdateDoc for testing performance |
| * of IndexWriter.updateDocument. (default <b>-1</b>, meaning random IDs are disabled). |
| * </ul> |
| */ |
| public class DocMaker implements Closeable { |
| |
| private static class LeftOver { |
| private DocData docdata; |
| private int cnt; |
| } |
| |
| private Random r; |
| private int updateDocIDLimit; |
| |
| /** |
| * Document state, supports reuse of field instances across documents (see <code>reuseFields |
| * </code> parameter). |
| */ |
| protected static class DocState { |
| |
| private final Map<String, Field> fields; |
| private final Map<String, Field> numericFields; |
| private final boolean reuseFields; |
| final Document doc; |
| DocData docData = new DocData(); |
| |
| public DocState(boolean reuseFields, FieldType ft, FieldType bodyFt) { |
| |
| this.reuseFields = reuseFields; |
| |
| if (reuseFields) { |
| fields = new HashMap<>(); |
| numericFields = new HashMap<>(); |
| |
| // Initialize the map with the default fields. |
| fields.put(BODY_FIELD, new Field(BODY_FIELD, "", bodyFt)); |
| fields.put(TITLE_FIELD, new Field(TITLE_FIELD, "", ft)); |
| fields.put(DATE_FIELD, new Field(DATE_FIELD, "", ft)); |
| fields.put(ID_FIELD, new StringField(ID_FIELD, "", Field.Store.YES)); |
| fields.put(NAME_FIELD, new Field(NAME_FIELD, "", ft)); |
| |
| numericFields.put(DATE_MSEC_FIELD, new LongPoint(DATE_MSEC_FIELD, 0L)); |
| numericFields.put(TIME_SEC_FIELD, new IntPoint(TIME_SEC_FIELD, 0)); |
| |
| doc = new Document(); |
| } else { |
| numericFields = null; |
| fields = null; |
| doc = null; |
| } |
| } |
| |
| /** |
| * Returns a field corresponding to the field name. If <code>reuseFields</code> was set to true, |
| * then it attempts to reuse a Field instance. If such a field does not exist, it creates a new |
| * one. |
| */ |
| Field getField(String name, FieldType ft) { |
| if (!reuseFields) { |
| return new Field(name, "", ft); |
| } |
| |
| Field f = fields.get(name); |
| if (f == null) { |
| f = new Field(name, "", ft); |
| fields.put(name, f); |
| } |
| return f; |
| } |
| |
| Field getNumericField(String name, Class<? extends Number> numericType) { |
| Field f; |
| if (reuseFields) { |
| f = numericFields.get(name); |
| } else { |
| f = null; |
| } |
| |
| if (f == null) { |
| if (numericType.equals(Integer.class)) { |
| f = new IntPoint(name, 0); |
| } else if (numericType.equals(Long.class)) { |
| f = new LongPoint(name, 0L); |
| } else if (numericType.equals(Float.class)) { |
| f = new FloatPoint(name, 0.0F); |
| } else if (numericType.equals(Double.class)) { |
| f = new DoublePoint(name, 0.0); |
| } else { |
| throw new UnsupportedOperationException("Unsupported numeric type: " + numericType); |
| } |
| if (reuseFields) { |
| numericFields.put(name, f); |
| } |
| } |
| return f; |
| } |
| } |
| |
| private boolean storeBytes = false; |
| |
| private static class DateUtil { |
| public SimpleDateFormat parser = new SimpleDateFormat("dd-MMM-yyyy HH:mm:ss", Locale.ENGLISH); |
| public Calendar cal = Calendar.getInstance(TimeZone.getTimeZone("GMT"), Locale.ROOT); |
| public ParsePosition pos = new ParsePosition(0); |
| |
| public DateUtil() { |
| parser.setLenient(true); |
| } |
| } |
| |
| // leftovers are thread local, because it is unsafe to share residues between threads |
| private ThreadLocal<LeftOver> leftovr = new ThreadLocal<>(); |
| private ThreadLocal<DocState> docState = new ThreadLocal<>(); |
| private ThreadLocal<DateUtil> dateParsers = new ThreadLocal<>(); |
| |
| public static final String BODY_FIELD = "body"; |
| public static final String TITLE_FIELD = "doctitle"; |
| public static final String DATE_FIELD = "docdate"; |
| public static final String DATE_MSEC_FIELD = "docdatenum"; |
| public static final String TIME_SEC_FIELD = "doctimesecnum"; |
| public static final String ID_FIELD = "docid"; |
| public static final String BYTES_FIELD = "bytes"; |
| public static final String NAME_FIELD = "docname"; |
| |
| protected Config config; |
| |
| protected FieldType valType; |
| protected FieldType bodyValType; |
| |
| protected ContentSource source; |
| protected boolean reuseFields; |
| protected boolean indexProperties; |
| |
| private final AtomicInteger numDocsCreated = new AtomicInteger(); |
| |
| public DocMaker() {} |
| |
| // create a doc |
| // use only part of the body, modify it to keep the rest (or use all if size==0). |
| // reset the docdata properties so they are not added more than once. |
| private Document createDocument(DocData docData, int size, int cnt) |
| throws UnsupportedEncodingException { |
| |
| final DocState ds = getDocState(); |
| final Document doc = reuseFields ? ds.doc : new Document(); |
| doc.clear(); |
| |
| // Set ID_FIELD |
| FieldType ft = new FieldType(valType); |
| ft.setStored(true); |
| |
| Field idField = ds.getField(ID_FIELD, ft); |
| int id; |
| if (r != null) { |
| id = r.nextInt(updateDocIDLimit); |
| } else { |
| id = docData.getID(); |
| if (id == -1) { |
| id = numDocsCreated.getAndIncrement(); |
| } |
| } |
| idField.setStringValue(Integer.toString(id)); |
| doc.add(idField); |
| |
| // Set NAME_FIELD |
| String name = docData.getName(); |
| if (name == null) name = ""; |
| name = cnt < 0 ? name : name + "_" + cnt; |
| Field nameField = ds.getField(NAME_FIELD, valType); |
| nameField.setStringValue(name); |
| doc.add(nameField); |
| |
| // Set DATE_FIELD |
| DateUtil util = dateParsers.get(); |
| if (util == null) { |
| util = new DateUtil(); |
| dateParsers.set(util); |
| } |
| Date date = null; |
| String dateString = docData.getDate(); |
| if (dateString != null) { |
| util.pos.setIndex(0); |
| date = util.parser.parse(dateString, util.pos); |
| // System.out.println(dateString + " parsed to " + date); |
| } else { |
| dateString = ""; |
| } |
| Field dateStringField = ds.getField(DATE_FIELD, valType); |
| dateStringField.setStringValue(dateString); |
| doc.add(dateStringField); |
| |
| if (date == null) { |
| // just set to right now |
| date = new Date(); |
| } |
| |
| Field dateField = ds.getNumericField(DATE_MSEC_FIELD, Long.class); |
| dateField.setLongValue(date.getTime()); |
| doc.add(dateField); |
| |
| util.cal.setTime(date); |
| final int sec = |
| util.cal.get(Calendar.HOUR_OF_DAY) * 3600 |
| + util.cal.get(Calendar.MINUTE) * 60 |
| + util.cal.get(Calendar.SECOND); |
| |
| Field timeSecField = ds.getNumericField(TIME_SEC_FIELD, Integer.class); |
| timeSecField.setIntValue(sec); |
| doc.add(timeSecField); |
| |
| // Set TITLE_FIELD |
| String title = docData.getTitle(); |
| Field titleField = ds.getField(TITLE_FIELD, valType); |
| titleField.setStringValue(title == null ? "" : title); |
| doc.add(titleField); |
| |
| String body = docData.getBody(); |
| if (body != null && body.length() > 0) { |
| String bdy; |
| if (size <= 0 || size >= body.length()) { |
| bdy = body; // use all |
| docData.setBody(""); // nothing left |
| } else { |
| // attempt not to break words - if whitespace found within next 20 chars... |
| for (int n = size - 1; n < size + 20 && n < body.length(); n++) { |
| if (Character.isWhitespace(body.charAt(n))) { |
| size = n; |
| break; |
| } |
| } |
| bdy = body.substring(0, size); // use part |
| docData.setBody(body.substring(size)); // some left |
| } |
| Field bodyField = ds.getField(BODY_FIELD, bodyValType); |
| bodyField.setStringValue(bdy); |
| doc.add(bodyField); |
| |
| if (storeBytes) { |
| Field bytesField = ds.getField(BYTES_FIELD, StringField.TYPE_STORED); |
| bytesField.setBytesValue(bdy.getBytes(StandardCharsets.UTF_8)); |
| doc.add(bytesField); |
| } |
| } |
| |
| if (indexProperties) { |
| Properties props = docData.getProps(); |
| if (props != null) { |
| for (final Map.Entry<Object, Object> entry : props.entrySet()) { |
| Field f = ds.getField((String) entry.getKey(), valType); |
| f.setStringValue((String) entry.getValue()); |
| doc.add(f); |
| } |
| docData.setProps(null); |
| } |
| } |
| |
| // System.out.println("============== Created doc "+numDocsCreated+" :\n"+doc+"\n=========="); |
| return doc; |
| } |
| |
| private void resetLeftovers() { |
| leftovr.set(null); |
| } |
| |
| protected DocState getDocState() { |
| DocState ds = docState.get(); |
| if (ds == null) { |
| ds = new DocState(reuseFields, valType, bodyValType); |
| docState.set(ds); |
| } |
| return ds; |
| } |
| |
| /** |
| * Closes the {@link DocMaker}. The base implementation closes the {@link ContentSource}, and it |
| * can be overridden to do more work (but make sure to call super.close()). |
| */ |
| @Override |
| public void close() throws IOException { |
| source.close(); |
| } |
| |
| /** |
| * Creates a {@link Document} object ready for indexing. This method uses the {@link |
| * ContentSource} to get the next document from the source, and creates a {@link Document} object |
| * from the returned fields. If <code>reuseFields</code> was set to true, it will reuse {@link |
| * Document} and {@link Field} instances. |
| */ |
| public Document makeDocument() throws Exception { |
| resetLeftovers(); |
| DocData docData = source.getNextDocData(getDocState().docData); |
| Document doc = createDocument(docData, 0, -1); |
| return doc; |
| } |
| |
| /** |
| * Same as {@link #makeDocument()}, only this method creates a document of the given size input by |
| * <code>size</code>. |
| */ |
| public Document makeDocument(int size) throws Exception { |
| LeftOver lvr = leftovr.get(); |
| if (lvr == null |
| || lvr.docdata == null |
| || lvr.docdata.getBody() == null |
| || lvr.docdata.getBody().length() == 0) { |
| resetLeftovers(); |
| } |
| DocData docData = getDocState().docData; |
| DocData dd = (lvr == null ? source.getNextDocData(docData) : lvr.docdata); |
| int cnt = (lvr == null ? 0 : lvr.cnt); |
| while (dd.getBody() == null || dd.getBody().length() < size) { |
| DocData dd2 = dd; |
| dd = source.getNextDocData(new DocData()); |
| cnt = 0; |
| dd.setBody(dd2.getBody() + dd.getBody()); |
| } |
| Document doc = createDocument(dd, size, cnt); |
| if (dd.getBody() == null || dd.getBody().length() == 0) { |
| resetLeftovers(); |
| } else { |
| if (lvr == null) { |
| lvr = new LeftOver(); |
| leftovr.set(lvr); |
| } |
| lvr.docdata = dd; |
| lvr.cnt = ++cnt; |
| } |
| return doc; |
| } |
| |
| /** Reset inputs so that the test run would behave, input wise, as if it just started. */ |
| public synchronized void resetInputs() throws IOException { |
| source.printStatistics("docs"); |
| // re-initiate since properties by round may have changed. |
| setConfig(config, source); |
| source.resetInputs(); |
| numDocsCreated.set(0); |
| resetLeftovers(); |
| } |
| |
| /** Set the configuration parameters of this doc maker. */ |
| public void setConfig(Config config, ContentSource source) { |
| this.config = config; |
| this.source = source; |
| |
| boolean stored = config.get("doc.stored", false); |
| boolean bodyStored = config.get("doc.body.stored", stored); |
| boolean tokenized = config.get("doc.tokenized", true); |
| boolean bodyTokenized = config.get("doc.body.tokenized", tokenized); |
| boolean norms = config.get("doc.tokenized.norms", false); |
| boolean bodyNorms = config.get("doc.body.tokenized.norms", true); |
| boolean bodyOffsets = config.get("doc.body.offsets", false); |
| boolean termVec = config.get("doc.term.vector", false); |
| boolean termVecPositions = config.get("doc.term.vector.positions", false); |
| boolean termVecOffsets = config.get("doc.term.vector.offsets", false); |
| |
| valType = new FieldType(TextField.TYPE_NOT_STORED); |
| valType.setStored(stored); |
| valType.setTokenized(tokenized); |
| valType.setOmitNorms(!norms); |
| valType.setStoreTermVectors(termVec); |
| valType.setStoreTermVectorPositions(termVecPositions); |
| valType.setStoreTermVectorOffsets(termVecOffsets); |
| valType.freeze(); |
| |
| bodyValType = new FieldType(TextField.TYPE_NOT_STORED); |
| bodyValType.setStored(bodyStored); |
| bodyValType.setTokenized(bodyTokenized); |
| bodyValType.setOmitNorms(!bodyNorms); |
| if (bodyTokenized && bodyOffsets) { |
| bodyValType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); |
| } |
| bodyValType.setStoreTermVectors(termVec); |
| bodyValType.setStoreTermVectorPositions(termVecPositions); |
| bodyValType.setStoreTermVectorOffsets(termVecOffsets); |
| bodyValType.freeze(); |
| |
| storeBytes = config.get("doc.store.body.bytes", false); |
| |
| reuseFields = config.get("doc.reuse.fields", true); |
| |
| // In a multi-rounds run, it is important to reset DocState since settings |
| // of fields may change between rounds, and this is the only way to reset |
| // the cache of all threads. |
| docState = new ThreadLocal<>(); |
| |
| indexProperties = config.get("doc.index.props", false); |
| |
| updateDocIDLimit = config.get("doc.random.id.limit", -1); |
| if (updateDocIDLimit != -1) { |
| r = new Random(179); |
| } |
| } |
| } |