| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.lucene.search.suggest; |
| |
| import java.io.IOException; |
| import java.util.AbstractMap.SimpleEntry; |
| import java.util.ArrayList; |
| import java.util.HashMap; |
| import java.util.HashSet; |
| import java.util.Iterator; |
| import java.util.List; |
| import java.util.Map; |
| import java.util.Random; |
| import java.util.Set; |
| |
| import org.apache.lucene.analysis.Analyzer; |
| import org.apache.lucene.analysis.MockAnalyzer; |
| import org.apache.lucene.document.Document; |
| import org.apache.lucene.document.Field; |
| import org.apache.lucene.document.NumericDocValuesField; |
| import org.apache.lucene.document.StoredField; |
| import org.apache.lucene.document.TextField; |
| import org.apache.lucene.index.DirectoryReader; |
| import org.apache.lucene.index.IndexReader; |
| import org.apache.lucene.index.IndexWriterConfig; |
| import org.apache.lucene.index.IndexableField; |
| import org.apache.lucene.index.RandomIndexWriter; |
| import org.apache.lucene.index.Term; |
| import org.apache.lucene.search.spell.Dictionary; |
| import org.apache.lucene.store.Directory; |
| import org.apache.lucene.util.BytesRef; |
| import org.apache.lucene.util.IOUtils; |
| import org.apache.lucene.util.LuceneTestCase; |
| import org.junit.Test; |
| |
| // See: https://issues.apache.org/jira/browse/SOLR-12028 Tests cannot remove files on Windows machines occasionally |
| public class DocumentDictionaryTest extends LuceneTestCase { |
| |
| static final String FIELD_NAME = "f1"; |
| static final String WEIGHT_FIELD_NAME = "w1"; |
| static final String PAYLOAD_FIELD_NAME = "p1"; |
| static final String CONTEXT_FIELD_NAME = "c1"; |
| |
| @Test |
| public void testEmptyReader() throws IOException { |
| Directory dir = newDirectory(); |
| Analyzer analyzer = new MockAnalyzer(random()); |
| IndexWriterConfig iwc = newIndexWriterConfig(analyzer); |
| iwc.setMergePolicy(newLogMergePolicy()); |
| // Make sure the index is created? |
| RandomIndexWriter writer = new RandomIndexWriter(random(), dir, iwc); |
| writer.commit(); |
| writer.close(); |
| IndexReader ir = DirectoryReader.open(dir); |
| Dictionary dictionary = new DocumentDictionary(ir, FIELD_NAME, WEIGHT_FIELD_NAME, PAYLOAD_FIELD_NAME); |
| InputIterator inputIterator = dictionary.getEntryIterator(); |
| |
| assertNull(inputIterator.next()); |
| assertEquals(inputIterator.weight(), 0); |
| assertNull(inputIterator.payload()); |
| |
| IOUtils.close(ir, analyzer, dir); |
| } |
| |
| @Test |
| public void testBasic() throws IOException { |
| Directory dir = newDirectory(); |
| Analyzer analyzer = new MockAnalyzer(random()); |
| IndexWriterConfig iwc = newIndexWriterConfig(analyzer); |
| iwc.setMergePolicy(newLogMergePolicy()); |
| RandomIndexWriter writer = new RandomIndexWriter(random(), dir, iwc); |
| Map.Entry<List<String>, Map<String, Document>> res = generateIndexDocuments(atLeast(1000), false); |
| Map<String, Document> docs = res.getValue(); |
| List<String> invalidDocTerms = res.getKey(); |
| for(Document doc: docs.values()) { |
| writer.addDocument(doc); |
| } |
| writer.commit(); |
| writer.close(); |
| IndexReader ir = DirectoryReader.open(dir); |
| Dictionary dictionary = new DocumentDictionary(ir, FIELD_NAME, WEIGHT_FIELD_NAME, PAYLOAD_FIELD_NAME); |
| InputIterator inputIterator = dictionary.getEntryIterator(); |
| BytesRef f; |
| while((f = inputIterator.next())!=null) { |
| Document doc = docs.remove(f.utf8ToString()); |
| assertTrue(f.equals(new BytesRef(doc.get(FIELD_NAME)))); |
| IndexableField weightField = doc.getField(WEIGHT_FIELD_NAME); |
| assertEquals(inputIterator.weight(), (weightField != null) ? weightField.numericValue().longValue() : 0); |
| IndexableField payloadField = doc.getField(PAYLOAD_FIELD_NAME); |
| if (payloadField == null) assertTrue(inputIterator.payload().length == 0); |
| else assertEquals(inputIterator.payload(), payloadField.binaryValue()); |
| } |
| |
| for (String invalidTerm : invalidDocTerms) { |
| assertNotNull(docs.remove(invalidTerm)); |
| } |
| assertTrue(docs.isEmpty()); |
| |
| IOUtils.close(ir, analyzer, dir); |
| } |
| |
| @Test |
| public void testWithOptionalPayload() throws IOException { |
| Directory dir = newDirectory(); |
| Analyzer analyzer = new MockAnalyzer(random()); |
| IndexWriterConfig iwc = newIndexWriterConfig(analyzer); |
| iwc.setMergePolicy(newLogMergePolicy()); |
| RandomIndexWriter writer = new RandomIndexWriter(random(), dir, iwc); |
| |
| // Create a document that is missing the payload field |
| Document doc = new Document(); |
| Field field = new TextField(FIELD_NAME, "some field", Field.Store.YES); |
| doc.add(field); |
| // do not store the payload or the contexts |
| Field weight = new NumericDocValuesField(WEIGHT_FIELD_NAME, 100); |
| doc.add(weight); |
| writer.addDocument(doc); |
| writer.commit(); |
| writer.close(); |
| IndexReader ir = DirectoryReader.open(dir); |
| |
| // Even though the payload field is missing, the dictionary iterator should not skip the document |
| // because the payload field is optional. |
| Dictionary dictionaryOptionalPayload = |
| new DocumentDictionary(ir, FIELD_NAME, WEIGHT_FIELD_NAME, PAYLOAD_FIELD_NAME); |
| InputIterator inputIterator = dictionaryOptionalPayload.getEntryIterator(); |
| BytesRef f = inputIterator.next(); |
| assertTrue(f.equals(new BytesRef(doc.get(FIELD_NAME)))); |
| IndexableField weightField = doc.getField(WEIGHT_FIELD_NAME); |
| assertEquals(inputIterator.weight(), weightField.numericValue().longValue()); |
| IndexableField payloadField = doc.getField(PAYLOAD_FIELD_NAME); |
| assertNull(payloadField); |
| assertTrue(inputIterator.payload().length == 0); |
| IOUtils.close(ir, analyzer, dir); |
| } |
| |
| @Test |
| public void testWithoutPayload() throws IOException { |
| Directory dir = newDirectory(); |
| Analyzer analyzer = new MockAnalyzer(random()); |
| IndexWriterConfig iwc = newIndexWriterConfig(analyzer); |
| iwc.setMergePolicy(newLogMergePolicy()); |
| RandomIndexWriter writer = new RandomIndexWriter(random(), dir, iwc); |
| Map.Entry<List<String>, Map<String, Document>> res = generateIndexDocuments(atLeast(1000), false); |
| Map<String, Document> docs = res.getValue(); |
| List<String> invalidDocTerms = res.getKey(); |
| for(Document doc: docs.values()) { |
| writer.addDocument(doc); |
| } |
| writer.commit(); |
| writer.close(); |
| IndexReader ir = DirectoryReader.open(dir); |
| Dictionary dictionary = new DocumentDictionary(ir, FIELD_NAME, WEIGHT_FIELD_NAME); |
| InputIterator inputIterator = dictionary.getEntryIterator(); |
| BytesRef f; |
| while((f = inputIterator.next())!=null) { |
| Document doc = docs.remove(f.utf8ToString()); |
| assertTrue(f.equals(new BytesRef(doc.get(FIELD_NAME)))); |
| IndexableField weightField = doc.getField(WEIGHT_FIELD_NAME); |
| assertEquals(inputIterator.weight(), (weightField != null) ? weightField.numericValue().longValue() : 0); |
| assertNull(inputIterator.payload()); |
| } |
| |
| for (String invalidTerm : invalidDocTerms) { |
| assertNotNull(docs.remove(invalidTerm)); |
| } |
| |
| assertTrue(docs.isEmpty()); |
| |
| IOUtils.close(ir, analyzer, dir); |
| } |
| |
| @Test |
| public void testWithContexts() throws IOException { |
| Directory dir = newDirectory(); |
| Analyzer analyzer = new MockAnalyzer(random()); |
| IndexWriterConfig iwc = newIndexWriterConfig(analyzer); |
| iwc.setMergePolicy(newLogMergePolicy()); |
| RandomIndexWriter writer = new RandomIndexWriter(random(), dir, iwc); |
| Map.Entry<List<String>, Map<String, Document>> res = generateIndexDocuments(atLeast(1000), true); |
| Map<String, Document> docs = res.getValue(); |
| List<String> invalidDocTerms = res.getKey(); |
| for(Document doc: docs.values()) { |
| writer.addDocument(doc); |
| } |
| writer.commit(); |
| writer.close(); |
| IndexReader ir = DirectoryReader.open(dir); |
| Dictionary dictionary = new DocumentDictionary(ir, FIELD_NAME, WEIGHT_FIELD_NAME, PAYLOAD_FIELD_NAME, CONTEXT_FIELD_NAME); |
| InputIterator inputIterator = dictionary.getEntryIterator(); |
| BytesRef f; |
| while((f = inputIterator.next())!=null) { |
| Document doc = docs.remove(f.utf8ToString()); |
| assertTrue(f.equals(new BytesRef(doc.get(FIELD_NAME)))); |
| IndexableField weightField = doc.getField(WEIGHT_FIELD_NAME); |
| assertEquals(inputIterator.weight(), (weightField != null) ? weightField.numericValue().longValue() : 0); |
| IndexableField payloadField = doc.getField(PAYLOAD_FIELD_NAME); |
| if (payloadField == null) assertTrue(inputIterator.payload().length == 0); |
| else assertEquals(inputIterator.payload(), payloadField.binaryValue()); |
| Set<BytesRef> oriCtxs = new HashSet<>(); |
| Set<BytesRef> contextSet = inputIterator.contexts(); |
| for (IndexableField ctxf : doc.getFields(CONTEXT_FIELD_NAME)) { |
| oriCtxs.add(ctxf.binaryValue()); |
| } |
| assertEquals(oriCtxs.size(), contextSet.size()); |
| } |
| |
| for (String invalidTerm : invalidDocTerms) { |
| assertNotNull(docs.remove(invalidTerm)); |
| } |
| assertTrue(docs.isEmpty()); |
| |
| IOUtils.close(ir, analyzer, dir); |
| } |
| |
| @Test |
| public void testWithDeletions() throws IOException { |
| Directory dir = newDirectory(); |
| Analyzer analyzer = new MockAnalyzer(random()); |
| IndexWriterConfig iwc = newIndexWriterConfig(analyzer); |
| iwc.setMergePolicy(newLogMergePolicy()); |
| RandomIndexWriter writer = new RandomIndexWriter(random(), dir, iwc); |
| Map.Entry<List<String>, Map<String, Document>> res = generateIndexDocuments(atLeast(1000), false); |
| Map<String, Document> docs = res.getValue(); |
| List<String> invalidDocTerms = res.getKey(); |
| Random rand = random(); |
| List<String> termsToDel = new ArrayList<>(); |
| for(Document doc : docs.values()) { |
| IndexableField f = doc.getField(FIELD_NAME); |
| if(rand.nextBoolean() && f != null && !invalidDocTerms.contains(f.stringValue())) { |
| termsToDel.add(doc.get(FIELD_NAME)); |
| } |
| writer.addDocument(doc); |
| } |
| writer.commit(); |
| |
| Term[] delTerms = new Term[termsToDel.size()]; |
| for(int i=0; i < termsToDel.size() ; i++) { |
| delTerms[i] = new Term(FIELD_NAME, termsToDel.get(i)); |
| } |
| |
| for(Term delTerm: delTerms) { |
| writer.deleteDocuments(delTerm); |
| } |
| writer.commit(); |
| writer.close(); |
| |
| for(String termToDel: termsToDel) { |
| assertTrue(null!=docs.remove(termToDel)); |
| } |
| |
| IndexReader ir = DirectoryReader.open(dir); |
| assertEquals(ir.numDocs(), docs.size()); |
| Dictionary dictionary = new DocumentDictionary(ir, FIELD_NAME, WEIGHT_FIELD_NAME); |
| InputIterator inputIterator = dictionary.getEntryIterator(); |
| BytesRef f; |
| while((f = inputIterator.next())!=null) { |
| Document doc = docs.remove(f.utf8ToString()); |
| assertTrue(f.equals(new BytesRef(doc.get(FIELD_NAME)))); |
| IndexableField weightField = doc.getField(WEIGHT_FIELD_NAME); |
| assertEquals(inputIterator.weight(), (weightField != null) ? weightField.numericValue().longValue() : 0); |
| assertNull(inputIterator.payload()); |
| } |
| |
| for (String invalidTerm : invalidDocTerms) { |
| assertNotNull(docs.remove(invalidTerm)); |
| } |
| assertTrue(docs.isEmpty()); |
| |
| IOUtils.close(ir, analyzer, dir); |
| } |
| |
| @Test |
| public void testMultiValuedField() throws IOException { |
| Directory dir = newDirectory(); |
| Analyzer analyzer = new MockAnalyzer(random()); |
| IndexWriterConfig iwc = newIndexWriterConfig(random(), analyzer); |
| iwc.setMergePolicy(newLogMergePolicy()); |
| RandomIndexWriter writer = new RandomIndexWriter(random(), dir, iwc); |
| |
| List<Suggestion> suggestions = indexMultiValuedDocuments(atLeast(1000), writer); |
| writer.commit(); |
| writer.close(); |
| |
| IndexReader ir = DirectoryReader.open(dir); |
| Dictionary dictionary = new DocumentDictionary(ir, FIELD_NAME, WEIGHT_FIELD_NAME, PAYLOAD_FIELD_NAME, CONTEXT_FIELD_NAME); |
| InputIterator inputIterator = dictionary.getEntryIterator(); |
| BytesRef f; |
| Iterator<Suggestion> suggestionsIter = suggestions.iterator(); |
| while((f = inputIterator.next())!=null) { |
| Suggestion nextSuggestion = suggestionsIter.next(); |
| assertTrue(f.equals(nextSuggestion.term)); |
| long weight = nextSuggestion.weight; |
| assertEquals(inputIterator.weight(), (weight != -1) ? weight : 0); |
| assertEquals(inputIterator.payload(), nextSuggestion.payload); |
| assertTrue(inputIterator.contexts().equals(nextSuggestion.contexts)); |
| } |
| assertFalse(suggestionsIter.hasNext()); |
| IOUtils.close(ir, analyzer, dir); |
| } |
| |
| /** Returns Pair(list of invalid document terms, Map of document term -> document) */ |
| private Map.Entry<List<String>, Map<String, Document>> generateIndexDocuments(int ndocs, boolean requiresContexts) { |
| Map<String, Document> docs = new HashMap<>(); |
| List<String> invalidDocTerms = new ArrayList<>(); |
| for(int i = 0; i < ndocs ; i++) { |
| Document doc = new Document(); |
| boolean invalidDoc = false; |
| Field field = null; |
| // usually have valid term field in document |
| if (usually()) { |
| field = new TextField(FIELD_NAME, "field_" + i, Field.Store.YES); |
| doc.add(field); |
| } else { |
| invalidDoc = true; |
| } |
| |
| // even if payload is not required usually have it |
| if (usually()) { |
| Field payload = new StoredField(PAYLOAD_FIELD_NAME, new BytesRef("payload_" + i)); |
| doc.add(payload); |
| } |
| |
| if (requiresContexts || usually()) { |
| if (usually()) { |
| for (int j = 0; j < atLeast(2); j++) { |
| doc.add(new StoredField(CONTEXT_FIELD_NAME, new BytesRef("context_" + i + "_"+ j))); |
| } |
| } |
| // we should allow entries without context |
| } |
| |
| // usually have valid weight field in document |
| if (usually()) { |
| Field weight = (rarely()) ? |
| new StoredField(WEIGHT_FIELD_NAME, 100d + i) : |
| new NumericDocValuesField(WEIGHT_FIELD_NAME, 100 + i); |
| doc.add(weight); |
| } |
| |
| String term = null; |
| if (invalidDoc) { |
| term = (field!=null) ? field.stringValue() : "invalid_" + i; |
| invalidDocTerms.add(term); |
| } else { |
| term = field.stringValue(); |
| } |
| |
| docs.put(term, doc); |
| } |
| return new SimpleEntry<>(invalidDocTerms, docs); |
| } |
| |
| private List<Suggestion> indexMultiValuedDocuments(int numDocs, RandomIndexWriter writer) throws IOException { |
| List<Suggestion> suggestionList = new ArrayList<>(numDocs); |
| |
| for(int i=0; i<numDocs; i++) { |
| Document doc = new Document(); |
| Field field; |
| BytesRef payloadValue; |
| Set<BytesRef> contextValues = new HashSet<>(); |
| long numericValue = -1; //-1 for missing weight |
| BytesRef term; |
| |
| payloadValue = new BytesRef("payload_" + i); |
| field = new StoredField(PAYLOAD_FIELD_NAME, payloadValue); |
| doc.add(field); |
| |
| if (usually()) { |
| numericValue = 100 + i; |
| field = new NumericDocValuesField(WEIGHT_FIELD_NAME, numericValue); |
| doc.add(field); |
| } |
| |
| int numContexts = atLeast(1); |
| for (int j=0; j<numContexts; j++) { |
| BytesRef contextValue = new BytesRef("context_" + i + "_" + j); |
| field = new StoredField(CONTEXT_FIELD_NAME, contextValue); |
| doc.add(field); |
| contextValues.add(contextValue); |
| } |
| |
| int numSuggestions = atLeast(2); |
| for (int j=0; j<numSuggestions; j++) { |
| term = new BytesRef("field_" + i + "_" + j); |
| field = new StoredField(FIELD_NAME, term); |
| doc.add(field); |
| |
| Suggestion suggestionValue = new Suggestion(); |
| suggestionValue.payload = payloadValue; |
| suggestionValue.contexts = contextValues; |
| suggestionValue.weight = numericValue; |
| suggestionValue.term = term; |
| suggestionList.add(suggestionValue); |
| } |
| writer.addDocument(doc); |
| } |
| return suggestionList; |
| } |
| |
| private static class Suggestion { |
| private long weight; |
| private BytesRef payload; |
| private Set<BytesRef> contexts; |
| private BytesRef term; |
| } |
| |
| |
| } |