/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.document;
import java.io.IOException;
import java.util.Collections;
import java.util.HashSet;
import java.util.Set;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.MultiReader;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.QueryUtils;
import org.apache.lucene.search.ScoreMode;
import org.apache.lucene.search.Scorer;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.Weight;
import org.apache.lucene.search.similarities.BM25Similarity;
import org.apache.lucene.search.similarities.Similarity.SimScorer;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.LuceneTestCase;
public class TestFeatureField extends LuceneTestCase {
/** Round a float value the same way that {@link FeatureField} rounds feature values. */
private static float round(float f) {
int bits = Float.floatToIntBits(f);
bits &= ~0 << 15; // clear last 15 bits
return Float.intBitsToFloat(bits);
}
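
  /**
   * Indexes a few documents with "pagerank" and "urlLen" feature values and checks that the
   * log, linear, saturation and sigmoid feature queries produce the hand-computed scores
   * asserted below. The factor of 6 in the expectations is the createWeight boost (2) times
   * the per-query weight (3).
   */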
public void testBasics() throws Exception {
Directory dir = newDirectory();
RandomIndexWriter writer = new RandomIndexWriter(random(), dir, newIndexWriterConfig()
.setMergePolicy(newLogMergePolicy(random().nextBoolean())));
Document doc = new Document();
FeatureField pagerank = new FeatureField("features", "pagerank", 1);
FeatureField urlLength = new FeatureField("features", "urlLen", 1);
doc.add(pagerank);
doc.add(urlLength);
pagerank.setFeatureValue(10);
urlLength.setFeatureValue(1f / 24);
writer.addDocument(doc);
pagerank.setFeatureValue(100);
urlLength.setFeatureValue(1f / 20);
writer.addDocument(doc);
writer.addDocument(new Document()); // gap
pagerank.setFeatureValue(1);
urlLength.setFeatureValue(1f / 100);
writer.addDocument(doc);
pagerank.setFeatureValue(42);
urlLength.setFeatureValue(1f / 23);
writer.addDocument(doc);
writer.forceMerge(1);
DirectoryReader reader = writer.getReader();
writer.close();
IndexSearcher searcher = new IndexSearcher(reader);
LeafReaderContext context = reader.leaves().get(0);
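
    // Log function: expected score = boost (2) * weight (3) * ln(scalingFactor (4.5) + featureValue)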
Query q = FeatureField.newLogQuery("features", "pagerank", 3f, 4.5f);
Weight w = q.createWeight(searcher, ScoreMode.TOP_SCORES, 2);
Scorer s = w.scorer(context);
assertEquals(0, s.iterator().nextDoc());
assertEquals((float) (6.0 * Math.log(4.5f + 10)), s.score(), 0f);
assertEquals(1, s.iterator().nextDoc());
assertEquals((float) (6.0 * Math.log(4.5f + 100)), s.score(), 0f);
assertEquals(3, s.iterator().nextDoc());
assertEquals((float) (6.0 * Math.log(4.5f + 1)), s.score(), 0f);
assertEquals(4, s.iterator().nextDoc());
assertEquals((float) (6.0 * Math.log(4.5f + 42)), s.score(), 0f);
assertEquals(DocIdSetIterator.NO_MORE_DOCS, s.iterator().nextDoc());
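
    // Linear function: expected score = boost (2) * weight (3) * featureValue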
q = FeatureField.newLinearQuery("features", "pagerank", 3f);
w = q.createWeight(searcher, ScoreMode.TOP_SCORES, 2);
s = w.scorer(context);
assertEquals(0, s.iterator().nextDoc());
assertEquals((float) (6.0 * 10), s.score(), 0f);
assertEquals(1, s.iterator().nextDoc());
assertEquals((float) (6.0 * 100), s.score(), 0f);
assertEquals(3, s.iterator().nextDoc());
assertEquals((float) (6.0 * 1), s.score(), 0f);
assertEquals(4, s.iterator().nextDoc());
assertEquals((float) (6.0 * 42), s.score(), 0f);
assertEquals(DocIdSetIterator.NO_MORE_DOCS, s.iterator().nextDoc());
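
    // Saturation function: expected score = boost (2) * weight (3) * featureValue / (featureValue + pivot)
    // with pivot = 4.5, written below as 6 * (1 - pivot / (pivot + featureValue))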
q = FeatureField.newSaturationQuery("features", "pagerank", 3f, 4.5f);
w = q.createWeight(searcher, ScoreMode.TOP_SCORES, 2);
s = w.scorer(context);
assertEquals(0, s.iterator().nextDoc());
assertEquals(6f * (1 - 4.5f / (4.5f + 10)), s.score(), 0f);
assertEquals(1, s.iterator().nextDoc());
assertEquals(6f * (1 - 4.5f / (4.5f + 100)), s.score(), 0f);
assertEquals(3, s.iterator().nextDoc());
assertEquals(6f * (1 - 4.5f / (4.5f + 1)), s.score(), 0f);
assertEquals(4, s.iterator().nextDoc());
assertEquals(6f * (1 - 4.5f / (4.5f + 42)), s.score(), 0f);
assertEquals(DocIdSetIterator.NO_MORE_DOCS, s.iterator().nextDoc());
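
    // Sigmoid function: expected score = boost (2) * weight (3) * v^a / (v^a + pivot^a)
    // with pivot = 4.5 and exponent a = 0.6, written below as 6 * (1 - pivot^a / (pivot^a + v^a))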
q = FeatureField.newSigmoidQuery("features", "pagerank", 3f, 4.5f, 0.6f);
w = q.createWeight(searcher, ScoreMode.TOP_SCORES, 2);
s = w.scorer(context);
double kPa = Math.pow(4.5f, 0.6f);
assertEquals(0, s.iterator().nextDoc());
assertEquals((float) (6 * (1 - kPa / (kPa + Math.pow(10, 0.6f)))), s.score(), 0f);
assertEquals(1, s.iterator().nextDoc());
assertEquals((float) (6 * (1 - kPa / (kPa + Math.pow(100, 0.6f)))), s.score(), 0f);
assertEquals(3, s.iterator().nextDoc());
assertEquals((float) (6 * (1 - kPa / (kPa + Math.pow(1, 0.6f)))), s.score(), 0f);
assertEquals(4, s.iterator().nextDoc());
assertEquals((float) (6 * (1 - kPa / (kPa + Math.pow(42, 0.6f)))), s.score(), 0f);
assertEquals(DocIdSetIterator.NO_MORE_DOCS, s.iterator().nextDoc());
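
    // urlLen values lose precision when indexed, so the expected scores apply the same round()
    // as FeatureField to the indexed values before computing the saturation formula (pivot = 1/24)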
q = FeatureField.newSaturationQuery("features", "urlLen", 3f, 1f/24);
w = q.createWeight(searcher, ScoreMode.TOP_SCORES, 2);
s = w.scorer(context);
assertEquals(0, s.iterator().nextDoc());
assertEquals(6f * (1 - (1f/24) / (1f/24 + round(1f/24))), s.score(), 0f);
assertEquals(1, s.iterator().nextDoc());
assertEquals(6f * (1 - 1f/24 / (1f/24 + round(1f/20))), s.score(), 0f);
assertEquals(3, s.iterator().nextDoc());
assertEquals(6f * (1 - 1f/24 / (1f/24 + round(1f/100))), s.score(), 0f);
assertEquals(4, s.iterator().nextDoc());
assertEquals(6f * (1 - 1f/24 / (1f/24 + round(1f/23))), s.score(), 0f);
assertEquals(DocIdSetIterator.NO_MORE_DOCS, s.iterator().nextDoc());
reader.close();
dir.close();
}
public void testExplanations() throws Exception {
Directory dir = newDirectory();
RandomIndexWriter writer = new RandomIndexWriter(random(), dir, newIndexWriterConfig()
.setMergePolicy(newLogMergePolicy(random().nextBoolean())));
Document doc = new Document();
FeatureField pagerank = new FeatureField("features", "pagerank", 1);
doc.add(pagerank);
pagerank.setFeatureValue(10);
writer.addDocument(doc);
pagerank.setFeatureValue(100);
writer.addDocument(doc);
writer.addDocument(new Document()); // gap
pagerank.setFeatureValue(1);
writer.addDocument(doc);
pagerank.setFeatureValue(42);
writer.addDocument(doc);
DirectoryReader reader = writer.getReader();
writer.close();
IndexSearcher searcher = new IndexSearcher(reader);
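
    // QueryUtils.check runs generic sanity checks on a query, including that its explanations
    // are consistent with the scores it produces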
QueryUtils.check(random(), FeatureField.newLogQuery("features", "pagerank", 1f, 4.5f), searcher);
QueryUtils.check(random(), FeatureField.newLinearQuery("features", "pagerank", 1f), searcher);
QueryUtils.check(random(), FeatureField.newSaturationQuery("features", "pagerank", 1f, 12f), searcher);
QueryUtils.check(random(), FeatureField.newSigmoidQuery("features", "pagerank", 1f, 12f, 0.6f), searcher);
// Test boosts that are > 1
QueryUtils.check(random(), FeatureField.newLogQuery("features", "pagerank", 3f, 4.5f), searcher);
QueryUtils.check(random(), FeatureField.newLinearQuery("features", "pagerank", 3f), searcher);
QueryUtils.check(random(), FeatureField.newSaturationQuery("features", "pagerank", 3f, 12f), searcher);
QueryUtils.check(random(), FeatureField.newSigmoidQuery("features", "pagerank", 3f, 12f, 0.6f), searcher);
// Test boosts that are < 1
QueryUtils.check(random(), FeatureField.newLogQuery("features", "pagerank", .2f, 4.5f), searcher);
QueryUtils.check(random(), FeatureField.newLinearQuery("features", "pagerank", .2f), searcher);
QueryUtils.check(random(), FeatureField.newSaturationQuery("features", "pagerank", .2f, 12f), searcher);
QueryUtils.check(random(), FeatureField.newSigmoidQuery("features", "pagerank", .2f, 12f, 0.6f), searcher);
reader.close();
dir.close();
}
public void testLogSimScorer() {
doTestSimScorer(new FeatureField.LogFunction(4.5f).scorer(3f));
}
public void testLinearSimScorer() {
doTestSimScorer(new FeatureField.LinearFunction().scorer(1f));
}
public void testSatuSimScorer() {
doTestSimScorer(new FeatureField.SaturationFunction("foo", "bar", 20f).scorer(3f));
}
public void testSigmSimScorer() {
doTestSimScorer(new FeatureField.SigmoidFunction(20f, 0.6f).scorer(3f));
}
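
  /**
   * Feature values are indexed as term frequencies, so these SimScorers receive the feature
   * value through the {@code freq} argument. Verify that the score stays finite (needed to
   * compute maximum scores) and never decreases as the feature value grows.
   */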
private void doTestSimScorer(SimScorer s) {
float maxScore = s.score(Float.MAX_VALUE, 1);
assertTrue(Float.isFinite(maxScore)); // used to compute max scores
// Test that the score doesn't decrease with freq
for (int freq = 2; freq < 65536; ++freq) {
assertTrue(s.score(freq - 1, 1L) <= s.score(freq, 1L));
}
assertTrue(s.score(65535, 1L) <= maxScore);
}
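
  /**
   * computePivotFeatureValue should return a finite, positive default when the feature is
   * missing from the index, and otherwise approximate the geometric mean of the indexed
   * feature values.
   */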
public void testComputePivotFeatureValue() throws IOException {
Directory dir = newDirectory();
RandomIndexWriter writer = new RandomIndexWriter(random(), dir, newIndexWriterConfig());
// Make sure that we create a legal pivot on missing features
DirectoryReader reader = writer.getReader();
float pivot = FeatureField.computePivotFeatureValue(reader, "features", "pagerank");
assertTrue(Float.isFinite(pivot));
assertTrue(pivot > 0);
reader.close();
Document doc = new Document();
FeatureField pagerank = new FeatureField("features", "pagerank", 1);
doc.add(pagerank);
pagerank.setFeatureValue(10);
writer.addDocument(doc);
pagerank.setFeatureValue(100);
writer.addDocument(doc);
writer.addDocument(new Document()); // gap
pagerank.setFeatureValue(1);
writer.addDocument(doc);
pagerank.setFeatureValue(42);
writer.addDocument(doc);
reader = writer.getReader();
writer.close();
pivot = FeatureField.computePivotFeatureValue(reader, "features", "pagerank");
double expected = Math.pow(10 * 100 * 1 * 42, 1/4.); // geometric mean
assertEquals(expected, pivot, 0.1);
reader.close();
dir.close();
}
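
  /** The underlying feature term should only be extracted when the query needs scores. */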
public void testExtractTerms() throws IOException {
IndexReader reader = new MultiReader();
IndexSearcher searcher = newSearcher(reader);
Query query = FeatureField.newLogQuery("field", "term", 2f, 42);
Weight weight = searcher.createWeight(query, ScoreMode.COMPLETE_NO_SCORES, 1f);
Set<Term> terms = new HashSet<>();
weight.extractTerms(terms);
assertEquals(Collections.emptySet(), terms);
terms = new HashSet<>();
weight = searcher.createWeight(query, ScoreMode.COMPLETE, 1f);
weight.extractTerms(terms);
assertEquals(Collections.singleton(new Term("field", "term")), terms);
terms = new HashSet<>();
weight = searcher.createWeight(query, ScoreMode.TOP_SCORES, 1f);
weight.extractTerms(terms);
assertEquals(Collections.singleton(new Term("field", "term")), terms);
}
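
  /**
   * End-to-end example matching the {@link FeatureField} javadocs: a BM25-scored text query is
   * combined with a pagerank-based saturation boost, so among the text matches, documents with
   * a higher pagerank are pushed up in the ranking.
   */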
public void testDemo() throws IOException {
Directory dir = newDirectory();
RandomIndexWriter writer = new RandomIndexWriter(random(), dir, newIndexWriterConfig()
.setMergePolicy(newLogMergePolicy(random().nextBoolean())));
Document doc = new Document();
FeatureField pagerank = new FeatureField("features", "pagerank", 1);
doc.add(pagerank);
TextField body = new TextField("body", "", Store.NO);
doc.add(body);
pagerank.setFeatureValue(10);
body.setStringValue("Apache Lucene");
writer.addDocument(doc);
pagerank.setFeatureValue(1000);
body.setStringValue("Apache Web HTTP server");
writer.addDocument(doc);
pagerank.setFeatureValue(1);
body.setStringValue("Lucene is a search engine");
writer.addDocument(doc);
pagerank.setFeatureValue(42);
body.setStringValue("Lucene in the sky with diamonds");
writer.addDocument(doc);
DirectoryReader reader = writer.getReader();
writer.close();
// NOTE: If you need to make changes below, then you likely also need to
// update javadocs of FeatureField.
IndexSearcher searcher = new IndexSearcher(reader);
searcher.setSimilarity(new BM25Similarity());
Query query = new BooleanQuery.Builder()
.add(new TermQuery(new Term("body", "apache")), Occur.SHOULD)
.add(new TermQuery(new Term("body", "lucene")), Occur.SHOULD)
.build();
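    // The two-argument newSaturationQuery variant computes the pivot automatically from the
    // indexed feature values (see computePivotFeatureValue) rather than taking an explicit one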
Query boost = FeatureField.newSaturationQuery("features", "pagerank");
Query boostedQuery = new BooleanQuery.Builder()
.add(query, Occur.MUST)
.add(boost, Occur.SHOULD)
.build();
TopDocs topDocs = searcher.search(boostedQuery, 10);
assertEquals(4, topDocs.scoreDocs.length);
assertEquals(1, topDocs.scoreDocs[0].doc);
assertEquals(0, topDocs.scoreDocs[1].doc);
assertEquals(3, topDocs.scoreDocs[2].doc);
assertEquals(2, topDocs.scoreDocs[3].doc);
reader.close();
dir.close();
}
}