blob: c792ff6a2f25112a3bb2a80a71355cffefeba85c [file] [log] [blame]
Index: contrib/CHANGES.txt
===================================================================
--- contrib/CHANGES.txt (revision 886705)
+++ contrib/CHANGES.txt (working copy)
@@ -1,4 +1,8 @@
Lucene contrib change Log
+ * LUCENE-2091: Add BM25 Scoring to Lucene. BM25BooleanQuery will now allow
+ using the BM25 and BM25F probabilistic scoring models as an alternative to
+ Lucene's standard boolean+VSM scoring model.
+ (Joaquin Perez Iglesias, Yuval Feinstein)
======================= Trunk (not yet released) =======================
Index: contrib/bm/src/test/org/apache/lucene/bm/bm25/BM25BooleanScorerTest.java
===================================================================
--- contrib/bm/src/test/org/apache/lucene/bm/bm25/BM25BooleanScorerTest.java (revision 0)
+++ contrib/bm/src/test/org/apache/lucene/bm/bm25/BM25BooleanScorerTest.java (revision 0)
@@ -0,0 +1,96 @@
+package org.apache.lucene.bm.bm25;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.queryParser.ParseException;
+import org.apache.lucene.search.DefaultSimilarity;
+import org.apache.lucene.search.DocIdSetIterator;
+import org.apache.lucene.bm.BM25TestUtils;
+import org.apache.lucene.util.LuceneTestCase;
+
+import java.io.IOException;
+
+/**
+ * Tests the doc-id iteration and scoring behaviour of {@link BM25BooleanScorer} on the
+ * ten-document presidents index created by {@link BM25TestUtils#initPresidentsReader()}.
+ */
+public class BM25BooleanScorerTest extends LuceneTestCase {
+  private IndexReader reader;
+  private BM25BooleanScorer bmbs;
+
+  /** Opens the presidents index, loads the BM25 parameters and builds a fresh scorer. */
+  @Override
+  protected void setUp() throws Exception {
+    // Chain to LuceneTestCase so that its own per-test set-up is not skipped.
+    super.setUp();
+    reader = BM25TestUtils.initPresidentsReader();
+    BM25TestUtils.loadBM25Parameters();
+    getNewBM25Scorer();
+  }
+
+  /** Creates the scorer under test from the boolean term queries of BM25TestUtils. */
+  private void getNewBM25Scorer() throws IOException, ParseException {
+    bmbs = new BM25BooleanScorer(reader, BM25TestUtils.getBooleanTermQueries(), null, null, new DefaultSimilarity());
+  }
+
+  /** docID() starts at -1, follows advance(), and ends at NO_MORE_DOCS once exhausted. */
+  public void testDocId() throws IOException {
+    assertEquals("docId not -1 at start", -1, bmbs.docID());
+    bmbs.advance(9);
+    assertEquals("docId not 9 after first advance() call", 9, bmbs.docID());
+    bmbs.nextDoc();
+    assertEquals("docId not NO_MORE_DOCS after nextDoc() calls", DocIdSetIterator.NO_MORE_DOCS, bmbs.docID());
+  }
+
+  /** nextDoc() must return documents 0 through 9 in order and then NO_MORE_DOCS. */
+  public void testNextDoc() throws IOException {
+    for (int i = 0; i < 10; i++) {
+      assertEquals("nextDoc match no. " + (i + 1) + " failed", i, bmbs.nextDoc());
+    }
+    // The loop consumed ten matches, so this is the 11th call (the old message said 12th).
+    assertEquals("nextDoc 11th call did not return NO_MORE_DOCS", DocIdSetIterator.NO_MORE_DOCS, bmbs.nextDoc());
+  }
+
+  /** advance(target) on a scorer already positioned on target must still move forward. */
+  public void testAdvance() throws IOException {
+    assertEquals("advance(1) first match failed", 1, bmbs.advance(1));
+    assertEquals("advance(1) second match failed", 2, bmbs.advance(1));
+    assertEquals("advance(9) first match failed", 9, bmbs.advance(9));
+    assertEquals("advance(9) second match failed", DocIdSetIterator.NO_MORE_DOCS, bmbs.advance(9));
+  }
+
+  /** Checks the scores reported for documents 0 and 5. */
+  public void testScore() throws IOException {
+    assertEquals("nextDoc first match failed", 0, bmbs.nextDoc());
+    assertEquals("score(0) failed", 0.0, bmbs.score(), BM25TestUtils.EPSILON);
+    assertEquals("advance(5) first match failed", 5, bmbs.advance(5));
+    assertEquals("score(5) failed", 0.491, bmbs.score(), BM25TestUtils.EPSILON);
+  }
+
+  /** Closes the reader and then lets LuceneTestCase run its own clean-up. */
+  @Override
+  protected void tearDown() throws Exception {
+    try {
+      reader.close();
+    } finally {
+      super.tearDown();
+    }
+  }
+}
Index: contrib/bm/src/test/org/apache/lucene/bm/bm25/BM25SingleBooleanScorerTest.java
===================================================================
--- contrib/bm/src/test/org/apache/lucene/bm/bm25/BM25SingleBooleanScorerTest.java (revision 0)
+++ contrib/bm/src/test/org/apache/lucene/bm/bm25/BM25SingleBooleanScorerTest.java (revision 0)
@@ -0,0 +1,96 @@
+package org.apache.lucene.bm.bm25;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.search.DefaultSimilarity;
+import org.apache.lucene.search.DocIdSetIterator;
+import org.apache.lucene.queryParser.ParseException;
+
+import org.apache.lucene.bm.BM25TestUtils;
+import org.apache.lucene.util.LuceneTestCase;
+
+import java.io.IOException;
+
+
+/**
+ * Tests the doc-id iteration and scoring behaviour of {@link BM25SingleBooleanScorer}
+ * on the presidents index created by {@link BM25TestUtils#initPresidentsReader()}.
+ */
+public class BM25SingleBooleanScorerTest extends LuceneTestCase {
+  private IndexReader reader;
+  private BM25SingleBooleanScorer bmsbs;
+
+  /** Opens the presidents index, loads the BM25 parameters and builds a fresh scorer. */
+  @Override
+  protected void setUp() throws Exception {
+    super.setUp();
+    reader = BM25TestUtils.initPresidentsReader();
+    BM25TestUtils.loadBM25Parameters();
+    getNewBM25Scorer();
+  }
+
+  /** Creates the scorer under test from the boolean term queries of BM25TestUtils. */
+  private void getNewBM25Scorer() throws IOException, ParseException {
+    bmsbs = new BM25SingleBooleanScorer(reader, BM25TestUtils.getBooleanTermQueries(), new DefaultSimilarity());
+  }
+
+  /** docID() starts at -1, follows advance(), and ends at NO_MORE_DOCS once exhausted. */
+  public void testDocId() throws IOException {
+    assertEquals("docId not -1 at start", -1, bmsbs.docID());
+    bmsbs.advance(9);
+    assertEquals("docId not 9 after first advance() call", 9, bmsbs.docID());
+    bmsbs.nextDoc();
+    assertEquals("docId not NO_MORE_DOCS after nextDoc() calls", DocIdSetIterator.NO_MORE_DOCS, bmsbs.docID());
+  }
+
+  /** nextDoc() must visit documents 1, 5 and 9 and then report NO_MORE_DOCS. */
+  public void testNextDoc() throws IOException {
+    assertEquals("nextDoc first match failed", 1, bmsbs.nextDoc());
+    assertEquals("nextDoc second match failed", 5, bmsbs.nextDoc());
+    assertEquals("nextDoc third match failed", 9, bmsbs.nextDoc());
+    assertEquals("nextDoc fourth match failed", DocIdSetIterator.NO_MORE_DOCS, bmsbs.nextDoc());
+  }
+
+  /** advance(target) on a scorer already positioned on target must still move forward. */
+  public void testAdvance() throws IOException {
+    assertEquals("advance(1) first match failed", 1, bmsbs.advance(1));
+    assertEquals("advance(1) second match failed", 5, bmsbs.advance(1));
+    assertEquals("advance(5) first match failed", 9, bmsbs.advance(5));
+    assertEquals("advance(5) second match failed", DocIdSetIterator.NO_MORE_DOCS, bmsbs.advance(5));
+  }
+
+  /** Checks the scores reported for documents 1 and 5. */
+  public void testScore() throws IOException {
+    assertEquals("nextDoc first match failed", 1, bmsbs.nextDoc());
+    assertEquals("score(1) failed", 0.631, bmsbs.score(), BM25TestUtils.EPSILON);
+    assertEquals("advance(2) first match failed", 5, bmsbs.advance(2));
+    assertEquals("score(5) failed", 0.491, bmsbs.score(), BM25TestUtils.EPSILON);
+  }
+
+  /** Closes the reader and then lets LuceneTestCase run its own clean-up. */
+  @Override
+  protected void tearDown() throws Exception {
+    try {
+      reader.close();
+    } finally {
+      super.tearDown();
+    }
+  }
+}
Index: contrib/bm/src/test/org/apache/lucene/bm/bm25/BM25ParametersTest.java
===================================================================
--- contrib/bm/src/test/org/apache/lucene/bm/bm25/BM25ParametersTest.java (revision 0)
+++ contrib/bm/src/test/org/apache/lucene/bm/bm25/BM25ParametersTest.java (revision 0)
@@ -0,0 +1,73 @@
+package org.apache.lucene.bm.bm25;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+import org.apache.lucene.bm.BM25TestUtils;
+import org.apache.lucene.util.LuceneTestCase;
+
+/**
+ * Tests the static getters and setters of {@link BM25Parameters}. Each test restores the
+ * value it changed, even on failure, so the static configuration does not leak onwards.
+ */
+public class BM25ParametersTest extends LuceneTestCase {
+
+  /** Loads the BM25 parameters from the test properties file before every test. */
+  @Override
+  protected void setUp() throws Exception {
+    super.setUp();
+    BM25TestUtils.loadBM25Parameters();
+  }
+
+  /** B can be overwritten and afterwards restored to the value it had before. */
+  public void testGetSetB() {
+    float initialB = BM25Parameters.getB();
+    try {
+      BM25Parameters.setB(1.3f);
+      assertEquals("set B != 1.3f", 1.3, BM25Parameters.getB(), BM25TestUtils.EPSILON);
+    } finally {
+      // Restore in finally: the setter is static, so a failed assertion must not leak 1.3.
+      BM25Parameters.setB(initialB);
+    }
+    assertEquals("B != initial value", initialB, BM25Parameters.getB(), BM25TestUtils.EPSILON);
+  }
+
+  /** K1 can be overwritten and afterwards restored to the value it had before. */
+  public void testGetSetK1() {
+    float initialK1 = BM25Parameters.getK1();
+    try {
+      BM25Parameters.setK1(4.0f);
+      assertEquals("set K1 != 4.0f", 4.0, BM25Parameters.getK1(), BM25TestUtils.EPSILON);
+    } finally {
+      BM25Parameters.setK1(initialK1);
+    }
+    assertEquals("K1 != initial K1", initialK1, BM25Parameters.getK1(), BM25TestUtils.EPSILON);
+  }
+
+  /** The average length of the "president" field can be overwritten and restored. */
+  public void testGetSetAverageLength() {
+    float initialPresAvgLength = BM25Parameters.getAverageLength("president");
+    try {
+      BM25Parameters.setAverageLength("president", 5.0f);
+      assertEquals("president avgLength != 5.0 after set", 5.0, BM25Parameters.getAverageLength("president"), BM25TestUtils.EPSILON);
+    } finally {
+      BM25Parameters.setAverageLength("president", initialPresAvgLength);
+    }
+    assertEquals("president avgLength != initial value", initialPresAvgLength, BM25Parameters.getAverageLength("president"), BM25TestUtils.EPSILON);
+  }
+}
Index: contrib/bm/src/test/org/apache/lucene/bm/bm25/BM25BooleanQueryTest.java
===================================================================
--- contrib/bm/src/test/org/apache/lucene/bm/bm25/BM25BooleanQueryTest.java (revision 0)
+++ contrib/bm/src/test/org/apache/lucene/bm/bm25/BM25BooleanQueryTest.java (revision 0)
@@ -0,0 +1,89 @@
+package org.apache.lucene.bm.bm25;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+import org.apache.lucene.analysis.WhitespaceAnalyzer;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.queryParser.ParseException;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.ScoreDoc;
+import org.apache.lucene.search.TopDocs;
+import org.apache.lucene.bm.BM25TestUtils;
+import org.apache.lucene.util.LuceneTestCase;
+
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.Map;
+
+/**
+ * Runs single-term {@link BM25BooleanQuery} searches against the presidents index and
+ * checks which document id is ranked first.
+ */
+public class BM25BooleanQueryTest extends LuceneTestCase {
+  private static final String SEARCH_FIELD = "president";
+
+  private IndexReader reader;
+  private IndexSearcher searcher;
+  private Analyzer analyzer;
+
+  /** Opens the presidents index, wraps it in a searcher and loads the BM25 parameters. */
+  @Override
+  protected void setUp() throws Exception {
+    super.setUp();
+    // Keep our own handle: IndexSearcher.close() does not close a reader handed to it.
+    reader = BM25TestUtils.initPresidentsReader();
+    searcher = new IndexSearcher(reader);
+    analyzer = new WhitespaceAnalyzer();
+    BM25TestUtils.loadBM25Parameters();
+  }
+
+  /** Every query term must return at least one hit with the expected id at rank one. */
+  public void testQuerySentence() throws IOException, ParseException {
+    // query text -> id of the document expected at rank one
+    Map<String, String> queryResultMap = new HashMap<String, String>();
+    queryResultMap.put("Adams", "1");
+    queryResultMap.put("Jefferson", "2");
+    queryResultMap.put("John", "1");
+
+    for (Map.Entry<String, String> expected : queryResultMap.entrySet()) {
+      String q = expected.getKey();
+      Query bmq = new BM25BooleanQuery(q, SEARCH_FIELD, analyzer);
+      TopDocs t = searcher.search(bmq, null, 10);
+      ScoreDoc[] scoreDocs = t.scoreDocs;
+      assertNotNull(scoreDocs);
+      // JUnit assertion instead of the `assert` keyword, which is skipped without -ea.
+      assertTrue("The query [" + q + "] returned no hits", scoreDocs.length > 0);
+      String qid = String.valueOf(scoreDocs[0].doc);
+      assertEquals("The query [" + q + "] returned wrong id; ", expected.getValue(), qid);
+    }
+  }
+
+  /** Closes the searcher and its reader, then lets LuceneTestCase run its own clean-up. */
+  @Override
+  protected void tearDown() throws Exception {
+    try {
+      searcher.close();
+      reader.close();
+    } finally {
+      super.tearDown();
+    }
+  }
+}
Index: contrib/bm/src/test/org/apache/lucene/bm/bm25/BM25TermScorerTest.java
===================================================================
--- contrib/bm/src/test/org/apache/lucene/bm/bm25/BM25TermScorerTest.java (revision 0)
+++ contrib/bm/src/test/org/apache/lucene/bm/bm25/BM25TermScorerTest.java (revision 0)
@@ -0,0 +1,93 @@
+package org.apache.lucene.bm.bm25;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+import org.apache.lucene.search.DefaultSimilarity;
+import org.apache.lucene.search.DocIdSetIterator;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.bm.BM25TestUtils;
+import org.apache.lucene.util.LuceneTestCase;
+
+import java.io.IOException;
+
+/**
+ * Tests the doc-id iteration and scoring behaviour of {@link BM25TermScorer} for the
+ * "president:John" term query on the presidents index.
+ */
+public class BM25TermScorerTest extends LuceneTestCase {
+  private IndexReader reader;
+  private BM25TermScorer bmts;
+
+  /** Opens the presidents index, loads the BM25 parameters and builds a fresh scorer. */
+  @Override
+  protected void setUp() throws Exception {
+    super.setUp();
+    reader = BM25TestUtils.initPresidentsReader();
+    BM25TestUtils.loadBM25Parameters();
+    getNewScorer();
+  }
+
+  /** Replaces the scorer under test with a new one over the current reader. */
+  public void getNewScorer() throws IOException {
+    bmts = new BM25TermScorer(reader, BM25TestUtils.getPresidentJohnQuery(), new DefaultSimilarity());
+  }
+
+  /** docID() starts at -1, follows advance(), and ends at NO_MORE_DOCS once exhausted. */
+  public void testDocId() throws IOException {
+    assertEquals("docId not -1 at start", -1, bmts.docID());
+    bmts.advance(9);
+    assertEquals("docId not 9 after first advance() call", 9, bmts.docID());
+    bmts.nextDoc();
+    assertEquals("docId not NO_MORE_DOCS after nextDoc() calls", DocIdSetIterator.NO_MORE_DOCS, bmts.docID());
+  }
+
+  /** nextDoc() must visit documents 1, 5 and 9 and then report NO_MORE_DOCS. */
+  public void testNextDoc() throws IOException {
+    assertEquals("nextDoc first match failed", 1, bmts.nextDoc());
+    assertEquals("nextDoc second match failed", 5, bmts.nextDoc());
+    assertEquals("nextDoc third match failed", 9, bmts.nextDoc());
+    assertEquals("nextDoc fourth match failed", DocIdSetIterator.NO_MORE_DOCS, bmts.nextDoc());
+  }
+
+  /** advance(target) on a scorer already positioned on target must still move forward. */
+  public void testAdvance() throws IOException {
+    assertEquals("advance(1) first match failed", 1, bmts.advance(1));
+    assertEquals("advance(1) second match failed", 5, bmts.advance(1));
+    assertEquals("advance(5) first match failed", 9, bmts.advance(5));
+    assertEquals("advance(5) second match failed", DocIdSetIterator.NO_MORE_DOCS, bmts.advance(5));
+  }
+
+  /** Checks the scores reported for documents 1 and 5. */
+  public void testScore() throws IOException {
+    assertEquals("nextDoc first match failed", 1, bmts.nextDoc());
+    assertEquals("score(1) failed", 0.631, bmts.score(), BM25TestUtils.EPSILON);
+    assertEquals("advance(2) first match failed", 5, bmts.advance(2));
+    assertEquals("score(5) failed", 0.491, bmts.score(), BM25TestUtils.EPSILON);
+  }
+
+  /** Closes the reader and then lets LuceneTestCase run its own clean-up. */
+  @Override
+  protected void tearDown() throws Exception {
+    try {
+      reader.close();
+    } finally {
+      super.tearDown();
+    }
+  }
+}
Index: contrib/bm/src/test/org/apache/lucene/bm/bm25/BM25SimilarityTest.java
===================================================================
--- contrib/bm/src/test/org/apache/lucene/bm/bm25/BM25SimilarityTest.java (revision 0)
+++ contrib/bm/src/test/org/apache/lucene/bm/bm25/BM25SimilarityTest.java (revision 0)
@@ -0,0 +1,91 @@
+package org.apache.lucene.bm.bm25;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+import org.apache.lucene.bm.BM25TestUtils;
+import org.apache.lucene.util.LuceneTestCase;
+
+/**
+ * Tests the individual scoring factors exposed by {@link BM25Similarity}. All float results
+ * are compared within {@link BM25TestUtils#EPSILON} rather than by exact (boxed) equality.
+ */
+public class BM25SimilarityTest extends LuceneTestCase {
+  /** One table row: two integer arguments and the value the similarity should return. */
+  private static final class BM25SimilarityData {
+    final int first;
+    final int second;
+    final float expResult;
+
+    private BM25SimilarityData(int first, int second, float expResult) {
+      this.first = first;
+      this.second = second;
+      this.expResult = expResult;
+    }
+  }
+
+  private static final BM25Similarity bms = new BM25Similarity();
+
+  /** Checks coord() against a small table of argument pairs and expected results. */
+  public void testCoord() {
+    final BM25SimilarityData[] ba = {
+      new BM25SimilarityData(0, 5, 0.0f),
+      new BM25SimilarityData(1, 1, 1.0f),
+      new BM25SimilarityData(1, 2, 0.5f)
+    };
+
+    for (BM25SimilarityData bsd : ba) {
+      assertEquals("coord(" + bsd.first + "," + bsd.second + ") should be " + bsd.expResult,
+          bsd.expResult, bms.coord(bsd.first, bsd.second), BM25TestUtils.EPSILON);
+    }
+  }
+
+  /** Checks idf() against a small table of argument pairs and expected results. */
+  public void testIdf() {
+    final BM25SimilarityData[] ba = {
+      new BM25SimilarityData(3, 6, 0.0f),
+      new BM25SimilarityData(6, 6, -2.564f),
+      new BM25SimilarityData(1, 6, 1.299f)
+    };
+
+    for (BM25SimilarityData bsd : ba) {
+      assertEquals("idf(" + bsd.first + "," + bsd.second + ") should be " + bsd.expResult,
+          bsd.expResult, bms.idf(bsd.first, bsd.second), BM25TestUtils.EPSILON);
+    }
+  }
+
+  /** lengthNorm() is expected to return 1.0 for the sample field name and length. */
+  public void testLengthNorm() {
+    assertEquals("lengthNorm(\"yoyo\",5) != 1.0", 1.0f, bms.lengthNorm("yoyo", 5), BM25TestUtils.EPSILON);
+  }
+
+  /** queryNorm() is expected to return 1.0 for the sample input. */
+  public void testQueryNorm() {
+    assertEquals("queryNorm(3.3) != 1.0", 1.0f, bms.queryNorm(3.3f), BM25TestUtils.EPSILON);
+  }
+
+  /** sloppyFreq() is expected to return 1.0 for the sample distance. */
+  public void testSloppyFreq() {
+    assertEquals("sloppyFreq(5) != 1.0", 1.0f, bms.sloppyFreq(5), BM25TestUtils.EPSILON);
+  }
+
+  /** tf() is expected to return the sample frequency unchanged. */
+  public void testTf() {
+    assertEquals("tf(3) != 3.0", 3.0f, bms.tf(3.0f), BM25TestUtils.EPSILON);
+  }
+}
Index: contrib/bm/src/test/org/apache/lucene/bm/bm25/BM25BooleanWeightTest.java
===================================================================
--- contrib/bm/src/test/org/apache/lucene/bm/bm25/BM25BooleanWeightTest.java (revision 0)
+++ contrib/bm/src/test/org/apache/lucene/bm/bm25/BM25BooleanWeightTest.java (revision 0)
@@ -0,0 +1,71 @@
+package org.apache.lucene.bm.bm25;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.bm.BM25TestUtils;
+import org.apache.lucene.util.LuceneTestCase;
+
+import java.io.IOException;
+
+/**
+ * Tests the values reported by a {@link BM25BooleanWeight} built from the boolean term
+ * queries of BM25TestUtils, with two empty arrays as its remaining constructor arguments.
+ */
+public class BM25BooleanWeightTest extends LuceneTestCase {
+  private IndexReader reader;
+  private BM25BooleanWeight weight;
+
+  /** Opens the presidents index, loads the BM25 parameters and builds the weight. */
+  @Override
+  protected void setUp() throws Exception {
+    super.setUp();
+    reader = BM25TestUtils.initPresidentsReader();
+    BM25TestUtils.loadBM25Parameters();
+    weight = new BM25BooleanWeight(
+        BM25TestUtils.getBooleanTermQueries(),
+        BM25TestUtils.getEmptyBooleanTermQueryArray(),
+        BM25TestUtils.getEmptyBooleanTermQueryArray());
+  }
+
+  /** getQuery() is expected to return null. */
+  public void testGetQuery() {
+    assertNull("Query not null", weight.getQuery());
+  }
+
+  /** getValue() is expected to be zero. */
+  public void testGetValue() {
+    assertEquals("Value not zero", 0.0f, weight.getValue(), BM25TestUtils.EPSILON);
+  }
+
+  /** sumOfSquaredWeights() is expected to be zero. */
+  public void testSumOfSquaredWeights() throws IOException {
+    assertEquals("sumOfSquaredWeights not zero", 0.0f, weight.sumOfSquaredWeights(), BM25TestUtils.EPSILON);
+  }
+
+  /** Closes the reader and then lets LuceneTestCase run its own clean-up. */
+  @Override
+  protected void tearDown() throws Exception {
+    try {
+      reader.close();
+    } finally {
+      super.tearDown();
+    }
+  }
+}
Index: contrib/bm/src/test/org/apache/lucene/bm/BM25TestUtils.java
===================================================================
--- contrib/bm/src/test/org/apache/lucene/bm/BM25TestUtils.java (revision 0)
+++ contrib/bm/src/test/org/apache/lucene/bm/BM25TestUtils.java (revision 0)
@@ -0,0 +1,152 @@
+package org.apache.lucene.bm;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+import org.apache.lucene.bm.bm25.BM25Parameters;
+import org.apache.lucene.bm.bm25.BM25BooleanQuery;
+import org.apache.lucene.bm.bm25f.BM25FParameters;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.store.RAMDirectory;
+import org.apache.lucene.analysis.WhitespaceAnalyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.search.BooleanClause;
+import org.apache.lucene.queryParser.ParseException;
+
+import java.io.IOException;
+
+/**
+ * Shared fixtures for the BM25 and BM25F tests: small in-memory indexes of U.S. presidents,
+ * ready-made queries, and loaders for the scoring parameters in {@code bm.properties}.
+ */
+public class BM25TestUtils {
+  /** Tolerance used when comparing floating point values in the tests. */
+  public static final float EPSILON = 0.001f;
+
+  private static final String PROPERTIES_FILE = "bm.properties";
+
+  /** All members are static; the public constructor is kept only for compatibility. */
+  public BM25TestUtils() {
+  }
+
+  /**
+   * Returns the test resource directory {@code contrib/bm/src/test}, resolved against the
+   * {@code user.dir} system property using the platform file separator.
+   */
+  public static String getPropertyDir() {
+    String fileSeparator = System.getProperty("file.separator");
+    StringBuilder propDir = new StringBuilder(System.getProperty("user.dir"));
+    for (String segment : new String[] {"contrib", "bm", "src", "test"}) {
+      propDir.append(fileSeparator).append(segment);
+    }
+    return propDir.toString();
+  }
+
+  /** Returns the path of the properties file read by both parameter loaders. */
+  private static String getPropertiesPath() {
+    return getPropertyDir() + System.getProperty("file.separator") + PROPERTIES_FILE;
+  }
+
+  /** Loads {@link BM25Parameters} from the test properties file. */
+  public static void loadBM25Parameters() throws IOException {
+    BM25Parameters.load(getPropertiesPath());
+  }
+
+  /** Loads {@link BM25FParameters} from the same test properties file. */
+  public static void loadBM25FParameters() throws IOException {
+    BM25FParameters.load(getPropertiesPath());
+  }
+
+  /**
+   * Indexes one document per row into a fresh {@link RAMDirectory}, storing {@code row[i]}
+   * in the field named {@code fieldNames[i]}, and opens a reader on the result.
+   */
+  private static IndexReader buildIndex(String[] fieldNames, String[][] rows) throws IOException {
+    RAMDirectory dir = new RAMDirectory();
+    IndexWriter writer = new IndexWriter(dir, new WhitespaceAnalyzer(), true, IndexWriter.MaxFieldLength.LIMITED);
+    try {
+      for (String[] row : rows) {
+        Document doc = new Document();
+        for (int i = 0; i < fieldNames.length; i++) {
+          doc.add(new Field(fieldNames[i], row[i], Field.Store.YES, Field.Index.ANALYZED));
+        }
+        writer.addDocument(doc);
+      }
+    } finally {
+      // Close the writer even when indexing fails so it is never leaked.
+      writer.close();
+    }
+    return IndexReader.open(dir, true);
+  }
+
+  /** Returns a reader over ten documents, each with a single "president" field. */
+  public static IndexReader initPresidentsReader() throws IOException {
+    final String[][] presidents = {
+      {"George Washington"}, {"John Adams"}, {"Thomas Jefferson"}, {"James Madison"}, {"James Monroe"},
+      {"John Quincy Adams"}, {"Andrew Jackson"}, {"Martin Van Buren"}, {"William Henry Harrison"}, {"John Tyler"}
+    };
+    return buildIndex(new String[] {"president"}, presidents);
+  }
+
+  /** Returns a reader over ten documents with a "president" and a "vice" field each. */
+  public static IndexReader initPresidentsAndVicesReader() throws IOException {
+    final String[][] presidentsAndVices = {
+      // president, vice
+      {"George Washington", "John Adams"},
+      {"John Adams", "Thomas Jefferson"},
+      {"Thomas Jefferson", "Aaron Burr"},
+      {"James Madison", "George Clinton"},
+      {"James Monroe", "Daniel D. Tompkins"},
+      {"John Quincy Adams", "John C. Calhoun"},
+      {"Andrew Jackson", "Martin Van Buren"},
+      {"Martin Van Buren", "Richard M. Johnson"},
+      {"William Henry Harrison", "John Tyler"},
+      {"John Tyler", "George M. Dallas"}
+    };
+    return buildIndex(new String[] {"president", "vice"}, presidentsAndVices);
+  }
+
+  /** Returns a term query for "John" in the "president" field. */
+  public static TermQuery getPresidentJohnQuery() {
+    return new TermQuery(new Term("president", "John"));
+  }
+
+  /** Returns a term query for "John" in the "vice" field. */
+  public static TermQuery getViceJohnQuery() {
+    return new TermQuery(new Term("vice", "John"));
+  }
+
+  /**
+   * Returns a one-element array holding {@link #getPresidentJohnQuery()} wrapped as a
+   * {@code MUST} clause of a "John" {@link BM25BooleanQuery} on the "president" field.
+   */
+  public static BM25BooleanQuery.BooleanTermQuery[] getBooleanTermQueries() throws ParseException {
+    BM25BooleanQuery bmbq = new BM25BooleanQuery("John", "president", new WhitespaceAnalyzer());
+    BM25BooleanQuery.BooleanTermQuery btq = bmbq.new BooleanTermQuery(getPresidentJohnQuery(), BooleanClause.Occur.MUST);
+    return new BM25BooleanQuery.BooleanTermQuery[] {btq};
+  }
+
+  /** Returns an empty array of boolean term queries. */
+  public static BM25BooleanQuery.BooleanTermQuery[] getEmptyBooleanTermQueryArray() {
+    return new BM25BooleanQuery.BooleanTermQuery[0];
+  }
+}
Index: contrib/bm/src/test/org/apache/lucene/bm/bm25f/BM25FParametersTest.java
===================================================================
--- contrib/bm/src/test/org/apache/lucene/bm/bm25f/BM25FParametersTest.java (revision 0)
+++ contrib/bm/src/test/org/apache/lucene/bm/bm25f/BM25FParametersTest.java (revision 0)
@@ -0,0 +1,72 @@
+package org.apache.lucene.bm.bm25f;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.bm.BM25TestUtils;
+import org.apache.lucene.util.LuceneTestCase;
+
+/**
+ * Tests the static getters and setters of {@link BM25FParameters}.
+ */
+public class BM25FParametersTest extends LuceneTestCase {
+  private final float[] boosts = {1.0f, 2.0f, 3.0f};
+  private final float[] bs = {0.25f, 0.5f, 0.75f};
+
+  /** Loads the BM25F parameters and installs the boosts and b values checked below. */
+  @Override
+  protected void setUp() throws Exception {
+    super.setUp();
+    BM25TestUtils.loadBM25FParameters();
+    BM25FParameters.setBoost(boosts);
+    BM25FParameters.setBParam(bs);
+  }
+
+  /** K1 is expected to be 2.0 after loading and to reflect a later setK1() call. */
+  public void testGetSetK1() {
+    float initialK1 = BM25FParameters.getK1();
+    assertEquals("initial K1 != 2f", 2.0, initialK1, BM25TestUtils.EPSILON);
+    try {
+      BM25FParameters.setK1(4.0f);
+      assertEquals("set K1 != 4.0f", 4.0, BM25FParameters.getK1(), BM25TestUtils.EPSILON);
+    } finally {
+      // The setter is static, so put the old K1 back instead of leaking 4.0 to other tests.
+      BM25FParameters.setK1(initialK1);
+    }
+  }
+
+  /** The idf field is expected to be "president". */
+  public void testGetIdField() {
+    assertEquals("id field is not president", "president", BM25FParameters.getIdfField());
+  }
+
+  /** getBoost() must return the values installed in setUp(), compared within EPSILON. */
+  public void testGetBoosts() {
+    float[] myBoosts = BM25FParameters.getBoost();
+    assertEquals("boost[0] != 1.0", 1.0f, myBoosts[0], BM25TestUtils.EPSILON);
+    assertEquals("boost[1] != 2.0", 2.0f, myBoosts[1], BM25TestUtils.EPSILON);
+    assertEquals("boost[2] != 3.0", 3.0f, myBoosts[2], BM25TestUtils.EPSILON);
+  }
+
+  /** getBParam() must return the values installed in setUp(), compared within EPSILON. */
+  public void testGetBs() {
+    float[] myBs = BM25FParameters.getBParam();
+    assertEquals("bs[0] != 0.25", 0.25f, myBs[0], BM25TestUtils.EPSILON);
+    assertEquals("bs[1] != 0.5", 0.5f, myBs[1], BM25TestUtils.EPSILON);
+    assertEquals("bs[2] != 0.75", 0.75f, myBs[2], BM25TestUtils.EPSILON);
+  }
+}
Index: contrib/bm/src/test/org/apache/lucene/bm/bm25f/BM25FTermScorerTest.java
===================================================================
--- contrib/bm/src/test/org/apache/lucene/bm/bm25f/BM25FTermScorerTest.java (revision 0)
+++ contrib/bm/src/test/org/apache/lucene/bm/bm25f/BM25FTermScorerTest.java (revision 0)
@@ -0,0 +1,108 @@
+package org.apache.lucene.bm.bm25f;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.search.DefaultSimilarity;
+import org.apache.lucene.search.DocIdSetIterator;
+import org.apache.lucene.bm.BM25TestUtils;
+import org.apache.lucene.util.LuceneTestCase;
+
+import java.io.IOException;
+
+/**
+ * Tests the doc-id iteration and scoring behaviour of {@link BM25FTermScorer} for the term
+ * "John" over the "president" and "vice" fields of the presidents-and-vices index.
+ */
+public class BM25FTermScorerTest extends LuceneTestCase {
+  private static final float BM25F_SATURATION_FACTOR = 2.0f;
+  private static final float BM25F_DEFAULT_B_FACTOR = 0.75f;
+
+  private IndexReader reader;
+  private BM25FTermScorer bmts;
+
+  private static final String[] fieldNames = {"president", "vice"};
+  private static final float[] boosts = {1.0f, 0.1f};
+  private static final float[] bm25BParams = new float[fieldNames.length];
+
+  /** Opens the two-field index, sets K1 and the per-field b values, and builds the scorer. */
+  @Override
+  protected void setUp() throws Exception {
+    super.setUp();
+    reader = BM25TestUtils.initPresidentsAndVicesReader();
+    // NOTE(review): this loads the BM25 (not BM25F) parameters; confirm that is intended.
+    BM25TestUtils.loadBM25Parameters();
+    for (int i = 0; i < bm25BParams.length; i++) {
+      bm25BParams[i] = BM25F_DEFAULT_B_FACTOR;
+    }
+    BM25FParameters.setK1(BM25F_SATURATION_FACTOR);
+    getNewScorer();
+  }
+
+  /** Creates the scorer under test over both fields with the boosts and b values above. */
+  private void getNewScorer() throws IOException {
+    bmts = new BM25FTermScorer(reader, BM25TestUtils.getPresidentJohnQuery(), fieldNames, boosts, bm25BParams, new DefaultSimilarity());
+  }
+
+  /** docID() must follow advance() and end at NO_MORE_DOCS once the scorer is exhausted. */
+  public void testDocId() throws IOException {
+    // NOTE(review): the initial docID is expected to be NO_MORE_DOCS here, not -1; confirm.
+    assertEquals("docId not NO_MORE_DOCS at start", DocIdSetIterator.NO_MORE_DOCS, bmts.docID());
+    bmts.advance(9);
+    assertEquals("docId not 9 after first advance() call", 9, bmts.docID());
+    bmts.nextDoc();
+    assertEquals("docId not NO_MORE_DOCS after nextDoc() calls", DocIdSetIterator.NO_MORE_DOCS, bmts.docID());
+  }
+
+  /** nextDoc() must visit documents 0, 1, 5, 8 and 9 and then report NO_MORE_DOCS. */
+  public void testNextDoc() throws IOException {
+    assertEquals("nextDoc first match failed", 0, bmts.nextDoc());
+    assertEquals("nextDoc second match failed", 1, bmts.nextDoc());
+    assertEquals("nextDoc third match failed", 5, bmts.nextDoc());
+    assertEquals("nextDoc fourth match failed", 8, bmts.nextDoc());
+    assertEquals("nextDoc fifth match failed", 9, bmts.nextDoc());
+    assertEquals("nextDoc sixth call did not return NO_MORE_DOCS", DocIdSetIterator.NO_MORE_DOCS, bmts.nextDoc());
+  }
+
+  /** advance(target) on a scorer already positioned on target must still move forward. */
+  public void testAdvance() throws IOException {
+    assertEquals("advance(1) first match failed", 1, bmts.advance(1));
+    assertEquals("advance(1) second match failed", 5, bmts.advance(1));
+    assertEquals("advance(9) first match failed", 9, bmts.advance(9));
+    assertEquals("advance(9) second match failed", DocIdSetIterator.NO_MORE_DOCS, bmts.advance(9));
+  }
+
+  /** Checks the scores reported for documents 0 and 5. */
+  public void testScore() throws IOException {
+    assertEquals("nextDoc first match failed", 0, bmts.nextDoc());
+    assertEquals("score(0) failed", 0.089, bmts.score(), BM25TestUtils.EPSILON);
+    assertEquals("advance(2) first match failed", 5, bmts.advance(2));
+    assertEquals("score(5) failed", 0.526, bmts.score(), BM25TestUtils.EPSILON);
+  }
+
+  /** Closes the reader and then lets LuceneTestCase run its own clean-up. */
+  @Override
+  protected void tearDown() throws Exception {
+    try {
+      reader.close();
+    } finally {
+      super.tearDown();
+    }
+  }
+}
+
Index: contrib/bm/src/test/org/apache/lucene/bm/bool/MatchAllBooleanScorerTest.java
===================================================================
--- contrib/bm/src/test/org/apache/lucene/bm/bool/MatchAllBooleanScorerTest.java (revision 0)
+++ contrib/bm/src/test/org/apache/lucene/bm/bool/MatchAllBooleanScorerTest.java (revision 0)
@@ -0,0 +1,67 @@
+package org.apache.lucene.bm.bool;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.search.DocIdSetIterator;
+import org.apache.lucene.search.DefaultSimilarity;
+import org.apache.lucene.bm.BM25TestUtils;
+import org.apache.lucene.util.LuceneTestCase;
+
+import java.io.IOException;
+
+/**
+ * Tests {@link MatchAllBooleanScorer} built for {@link #NUM_DOCS} documents:
+ * it is expected to visit every document ID from 0 to NUM_DOCS - 1 in order
+ * and to score each of them 0.0.
+ */
+public class MatchAllBooleanScorerTest extends LuceneTestCase {
+  private static final int NUM_DOCS = 3;
+
+  private MatchAllBooleanScorer mabs;
+
+  @Override
+  protected void setUp() throws Exception {
+    super.setUp();
+    getNewScorer();
+  }
+
+  /** Creates a fresh, un-iterated scorer for each test. */
+  private void getNewScorer() throws IOException {
+    mabs = new MatchAllBooleanScorer(new DefaultSimilarity(), NUM_DOCS);
+  }
+
+  /** docID() is -1 before iteration, tracks advance(), and ends at NO_MORE_DOCS. */
+  public void testDocId() throws IOException {
+    assertEquals("docId not -1 at start", -1, mabs.docID());
+    mabs.advance(2);
+    assertEquals("docId not 2 after first advance() call", 2, mabs.docID());
+    mabs.nextDoc();
+    assertEquals("docId not NO_MORE_DOCS after nextDoc() calls", DocIdSetIterator.NO_MORE_DOCS, mabs.docID());
+  }
+
+  /** nextDoc() returns 0..NUM_DOCS-1 and then NO_MORE_DOCS. */
+  public void testNextDoc() throws IOException {
+    for (int expected = 0; expected < NUM_DOCS; expected++) {
+      assertEquals("nextDoc " + expected + " match failed", expected, mabs.nextDoc());
+    }
+    assertEquals("nextDoc after last match failed", DocIdSetIterator.NO_MORE_DOCS, mabs.nextDoc());
+  }
+
+  /** advance(2) lands on the last document; repeating it exhausts the scorer. */
+  public void testAdvance() throws IOException {
+    assertEquals("advance(2) first match failed", 2, mabs.advance(2));
+    assertEquals("advance(2) second match failed", DocIdSetIterator.NO_MORE_DOCS, mabs.advance(2));
+  }
+
+  /** The score of a matched document (document 0 here) is 0.0. */
+  public void testScore() throws IOException {
+    assertEquals("nextDoc first match failed", 0, mabs.nextDoc());
+    assertEquals("score(0) failed", 0.0, mabs.score(), BM25TestUtils.EPSILON);
+  }
+}
Index: contrib/bm/src/test/org/apache/lucene/bm/bool/MustBooleanScorerTest.java
===================================================================
--- contrib/bm/src/test/org/apache/lucene/bm/bool/MustBooleanScorerTest.java (revision 0)
+++ contrib/bm/src/test/org/apache/lucene/bm/bool/MustBooleanScorerTest.java (revision 0)
@@ -0,0 +1,75 @@
+package org.apache.lucene.bm.bool;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+import java.io.IOException;
+
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.search.DefaultSimilarity;
+import org.apache.lucene.search.Scorer;
+import org.apache.lucene.search.DocIdSetIterator;
+import org.apache.lucene.bm.BM25TestUtils;
+import org.apache.lucene.bm.bm25.BM25TermScorer;
+import org.apache.lucene.util.LuceneTestCase;
+
+/**
+ * Tests {@link MustBooleanScorer} over two {@link BM25TermScorer}s built from
+ * {@link BM25TestUtils#getPresidentJohnQuery()} and
+ * {@link BM25TestUtils#getViceJohnQuery()}. The assertions expect document 5
+ * to be the only document accepted by both sub-scorers.
+ */
+public class MustBooleanScorerTest extends LuceneTestCase {
+  private IndexReader reader;
+  private MustBooleanScorer mbs;
+
+  @Override
+  protected void setUp() throws Exception {
+    super.setUp();
+    reader = BM25TestUtils.initPresidentsAndVicesReader();
+    BM25TestUtils.loadBM25Parameters();
+    getNewScorer();
+  }
+
+  /** Creates a fresh, un-iterated scorer for each test. */
+  private void getNewScorer() throws IOException {
+    BM25TermScorer bmts1 = new BM25TermScorer(reader, BM25TestUtils.getPresidentJohnQuery(), new DefaultSimilarity());
+    BM25TermScorer bmts2 = new BM25TermScorer(reader, BM25TestUtils.getViceJohnQuery(), new DefaultSimilarity());
+    mbs = new MustBooleanScorer(new DefaultSimilarity(), new Scorer[]{bmts1, bmts2});
+  }
+
+  /** docID() is -1 before iteration, 5 after advance(5), NO_MORE_DOCS afterwards. */
+  public void testDocId() throws IOException {
+    assertEquals("docId not -1 at start", -1, mbs.docID());
+    mbs.advance(5);
+    assertEquals("docId not 5 after first advance() call", 5, mbs.docID());
+    mbs.nextDoc();
+    assertEquals("docId not NO_MORE_DOCS after nextDoc() calls", DocIdSetIterator.NO_MORE_DOCS, mbs.docID());
+  }
+
+  /** nextDoc() yields document 5 and then NO_MORE_DOCS. */
+  public void testNextDoc() throws IOException {
+    assertEquals("nextDoc first match failed", 5, mbs.nextDoc());
+    assertEquals("nextDoc second match failed", DocIdSetIterator.NO_MORE_DOCS, mbs.nextDoc());
+  }
+
+  /** advance(5) lands on document 5; repeating it exhausts the scorer. */
+  public void testAdvance() throws IOException {
+    assertEquals("advance(5) first match failed", 5, mbs.advance(5));
+    assertEquals("advance(5) second match failed", DocIdSetIterator.NO_MORE_DOCS, mbs.advance(5));
+  }
+
+  /** Checks the combined score reported for document 5. */
+  public void testScore() throws IOException {
+    assertEquals("nextDoc first match failed", 5, mbs.nextDoc());
+    assertEquals("score(5) failed", 0.982, mbs.score(), BM25TestUtils.EPSILON);
+  }
+
+  @Override
+  protected void tearDown() throws Exception {
+    try {
+      reader.close();
+    } finally {
+      // Always give the base class its turn, even if closing the reader failed.
+      super.tearDown();
+    }
+  }
+}
Index: contrib/bm/src/test/org/apache/lucene/bm/bool/ShouldBooleanScorerTest.java
===================================================================
--- contrib/bm/src/test/org/apache/lucene/bm/bool/ShouldBooleanScorerTest.java (revision 0)
+++ contrib/bm/src/test/org/apache/lucene/bm/bool/ShouldBooleanScorerTest.java (revision 0)
@@ -0,0 +1,76 @@
+package org.apache.lucene.bm.bool;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.search.DefaultSimilarity;
+import org.apache.lucene.search.Scorer;
+import org.apache.lucene.search.DocIdSetIterator;
+import org.apache.lucene.bm.BM25TestUtils;
+import org.apache.lucene.bm.bm25.BM25TermScorer;
+import org.apache.lucene.util.LuceneTestCase;
+
+import java.io.IOException;
+
+/**
+ * Tests {@link ShouldBooleanScorer} over two {@link BM25TermScorer}s built from
+ * {@link BM25TestUtils#getPresidentJohnQuery()} and
+ * {@link BM25TestUtils#getViceJohnQuery()}. The assertions expect documents
+ * 0, 1, 5, 8 and 9 to match.
+ */
+public class ShouldBooleanScorerTest extends LuceneTestCase {
+  private IndexReader reader;
+  private ShouldBooleanScorer sbs;
+
+  @Override
+  protected void setUp() throws Exception {
+    super.setUp();
+    reader = BM25TestUtils.initPresidentsAndVicesReader();
+    BM25TestUtils.loadBM25Parameters();
+    getNewScorer();
+  }
+
+  /** Creates a fresh, un-iterated scorer for each test. */
+  private void getNewScorer() throws IOException {
+    BM25TermScorer bmts1 = new BM25TermScorer(reader, BM25TestUtils.getPresidentJohnQuery(), new DefaultSimilarity());
+    BM25TermScorer bmts2 = new BM25TermScorer(reader, BM25TestUtils.getViceJohnQuery(), new DefaultSimilarity());
+    sbs = new ShouldBooleanScorer(new DefaultSimilarity(), new Scorer[]{bmts1, bmts2});
+  }
+
+  /** docID() is NO_MORE_DOCS before iteration, 5 after advance(5), NO_MORE_DOCS after advance(10). */
+  public void testDocId() throws IOException {
+    assertEquals("docId not NO_MORE_DOCS at start", DocIdSetIterator.NO_MORE_DOCS, sbs.docID());
+    sbs.advance(5);
+    assertEquals("docId not 5 after first advance() call", 5, sbs.docID());
+    sbs.advance(10);
+    assertEquals("docId not NO_MORE_DOCS after second advance() call", DocIdSetIterator.NO_MORE_DOCS, sbs.docID());
+  }
+
+  /** nextDoc() yields every expected match in order, then NO_MORE_DOCS. */
+  public void testNextDoc() throws IOException {
+    int[] matches = new int[]{0, 1, 5, 8, 9, DocIdSetIterator.NO_MORE_DOCS};
+    for (int i = 0; i < matches.length; i++) {
+      assertEquals("nextDoc " + i + " match failed", matches[i], sbs.nextDoc());
+    }
+  }
+
+  /** advance(9) lands on the last match; repeating it exhausts the scorer. */
+  public void testAdvance() throws IOException {
+    assertEquals("advance(9) first match failed", 9, sbs.advance(9));
+    assertEquals("advance(9) second match failed", DocIdSetIterator.NO_MORE_DOCS, sbs.advance(9));
+  }
+
+  /** Checks the score reported for document 0. */
+  public void testScore() throws IOException {
+    assertEquals("nextDoc first match failed", 0, sbs.nextDoc());
+    assertEquals("score(0) failed", 0.631, sbs.score(), BM25TestUtils.EPSILON);
+  }
+
+  @Override
+  protected void tearDown() throws Exception {
+    try {
+      reader.close();
+    } finally {
+      // Always give the base class its turn, even if closing the reader failed.
+      super.tearDown();
+    }
+  }
+}
Index: contrib/bm/src/test/org/apache/lucene/bm/bool/NotBooleanScorerTest.java
===================================================================
--- contrib/bm/src/test/org/apache/lucene/bm/bool/NotBooleanScorerTest.java (revision 0)
+++ contrib/bm/src/test/org/apache/lucene/bm/bool/NotBooleanScorerTest.java (revision 0)
@@ -0,0 +1,85 @@
+package org.apache.lucene.bm.bool;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.search.DefaultSimilarity;
+import org.apache.lucene.search.Scorer;
+import org.apache.lucene.search.DocIdSetIterator;
+import org.apache.lucene.bm.bm25.BM25TermScorer;
+import org.apache.lucene.bm.BM25TestUtils;
+import org.apache.lucene.util.LuceneTestCase;
+
+import java.io.IOException;
+
+/**
+ * Tests {@link NotBooleanScorer} over two {@link BM25TermScorer}s built from
+ * {@link BM25TestUtils#getPresidentJohnQuery()} and
+ * {@link BM25TestUtils#getViceJohnQuery()}, with a document count of
+ * {@link #NUM_DOCS}. The assertions expect the scorer to visit documents
+ * 2, 3, 4, 6 and 7 and to score each visited document 1.0.
+ */
+public class NotBooleanScorerTest extends LuceneTestCase {
+  /** Document count handed to the NotBooleanScorer constructor. */
+  private static final int NUM_DOCS = 10;
+
+  private IndexReader reader;
+  private NotBooleanScorer nbs;
+
+  @Override
+  protected void setUp() throws Exception {
+    super.setUp();
+    reader = BM25TestUtils.initPresidentsAndVicesReader();
+    BM25TestUtils.loadBM25Parameters();
+    getNewScorer();
+  }
+
+  /** Creates a fresh, un-iterated scorer for each test. */
+  private void getNewScorer() throws IOException {
+    BM25TermScorer bmts1 = new BM25TermScorer(reader, BM25TestUtils.getPresidentJohnQuery(), new DefaultSimilarity());
+    BM25TermScorer bmts2 = new BM25TermScorer(reader, BM25TestUtils.getViceJohnQuery(), new DefaultSimilarity());
+    nbs = new NotBooleanScorer(new DefaultSimilarity(), new Scorer[]{bmts1, bmts2}, NUM_DOCS);
+  }
+
+  /**
+   * docID() is -1 before iteration; advance(5) is expected to land on 6;
+   * advance(10) exhausts the scorer.
+   */
+  public void testDocId() throws IOException {
+    assertEquals("docId not -1 at start", -1, nbs.docID());
+    nbs.advance(5);
+    assertEquals("docId not 6 after advance(5) call", 6, nbs.docID());
+    nbs.advance(10);
+    assertEquals("docId not NO_MORE_DOCS after advance(10) call", DocIdSetIterator.NO_MORE_DOCS, nbs.docID());
+  }
+
+  /** nextDoc() yields every expected document in order, then NO_MORE_DOCS. */
+  public void testNextDoc() throws IOException {
+    int[] matches = new int[]{2, 3, 4, 6, 7, DocIdSetIterator.NO_MORE_DOCS};
+    for (int i = 0; i < matches.length; i++) {
+      assertEquals("nextDoc " + i + " match failed", matches[i], nbs.nextDoc());
+    }
+  }
+
+  /** Repeated advance() calls with the same target move on to the next document. */
+  public void testAdvance() throws IOException {
+    assertEquals("advance(6) first match failed", 6, nbs.advance(6));
+    assertEquals("advance(6) second match failed", 7, nbs.advance(6));
+    assertEquals("advance(7) first match failed", DocIdSetIterator.NO_MORE_DOCS, nbs.advance(7));
+  }
+
+  /** Checks the score reported for document 2. */
+  public void testScore() throws IOException {
+    assertEquals("nextDoc first match failed", 2, nbs.nextDoc());
+    assertEquals("score(2) failed", 1.0, nbs.score(), BM25TestUtils.EPSILON);
+  }
+
+  @Override
+  protected void tearDown() throws Exception {
+    try {
+      reader.close();
+    } finally {
+      // Always give the base class its turn, even if closing the reader failed.
+      super.tearDown();
+    }
+  }
+}
Index: contrib/bm/src/test/bm.properties
===================================================================
--- contrib/bm/src/test/bm.properties (revision 0)
+++ contrib/bm/src/test/bm.properties (revision 0)
@@ -0,0 +1,4 @@
+president
+2.5f
+vice
+2.5f
Index: contrib/bm/src/java/org/apache/lucene/bm/bm25/BM25BooleanScorer.java
===================================================================
--- contrib/bm/src/java/org/apache/lucene/bm/bm25/BM25BooleanScorer.java (revision 0)
+++ contrib/bm/src/java/org/apache/lucene/bm/bm25/BM25BooleanScorer.java (revision 0)
@@ -0,0 +1,272 @@
+package org.apache.lucene.bm.bm25;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+
+import org.apache.lucene.bm.bm25.BM25BooleanQuery.BooleanTermQuery;
+import org.apache.lucene.bm.bm25f.BM25FTermScorer;
+import org.apache.lucene.bm.bool.AbstractBooleanScorer;
+import org.apache.lucene.bm.bool.MatchAllBooleanScorer;
+import org.apache.lucene.bm.bool.MustBooleanScorer;
+import org.apache.lucene.bm.bool.NotBooleanScorer;
+import org.apache.lucene.bm.bool.ShouldBooleanScorer;
+import org.apache.lucene.search.Scorer;
+import org.apache.lucene.search.Similarity;
+import org.apache.lucene.index.IndexReader;
+
+/**
+ * BM25BooleanScorer computes the relevance of a document for a boolean
+ * expression made of SHOULD, MUST and NOT term clauses.<BR>
+ * Each group of clauses is wrapped in its own boolean sub-scorer; a group with
+ * no clauses is replaced by a {@link MatchAllBooleanScorer}. A document is
+ * returned when both the MUST and the NOT sub-scorers are positioned on it,
+ * and its score is the sum of the MUST and SHOULD sub-scorer scores.
+ */
+public class BM25BooleanScorer extends Scorer {
+
+  private AbstractBooleanScorer shouldBooleanScorer;
+  private AbstractBooleanScorer mustBooleanScorer;
+  private AbstractBooleanScorer notBooleanScorer;
+  private boolean hasMoreShould = false;
+  private boolean hasMoreMust = false;
+  private boolean hasMoreNot = false;
+  private int doc = -1;
+  // NOTE(review): this is reader.numDocs(); if the index has deletions the
+  // highest doc ID may be larger (maxDoc() - 1) - confirm which bound is intended.
+  private int ndocs;
+  private boolean initialized = false;
+
+  /**
+   * Build a BM25BooleanScorer composed of atoms that are BM25TermScorers.
+   * The scorer will give the score for a boolean formula combining the subscorers.
+   * @param reader index reader used to build the term scorers
+   * @param should - clauses appearing as SHOULD (may be null or empty)
+   * @param must - clauses appearing as MUST (may be null or empty)
+   * @param not - clauses appearing as NOT (may be null or empty)
+   * @param similarity similarity handed to every sub-scorer
+   * @throws IOException if a term scorer cannot be created
+   */
+  public BM25BooleanScorer(IndexReader reader, BooleanTermQuery[] should,
+      BooleanTermQuery[] must, BooleanTermQuery[] not,
+      Similarity similarity) throws IOException {
+    super(similarity);
+    this.ndocs = reader.numDocs();
+    this.initSubScorers(reader, should, must, not, similarity, false, null, null, null);
+  }
+
+  /**
+   * Build a BM25BooleanScorer composed of atoms that are BM25FTermScorers.
+   * The scorer will give the score for a boolean formula combining the subscorers.
+   * Each subscorer combines the fields' scores using the given boosts and bParams.
+   * @param reader index reader used to build the term scorers
+   * @param should - clauses appearing as SHOULD (may be null or empty)
+   * @param must - clauses appearing as MUST (may be null or empty)
+   * @param not - clauses appearing as NOT (may be null or empty)
+   * @param similarity similarity handed to every sub-scorer
+   * @param fields field names passed to each BM25FTermScorer
+   * @param boosts boosts passed to each BM25FTermScorer
+   * @param bParams b parameters passed to each BM25FTermScorer
+   * @throws IOException if a term scorer cannot be created
+   */
+  public BM25BooleanScorer(IndexReader reader, BooleanTermQuery[] should,
+      BooleanTermQuery[] must, BooleanTermQuery[] not,
+      Similarity similarity, String[] fields, float[] boosts,
+      float[] bParams) throws IOException {
+    super(similarity);
+    this.ndocs = reader.numDocs();
+    this.initSubScorers(reader, should, must, not, similarity, true, fields, boosts, bParams);
+  }
+
+  /**
+   * Creates the SHOULD, MUST and NOT sub-scorers (in that order). A null or
+   * empty clause group is replaced by a MatchAllBooleanScorer over ndocs documents.
+   */
+  private void initSubScorers(IndexReader reader, BooleanTermQuery[] should,
+      BooleanTermQuery[] must, BooleanTermQuery[] not, Similarity similarity,
+      boolean useBM25F, String[] fields, float[] boosts, float[] bParams)
+      throws IOException {
+    if (should != null && should.length > 0) {
+      this.shouldBooleanScorer = new ShouldBooleanScorer(similarity,
+          buildTermScorers(reader, should, similarity, useBM25F, fields, boosts, bParams));
+    } else {
+      this.shouldBooleanScorer = new MatchAllBooleanScorer(similarity, this.ndocs);
+    }
+
+    if (must != null && must.length > 0) {
+      this.mustBooleanScorer = new MustBooleanScorer(similarity,
+          buildTermScorers(reader, must, similarity, useBM25F, fields, boosts, bParams));
+    } else {
+      this.mustBooleanScorer = new MatchAllBooleanScorer(similarity, this.ndocs);
+    }
+
+    if (not != null && not.length > 0) {
+      this.notBooleanScorer = new NotBooleanScorer(similarity,
+          buildTermScorers(reader, not, similarity, useBM25F, fields, boosts, bParams),
+          this.ndocs);
+    } else {
+      this.notBooleanScorer = new MatchAllBooleanScorer(similarity, this.ndocs);
+    }
+  }
+
+  /**
+   * Builds one term scorer per clause: a BM25FTermScorer when useBM25F is
+   * true, a BM25TermScorer otherwise.
+   */
+  private static Scorer[] buildTermScorers(IndexReader reader,
+      BooleanTermQuery[] clauses, Similarity similarity, boolean useBM25F,
+      String[] fields, float[] boosts, float[] bParams) throws IOException {
+    Scorer[] scorers = new Scorer[clauses.length];
+    for (int i = 0; i < scorers.length; i++) {
+      if (useBM25F) {
+        scorers[i] = new BM25FTermScorer(reader, clauses[i].termQuery,
+            fields, boosts, bParams, similarity);
+      } else {
+        scorers[i] = new BM25TermScorer(reader, clauses[i].termQuery,
+            similarity);
+      }
+    }
+    return scorers;
+  }
+
+  /*
+   * (non-Javadoc)
+   *
+   * @see org.apache.lucene.search.Scorer#docID()
+   */
+  @Override
+  public int docID() {
+    return this.doc;
+  }
+
+  /** Positions every sub-scorer on its first document. */
+  private void init() throws IOException {
+    this.hasMoreShould = (this.shouldBooleanScorer.nextDoc() != NO_MORE_DOCS);
+    this.hasMoreMust = (this.mustBooleanScorer.nextDoc() != NO_MORE_DOCS);
+    this.hasMoreNot = (this.notBooleanScorer.nextDoc() != NO_MORE_DOCS);
+  }
+
+  /** Moves every sub-scorer that sits on the current document to its next one. */
+  private void doNext() throws IOException {
+    if (this.hasMoreShould && this.shouldBooleanScorer.docID() == this.doc)
+      this.hasMoreShould = (this.shouldBooleanScorer.nextDoc() != NO_MORE_DOCS);
+    if (this.hasMoreMust && this.mustBooleanScorer.docID() == this.doc)
+      this.hasMoreMust = (this.mustBooleanScorer.nextDoc() != NO_MORE_DOCS);
+    if (this.hasMoreNot && this.notBooleanScorer.docID() == this.doc)
+      this.hasMoreNot = (this.notBooleanScorer.nextDoc() != NO_MORE_DOCS);
+  }
+
+  /** Marks this scorer as exhausted and returns NO_MORE_DOCS. */
+  private int exhaust() {
+    this.doc = NO_MORE_DOCS;
+    return NO_MORE_DOCS;
+  }
+
+  /*
+   * (non-Javadoc)
+   *
+   * @see org.apache.lucene.search.Scorer#nextDoc()
+   */
+  @Override
+  public int nextDoc() throws IOException {
+    if (this.doc == NO_MORE_DOCS)
+      return NO_MORE_DOCS;
+
+    if (!this.initialized) {
+      this.initialized = true;
+      this.init();
+    } else {
+      this.doNext();
+    }
+
+    // Try each candidate document in turn, pulling lagging sub-scorers forward.
+    while (this.doc < this.ndocs - 1) {
+      this.doc++;
+
+      if (!this.hasMoreMust)
+        return this.exhaust();
+      if (this.mustBooleanScorer.docID() < this.doc)
+        this.hasMoreMust = (this.mustBooleanScorer.nextDoc() != NO_MORE_DOCS);
+
+      if (!this.hasMoreNot)
+        return this.exhaust();
+      if (this.notBooleanScorer.docID() < this.doc)
+        this.hasMoreNot = (this.notBooleanScorer.nextDoc() != NO_MORE_DOCS);
+
+      if (this.hasMoreShould && this.shouldBooleanScorer.docID() < this.doc)
+        this.hasMoreShould = (this.shouldBooleanScorer.nextDoc() != NO_MORE_DOCS);
+
+      if (!(this.hasMoreMust && this.hasMoreNot))
+        return this.exhaust();
+
+      // Both sub-scorers must sit on the candidate itself. Comparing them only
+      // with each other would also accept a candidate when both are positioned
+      // on the same document further ahead.
+      if (this.mustBooleanScorer.docID() == this.doc
+          && this.notBooleanScorer.docID() == this.doc)
+        return this.doc;
+    }
+
+    return this.exhaust();
+  }
+
+  /*
+   * (non-Javadoc)
+   *
+   * @see org.apache.lucene.search.Scorer#score()
+   */
+  @Override
+  public float score() throws IOException {
+    float result = 0f;
+    if (this.hasMoreMust && this.mustBooleanScorer.docID() == doc)
+      result += this.mustBooleanScorer.score();
+
+    if (this.hasMoreShould && this.shouldBooleanScorer.docID() == doc)
+      result += this.shouldBooleanScorer.score();
+
+    return result;
+  }
+
+  /**
+   * Advances by repeatedly calling {@link #nextDoc()} until a document with
+   * an ID of at least target is reached. At least one step is always taken.
+   * Passing NO_MORE_DOCS exhausts the scorer immediately.
+   */
+  @Override
+  public int advance(int target) throws IOException {
+    if (target == NO_MORE_DOCS)
+      return this.exhaust(); // keep docID() consistent with the returned value
+    while ((this.nextDoc() != NO_MORE_DOCS) && this.docID() < target) {
+    }
+
+    return this.docID();
+  }
+
+}
Index: contrib/bm/src/java/org/apache/lucene/bm/bm25/BM25SingleBooleanScorer.java
===================================================================
--- contrib/bm/src/java/org/apache/lucene/bm/bm25/BM25SingleBooleanScorer.java (revision 0)
+++ contrib/bm/src/java/org/apache/lucene/bm/bm25/BM25SingleBooleanScorer.java (revision 0)
@@ -0,0 +1,137 @@
+package org.apache.lucene.bm.bm25;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.search.BooleanClause;
+import org.apache.lucene.search.Scorer;
+import org.apache.lucene.search.Similarity;
+import org.apache.lucene.bm.bm25.BM25BooleanQuery.BooleanTermQuery;
+import org.apache.lucene.bm.bm25f.BM25FTermScorer;
+import org.apache.lucene.bm.bool.AbstractBooleanScorer;
+import org.apache.lucene.bm.bool.MustBooleanScorer;
+import org.apache.lucene.bm.bool.NotBooleanScorer;
+import org.apache.lucene.bm.bool.ShouldBooleanScorer;
+
+/**
+ * BM25SingleBooleanScorer calculates the total relevance value for a boolean
+ * expression that has just one common operator (AND, OR, NOT) for all terms.<BR>
+ * The operator is read from the first clause only; all clauses are combined
+ * with it. Iteration and scoring are delegated to the matching boolean scorer.
+ */
+public class BM25SingleBooleanScorer extends Scorer {
+
+  private final AbstractBooleanScorer booleanScorer;
+  /** Set once advance(NO_MORE_DOCS) has been requested. */
+  private boolean exhausted = false;
+
+  /**
+   * Fill scorer array with BM25TermScorers.
+   * @param reader index reader used to build the term scorers
+   * @param termQuery clauses to combine; must contain at least one clause
+   * @param similarity similarity handed to every sub-scorer
+   * @throws IOException if a term scorer cannot be created
+   * @throws IllegalArgumentException if termQuery is null or empty
+   */
+  public BM25SingleBooleanScorer(IndexReader reader,
+      BooleanTermQuery[] termQuery, Similarity similarity)
+      throws IOException {
+    super(similarity);
+    requireClauses(termQuery);
+
+    Scorer[] scorer = new Scorer[termQuery.length];
+    for (int i = 0; i < scorer.length; i++) {
+      scorer[i] = new BM25TermScorer(reader, termQuery[i].termQuery,
+          similarity);
+    }
+
+    this.booleanScorer = combine(reader, termQuery[0].occur, similarity, scorer);
+  }
+
+  /**
+   * Fill scorer array with BM25FTermScorers using the fields, boosts and bParams parameters.
+   * @param reader index reader used to build the term scorers
+   * @param termQuery clauses to combine; must contain at least one clause
+   * @param similarity similarity handed to every sub-scorer
+   * @param fields field names passed to each BM25FTermScorer
+   * @param boosts boosts passed to each BM25FTermScorer
+   * @param bParams b parameters passed to each BM25FTermScorer
+   * @throws IOException if a term scorer cannot be created
+   * @throws IllegalArgumentException if termQuery is null or empty
+   */
+  public BM25SingleBooleanScorer(IndexReader reader,
+      BooleanTermQuery[] termQuery, Similarity similarity,
+      String[] fields, float[] boosts, float[] bParams)
+      throws IOException {
+    super(similarity);
+    requireClauses(termQuery);
+
+    Scorer[] scorer = new Scorer[termQuery.length];
+    for (int i = 0; i < scorer.length; i++) {
+      scorer[i] = new BM25FTermScorer(reader, termQuery[i].termQuery,
+          fields, boosts, bParams, similarity);
+    }
+
+    this.booleanScorer = combine(reader, termQuery[0].occur, similarity, scorer);
+  }
+
+  /** Rejects a missing or empty clause array, since the operator is read from clause 0. */
+  private static void requireClauses(BooleanTermQuery[] termQuery) {
+    if (termQuery == null || termQuery.length == 0)
+      throw new IllegalArgumentException(
+          "BM25SingleBooleanScorer needs at least one term clause");
+  }
+
+  /**
+   * Picks the boolean scorer for the given operator: MUST and SHOULD map to
+   * their scorers, anything else is treated as NOT.
+   */
+  private static AbstractBooleanScorer combine(IndexReader reader,
+      BooleanClause.Occur occur, Similarity similarity, Scorer[] scorer)
+      throws IOException {
+    if (occur == BooleanClause.Occur.MUST)
+      return new MustBooleanScorer(similarity, scorer);
+    if (occur == BooleanClause.Occur.SHOULD)
+      return new ShouldBooleanScorer(similarity, scorer);
+    return new NotBooleanScorer(similarity, scorer, reader.numDocs());
+  }
+
+  /*
+   * (non-Javadoc)
+   *
+   * @see org.apache.lucene.search.Scorer#docID()
+   */
+  @Override
+  public int docID() {
+    return exhausted ? NO_MORE_DOCS : booleanScorer.docID();
+  }
+
+  @Override
+  public int nextDoc() throws IOException {
+    if (exhausted)
+      return NO_MORE_DOCS;
+    return booleanScorer.nextDoc();
+  }
+
+  /*
+   * (non-Javadoc)
+   *
+   * @see org.apache.lucene.search.Scorer#score()
+   */
+  @Override
+  public float score() throws IOException {
+    return booleanScorer.score();
+  }
+
+  /**
+   * Advances by repeatedly calling {@link #nextDoc()} until a document with
+   * an ID of at least target is reached. At least one step is always taken.
+   * Passing NO_MORE_DOCS exhausts the scorer immediately.
+   */
+  @Override
+  public int advance(int target) throws IOException {
+    if (target == NO_MORE_DOCS) {
+      // Remember the exhaustion so docID()/nextDoc() agree with the returned value.
+      this.exhausted = true;
+      return NO_MORE_DOCS;
+    }
+    while ((this.nextDoc() != NO_MORE_DOCS) && this.docID() < target) {
+    }
+
+    return this.docID();
+  }
+}
Index: contrib/bm/src/java/org/apache/lucene/bm/bm25/BM25Parameters.java
===================================================================
--- contrib/bm/src/java/org/apache/lucene/bm/bm25/BM25Parameters.java (revision 0)
+++ contrib/bm/src/java/org/apache/lucene/bm/bm25/BM25Parameters.java (revision 0)
@@ -0,0 +1,131 @@
+package org.apache.lucene.bm.bm25;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.BufferedReader;
+import java.io.FileReader;
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.Map;
+
+/**
+ * Parameters needed to calculate the BM25 relevance: the length
+ * normalization parameter b, the k1 parameter and the average length of each
+ * field. All values are held in static fields, so they are shared by every
+ * user of this class.
+ */
+public class BM25Parameters {
+
+  private static float B = 0.75f;
+  private static float K1 = 2f;
+
+  /** Average length registered for each field name. */
+  protected static Map<String, Float> avgLength = new HashMap<String, Float>();
+
+  protected BM25Parameters() {
+  }
+
+  /**
+   * @return the BM25 length normalization parameter b, generally b =[0,1], by
+   *         default is equals to 0.75
+   */
+  public static float getB() {
+    return B;
+  }
+
+  /**
+   * Set the BM25 length normalization parameter
+   *
+   * @param b
+   *            the b parameter, generally b =[0,1], by default is equals to
+   *            0.75
+   */
+  public static void setB(float b) {
+    B = b;
+  }
+
+  /**
+   *
+   * @return the k1 parameter, by default is equivalent to 2
+   */
+  public static float getK1() {
+    return K1;
+  }
+
+  /**
+   * Set the k1 parameter, by default is equivalent to 2
+   *
+   * @param k1
+   *            the new k1 value
+   */
+  public static void setK1(float k1) {
+    K1 = k1;
+  }
+
+  /**
+   * Load field average length from a file with the next format: <BR>
+   * FIELD_NAME <BR>
+   * FLOAT_VALUE <BR>
+   * ANOTHER_FIELD_NAME <BR>
+   * ANOTHER_FIELD_VALUE<BR>
+   * for example:<BR>
+   * CONTENT<BR>
+   * 459.2903f<BR>
+   * ANCHOR<BR>
+   * 84.55523f<BR>
+   * Entries read before a failure stay registered.
+   *
+   * @param path
+   *            absolute path of the file
+   * @throws NumberFormatException
+   *             if a value line is not a parsable float
+   * @throws IOException
+   *             if the file cannot be read, or if a field name is not
+   *             followed by a value line
+   */
+  public static void load(String path) throws NumberFormatException,
+      IOException {
+    BufferedReader in = new BufferedReader(new FileReader(path));
+    try {
+      String field;
+      while (null != (field = in.readLine())) {
+        String value = in.readLine();
+        if (value == null) {
+          // Odd number of lines: the last field name has no value.
+          throw new IOException("Missing average length for field '"
+              + field + "' in " + path);
+        }
+        BM25Parameters.setAverageLength(field, Float.parseFloat(value));
+      }
+    } finally {
+      // Release the file handle even when parsing fails.
+      in.close();
+    }
+  }
+
+  /**
+   * Set the average length for the field 'field'
+   *
+   * @param field
+   *            field name
+   * @param avg
+   *            average length of the field
+   */
+  public static void setAverageLength(String field, float avg) {
+    BM25Parameters.avgLength.put(field, avg);
+  }
+
+  /**
+   * Return the field 'field' average length
+   *
+   * @param field
+   *            field name
+   * @return field average length
+   * @throws NullPointerException
+   *             if no average length has been set for the field
+   */
+  public static float getAverageLength(String field) {
+    Float avg = BM25Parameters.avgLength.get(field);
+    if (avg == null) {
+      throw new NullPointerException("No average length set for field '"
+          + field + "'; call setAverageLength or load first");
+    }
+    return avg.floatValue();
+  }
+
+}
Index: contrib/bm/src/java/org/apache/lucene/bm/bm25/BM25BooleanQuery.java
===================================================================
--- contrib/bm/src/java/org/apache/lucene/bm/bm25/BM25BooleanQuery.java (revision 0)
+++ contrib/bm/src/java/org/apache/lucene/bm/bm25/BM25BooleanQuery.java (revision 0)
@@ -0,0 +1,284 @@
+package org.apache.lucene.bm.bm25;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Set;
+
+import org.apache.lucene.util.Version;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.index.CorruptIndexException;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.queryParser.ParseException;
+import org.apache.lucene.queryParser.QueryParser;
+import org.apache.lucene.search.BooleanClause;
+import org.apache.lucene.search.BooleanQuery;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.Searcher;
+import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.search.Weight;
+import org.apache.lucene.bm.bm25f.BM25FParameters;
+
+
+/**
+ * Query based on the BM25 family of ranking functions.<BR>
+ * <B>WARNING:Searcher similarity will be ignored, in order to calculate the
+ * relevance BM25Similarity or BM25FSimilarity are used.</B><BR>
+ * <B>WARNING:BM25Parameters.setAverageLength or
+ * BM25FParameters.setAverageLength must be invoked in order to set the average
+ * length for the field(s).</B>
+ *
+ *
+ * @see BM25Parameters
+ * @see BM25FParameters
+ */
+@SuppressWarnings("serial")
+public class BM25BooleanQuery extends Query {
+
+ private List<BooleanTermQuery> mustBoolTermQueries = new ArrayList<BooleanTermQuery>();
+ private List<BooleanTermQuery> shouldBoolTermQueries = new ArrayList<BooleanTermQuery>();
+ private List<BooleanTermQuery> notBoolTermQueries = new ArrayList<BooleanTermQuery>();
+ private String[] fields = null;
+ private float[] boosts;
+ private float[] bParams;
+
+ /**
+ * Build a query that will use BM25 function ranking in the field passed as
+ * parameter. Every term of the parsed query is searched in 'field'.<BR>
+ * <B>WARNING:BM25Parameters.setAverageLength must be invoked in order to
+ * set the average length for the field 'field'.</B>
+ *
+ * @param query The query String
+ * @param field The field to search
+ * @param analyzer Analyzer used to parse the query String
+ * @throws ParseException if the query String cannot be parsed
+ * @see BM25Parameters
+ */
+ @SuppressWarnings("unchecked")
+ public BM25BooleanQuery(String query, String field, Analyzer analyzer)
+ throws ParseException {
+ QueryParser qp = new QueryParser(Version.LUCENE_CURRENT, field, analyzer);
+ Query q = qp.parse(query);
+
+ if (q instanceof BooleanQuery) {
+ List<BooleanClause> clauses = ((BooleanQuery) q).clauses();
+ for (int i = 0; i < clauses.size(); i++) {
+ Set<Term> terms = new HashSet<Term>();
+ clauses.get(i).getQuery().extractTerms(terms);
+ Iterator<Term> iter = terms.iterator();
+ while (iter.hasNext()) {
+ BooleanTermQuery boolTerm = new BooleanTermQuery(
+ new TermQuery(new Term(field, iter.next().text())),
+ clauses.get(i).getQuery().getBoost(), clauses
+ .get(i).getOccur());
+ this.addClause(boolTerm);
+ }
+ }
+ } else {
+ // A non-boolean query (single term, phrase, ...) contributes MUST terms.
+ // Its boost (e.g. "term^2") is carried over, as in the boolean branch.
+ Set<Term> terms = new HashSet<Term>();
+ q.extractTerms(terms);
+ Iterator<Term> iter = terms.iterator();
+ while (iter.hasNext()) {
+ this.mustBoolTermQueries.add(new BooleanTermQuery(
+ new TermQuery(new Term(field, iter.next().text())),
+ q.getBoost(), BooleanClause.Occur.MUST));
+ }
+
+ }
+ }
+
+ /**
+ * Build a query that will use BM25F function ranking. Every field gets a
+ * boost factor of 1 and a length normalization parameter b_field of 0.75,
+ * and both arrays are also published through BM25FParameters. <BR>
+ * <B>WARNING:BM25FParameters.setAverageLength must be invoked in order to
+ * set the average length for each field in 'fields'.</B>
+ *
+ * @param query The query String
+ * @param fields The fields to search
+ * @param analyzer Analyzer used to parse the query String
+ * @throws ParseException
+ * @throws IOException
+ * @see BM25FParameters
+ */
+ public BM25BooleanQuery(String query, String[] fields, Analyzer analyzer)
+ throws ParseException, IOException {
+ this(query, "ALL_FIELDS", analyzer);
+ float[] defaultBoosts = new float[fields.length];
+ float[] defaultBParams = new float[fields.length];
+ java.util.Arrays.fill(defaultBoosts, 1f);
+ java.util.Arrays.fill(defaultBParams, 0.75f);
+
+ this.fields = fields;
+ this.boosts = defaultBoosts;
+ this.bParams = defaultBParams;
+ BM25FParameters.setBoost(defaultBoosts);
+ BM25FParameters.setBParam(defaultBParams);
+ }
+
+ /**
+ *
+ * @param query
+ * The query String
+ * @param fields
+ * The fields to search
+ * @param analyzer
+ * Analyzer used to parse the query String
+ * @throws ParseException
+ * @throws IOException
+ */
+
+ /**
+ * Build a query that will use BM25F function ranking. The boosts and bParams arrays
+ * are stored as given and also published through BM25FParameters. <BR>
+ * <B>WARNING:BM25FParameters.setAverageLength must be invoked in order to
+ * set the average length for each field in 'fields'.</B>
+ *
+ * @param query The query String
+ * @param fields The fields to search
+ * @param analyzer Analyzer used to parse the query String
+ * @param boosts The boost factor applied to the fields array
+ * @param bParams The length normalization factors applied to the fields array
+ * @throws ParseException
+ * @throws IOException
+ * @see BM25FParameters
+ */
+ public BM25BooleanQuery(String query, String[] fields, Analyzer analyzer,
+ float[] boosts, float[] bParams) throws ParseException, IOException {
+ this(query, "ALL_FIELDS", analyzer);
+ BM25FParameters.setBoost(boosts);
+ BM25FParameters.setBParam(bParams);
+ this.fields = fields;
+ this.boosts = boosts;
+ this.bParams = bParams;
+ }
+
+ /**
+ * Build the Weight for this query.
+ *
+ * @param searcher ignored; the BM25 similarities are used instead of the Searcher's
+ * @return BM25BooleanWeight corresponding to BM25 or BM25F, depending on whether fields are initialized
+ * @throws IOException
+ */
+ @Override
+ public Weight weight(Searcher searcher) throws IOException {
+ // snapshot the clause lists as arrays: should, must, not
+ BooleanTermQuery[] should = this.shouldBoolTermQueries
+ .toArray(new BooleanTermQuery[this.shouldBoolTermQueries.size()]);
+ BooleanTermQuery[] must = this.mustBoolTermQueries
+ .toArray(new BooleanTermQuery[this.mustBoolTermQueries.size()]);
+ BooleanTermQuery[] not = this.notBoolTermQueries
+ .toArray(new BooleanTermQuery[this.notBoolTermQueries.size()]);
+
+ if (this.fields != null) {
+ // BM25F: several fields, each with its own boost and b parameter
+ return new BM25BooleanWeight(should, must, not, this.fields,
+ this.boosts, this.bParams);
+ }
+ // BM25: 'fields' was never set, so the single-field weight is built
+ return new BM25BooleanWeight(should, must, not);
+ }
+
+ private void addClause(BooleanTermQuery boolTerm) {
+ // pick the list matching the clause occurrence; anything that is neither
+ // MUST nor SHOULD is treated as a prohibited term
+ List<BooleanTermQuery> target = this.notBoolTermQueries;
+ if (boolTerm.occur == BooleanClause.Occur.MUST) target = this.mustBoolTermQueries;
+ else if (boolTerm.occur == BooleanClause.Occur.SHOULD) target = this.shouldBoolTermQueries;
+ target.add(boolTerm);
+ }
+
+ /** Lists the MUST, SHOULD and NOT terms in that order; NOT terms get no separator. */
+ @Override
+ public String toString() {
+ StringBuilder sb = new StringBuilder();
+ for (BooleanTermQuery must : this.mustBoolTermQueries) {
+ sb.append(must.toString()).append(" ");
+ }
+ for (BooleanTermQuery should : this.shouldBoolTermQueries) {
+ sb.append(should.toString()).append(" ");
+ }
+ for (BooleanTermQuery not : this.notBoolTermQueries) {
+ sb.append(not.toString());
+ }
+ return sb.toString();
+ }
+
+ @Override
+ public String toString(String field) {
+ return toString(); // the field argument is ignored
+ }
+
+ /** A TermQuery together with the BooleanClause.Occur of the clause it came from. */
+ public class BooleanTermQuery {
+ TermQuery termQuery;
+ BooleanClause.Occur occur;
+
+ /** Wraps termQuery, keeping whatever boost it already carries. */
+ public BooleanTermQuery(TermQuery termQuery, BooleanClause.Occur occur) {
+ this.occur = occur;
+ this.termQuery = termQuery;
+ }
+
+ /** Wraps termQuery and sets its boost to the given value. */
+ public BooleanTermQuery(TermQuery termQuery, float boost,
+ BooleanClause.Occur occur) {
+ this(termQuery, occur);
+ termQuery.setBoost(boost);
+ }
+
+ public TermQuery getTermQuery() {
+ return this.termQuery;
+ }
+
+ public float getBoost() {
+ return this.termQuery.getBoost();
+ }
+
+ public void setTermQuery(TermQuery termQuery) {
+ this.termQuery = termQuery;
+ }
+
+ public BooleanClause.Occur getOccur() {
+ return this.occur;
+ }
+
+ public void setOccur(BooleanClause.Occur occur) {
+ this.occur = occur;
+ }
+
+ @Override
+ public String toString() {
+ return "(" + this.occur + "(" + this.getTermQuery().getTerm() + "^" + this.getBoost() + "))";
+ }
+ }
+
+ public static void main(String args[]) throws ParseException,
+ CorruptIndexException, IOException {
+ // smoke test: construction throws ParseException if "| KK" cannot be parsed
+ new BM25BooleanQuery("| KK", "CONTENT", new StandardAnalyzer(Version.LUCENE_CURRENT));
+ }
+
+}
Index: contrib/bm/src/java/org/apache/lucene/bm/bm25/BM25TermScorer.java
===================================================================
--- contrib/bm/src/java/org/apache/lucene/bm/bm25/BM25TermScorer.java (revision 0)
+++ contrib/bm/src/java/org/apache/lucene/bm/bm25/BM25TermScorer.java (revision 0)
@@ -0,0 +1,102 @@
+package org.apache.lucene.bm.bm25;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.TermDocs;
+import org.apache.lucene.search.*;
+
+/**
+ * Calculate the relevance value of a single term applying BM25 function
+ * ranking. The {@link BM25Parameters} k1, and b are used.<BR>
+ *
+ *
+ * @see BM25Parameters
+ */
+public class BM25TermScorer extends Scorer {
+
+ private TermQuery term;
+ private IndexReader reader;
+ private TermDocs termDocs;
+ private float idf;
+ private float avgLength;
+ private byte[] norm;
+ private float b;
+ private float k1;
+ private int doc = -1;
+ /** Caches idf, norms, average length and the b/k1 parameters for the field of the term. */
+ public BM25TermScorer(IndexReader reader, TermQuery term, Similarity similarity) throws IOException {
+ super(similarity);
+ this.reader = reader;
+ this.term = term;
+ final String field = term.getTerm().field();
+ this.idf = this.getSimilarity().idf(reader.docFreq(term.getTerm()), reader.numDocs());
+ this.norm = reader.norms(field);
+ this.avgLength = BM25Parameters.getAverageLength(field);
+ this.b = BM25Parameters.getB();
+ this.k1 = BM25Parameters.getK1();
+ this.termDocs = reader.termDocs(term.getTerm());
+ }
+
+ @Override
+ public int docID() {
+ return this.doc; // -1 until nextDoc() is called, NO_MORE_DOCS once exhausted
+ }
+
+
+
+ @Override
+ public int nextDoc() throws IOException {
+ if (doc == NO_MORE_DOCS) {
+ return doc; // already exhausted: termDocs is closed and must not be touched again
+ }
+ if (this.termDocs.next()) {
+ return doc = this.termDocs.doc();
+ }
+ this.termDocs.close();
+ return doc = NO_MORE_DOCS;
+ }
+
+ @Override
+ public float score() throws IOException {
+ // document length is derived from the decoded norm as 1 / norm^2
+ float decodedNorm = this.getSimilarity().decodeNormValue(this.norm[this.docID()]);
+ float docLength = 1 / (decodedNorm * decodedNorm);
+
+ // length normalization: b * (docLength / avgLength) + (1 - b)
+ float lengthNorm = this.b * (docLength / this.avgLength);
+ lengthNorm += 1 - this.b;
+
+ // boosted term frequency, length-normalized and then saturated by k1
+ float tf = (this.term.getBoost() * this.termDocs.freq()) / lengthNorm;
+ tf /= (tf + this.k1);
+
+ return tf * this.idf;
+ }
+ @Override
+ public int advance(int target) throws IOException {
+ if (target == NO_MORE_DOCS) {
+ if (this.doc != NO_MORE_DOCS) this.termDocs.close(); // not yet closed by nextDoc()
+ return this.doc = NO_MORE_DOCS; // keep docID() in sync with the returned value
+ }
+ while (this.nextDoc() != NO_MORE_DOCS && this.docID() < target) { }
+ return this.docID();
+ }
+}
Index: contrib/bm/src/java/org/apache/lucene/bm/bm25/BM25Similarity.java
===================================================================
--- contrib/bm/src/java/org/apache/lucene/bm/bm25/BM25Similarity.java (revision 0)
+++ contrib/bm/src/java/org/apache/lucene/bm/bm25/BM25Similarity.java (revision 0)
@@ -0,0 +1,106 @@
+package org.apache.lucene.bm.bm25;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.search.Similarity;
+
+/**
+ * Similarity for BM25 ranking functions family.<BR>
+ * This class only gives a BM25 meaning to public <I>float idf(int docFreq, int numDocs)</I>;
+ * the other methods return trivial values and are never invoked from the Scorers.<BR>
+ * IDF is implemented as next:<BR>
+ * log ((N-n+0.5)/(n+0.5))<BR>
+ * where n = docFreq(term) and N = numDocs().
+ *
+ *
+ *
+ */
+ public class BM25Similarity extends Similarity {
+
+ /**
+ * Serialization version.
+ */
+ private static final long serialVersionUID = 1L;
+
+ /**
+ * Return the fraction overlap / maxOverlap.
+ *
+ * @see org.apache.lucene.search.Similarity#coord(int, int)
+ */
+ @Override
+ public float coord(int overlap, int maxOverlap) {
+ return ((float) overlap) / ((float) maxOverlap);
+ }
+
+ /**
+ * Return the BM25 idf, log((N - n + 0.5) / (n + 0.5)) with n = docFreq and N = numDocs.
+ *
+ * @see org.apache.lucene.search.Similarity#idf(int, int)
+ */
+ @Override
+ public float idf(int docFreq, int numDocs) {
+ float numerator = numDocs - docFreq + 0.5f;
+ float denominator = docFreq + 0.5f;
+ return (float) Math.log(numerator / denominator);
+ }
+
+ /**
+ * Always return 1.
+ *
+ * @see org.apache.lucene.search.Similarity#lengthNorm(java.lang.String,
+ * int)
+ */
+ @Override
+ public float lengthNorm(String fieldName, int numTokens) {
+ return 1;
+ }
+
+ /**
+ * Always return 1.
+ *
+ * @see org.apache.lucene.search.Similarity#queryNorm(float)
+ */
+ @Override
+ public float queryNorm(float sumOfSquaredWeights) {
+
+ return 1;
+ }
+
+ /**
+ * Always return 1.
+ *
+ * @see org.apache.lucene.search.Similarity#sloppyFreq(int)
+ */
+ @Override
+ public float sloppyFreq(int distance) {
+
+ return 1;
+ }
+
+ /**
+ * Return the raw frequency unchanged.
+ *
+ * @see org.apache.lucene.search.Similarity#tf(float)
+ */
+ @Override
+ public float tf(float freq) {
+
+ return freq;
+ }
+
+ }
Index: contrib/bm/src/java/org/apache/lucene/bm/bm25/BM25BooleanWeight.java
===================================================================
--- contrib/bm/src/java/org/apache/lucene/bm/bm25/BM25BooleanWeight.java (revision 0)
+++ contrib/bm/src/java/org/apache/lucene/bm/bm25/BM25BooleanWeight.java (revision 0)
@@ -0,0 +1,164 @@
+package org.apache.lucene.bm.bm25;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.search.Explanation;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.Scorer;
+import org.apache.lucene.search.Weight;
+import org.apache.lucene.bm.bm25.BM25BooleanQuery.BooleanTermQuery;
+
+/**
+ * Weight BM25 class, implements <I>public Scorer scorer(IndexReader reader)
+ * throws IOException</I> <BR>
+ * and <I>public Explanation explain(IndexReader reader, int doc) throws
+ * IOException </I><BR>
+ * This is a mostly degenerate class, as Query weight is not used in this BM25 implementation.
+ *
+ *
+ *
+ */
+@SuppressWarnings("serial")
+public class BM25BooleanWeight extends Weight {
+ private BooleanTermQuery[] should;
+ private BooleanTermQuery[] must;
+ private BooleanTermQuery[] not;
+ private BooleanTermQuery[] unique = null;
+ private String[] fields = null;
+ private float[] boosts;
+ private float[] bParams;
+ private int howMany = 0;
+
+ public BM25BooleanWeight(BooleanTermQuery[] should,
+ BooleanTermQuery[] must, BooleanTermQuery[] not) {
+ // Empty clause arrays are left as null. 'unique' tracks the last
+ // non-empty array; scorer() uses it when howMany <= 1.
+ boolean hasShould = should.length > 0;
+ boolean hasMust = must.length > 0;
+ boolean hasNot = not.length > 0;
+
+ this.should = hasShould ? should : null;
+ this.must = hasMust ? must : null;
+ this.not = hasNot ? not : null;
+
+ if (hasShould) this.unique = should;
+ if (hasMust) this.unique = must;
+ if (hasNot) this.unique = not;
+
+ this.howMany = (hasShould ? 1 : 0) + (hasMust ? 1 : 0) + (hasNot ? 1 : 0);
+ }
+
+ public BM25BooleanWeight(BooleanTermQuery[] should, BooleanTermQuery[] must,
+ BooleanTermQuery[] not, String fields[], float[] boosts, float[] bParams) {
+ this(should, must, not);
+ // BM25F settings; a non-null 'fields' switches explain() and scorer() to BM25F
+ this.bParams = bParams;
+ this.boosts = boosts;
+ this.fields = fields;
+ }
+
+ /**
+ * Return a placeholder Explanation: it only reports whether BM25 or BM25F
+ * is in use and does not break down the score of the document.
+ *
+ * @see org.apache.lucene.search.Weight#explain(org.apache.lucene.index.IndexReader,
+ * int)
+ */
+ @Override
+ public Explanation explain(IndexReader reader, int doc) throws IOException {
+ // 'fields' is only set by the BM25F constructor
+ String model = (this.fields == null) ? "using BM25" : "using BM25F";
+
+ Explanation explanation = new Explanation();
+ explanation.addDetail(new Explanation(0.0f, model));
+ return explanation;
+ }
+
+ /**
+ * Return null: this Weight keeps no reference to the Query that created it.
+ *
+ * @see org.apache.lucene.search.Weight#getQuery()
+ */
+ @Override
+ public Query getQuery() {
+ return null;
+ }
+
+ /**
+ * Return 0: this implementation does not compute a query weight.
+ *
+ * @see org.apache.lucene.search.Weight#getValue()
+ */
+ @Override
+ public float getValue() {
+ return 0;
+ }
+
+ /**
+ * Does nothing: the norm passed in is ignored.
+ *
+ * @see org.apache.lucene.search.Weight#normalize(float)
+ */
+ @Override
+ public void normalize(float norm) {
+
+ }
+
+ /**
+ * Create the scorer for this weight: a BM25BooleanScorer when more than one
+ * of the should/must/not groups is non-empty, a BM25SingleBooleanScorer
+ * otherwise. The scoreDocsInOrder and topScorer flags are not consulted.
+ *
+ * @see org.apache.lucene.search.Weight#scorer(org.apache.lucene.index.IndexReader, boolean, boolean)
+ */
+ @Override
+ public Scorer scorer(IndexReader reader, boolean scoreDocsInOrder, boolean topScorer) throws IOException {
+ final boolean bm25f = this.fields != null;
+ final BM25Similarity similarity = new BM25Similarity();
+
+ if (howMany > 1) { // BM25BooleanScorer
+ if (bm25f) {
+ return new BM25BooleanScorer(reader, this.should, this.must,
+ this.not, similarity, this.fields, this.boosts, this.bParams);
+ }
+ return new BM25BooleanScorer(reader, this.should, this.must,
+ this.not, similarity);
+ }
+ // BM25SingleBooleanScorer
+ // NOTE(review): when howMany == 0, 'unique' is still null here - confirm the scorer copes
+ if (bm25f) {
+ return new BM25SingleBooleanScorer(reader, this.unique,
+ similarity, this.fields, this.boosts, this.bParams);
+ }
+ return new BM25SingleBooleanScorer(reader, this.unique, similarity);
+ }
+
+ /**
+ * Return 0: this implementation does not compute a query weight.
+ *
+ * @see org.apache.lucene.search.Weight#sumOfSquaredWeights()
+ */
+ @Override
+ public float sumOfSquaredWeights() throws IOException {
+ return 0;
+ }
+
+}
Index: contrib/bm/src/java/org/apache/lucene/bm/bm25f/BM25FParameters.java
===================================================================
--- contrib/bm/src/java/org/apache/lucene/bm/bm25f/BM25FParameters.java (revision 0)
+++ contrib/bm/src/java/org/apache/lucene/bm/bm25f/BM25FParameters.java (revision 0)
@@ -0,0 +1,151 @@
+package org.apache.lucene.bm.bm25f;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.BufferedReader;
+import java.io.FileReader;
+import java.io.IOException;
+import java.util.Iterator;
+import java.util.Map.Entry;
+
+import org.apache.lucene.bm.bm25.BM25Parameters;
+
+/**
+ * Parameters needed to calculate the BM25F relevance score.
+ */
+
+public class BM25FParameters extends BM25Parameters {
+
+ private static float K1 = 2f;
+ private static float[] boost;
+ private static float[] bParam;
+ private static String idfField = null;
+
+ /** Protected: the class only exposes static state and helpers. */
+ protected BM25FParameters() {
+ // nothing to initialize
+ }
+
+ /**
+ * Return the field whose document frequencies are used for the IDF: the
+ * field with the longest average length. The value is recomputed on every
+ * call, so average lengths registered after an earlier call are honoured.
+ *
+ * @return The field with the longest average length, or "" when no average
+ * length has been registered
+ */
+ public static String getIdfField() {
+ float max = -1;
+ String maxField = "";
+ for (Entry<String, Float> entry : BM25FParameters.avgLength.entrySet()) {
+ if (entry.getValue() > max) {
+ max = entry.getValue();
+ maxField = entry.getKey();
+ }
+ }
+
+ // remember the result, as load(String) also does
+ BM25FParameters.idfField = maxField;
+ return maxField;
+ }
+
+ /**
+ * @return the fields boost, i.e. the array last given to setBoost (not a copy)
+ */
+ public static float[] getBoost() {
+ return boost;
+ }
+
+ /**
+ * Set the field boost. The array is stored as given, without copying.
+ *
+ * @param boost float array with boost
+ */
+ public static void setBoost(float[] boost) {
+ BM25FParameters.boost = boost;
+ }
+
+ /**
+ * @return the length normalization parameters, i.e. the array last given to setBParam (not a copy)
+ */
+ public static float[] getBParam() {
+ return bParam;
+ }
+
+ /**
+ * Set the length normalization parameters. The array is stored as given, without copying.
+ *
+ * @param param float array with bParam
+ */
+ public static void setBParam(float[] param) {
+ bParam = param;
+ }
+
+ /**
+ * @return the parameter k1 kept by this class for BM25F, by default is 2
+ */
+ public static float getK1() {
+ return K1;
+ }
+
+ /**
+ * Set the k1 parameter kept by this class for BM25F.
+ *
+ * @param k1 the new value of the k1 parameter
+ */
+ public static void setK1(float k1) {
+ K1 = k1;
+ }
+
+ /**
+ * Load field average lengths from a file and set the idfField value. The
+ * file alternates field-name lines and float-value lines, for example:<BR>
+ * CONTENT<BR>
+ * 459.2903f<BR>
+ * ANCHOR<BR>
+ * 84.55523f<BR>
+ *
+ * @param path absolute path of the file
+ * @throws NumberFormatException if a value line is not a valid float
+ * @throws IOException if the file cannot be read or ends after a field name
+ */
+ public static void load(String path) throws NumberFormatException,
+ IOException {
+ BufferedReader in = new BufferedReader(new FileReader(path));
+ float max = -1;
+ String maxField = "";
+ try { // the reader used to stay open; release it even on failure
+ String field;
+ while (null != (field = in.readLine())) {
+ String value = in.readLine();
+ if (value == null)
+ throw new IOException("Missing average length for field '" + field + "' in " + path);
+ float avg = Float.parseFloat(value);
+ if (avg > max) {
+ max = avg;
+ maxField = field;
+ }
+ BM25FParameters.setAverageLength(field, avg);
+ }
+ } finally {
+ in.close();
+ }
+ BM25FParameters.idfField = maxField;
+ }
+
+}
Index: contrib/bm/src/java/org/apache/lucene/bm/bm25f/BM25FTermScorer.java
===================================================================
--- contrib/bm/src/java/org/apache/lucene/bm/bm25f/BM25FTermScorer.java (revision 0)
+++ contrib/bm/src/java/org/apache/lucene/bm/bm25f/BM25FTermScorer.java (revision 0)
@@ -0,0 +1,172 @@
+package org.apache.lucene.bm.bm25f;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.index.TermDocs;
+import org.apache.lucene.search.Scorer;
+import org.apache.lucene.search.Similarity;
+import org.apache.lucene.search.TermQuery;
+
+/**
+ * Calculate the relevance value of a term applying BM25F function ranking. The
+ * {@link BM25FParameters} k1,b_field, boost_field are used.<BR>
+ *
+ *
+ * @see BM25FParameters
+ */
+public class BM25FTermScorer extends Scorer {
+
+ private TermDocs[] termDocs;
+ private float idf = 0f;
+ private String[] fields;
+ private float[] boosts;
+ private float[] bParam;
+ private boolean[] termDocsNext;
+ private int doc = Integer.MAX_VALUE;
+ private boolean initializated = false;
+ private byte[][] norms;
+ private float[] averageLengths;
+ private float K1;
+ private int len;
+ private float termBoost;
+ private int docFreq;
+ private int numDocs;
+ private String termText;
+
+ /** Opens one TermDocs per field for the term text and caches idf, norms and average lengths. */
+ public BM25FTermScorer(IndexReader reader, TermQuery term, String[] fields, float[] boosts, float[] bParams, Similarity similarity) {
+ super(similarity);
+ this.fields = fields;
+ this.boosts = boosts;
+ this.bParam = bParams;
+ len = fields.length;
+ this.termDocs = new TermDocs[len];
+ this.termDocsNext = new boolean[len];
+ this.norms = new byte[len][];
+ this.averageLengths = new float[len];
+ this.K1 = BM25FParameters.getK1();
+ this.termBoost = term.getBoost();
+ this.numDocs = reader.numDocs();
+ this.termText = term.getTerm().text();
+ try {
+ this.docFreq = reader.docFreq(new Term(BM25FParameters.getIdfField(), termText));
+ for (int i = 0; i < len; i++) {
+ String field = this.fields[i];
+ this.termDocs[i] = reader.termDocs(new Term(field, termText));
+ norms[i] = reader.norms(field);
+ averageLengths[i] = BM25FParameters.getAverageLength(field);
+ }
+ this.idf = this.getSimilarity().idf(docFreq, numDocs);
+ } catch (IOException e) {
+ // a half-built scorer would only fail later with an obscure NullPointerException
+ throw new RuntimeException("Cannot open postings for term '" + termText + "'", e);
+ }
+ }
+
+ /**
+ * Return the current document. The initial value is Integer.MAX_VALUE.
+ *
+ * @see org.apache.lucene.search.Scorer#docID()
+ */
+ @Override
+ public int docID() {
+ return this.doc;
+ }
+
+ private boolean init() throws IOException {
+ // move each field's TermDocs to its first posting; doc becomes the smallest docID
+ boolean found = false;
+ for (int f = 0; f < len; f++) {
+ this.termDocsNext[f] = this.termDocs[f].next();
+ if (!this.termDocsNext[f] || this.termDocs[f].doc() >= this.doc) continue;
+ this.doc = this.termDocs[f].doc();
+ found = true;
+ }
+ return found;
+ }
+
+ /**
+ * Advance to the next document containing the term in at least one field.
+ *
+ * @see org.apache.lucene.search.Scorer#nextDoc()
+ */
+ @Override
+ public int nextDoc() throws IOException {
+ if (!initializated) {
+ // first call: position all fields and report the smallest docID
+ this.initializated = true;
+ return this.init() ? this.doc : NO_MORE_DOCS;
+ }
+
+ // step every field that sits on the current document, then take the
+ // smallest docID among the fields that still have postings
+ int smallest = NO_MORE_DOCS;
+ for (int f = 0; f < len; f++) {
+ if (this.termDocsNext[f] && this.termDocs[f].doc() == this.doc) {
+ this.termDocsNext[f] = this.termDocs[f].next();
+ }
+ if (this.termDocsNext[f]) {
+ smallest = Math.min(smallest, this.termDocs[f].doc());
+ }
+ }
+ this.doc = smallest;
+ return smallest;
+ }
+
+
+ /**
+ * Score the current document: per-field length-normalized, boosted term
+ * frequencies are summed, saturated with K1 and multiplied by the idf.
+ *
+ * @see org.apache.lucene.search.Scorer#score()
+ */
+ @Override
+ public float score() throws IOException {
+ float acum = 0f;
+ for (int i = 0; i < len; i++) {
+ // skip exhausted fields: their TermDocs no longer points at a valid posting
+ if (this.termDocsNext[i] && this.termDocs[i].doc() == doc) {
+ float av_length = this.averageLengths[i];
+ float fieldNorm = this.getSimilarity().decodeNormValue(norms[i][this.docID()]);
+ float length = 1 / (fieldNorm * fieldNorm);
+ // length normalization: b * length / avgLength + (1 - b)
+ float aux = this.bParam[i] * length / av_length;
+ aux += (1 - this.bParam[i]);
+ acum += (this.termBoost * this.boosts[i] * this.termDocs[i].freq()) / aux;
+ }
+ }
+
+ acum /= (this.K1 + acum);
+ acum *= this.idf;
+ return acum;
+ }
+ @Override
+ public int advance(int target) throws IOException {
+ if (target == NO_MORE_DOCS) {
+ this.initializated = true; // with every field flagged exhausted, later nextDoc() calls stay at NO_MORE_DOCS
+ java.util.Arrays.fill(this.termDocsNext, false);
+ return this.doc = NO_MORE_DOCS;
+ }
+ while (this.nextDoc() != NO_MORE_DOCS && this.docID() < target) { }
+ return this.docID();
+ }
+}
Index: contrib/bm/src/java/org/apache/lucene/bm/bool/AbstractBooleanScorer.java
===================================================================
--- contrib/bm/src/java/org/apache/lucene/bm/bool/AbstractBooleanScorer.java (revision 0)
+++ contrib/bm/src/java/org/apache/lucene/bm/bool/AbstractBooleanScorer.java (revision 0)
@@ -0,0 +1,41 @@
+package org.apache.lucene.bm.bool;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.search.Scorer;
+import org.apache.lucene.search.Similarity;
+
+/**
+ * An abstract master class for BM25 Boolean Scorers.
+ * Holds an array of subScorers and another indicating which scorer has a next document.
+ */
+ public abstract class AbstractBooleanScorer extends Scorer {
+
+ protected Scorer[] subScorer;
+ protected boolean[] subScorerNext; // stays null when there are no sub-scorers
+
+ protected AbstractBooleanScorer(Similarity similarity, Scorer scorer[])
+ throws IOException {
+ super(similarity);
+ this.subScorer = scorer;
+ boolean hasSubScorers = scorer != null && scorer.length > 0;
+ if (hasSubScorers) this.subScorerNext = new boolean[scorer.length];
+ }
+ }
Index: contrib/bm/src/java/org/apache/lucene/bm/bool/MatchAllBooleanScorer.java
===================================================================
--- contrib/bm/src/java/org/apache/lucene/bm/bool/MatchAllBooleanScorer.java (revision 0)
+++ contrib/bm/src/java/org/apache/lucene/bm/bool/MatchAllBooleanScorer.java (revision 0)
@@ -0,0 +1,76 @@
+package org.apache.lucene.bm.bool;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.search.Similarity;
+
+/**
+ * Boolean scorer that matches every document id in <code>[0, numDocs)</code>.
+ * It has no sub-scorers and always reports a score of 0.
+ */
+public class MatchAllBooleanScorer extends AbstractBooleanScorer {
+  /** Current document id: -1 before iteration starts, NO_MORE_DOCS once exhausted. */
+  private int doc = -1;
+  /** Exclusive upper bound of the document ids produced by this scorer. */
+  private final int ndocs;
+
+  /** Creates a scorer that iterates over the document ids 0 .. numDocs - 1. */
+  public MatchAllBooleanScorer(Similarity similarity, int numDocs) throws IOException {
+    super(similarity, null);
+    this.ndocs = numDocs;
+  }
+
+  @Override
+  public int docID() {
+    return this.doc;
+  }
+
+  /**
+   * Moves to the following document id, or to NO_MORE_DOCS when none is left. Once
+   * exhausted the id is no longer incremented, so it cannot overflow to negative ids.
+   */
+  @Override
+  public int nextDoc() throws IOException {
+    if (this.doc == NO_MORE_DOCS || this.doc + 1 >= this.ndocs) {
+      this.doc = NO_MORE_DOCS;
+    } else {
+      this.doc++;
+    }
+    return this.doc;
+  }
+
+  @Override
+  public float score() throws IOException {
+    return 0;
+  }
+
+  /**
+   * Jumps to the first id that is after the current one and not smaller than target.
+   * docID() always equals the returned value afterwards, including NO_MORE_DOCS.
+   */
+  @Override
+  public int advance(int target) throws IOException {
+    if (this.doc != NO_MORE_DOCS) {
+      int candidate = Math.max(this.doc + 1, target);
+      this.doc = (candidate >= this.ndocs) ? NO_MORE_DOCS : candidate;
+    }
+    return this.doc;
+  }
+}
\ No newline at end of file
Index: contrib/bm/src/java/org/apache/lucene/bm/bool/MustBooleanScorer.java
===================================================================
--- contrib/bm/src/java/org/apache/lucene/bm/bool/MustBooleanScorer.java (revision 0)
+++ contrib/bm/src/java/org/apache/lucene/bm/bool/MustBooleanScorer.java (revision 0)
@@ -0,0 +1,139 @@
+package org.apache.lucene.bm.bool;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.search.Scorer;
+import org.apache.lucene.search.Similarity;
+
+/**
+ * Boolean scorer that matches the documents containing all terms (AND operator):
+ * a document matches only when every sub-scorer is positioned on it. The score of
+ * a match is the sum of the sub-scorer scores.
+ */
+public class MustBooleanScorer extends AbstractBooleanScorer {
+  /** False until the first nextDoc() call has positioned every sub-scorer. */
+  private boolean initializated = false;
+  /** Current candidate or match; -1 before iteration, NO_MORE_DOCS when exhausted. */
+  private int doc = -1;
+
+  public MustBooleanScorer(Similarity similarity, Scorer[] scorer) throws IOException {
+    super(similarity, scorer);
+  }
+
+  @Override
+  public int docID() {
+    return this.doc;
+  }
+
+  /**
+   * Moves every sub-scorer onto its first document and uses the largest of those
+   * ids as the first candidate, since no smaller id can be a match.
+   */
+  private void init() throws IOException {
+    for (int i = 0; i < this.subScorer.length; i++) {
+      this.subScorerNext[i] = (this.subScorer[i].nextDoc() != NO_MORE_DOCS);
+      if (this.subScorerNext[i] && this.subScorer[i].docID() > this.doc) {
+        this.doc = this.subScorer[i].docID();
+      }
+    }
+  }
+
+  /**
+   * Advances to the next document on which all sub-scorers agree, or to NO_MORE_DOCS
+   * when there is none. A null or empty sub-scorer array never matches.
+   */
+  @Override
+  public int nextDoc() throws IOException {
+    // Guards: the alignment loop below only ends on a full match or an exhausted sub-scorer.
+    if (this.doc == NO_MORE_DOCS || this.subScorer == null || this.subScorer.length == 0) {
+      this.doc = NO_MORE_DOCS;
+      return NO_MORE_DOCS;
+    }
+    if (!this.initializated) {
+      this.init();
+      this.initializated = true;
+    } else {
+      // Step every sub-scorer off the previous match; the largest new position is the candidate.
+      int max = -1;
+      for (int i = 0; i < this.subScorer.length; i++) {
+        if (this.subScorerNext[i] && this.subScorer[i].docID() == this.doc) {
+          this.subScorerNext[i] = (this.subScorer[i].nextDoc() != NO_MORE_DOCS);
+          if (this.subScorerNext[i] && this.subScorer[i].docID() > max) {
+            max = this.subScorer[i].docID();
+          }
+        }
+      }
+      this.doc = max;
+    }
+    while (true) {
+      int count = 0; // sub-scorers found on the candidate in this pass
+      boolean more = true; // cleared when the candidate changes mid-pass
+      for (int i = 0; i < this.subScorer.length && more; i++) {
+        if (this.subScorerNext[i]) {
+          if (this.subScorer[i].docID() == this.doc) {
+            count++;
+          }
+          if (this.subScorer[i].docID() < this.doc) {
+            // Lagging sub-scorer: step it once; if it passes the candidate, restart from its id.
+            this.subScorerNext[i] = (this.subScorer[i].nextDoc() != NO_MORE_DOCS);
+            if (this.subScorerNext[i] && this.subScorer[i].docID() > this.doc) {
+              this.doc = this.subScorer[i].docID();
+              more = false;
+              count = 0;
+            }
+          }
+          if (count == this.subScorer.length) {
+            return this.doc;
+          }
+        } else {
+          // An exhausted sub-scorer means no further document can contain all terms.
+          this.doc = NO_MORE_DOCS;
+          return NO_MORE_DOCS;
+        }
+      }
+    }
+  }
+
+  /** Sums the scores of the sub-scorers positioned on the current document. */
+  @Override
+  public float score() throws IOException {
+    double result = 0f;
+    for (int i = 0; i < this.subScorer.length; i++) {
+      if (this.subScorer[i].docID() == this.doc)
+        result = this.subScorer[i].score() + result;
+    }
+    return (float) result;
+  }
+
+  /**
+   * Steps through matches with nextDoc() until one is not smaller than target.
+   * docID() always equals the returned value, including NO_MORE_DOCS.
+   */
+  @Override
+  public int advance(int target) throws IOException {
+    if (target == NO_MORE_DOCS) {
+      this.doc = NO_MORE_DOCS;
+      return NO_MORE_DOCS;
+    }
+    while (this.nextDoc() != NO_MORE_DOCS && this.docID() < target) {
+    }
+    return this.docID();
+  }
+}
Index: contrib/bm/src/java/org/apache/lucene/bm/bool/ShouldBooleanScorer.java
===================================================================
--- contrib/bm/src/java/org/apache/lucene/bm/bool/ShouldBooleanScorer.java (revision 0)
+++ contrib/bm/src/java/org/apache/lucene/bm/bool/ShouldBooleanScorer.java (revision 0)
@@ -0,0 +1,115 @@
+package org.apache.lucene.bm.bool;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.search.Scorer;
+import org.apache.lucene.search.Similarity;
+
+/**
+ * Boolean scorer that matches the documents containing at least one term (OR
+ * operator). The score of a match is the sum of the scores of the sub-scorers
+ * positioned on it.
+ */
+public class ShouldBooleanScorer extends AbstractBooleanScorer {
+  /** False until the sub-scorers have been positioned on their first document. */
+  private boolean initializated = false;
+  /** Current document id; starts at Integer.MAX_VALUE so init() can take a minimum. */
+  private int doc = Integer.MAX_VALUE;
+
+  public ShouldBooleanScorer(Similarity similarity, Scorer scorer[]) throws IOException {
+    super(similarity, scorer);
+  }
+
+  @Override
+  public int docID() {
+    return this.doc;
+  }
+
+  /**
+   * Moves the sub-scorers positioned on the current document forward and returns the
+   * smallest id any live sub-scorer is then positioned on, or NO_MORE_DOCS when all are exhausted.
+   */
+  @Override
+  public int nextDoc() throws IOException {
+    if (!this.initializated) {
+      this.initializated = true;
+      return this.init() ? this.doc : NO_MORE_DOCS;
+    }
+    if (this.doc == NO_MORE_DOCS) {
+      // Exhausted (possibly via advance(NO_MORE_DOCS)): never rewind to a smaller id.
+      return NO_MORE_DOCS;
+    }
+    int min = NO_MORE_DOCS;
+    // Advance the sub-scorers on the current id; track the smallest id still available.
+    for (int i = 0; i < this.subScorer.length; i++) {
+      if (this.subScorerNext[i] && this.subScorer[i].docID() == this.doc) {
+        this.subScorerNext[i] = (this.subScorer[i].nextDoc() != NO_MORE_DOCS);
+      }
+      if (this.subScorerNext[i] && this.subScorer[i].docID() < min) {
+        min = this.subScorer[i].docID();
+      }
+    }
+    return (this.doc = min);
+  }
+
+  /** Sums the scores of the sub-scorers positioned on the current document. */
+  @Override
+  public float score() throws IOException {
+    double result = 0f;
+    for (int i = 0; i < this.subScorer.length; i++) {
+      if (this.subScorer[i].docID() == this.doc) {
+        result += this.subScorer[i].score();
+      }
+    }
+    return (float) result;
+  }
+
+  /**
+   * Positions every sub-scorer on its first document and selects the smallest id;
+   * returns true when at least one sub-scorer has a document.
+   */
+  private boolean init() throws IOException {
+    boolean result = false;
+    for (int i = 0; i < this.subScorer.length; i++) {
+      this.subScorerNext[i] = (this.subScorer[i].nextDoc() != NO_MORE_DOCS);
+      if (this.subScorerNext[i] && this.subScorer[i].docID() < this.doc) {
+        this.doc = this.subScorer[i].docID();
+        result = true;
+      }
+    }
+    return result;
+  }
+
+  /**
+   * Steps through matches with nextDoc() until one is not smaller than target.
+   * docID() always equals the returned value, including NO_MORE_DOCS.
+   */
+  @Override
+  public int advance(int target) throws IOException {
+    if (target == NO_MORE_DOCS) {
+      this.initializated = true; // nothing left to position the sub-scorers for
+      this.doc = NO_MORE_DOCS;
+      return NO_MORE_DOCS;
+    }
+    while (this.nextDoc() != NO_MORE_DOCS && this.docID() < target) {
+    }
+    return this.docID();
+  }
+}
Index: contrib/bm/src/java/org/apache/lucene/bm/bool/NotBooleanScorer.java
===================================================================
--- contrib/bm/src/java/org/apache/lucene/bm/bool/NotBooleanScorer.java (revision 0)
+++ contrib/bm/src/java/org/apache/lucene/bm/bool/NotBooleanScorer.java (revision 0)
@@ -0,0 +1,104 @@
+package org.apache.lucene.bm.bool;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.search.Scorer;
+import org.apache.lucene.search.Similarity;
+
+/**
+ * Boolean scorer that matches the documents that do NOT contain any of the terms
+ * (NOT operator): it walks the ids 0 .. numDocs - 1 and skips every id that a
+ * sub-scorer is positioned on. Every match gets a score of 1.
+ */
+public class NotBooleanScorer extends AbstractBooleanScorer {
+
+  /** Current document id; -1 before iteration, NO_MORE_DOCS when exhausted. */
+  private int doc = -1;
+  /** Exclusive upper bound of the document ids that are examined. */
+  private final int numDocs;
+
+  /**
+   * Positions every sub-scorer on its first document right away, so that
+   * nextDoc() can compare their positions with the id being examined.
+   */
+  public NotBooleanScorer(Similarity similarity, Scorer[] scorer, int numDocs)
+      throws IOException {
+    super(similarity, scorer);
+    this.numDocs = numDocs;
+    for (int i = 0; i < this.subScorer.length; i++) {
+      this.subScorerNext[i] = (this.subScorer[i].nextDoc() != NO_MORE_DOCS);
+    }
+  }
+
+  /** Returns the current document id. */
+  @Override
+  public int docID() {
+    return this.doc;
+  }
+
+  /**
+   * Advances to the next id below numDocs that no sub-scorer is positioned on.
+   *
+   * @return that id, or NO_MORE_DOCS when the id range is used up
+   */
+  @Override
+  public int nextDoc() throws IOException {
+    while (this.doc < this.numDocs - 1) {
+      this.doc++;
+      boolean excluded = false;
+      for (int i = 0; i < this.subScorer.length; i++) {
+        if (this.subScorerNext[i] && this.subScorer[i].docID() == this.doc) {
+          // This id contains an excluded term. Keep looping instead of breaking so
+          // that every sub-scorer sitting on this id is moved past it.
+          this.subScorerNext[i] = (this.subScorer[i].nextDoc() != NO_MORE_DOCS);
+          excluded = true;
+        }
+      }
+      // As before, an empty sub-scorer array matches nothing.
+      // NOTE(review): confirm that is intended rather than matching every document.
+      if (!excluded && this.subScorer.length > 0) {
+        return this.doc;
+      }
+    }
+    this.doc = NO_MORE_DOCS;
+    return NO_MORE_DOCS;
+  }
+
+  /** Always returns 1. */
+  @Override
+  public float score() throws IOException {
+    return 1;
+  }
+
+  /**
+   * Steps through matches with nextDoc() until one is not smaller than target.
+   * docID() always equals the returned value, including NO_MORE_DOCS.
+   */
+  @Override
+  public int advance(int target) throws IOException {
+    if (target == NO_MORE_DOCS) {
+      this.doc = NO_MORE_DOCS;
+      return NO_MORE_DOCS;
+    }
+    while (this.nextDoc() != NO_MORE_DOCS && this.docID() < target) {
+    }
+    return this.docID();
+  }
+}
Index: contrib/bm/src/java/org/apache/lucene/bm/package.html
===================================================================
--- contrib/bm/src/java/org/apache/lucene/bm/package.html (revision 0)
+++ contrib/bm/src/java/org/apache/lucene/bm/package.html (revision 0)
@@ -0,0 +1,409 @@
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN">
+
+<!--Converted with LaTeX2HTML 2002-2-1 (1.71)
+original version by: Nikos Drakos, CBLU, University of Leeds
+* revised and updated by: Marcus Hennecke, Ross Moore, Herb Swan
+* with significant contributions from:
+ Jens Lippmann, Marek Rouchal, Martin Wilck and others -->
+<HTML>
+<HEAD>
+<TITLE>Integrating BM25 &amp; BM25F into Lucene</TITLE>
+<META NAME="description" CONTENT="Integrating BM25 &amp; BM25F into Lucene">
+<META NAME="keywords" CONTENT="main">
+<META NAME="resource-type" CONTENT="document">
+<META NAME="distribution" CONTENT="global">
+
+<META NAME="Generator" CONTENT="LaTeX2HTML v2002-2-1">
+<META HTTP-EQUIV="Content-Style-Type" CONTENT="text/css">
+
+<LINK REL="STYLESHEET" HREF="main.css">
+
+</HEAD>
+
+<BODY >
+
+<H1 ALIGN=CENTER>Integrating BM25 &amp; BM25F into Lucene<!--<A NAME="tex2html1"
+ HREF="#foot6"><SUP><SPAN CLASS="arabic">1</SPAN></SUP></A>--></H1>
+<P ALIGN=CENTER><STRONG>Joaqu&#237;n P&#233;rez-Iglesias</STRONG>
+</P>
+<HR>
+
+<H1><A NAME="SECTION00010000000000000000">
+Introduction</A>
+</H1>
+This document describes the BM25 and BM25F implementation using the Lucene Java Framework. The implementation described here can be downloaded from
+<TT><A NAME="tex2html2"
+ HREF="http://nlp.uned.es/~jperezi/Lucene-BM25/jar/models.jar">http://nlp.uned.es/~jperezi/Lucene-BM25/jar/models.jar</A></TT>. Both models have stood out at TREC by their performance and are considered as state-of-the-art in the IR community.
+BM25 is applied to `ad-hoc' retrieval, that is, to documents that do not contain fields; BM25F, on the other hand, is applied to documents with structure.
+
+<P>
+
+<H1><A NAME="SECTION00020000000000000000">
+BM25 &amp; BM25F</A>
+</H1>
+The developed models are based on the information that can be found at <TT><A NAME="tex2html3"
+ HREF="http://www.zaragozas.info/hugo/academic/pdf/tutorial_sigir07_2d.pdf">http://www.zaragozas.info/hugo/academic/pdf/tutorial_sigir07_2d.pdf</A></TT>.
+More specifically, the implemented ranking functions are as follows:
+
+<P>
+
+<H2><A NAME="SECTION00021000000000000000">
+BM25</A>
+</H2>
+<P></P>
+<DIV ALIGN="CENTER" CLASS="mathdisplay"><!-- MATH
+ \begin{equation*}
+\textit{R(q,d)} = \sum_{\textit{t in q}} \frac{occurs_t^d}{k_1 ((1-b)+b \cdot \frac{l_d}{avl_d})+{occurs_t^d}} \cdot idf(t)
+\end{equation*}
+ -->
+<TABLE CLASS="equation*" CELLPADDING="0" WIDTH="100%" ALIGN="CENTER">
+<TR VALIGN="MIDDLE">
+<TD NOWRAP ALIGN="CENTER"><SPAN CLASS="MATH"><IMG
+ WIDTH="429" HEIGHT="65" ALIGN="MIDDLE" BORDER="0"
+ SRC="img1.png"
+ ALT="$\displaystyle \textit{R(q,d)} = \sum_{\textit{t in q}} \frac{occurs_t^d}{k_1 ((1-b)+b \cdot \frac{l_d}{avl_d})+{occurs_t^d}} \cdot idf(t)$"></SPAN></TD>
+<TD NOWRAP CLASS="eqno" WIDTH="10" ALIGN="RIGHT">
+&nbsp;&nbsp;&nbsp;</TD></TR>
+</TABLE></DIV>
+<BR CLEAR="ALL"><P></P>
+
+<P>
+Where <!-- MATH
+ $occurs_t^d$
+ -->
+<SPAN CLASS="MATH"><IMG
+ WIDTH="66" HEIGHT="39" ALIGN="MIDDLE" BORDER="0"
+ SRC="img2.png"
+ ALT="$ occurs_t^d$"></SPAN> is the term frequency of `t' in `d'; <SPAN CLASS="MATH"><IMG
+ WIDTH="18" HEIGHT="34" ALIGN="MIDDLE" BORDER="0"
+ SRC="img3.png"
+ ALT="$ l_d$"></SPAN> is the document `d' length; <SPAN CLASS="MATH"><IMG
+ WIDTH="38" HEIGHT="34" ALIGN="MIDDLE" BORDER="0"
+ SRC="img4.png"
+ ALT="$ avl_{d}$"></SPAN> is the document average length along the collection; <SPAN CLASS="MATH"><IMG
+ WIDTH="22" HEIGHT="34" ALIGN="MIDDLE" BORDER="0"
+ SRC="img5.png"
+ ALT="$ k_1$"></SPAN> is a free parameter usually chosen as 2 and <SPAN CLASS="MATH"><IMG
+ WIDTH="13" HEIGHT="17" ALIGN="BOTTOM" BORDER="0"
+ SRC="img6.png"
+ ALT="$ b$"></SPAN> is another free parameter that is between 0 and 1 (usually is 0.75).
+
+<BR><P></P>
+<DIV ALIGN="CENTER" CLASS="mathdisplay"><!-- MATH
+ \begin{equation*}
+idf(t)= \log{\frac{N-df(t)+0.5}{df(t)+0.5}}
+\end{equation*}
+ -->
+<TABLE CLASS="equation*" CELLPADDING="0" WIDTH="100%" ALIGN="CENTER">
+<TR VALIGN="MIDDLE">
+<TD NOWRAP ALIGN="CENTER"><SPAN CLASS="MATH"><IMG
+ WIDTH="233" HEIGHT="62" ALIGN="MIDDLE" BORDER="0"
+ SRC="img7.png"
+ ALT="$\displaystyle idf(t)= \log{\frac{N-df(t)+0.5}{df(t)+0.5}}$"></SPAN></TD>
+<TD NOWRAP CLASS="eqno" WIDTH="10" ALIGN="RIGHT">
+&nbsp;&nbsp;&nbsp;</TD></TR>
+</TABLE></DIV>
+<BR CLEAR="ALL"><P></P>
+
+<P>
+Where <SPAN CLASS="MATH"><IMG
+ WIDTH="22" HEIGHT="17" ALIGN="BOTTOM" BORDER="0"
+ SRC="img8.png"
+ ALT="$ N$"></SPAN> is the number of documents in the collection and <SPAN CLASS="MATH"><IMG
+ WIDTH="23" HEIGHT="34" ALIGN="MIDDLE" BORDER="0"
+ SRC="img9.png"
+ ALT="$ df$"></SPAN> is the number of documents in which the term `t' appears.
+
+<P>
+A different version of this formula, as can be found at <TT><A NAME="tex2html4"
+ HREF="http://en.wikipedia.org/wiki/Probabilistic_relevance_model_(BM25)">http://en.wikipedia.org/wiki/Probabilistic_relevance_model_(BM25)</A></TT>, multiplies the obtained bm25 weight by the constant <SPAN CLASS="MATH"><IMG
+ WIDTH="69" HEIGHT="37" ALIGN="MIDDLE" BORDER="0"
+ SRC="img10.png"
+ ALT="$ (k_1 + 1)$"></SPAN> in order to normalize the weight of terms with a frequency equal to 1 that occur in documents of average length.
+
+<P>
+
+<H2><A NAME="SECTION00022000000000000000">
+BM25F</A>
+</H2>
+First we obtain the accumulated weight of a term over all fields as follows:
+<P></P>
+<DIV ALIGN="CENTER" CLASS="mathdisplay"><!-- MATH
+ \begin{equation*}
+\textit{weight(t,d)} = \sum_{\textit{c in d}} \frac{occurs_{t,c}^d \cdot boost_c}{((1-b_c)+b_c \cdot \frac{l_c}{avl_c})}
+\end{equation*}
+ -->
+<TABLE CLASS="equation*" CELLPADDING="0" WIDTH="100%" ALIGN="CENTER">
+<TR VALIGN="MIDDLE">
+<TD NOWRAP ALIGN="CENTER"><SPAN CLASS="MATH"><IMG
+ WIDTH="316" HEIGHT="69" ALIGN="MIDDLE" BORDER="0"
+ SRC="img11.png"
+ ALT="$\displaystyle \textit{weight(t,d)} = \sum_{\textit{c in d}} \frac{occurs_{t,c}^d \cdot boost_c}{((1-b_c)+b_c \cdot \frac{l_c}{avl_c})}$"></SPAN></TD>
+<TD NOWRAP CLASS="eqno" WIDTH="10" ALIGN="RIGHT">
+&nbsp;&nbsp;&nbsp;</TD></TR>
+</TABLE></DIV>
+<BR CLEAR="ALL"><P></P>
+
+<P>
+Where <SPAN CLASS="MATH"><IMG
+ WIDTH="17" HEIGHT="34" ALIGN="MIDDLE" BORDER="0"
+ SRC="img12.png"
+ ALT="$ l_c$"></SPAN> is the field length; <SPAN CLASS="MATH"><IMG
+ WIDTH="36" HEIGHT="34" ALIGN="MIDDLE" BORDER="0"
+ SRC="img13.png"
+ ALT="$ avl_{c}$"></SPAN> is the average length for the field <SPAN CLASS="MATH"><IMG
+ WIDTH="13" HEIGHT="17" ALIGN="BOTTOM" BORDER="0"
+ SRC="img14.png"
+ ALT="$ c$"></SPAN>; <SPAN CLASS="MATH"><IMG
+ WIDTH="19" HEIGHT="34" ALIGN="MIDDLE" BORDER="0"
+ SRC="img15.png"
+ ALT="$ b_{c}$"></SPAN> is a constant related to the field length, similar to <SPAN CLASS="MATH"><IMG
+ WIDTH="13" HEIGHT="17" ALIGN="BOTTOM" BORDER="0"
+ SRC="img6.png"
+ ALT="$ b$"></SPAN> in BM25 and <SPAN CLASS="MATH"><IMG
+ WIDTH="53" HEIGHT="34" ALIGN="MIDDLE" BORDER="0"
+ SRC="img16.png"
+ ALT="$ boost_c$"></SPAN> is the boost factor applied to field `c'.
+
+<BR>
+Next, a non-linear saturation is applied <!-- MATH
+ $\frac{weight}{k_1 + weight}$
+ -->
+<SPAN CLASS="MATH"><IMG
+ WIDTH="73" HEIGHT="42" ALIGN="MIDDLE" BORDER="0"
+ SRC="img17.png"
+ ALT="$ \frac{weight}{k_1 + weight}$"></SPAN>:
+<P></P>
+<DIV ALIGN="CENTER" CLASS="mathdisplay"><!-- MATH
+ \begin{equation*}
+\textit{R(q,d)} = \sum_{\textit{t in q}} idf(t) \cdot \frac{weight(t,d)}{k_1 + weight(t,d)}
+\end{equation*}
+ -->
+<TABLE CLASS="equation*" CELLPADDING="0" WIDTH="100%" ALIGN="CENTER">
+<TR VALIGN="MIDDLE">
+<TD NOWRAP ALIGN="CENTER"><SPAN CLASS="MATH"><IMG
+ WIDTH="317" HEIGHT="62" ALIGN="MIDDLE" BORDER="0"
+ SRC="img18.png"
+ ALT="$\displaystyle \textit{R(q,d)} = \sum_{\textit{t in q}} idf(t) \cdot \frac{weight(t,d)}{k_1 + weight(t,d)}$"></SPAN></TD>
+<TD NOWRAP CLASS="eqno" WIDTH="10" ALIGN="RIGHT">
+&nbsp;&nbsp;&nbsp;</TD></TR>
+</TABLE></DIV>
+<BR CLEAR="ALL"><P></P>
+
+<P>
+<P></P>
+<DIV ALIGN="CENTER" CLASS="mathdisplay"><!-- MATH
+ \begin{equation*}
+idf(t)= \log{\frac{N-df(t)+0.5}{df(t)+0.5}}
+\end{equation*}
+ -->
+<TABLE CLASS="equation*" CELLPADDING="0" WIDTH="100%" ALIGN="CENTER">
+<TR VALIGN="MIDDLE">
+<TD NOWRAP ALIGN="CENTER"><SPAN CLASS="MATH"><IMG
+ WIDTH="233" HEIGHT="62" ALIGN="MIDDLE" BORDER="0"
+ SRC="img7.png"
+ ALT="$\displaystyle idf(t)= \log{\frac{N-df(t)+0.5}{df(t)+0.5}}$"></SPAN></TD>
+<TD NOWRAP CLASS="eqno" WIDTH="10" ALIGN="RIGHT">
+&nbsp;&nbsp;&nbsp;</TD></TR>
+</TABLE></DIV>
+<BR CLEAR="ALL"><P></P>
+
+<P>
+Where <SPAN CLASS="MATH"><IMG
+ WIDTH="22" HEIGHT="17" ALIGN="BOTTOM" BORDER="0"
+ SRC="img8.png"
+ ALT="$ N$"></SPAN> is the number of documents in the collection and <SPAN CLASS="MATH"><IMG
+ WIDTH="23" HEIGHT="34" ALIGN="MIDDLE" BORDER="0"
+ SRC="img9.png"
+ ALT="$ df$"></SPAN> is the number of documents in which the term `t' appears.
+
+<P>
+
+<H1><A NAME="SECTION00030000000000000000">
+Implementation</A>
+</H1>
+The main purpose of this implementation was to integrate the new ranking models into Lucene's search functionality. In order to accomplish this objective, a new Query, a new Weight, and several Scorers were developed. The main functionality is implemented at the Scorer level, since the main responsibilities of Query and Weight are to prepare the necessary parameters for the Scorers and to create Scorer instances when the search method is invoked. More information on the Query-Weight-Scorer model can be found at <TT><A NAME="tex2html5"
+ HREF="http://lucene.apache.org/java/2_4_0/scoring.html">http://lucene.apache.org/java/2_4_0/scoring.html</A></TT>.
+
+<P>
+
+<H3><A NAME="SECTION00030100000000000000">
+Query</A>
+</H3>
+The execution of a query can be split into two parts: boolean filtering and ranking evaluation. The boolean filtering is carried out by the Scorers ShouldBooleanScorer, MustBooleanScorer and NotBooleanScorer, depending on the logic operators applied, while the ranking functions are implemented in the <SPAN CLASS="textit">score</SPAN> method of BM25TermScorer and BM25FTermScorer.
+
+<P>
+BM25BooleanScorer will create BM25TermScorer or BM25FTermScorer instances depending on the invoked constructor, as follows:
+
+<UL>
+<LI><SPAN CLASS="textit">public BM25BooleanQuery(String query, String field, Analyzer analyzer) throws ParseException, IOException</SPAN>, uses BM25 ranking function.
+</LI>
+<LI><SPAN CLASS="textit">public BM25BooleanQuery(String query, String[] fields, Analyzer analyzer) throws ParseException,IOException</SPAN>, uses BM25F ranking function.
+</LI>
+</UL>
+
+<P>
+BM25BooleanScorer will ignore any field information handled by the Lucene QueryParser, so the search will be carried out only over the field(s) passed as parameters to the constructor. Besides, only boolean queries are supported; any other query type will be split into terms and executed as a boolean query.
+
+<P>
+It should be noted that both ranking functions do not use query weights, therefore all computation can be done at scorer level.
+
+<P>
+
+<H3><A NAME="SECTION00030200000000000000">
+Scoring</A>
+</H3>
+
+<P>
+
+<UL>
+<LI>Almost all the information necessary to compute BM25 relevance can be obtained through the Lucene expert API (<SPAN CLASS="textit">termdocs</SPAN>, <SPAN CLASS="textit">numdocs</SPAN>, <SPAN CLASS="textit">docfreq</SPAN>,...), apart from the average document length, which cannot be obtained directly from the supplied API. This value can be obtained at index time by implementing a specific Similarity that counts and stores the length of the documents, as follows:
+
+<P>
+<PRE>
+public class CollectionSimilarityIndexer extends DefaultSimilarity{
+
+ private static Map&lt;String,Long&gt; length = new HashMap&lt;String, Long&gt;();
+
+ @Override
+ public float lengthNorm(String fieldName, int numTokens) {
+ Long aux = CollectionSimilarityIndexer.length.get(fieldName);
+ if (aux==null)
+ aux = new Long(0);
+ aux+=numTokens;
+ CollectionSimilarityIndexer.length.put(fieldName,aux);
+ return super.lengthNorm(fieldName, numTokens);
+ }
+
+ public static long getLength(String field){
+ return CollectionSimilarityIndexer.length.get(field);
+ }
+}
+</PRE>
+
+<P>
+After the indexing process we can retrieve the total length of a specific field, divide it by the collection's <SPAN CLASS="textit">numdocs</SPAN>, and save the computed value to a file, from which it can be read when a Searcher is opened. In the provided implementation a method <SPAN CLASS="textit">load(String filePath)</SPAN> is supplied in BM25Parameters in order to load the average lengths; more details can be found in the javadoc documentation at <TT><A NAME="tex2html6"
+ HREF="http://nlp.uned.es/~jperezi/Lucene-BM25/javadoc">http://nlp.uned.es/~jperezi/Lucene-BM25/javadoc</A></TT>.
+
+<P>
+</LI>
+<LI>The specific BM25 parameters can be found in the BM25Parameters class, where by default they are set to <SPAN CLASS="MATH"><IMG
+ WIDTH="57" HEIGHT="34" ALIGN="MIDDLE" BORDER="0"
+ SRC="img19.png"
+ ALT="$ k_1=2$"></SPAN> and <SPAN CLASS="MATH"><IMG
+ WIDTH="71" HEIGHT="17" ALIGN="BOTTOM" BORDER="0"
+ SRC="img20.png"
+ ALT="$ b=0.75$"></SPAN>.
+The BM25F case is more complex, since it needs more specific parameters, mainly an array of string that includes the fields where the term should be searched. All the parameters can be found at BM25FParameters, the same <SPAN CLASS="MATH"><IMG
+ WIDTH="22" HEIGHT="34" ALIGN="MIDDLE" BORDER="0"
+ SRC="img5.png"
+ ALT="$ k_1$"></SPAN> is applied. Related to <SPAN CLASS="MATH"><IMG
+ WIDTH="13" HEIGHT="17" ALIGN="BOTTOM" BORDER="0"
+ SRC="img6.png"
+ ALT="$ b$"></SPAN> is set to 0.75 for each field, but is recommended to use better parameters that can be set when the Query is built as a float array. Something similar happens with boost, these have been initialised with a value of 1, but it may be supplied with a float array. All BM25F based arrays parameters as <!-- MATH
+ $boost_{field}$
+ -->
+<SPAN CLASS="MATH"><IMG
+ WIDTH="77" HEIGHT="34" ALIGN="MIDDLE" BORDER="0"
+ SRC="img21.png"
+ ALT="$ boost_{field}$"></SPAN> and <SPAN CLASS="MATH"><IMG
+ WIDTH="43" HEIGHT="34" ALIGN="MIDDLE" BORDER="0"
+ SRC="img22.png"
+ ALT="$ b_{field}$"></SPAN> must be supplied ordered, that means that for field i into the array of fields, the boost and the b parameter for that field will be at i position in both arrays.
+
+<P>
+</LI>
+<LI>In both models IDF is computed in BM25Similarity and must be calculated at document level with <SPAN CLASS="textit">docFreq</SPAN> and <SPAN CLASS="textit">numdocs</SPAN>. Lucene returns <SPAN CLASS="textit">docFreq</SPAN> at field level, that is fine for BM25, because the search is accomplished just through a unique field. For the BM25F case this is a serious problem, because IDF cannot be computed at document level, unless a new field that contains all terms is indexed. The supplied implementation computes <SPAN CLASS="textit">docFreq</SPAN> in the field with the longest average length.
+
+<P>
+</LI>
+</UL>
+
+<P>
+
+<H1><A NAME="SECTION00040000000000000000">
+How to use it</A>
+</H1>
+The supplied implementation can be used in a similar way as searches are carried out with Lucene, except that <SPAN CLASS="textbf">BM25Parameters or BM25FParameters must be set before the query is executed; this has to be done in order to set the average lengths</SPAN>. Other parameters can be omitted since they are set to default values.
+
+<P>
+A couple examples of use can be seen below:
+
+<P>
+
+<H3><A NAME="SECTION00040100000000000000">
+BM25</A>
+</H3>
+<PRE>
+ IndexSearcher searcher = new IndexSearcher("Index//Path");
+
+ //Load average length
+ BM25Parameters.load(avgLengthPath);
+ BM25BooleanQuery query = new BM25BooleanQuery("This is my Query", "Search-Field",
+ AnalyzerUtil.getPorterStemmerAnalyzer(new StandardAnalyzer()));
+
+ //Retrieving normalized scorer values!!!
+ Hits hits = searcher.search(query);
+
+ //Print results
+ for (int i = 0; i &lt; 10; i++)
+ System.out.println(hits.id(i) + ":"+hits.score(i));
+</PRE>
+
+<P>
+
+<H3><A NAME="SECTION00040200000000000000">
+BM25F</A>
+</H3>
+<PRE>
+ String[] fields ={"FIELD1","FIELD2"};
+ IndexSearcher searcher = new IndexSearcher("Index//Path");
+
+ //Set explicit average Length for each field
+ BM25FParameters.setAverageLength("FIELD1", 123.5f);
+ BM25FParameters.setAverageLength("FIELD2", 42.2f);
+
+ //Set explicit k1 parameter
+ BM25FParameters.setK1(1.2f);
+
+ //Using boost and b defaults parameters
+ BM25BooleanQuery queryF = new BM25BooleanQuery("This is my query",
+ fields, AnalyzerUtil.getPorterStemmerAnalyzer(new StandardAnalyzer()));
+
+ //Retrieving NOT normalized scorer values
+ TopDocs top = searcher.search(queryF, null, 10);
+ ScoreDoc[] docs = top.scoreDocs;
+
+ //Print results
+ for (int i = 0; i &lt; top.scoreDocs.length; i++) {
+ System.out.println(docs[i].doc + ":"+docs[i].score);
+ }
+</PRE>
+
+<P>
+
+<H1><A NAME="SECTION00050000000000000000">
+About this document ...</A>
+</H1>
+ Please send any comments to <a href='http://nlp.uned.es/~jperezi/'>Joaquin Perez-Iglesias</a>, LSI,UNED. Last update: 18-11-09 12:34
+<BR><HR><H4>Footnotes</H4>
+<DL>
+<DT><A NAME="foot6">... Lucene</A><!--<A
+ HREF="main.html#tex2html1"><SUP><SPAN CLASS="arabic">1</SPAN></SUP></A>--></DT>
+<DD>Thanks to Jose and Hugo for their contribution and comments.
+
+</DD>
+</DL>
+<BR><HR>
+
+<script type="text/javascript">
+var gaJsHost = (("https:" == document.location.protocol) ? "https://ssl." : "http://www.");
+document.write(unescape("%3Cscript src='" + gaJsHost + "google-analytics.com/ga.js' type='text/javascript'%3E%3C/script%3E"));
+</script>
+<script type="text/javascript">
+try {
+var pageTracker = _gat._getTracker("UA-7283180-1");
+pageTracker._trackPageview();
+} catch(err) {}</script>
+</BODY>
+</HTML>