| package org.apache.lucene.search; |
| |
| /** |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| import java.io.IOException; |
| import java.io.Reader; |
| import java.io.IOException; |
| import java.io.StringReader; |
| import java.util.Collection; |
| import java.util.Collections; |
| import org.apache.lucene.analysis.Analyzer; |
| import org.apache.lucene.analysis.StopFilter; |
| import org.apache.lucene.analysis.TokenStream; |
| import org.apache.lucene.analysis.WhitespaceAnalyzer; |
| import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; |
| import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; |
| import org.apache.lucene.analysis.tokenattributes.PayloadAttribute; |
| import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; |
| import org.apache.lucene.analysis.CharArraySet; |
| import org.apache.lucene.document.Document; |
| import org.apache.lucene.document.Field; |
| import org.apache.lucene.index.IndexReader; |
| import org.apache.lucene.index.RandomIndexWriter; |
| import org.apache.lucene.index.Term; |
| import org.apache.lucene.index.TermPositions; |
| import org.apache.lucene.queryParser.QueryParser; |
| import org.apache.lucene.store.Directory; |
| import org.apache.lucene.analysis.LowerCaseTokenizer; |
| import org.apache.lucene.analysis.TokenFilter; |
| import org.apache.lucene.index.Payload; |
| import org.apache.lucene.search.payloads.PayloadSpanUtil; |
| import org.apache.lucene.search.spans.SpanNearQuery; |
| import org.apache.lucene.search.spans.SpanQuery; |
| import org.apache.lucene.search.spans.SpanTermQuery; |
| import org.apache.lucene.search.spans.Spans; |
| import org.apache.lucene.util.Version; |
| import org.apache.lucene.util.LuceneTestCase; |
| |
| /** |
| * Term position unit test. |
| * |
| * |
| * @version $Revision$ |
| */ |
| public class TestPositionIncrement extends LuceneTestCase { |
| |
| public void testSetPosition() throws Exception { |
| Analyzer analyzer = new Analyzer() { |
| @Override |
| public TokenStream tokenStream(String fieldName, Reader reader) { |
| return new TokenStream() { |
| private final String[] TOKENS = {"1", "2", "3", "4", "5"}; |
| private final int[] INCREMENTS = {0, 2, 1, 0, 1}; |
| private int i = 0; |
| |
| PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class); |
| CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); |
| OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class); |
| |
| @Override |
| public boolean incrementToken() { |
| if (i == TOKENS.length) |
| return false; |
| clearAttributes(); |
| termAtt.append(TOKENS[i]); |
| offsetAtt.setOffset(i,i); |
| posIncrAtt.setPositionIncrement(INCREMENTS[i]); |
| i++; |
| return true; |
| } |
| |
| @Override |
| public void reset() throws IOException { |
| super.reset(); |
| this.i = 0; |
| } |
| }; |
| } |
| }; |
| Directory store = newDirectory(); |
| RandomIndexWriter writer = new RandomIndexWriter(random, store, analyzer); |
| Document d = new Document(); |
| d.add(newField("field", "bogus", Field.Store.YES, Field.Index.ANALYZED)); |
| writer.addDocument(d); |
| IndexReader reader = writer.getReader(); |
| writer.close(); |
| |
| |
| IndexSearcher searcher = newSearcher(reader); |
| |
| TermPositions pos = searcher.getIndexReader().termPositions(new Term("field", "1")); |
| pos.next(); |
| // first token should be at position 0 |
| assertEquals(0, pos.nextPosition()); |
| |
| pos = searcher.getIndexReader().termPositions(new Term("field", "2")); |
| pos.next(); |
| // second token should be at position 2 |
| assertEquals(2, pos.nextPosition()); |
| |
| PhraseQuery q; |
| ScoreDoc[] hits; |
| |
| q = new PhraseQuery(); |
| q.add(new Term("field", "1")); |
| q.add(new Term("field", "2")); |
| hits = searcher.search(q, null, 1000).scoreDocs; |
| assertEquals(0, hits.length); |
| |
| // same as previous, just specify positions explicitely. |
| q = new PhraseQuery(); |
| q.add(new Term("field", "1"),0); |
| q.add(new Term("field", "2"),1); |
| hits = searcher.search(q, null, 1000).scoreDocs; |
| assertEquals(0, hits.length); |
| |
| // specifying correct positions should find the phrase. |
| q = new PhraseQuery(); |
| q.add(new Term("field", "1"),0); |
| q.add(new Term("field", "2"),2); |
| hits = searcher.search(q, null, 1000).scoreDocs; |
| assertEquals(1, hits.length); |
| |
| q = new PhraseQuery(); |
| q.add(new Term("field", "2")); |
| q.add(new Term("field", "3")); |
| hits = searcher.search(q, null, 1000).scoreDocs; |
| assertEquals(1, hits.length); |
| |
| q = new PhraseQuery(); |
| q.add(new Term("field", "3")); |
| q.add(new Term("field", "4")); |
| hits = searcher.search(q, null, 1000).scoreDocs; |
| assertEquals(0, hits.length); |
| |
| // phrase query would find it when correct positions are specified. |
| q = new PhraseQuery(); |
| q.add(new Term("field", "3"),0); |
| q.add(new Term("field", "4"),0); |
| hits = searcher.search(q, null, 1000).scoreDocs; |
| assertEquals(1, hits.length); |
| |
| // phrase query should fail for non existing searched term |
| // even if there exist another searched terms in the same searched position. |
| q = new PhraseQuery(); |
| q.add(new Term("field", "3"),0); |
| q.add(new Term("field", "9"),0); |
| hits = searcher.search(q, null, 1000).scoreDocs; |
| assertEquals(0, hits.length); |
| |
| // multi-phrase query should succed for non existing searched term |
| // because there exist another searched terms in the same searched position. |
| MultiPhraseQuery mq = new MultiPhraseQuery(); |
| mq.add(new Term[]{new Term("field", "3"),new Term("field", "9")},0); |
| hits = searcher.search(mq, null, 1000).scoreDocs; |
| assertEquals(1, hits.length); |
| |
| q = new PhraseQuery(); |
| q.add(new Term("field", "2")); |
| q.add(new Term("field", "4")); |
| hits = searcher.search(q, null, 1000).scoreDocs; |
| assertEquals(1, hits.length); |
| |
| q = new PhraseQuery(); |
| q.add(new Term("field", "3")); |
| q.add(new Term("field", "5")); |
| hits = searcher.search(q, null, 1000).scoreDocs; |
| assertEquals(1, hits.length); |
| |
| q = new PhraseQuery(); |
| q.add(new Term("field", "4")); |
| q.add(new Term("field", "5")); |
| hits = searcher.search(q, null, 1000).scoreDocs; |
| assertEquals(1, hits.length); |
| |
| q = new PhraseQuery(); |
| q.add(new Term("field", "2")); |
| q.add(new Term("field", "5")); |
| hits = searcher.search(q, null, 1000).scoreDocs; |
| assertEquals(0, hits.length); |
| |
| // should not find "1 2" because there is a gap of 1 in the index |
| QueryParser qp = new QueryParser(TEST_VERSION_CURRENT, "field", |
| new StopWhitespaceAnalyzer(false)); |
| q = (PhraseQuery) qp.parse("\"1 2\""); |
| hits = searcher.search(q, null, 1000).scoreDocs; |
| assertEquals(0, hits.length); |
| |
| // omitted stop word cannot help because stop filter swallows the increments. |
| q = (PhraseQuery) qp.parse("\"1 stop 2\""); |
| hits = searcher.search(q, null, 1000).scoreDocs; |
| assertEquals(0, hits.length); |
| |
| // query parser alone won't help, because stop filter swallows the increments. |
| qp.setEnablePositionIncrements(true); |
| q = (PhraseQuery) qp.parse("\"1 stop 2\""); |
| hits = searcher.search(q, null, 1000).scoreDocs; |
| assertEquals(0, hits.length); |
| |
| // stop filter alone won't help, because query parser swallows the increments. |
| qp.setEnablePositionIncrements(false); |
| q = (PhraseQuery) qp.parse("\"1 stop 2\""); |
| hits = searcher.search(q, null, 1000).scoreDocs; |
| assertEquals(0, hits.length); |
| |
| // when both qp qnd stopFilter propagate increments, we should find the doc. |
| qp = new QueryParser(TEST_VERSION_CURRENT, "field", |
| new StopWhitespaceAnalyzer(true)); |
| qp.setEnablePositionIncrements(true); |
| q = (PhraseQuery) qp.parse("\"1 stop 2\""); |
| hits = searcher.search(q, null, 1000).scoreDocs; |
| assertEquals(1, hits.length); |
| |
| searcher.close(); |
| reader.close(); |
| store.close(); |
| } |
| |
| private static class StopWhitespaceAnalyzer extends Analyzer { |
| boolean enablePositionIncrements; |
| final WhitespaceAnalyzer a = new WhitespaceAnalyzer(TEST_VERSION_CURRENT); |
| public StopWhitespaceAnalyzer(boolean enablePositionIncrements) { |
| this.enablePositionIncrements = enablePositionIncrements; |
| } |
| @Override |
| public TokenStream tokenStream(String fieldName, Reader reader) { |
| TokenStream ts = a.tokenStream(fieldName,reader); |
| return new StopFilter(enablePositionIncrements?TEST_VERSION_CURRENT:Version.LUCENE_24, ts, |
| new CharArraySet(TEST_VERSION_CURRENT, Collections.singleton("stop"), true)); |
| } |
| } |
| |
| public void testPayloadsPos0() throws Exception { |
| Directory dir = newDirectory(); |
| RandomIndexWriter writer = new RandomIndexWriter(random, dir, new TestPayloadAnalyzer()); |
| Document doc = new Document(); |
| doc.add(new Field("content", |
| new StringReader("a a b c d e a f g h i j a b k k"))); |
| writer.addDocument(doc); |
| |
| IndexReader r = writer.getReader(); |
| |
| TermPositions tp = r.termPositions(new Term("content", "a")); |
| int count = 0; |
| assertTrue(tp.next()); |
| // "a" occurs 4 times |
| assertEquals(4, tp.freq()); |
| int expected = 0; |
| assertEquals(expected, tp.nextPosition()); |
| assertEquals(1, tp.nextPosition()); |
| assertEquals(3, tp.nextPosition()); |
| assertEquals(6, tp.nextPosition()); |
| |
| // only one doc has "a" |
| assertFalse(tp.next()); |
| |
| IndexSearcher is = newSearcher(r); |
| |
| SpanTermQuery stq1 = new SpanTermQuery(new Term("content", "a")); |
| SpanTermQuery stq2 = new SpanTermQuery(new Term("content", "k")); |
| SpanQuery[] sqs = { stq1, stq2 }; |
| SpanNearQuery snq = new SpanNearQuery(sqs, 30, false); |
| |
| count = 0; |
| boolean sawZero = false; |
| //System.out.println("\ngetPayloadSpans test"); |
| Spans pspans = snq.getSpans(is.getIndexReader()); |
| while (pspans.next()) { |
| //System.out.println(pspans.doc() + " - " + pspans.start() + " - "+ pspans.end()); |
| Collection<byte[]> payloads = pspans.getPayload(); |
| sawZero |= pspans.start() == 0; |
| count += payloads.size(); |
| } |
| assertEquals(5, count); |
| assertTrue(sawZero); |
| |
| //System.out.println("\ngetSpans test"); |
| Spans spans = snq.getSpans(is.getIndexReader()); |
| count = 0; |
| sawZero = false; |
| while (spans.next()) { |
| count++; |
| sawZero |= spans.start() == 0; |
| //System.out.println(spans.doc() + " - " + spans.start() + " - " + spans.end()); |
| } |
| assertEquals(4, count); |
| assertTrue(sawZero); |
| |
| //System.out.println("\nPayloadSpanUtil test"); |
| |
| sawZero = false; |
| PayloadSpanUtil psu = new PayloadSpanUtil(is.getIndexReader()); |
| Collection<byte[]> pls = psu.getPayloadsForQuery(snq); |
| count = pls.size(); |
| for (byte[] bytes : pls) { |
| String s = new String(bytes); |
| //System.out.println(s); |
| sawZero |= s.equals("pos: 0"); |
| } |
| assertEquals(5, count); |
| assertTrue(sawZero); |
| writer.close(); |
| is.getIndexReader().close(); |
| dir.close(); |
| } |
| } |
| |
| final class TestPayloadAnalyzer extends Analyzer { |
| |
| @Override |
| public TokenStream tokenStream(String fieldName, Reader reader) { |
| TokenStream result = new LowerCaseTokenizer(LuceneTestCase.TEST_VERSION_CURRENT, reader); |
| return new PayloadFilter(result, fieldName); |
| } |
| } |
| |
| final class PayloadFilter extends TokenFilter { |
| String fieldName; |
| |
| int pos; |
| |
| int i; |
| |
| final PositionIncrementAttribute posIncrAttr; |
| final PayloadAttribute payloadAttr; |
| final CharTermAttribute termAttr; |
| |
| public PayloadFilter(TokenStream input, String fieldName) { |
| super(input); |
| this.fieldName = fieldName; |
| pos = 0; |
| i = 0; |
| posIncrAttr = input.addAttribute(PositionIncrementAttribute.class); |
| payloadAttr = input.addAttribute(PayloadAttribute.class); |
| termAttr = input.addAttribute(CharTermAttribute.class); |
| } |
| |
| @Override |
| public boolean incrementToken() throws IOException { |
| if (input.incrementToken()) { |
| payloadAttr.setPayload(new Payload(("pos: " + pos).getBytes())); |
| int posIncr; |
| if (i % 2 == 1) { |
| posIncr = 1; |
| } else { |
| posIncr = 0; |
| } |
| posIncrAttr.setPositionIncrement(posIncr); |
| pos += posIncr; |
| if (TestPositionIncrement.VERBOSE) { |
| System.out.println("term=" + termAttr + " pos=" + pos); |
| } |
| i++; |
| return true; |
| } else { |
| return false; |
| } |
| } |
| } |