| Index: src/test/org/apache/lucene/index/TestDocumentWriter.java |
| =================================================================== |
| --- src/test/org/apache/lucene/index/TestDocumentWriter.java (revision 414705) |
| +++ src/test/org/apache/lucene/index/TestDocumentWriter.java (working copy) |
| @@ -16,11 +16,15 @@ |
| * limitations under the License. |
| */ |
| |
| +import java.util.LinkedList; |
| +import java.util.List; |
| import junit.framework.TestCase; |
| import org.apache.lucene.analysis.Analyzer; |
| +import org.apache.lucene.analysis.Token; |
| import org.apache.lucene.analysis.WhitespaceAnalyzer; |
| import org.apache.lucene.analysis.TokenStream; |
| import org.apache.lucene.analysis.WhitespaceTokenizer; |
| +import org.apache.lucene.analysis.TokenSelector; |
| import org.apache.lucene.document.*; |
| import org.apache.lucene.search.Similarity; |
| import org.apache.lucene.store.RAMDirectory; |
| @@ -54,6 +58,16 @@ |
| Analyzer analyzer = new WhitespaceAnalyzer(); |
| Similarity similarity = Similarity.getDefault(); |
| DocumentWriter writer = new DocumentWriter(dir, analyzer, similarity, 50); |
| + writer.setTermVectorTokenSelector(new TokenSelector(){ |
| + public boolean accept(String field, Token t) { |
| + return Character.isLowerCase(t.termText().charAt(0)); |
| + } |
| + }); |
| + writer.setPositionsTokenSelector(new TokenSelector(){ |
| + public boolean accept(String field, Token t) { |
| + return Character.isLowerCase(t.termText().charAt(0)); |
| + } |
| + }); |
| String segName = "test"; |
| writer.addDocument(segName, testDoc); |
| //After adding the document, we should be able to read it back in |
| @@ -84,6 +98,31 @@ |
| fields = doc.getFields(DocHelper.TEXT_FIELD_3_KEY); |
| assertTrue(fields != null && fields.length == 1); |
| assertTrue(fields[0].stringValue().equals(DocHelper.FIELD_3_TEXT)); |
| + |
| + fields = doc.getFields(DocHelper.TEXT_FIELD_UTF2_KEY); |
| + assertTrue(fields != null && fields.length == 1); |
| + assertTrue(fields[0].stringValue().equals(DocHelper.FIELD_UTF2_TEXT)); |
| + assertTrue(fields[0].isTermVectorStored()); |
| + TermFreqVector tv = reader.getTermFreqVector(0, DocHelper.TEXT_FIELD_UTF2_KEY); |
| + assertTrue(tv != null); |
| + String[] words = DocHelper.FIELD_UTF2_TEXT.split("\\s+"); |
| + String[] tvwords = tv.getTerms(); |
| + List uniques = new LinkedList(); |
| + int omitted = 0; |
| + for (int i=0; i<words.length; i++) |
| + if (!uniques.contains(words[i])) { |
| + uniques.add(words[i]); |
| + if (!Character.isLowerCase(words[i].charAt(0))) |
| + omitted++; |
| + } |
| + assertTrue(omitted!=0); |
| + assertTrue(omitted!=uniques.size()); |
| + assertEquals(uniques.size()-omitted, tvwords.length); |
| + for (int i=0; i<uniques.size(); i++) { |
| + for (int j=0; j<tvwords.length; j++) |
| + if (uniques.get(i).equals(tvwords[j])) |
| + assertTrue(Character.isLowerCase(((String)uniques.get(i)).charAt(0))); |
| + } |
| |
| // test that the norm file is not present if omitNorms is true |
| for (int i = 0; i < reader.fieldInfos.size(); i++) { |
| Index: src/java/org/apache/lucene/analysis/TokenSelector.java |
| =================================================================== |
| --- src/java/org/apache/lucene/analysis/TokenSelector.java (revision 0) |
| +++ src/java/org/apache/lucene/analysis/TokenSelector.java (revision 0) |
| @@ -0,0 +1,24 @@ |
| +/* |
| + * TokenSelector.java |
| + * |
| + * Created on June 13, 2006, 12:18 PM |
| + * |
| + */ |
| + |
| +package org.apache.lucene.analysis; |
| + |
| +/** |
| + * An interface for selecting a subset of a token stream |
| + * |
| + * @author Chuck Wiliams |
| + */ |
| +public interface TokenSelector { |
| + |
| + /** Determine if a token should be selected |
| + * @param fieldName field in which token was found |
| + * @param token a token |
| + * @return true iff token should be selected |
| + */ |
| + public boolean accept(String fieldName, Token token); |
| + |
| +} |
| Index: src/java/org/apache/lucene/analysis/PerFieldTokenSelectorWrapper.java |
| =================================================================== |
| --- src/java/org/apache/lucene/analysis/PerFieldTokenSelectorWrapper.java (revision 0) |
| +++ src/java/org/apache/lucene/analysis/PerFieldTokenSelectorWrapper.java (revision 0) |
| @@ -0,0 +1,44 @@ |
| +/* |
| + * PerFieldTokenSelectorWrapper.java |
| + * |
| + * Created on June 13, 2006, 4:09 PM |
| + * |
| + */ |
| + |
| +package org.apache.lucene.analysis; |
| + |
| +import java.util.HashMap; |
| +import java.util.Map; |
| + |
| +/** |
| + * Expert: TokenSelector that implements a mapping from field names to TokenSelectors |
| + * |
| + * @author Chuck Williams |
| + */ |
| +public class PerFieldTokenSelectorWrapper implements TokenSelector { |
| + |
| + private Map selectors = new HashMap(); |
| + private TokenSelector defaultSelector; |
| + |
| + /** Expert: create a PerFieldTokenSelector with given default selector (null means select all) */ |
| + public PerFieldTokenSelectorWrapper(TokenSelector defaultSelector) { |
| + this.defaultSelector = defaultSelector; |
| + } |
| + |
| + /** Add a token selector for the named field */ |
| + public void addSelector(String fieldName, TokenSelector selector) { |
| + selectors.put(fieldName, selector); |
| + } |
| + |
| + /** Determine if token is accepted by fieldName */ |
| + public boolean accept(String fieldName, Token token) { |
| + TokenSelector selector = (TokenSelector) selectors.get(fieldName); |
| + if (selector!=null) |
| + return selector.accept(fieldName, token); |
| + else if (defaultSelector!=null) |
| + return defaultSelector.accept(fieldName, token); |
| + else |
| + return true; |
| + } |
| + |
| +} |
| \ No newline at end of file |
| Index: src/java/org/apache/lucene/index/IndexWriter.java |
| =================================================================== |
| --- src/java/org/apache/lucene/index/IndexWriter.java (revision 414705) |
| +++ src/java/org/apache/lucene/index/IndexWriter.java (working copy) |
| @@ -17,6 +17,7 @@ |
| */ |
| |
| import org.apache.lucene.analysis.Analyzer; |
| +import org.apache.lucene.analysis.TokenSelector; |
| import org.apache.lucene.document.Document; |
| import org.apache.lucene.search.Similarity; |
| import org.apache.lucene.store.Directory; |
| @@ -100,8 +101,10 @@ |
| */ |
| public final static int DEFAULT_TERM_INDEX_INTERVAL = 128; |
| |
| - private Directory directory; // where this index resides |
| - private Analyzer analyzer; // how to analyze text |
| + private Directory directory; // where this index resides |
| + private Analyzer analyzer; // how to analyze text |
| + private TokenSelector termVectorTokenSelector; // subset of token stream stored in term vectors |
| + private TokenSelector positionsTokenSelector; // subset of token stream for which positions are stored |
| |
| private Similarity similarity = Similarity.getDefault(); // how to normalize |
| |
| @@ -153,6 +156,38 @@ |
| return this.similarity; |
| } |
| |
| + /** Expert: Set the TokenSelector used to determine subset of tokens stored in term vectors. |
| + * @param selector the term vector TokenSelector |
| + */ |
| + public void setTermVectorTokenSelector(TokenSelector selector) { |
| + this.termVectorTokenSelector = selector; |
| + } |
| + |
| + /** Expert: Set the TokenSelector used to determine subset of tokens stored in term vectors. |
| + * @return the TokenSelector used to determine term vector tokens |
| + */ |
| + public TokenSelector getTermVectorTokenSelector() { |
| + return termVectorTokenSelector; |
| + } |
| + |
| + /** Expert: Set the TokenSelector used to determine subset of tokens for which positions are stored. |
| + * (At least one position is always stored for each term in each doc to ensure the term stays in |
| + * the index so long as any docs reference it) |
| + * @param selector the positions TokenSelector |
| + */ |
| + public void setPositionsTokenSelector(TokenSelector selector) { |
| + this.positionsTokenSelector = selector; |
| + } |
| + |
| + /** Expert: Set the TokenSelector used to determine subset of tokens for which freq and positions are stored.. |
| + * (At least one position is always stored for each term in each doc to ensure the term stays in |
| + * the index so long as any docs reference it) |
| + * @return the positions TokenSelector |
| + */ |
| + public TokenSelector getPositionsTokenSelector() { |
| + return positionsTokenSelector; |
| + } |
| + |
| /** Expert: Set the interval between indexed terms. Large values cause less |
| * memory to be used by IndexReader, but slow random-access to terms. Small |
| * values cause more memory to be used by an IndexReader, and speed |
| @@ -471,6 +506,8 @@ |
| public void addDocument(Document doc, Analyzer analyzer) throws IOException { |
| DocumentWriter dw = |
| new DocumentWriter(ramDirectory, analyzer, this); |
| + dw.setTermVectorTokenSelector(termVectorTokenSelector); |
| + dw.setPositionsTokenSelector(positionsTokenSelector); |
| dw.setInfoStream(infoStream); |
| String segmentName = newSegmentName(); |
| dw.addDocument(segmentName, doc); |
| Index: src/java/org/apache/lucene/index/DocumentWriter.java |
| =================================================================== |
| --- src/java/org/apache/lucene/index/DocumentWriter.java (revision 414705) |
| +++ src/java/org/apache/lucene/index/DocumentWriter.java (working copy) |
| @@ -17,6 +17,7 @@ |
| */ |
| |
| import org.apache.lucene.analysis.Analyzer; |
| +import org.apache.lucene.analysis.TokenSelector; |
| import org.apache.lucene.analysis.Token; |
| import org.apache.lucene.analysis.TokenStream; |
| import org.apache.lucene.document.Document; |
| @@ -35,6 +36,8 @@ |
| |
| final class DocumentWriter { |
| private Analyzer analyzer; |
| + private TokenSelector termVectorTokenSelector; |
| + private TokenSelector positionsTokenSelector; |
| private Directory directory; |
| private Similarity similarity; |
| private FieldInfos fieldInfos; |
| @@ -142,9 +145,9 @@ |
| if (!field.isTokenized()) { // un-tokenized field |
| String stringValue = field.stringValue(); |
| if(field.isStoreOffsetWithTermVector()) |
| - addPosition(fieldName, stringValue, position++, new TermVectorOffsetInfo(offset, offset + stringValue.length())); |
| + addPosition(fieldName, stringValue, position++, new TermVectorOffsetInfo(offset, offset + stringValue.length()), false, false); |
| else |
| - addPosition(fieldName, stringValue, position++, null); |
| + addPosition(fieldName, stringValue, position++, null, false, false); |
| offset += stringValue.length(); |
| length++; |
| } else |
| @@ -165,10 +168,16 @@ |
| for (Token t = stream.next(); t != null; t = stream.next()) { |
| position += (t.getPositionIncrement() - 1); |
| |
| - if(field.isStoreOffsetWithTermVector()) |
| - addPosition(fieldName, t.termText(), position++, new TermVectorOffsetInfo(offset + t.startOffset(), offset + t.endOffset())); |
| - else |
| - addPosition(fieldName, t.termText(), position++, null); |
| + boolean omittv = false, omitpos = false; |
| + if (termVectorTokenSelector!=null && !termVectorTokenSelector.accept(field.name(), t)) |
| + omittv = true; |
| + if (positionsTokenSelector !=null && !positionsTokenSelector. accept(field.name(), t)) |
| + omitpos = true; |
| + |
| + addPosition(fieldName, t.termText(), position++, |
| + field.isStoreOffsetWithTermVector() && !omittv ? new TermVectorOffsetInfo(offset + t.startOffset(), offset + t.endOffset()) |
| + : null, |
| + omittv, omitpos); |
| |
| lastToken = t; |
| if (++length > maxFieldLength) { |
| @@ -196,20 +205,24 @@ |
| |
| private final Term termBuffer = new Term("", ""); // avoid consing |
| |
| - private final void addPosition(String field, String text, int position, TermVectorOffsetInfo offset) { |
| + private final void addPosition(String field, String text, int position, TermVectorOffsetInfo offset, |
| + boolean omitFromTermVector, boolean omitPosition) { |
| termBuffer.set(field, text); |
| //System.out.println("Offset: " + offset); |
| Posting ti = (Posting) postingTable.get(termBuffer); |
| if (ti != null) { // word seen before |
| int freq = ti.freq; |
| - if (ti.positions.length == freq) { // positions array is full |
| - int[] newPositions = new int[freq * 2]; // double size |
| - int[] positions = ti.positions; |
| - for (int i = 0; i < freq; i++) // copy old positions to new |
| - newPositions[i] = positions[i]; |
| - ti.positions = newPositions; |
| + |
| + if (!omitPosition) { |
| + if (ti.positions.length == freq) { // positions array is full |
| + int[] newPositions = new int[freq * 2]; // double size |
| + int[] positions = ti.positions; |
| + for (int i = 0; i < freq; i++) // copy old positions to new |
| + newPositions[i] = positions[i]; |
| + ti.positions = newPositions; |
| + } |
| + ti.positions[freq] = position; // add new position |
| } |
| - ti.positions[freq] = position; // add new position |
| |
| if (offset != null) { |
| if (ti.offsets.length == freq){ |
| @@ -223,10 +236,12 @@ |
| } |
| ti.offsets[freq] = offset; |
| } |
| - ti.freq = freq + 1; // update frequency |
| - } else { // word not seen before |
| + |
| + if (!omitPosition) |
| + ti.freq = freq + 1; // update frequency |
| + } else { // word not seen before |
| Term term = new Term(field, text, false); |
| - postingTable.put(term, new Posting(term, position, offset)); |
| + postingTable.put(term, new Posting(term, position, offset, omitFromTermVector)); |
| } |
| } |
| |
| @@ -351,7 +366,7 @@ |
| termVectorWriter.closeField(); |
| } |
| } |
| - if (termVectorWriter != null && termVectorWriter.isFieldOpen()) { |
| + if (termVectorWriter != null && termVectorWriter.isFieldOpen() && !posting.omitFromTermVector) { |
| termVectorWriter.addTerm(posting.term.text(), postingFreq, posting.positions, posting.offsets); |
| } |
| } |
| @@ -390,6 +405,16 @@ |
| this.infoStream = infoStream; |
| } |
| |
| + /** If non-null, this will be used to select which tokens are stored in term vectors */ |
| + void setTermVectorTokenSelector(TokenSelector selector) { |
| + this.termVectorTokenSelector = selector; |
| + } |
| + |
| + /** If non-null, this will be used to select which tokens have positions stored in the index. */ |
| + void setPositionsTokenSelector(TokenSelector selector) { |
| + this.positionsTokenSelector = selector; |
| + } |
| + |
| } |
| |
| final class Posting { // info about a Term in a doc |
| @@ -397,17 +422,17 @@ |
| int freq; // its frequency in doc |
| int[] positions; // positions it occurs at |
| TermVectorOffsetInfo [] offsets; |
| + boolean omitFromTermVector; // if true, omit from term vector |
| |
| - Posting(Term t, int position, TermVectorOffsetInfo offset) { |
| + Posting(Term t, int position, TermVectorOffsetInfo offset, boolean omitFromTermVector) { |
| term = t; |
| freq = 1; |
| positions = new int[1]; |
| positions[0] = position; |
| - if(offset != null){ |
| - offsets = new TermVectorOffsetInfo[1]; |
| - offsets[0] = offset; |
| + if(offset != null) { |
| + offsets = new TermVectorOffsetInfo[1]; |
| + offsets[0] = offset; |
| } |
| - else |
| - offsets = null; |
| + this.omitFromTermVector = omitFromTermVector; |
| } |
| } |