Index: src/test/org/apache/lucene/index/TestDocumentWriter.java
===================================================================
--- src/test/org/apache/lucene/index/TestDocumentWriter.java (revision 414705)
+++ src/test/org/apache/lucene/index/TestDocumentWriter.java (working copy)
@@ -16,11 +16,15 @@
* limitations under the License.
*/
+import java.util.LinkedList;
+import java.util.List;
import junit.framework.TestCase;
import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;
+import org.apache.lucene.analysis.TokenSelector;
import org.apache.lucene.document.*;
import org.apache.lucene.search.Similarity;
import org.apache.lucene.store.RAMDirectory;
@@ -54,6 +58,16 @@
Analyzer analyzer = new WhitespaceAnalyzer();
Similarity similarity = Similarity.getDefault();
DocumentWriter writer = new DocumentWriter(dir, analyzer, similarity, 50);
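+ // store term vectors and positions only for tokens whose first character is lowercase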
+ TokenSelector lowercaseOnly = new TokenSelector() {
+ public boolean accept(String field, Token t) {
+ return Character.isLowerCase(t.termText().charAt(0));
+ }
+ };
+ writer.setTermVectorTokenSelector(lowercaseOnly);
+ writer.setPositionsTokenSelector(lowercaseOnly);
String segName = "test";
writer.addDocument(segName, testDoc);
//After adding the document, we should be able to read it back in
@@ -84,6 +98,31 @@
fields = doc.getFields(DocHelper.TEXT_FIELD_3_KEY);
assertTrue(fields != null && fields.length == 1);
assertTrue(fields[0].stringValue().equals(DocHelper.FIELD_3_TEXT));
+
+ fields = doc.getFields(DocHelper.TEXT_FIELD_UTF2_KEY);
+ assertTrue(fields != null && fields.length == 1);
+ assertTrue(fields[0].stringValue().equals(DocHelper.FIELD_UTF2_TEXT));
+ assertTrue(fields[0].isTermVectorStored());
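+ // the term vector should contain exactly the unique terms that begin with a lowercase character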
+ TermFreqVector tv = reader.getTermFreqVector(0, DocHelper.TEXT_FIELD_UTF2_KEY);
+ assertTrue(tv != null);
+ String[] words = DocHelper.FIELD_UTF2_TEXT.split("\\s+");
+ String[] tvwords = tv.getTerms();
+ List uniques = new LinkedList();
+ int omitted = 0;
+ for (int i=0; i<words.length; i++)
+ if (!uniques.contains(words[i])) {
+ uniques.add(words[i]);
+ if (!Character.isLowerCase(words[i].charAt(0)))
+ omitted++;
+ }
+ assertTrue(omitted!=0);
+ assertTrue(omitted!=uniques.size());
+ assertEquals(uniques.size()-omitted, tvwords.length);
+ for (int i=0; i<uniques.size(); i++) {
+ for (int j=0; j<tvwords.length; j++)
+ if (uniques.get(i).equals(tvwords[j]))
+ assertTrue(Character.isLowerCase(((String)uniques.get(i)).charAt(0)));
+ }
// test that the norm file is not present if omitNorms is true
for (int i = 0; i < reader.fieldInfos.size(); i++) {
Index: src/java/org/apache/lucene/analysis/TokenSelector.java
===================================================================
--- src/java/org/apache/lucene/analysis/TokenSelector.java (revision 0)
+++ src/java/org/apache/lucene/analysis/TokenSelector.java (revision 0)
@@ -0,0 +1,24 @@
+/*
+ * TokenSelector.java
+ *
+ * Created on June 13, 2006, 12:18 PM
+ *
+ */
+
+package org.apache.lucene.analysis;
+
+/**
+ * An interface for selecting a subset of a token stream.
+ *
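+ * <p>For example, a selector that accepts only tokens beginning with a
+ * lowercase character (mirroring the accompanying unit test) might be
+ * written as:
+ * <pre>
+ * TokenSelector lowercaseOnly = new TokenSelector() {
+ *   public boolean accept(String fieldName, Token token) {
+ *     return Character.isLowerCase(token.termText().charAt(0));
+ *   }
+ * };
+ * </pre>
+ *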
+ * @author Chuck Williams
+ */
+public interface TokenSelector {
+
+ /** Determine whether a token should be selected.
+ * @param fieldName field in which token was found
+ * @param token a token
+ * @return true iff token should be selected
+ */
+ public boolean accept(String fieldName, Token token);
+
+}
Index: src/java/org/apache/lucene/analysis/PerFieldTokenSelectorWrapper.java
===================================================================
--- src/java/org/apache/lucene/analysis/PerFieldTokenSelectorWrapper.java (revision 0)
+++ src/java/org/apache/lucene/analysis/PerFieldTokenSelectorWrapper.java (revision 0)
@@ -0,0 +1,44 @@
+/*
+ * PerFieldTokenSelectorWrapper.java
+ *
+ * Created on June 13, 2006, 4:09 PM
+ *
+ */
+
+package org.apache.lucene.analysis;
+
+import java.util.HashMap;
+import java.util.Map;
+
+/**
+ * Expert: a TokenSelector that implements a mapping from field names to TokenSelectors.
+ *
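+ * <p>For example, to fall back on a default selector for most fields while
+ * keeping every token of one particular field (the "title" name and the
+ * someDefaultSelector instance are illustrative only):
+ * <pre>
+ * PerFieldTokenSelectorWrapper wrapper = new PerFieldTokenSelectorWrapper(someDefaultSelector);
+ * wrapper.addSelector("title", new TokenSelector() {
+ *   public boolean accept(String fieldName, Token token) { return true; }
+ * });
+ * </pre>
+ *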
+ * @author Chuck Williams
+ */
+public class PerFieldTokenSelectorWrapper implements TokenSelector {
+
+ private Map selectors = new HashMap();
+ private TokenSelector defaultSelector;
+
+ /** Expert: create a PerFieldTokenSelectorWrapper with the given default selector (null means select all tokens) */
+ public PerFieldTokenSelectorWrapper(TokenSelector defaultSelector) {
+ this.defaultSelector = defaultSelector;
+ }
+
+ /** Add a token selector for the named field */
+ public void addSelector(String fieldName, TokenSelector selector) {
+ selectors.put(fieldName, selector);
+ }
+
+ /** Determine whether the token is accepted by the selector registered for fieldName, falling back to the default selector (or accepting all tokens if there is none) */
+ public boolean accept(String fieldName, Token token) {
+ TokenSelector selector = (TokenSelector) selectors.get(fieldName);
+ if (selector!=null)
+ return selector.accept(fieldName, token);
+ else if (defaultSelector!=null)
+ return defaultSelector.accept(fieldName, token);
+ else
+ return true;
+ }
+
+}
\ No newline at end of file
Index: src/java/org/apache/lucene/index/IndexWriter.java
===================================================================
--- src/java/org/apache/lucene/index/IndexWriter.java (revision 414705)
+++ src/java/org/apache/lucene/index/IndexWriter.java (working copy)
@@ -17,6 +17,7 @@
*/
import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenSelector;
import org.apache.lucene.document.Document;
import org.apache.lucene.search.Similarity;
import org.apache.lucene.store.Directory;
@@ -100,8 +101,10 @@
*/
public final static int DEFAULT_TERM_INDEX_INTERVAL = 128;
- private Directory directory; // where this index resides
- private Analyzer analyzer; // how to analyze text
+ private Directory directory; // where this index resides
+ private Analyzer analyzer; // how to analyze text
+ private TokenSelector termVectorTokenSelector; // subset of token stream stored in term vectors
+ private TokenSelector positionsTokenSelector; // subset of token stream for which positions are stored
private Similarity similarity = Similarity.getDefault(); // how to normalize
@@ -153,6 +156,38 @@
return this.similarity;
}
+ /** Expert: Set the TokenSelector used to determine the subset of tokens stored in term vectors.
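+ * <p>A sketch of typical use, assuming writer is this IndexWriter (the
+ * selector mirrors the one in the unit tests):
+ * <pre>
+ * writer.setTermVectorTokenSelector(new TokenSelector() {
+ *   public boolean accept(String fieldName, Token token) {
+ *     return Character.isLowerCase(token.termText().charAt(0));
+ *   }
+ * });
+ * </pre>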
+ * @param selector the term vector TokenSelector
+ */
+ public void setTermVectorTokenSelector(TokenSelector selector) {
+ this.termVectorTokenSelector = selector;
+ }
+
+ /** Expert: Get the TokenSelector used to determine the subset of tokens stored in term vectors.
+ * @return the TokenSelector used to determine term vector tokens
+ */
+ public TokenSelector getTermVectorTokenSelector() {
+ return termVectorTokenSelector;
+ }
+
+ /** Expert: Set the TokenSelector used to determine the subset of tokens for which positions are stored.
+ * (At least one position is always stored for each term in each doc to ensure the term stays in
+ * the index so long as any docs reference it.)
+ * @param selector the positions TokenSelector
+ */
+ public void setPositionsTokenSelector(TokenSelector selector) {
+ this.positionsTokenSelector = selector;
+ }
+
+ /** Expert: Get the TokenSelector used to determine the subset of tokens for which positions are stored.
+ * (At least one position is always stored for each term in each doc to ensure the term stays in
+ * the index so long as any docs reference it.)
+ * @return the positions TokenSelector
+ */
+ public TokenSelector getPositionsTokenSelector() {
+ return positionsTokenSelector;
+ }
+
/** Expert: Set the interval between indexed terms. Large values cause less
* memory to be used by IndexReader, but slow random-access to terms. Small
* values cause more memory to be used by an IndexReader, and speed
@@ -471,6 +506,8 @@
public void addDocument(Document doc, Analyzer analyzer) throws IOException {
DocumentWriter dw =
new DocumentWriter(ramDirectory, analyzer, this);
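+ // propagate any configured token selectors to the per-document writer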
+ dw.setTermVectorTokenSelector(termVectorTokenSelector);
+ dw.setPositionsTokenSelector(positionsTokenSelector);
dw.setInfoStream(infoStream);
String segmentName = newSegmentName();
dw.addDocument(segmentName, doc);
Index: src/java/org/apache/lucene/index/DocumentWriter.java
===================================================================
--- src/java/org/apache/lucene/index/DocumentWriter.java (revision 414705)
+++ src/java/org/apache/lucene/index/DocumentWriter.java (working copy)
@@ -17,6 +17,7 @@
*/
import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenSelector;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.document.Document;
@@ -35,6 +36,8 @@
final class DocumentWriter {
private Analyzer analyzer;
+ private TokenSelector termVectorTokenSelector;
+ private TokenSelector positionsTokenSelector;
private Directory directory;
private Similarity similarity;
private FieldInfos fieldInfos;
@@ -142,9 +145,9 @@
if (!field.isTokenized()) { // un-tokenized field
String stringValue = field.stringValue();
if(field.isStoreOffsetWithTermVector())
- addPosition(fieldName, stringValue, position++, new TermVectorOffsetInfo(offset, offset + stringValue.length()));
+ addPosition(fieldName, stringValue, position++, new TermVectorOffsetInfo(offset, offset + stringValue.length()), false, false);
else
- addPosition(fieldName, stringValue, position++, null);
+ addPosition(fieldName, stringValue, position++, null, false, false);
offset += stringValue.length();
length++;
} else
@@ -165,10 +168,16 @@
for (Token t = stream.next(); t != null; t = stream.next()) {
position += (t.getPositionIncrement() - 1);
- if(field.isStoreOffsetWithTermVector())
- addPosition(fieldName, t.termText(), position++, new TermVectorOffsetInfo(offset + t.startOffset(), offset + t.endOffset()));
- else
- addPosition(fieldName, t.termText(), position++, null);
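+ // consult the selectors: a token may be omitted from the term vector, from stored positions, or both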
+ boolean omittv = false, omitpos = false;
+ if (termVectorTokenSelector != null && !termVectorTokenSelector.accept(field.name(), t))
+ omittv = true;
+ if (positionsTokenSelector != null && !positionsTokenSelector.accept(field.name(), t))
+ omitpos = true;
+
+ addPosition(fieldName, t.termText(), position++,
+ field.isStoreOffsetWithTermVector() && !omittv ? new TermVectorOffsetInfo(offset + t.startOffset(), offset + t.endOffset())
+ : null,
+ omittv, omitpos);
lastToken = t;
if (++length > maxFieldLength) {
@@ -196,20 +205,24 @@
private final Term termBuffer = new Term("", ""); // avoid consing
- private final void addPosition(String field, String text, int position, TermVectorOffsetInfo offset) {
+ private final void addPosition(String field, String text, int position, TermVectorOffsetInfo offset,
+ boolean omitFromTermVector, boolean omitPosition) {
termBuffer.set(field, text);
//System.out.println("Offset: " + offset);
Posting ti = (Posting) postingTable.get(termBuffer);
if (ti != null) { // word seen before
int freq = ti.freq;
- if (ti.positions.length == freq) { // positions array is full
- int[] newPositions = new int[freq * 2]; // double size
- int[] positions = ti.positions;
- for (int i = 0; i < freq; i++) // copy old positions to new
- newPositions[i] = positions[i];
- ti.positions = newPositions;
+
+ if (!omitPosition) {
+ if (ti.positions.length == freq) { // positions array is full
+ int[] newPositions = new int[freq * 2]; // double size
+ int[] positions = ti.positions;
+ for (int i = 0; i < freq; i++) // copy old positions to new
+ newPositions[i] = positions[i];
+ ti.positions = newPositions;
+ }
+ ti.positions[freq] = position; // add new position
}
- ti.positions[freq] = position; // add new position
if (offset != null) {
if (ti.offsets.length == freq){
@@ -223,10 +236,12 @@
}
ti.offsets[freq] = offset;
}
- ti.freq = freq + 1; // update frequency
- } else { // word not seen before
+
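+ // when positions are omitted freq is not advanced, so freq always matches the number of stored positions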
+ if (!omitPosition)
+ ti.freq = freq + 1; // update frequency
+ } else { // word not seen before
Term term = new Term(field, text, false);
- postingTable.put(term, new Posting(term, position, offset));
+ postingTable.put(term, new Posting(term, position, offset, omitFromTermVector));
}
}
@@ -351,7 +366,7 @@
termVectorWriter.closeField();
}
}
- if (termVectorWriter != null && termVectorWriter.isFieldOpen()) {
+ if (termVectorWriter != null && termVectorWriter.isFieldOpen() && !posting.omitFromTermVector) {
termVectorWriter.addTerm(posting.term.text(), postingFreq, posting.positions, posting.offsets);
}
}
@@ -390,6 +405,16 @@
this.infoStream = infoStream;
}
+ /** If non-null, this will be used to select which tokens are stored in term vectors */
+ void setTermVectorTokenSelector(TokenSelector selector) {
+ this.termVectorTokenSelector = selector;
+ }
+
+ /** If non-null, this will be used to select which tokens have positions stored in the index. */
+ void setPositionsTokenSelector(TokenSelector selector) {
+ this.positionsTokenSelector = selector;
+ }
+
}
final class Posting { // info about a Term in a doc
@@ -397,17 +422,17 @@
int freq; // its frequency in doc
int[] positions; // positions it occurs at
TermVectorOffsetInfo [] offsets;
+ boolean omitFromTermVector; // if true, omit from term vector
- Posting(Term t, int position, TermVectorOffsetInfo offset) {
+ Posting(Term t, int position, TermVectorOffsetInfo offset, boolean omitFromTermVector) {
term = t;
freq = 1;
positions = new int[1];
positions[0] = position;
- if(offset != null){
- offsets = new TermVectorOffsetInfo[1];
- offsets[0] = offset;
+ if(offset != null) {
+ offsets = new TermVectorOffsetInfo[1];
+ offsets[0] = offset;
}
- else
- offsets = null;
+ this.omitFromTermVector = omitFromTermVector;
}
}