| Index: contrib/analyzers/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java |
| =================================================================== |
| --- contrib/analyzers/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java (revision 669476) |
| +++ contrib/analyzers/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java (working copy) |
| @@ -67,6 +67,11 @@ |
| private boolean outputUnigrams = true; |
| |
| /** |
| + * Sometimes you may want to make sure that a single unigram is output |
| + * when no shingles have been generated. */ |
| + private boolean outputUnigramIfNoNgrams = false; |
| + |
| + /** |
| * maximum shingle size (number of tokens) |
| */ |
| private int maxShingleSize; |
| @@ -124,6 +129,10 @@ |
| this.outputUnigrams = outputUnigrams; |
| } |
| |
| + public void setOutputUnigramIfNoNgrams(boolean outputUnigramIfNoNgrams) { |
| + this.outputUnigramIfNoNgrams = outputUnigramIfNoNgrams; |
| + } |
| + |
| /** |
| * Set the max shingle size (default: 2) |
| * |
| @@ -149,6 +158,9 @@ |
| } |
| } |
| |
| + private Token mostRecentInputToken = null; |
| + private boolean returnedAnyTokensYet = false; |
| + |
| /* (non-Javadoc) |
| * @see org.apache.lucene.analysis.TokenStream#next() |
| */ |
| @@ -159,9 +171,18 @@ |
| Token nextToken = null; |
| if ( ! outputBuf.isEmpty()) |
| { |
| - nextToken = (Token)outputBuf.remove(0); |
| + returnedAnyTokensYet = true; |
| + return (Token)outputBuf.remove(0); |
| } |
| - return nextToken; |
| + else if (outputUnigramIfNoNgrams && !returnedAnyTokensYet && mostRecentInputToken != null) |
| + { |
| + returnedAnyTokensYet = true; |
| + return mostRecentInputToken; |
| + } |
| + else |
| + { |
| + return null; |
| + } |
| } |
| |
| /** |
| @@ -182,6 +203,7 @@ |
| lastToken.startOffset())); |
| } |
| tokenBuf.add(lastToken); |
| + mostRecentInputToken = lastToken; |
| return getNextToken(); |
| } else { |
| return null; |
| Index: contrib/analyzers/src/test/org/apache/lucene/analysis/shingle/ShingleFilterTest.java |
| =================================================================== |
| --- contrib/analyzers/src/test/org/apache/lucene/analysis/shingle/ShingleFilterTest.java (revision 669476) |
| +++ contrib/analyzers/src/test/org/apache/lucene/analysis/shingle/ShingleFilterTest.java (working copy) |
| @@ -100,6 +100,23 @@ |
| 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 |
| }; |
| |
| + public static final Token[] BI_GRAM_TOKENS_WITHOUT_UNIGRAMS = new Token[] { |
| + new Token("please divide", 0, 13), |
| + new Token("divide this", 7, 18), |
| + new Token("this sentence", 14, 27), |
| + new Token("sentence into", 19, 32), |
| + new Token("into shingles", 28, 39), |
| + }; |
| + |
| + public static final int[] BI_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS = new int[] { |
| + 1, 1, 1, 1, 1 |
| + }; |
| + |
| + public static final String[] BI_GRAM_TYPES_WITHOUT_UNIGRAMS = new String[] { |
| + "shingle", "shingle", "shingle", "shingle", "shingle" |
| + }; |
| + |
| + |
| public static final Token[] TRI_GRAM_TOKENS = new Token[] { |
| new Token("please", 0, 6), |
| new Token("please divide", 0, 13), |
| @@ -132,6 +149,24 @@ |
| }; |
| |
| |
| + public static final Token[] TEST_SINGLE_TOKEN = new Token[] { |
| + new Token("please", 0, 6), |
| + }; |
| + |
| + public static final Token[] SINGLE_TOKEN = new Token[] { |
| + new Token("please", 0, 6), |
| + }; |
| + |
| + public static final int[] SINGLE_TOKEN_INCREMENTS = new int[] { |
| + 1 |
| + }; |
| + |
| + public static final String[] SINGLE_TOKEN_TYPES = new String[] { |
| + "word" |
| + }; |
| + |
| + |
| + |
| protected void setUp() throws Exception { |
| super.setUp(); |
| testTokenWithHoles = new Token[] { |
| @@ -148,6 +183,7 @@ |
| /* |
| * Class under test for void ShingleFilter(TokenStream, int) |
| */ |
| + |
| public void testBiGramFilter() throws IOException { |
| this.shingleFilterTest(2, TEST_TOKEN, BI_GRAM_TOKENS, |
| BI_GRAM_POSITION_INCREMENTS, BI_GRAM_TYPES); |
| @@ -158,16 +194,50 @@ |
| BI_GRAM_POSITION_INCREMENTS, BI_GRAM_TYPES); |
| } |
| |
| + public void testBiGramFilterWithJustOneToken() throws IOException { |
| + this.shingleFilterTest(2, TEST_SINGLE_TOKEN, SINGLE_TOKEN, |
| + SINGLE_TOKEN_INCREMENTS, SINGLE_TOKEN_TYPES); |
| + } |
| + |
| + public void testBiGramFilterWithOutputUnigramIfNoNgramsOption() throws IOException { |
| + this.shingleFilterTestOutputUnigramIfNoNgrams(2, TEST_TOKEN, |
| + BI_GRAM_TOKENS_WITHOUT_UNIGRAMS, |
| + BI_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS, |
| + BI_GRAM_TYPES_WITHOUT_UNIGRAMS); |
| + } |
| + |
| + public void testBiGramFilterWithOutputUnigramIfNoNgramsOptionAndJustOneToken() throws IOException { |
| + this.shingleFilterTestOutputUnigramIfNoNgrams(2, TEST_SINGLE_TOKEN, |
| + SINGLE_TOKEN, SINGLE_TOKEN_INCREMENTS, |
| + SINGLE_TOKEN_TYPES); |
| + } |
| + |
| public void testTriGramFilter() throws IOException { |
| this.shingleFilterTest(3, TEST_TOKEN, TRI_GRAM_TOKENS, |
| TRI_GRAM_POSITION_INCREMENTS, TRI_GRAM_TYPES); |
| } |
| |
| + |
| protected void shingleFilterTest(int maxSize, Token[] tokensToShingle, Token[] tokensToCompare, |
| int[] positionIncrements, String[] types) |
| throws IOException { |
| + TokenStream filter = new ShingleFilter(new TestTokenStream(tokensToShingle), maxSize); |
| + this.shingleFilterTest(filter, tokensToShingle, tokensToCompare, positionIncrements, types); |
| + } |
| |
| - TokenStream filter = new ShingleFilter(new TestTokenStream(tokensToShingle), maxSize); |
| + protected void shingleFilterTestOutputUnigramIfNoNgrams(int maxSize, Token[] tokensToShingle, Token[] tokensToCompare, |
| + int[] positionIncrements, String[] types) |
| + throws IOException { |
| + ShingleFilter filter = new ShingleFilter(new TestTokenStream(tokensToShingle), maxSize); |
| + filter.setOutputUnigrams(false); |
| + filter.setOutputUnigramIfNoNgrams(true); |
| + this.shingleFilterTest(filter, tokensToShingle, tokensToCompare, positionIncrements, types); |
| + } |
| + |
| + protected void shingleFilterTest(TokenStream filter, Token[] tokensToShingle, Token[] tokensToCompare, |
| + int[] positionIncrements, String[] types) |
| + throws IOException { |
| + |
| Token token; |
| int i = 0; |
| |
| @@ -185,5 +255,7 @@ |
| assertEquals("Wrong type for token \"" + termText + "\"", types[i], token.type()); |
| i++; |
| } |
| + |
| + assertEquals("Wrong number of tokens returned", tokensToCompare.length, i); |
| } |
| } |