opennlp-similarity/src/main/java/opennlp/tools/stemmer/PorterStemmer.java - opennlp-sandbox - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License. You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 package opennlp.tools.stemmer;


 	import java.io.IOException;
 	import java.io.InputStream;
 	import java.io.FileInputStream;

 	import static org.apache.lucene.util.RamUsageEstimator.NUM_BYTES_CHAR;
 	import org.apache.lucene.util.ArrayUtil;

 	/**
 	 *
 	 * Stemmer, implementing the Porter Stemming Algorithm
 	 *
 	 * The Stemmer class transforms a word into its root form.  The input
 	 * word can be provided a character at time (by calling add()), or at once
 	 * by calling one of the various stem(something) methods.
 	 */

 	public class PorterStemmer
 	{
 	  private char[] b;
 	  private int i,    /* offset into b */
 	    j, k, k0;
 	  private boolean dirty = false;
 	  private static final int INITIAL_SIZE = 50;

 	  public PorterStemmer() {
 	    b = new char[INITIAL_SIZE];
 	    i = 0;
 	  }

 	  /**
 	   * reset() resets the stemmer so it can stem another word.  If you invoke
 	   * the stemmer by calling add(char) and then stem(), you must call reset()
 	   * before starting another word.
 	   */
 	  public void reset() { i = 0; dirty = false; }

 	  /**
 	   * Add a character to the word being stemmed.  When you are finished
 	   * adding characters, you can call stem(void) to process the word.
 	   */
 	  public void add(char ch) {
 	    if (b.length <= i) {
 	      b = ArrayUtil.grow(b, i+1);
 	    }
 	    b[i++] = ch;
 	  }

 	  /**
 	   * After a word has been stemmed, it can be retrieved by toString(),
 	   * or a reference to the internal buffer can be retrieved by getResultBuffer
 	   * and getResultLength (which is generally more efficient.)
 	   */
 	  @Override
 	  public String toString() { return new String(b,0,i); }

 	  /**
 	   * Returns the length of the word resulting from the stemming process.
 	   */
 	  public int getResultLength() { return i; }

 	  /**
 	   * Returns a reference to a character buffer containing the results of
 	   * the stemming process.  You also need to consult getResultLength()
 	   * to determine the length of the result.
 	   */
 	  public char[] getResultBuffer() { return b; }

 	  /* cons(i) is true <=> b[i] is a consonant. */

 	  private final boolean cons(int i) {
 	    switch (b[i]) {
 	    case 'a': case 'e': case 'i': case 'o': case 'u':
 	      return false;
 	    case 'y':
 	      return (i==k0) ? true : !cons(i-1);
 	    default:
 	      return true;
 	    }
 	  }

 	  /* m() measures the number of consonant sequences between k0 and j. if c is
 	     a consonant sequence and v a vowel sequence, and <..> indicates arbitrary
 	     presence,

 	          <c><v>       gives 0
 	          <c>vc<v>     gives 1
 	          <c>vcvc<v>   gives 2
 	          <c>vcvcvc<v> gives 3
 	          ....
 	  */

 	  private final int m() {
 	    int n = 0;
 	    int i = k0;
 	    while(true) {
 	      if (i > j)
 	        return n;
 	      if (! cons(i))
 	        break;
 	      i++;
 	    }
 	    i++;
 	    while(true) {
 	      while(true) {
 	        if (i > j)
 	          return n;
 	        if (cons(i))
 	          break;
 	        i++;
 	      }
 	      i++;
 	      n++;
 	      while(true) {
 	        if (i > j)
 	          return n;
 	        if (! cons(i))
 	          break;
 	        i++;
 	      }
 	      i++;
 	    }
 	  }

 	  /* vowelinstem() is true <=> k0,...j contains a vowel */

 	  private final boolean vowelinstem() {
 	    int i;
 	    for (i = k0; i <= j; i++)
 	      if (! cons(i))
 	        return true;
 	    return false;
 	  }

 	  /* doublec(j) is true <=> j,(j-1) contain a double consonant. */

 	  private final boolean doublec(int j) {
 	    if (j < k0+1)
 	      return false;
 	    if (b[j] != b[j-1])
 	      return false;
 	    return cons(j);
 	  }

 	  /* cvc(i) is true <=> i-2,i-1,i has the form consonant - vowel - consonant
 	     and also if the second c is not w,x or y. this is used when trying to
 	     restore an e at the end of a short word. e.g.

 	          cav(e), lov(e), hop(e), crim(e), but
 	          snow, box, tray.

 	  */

 	  private final boolean cvc(int i) {
 	    if (i < k0+2 || !cons(i) || cons(i-1) || !cons(i-2))
 	      return false;
 	    else {
 	      int ch = b[i];
 	      if (ch == 'w' || ch == 'x' || ch == 'y') return false;
 	    }
 	    return true;
 	  }

 	  private final boolean ends(String s) {
 	    int l = s.length();
 	    int o = k-l+1;
 	    if (o < k0)
 	      return false;
 	    for (int i = 0; i < l; i++)
 	      if (b[o+i] != s.charAt(i))
 	        return false;
 	    j = k-l;
 	    return true;
 	  }

 	  /* setto(s) sets (j+1),...k to the characters in the string s, readjusting
 	     k. */

 	  void setto(String s) {
 	    int l = s.length();
 	    int o = j+1;
 	    for (int i = 0; i < l; i++)
 	      b[o+i] = s.charAt(i);
 	    k = j+l;
 	    dirty = true;
 	  }

 	  /* r(s) is used further down. */

 	  void r(String s) { if (m() > 0) setto(s); }

 	  /* step1() gets rid of plurals and -ed or -ing. e.g.

 	           caresses  ->  caress
 	           ponies    ->  poni
 	           ties      ->  ti
 	           caress    ->  caress
 	           cats      ->  cat

 	           feed      ->  feed
 	           agreed    ->  agree
 	           disabled  ->  disable

 	           matting   ->  mat
 	           mating    ->  mate
 	           meeting   ->  meet
 	           milling   ->  mill
 	           messing   ->  mess

 	           meetings  ->  meet

 	  */

 	  private final void step1() {
 	    if (b[k] == 's') {
 	      if (ends("sses")) k -= 2;
 	      else if (ends("ies")) setto("i");
 	      else if (b[k-1] != 's') k--;
 	    }
 	    if (ends("eed")) {
 	      if (m() > 0)
 	        k--;
 	    }
 	    else if ((ends("ed") || ends("ing")) && vowelinstem()) {
 	      k = j;
 	      if (ends("at")) setto("ate");
 	      else if (ends("bl")) setto("ble");
 	      else if (ends("iz")) setto("ize");
 	      else if (doublec(k)) {
 	        int ch = b[k--];
 	        if (ch == 'l' || ch == 's' || ch == 'z')
 	          k++;
 	      }
 	      else if (m() == 1 && cvc(k))
 	        setto("e");
 	    }
 	  }

 	  /* step2() turns terminal y to i when there is another vowel in the stem. */

 	  private final void step2() {
 	    if (ends("y") && vowelinstem()) {
 	      b[k] = 'i';
 	      dirty = true;
 	    }
 	  }

 	  /* step3() maps double suffices to single ones. so -ization ( = -ize plus
 	     -ation) maps to -ize etc. note that the string before the suffix must give
 	     m() > 0. */

 	  private final void step3() {
 	    if (k == k0) return; /* For Bug 1 */
 	    switch (b[k-1]) {
 	    case 'a':
 	      if (ends("ational")) { r("ate"); break; }
 	      if (ends("tional")) { r("tion"); break; }
 	      break;
 	    case 'c':
 	      if (ends("enci")) { r("ence"); break; }
 	      if (ends("anci")) { r("ance"); break; }
 	      break;
 	    case 'e':
 	      if (ends("izer")) { r("ize"); break; }
 	      break;
 	    case 'l':
 	      if (ends("bli")) { r("ble"); break; }
 	      if (ends("alli")) { r("al"); break; }
 	      if (ends("entli")) { r("ent"); break; }
 	      if (ends("eli")) { r("e"); break; }
 	      if (ends("ousli")) { r("ous"); break; }
 	      break;
 	    case 'o':
 	      if (ends("ization")) { r("ize"); break; }
 	      if (ends("ation")) { r("ate"); break; }
 	      if (ends("ator")) { r("ate"); break; }
 	      break;
 	    case 's':
 	      if (ends("alism")) { r("al"); break; }
 	      if (ends("iveness")) { r("ive"); break; }
 	      if (ends("fulness")) { r("ful"); break; }
 	      if (ends("ousness")) { r("ous"); break; }
 	      break;
 	    case 't':
 	      if (ends("aliti")) { r("al"); break; }
 	      if (ends("iviti")) { r("ive"); break; }
 	      if (ends("biliti")) { r("ble"); break; }
 	      break;
 	    case 'g':
 	      if (ends("logi")) { r("log"); break; }
 	    }
 	  }

 	  /* step4() deals with -ic-, -full, -ness etc. similar strategy to step3. */

 	  private final void step4() {
 	    switch (b[k]) {
 	    case 'e':
 	      if (ends("icate")) { r("ic"); break; }
 	      if (ends("ative")) { r(""); break; }
 	      if (ends("alize")) { r("al"); break; }
 	      break;
 	    case 'i':
 	      if (ends("iciti")) { r("ic"); break; }
 	      break;
 	    case 'l':
 	      if (ends("ical")) { r("ic"); break; }
 	      if (ends("ful")) { r(""); break; }
 	      break;
 	    case 's':
 	      if (ends("ness")) { r(""); break; }
 	      break;
 	    }
 	  }

 	  /* step5() takes off -ant, -ence etc., in context <c>vcvc<v>. */

 	  private final void step5() {
 	    if (k == k0) return; /* for Bug 1 */
 	    switch (b[k-1]) {
 	    case 'a':
 	      if (ends("al")) break;
 	      return;
 	    case 'c':
 	      if (ends("ance")) break;
 	      if (ends("ence")) break;
 	      return;
 	    case 'e':
 	      if (ends("er")) break; return;
 	    case 'i':
 	      if (ends("ic")) break; return;
 	    case 'l':
 	      if (ends("able")) break;
 	      if (ends("ible")) break; return;
 	    case 'n':
 	      if (ends("ant")) break;
 	      if (ends("ement")) break;
 	      if (ends("ment")) break;
 	      /* element etc. not stripped before the m */
 	      if (ends("ent")) break;
 	      return;
 	    case 'o':
 	      if (ends("ion") && j >= 0 && (b[j] == 's' || b[j] == 't')) break;
 	      /* j >= 0 fixes Bug 2 */
 	      if (ends("ou")) break;
 	      return;
 	      /* takes care of -ous */
 	    case 's':
 	      if (ends("ism")) break;
 	      return;
 	    case 't':
 	      if (ends("ate")) break;
 	      if (ends("iti")) break;
 	      return;
 	    case 'u':
 	      if (ends("ous")) break;
 	      return;
 	    case 'v':
 	      if (ends("ive")) break;
 	      return;
 	    case 'z':
 	      if (ends("ize")) break;
 	      return;
 	    default:
 	      return;
 	    }
 	    if (m() > 1)
 	      k = j;
 	  }

 	  /* step6() removes a final -e if m() > 1. */

 	  private final void step6() {
 	    j = k;
 	    if (b[k] == 'e') {
 	      int a = m();
 	      if (a > 1 || a == 1 && !cvc(k-1))
 	        k--;
 	    }
 	    if (b[k] == 'l' && doublec(k) && m() > 1)
 	      k--;
 	  }


 	  /**
 	   * Stem a word provided as a String.  Returns the result as a String.
 	   */
 	  public String stem(String s) {
 	    if (stem(s.toCharArray(), s.length()))
 	      return toString();
 	    else
 	      return s;
 	  }

 	  /** Stem a word contained in a char[].  Returns true if the stemming process
 	   * resulted in a word different from the input.  You can retrieve the
 	   * result with getResultLength()/getResultBuffer() or toString().
 	   */
 	  public boolean stem(char[] word) {
 	    return stem(word, word.length);
 	  }

 	  /** Stem a word contained in a portion of a char[] array.  Returns
 	   * true if the stemming process resulted in a word different from
 	   * the input.  You can retrieve the result with
 	   * getResultLength()/getResultBuffer() or toString().
 	   */
 	  public boolean stem(char[] wordBuffer, int offset, int wordLen) {
 	    reset();
 	    if (b.length < wordLen) {
 	      b = new char[ArrayUtil.oversize(wordLen, NUM_BYTES_CHAR)];
 	    }
 	    System.arraycopy(wordBuffer, offset, b, 0, wordLen);
 	    i = wordLen;
 	    return stem(0);
 	  }

 	  /** Stem a word contained in a leading portion of a char[] array.
 	   * Returns true if the stemming process resulted in a word different
 	   * from the input.  You can retrieve the result with
 	   * getResultLength()/getResultBuffer() or toString().
 	   */
 	  public boolean stem(char[] word, int wordLen) {
 	    return stem(word, 0, wordLen);
 	  }

 	  /** Stem the word placed into the Stemmer buffer through calls to add().
 	   * Returns true if the stemming process resulted in a word different
 	   * from the input.  You can retrieve the result with
 	   * getResultLength()/getResultBuffer() or toString().
 	   */
 	  public boolean stem() {
 	    return stem(0);
 	  }

 	  public boolean stem(int i0) {
 	    k = i - 1;
 	    k0 = i0;
 	    if (k > k0+1) {
 	      step1(); step2(); step3(); step4(); step5(); step6();
 	    }
 	    // Also, a word is considered dirty if we lopped off letters
 	    // Thanks to Ifigenia Vairelles for pointing this out.
 	    if (i != k+1)
 	      dirty = true;
 	    i = k+1;
 	    return dirty;
 	  }

 	  /** Test program for demonstrating the Stemmer.  It reads a file and
 	   * stems each word, writing the result to standard out.
 	   * Usage: Stemmer file-name
 	   */
 	  public static void main(String[] args) {
 	    PorterStemmer s = new PorterStemmer();

 	    for (int i = 0; i < args.length; i++) {
 	      try {
 	        InputStream in = new FileInputStream(args[i]);
 	        byte[] buffer = new byte[1024];
 	        int bufferLen, offset, ch;

 	        bufferLen = in.read(buffer);
 	        offset = 0;
 	        s.reset();

 	        while(true) {
 	          if (offset < bufferLen)
 	            ch = buffer[offset++];
 	          else {
 	            bufferLen = in.read(buffer);
 	            offset = 0;
 	            if (bufferLen < 0)
 	              ch = -1;
 	            else
 	              ch = buffer[offset++];
 	          }

 	          if (Character.isLetter((char) ch)) {
 	            s.add(Character.toLowerCase((char) ch));
 	          }
 	          else {
 	             s.stem();
 	             System.out.print(s.toString());
 	             s.reset();
 	             if (ch < 0)
 	               break;
 	             else {
 	               System.out.print((char) ch);
 	             }
 	           }
 	        }

 	        in.close();
 	      }
 	      catch (IOException e) {
 	        System.out.println("error reading " + args[i]);
 	      }
 	    }
 	  }
 	}
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	package opennlp.tools.stemmer;


	import java.io.IOException;
	import java.io.InputStream;
	import java.io.FileInputStream;

	import static org.apache.lucene.util.RamUsageEstimator.NUM_BYTES_CHAR;
	import org.apache.lucene.util.ArrayUtil;

	/**
	*
	* Stemmer, implementing the Porter Stemming Algorithm
	*
	* The Stemmer class transforms a word into its root form. The input
	* word can be provided a character at time (by calling add()), or at once
	* by calling one of the various stem(something) methods.
	*/

	public class PorterStemmer
	{
	private char[] b;
	private int i, /* offset into b */
	j, k, k0;
	private boolean dirty = false;
	private static final int INITIAL_SIZE = 50;

	public PorterStemmer() {
	b = new char[INITIAL_SIZE];
	i = 0;
	}

	/**
	* reset() resets the stemmer so it can stem another word. If you invoke
	* the stemmer by calling add(char) and then stem(), you must call reset()
	* before starting another word.
	*/
	public void reset() { i = 0; dirty = false; }

	/**
	* Add a character to the word being stemmed. When you are finished
	* adding characters, you can call stem(void) to process the word.
	*/
	public void add(char ch) {
	if (b.length <= i) {
	b = ArrayUtil.grow(b, i+1);
	}
	b[i++] = ch;
	}

	/**
	* After a word has been stemmed, it can be retrieved by toString(),
	* or a reference to the internal buffer can be retrieved by getResultBuffer
	* and getResultLength (which is generally more efficient.)
	*/
	@Override
	public String toString() { return new String(b,0,i); }

	/**
	* Returns the length of the word resulting from the stemming process.
	*/
	public int getResultLength() { return i; }

	/**
	* Returns a reference to a character buffer containing the results of
	* the stemming process. You also need to consult getResultLength()
	* to determine the length of the result.
	*/
	public char[] getResultBuffer() { return b; }

	/* cons(i) is true <=> b[i] is a consonant. */

	private final boolean cons(int i) {
	switch (b[i]) {
	case 'a': case 'e': case 'i': case 'o': case 'u':
	return false;
	case 'y':
	return (i==k0) ? true : !cons(i-1);
	default:
	return true;
	}
	}

	/* m() measures the number of consonant sequences between k0 and j. if c is
	a consonant sequence and v a vowel sequence, and <..> indicates arbitrary
	presence,

	<c><v> gives 0
	<c>vc<v> gives 1
	<c>vcvc<v> gives 2
	<c>vcvcvc<v> gives 3
	....
	*/

	private final int m() {
	int n = 0;
	int i = k0;
	while(true) {
	if (i > j)
	return n;
	if (! cons(i))
	break;
	i++;
	}
	i++;
	while(true) {
	while(true) {
	if (i > j)
	return n;
	if (cons(i))
	break;
	i++;
	}
	i++;
	n++;
	while(true) {
	if (i > j)
	return n;
	if (! cons(i))
	break;
	i++;
	}
	i++;
	}
	}

	/* vowelinstem() is true <=> k0,...j contains a vowel */

	private final boolean vowelinstem() {
	int i;
	for (i = k0; i <= j; i++)
	if (! cons(i))
	return true;
	return false;
	}

	/* doublec(j) is true <=> j,(j-1) contain a double consonant. */

	private final boolean doublec(int j) {
	if (j < k0+1)
	return false;
	if (b[j] != b[j-1])
	return false;
	return cons(j);
	}

	/* cvc(i) is true <=> i-2,i-1,i has the form consonant - vowel - consonant
	and also if the second c is not w,x or y. this is used when trying to
	restore an e at the end of a short word. e.g.

	cav(e), lov(e), hop(e), crim(e), but
	snow, box, tray.

	*/

	private final boolean cvc(int i) {
	if (i < k0+2 \|\| !cons(i) \|\| cons(i-1) \|\| !cons(i-2))
	return false;
	else {
	int ch = b[i];
	if (ch == 'w' \|\| ch == 'x' \|\| ch == 'y') return false;
	}
	return true;
	}

	private final boolean ends(String s) {
	int l = s.length();
	int o = k-l+1;
	if (o < k0)
	return false;
	for (int i = 0; i < l; i++)
	if (b[o+i] != s.charAt(i))
	return false;
	j = k-l;
	return true;
	}

	/* setto(s) sets (j+1),...k to the characters in the string s, readjusting
	k. */

	void setto(String s) {
	int l = s.length();
	int o = j+1;
	for (int i = 0; i < l; i++)
	b[o+i] = s.charAt(i);
	k = j+l;
	dirty = true;
	}

	/* r(s) is used further down. */

	void r(String s) { if (m() > 0) setto(s); }

	/* step1() gets rid of plurals and -ed or -ing. e.g.

	caresses -> caress
	ponies -> poni
	ties -> ti
	caress -> caress
	cats -> cat

	feed -> feed
	agreed -> agree
	disabled -> disable

	matting -> mat
	mating -> mate
	meeting -> meet
	milling -> mill
	messing -> mess

	meetings -> meet

	*/

	private final void step1() {
	if (b[k] == 's') {
	if (ends("sses")) k -= 2;
	else if (ends("ies")) setto("i");
	else if (b[k-1] != 's') k--;
	}
	if (ends("eed")) {
	if (m() > 0)
	k--;
	}
	else if ((ends("ed") \|\| ends("ing")) && vowelinstem()) {
	k = j;
	if (ends("at")) setto("ate");
	else if (ends("bl")) setto("ble");
	else if (ends("iz")) setto("ize");
	else if (doublec(k)) {
	int ch = b[k--];
	if (ch == 'l' \|\| ch == 's' \|\| ch == 'z')
	k++;
	}
	else if (m() == 1 && cvc(k))
	setto("e");
	}
	}

	/* step2() turns terminal y to i when there is another vowel in the stem. */

	private final void step2() {
	if (ends("y") && vowelinstem()) {
	b[k] = 'i';
	dirty = true;
	}
	}

	/* step3() maps double suffices to single ones. so -ization ( = -ize plus
	-ation) maps to -ize etc. note that the string before the suffix must give
	m() > 0. */

	private final void step3() {
	if (k == k0) return; /* For Bug 1 */
	switch (b[k-1]) {
	case 'a':
	if (ends("ational")) { r("ate"); break; }
	if (ends("tional")) { r("tion"); break; }
	break;
	case 'c':
	if (ends("enci")) { r("ence"); break; }
	if (ends("anci")) { r("ance"); break; }
	break;
	case 'e':
	if (ends("izer")) { r("ize"); break; }
	break;
	case 'l':
	if (ends("bli")) { r("ble"); break; }
	if (ends("alli")) { r("al"); break; }
	if (ends("entli")) { r("ent"); break; }
	if (ends("eli")) { r("e"); break; }
	if (ends("ousli")) { r("ous"); break; }
	break;
	case 'o':
	if (ends("ization")) { r("ize"); break; }
	if (ends("ation")) { r("ate"); break; }
	if (ends("ator")) { r("ate"); break; }
	break;
	case 's':
	if (ends("alism")) { r("al"); break; }
	if (ends("iveness")) { r("ive"); break; }
	if (ends("fulness")) { r("ful"); break; }
	if (ends("ousness")) { r("ous"); break; }
	break;
	case 't':
	if (ends("aliti")) { r("al"); break; }
	if (ends("iviti")) { r("ive"); break; }
	if (ends("biliti")) { r("ble"); break; }
	break;
	case 'g':
	if (ends("logi")) { r("log"); break; }
	}
	}

	/* step4() deals with -ic-, -full, -ness etc. similar strategy to step3. */

	private final void step4() {
	switch (b[k]) {
	case 'e':
	if (ends("icate")) { r("ic"); break; }
	if (ends("ative")) { r(""); break; }
	if (ends("alize")) { r("al"); break; }
	break;
	case 'i':
	if (ends("iciti")) { r("ic"); break; }
	break;
	case 'l':
	if (ends("ical")) { r("ic"); break; }
	if (ends("ful")) { r(""); break; }
	break;
	case 's':
	if (ends("ness")) { r(""); break; }
	break;
	}
	}

	/* step5() takes off -ant, -ence etc., in context <c>vcvc<v>. */

	private final void step5() {
	if (k == k0) return; /* for Bug 1 */
	switch (b[k-1]) {
	case 'a':
	if (ends("al")) break;
	return;
	case 'c':
	if (ends("ance")) break;
	if (ends("ence")) break;
	return;
	case 'e':
	if (ends("er")) break; return;
	case 'i':
	if (ends("ic")) break; return;
	case 'l':
	if (ends("able")) break;
	if (ends("ible")) break; return;
	case 'n':
	if (ends("ant")) break;
	if (ends("ement")) break;
	if (ends("ment")) break;
	/* element etc. not stripped before the m */
	if (ends("ent")) break;
	return;
	case 'o':
	if (ends("ion") && j >= 0 && (b[j] == 's' \|\| b[j] == 't')) break;
	/* j >= 0 fixes Bug 2 */
	if (ends("ou")) break;
	return;
	/* takes care of -ous */
	case 's':
	if (ends("ism")) break;
	return;
	case 't':
	if (ends("ate")) break;
	if (ends("iti")) break;
	return;
	case 'u':
	if (ends("ous")) break;
	return;
	case 'v':
	if (ends("ive")) break;
	return;
	case 'z':
	if (ends("ize")) break;
	return;
	default:
	return;
	}
	if (m() > 1)
	k = j;
	}

	/* step6() removes a final -e if m() > 1. */

	private final void step6() {
	j = k;
	if (b[k] == 'e') {
	int a = m();
	if (a > 1 \|\| a == 1 && !cvc(k-1))
	k--;
	}
	if (b[k] == 'l' && doublec(k) && m() > 1)
	k--;
	}


	/**
	* Stem a word provided as a String. Returns the result as a String.
	*/
	public String stem(String s) {
	if (stem(s.toCharArray(), s.length()))
	return toString();
	else
	return s;
	}

	/** Stem a word contained in a char[]. Returns true if the stemming process
	* resulted in a word different from the input. You can retrieve the
	* result with getResultLength()/getResultBuffer() or toString().
	*/
	public boolean stem(char[] word) {
	return stem(word, word.length);
	}

	/** Stem a word contained in a portion of a char[] array. Returns
	* true if the stemming process resulted in a word different from
	* the input. You can retrieve the result with
	* getResultLength()/getResultBuffer() or toString().
	*/
	public boolean stem(char[] wordBuffer, int offset, int wordLen) {
	reset();
	if (b.length < wordLen) {
	b = new char[ArrayUtil.oversize(wordLen, NUM_BYTES_CHAR)];
	}
	System.arraycopy(wordBuffer, offset, b, 0, wordLen);
	i = wordLen;
	return stem(0);
	}

	/** Stem a word contained in a leading portion of a char[] array.
	* Returns true if the stemming process resulted in a word different
	* from the input. You can retrieve the result with
	* getResultLength()/getResultBuffer() or toString().
	*/
	public boolean stem(char[] word, int wordLen) {
	return stem(word, 0, wordLen);
	}

	/** Stem the word placed into the Stemmer buffer through calls to add().
	* Returns true if the stemming process resulted in a word different
	* from the input. You can retrieve the result with
	* getResultLength()/getResultBuffer() or toString().
	*/
	public boolean stem() {
	return stem(0);
	}

	public boolean stem(int i0) {
	k = i - 1;
	k0 = i0;
	if (k > k0+1) {
	step1(); step2(); step3(); step4(); step5(); step6();
	}
	// Also, a word is considered dirty if we lopped off letters
	// Thanks to Ifigenia Vairelles for pointing this out.
	if (i != k+1)
	dirty = true;
	i = k+1;
	return dirty;
	}

	/** Test program for demonstrating the Stemmer. It reads a file and
	* stems each word, writing the result to standard out.
	* Usage: Stemmer file-name
	*/
	public static void main(String[] args) {
	PorterStemmer s = new PorterStemmer();

	for (int i = 0; i < args.length; i++) {
	try {
	InputStream in = new FileInputStream(args[i]);
	byte[] buffer = new byte[1024];
	int bufferLen, offset, ch;

	bufferLen = in.read(buffer);
	offset = 0;
	s.reset();

	while(true) {
	if (offset < bufferLen)
	ch = buffer[offset++];
	else {
	bufferLen = in.read(buffer);
	offset = 0;
	if (bufferLen < 0)
	ch = -1;
	else
	ch = buffer[offset++];
	}

	if (Character.isLetter((char) ch)) {
	s.add(Character.toLowerCase((char) ch));
	}
	else {
	s.stem();
	System.out.print(s.toString());
	s.reset();
	if (ch < 0)
	break;
	else {
	System.out.print((char) ch);
	}
	}
	}

	in.close();
	}
	catch (IOException e) {
	System.out.println("error reading " + args[i]);
	}
	}
	}
	}