lucene/analysis/common/src/java/org/apache/lucene/analysis/th/ThaiTokenizer.java - lucene-solr - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.lucene.analysis.th;


 import java.text.BreakIterator;
 import java.util.Locale;

 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.analysis.util.CharArrayIterator;
 import org.apache.lucene.analysis.util.SegmentingTokenizerBase;
 import org.apache.lucene.util.AttributeFactory;

 /**
  * Tokenizer that uses {@link BreakIterator} to tokenize Thai text.
  * <p>WARNING: this tokenizer may not be supported by all JREs.
  *    It is known to work with Sun/Oracle and Harmony JREs.
  *    If your application needs to be fully portable, consider using ICUTokenizer instead,
  *    which uses an ICU Thai BreakIterator that will always be available.
  */
 public class ThaiTokenizer extends SegmentingTokenizerBase {
   /** Shared Thai word iterator; every tokenizer instance works on its own clone of it. */
   private static final BreakIterator proto = BreakIterator.getWordInstance(new Locale("th"));

   /**
    * True if the JRE supports a working dictionary-based breakiterator for Thai.
    * If this is false, this tokenizer will not work at all!
    */
   public static final boolean DBBI_AVAILABLE = hasWorkingThaiBreakIterator();

   /** used for breaking the text into sentences */
   private static final BreakIterator sentenceProto = BreakIterator.getSentenceInstance(Locale.ROOT);

   /** Per-instance clone of {@link #proto}, re-targeted at each sentence. */
   private final BreakIterator wordBreaker;
   /** Exposes a slice of the tokenizer buffer to {@link #wordBreaker}. */
   private final CharArrayIterator wrapper = CharArrayIterator.newWordInstance();

   /** Start of the current sentence, as passed to {@link #setNextSentence(int, int)}. */
   int sentenceStart;
   /** End of the current sentence, as passed to {@link #setNextSentence(int, int)}. */
   int sentenceEnd;

   private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
   private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);

   /**
    * Probes the shared word iterator with a short Thai sample and reports whether it
    * places a boundary at index 4, i.e. inside a run of Thai letters that contains no
    * whitespace or punctuation.
    */
   private static boolean hasWorkingThaiBreakIterator() {
     proto.setText("ภาษาไทย");
     return proto.isBoundary(4);
   }

   /** Creates a new ThaiTokenizer */
   public ThaiTokenizer() {
     this(DEFAULT_TOKEN_ATTRIBUTE_FACTORY);
   }

   /**
    * Creates a new ThaiTokenizer, supplying the AttributeFactory
    *
    * @throws UnsupportedOperationException if {@link #DBBI_AVAILABLE} is false
    */
   public ThaiTokenizer(AttributeFactory factory) {
     super(factory, (BreakIterator) sentenceProto.clone());
     if (DBBI_AVAILABLE) {
       wordBreaker = (BreakIterator) proto.clone();
     } else {
       throw new UnsupportedOperationException("This JRE does not have support for Thai segmentation");
     }
   }

   /**
    * Remembers the bounds of the new sentence and points the word iterator at the
    * corresponding slice of the buffer.
    */
   @Override
   protected void setNextSentence(int sentenceStart, int sentenceEnd) {
     this.sentenceStart = sentenceStart;
     this.sentenceEnd = sentenceEnd;
     final int sentenceLength = sentenceEnd - sentenceStart;
     wrapper.setText(buffer, sentenceStart, sentenceLength);
     wordBreaker.setText(wrapper);
   }

   /**
    * Advances to the next segment of the current sentence whose first code point is a
    * letter or digit and publishes it through the term and offset attributes.
    *
    * @return true if a token was produced, false once the word iterator is exhausted
    */
   @Override
   protected boolean incrementWord() {
     int begin = wordBreaker.current();
     if (begin == BreakIterator.DONE) {
       return false; // BreakIterator exhausted
     }

     // walk boundary pairs; word iterator positions are added to sentenceStart to index the buffer
     for (int end = wordBreaker.next(); end != BreakIterator.DONE; begin = end, end = wordBreaker.next()) {
       final int firstCodePoint = Character.codePointAt(buffer, sentenceStart + begin, sentenceEnd);
       if (!Character.isLetterOrDigit(firstCodePoint)) {
         continue; // not a token, try the next segment
       }

       clearAttributes();
       termAtt.copyBuffer(buffer, sentenceStart + begin, end - begin);
       offsetAtt.setOffset(correctOffset(offset + sentenceStart + begin), correctOffset(offset + sentenceStart + end));
       return true;
     }

     return false; // BreakIterator exhausted
   }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package org.apache.lucene.analysis.th;


	import java.text.BreakIterator;
	import java.util.Locale;

	import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
	import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
	import org.apache.lucene.analysis.util.CharArrayIterator;
	import org.apache.lucene.analysis.util.SegmentingTokenizerBase;
	import org.apache.lucene.util.AttributeFactory;

	/**
	 * Tokenizer that uses {@link BreakIterator} to tokenize Thai text.
	 * <p>WARNING: this tokenizer may not be supported by all JREs.
	 * It is known to work with Sun/Oracle and Harmony JREs.
	 * If your application needs to be fully portable, consider using ICUTokenizer instead,
	 * which uses an ICU Thai BreakIterator that will always be available.
	 */
	public class ThaiTokenizer extends SegmentingTokenizerBase {
	  /** Shared Thai word iterator; every tokenizer instance works on its own clone of it. */
	  private static final BreakIterator proto = BreakIterator.getWordInstance(new Locale("th"));

	  /**
	   * True if the JRE supports a working dictionary-based breakiterator for Thai.
	   * If this is false, this tokenizer will not work at all!
	   */
	  public static final boolean DBBI_AVAILABLE = hasWorkingThaiBreakIterator();

	  /** used for breaking the text into sentences */
	  private static final BreakIterator sentenceProto = BreakIterator.getSentenceInstance(Locale.ROOT);

	  /** Per-instance clone of {@link #proto}, re-targeted at each sentence. */
	  private final BreakIterator wordBreaker;
	  /** Exposes a slice of the tokenizer buffer to {@link #wordBreaker}. */
	  private final CharArrayIterator wrapper = CharArrayIterator.newWordInstance();

	  /** Start of the current sentence, as passed to {@link #setNextSentence(int, int)}. */
	  int sentenceStart;
	  /** End of the current sentence, as passed to {@link #setNextSentence(int, int)}. */
	  int sentenceEnd;

	  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
	  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);

	  /**
	   * Probes the shared word iterator with a short Thai sample and reports whether it
	   * places a boundary at index 4, i.e. inside a run of Thai letters that contains no
	   * whitespace or punctuation.
	   */
	  private static boolean hasWorkingThaiBreakIterator() {
	    proto.setText("ภาษาไทย");
	    return proto.isBoundary(4);
	  }

	  /** Creates a new ThaiTokenizer */
	  public ThaiTokenizer() {
	    this(DEFAULT_TOKEN_ATTRIBUTE_FACTORY);
	  }

	  /**
	   * Creates a new ThaiTokenizer, supplying the AttributeFactory
	   *
	   * @throws UnsupportedOperationException if {@link #DBBI_AVAILABLE} is false
	   */
	  public ThaiTokenizer(AttributeFactory factory) {
	    super(factory, (BreakIterator) sentenceProto.clone());
	    if (DBBI_AVAILABLE) {
	      wordBreaker = (BreakIterator) proto.clone();
	    } else {
	      throw new UnsupportedOperationException("This JRE does not have support for Thai segmentation");
	    }
	  }

	  /**
	   * Remembers the bounds of the new sentence and points the word iterator at the
	   * corresponding slice of the buffer.
	   */
	  @Override
	  protected void setNextSentence(int sentenceStart, int sentenceEnd) {
	    this.sentenceStart = sentenceStart;
	    this.sentenceEnd = sentenceEnd;
	    final int sentenceLength = sentenceEnd - sentenceStart;
	    wrapper.setText(buffer, sentenceStart, sentenceLength);
	    wordBreaker.setText(wrapper);
	  }

	  /**
	   * Advances to the next segment of the current sentence whose first code point is a
	   * letter or digit and publishes it through the term and offset attributes.
	   *
	   * @return true if a token was produced, false once the word iterator is exhausted
	   */
	  @Override
	  protected boolean incrementWord() {
	    int begin = wordBreaker.current();
	    if (begin == BreakIterator.DONE) {
	      return false; // BreakIterator exhausted
	    }

	    // walk boundary pairs; word iterator positions are added to sentenceStart to index the buffer
	    for (int end = wordBreaker.next(); end != BreakIterator.DONE; begin = end, end = wordBreaker.next()) {
	      final int firstCodePoint = Character.codePointAt(buffer, sentenceStart + begin, sentenceEnd);
	      if (!Character.isLetterOrDigit(firstCodePoint)) {
	        continue; // not a token, try the next segment
	      }

	      clearAttributes();
	      termAtt.copyBuffer(buffer, sentenceStart + begin, end - begin);
	      offsetAtt.setOffset(correctOffset(offset + sentenceStart + begin), correctOffset(offset + sentenceStart + end));
	      return true;
	    }

	    return false; // BreakIterator exhausted
	  }
	}