lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/BreakIteratorWrapper.java - lucene-solr - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.lucene.analysis.icu.segmentation;

 import com.ibm.icu.text.BreakIterator;
 import com.ibm.icu.text.RuleBasedBreakIterator;
 import com.ibm.icu.text.UTF16;
 import com.ibm.icu.text.UnicodeSet;

 /**
  * Wraps RuleBasedBreakIterator, making object reuse convenient and
  * emitting a rule status for emoji sequences.
  * @lucene.experimental
  */
 final class BreakIteratorWrapper {
   // Code points whose emoji status is ambiguous as far as token types go (keycap bases etc.).
   // See unicode doc L2/16-315 for rationale.
   static final UnicodeSet EMOJI_RK = new UnicodeSet("[\u002a\u00230-9©®™〰〽]").freeze();
   // Frozen lookup set: faster than doing hasBinaryProperty() checks, at the cost of 1KB ram.
   static final UnicodeSet EMOJI = new UnicodeSet("[[:Emoji:][:Extended_Pictographic:]]").freeze();

   /** Reusable character iterator handed to the wrapped break iterator on every setText call. */
   private final CharArrayIterator textIterator = new CharArrayIterator();
   /** The wrapped break iterator that performs the actual segmentation. */
   private final RuleBasedBreakIterator rbbi;
   /** Buffer most recently supplied through setText. */
   private char[] text;
   /** Offset into the buffer that break positions are added to when indexing it. */
   private int start;
   /** Rule status computed by the most recent call to next(). */
   private int status;

   /** Creates a wrapper that delegates segmentation to the given break iterator. */
   BreakIteratorWrapper(RuleBasedBreakIterator rbbi) {
     this.rbbi = rbbi;
   }

   /** Returns the current position of the wrapped break iterator. */
   int current() {
     return rbbi.current();
   }

   /** Returns the status stored by the last next() call (WORD_NONE right after setText). */
   int getRuleStatus() {
     return status;
   }

   /**
    * Advances the wrapped iterator and records the rule status (which determines token type)
    * for the text between the previous break and the new one.
    *
    * @return the new break position as reported by the wrapped iterator
    */
   int next() {
     final int from = rbbi.current();
     final int to = rbbi.next();
     // to support presentation selectors, we need to handle alphanum, num, and none at least,
     // so currently not worth optimizing.
     // https://unicode.org/cldr/utility/list-unicodeset.jsp?a=%5B%3AEmoji%3A%5D-%5B%3AEmoji_Presentation%3A%5D&g=Word_Break&i=
     status = (to != BreakIterator.DONE && isEmoji(from, to))
         ? ICUTokenizerConfig.EMOJI_SEQUENCE_STATUS
         : rbbi.getRuleStatus();
     return to;
   }

   /**
    * Returns true if the text between the two break positions represents an emoji character
    * or sequence. Only the leading code point, and for ambiguous ones the char right after it,
    * is inspected.
    */
   private boolean isEmoji(int current, int next) {
     final int begin = start + current;
     final int end = start + next;
     final int leading = UTF16.charAt(text, 0, end, begin);
     if (!EMOJI.contains(leading)) {
       return false;
     }
     if (!EMOJI_RK.contains(leading)) {
       return true;
     }
     // A code point in EMOJI_RK is not treated as emoji unless there is evidence that it forms
     // an emoji sequence: an emoji presentation selector or keycap must follow.
     final int trailer = begin + Character.charCount(leading);
     if (trailer >= end) {
       return false;
     }
     final char follower = text[trailer];
     return follower == 0xFE0F || follower == 0x20E3;
   }

   /**
    * Points this wrapper and the wrapped break iterator at a new region of text and resets the
    * stored rule status to WORD_NONE.
    */
   void setText(char text[], int start, int length) {
     this.text = text;
     this.start = start;
     textIterator.setText(text, start, length);
     rbbi.setText(textIterator);
     status = RuleBasedBreakIterator.WORD_NONE;
   }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package org.apache.lucene.analysis.icu.segmentation;

	import com.ibm.icu.text.BreakIterator;
	import com.ibm.icu.text.RuleBasedBreakIterator;
	import com.ibm.icu.text.UTF16;
	import com.ibm.icu.text.UnicodeSet;

	/**
	* Wraps RuleBasedBreakIterator, making object reuse convenient and
	* emitting a rule status for emoji sequences.
	* @lucene.experimental
	*/
	final class BreakIteratorWrapper {
	private final CharArrayIterator textIterator = new CharArrayIterator();
	private final RuleBasedBreakIterator rbbi;
	private char text[];
	private int start;
	private int status;

	BreakIteratorWrapper(RuleBasedBreakIterator rbbi) {
	this.rbbi = rbbi;
	}

	int current() {
	return rbbi.current();
	}

	int getRuleStatus() {
	return status;
	}

	int next() {
	int current = rbbi.current();
	int next = rbbi.next();
	status = calcStatus(current, next);
	return next;
	}

	/** Returns current rule status for the text between breaks. (determines token type) */
	private int calcStatus(int current, int next) {
	// to support presentation selectors, we need to handle alphanum, num, and none at least, so currently not worth optimizing.
	// https://unicode.org/cldr/utility/list-unicodeset.jsp?a=%5B%3AEmoji%3A%5D-%5B%3AEmoji_Presentation%3A%5D&g=Word_Break&i=
	if (next != BreakIterator.DONE && isEmoji(current, next)) {
	return ICUTokenizerConfig.EMOJI_SEQUENCE_STATUS;
	} else {
	return rbbi.getRuleStatus();
	}
	}

	// See unicode doc L2/16-315 for rationale.
	// basically for us the ambiguous cases (keycap/etc) as far as types go.
	static final UnicodeSet EMOJI_RK = new UnicodeSet("[\u002a\u00230-9©®™〰〽]").freeze();
	// faster than doing hasBinaryProperty() checks, at the cost of 1KB ram
	static final UnicodeSet EMOJI = new UnicodeSet("[[:Emoji:][:Extended_Pictographic:]]").freeze();

	/** Returns true if the current text represents emoji character or sequence */
	private boolean isEmoji(int current, int next) {
	int begin = start + current;
	int end = start + next;
	int codepoint = UTF16.charAt(text, 0, end, begin);
	if (EMOJI.contains(codepoint)) {
	if (EMOJI_RK.contains(codepoint)) {
	// if its in EmojiRK, we don't treat it as emoji unless there is evidence it forms emoji sequence,
	// an emoji presentation selector or keycap follows.
	int trailer = begin + Character.charCount(codepoint);
	return trailer < end && (text[trailer] == 0xFE0F \|\| text[trailer] == 0x20E3);
	} else {
	return true;
	}
	}
	return false;
	}

	void setText(char text[], int start, int length) {
	this.text = text;
	this.start = start;
	textIterator.setText(text, start, length);
	rbbi.setText(textIterator);
	status = RuleBasedBreakIterator.WORD_NONE;
	}
	}