components/camel-bindy/src/main/java/org/apache/camel/dataformat/bindy/UnicodeHelper.java - camel - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *      http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.camel.dataformat.bindy;

 import java.io.Serializable;
 import java.util.ArrayList;
 import java.util.List;
 import java.util.stream.Collectors;

 import com.ibm.icu.text.BreakIterator;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

 /**
  * This class replicates the essential parts of the String class in order to aid
  * proper work for Unicode chars in the presense of UTF-16. So for all operations
  * please see {@link String} with the same signature. This class is equally immutable.
  */
 public class UnicodeHelper implements Serializable {
     /**
      * Defines how length if a string is defined, i.e how chars are counted.
      */
     public enum Method {
         /**
          * One "char" is one Unicode codepoint, which is the standard case.
          */
         CODEPOINTS,

         /**
          * One "char" is one graphem.
          */
         GRAPHEME;
     }

     private static final Logger LOG = LoggerFactory.getLogger(UnicodeHelper.class);

     private String input;

     private List<Integer> splitted;

     private Method method;

     /**
      * Create instance.
      *
      * @param input
      *         String, that is to be wrapped.
      * @param method
      *         Method, that is used to determin "chars" of string.
      */
     public UnicodeHelper(final String input, final Method method) {
         this.input = input;
         this.method = method;
         this.splitted = null;
     }

     /**
      * For Serialization only!
      */
     protected UnicodeHelper() {
         // Empty
     }

     /**
      * @return
      *         Returns the method used to determining the string length.
      */
     public Method getMethod() {
         return method;
     }

     /**
      * @see String#substring(int)
      */
     public String substring(final int beginIndex) {
         split();

         final int beginChar = splitted.get(beginIndex);
         return input.substring(beginChar);
     }

     /**
      * @see String#substring(int, int)
      */
     public String substring(final int beginIndex, final int endIndex) {
         split();

         final int beginChar = splitted.get(beginIndex);
         final int endChar = splitted.get(endIndex);
         return input.substring(beginChar, endChar);
     }

     /**
      * @see String#length()
      */
     public int length() {
         split();

         return splitted.size() - 1;
     }

     /**
      * @see String#indexOf(String)
      */
     public int indexOf(final String str) {
         return indexOf(str, 0);
     }

     /**
      * @see String#indexOf(String, int)
      */
     public int indexOf(final String str, final int fromIndex) {
         split();

         final int len = new UnicodeHelper(str, method).length();

         for (int index = fromIndex; index + len < length(); index++) {
             if (str.equals(input.substring(splitted.get(index), splitted.get(index + len)))) {
                 return index;
             }
         }

         return -1;
     }

     private void split() {
         if (this.splitted != null) {
             return;
         }

         if (method.equals(Method.CODEPOINTS)) {
             splitCodepoints();

         } else /* (method.equals(Method.GRAPHEME)) */ {
             splitGrapheme();
         }

         LOG.debug("\"{}\" is splitted into {} ({} {}).", input, splitted, splitted.size() - 1, method);
         if (LOG.isTraceEnabled()) {
             for (int i = 0; i < splitted.size() - 2; i++) {
                 LOG.trace("segment [{},{}[=\"{}\".", splitted.get(i), splitted.get(i + 1), input.substring(splitted.get(i), splitted.get(i + 1)));
             }
         }
     }

     private void splitCodepoints() {
         final List<Integer> result = new ArrayList<>();

         int i = 0;
         final int len = input.length();
         while (i < len) {
             result.add(i);
             i += (Character.codePointAt(input, i) > 0xffff) ? 2 : 1;
         }
         result.add(len);

         this.splitted = result;
     }

     private void splitGrapheme() {
         final List<Integer> result = new ArrayList<>();

         //
         // Caution: The BreakIterator of ICU lib (com.ibm.icu.text.BreakIterator; siehe Dependencies) ist used here,
         //          since the Java builtin one cannot handle modern unicode (Emojis with sex, skin colour, etc.) correctly.
         //
         final BreakIterator bit = BreakIterator.getCharacterInstance();
         bit.setText(input);

         result.add(bit.first());
         for (int end = bit.next(); end != BreakIterator.DONE; end = bit.next()) {
             result.add(end);
         }
         this.splitted = result;
     }

     @Override
     public String toString() {
         return "StringHelper [input=" + input + ", splitted=" + splitted + ", method=" + method + "]";
     }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package org.apache.camel.dataformat.bindy;

	import java.io.Serializable;
	import java.util.ArrayList;
	import java.util.List;
	import java.util.stream.Collectors;

	import com.ibm.icu.text.BreakIterator;
	import org.slf4j.Logger;
	import org.slf4j.LoggerFactory;

	/**
	* This class replicates the essential parts of the String class in order to aid
	* proper work for Unicode chars in the presense of UTF-16. So for all operations
	* please see {@link String} with the same signature. This class is equally immutable.
	*/
	public class UnicodeHelper implements Serializable {
	/**
	* Defines how length if a string is defined, i.e how chars are counted.
	*/
	public enum Method {
	/**
	* One "char" is one Unicode codepoint, which is the standard case.
	*/
	CODEPOINTS,

	/**
	* One "char" is one graphem.
	*/
	GRAPHEME;
	}

	private static final Logger LOG = LoggerFactory.getLogger(UnicodeHelper.class);

	private String input;

	private List<Integer> splitted;

	private Method method;

	/**
	* Create instance.
	*
	* @param input
	* String, that is to be wrapped.
	* @param method
	* Method, that is used to determin "chars" of string.
	*/
	public UnicodeHelper(final String input, final Method method) {
	this.input = input;
	this.method = method;
	this.splitted = null;
	}

	/**
	* For Serialization only!
	*/
	protected UnicodeHelper() {
	// Empty
	}

	/**
	* @return
	* Returns the method used to determining the string length.
	*/
	public Method getMethod() {
	return method;
	}

	/**
	* @see String#substring(int)
	*/
	public String substring(final int beginIndex) {
	split();

	final int beginChar = splitted.get(beginIndex);
	return input.substring(beginChar);
	}

	/**
	* @see String#substring(int, int)
	*/
	public String substring(final int beginIndex, final int endIndex) {
	split();

	final int beginChar = splitted.get(beginIndex);
	final int endChar = splitted.get(endIndex);
	return input.substring(beginChar, endChar);
	}

	/**
	* @see String#length()
	*/
	public int length() {
	split();

	return splitted.size() - 1;
	}

	/**
	* @see String#indexOf(String)
	*/
	public int indexOf(final String str) {
	return indexOf(str, 0);
	}

	/**
	* @see String#indexOf(String, int)
	*/
	public int indexOf(final String str, final int fromIndex) {
	split();

	final int len = new UnicodeHelper(str, method).length();

	for (int index = fromIndex; index + len < length(); index++) {
	if (str.equals(input.substring(splitted.get(index), splitted.get(index + len)))) {
	return index;
	}
	}

	return -1;
	}

	private void split() {
	if (this.splitted != null) {
	return;
	}

	if (method.equals(Method.CODEPOINTS)) {
	splitCodepoints();

	} else /* (method.equals(Method.GRAPHEME)) */ {
	splitGrapheme();
	}

	LOG.debug("\"{}\" is splitted into {} ({} {}).", input, splitted, splitted.size() - 1, method);
	if (LOG.isTraceEnabled()) {
	for (int i = 0; i < splitted.size() - 2; i++) {
	LOG.trace("segment [{},{}[=\"{}\".", splitted.get(i), splitted.get(i + 1), input.substring(splitted.get(i), splitted.get(i + 1)));
	}
	}
	}

	private void splitCodepoints() {
	final List<Integer> result = new ArrayList<>();

	int i = 0;
	final int len = input.length();
	while (i < len) {
	result.add(i);
	i += (Character.codePointAt(input, i) > 0xffff) ? 2 : 1;
	}
	result.add(len);

	this.splitted = result;
	}

	private void splitGrapheme() {
	final List<Integer> result = new ArrayList<>();

	//
	// Caution: The BreakIterator of ICU lib (com.ibm.icu.text.BreakIterator; siehe Dependencies) ist used here,
	// since the Java builtin one cannot handle modern unicode (Emojis with sex, skin colour, etc.) correctly.
	//
	final BreakIterator bit = BreakIterator.getCharacterInstance();
	bit.setText(input);

	result.add(bit.first());
	for (int end = bit.next(); end != BreakIterator.DONE; end = bit.next()) {
	result.add(end);
	}
	this.splitted = result;
	}

	@Override
	public String toString() {
	return "StringHelper [input=" + input + ", splitted=" + splitted + ", method=" + method + "]";
	}
	}