lucene/core/src/java/org/apache/lucene/util/automaton/UTF32ToUTF8.java - lucene-solr - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.lucene.util.automaton;


 import java.util.Arrays;
 import java.util.ArrayList;
 import java.util.List;

 // TODO
 //   - do we really need the .bits...?  if not we can make util in UnicodeUtil to convert 1 char into a BytesRef

 /**
  * Converts UTF-32 automata to the equivalent UTF-8 representation.
  * @lucene.internal
  */
 public final class UTF32ToUTF8 {

   // First / last code point that encodes to 1, 2, 3 and 4 UTF-8 bytes
   private static final int[] startCodes = new int[] {0, 128, 2048, 65536};
   private static final int[] endCodes = new int[] {127, 2047, 65535, 1114111};

   // MASKS[i] has its low (i+1) bits set
   static int[] MASKS = new int[32];
   static {
     for (int idx = 0; idx < 32; idx++) {
       // shift as a long so idx == 31 is well defined; the cast narrows it to -1
       MASKS[idx] = (int) ((1L << (idx + 1)) - 1);
     }
   }

   // One byte of a UTF-8 encoded code point: value is the byte
   // value, bits is how many payload bits UTF-8 stores in it
   private static class UTF8Byte {
     int value;                                    // TODO: change to byte
     byte bits;
   }

   // A single code point held as its 1-4 UTF-8 bytes:
   // TODO: maybe move to UnicodeUtil?
   private static class UTF8Sequence {
     private final UTF8Byte[] bytes;
     private int len;

     public UTF8Sequence() {
       bytes = new UTF8Byte[] {new UTF8Byte(), new UTF8Byte(), new UTF8Byte(), new UTF8Byte()};
     }

     public int byteAt(int idx) {
       return bytes[idx].value;
     }

     public int numBits(int idx) {
       return bytes[idx].bits;
     }

     // Encodes code into bytes[0..len)
     private void set(int code) {
       final UTF8Byte lead = bytes[0];
       final int trailing;
       if (code < 128) {
         // 0xxxxxxx
         lead.value = code;
         lead.bits = 7;
         trailing = 0;
       } else if (code < 2048) {
         // 110yyyxx 10xxxxxx
         lead.value = (6 << 5) | (code >> 6);
         lead.bits = 5;
         trailing = 1;
       } else if (code < 65536) {
         // 1110yyyy 10yyyyxx 10xxxxxx
         lead.value = (14 << 4) | (code >> 12);
         lead.bits = 4;
         trailing = 2;
       } else {
         // 11110zzz 10zzyyyy 10yyyyxx 10xxxxxx
         lead.value = (30 << 3) | (code >> 18);
         lead.bits = 3;
         trailing = 3;
       }
       // no-op when trailing == 0
       setRest(code, trailing);
       len = trailing + 1;
     }

     // Fills the continuation bytes bytes[numBytes]..bytes[1], consuming
     // six low bits of code per byte, last byte first
     private void setRest(int code, int numBytes) {
       for (int pos = numBytes; pos >= 1; pos--) {
         final UTF8Byte cont = bytes[pos];
         cont.value = 128 | (code & MASKS[5]);
         cont.bits = 6;
         code >>= 6;
       }
     }

     @Override
     public String toString() {
       StringBuilder out = new StringBuilder();
       for (int i = 0; i < len; i++) {
         if (i != 0) {
           out.append(' ');
         }
         out.append(Integer.toBinaryString(bytes[i].value));
       }
       return out.toString();
     }
   }

   /** Sole constructor. */
   public UTF32ToUTF8() {
   }

   // Scratch sequences, overwritten on every convertOneEdge call
   private final UTF8Sequence startUTF8 = new UTF8Sequence();
   private final UTF8Sequence endUTF8 = new UTF8Sequence();

   // Scratch sequences used by build for the per-length "middle" ranges
   private final UTF8Sequence tmpUTF8a = new UTF8Sequence();
   private final UTF8Sequence tmpUTF8b = new UTF8Sequence();

   // Adds to utf8 the byte-level edges from state start to state end
   // for the code point range [startCodePoint, endCodePoint]
   void convertOneEdge(int start, int end, int startCodePoint, int endCodePoint) {
     startUTF8.set(startCodePoint);
     endUTF8.set(endCodePoint);
     build(start, end, startUTF8, endUTF8, 0);
   }

   // Adds the edges for the range startUTF8..endUTF8, looking at the
   // bytes from position upto onward
   private void build(int start, int end, UTF8Sequence startUTF8, UTF8Sequence endUTF8, int upto) {
     final int lo = startUTF8.byteAt(upto);
     final int hi = endUTF8.byteAt(upto);
     final boolean startAtLastByte = upto == startUTF8.len-1;

     if (lo == hi) {
       // Both bounds lead with the same byte
       if (startAtLastByte && upto == endUTF8.len-1) {
         // ... and it is the final byte of both: one single-byte edge
         utf8.addTransition(start, end, lo, hi);
         return;
       }
       assert startUTF8.len > upto+1;
       assert endUTF8.len > upto+1;
       final int shared = utf8.createState();

       // Consume the common byte, then handle the remaining bytes
       utf8.addTransition(start, shared, lo);
       build(shared, end, startUTF8, endUTF8, 1+upto);
       return;
     }

     if (startUTF8.len == endUTF8.len) {
       if (startAtLastByte) {
         // Final byte of both bounds: one range edge
         utf8.addTransition(start, end, lo, hi);
         return;
       }
       // Split into: sequences led by lo, lead bytes strictly between, sequences led by hi
       start(start, end, startUTF8, upto, false);
       if (hi - lo > 1) {
         all(start, end, lo+1, hi-1, startUTF8.len-upto-1);
       }
       end(start, end, endUTF8, upto, false);
       return;
     }

     // The bounds encode to different lengths.
     // First everything from startUTF8 upward within its own length:
     start(start, end, startUTF8, upto, true);

     // then every whole encoded length strictly between the two
     // (wasteful: only the first byte of each tmp sequence is needed,
     // and it could be encoded statically):
     final int limit = endUTF8.len-upto;
     for (int byteCount = 1+startUTF8.len-upto; byteCount < limit; byteCount++) {
       tmpUTF8a.set(startCodes[byteCount-1]);
       tmpUTF8b.set(endCodes[byteCount-1]);
       all(start, end,
           tmpUTF8a.byteAt(0),
           tmpUTF8b.byteAt(0),
           tmpUTF8a.len - 1);
     }

     // and finally everything up to endUTF8 within its length:
     end(start, end, endUTF8, upto, true);
   }

   // Lower-bound side of a range.  On the last byte this adds an edge from
   // that byte up to the same byte with all payload bits set.  Otherwise it
   // consumes the byte at upto, recurses on the next byte and, when doAll
   // is set, also accepts every larger byte with the same prefix bits
   // followed by any continuation bytes.
   private void start(int start, int end, UTF8Sequence startUTF8, int upto, boolean doAll) {
     final int lo = startUTF8.byteAt(upto);
     final int hi = lo | MASKS[startUTF8.numBits(upto)-1];
     if (upto == startUTF8.len-1) {
       // Done recursing
       utf8.addTransition(start, end, lo, hi);
       return;
     }
     final int next = utf8.createState();
     utf8.addTransition(start, next, lo);
     start(next, end, startUTF8, 1+upto, true);
     if (doAll && lo != hi) {
       all(start, end, lo+1, hi, startUTF8.len-upto-1);
     }
   }

   // Upper-bound side of a range; mirror image of start().  On the last
   // byte this adds an edge from that byte with its payload bits cleared
   // up to the byte itself.  Otherwise, when doAll is set, it first accepts
   // every smaller byte followed by any continuation bytes, then consumes
   // the byte at upto and recurses on the next byte.
   private void end(int start, int end, UTF8Sequence endUTF8, int upto, boolean doAll) {
     final int hi = endUTF8.byteAt(upto);
     final int bits = endUTF8.numBits(upto);
     if (upto == endUTF8.len-1) {
       // Done recursing
       utf8.addTransition(start, end, hi & (~MASKS[bits-1]), hi);
       return;
     }
     // special case -- for a 2-byte lead start at 194 rather than 192, to
     // avoid creating unused edges (endUTF8 doesn't accept certain byte
     // sequences) -- there are other cases we could optimize too:
     final int lo = (bits == 5) ? 194 : (hi & (~MASKS[bits-1]));
     if (doAll && hi != lo) {
       all(start, end, lo, hi-1, endUTF8.len-upto-1);
     }
     final int next = utf8.createState();
     utf8.addTransition(start, next, hi);
     end(next, end, endUTF8, 1+upto, true);
   }

   // Adds a path from start to end that accepts one byte in
   // [startCode, endCode] followed by `left` bytes in [128, 191]
   private void all(int start, int end, int startCode, int endCode, int left) {
     if (left == 0) {
       utf8.addTransition(start, end, startCode, endCode);
       return;
     }
     int from = utf8.createState();
     utf8.addTransition(start, from, startCode, endCode);
     // one fresh state per continuation byte except the last
     for (int remaining = left; remaining > 1; remaining--) {
       final int to = utf8.createState();
       utf8.addTransition(from, to, 128, 191);
       from = to;
     }
     // the last continuation byte lands on end
     utf8.addTransition(from, end, 128, 191);
   }

   // Builder receiving the utf8 states and transitions; replaced on each convert call
   Automaton.Builder utf8;

   /** Converts an incoming utf32 automaton to an equivalent
    *  utf8 one.  The incoming automaton need not be
    *  deterministic.  Note that the returned automaton will
    *  not in general be deterministic, so you must
    *  determinize it if that's needed.  An automaton with no
    *  states is returned as-is. */
   public Automaton convert(Automaton utf32) {
     if (utf32.getNumStates() == 0) {
       return utf32;
     }

     // map[s] is the utf8 state standing in for utf32 state s, or -1 if not reached yet
     final int[] map = new int[utf32.getNumStates()];
     Arrays.fill(map, -1);

     utf8 = new Automaton.Builder();

     // Seed the traversal with utf32 state 0
     final int firstUTF8 = utf8.createState();
     utf8.setAccept(firstUTF8, utf32.isAccept(0));
     map[0] = firstUTF8;

     // Stack of utf32 states whose outgoing transitions still need converting
     final List<Integer> pending = new ArrayList<>();
     pending.add(0);

     final Transition scratch = new Transition();

     while (!pending.isEmpty()) {
       final int srcUTF32 = pending.remove(pending.size()-1);
       final int srcUTF8 = map[srcUTF32];
       assert srcUTF8 != -1;

       final int numTransitions = utf32.getNumTransitions(srcUTF32);
       utf32.initTransition(srcUTF32, scratch);
       for (int t = 0; t < numTransitions; t++) {
         utf32.getNextTransition(scratch);
         final int destUTF32 = scratch.dest;
         int destUTF8 = map[destUTF32];
         if (destUTF8 == -1) {
           // First visit: allocate the utf8 counterpart and queue the state
           destUTF8 = utf8.createState();
           utf8.setAccept(destUTF8, utf32.isAccept(destUTF32));
           map[destUTF32] = destUTF8;
           pending.add(destUTF32);
         }

         // Expand the code point range into byte edges in utf8:
         convertOneEdge(srcUTF8, destUTF8, scratch.min, scratch.max);
       }
     }

     return utf8.finish();
   }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package org.apache.lucene.util.automaton;


	import java.util.Arrays;
	import java.util.ArrayList;
	import java.util.List;

	// TODO
	// - do we really need the .bits...? if not we can make util in UnicodeUtil to convert 1 char into a BytesRef

	/**
	* Converts UTF-32 automata to the equivalent UTF-8 representation.
	* @lucene.internal
	*/
	public final class UTF32ToUTF8 {

	// Unicode boundaries for UTF8 bytes 1,2,3,4
	private static final int[] startCodes = new int[] {0, 128, 2048, 65536};
	private static final int[] endCodes = new int[] {127, 2047, 65535, 1114111};

	static int[] MASKS = new int[32];
	static {
	int v = 2;
	for(int i=0;i<32;i++) {
	MASKS[i] = v-1;
	v *= 2;
	}
	}

	// Represents one of the N utf8 bytes that (in sequence)
	// define a code point. value is the byte value; bits is
	// how many bits are "used" by utf8 at that byte
	private static class UTF8Byte {
	int value; // TODO: change to byte
	byte bits;
	}

	// Holds a single code point, as a sequence of 1-4 utf8 bytes:
	// TODO: maybe move to UnicodeUtil?
	private static class UTF8Sequence {
	private final UTF8Byte[] bytes;
	private int len;

	public UTF8Sequence() {
	bytes = new UTF8Byte[4];
	for(int i=0;i<4;i++) {
	bytes[i] = new UTF8Byte();
	}
	}

	public int byteAt(int idx) {
	return bytes[idx].value;
	}

	public int numBits(int idx) {
	return bytes[idx].bits;
	}

	private void set(int code) {
	if (code < 128) {
	// 0xxxxxxx
	bytes[0].value = code;
	bytes[0].bits = 7;
	len = 1;
	} else if (code < 2048) {
	// 110yyyxx 10xxxxxx
	bytes[0].value = (6 << 5) \| (code >> 6);
	bytes[0].bits = 5;
	setRest(code, 1);
	len = 2;
	} else if (code < 65536) {
	// 1110yyyy 10yyyyxx 10xxxxxx
	bytes[0].value = (14 << 4) \| (code >> 12);
	bytes[0].bits = 4;
	setRest(code, 2);
	len = 3;
	} else {
	// 11110zzz 10zzyyyy 10yyyyxx 10xxxxxx
	bytes[0].value = (30 << 3) \| (code >> 18);
	bytes[0].bits = 3;
	setRest(code, 3);
	len = 4;
	}
	}

	private void setRest(int code, int numBytes) {
	for(int i=0;i<numBytes;i++) {
	bytes[numBytes-i].value = 128 \| (code & MASKS[5]);
	bytes[numBytes-i].bits = 6;
	code = code >> 6;
	}
	}

	@Override
	public String toString() {
	StringBuilder b = new StringBuilder();
	for(int i=0;i<len;i++) {
	if (i > 0) {
	b.append(' ');
	}
	b.append(Integer.toBinaryString(bytes[i].value));
	}
	return b.toString();
	}
	}

	/** Sole constructor. */
	public UTF32ToUTF8() {
	}

	private final UTF8Sequence startUTF8 = new UTF8Sequence();
	private final UTF8Sequence endUTF8 = new UTF8Sequence();

	private final UTF8Sequence tmpUTF8a = new UTF8Sequence();
	private final UTF8Sequence tmpUTF8b = new UTF8Sequence();

	// Builds necessary utf8 edges between start & end
	void convertOneEdge(int start, int end, int startCodePoint, int endCodePoint) {
	startUTF8.set(startCodePoint);
	endUTF8.set(endCodePoint);
	build(start, end, startUTF8, endUTF8, 0);
	}

	private void build(int start, int end, UTF8Sequence startUTF8, UTF8Sequence endUTF8, int upto) {

	// Break into start, middle, end:
	if (startUTF8.byteAt(upto) == endUTF8.byteAt(upto)) {
	// Degen case: lead with the same byte:
	if (upto == startUTF8.len-1 && upto == endUTF8.len-1) {
	// Super degen: just single edge, one UTF8 byte:
	utf8.addTransition(start, end, startUTF8.byteAt(upto), endUTF8.byteAt(upto));
	return;
	} else {
	assert startUTF8.len > upto+1;
	assert endUTF8.len > upto+1;
	int n = utf8.createState();

	// Single value leading edge
	utf8.addTransition(start, n, startUTF8.byteAt(upto));
	//start.addTransition(new Transition(startUTF8.byteAt(upto), n)); // type=single

	// Recurse for the rest
	build(n, end, startUTF8, endUTF8, 1+upto);
	}
	} else if (startUTF8.len == endUTF8.len) {
	if (upto == startUTF8.len-1) {
	//start.addTransition(new Transition(startUTF8.byteAt(upto), endUTF8.byteAt(upto), end)); // type=startend
	utf8.addTransition(start, end, startUTF8.byteAt(upto), endUTF8.byteAt(upto));
	} else {
	start(start, end, startUTF8, upto, false);
	if (endUTF8.byteAt(upto) - startUTF8.byteAt(upto) > 1) {
	// There is a middle
	all(start, end, startUTF8.byteAt(upto)+1, endUTF8.byteAt(upto)-1, startUTF8.len-upto-1);
	}
	end(start, end, endUTF8, upto, false);
	}
	} else {

	// start
	start(start, end, startUTF8, upto, true);

	// possibly middle, spanning multiple num bytes
	int byteCount = 1+startUTF8.len-upto;
	final int limit = endUTF8.len-upto;
	while (byteCount < limit) {
	// wasteful: we only need first byte, and, we should
	// statically encode this first byte:
	tmpUTF8a.set(startCodes[byteCount-1]);
	tmpUTF8b.set(endCodes[byteCount-1]);
	all(start, end,
	tmpUTF8a.byteAt(0),
	tmpUTF8b.byteAt(0),
	tmpUTF8a.len - 1);
	byteCount++;
	}

	// end
	end(start, end, endUTF8, upto, true);
	}
	}

	private void start(int start, int end, UTF8Sequence startUTF8, int upto, boolean doAll) {
	if (upto == startUTF8.len-1) {
	// Done recursing
	utf8.addTransition(start, end, startUTF8.byteAt(upto), startUTF8.byteAt(upto) \| MASKS[startUTF8.numBits(upto)-1]); // type=start
	//start.addTransition(new Transition(startUTF8.byteAt(upto), startUTF8.byteAt(upto) \| MASKS[startUTF8.numBits(upto)-1], end)); // type=start
	} else {
	int n = utf8.createState();
	utf8.addTransition(start, n, startUTF8.byteAt(upto));
	//start.addTransition(new Transition(startUTF8.byteAt(upto), n)); // type=start
	start(n, end, startUTF8, 1+upto, true);
	int endCode = startUTF8.byteAt(upto) \| MASKS[startUTF8.numBits(upto)-1];
	if (doAll && startUTF8.byteAt(upto) != endCode) {
	all(start, end, startUTF8.byteAt(upto)+1, endCode, startUTF8.len-upto-1);
	}
	}
	}

	private void end(int start, int end, UTF8Sequence endUTF8, int upto, boolean doAll) {
	if (upto == endUTF8.len-1) {
	// Done recursing
	//start.addTransition(new Transition(endUTF8.byteAt(upto) & (~MASKS[endUTF8.numBits(upto)-1]), endUTF8.byteAt(upto), end)); // type=end
	utf8.addTransition(start, end, endUTF8.byteAt(upto) & (~MASKS[endUTF8.numBits(upto)-1]), endUTF8.byteAt(upto));
	} else {
	final int startCode;
	if (endUTF8.numBits(upto) == 5) {
	// special case -- avoid created unused edges (endUTF8
	// doesn't accept certain byte sequences) -- there
	// are other cases we could optimize too:
	startCode = 194;
	} else {
	startCode = endUTF8.byteAt(upto) & (~MASKS[endUTF8.numBits(upto)-1]);
	}
	if (doAll && endUTF8.byteAt(upto) != startCode) {
	all(start, end, startCode, endUTF8.byteAt(upto)-1, endUTF8.len-upto-1);
	}
	int n = utf8.createState();
	//start.addTransition(new Transition(endUTF8.byteAt(upto), n)); // type=end
	utf8.addTransition(start, n, endUTF8.byteAt(upto));
	end(n, end, endUTF8, 1+upto, true);
	}
	}

	private void all(int start, int end, int startCode, int endCode, int left) {
	if (left == 0) {
	//start.addTransition(new Transition(startCode, endCode, end)); // type=all
	utf8.addTransition(start, end, startCode, endCode);
	} else {
	int lastN = utf8.createState();
	//start.addTransition(new Transition(startCode, endCode, lastN)); // type=all
	utf8.addTransition(start, lastN, startCode, endCode);
	while (left > 1) {
	int n = utf8.createState();
	//lastN.addTransition(new Transition(128, 191, n)); // type=all*
	utf8.addTransition(lastN, n, 128, 191); // type=all*
	left--;
	lastN = n;
	}
	//lastN.addTransition(new Transition(128, 191, end)); // type = all*
	utf8.addTransition(lastN, end, 128, 191); // type = all*
	}
	}

	Automaton.Builder utf8;

	/** Converts an incoming utf32 automaton to an equivalent
	* utf8 one. The incoming automaton need not be
	* deterministic. Note that the returned automaton will
	* not in general be deterministic, so you must
	* determinize it if that's needed. */
	public Automaton convert(Automaton utf32) {
	if (utf32.getNumStates() == 0) {
	return utf32;
	}

	int[] map = new int[utf32.getNumStates()];
	Arrays.fill(map, -1);

	List<Integer> pending = new ArrayList<>();
	int utf32State = 0;
	pending.add(utf32State);
	utf8 = new Automaton.Builder();

	int utf8State = utf8.createState();

	utf8.setAccept(utf8State, utf32.isAccept(utf32State));

	map[utf32State] = utf8State;

	Transition scratch = new Transition();

	while (pending.size() != 0) {
	utf32State = pending.remove(pending.size()-1);
	utf8State = map[utf32State];
	assert utf8State != -1;

	int numTransitions = utf32.getNumTransitions(utf32State);
	utf32.initTransition(utf32State, scratch);
	for(int i=0;i<numTransitions;i++) {
	utf32.getNextTransition(scratch);
	int destUTF32 = scratch.dest;
	int destUTF8 = map[destUTF32];
	if (destUTF8 == -1) {
	destUTF8 = utf8.createState();
	utf8.setAccept(destUTF8, utf32.isAccept(destUTF32));
	map[destUTF32] = destUTF8;
	pending.add(destUTF32);
	}

	// Writes new transitions into pendingTransitions:
	convertOneEdge(utf8State, destUTF8, scratch.min, scratch.max);
	}
	}

	return utf8.finish();
	}
	}