hyracks-dataflow-common/src/main/java/edu/uci/ics/hyracks/dataflow/common/data/util/StringUtils.java - asterixdb - Git at Google

 /*
  * Copyright 2009-2010 by The Regents of the University of California
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
  * you may obtain a copy of the License from
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package edu.uci.ics.hyracks.dataflow.common.data.util;

 import java.io.DataOutput;
 import java.io.IOException;

 /**
  * Static helpers for strings stored as a two-byte big-endian byte length
  * followed by modified UTF-8 data (the layout produced by
  * {@link java.io.DataOutput#writeUTF(String)}), where every {@code char}
  * occupies one, two or three bytes.
  */
 public class StringUtils {
 	/**
 	 * Decodes the character whose encoding starts at {@code b[s]}.
 	 *
 	 * @param b buffer holding modified UTF-8 encoded characters
 	 * @param s offset of the first (lead) byte of the character
 	 * @return the decoded character
 	 * @throws IllegalArgumentException if {@code b[s]} is not a one-, two- or
 	 *             three-byte lead byte
 	 */
 	public static char charAt(byte[] b, int s) {
 		int c = b[s] & 0xff;
 		switch (c >> 4) {
 		case 0:
 		case 1:
 		case 2:
 		case 3:
 		case 4:
 		case 5:
 		case 6:
 		case 7:
 			// 0xxxxxxx: the byte is the character
 			return (char) c;

 		case 12:
 		case 13:
 			// 110xxxxx 10xxxxxx
 			return (char) (((c & 0x1F) << 6) | (b[s + 1] & 0x3F));

 		case 14:
 			// 1110xxxx 10xxxxxx 10xxxxxx
 			return (char) (((c & 0x0F) << 12) | ((b[s + 1] & 0x3F) << 6) | (b[s + 2] & 0x3F));

 		default:
 			throw new IllegalArgumentException(
 					"Invalid lead byte 0x" + Integer.toHexString(c) + " at offset " + s);
 		}
 	}

 	/**
 	 * Returns how many bytes the character starting at {@code b[s]} occupies.
 	 *
 	 * @param b buffer holding modified UTF-8 encoded characters
 	 * @param s offset of the first (lead) byte of the character
 	 * @return 1, 2 or 3
 	 * @throws IllegalStateException if {@code b[s]} is not a one-, two- or
 	 *             three-byte lead byte
 	 */
 	public static int charSize(byte[] b, int s) {
 		int c = b[s] & 0xff;
 		switch (c >> 4) {
 		case 0:
 		case 1:
 		case 2:
 		case 3:
 		case 4:
 		case 5:
 		case 6:
 		case 7:
 			return 1;

 		case 12:
 		case 13:
 			return 2;

 		case 14:
 			return 3;

 		default:
 			throw new IllegalStateException(
 					"Invalid lead byte 0x" + Integer.toHexString(c) + " at offset " + s);
 		}
 	}

 	/**
 	 * Returns the number of bytes {@link #writeCharAsModifiedUTF8} emits for
 	 * {@code c}.
 	 *
 	 * @param c the character to measure
 	 * @return 1 for U+0001..U+007F, 2 for U+0000 and U+0080..U+07FF, 3 otherwise
 	 */
 	public static int getModifiedUTF8Len(char c) {
 		// U+0000 is deliberately excluded from the one-byte range: modified
 		// UTF-8 stores it as the two bytes 0xC0 0x80.
 		if (c >= 0x0001 && c <= 0x007F) {
 			return 1;
 		} else if (c <= 0x07FF) {
 			return 2;
 		} else {
 			return 3;
 		}
 	}

 	/**
 	 * Counts the characters of the length-prefixed string starting at
 	 * {@code b[s]}.
 	 *
 	 * @param b buffer holding the string
 	 * @param s offset of the two-byte length prefix
 	 * @return the number of {@code char}s encoded in the string
 	 * @throws IllegalStateException if an invalid lead byte is encountered
 	 */
 	public static int getStrLen(byte[] b, int s) {
 		int pos = s + 2; // skip the length prefix
 		int end = pos + getUTFLen(b, s);
 		int charCount = 0;
 		while (pos < end) {
 			charCount++;
 			pos += charSize(b, pos);
 		}
 		return charCount;
 	}

 	/**
 	 * Reads the two-byte big-endian length prefix at {@code b[s]}.
 	 *
 	 * @param b buffer holding the string
 	 * @param s offset of the length prefix
 	 * @return the unsigned 16-bit value stored at {@code b[s], b[s + 1]}
 	 */
 	public static int getUTFLen(byte[] b, int s) {
 		return ((b[s] & 0xff) << 8) | (b[s + 1] & 0xff);
 	}

 	/**
 	 * Converts a single character to lower case.
 	 *
 	 * <p>ASCII capitals are handled inline; everything else is delegated to
 	 * {@link Character#toLowerCase(char)}, which already covers the accented
 	 * letters that used to be listed by hand here.
 	 *
 	 * @param c the character to convert
 	 * @return the lower-case form of {@code c}, or {@code c} if it has none
 	 */
 	public static char toLowerCase(char c) {
 		if (c >= 'A' && c <= 'Z') {
 			return (char) (c + ('a' - 'A'));
 		}
 		return Character.toLowerCase(c);
 	}

 	/**
 	 * Writes {@code c} to {@code dos} in modified UTF-8.
 	 *
 	 * @param c   the character to encode
 	 * @param dos the sink receiving one, two or three bytes
 	 * @throws IOException if {@code dos} fails
 	 */
 	public static void writeCharAsModifiedUTF8(char c, DataOutput dos)
 			throws IOException {
 		// U+0000 must take the two-byte branch (0xC0 0x80), see
 		// getModifiedUTF8Len.
 		if (c >= 0x0001 && c <= 0x007F) {
 			dos.writeByte(c);
 		} else if (c <= 0x07FF) {
 			dos.writeByte((byte) (0xC0 | ((c >> 6) & 0x1F)));
 			dos.writeByte((byte) (0x80 | (c & 0x3F)));
 		} else {
 			dos.writeByte((byte) (0xE0 | ((c >> 12) & 0x0F)));
 			dos.writeByte((byte) (0x80 | ((c >> 6) & 0x3F)));
 			dos.writeByte((byte) (0x80 | (c & 0x3F)));
 		}
 	}

 	/**
 	 * Writes {@code len} as a two-byte big-endian length prefix.
 	 *
 	 * <p>Only the low 16 bits of {@code len} are written; higher bits are
 	 * dropped without any error.
 	 *
 	 * @param len the byte length to record
 	 * @param dos the sink receiving the two bytes
 	 * @throws IOException if {@code dos} fails
 	 */
 	public static void writeUTF8Len(int len, DataOutput dos) throws IOException {
 		dos.write((len >>> 8) & 0xFF);
 		dos.write(len & 0xFF);
 	}
 }
	/*
	* Copyright 2009-2010 by The Regents of the University of California
	* Licensed under the Apache License, Version 2.0 (the "License");
	* you may not use this file except in compliance with the License.
	* you may obtain a copy of the License from
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package edu.uci.ics.hyracks.dataflow.common.data.util;

	import java.io.DataOutput;
	import java.io.IOException;

	/**
	 * Static helpers for strings stored as a two-byte big-endian byte length
	 * followed by modified UTF-8 data (the layout produced by
	 * {@link java.io.DataOutput#writeUTF(String)}), where every {@code char}
	 * occupies one, two or three bytes.
	 */
	public class StringUtils {
		/**
		 * Decodes the character whose encoding starts at {@code b[s]}.
		 *
		 * @param b buffer holding modified UTF-8 encoded characters
		 * @param s offset of the first (lead) byte of the character
		 * @return the decoded character
		 * @throws IllegalArgumentException if {@code b[s]} is not a one-, two- or
		 *             three-byte lead byte
		 */
		public static char charAt(byte[] b, int s) {
			int c = b[s] & 0xff;
			switch (c >> 4) {
			case 0:
			case 1:
			case 2:
			case 3:
			case 4:
			case 5:
			case 6:
			case 7:
				// 0xxxxxxx: the byte is the character
				return (char) c;

			case 12:
			case 13:
				// 110xxxxx 10xxxxxx
				return (char) (((c & 0x1F) << 6) | (b[s + 1] & 0x3F));

			case 14:
				// 1110xxxx 10xxxxxx 10xxxxxx
				return (char) (((c & 0x0F) << 12) | ((b[s + 1] & 0x3F) << 6) | (b[s + 2] & 0x3F));

			default:
				throw new IllegalArgumentException(
						"Invalid lead byte 0x" + Integer.toHexString(c) + " at offset " + s);
			}
		}

		/**
		 * Returns how many bytes the character starting at {@code b[s]} occupies.
		 *
		 * @param b buffer holding modified UTF-8 encoded characters
		 * @param s offset of the first (lead) byte of the character
		 * @return 1, 2 or 3
		 * @throws IllegalStateException if {@code b[s]} is not a one-, two- or
		 *             three-byte lead byte
		 */
		public static int charSize(byte[] b, int s) {
			int c = b[s] & 0xff;
			switch (c >> 4) {
			case 0:
			case 1:
			case 2:
			case 3:
			case 4:
			case 5:
			case 6:
			case 7:
				return 1;

			case 12:
			case 13:
				return 2;

			case 14:
				return 3;

			default:
				throw new IllegalStateException(
						"Invalid lead byte 0x" + Integer.toHexString(c) + " at offset " + s);
			}
		}

		/**
		 * Returns the number of bytes {@link #writeCharAsModifiedUTF8} emits for
		 * {@code c}.
		 *
		 * @param c the character to measure
		 * @return 1 for U+0001..U+007F, 2 for U+0000 and U+0080..U+07FF, 3 otherwise
		 */
		public static int getModifiedUTF8Len(char c) {
			// U+0000 is deliberately excluded from the one-byte range: modified
			// UTF-8 stores it as the two bytes 0xC0 0x80.
			if (c >= 0x0001 && c <= 0x007F) {
				return 1;
			} else if (c <= 0x07FF) {
				return 2;
			} else {
				return 3;
			}
		}

		/**
		 * Counts the characters of the length-prefixed string starting at
		 * {@code b[s]}.
		 *
		 * @param b buffer holding the string
		 * @param s offset of the two-byte length prefix
		 * @return the number of {@code char}s encoded in the string
		 * @throws IllegalStateException if an invalid lead byte is encountered
		 */
		public static int getStrLen(byte[] b, int s) {
			int pos = s + 2; // skip the length prefix
			int end = pos + getUTFLen(b, s);
			int charCount = 0;
			while (pos < end) {
				charCount++;
				pos += charSize(b, pos);
			}
			return charCount;
		}

		/**
		 * Reads the two-byte big-endian length prefix at {@code b[s]}.
		 *
		 * @param b buffer holding the string
		 * @param s offset of the length prefix
		 * @return the unsigned 16-bit value stored at {@code b[s], b[s + 1]}
		 */
		public static int getUTFLen(byte[] b, int s) {
			return ((b[s] & 0xff) << 8) | (b[s + 1] & 0xff);
		}

		/**
		 * Converts a single character to lower case.
		 *
		 * <p>ASCII capitals are handled inline; everything else is delegated to
		 * {@link Character#toLowerCase(char)}, which already covers the accented
		 * letters that used to be listed by hand here.
		 *
		 * @param c the character to convert
		 * @return the lower-case form of {@code c}, or {@code c} if it has none
		 */
		public static char toLowerCase(char c) {
			if (c >= 'A' && c <= 'Z') {
				return (char) (c + ('a' - 'A'));
			}
			return Character.toLowerCase(c);
		}

		/**
		 * Writes {@code c} to {@code dos} in modified UTF-8.
		 *
		 * @param c   the character to encode
		 * @param dos the sink receiving one, two or three bytes
		 * @throws IOException if {@code dos} fails
		 */
		public static void writeCharAsModifiedUTF8(char c, DataOutput dos)
				throws IOException {
			// U+0000 must take the two-byte branch (0xC0 0x80), see
			// getModifiedUTF8Len.
			if (c >= 0x0001 && c <= 0x007F) {
				dos.writeByte(c);
			} else if (c <= 0x07FF) {
				dos.writeByte((byte) (0xC0 | ((c >> 6) & 0x1F)));
				dos.writeByte((byte) (0x80 | (c & 0x3F)));
			} else {
				dos.writeByte((byte) (0xE0 | ((c >> 12) & 0x0F)));
				dos.writeByte((byte) (0x80 | ((c >> 6) & 0x3F)));
				dos.writeByte((byte) (0x80 | (c & 0x3F)));
			}
		}

		/**
		 * Writes {@code len} as a two-byte big-endian length prefix.
		 *
		 * <p>Only the low 16 bits of {@code len} are written; higher bits are
		 * dropped without any error.
		 *
		 * @param len the byte length to record
		 * @param dos the sink receiving the two bytes
		 * @throws IOException if {@code dos} fails
		 */
		public static void writeUTF8Len(int len, DataOutput dos) throws IOException {
			dos.write((len >>> 8) & 0xFF);
			dos.write(len & 0xFF);
		}
	}