// blob: 8ddddaccee5d76f45e3993ae32e2ae66df40cccc [file] [log] [blame]
/*
* Copyright 2009-2010 by The Regents of the University of California
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* you may obtain a copy of the License from
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package edu.uci.ics.hyracks.dataflow.common.data.parsers;
import java.io.DataOutput;
import java.io.IOException;
import edu.uci.ics.hyracks.api.exceptions.HyracksDataException;
/**
 * Factory for value parsers that encode a span of characters as a length-prefixed
 * string: a two-byte big-endian byte count followed by the characters in the
 * modified UTF-8 form (the same layout that {@link DataOutput#writeUTF(String)}
 * documents).
 */
public class UTF8StringParserFactory implements IValueParserFactory {
    public static final IValueParserFactory INSTANCE = new UTF8StringParserFactory();

    private static final long serialVersionUID = 1L;

    /** Largest payload size, in bytes, that the two-byte length prefix can express. */
    private static final int MAX_ENCODED_LENGTH = 0xFFFF;

    private UTF8StringParserFactory() {
    }

    /**
     * Creates a parser that keeps its own scratch buffer between calls.
     *
     * @return a new parser instance
     */
    @Override
    public IValueParser createValueParser() {
        return new IValueParser() {
            // Scratch space reused across parse() calls; grown on demand, never shrunk.
            private byte[] utf8;

            /**
             * Encodes {@code length} characters of {@code buffer}, beginning at
             * {@code start}, and writes the two-byte length prefix plus the encoded
             * bytes to {@code out} in a single write.
             *
             * @param buffer characters to encode
             * @param start  index of the first character to encode
             * @param length number of characters to encode
             * @param out    destination for the encoded bytes
             * @throws HyracksDataException if the encoded form is longer than 65535
             *                              bytes, or if writing to {@code out} fails
             */
            @Override
            public void parse(char[] buffer, int start, int length, DataOutput out) throws HyracksDataException {
                // First pass: size the payload so the prefix can be emitted up front.
                // Accumulated in a long so a huge input cannot wrap past the limit check.
                long encodedLength = 0;
                for (int i = 0; i < length; i++) {
                    char ch = buffer[start + i];
                    if (ch >= 0x0001 && ch <= 0x007F) {
                        encodedLength++;
                    } else if (ch > 0x07FF) {
                        encodedLength += 3;
                    } else {
                        // Includes U+0000, which is written as two bytes.
                        encodedLength += 2;
                    }
                }

                // The prefix holds only 16 bits; a larger payload would be written with
                // a truncated length and could not be read back correctly.
                if (encodedLength > MAX_ENCODED_LENGTH) {
                    throw new HyracksDataException(new IOException("encoded string too long: " + encodedLength
                            + " bytes (maximum " + MAX_ENCODED_LENGTH + ")"));
                }

                int utflen = (int) encodedLength;
                int totalLength = utflen + 2;
                if (utf8 == null || utf8.length < totalLength) {
                    utf8 = new byte[totalLength];
                }

                int count = 0;
                utf8[count++] = (byte) ((utflen >>> 8) & 0xFF);
                utf8[count++] = (byte) (utflen & 0xFF);

                // Fast path: copy the leading run of single-byte characters verbatim.
                int i = 0;
                for (; i < length; i++) {
                    char ch = buffer[start + i];
                    if (ch < 0x0001 || ch > 0x007F) {
                        break;
                    }
                    utf8[count++] = (byte) ch;
                }

                // General path for the remainder, starting at the first multi-byte character.
                for (; i < length; i++) {
                    char ch = buffer[start + i];
                    if (ch >= 0x0001 && ch <= 0x007F) {
                        utf8[count++] = (byte) ch;
                    } else if (ch > 0x07FF) {
                        utf8[count++] = (byte) (0xE0 | ((ch >> 12) & 0x0F));
                        utf8[count++] = (byte) (0x80 | ((ch >> 6) & 0x3F));
                        utf8[count++] = (byte) (0x80 | (ch & 0x3F));
                    } else {
                        utf8[count++] = (byte) (0xC0 | ((ch >> 6) & 0x1F));
                        utf8[count++] = (byte) (0x80 | (ch & 0x3F));
                    }
                }

                try {
                    out.write(utf8, 0, totalLength);
                } catch (IOException e) {
                    throw new HyracksDataException(e);
                }
            }
        };
    }
}