blob: f30184888bc8bb56db93a8100d18cc72147ab34d [file] [log] [blame]
Index: src/test/org/apache/lucene/index/MockIndexOutput.java
===================================================================
--- src/test/org/apache/lucene/index/MockIndexOutput.java (revision 0)
+++ src/test/org/apache/lucene/index/MockIndexOutput.java (revision 0)
@@ -0,0 +1,64 @@
+package org.apache.lucene.index;
+
+/**
+ * Copyright 2004 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.store.IndexOutput;
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+
+public class MockIndexOutput extends IndexOutput {
+ // In-memory IndexOutput used by the tests; written bytes are read back via getBuf().
+ public MockIndexOutput() { }
+ /** Collects every byte written to this output. */
+ private final ByteArrayOutputStream baos = new ByteArrayOutputStream();
+ /** Returns a copy of all bytes written since construction or the last reset(). */
+ public byte[] getBuf() {
+ return baos.toByteArray();
+ }
+ /** Discards everything written so far. */
+ public void reset() {
+ baos.reset();
+ }
+ /** Appends a single byte. */
+ public void writeByte(byte b) {
+ baos.write(b);
+ }
+ /** Appends the first <code>length</code> bytes of <code>b</code>. */
+ public void writeBytes(byte[] b, int length) {
+ baos.write(b, 0, length); // the offset addresses the source array, so it is 0, not the output size
+ }
+ /** Nothing to release; the buffer stays readable through getBuf(). */
+ public void close() {
+ // ignore
+ }
+ /** Not supported by this mock: always throws IOException. */
+ public void flush() throws IOException {
+ throw new IOException("MockIndexOutput does not support flush()");
+ }
+ /** Not supported by this mock: always throws IOException. */
+ public void seek(long pos) throws IOException {
+ throw new IOException("MockIndexOutput does not support seek()");
+ }
+ /** Returns the number of bytes written so far. */
+ public long getFilePointer() {
+ return (long) baos.size();
+ }
+ /** Returns the number of bytes written so far. */
+ public long length() {
+ return (long) baos.size();
+ }
+}
Index: src/test/org/apache/lucene/index/TestIndexInput.java
===================================================================
--- src/test/org/apache/lucene/index/TestIndexInput.java (revision 405159)
+++ src/test/org/apache/lucene/index/TestIndexInput.java (working copy)
@@ -22,16 +22,70 @@
import java.io.IOException;
public class TestIndexInput extends TestCase {
- public void testRead() throws IOException {
- IndexInput is = new MockIndexInput(new byte[] { (byte) 0x80, 0x01,
- (byte) 0xFF, 0x7F,
- (byte) 0x80, (byte) 0x80, 0x01,
- (byte) 0x81, (byte) 0x80, 0x01,
- 0x06, 'L', 'u', 'c', 'e', 'n', 'e'});
- assertEquals(128,is.readVInt());
- assertEquals(16383,is.readVInt());
- assertEquals(16384,is.readVInt());
- assertEquals(16385,is.readVInt());
- assertEquals("Lucene",is.readString());
- }
+ public void testRead() throws IOException {
+ IndexInput is = new MockIndexInput(new byte[] { // four VInts, then strings stored as <VInt byte count><UTF-8 bytes>
+ (byte) 0x80, 0x01,
+ (byte) 0xFF, 0x7F,
+ (byte) 0x80, (byte) 0x80, 0x01,
+ (byte) 0x81, (byte) 0x80, 0x01,
+ 0x06, 'L', 'u', 'c', 'e', 'n', 'e',
+
+ // 2-byte UTF-8 (U+00BF "INVERTED QUESTION MARK")
+ 0x02, (byte) 0xC2, (byte) 0xBF,
+ 0x0A, 'L', 'u', (byte) 0xC2, (byte) 0xBF,
+ 'c', 'e', (byte) 0xC2, (byte) 0xBF,
+ 'n', 'e',
+
+ // 3-byte UTF-8 (U+2620 "SKULL AND CROSSBONES")
+ 0x03, (byte) 0xE2, (byte) 0x98, (byte) 0xA0,
+ 0x0C, 'L', 'u', (byte) 0xE2, (byte) 0x98, (byte) 0xA0,
+ 'c', 'e', (byte) 0xE2, (byte) 0x98, (byte) 0xA0,
+ 'n', 'e',
+
+ // 4-byte UTF-8, expected back as surrogate pairs
+ // (U+1D11E "MUSICAL SYMBOL G CLEF")
+ // (U+1D160 "MUSICAL SYMBOL EIGHTH NOTE")
+ 0x04, (byte) 0xF0, (byte) 0x9D, (byte) 0x84, (byte) 0x9E,
+ 0x08, (byte) 0xF0, (byte) 0x9D, (byte) 0x84, (byte) 0x9E,
+ (byte) 0xF0, (byte) 0x9D, (byte) 0x85, (byte) 0xA0,
+ 0x0E, 'L', 'u',
+ (byte) 0xF0, (byte) 0x9D, (byte) 0x84, (byte) 0x9E,
+ 'c', 'e',
+ (byte) 0xF0, (byte) 0x9D, (byte) 0x85, (byte) 0xA0,
+ 'n', 'e',
+
+ // null bytes
+ 0x01, 0x00,
+ 0x08, 'L', 'u', 0x00, 'c', 'e', 0x00, 'n', 'e',
+
+ // Modified UTF-8 null bytes (overlong C0 80). NOTE(review): expecting U+0000 back presumably needs a lenient UTF-8 decoder -- confirm on the target JDK
+ 0x02, (byte) 0xC0, (byte) 0x80,
+ 0x0A, 'L', 'u', (byte) 0xC0, (byte) 0x80,
+ 'c', 'e', (byte) 0xC0, (byte) 0x80,
+ 'n', 'e',
+
+ });
+
+ assertEquals(128,is.readVInt());
+ assertEquals(16383,is.readVInt());
+ assertEquals(16384,is.readVInt());
+ assertEquals(16385,is.readVInt());
+ assertEquals("Lucene",is.readString());
+
+ assertEquals("\u00BF",is.readString());
+ assertEquals("Lu\u00BFce\u00BFne",is.readString());
+
+ assertEquals("\u2620",is.readString());
+ assertEquals("Lu\u2620ce\u2620ne",is.readString());
+
+ assertEquals("\uD834\uDD1E",is.readString());
+ assertEquals("\uD834\uDD1E\uD834\uDD60",is.readString());
+ assertEquals("Lu\uD834\uDD1Ece\uD834\uDD60ne",is.readString());
+
+ assertEquals("\u0000",is.readString());
+ assertEquals("Lu\u0000ce\u0000ne",is.readString());
+
+ assertEquals("\u0000",is.readString()); // the overlong C0 80 form is expected to come back as U+0000
+ assertEquals("Lu\u0000ce\u0000ne",is.readString());
+ }
}
Index: src/test/org/apache/lucene/index/TestIndexOutput.java
===================================================================
--- src/test/org/apache/lucene/index/TestIndexOutput.java (revision 0)
+++ src/test/org/apache/lucene/index/TestIndexOutput.java (revision 0)
@@ -0,0 +1,103 @@
+package org.apache.lucene.index;
+
+/**
+ * Copyright 2004 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import junit.framework.TestCase;
+import org.apache.lucene.store.IndexOutput;
+import java.util.Arrays;
+import java.io.IOException;
+
+public class TestIndexOutput extends TestCase { // covers the VInt and UTF-8 string encoding done by IndexOutput
+ /** Writes s with writeString and compares the produced bytes with correctBytes. */
+ private void checkStringToBytes(String s, byte[] correctBytes)
+ throws IOException {
+ MockIndexOutput mockio = new MockIndexOutput();
+ mockio.writeString(s);
+ byte[] writtenBytes = mockio.getBuf();
+ assertEquals("byte count", correctBytes.length, writtenBytes.length); // JUnit order: expected, actual
+ for (int i = 0; i < writtenBytes.length; i++) {
+ assertEquals("byte " + i, correctBytes[i], writtenBytes[i]);
+ }
+ }
+ /** Checks VInt output and the length-prefixed UTF-8 output of writeString. */
+ public void testWrite() throws IOException {
+ MockIndexOutput mockio = new MockIndexOutput();
+ mockio.writeVInt(128);
+ mockio.writeVInt(16383);
+ mockio.writeVInt(16384);
+ mockio.writeVInt(16385);
+ mockio.writeString("Lucene");
+ boolean check = Arrays.equals(
+ mockio.getBuf(),
+ new byte[] {
+ (byte) 0x80, (byte) 0x01,
+ (byte) 0xFF, (byte) 0x7F,
+ (byte) 0x80, (byte) 0x80, (byte) 0x01,
+ (byte) 0x81, (byte) 0x80, (byte) 0x01,
+ (byte) 0x06, 'L', 'u', 'c', 'e', 'n', 'e'
+ }
+ );
+ assertTrue("VInt and ASCII string bytes", check);
+
+ // 2-byte UTF-8 (U+00BF "INVERTED QUESTION MARK")
+ checkStringToBytes("\u00BF", new byte[] {
+ 0x02, (byte) 0xC2, (byte) 0xBF });
+ checkStringToBytes("Lu\u00BFce\u00BFne", new byte[] {
+ 0x0A, 'L', 'u', (byte) 0xC2, (byte) 0xBF,
+ 'c', 'e', (byte) 0xC2, (byte) 0xBF,
+ 'n', 'e' });
+
+ // 3-byte UTF-8 (U+2620 "SKULL AND CROSSBONES")
+ checkStringToBytes("\u2620", new byte[] {
+ 0x03, (byte) 0xE2, (byte) 0x98, (byte) 0xA0 });
+ checkStringToBytes("Lu\u2620ce\u2620ne", new byte[] {
+ 0x0C, 'L', 'u', (byte) 0xE2, (byte) 0x98, (byte) 0xA0,
+ 'c', 'e', (byte) 0xE2, (byte) 0x98, (byte) 0xA0,
+ 'n', 'e' });
+
+ // surrogate pairs
+ // (U+1D11E "MUSICAL SYMBOL G CLEF")
+ // (U+1D160 "MUSICAL SYMBOL EIGHTH NOTE")
+ checkStringToBytes("\uD834\uDD1E", new byte[] {
+ 0x04, (byte) 0xF0, (byte) 0x9D, (byte) 0x84, (byte) 0x9E });
+ checkStringToBytes("Lu\uD834\uDD1Ece\uD834\uDD60ne", new byte[] {
+ 0x0E, 'L', 'u', (byte) 0xF0, (byte) 0x9D, (byte) 0x84, (byte) 0x9E,
+ 'c', 'e', (byte) 0xF0, (byte) 0x9D, (byte) 0x85, (byte) 0xA0,
+ 'n', 'e' });
+
+ // null bytes
+ checkStringToBytes("\u0000", new byte[] {
+ 0x01, (byte) 0x00 });
+ checkStringToBytes("Lu\u0000ce\u0000ne", new byte[] {
+ 0x08, 'L', 'u', 0x00, 'c', 'e', 0x00, 'n', 'e' });
+
+ // illegal unpaired high surrogate gets replaced
+ checkStringToBytes("\uD834", new byte[] {
+ 0x03, (byte) 0xEF, (byte) 0xBF, (byte) 0xBD });
+ checkStringToBytes("Lu\uD834ce\uD834ne", new byte[] {
+ 0x0C, 'L', 'u', (byte) 0xEF, (byte) 0xBF, (byte) 0xBD,
+ 'c', 'e', (byte) 0xEF, (byte) 0xBF, (byte) 0xBD, 'n', 'e' });
+
+ // illegal unpaired low surrogate gets replaced
+ checkStringToBytes("\u0061\uDD1E", new byte[] {
+ 0x04, (byte) 0x61, (byte) 0xEF, (byte) 0xBF, (byte) 0xBD });
+ checkStringToBytes("Lu\u0061\uDD1Ece\u0061\uDD60ne", new byte[] {
+ 0x0E, 'L', 'u', (byte) 0x61, (byte) 0xEF, (byte) 0xBF, (byte) 0xBD,
+ 'c', 'e', (byte) 0x61, (byte) 0xEF, (byte) 0xBF, (byte) 0xBD,
+ 'n', 'e' });
+ }
+}
Index: src/java/org/apache/lucene/index/TermVectorsReader.java
===================================================================
--- src/java/org/apache/lucene/index/TermVectorsReader.java (revision 405159)
+++ src/java/org/apache/lucene/index/TermVectorsReader.java (working copy)
@@ -241,8 +241,8 @@
int start = 0;
int deltaLength = 0;
int totalLength = 0;
- char [] buffer = new char[10]; // init the buffer with a length of 10 character
- char[] previousBuffer = {};
+ byte[] buffer = new byte[20]; // init the buffer with a length of 20 bytes
+ byte[] previousBuffer = {};
for (int i = 0; i < numTerms; i++) {
start = tvf.readVInt();
@@ -250,14 +250,14 @@
totalLength = start + deltaLength;
if (buffer.length < totalLength) { // increase buffer
buffer = null; // give a hint to garbage collector
- buffer = new char[totalLength];
+ buffer = new byte[totalLength];
if (start > 0) // just copy if necessary
System.arraycopy(previousBuffer, 0, buffer, 0, start);
}
- tvf.readChars(buffer, start, deltaLength);
- terms[i] = new String(buffer, 0, totalLength);
+ tvf.readBytes(buffer, start, deltaLength);
+ terms[i] = new String(buffer, 0, totalLength, "UTF-8");
previousBuffer = buffer;
int freq = tvf.readVInt();
termFreqs[i] = freq;
Index: src/java/org/apache/lucene/index/TermBuffer.java
===================================================================
--- src/java/org/apache/lucene/index/TermBuffer.java (revision 405159)
+++ src/java/org/apache/lucene/index/TermBuffer.java (working copy)
@@ -18,42 +18,43 @@
import java.io.IOException;
import org.apache.lucene.store.IndexInput;
+import org.apache.lucene.util.StringHelper;
final class TermBuffer implements Cloneable {
- private static final char[] NO_CHARS = new char[0];
+ private static final byte[] NO_BYTES = new byte[0];
private String field;
- private char[] text = NO_CHARS;
- private int textLength;
+ private byte[] bytes = NO_BYTES;
+ private int bytesLength;
private Term term; // cached
public final int compareTo(TermBuffer other) {
if (field == other.field) // fields are interned
- return compareChars(text, textLength, other.text, other.textLength);
+ return compareBytes(bytes, bytesLength, other.bytes, other.bytesLength); // same field: order by the buffered term bytes
else
return field.compareTo(other.field);
}
- private static final int compareChars(char[] v1, int len1,
- char[] v2, int len2) {
+ private static final int compareBytes(byte[] bytes1, int len1, // lexicographic comparison of the first len1 / len2 bytes
+ byte[] bytes2, int len2) {
int end = Math.min(len1, len2);
for (int k = 0; k < end; k++) {
- char c1 = v1[k];
- char c2 = v2[k];
- if (c1 != c2) {
- return c1 - c2;
+ int b1 = (bytes1[k] & 0xFF); // mask so the byte compares as an unsigned value 0..255
+ int b2 = (bytes2[k] & 0xFF);
+ if (b1 != b2) {
+ return b1 - b2; // the first differing byte decides the order
}
}
return len1 - len2;
}
- private final void setTextLength(int newLength) {
- if (text.length < newLength) {
- char[] newText = new char[newLength];
- System.arraycopy(text, 0, newText, 0, textLength);
- text = newText;
+ private final void setBytesLength(int newLength) { // grows the buffer when it is too small, then records the new logical length
+ if (bytes.length < newLength) {
+ byte[] newBytes = new byte[newLength];
+ System.arraycopy(bytes, 0, newBytes, 0, bytesLength); // carry over the bytes that were in use before growing
+ bytes = newBytes;
}
- textLength = newLength;
+ bytesLength = newLength;
}
public final void read(IndexInput input, FieldInfos fieldInfos)
@@ -62,28 +63,29 @@
int start = input.readVInt();
int length = input.readVInt();
int totalLength = start + length;
- setTextLength(totalLength);
- input.readChars(this.text, start, length);
+ setBytesLength(totalLength);
+ input.readBytes(this.bytes, start, length);
this.field = fieldInfos.fieldName(input.readVInt());
}
- public final void set(Term term) {
- if (term == null) {
+ public final void set(Term t) { // loads this buffer from a Term; a null Term resets the buffer
+ if (t == null) {
reset();
return;
}
- // copy text into the buffer
- setTextLength(term.text().length());
- term.text().getChars(0, term.text().length(), text, 0);
-
- this.field = term.field();
- this.term = term;
+ // encode the term text as UTF-8; the encoded array becomes the buffer
+ try {
+ bytes = t.text().getBytes("UTF-8");
+ } catch (java.io.UnsupportedEncodingException e) { throw new RuntimeException("UTF-8 encoding is unavailable", e); } // fail loudly rather than keep stale bytes
+ setBytesLength(bytes.length);
+ this.field = t.field();
+ this.term = t;
}
public final void set(TermBuffer other) {
- setTextLength(other.textLength);
- System.arraycopy(other.text, 0, text, 0, textLength);
+ setBytesLength(other.bytesLength);
+ System.arraycopy(other.bytes, 0, bytes, 0, bytesLength);
this.field = other.field;
this.term = other.term;
@@ -91,7 +93,7 @@
public void reset() {
this.field = null;
- this.textLength = 0;
+ this.bytesLength = 0; // only the logical length is cleared; the bytes array itself is not touched here
this.term = null;
}
@@ -100,7 +102,10 @@
return null;
if (term == null)
- term = new Term(field, new String(text, 0, textLength), false);
+ try {
+ term = new Term(field,
+ new String(bytes, 0, bytesLength, "UTF-8"), false );
+ } catch (java.io.UnsupportedEncodingException e) { }
return term;
}
@@ -111,8 +116,8 @@
clone = (TermBuffer)super.clone();
} catch (CloneNotSupportedException e) {}
- clone.text = new char[text.length];
- System.arraycopy(text, 0, clone.text, 0, textLength);
+ clone.bytes = new byte[bytes.length];
+ System.arraycopy(bytes, 0, clone.bytes, 0, bytesLength);
return clone;
}
Index: src/java/org/apache/lucene/index/TermInfosWriter.java
===================================================================
--- src/java/org/apache/lucene/index/TermInfosWriter.java (revision 405159)
+++ src/java/org/apache/lucene/index/TermInfosWriter.java (working copy)
@@ -33,6 +33,8 @@
private IndexOutput output;
private Term lastTerm = new Term("", "");
private TermInfo lastTi = new TermInfo();
+ private static final byte[] NO_BYTES = new byte[0];
+ private byte[] lastBytes = NO_BYTES;
private long size = 0;
// TODO: the default values for these two parameters should be settable from
@@ -121,15 +123,21 @@
private final void writeTerm(Term term)
throws IOException {
- int start = StringHelper.stringDifference(lastTerm.text, term.text);
- int length = term.text.length() - start;
+ byte[] bytes = term.text().getBytes("UTF-8"); // NOTE(review): getBytes may encode unpaired surrogates differently from IndexOutput.writeChars (which emits EF BF BD) -- confirm
+ int totalLength = bytes.length;
+ int start = StringHelper.bytesDifference(lastBytes, bytes); // number of leading bytes shared with the previously written term
+ int diffLength = totalLength - start; // bytes of this term that follow the shared prefix
+
output.writeVInt(start); // write shared prefix length
- output.writeVInt(length); // write delta length
- output.writeChars(term.text, start, length); // write delta chars
+ output.writeVInt(diffLength); // write delta length
+ for (int i = start; i < totalLength; i++) {
+ output.writeByte(bytes[i]); // write delta UTF-8 bytes
+ }
output.writeVInt(fieldInfos.fieldNumber(term.field)); // write field num
+ lastBytes = bytes; // kept for the prefix comparison on the next call
lastTerm = term;
}
Index: src/java/org/apache/lucene/index/TermVectorsWriter.java
===================================================================
--- src/java/org/apache/lucene/index/TermVectorsWriter.java (revision 405159)
+++ src/java/org/apache/lucene/index/TermVectorsWriter.java (working copy)
@@ -282,16 +282,23 @@
bits |= STORE_OFFSET_WITH_TERMVECTOR;
tvf.writeByte(bits);
- String lastTermText = "";
+ byte[] lastTermBytes = new byte[0];
for (int i = 0; i < size; i++) {
TVTerm term = (TVTerm) terms.elementAt(i);
- int start = StringHelper.stringDifference(lastTermText, term.termText);
- int length = term.termText.length() - start;
+ byte[] termBytes = term.termText.getBytes("UTF-8");
+ int totalLength = termBytes.length;
+
+ int start = StringHelper.bytesDifference(lastTermBytes, termBytes);
+ int diffLength = totalLength - start;
+
tvf.writeVInt(start); // write shared prefix length
- tvf.writeVInt(length); // write delta length
- tvf.writeChars(term.termText, start, length); // write delta chars
+ tvf.writeVInt(diffLength); // write delta length
+ for (int k = start; k < totalLength; k++) {
+ tvf.writeByte(termBytes[k]); // write delta UTF-8 bytes
+ }
+
tvf.writeVInt(term.freq);
- lastTermText = term.termText;
+ lastTermBytes = termBytes;
if(storePositions){
if(term.positions == null)
Index: src/java/org/apache/lucene/store/IndexInput.java
===================================================================
--- src/java/org/apache/lucene/store/IndexInput.java (revision 405159)
+++ src/java/org/apache/lucene/store/IndexInput.java (working copy)
@@ -17,13 +17,14 @@
*/
import java.io.IOException;
+import org.apache.lucene.util.StringHelper;
/** Abstract base class for input from a file in a {@link Directory}. A
* random-access input stream. Used for all Lucene index input operations.
* @see Directory
*/
public abstract class IndexInput implements Cloneable {
- private char[] chars; // used by readString()
+ private byte[] bytes; // used by readString()
/** Reads and returns a single byte.
* @see IndexOutput#writeByte(byte)
@@ -87,10 +88,10 @@
*/
public String readString() throws IOException {
int length = readVInt();
- if (chars == null || length > chars.length)
- chars = new char[length];
- readChars(chars, 0, length);
- return new String(chars, 0, length);
+ if (bytes == null || length > bytes.length) // the VInt prefix is used as a byte count; the scratch buffer is reused when large enough
+ bytes = new byte[length];
+ readBytes(bytes, 0, length);
+ return new String(bytes, 0, length, "UTF-8"); // decode exactly the bytes just read as UTF-8
}
/** Reads UTF-8 encoded characters into an array.
@@ -104,15 +105,29 @@
final int end = start + length;
for (int i = start; i < end; i++) {
byte b = readByte();
- if ((b & 0x80) == 0)
- buffer[i] = (char)(b & 0x7F);
- else if ((b & 0xE0) != 0xE0) {
- buffer[i] = (char)(((b & 0x1F) << 6)
- | (readByte() & 0x3F));
- } else
- buffer[i] = (char)(((b & 0x0F) << 12)
- | ((readByte() & 0x3F) << 6)
- | (readByte() & 0x3F));
+ switch (StringHelper.TRAILING_BYTES_FOR_UTF8[b & 0xFF]) {
+ case 0:
+ buffer[i] = (char)(b & 0x7F);
+ break;
+ case 1:
+ buffer[i] = (char)(((b & 0x1F) << 6)
+ | (readByte() & 0x3F));
+ break;
+ case 2:
+ buffer[i] = (char)(((b & 0x0F) << 12)
+ | ((readByte() & 0x3F) << 6)
+ | (readByte() & 0x3F));
+ break;
+ case 3:
+ int utf32 = (((b & 0x0F) << 18)
+ | ((readByte() & 0x3F) << 12)
+ | ((readByte() & 0x3F) << 6)
+ | (readByte() & 0x3F));
+ buffer[i] = (char)((utf32 >> 10) + 0xD7C0);
+ i++;
+ buffer[i] = (char)((utf32 & 0x03FF) + 0xDC00);
+ break;
+ }
}
}
@@ -148,7 +163,7 @@
clone = (IndexInput)super.clone();
} catch (CloneNotSupportedException e) {}
- clone.chars = null;
+ clone.bytes = null;
return clone;
}
Index: src/java/org/apache/lucene/store/IndexOutput.java
===================================================================
--- src/java/org/apache/lucene/store/IndexOutput.java (revision 405159)
+++ src/java/org/apache/lucene/store/IndexOutput.java (working copy)
@@ -17,6 +17,7 @@
*/
import java.io.IOException;
+import org.apache.lucene.util.StringHelper;
/** Abstract base class for output to a file in a Directory. A random-access
* output stream. Used for all Lucene index output operations.
@@ -85,9 +86,9 @@
* @see IndexInput#readString()
*/
public void writeString(String s) throws IOException {
- int length = s.length();
- writeVInt(length);
- writeChars(s, 0, length);
+ int byteCount = StringHelper.countUTF8Bytes(s); // the length prefix is the UTF-8 byte count, not the char count
+ writeVInt(byteCount);
+ writeChars(s, 0, s.length()); // writeChars still takes a char range and emits the encoded bytes
}
/** Writes a sequence of UTF-8 encoded characters from a string.
@@ -101,15 +102,37 @@
final int end = start + length;
for (int i = start; i < end; i++) {
final int code = (int)s.charAt(i);
- if (code >= 0x01 && code <= 0x7F)
- writeByte((byte)code);
- else if (((code >= 0x80) && (code <= 0x7FF)) || code == 0) {
- writeByte((byte)(0xC0 | (code >> 6)));
- writeByte((byte)(0x80 | (code & 0x3F)));
+ if (code < 0x80)
+ writeByte((byte)code);
+ else if (code < 0x800) {
+ writeByte((byte)(0xC0 | (code >> 6)));
+ writeByte((byte)(0x80 | (code & 0x3F)));
+ } else if (code < 0xD800 || code > 0xDFFF) {
+ writeByte((byte)(0xE0 | (code >> 12)));
+ writeByte((byte)(0x80 | ((code >> 6) & 0x3F)));
+ writeByte((byte)(0x80 | (code & 0x3F)));
} else {
- writeByte((byte)(0xE0 | (code >>> 12)));
- writeByte((byte)(0x80 | ((code >> 6) & 0x3F)));
- writeByte((byte)(0x80 | (code & 0x3F)));
+ // surrogate pair
+ int utf32;
+ // confirm valid high surrogate
+ if (code < 0xDC00 && (i < end-1)) {
+ utf32 = ((int)s.charAt(i+1));
+ // confirm valid low surrogate and write pair
+ if (utf32 >= 0xDC00 && utf32 <= 0xDFFF) {
+ utf32 = ((code - 0xD7C0) << 10) + (utf32 & 0x3FF);
+ i++;
+ writeByte((byte)(0xF0 | (utf32 >> 18)));
+ writeByte((byte)(0x80 | ((utf32 >> 12) & 0x3F)));
+ writeByte((byte)(0x80 | ((utf32 >> 6) & 0x3F)));
+ writeByte((byte)(0x80 | (utf32 & 0x3F)));
+ continue;
+ }
+ }
+ // replace unpaired surrogate or out-of-order low surrogate
+ // with substitution character
+ writeByte((byte)0xEF);
+ writeByte((byte)0xBF);
+ writeByte((byte)0xBD);
}
}
}
Index: src/java/org/apache/lucene/util/StringHelper.java
===================================================================
--- src/java/org/apache/lucene/util/StringHelper.java (revision 405159)
+++ src/java/org/apache/lucene/util/StringHelper.java (working copy)
@@ -16,6 +16,8 @@
* limitations under the License.
*/
+import java.nio.ByteBuffer;
+import java.nio.CharBuffer;
/**
* Methods for manipulating strings.
@@ -25,6 +27,87 @@
public abstract class StringHelper {
/**
+ * Measures the common prefix of two byte arrays: scans both from index 0
+ * and stops at the first position where they differ.
+ *
+ * @param bytes1 The first byte[] to compare
+ * @param bytes2 The second byte[] to compare
+ * @return The index of the first differing byte, or the shorter length.
+ */
+ public static final int bytesDifference(byte[] bytes1, byte[] bytes2) {
+ // never look past the end of the shorter array
+ final int limit = Math.min(bytes1.length, bytes2.length);
+ int shared = 0;
+ // advance while the arrays still agree
+ while (shared < limit && bytes1[shared] == bytes2[shared]) {
+ shared++;
+ }
+ // either the first mismatch or the shorter length
+ return shared;
+ }
+
+ public static final byte[] TRAILING_BYTES_FOR_UTF8 = { // indexed by (leadByte & 0xFF): number of bytes that follow that lead byte
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ // 0x80-0xBF are continuation bytes, not lead bytes; they are listed as 0
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+ 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
+ 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 // 0xF8-0xFF (obsolete 5/6-byte lead bytes) are included so every byte value is a valid index; treat 4 and 5 as malformed input
+ };
+
+ /**
+ * Computes the length in bytes of the UTF-8 encoding of a string. A valid
+ * surrogate pair counts as 4 bytes; any unpaired surrogate counts as 3.
+ *
+ * @param s The string to measure
+ * @return The size of the UTF-8 form, in bytes
+ */
+ public static final int countUTF8Bytes(String s) {
+ final int charCount = s.length();
+ // bytes needed on top of the baseline of one byte per char
+ int extraBytes = 0;
+ int pos = 0;
+ while (pos < charCount) {
+ final char c = s.charAt(pos++);
+ if (c < 0x80) {
+ // ASCII: the baseline byte is enough
+ continue;
+ }
+ if (c < 0x800) {
+ // 2-byte form
+ extraBytes += 1;
+ continue;
+ }
+ // Everything else adds two bytes: a 3-byte BMP char, the 3-byte
+ // substitution character standing in for an unpaired surrogate,
+ // or a 4-byte surrogate pair (whose second char already
+ // contributed one baseline byte).
+ extraBytes += 2;
+ final boolean highSurrogate = c >= 0xD800 && c < 0xDC00;
+ if (highSurrogate && pos < charCount) {
+ final char next = s.charAt(pos);
+ if (next >= 0xDC00 && next <= 0xDFFF) {
+ // valid pair: the low surrogate is consumed with it
+ pos++;
+ }
+ }
+ }
+ return charCount + extraBytes;
+ }
+
+
+ /**
* Compares two strings, character by character, and returns the
* first position where the two strings differ from one another.
*