| Index: src/test/org/apache/lucene/index/MockIndexOutput.java |
| =================================================================== |
| --- src/test/org/apache/lucene/index/MockIndexOutput.java (revision 0) |
| +++ src/test/org/apache/lucene/index/MockIndexOutput.java (revision 0) |
| @@ -0,0 +1,64 @@ |
| +package org.apache.lucene.index; |
| + |
| +/** |
| + * Copyright 2004 The Apache Software Foundation |
| + * |
| + * Licensed under the Apache License, Version 2.0 (the "License"); |
| + * you may not use this file except in compliance with the License. |
| + * You may obtain a copy of the License at |
| + * |
| + * http://www.apache.org/licenses/LICENSE-2.0 |
| + * |
| + * Unless required by applicable law or agreed to in writing, software |
| + * distributed under the License is distributed on an "AS IS" BASIS, |
| + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| + * See the License for the specific language governing permissions and |
| + * limitations under the License. |
| + */ |
| + |
| +import org.apache.lucene.store.IndexOutput; |
| +import java.io.ByteArrayOutputStream; |
| +import java.io.IOException; |
| + |
| +public class MockIndexOutput extends IndexOutput { |
| + |
| + public MockIndexOutput() { }; |
| + |
| + private ByteArrayOutputStream baos = new ByteArrayOutputStream(); |
| + |
| + public byte[] getBuf() { |
| + return baos.toByteArray(); |
| + } |
| + |
| + public void reset() { |
| + baos.reset(); |
| + } |
| + |
| + public void writeByte(byte b) { |
| + baos.write(b); |
| + } |
| + |
| + public void writeBytes(byte[] b, int length) { |
| + baos.write(b, baos.size(), length); |
| + } |
| + |
| + public void close() { |
| + // ignore |
| + } |
| + |
| + public void flush() throws IOException { |
| + throw new IOException(); |
| + } |
| + |
| + public void seek(long pos) throws IOException { |
| + throw new IOException(); |
| + } |
| + |
| + public long getFilePointer() { |
| + return (long) baos.size(); |
| + } |
| + |
| + public long length() { |
| + return (long) baos.size(); |
| + } |
| +} |
| Index: src/test/org/apache/lucene/index/TestIndexInput.java |
| =================================================================== |
| --- src/test/org/apache/lucene/index/TestIndexInput.java (revision 405159) |
| +++ src/test/org/apache/lucene/index/TestIndexInput.java (working copy) |
| @@ -22,16 +22,70 @@ |
| import java.io.IOException; |
| |
| public class TestIndexInput extends TestCase { |
| - public void testRead() throws IOException { |
| - IndexInput is = new MockIndexInput(new byte[] { (byte) 0x80, 0x01, |
| - (byte) 0xFF, 0x7F, |
| - (byte) 0x80, (byte) 0x80, 0x01, |
| - (byte) 0x81, (byte) 0x80, 0x01, |
| - 0x06, 'L', 'u', 'c', 'e', 'n', 'e'}); |
| - assertEquals(128,is.readVInt()); |
| - assertEquals(16383,is.readVInt()); |
| - assertEquals(16384,is.readVInt()); |
| - assertEquals(16385,is.readVInt()); |
| - assertEquals("Lucene",is.readString()); |
| - } |
| + public void testRead() throws IOException { |
| + IndexInput is = new MockIndexInput(new byte[] { |
| + (byte) 0x80, 0x01, |
| + (byte) 0xFF, 0x7F, |
| + (byte) 0x80, (byte) 0x80, 0x01, |
| + (byte) 0x81, (byte) 0x80, 0x01, |
| + 0x06, 'L', 'u', 'c', 'e', 'n', 'e', |
| + |
| + // 2-byte UTF-8 (U+00BF "INVERTED QUESTION MARK") |
| + 0x02, (byte) 0xC2, (byte) 0xBF, |
| + 0x0A, 'L', 'u', (byte) 0xC2, (byte) 0xBF, |
| + 'c', 'e', (byte) 0xC2, (byte) 0xBF, |
| + 'n', 'e', |
| + |
| + // 3-byte UTF-8 (U+2620 "SKULL AND CROSSBONES") |
| + 0x03, (byte) 0xE2, (byte) 0x98, (byte) 0xA0, |
| + 0x0C, 'L', 'u', (byte) 0xE2, (byte) 0x98, (byte) 0xA0, |
| + 'c', 'e', (byte) 0xE2, (byte) 0x98, (byte) 0xA0, |
| + 'n', 'e', |
| + |
| + // surrogate pairs |
| + // (U+1D11E "MUSICAL SYMBOL G CLEF") |
| + // (U+1D160 "MUSICAL SYMBOL EIGHTH NOTE") |
| + 0x04, (byte) 0xF0, (byte) 0x9D, (byte) 0x84, (byte) 0x9E, |
| + 0x08, (byte) 0xF0, (byte) 0x9D, (byte) 0x84, (byte) 0x9E, |
| + (byte) 0xF0, (byte) 0x9D, (byte) 0x85, (byte) 0xA0, |
| + 0x0E, 'L', 'u', |
| + (byte) 0xF0, (byte) 0x9D, (byte) 0x84, (byte) 0x9E, |
| + 'c', 'e', |
| + (byte) 0xF0, (byte) 0x9D, (byte) 0x85, (byte) 0xA0, |
| + 'n', 'e', |
| + |
| + // null bytes |
| + 0x01, 0x00, |
| + 0x08, 'L', 'u', 0x00, 'c', 'e', 0x00, 'n', 'e', |
| + |
| + // Modified UTF-8 null bytes |
| + 0x02, (byte) 0xC0, (byte) 0x80, |
| + 0x0A, 'L', 'u', (byte) 0xC0, (byte) 0x80, |
| + 'c', 'e', (byte) 0xC0, (byte) 0x80, |
| + 'n', 'e', |
| + |
| + }); |
| + |
| + assertEquals(128,is.readVInt()); |
| + assertEquals(16383,is.readVInt()); |
| + assertEquals(16384,is.readVInt()); |
| + assertEquals(16385,is.readVInt()); |
| + assertEquals("Lucene",is.readString()); |
| + |
| + assertEquals("\u00BF",is.readString()); |
| + assertEquals("Lu\u00BFce\u00BFne",is.readString()); |
| + |
| + assertEquals("\u2620",is.readString()); |
| + assertEquals("Lu\u2620ce\u2620ne",is.readString()); |
| + |
| + assertEquals("\uD834\uDD1E",is.readString()); |
| + assertEquals("\uD834\uDD1E\uD834\uDD60",is.readString()); |
| + assertEquals("Lu\uD834\uDD1Ece\uD834\uDD60ne",is.readString()); |
| + |
| + assertEquals("\u0000",is.readString()); |
| + assertEquals("Lu\u0000ce\u0000ne",is.readString()); |
| + |
| + assertEquals("\u0000",is.readString()); |
| + assertEquals("Lu\u0000ce\u0000ne",is.readString()); |
| + } |
| } |
| Index: src/test/org/apache/lucene/index/TestIndexOutput.java |
| =================================================================== |
| --- src/test/org/apache/lucene/index/TestIndexOutput.java (revision 0) |
| +++ src/test/org/apache/lucene/index/TestIndexOutput.java (revision 0) |
| @@ -0,0 +1,103 @@ |
| +package org.apache.lucene.index; |
| + |
| +/** |
| + * Copyright 2004 The Apache Software Foundation |
| + * |
| + * Licensed under the Apache License, Version 2.0 (the "License"); |
| + * you may not use this file except in compliance with the License. |
| + * You may obtain a copy of the License at |
| + * |
| + * http://www.apache.org/licenses/LICENSE-2.0 |
| + * |
| + * Unless required by applicable law or agreed to in writing, software |
| + * distributed under the License is distributed on an "AS IS" BASIS, |
| + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| + * See the License for the specific language governing permissions and |
| + * limitations under the License. |
| + */ |
| + |
| +import junit.framework.TestCase; |
| +import org.apache.lucene.store.IndexOutput; |
| +import java.util.Arrays; |
| +import java.io.IOException; |
| + |
| +public class TestIndexOutput extends TestCase { |
| + |
| + private void checkStringToBytes(String s, byte[] correctBytes) |
| + throws IOException { |
| + MockIndexOutput mockio = new MockIndexOutput(); |
| + mockio.writeString(s); |
| + byte[] writtenBytes = mockio.getBuf(); |
| + assertEquals(writtenBytes.length, correctBytes.length); |
| + for (int i = 0; i < writtenBytes.length; i++) { |
| + assertEquals(writtenBytes[i], correctBytes[i]); |
| + } |
| + } |
| + |
| + public void testWrite() throws IOException { |
| + MockIndexOutput mockio = new MockIndexOutput(); |
| + mockio.writeVInt(128); |
| + mockio.writeVInt(16383); |
| + mockio.writeVInt(16384); |
| + mockio.writeVInt(16385); |
| + mockio.writeString("Lucene"); |
| + boolean check = Arrays.equals( |
| + mockio.getBuf(), |
| + new byte[] { |
| + (byte) 0x80, (byte) 0x01, |
| + (byte) 0xFF, (byte) 0x7F, |
| + (byte) 0x80, (byte) 0x80, (byte) 0x01, |
| + (byte) 0x81, (byte) 0x80, (byte) 0x01, |
| + (byte) 0x06, 'L', 'u', 'c', 'e', 'n', 'e' |
| + } |
| + ); |
| + assertEquals(check, true); |
| + |
| + // 2-byte UTF-8 (U+00BF "INVERTED QUESTION MARK") |
| + checkStringToBytes("\u00BF", new byte[] { |
| + 0x02, (byte) 0xC2, (byte) 0xBF }); |
| + checkStringToBytes("Lu\u00BFce\u00BFne", new byte[] { |
| + 0x0A, 'L', 'u', (byte) 0xC2, (byte) 0xBF, |
| + 'c', 'e', (byte) 0xC2, (byte) 0xBF, |
| + 'n', 'e' }); |
| + |
| + // 3-byte UTF-8 (U+2620 "SKULL AND CROSSBONES") |
| + checkStringToBytes("\u2620", new byte[] { |
| + 0x03, (byte) 0xE2, (byte) 0x98, (byte) 0xA0 }); |
| + checkStringToBytes("Lu\u2620ce\u2620ne", new byte[] { |
| + 0x0C, 'L', 'u', (byte) 0xE2, (byte) 0x98, (byte) 0xA0, |
| + 'c', 'e', (byte) 0xE2, (byte) 0x98, (byte) 0xA0, |
| + 'n', 'e' }); |
| + |
| + // surrogate pairs |
| + // (U+1D11E "MUSICAL SYMBOL G CLEF") |
| + // (U+1D160 "MUSICAL SYMBOL EIGHTH NOTE") |
| + checkStringToBytes("\uD834\uDD1E", new byte[] { |
| + 0x04, (byte) 0xF0, (byte) 0x9D, (byte) 0x84, (byte) 0x9E }); |
| + checkStringToBytes("Lu\uD834\uDD1Ece\uD834\uDD60ne", new byte[] { |
| + 0x0E, 'L', 'u', (byte) 0xF0, (byte) 0x9D, (byte) 0x84, (byte) 0x9E, |
| + 'c', 'e', (byte) 0xF0, (byte) 0x9D, (byte) 0x85, (byte) 0xA0, |
| + 'n', 'e' }); |
| + |
| + // null bytes |
| + checkStringToBytes("\u0000", new byte[] { |
| + 0x01, (byte) 0x00 }); |
| + checkStringToBytes("Lu\u0000ce\u0000ne", new byte[] { |
| + 0x08, 'L', 'u', 0x00, 'c', 'e', 0x00, 'n', 'e' }); |
| + |
| + // illegal unpaired high surrogate gets replaced |
| + checkStringToBytes("\uD834", new byte[] { |
| + 0x03, (byte) 0xEF, (byte) 0xBF, (byte) 0xBD }); |
| + checkStringToBytes("Lu\uD834ce\uD834ne", new byte[] { |
| + 0x0C, 'L', 'u', (byte) 0xEF, (byte) 0xBF, (byte) 0xBD, |
| + 'c', 'e', (byte) 0xEF, (byte) 0xBF, (byte) 0xBD, 'n', 'e' }); |
| + |
| + // illegal unpaired low surrogate gets replaced |
| + checkStringToBytes("\u0061\uDD1E", new byte[] { |
| + 0x04, (byte) 0x61, (byte) 0xEF, (byte) 0xBF, (byte) 0xBD }); |
| + checkStringToBytes("Lu\u0061\uDD1Ece\u0061\uDD60ne", new byte[] { |
| + 0x0E, 'L', 'u', (byte) 0x61, (byte) 0xEF, (byte) 0xBF, (byte) 0xBD, |
| + 'c', 'e', (byte) 0x61, (byte) 0xEF, (byte) 0xBF, (byte) 0xBD, |
| + 'n', 'e' }); |
| + } |
| +} |
| Index: src/java/org/apache/lucene/index/TermVectorsReader.java |
| =================================================================== |
| --- src/java/org/apache/lucene/index/TermVectorsReader.java (revision 405159) |
| +++ src/java/org/apache/lucene/index/TermVectorsReader.java (working copy) |
| @@ -241,8 +241,8 @@ |
| int start = 0; |
| int deltaLength = 0; |
| int totalLength = 0; |
| - char [] buffer = new char[10]; // init the buffer with a length of 10 character |
| - char[] previousBuffer = {}; |
| + byte[] buffer = new byte[20]; // init the buffer with a length of 20 bytes |
| + byte[] previousBuffer = {}; |
| |
| for (int i = 0; i < numTerms; i++) { |
| start = tvf.readVInt(); |
| @@ -250,14 +250,14 @@ |
| totalLength = start + deltaLength; |
| if (buffer.length < totalLength) { // increase buffer |
| buffer = null; // give a hint to garbage collector |
| - buffer = new char[totalLength]; |
| + buffer = new byte[totalLength]; |
| |
| if (start > 0) // just copy if necessary |
| System.arraycopy(previousBuffer, 0, buffer, 0, start); |
| } |
| |
| - tvf.readChars(buffer, start, deltaLength); |
| - terms[i] = new String(buffer, 0, totalLength); |
| + tvf.readBytes(buffer, start, deltaLength); |
| + terms[i] = new String(buffer, 0, totalLength, "UTF-8"); |
| previousBuffer = buffer; |
| int freq = tvf.readVInt(); |
| termFreqs[i] = freq; |
| Index: src/java/org/apache/lucene/index/TermBuffer.java |
| =================================================================== |
| --- src/java/org/apache/lucene/index/TermBuffer.java (revision 405159) |
| +++ src/java/org/apache/lucene/index/TermBuffer.java (working copy) |
| @@ -18,42 +18,43 @@ |
| |
| import java.io.IOException; |
| import org.apache.lucene.store.IndexInput; |
| +import org.apache.lucene.util.StringHelper; |
| |
| final class TermBuffer implements Cloneable { |
| - private static final char[] NO_CHARS = new char[0]; |
| + private static final byte[] NO_BYTES = new byte[0]; |
| |
| private String field; |
| - private char[] text = NO_CHARS; |
| - private int textLength; |
| + private byte[] bytes = NO_BYTES; |
| + private int bytesLength; |
| private Term term; // cached |
| |
| public final int compareTo(TermBuffer other) { |
| if (field == other.field) // fields are interned |
| - return compareChars(text, textLength, other.text, other.textLength); |
| + return compareBytes(bytes, bytesLength, other.bytes, other.bytesLength); |
| else |
| return field.compareTo(other.field); |
| } |
| |
| - private static final int compareChars(char[] v1, int len1, |
| - char[] v2, int len2) { |
| + private static final int compareBytes(byte[] bytes1, int len1, |
| + byte[] bytes2, int len2) { |
| int end = Math.min(len1, len2); |
| for (int k = 0; k < end; k++) { |
| - char c1 = v1[k]; |
| - char c2 = v2[k]; |
| - if (c1 != c2) { |
| - return c1 - c2; |
| + int b1 = (bytes1[k] & 0xFF); |
| + int b2 = (bytes2[k] & 0xFF); |
| + if (b1 != b2) { |
| + return b1 - b2; |
| } |
| } |
| return len1 - len2; |
| } |
| |
| - private final void setTextLength(int newLength) { |
| - if (text.length < newLength) { |
| - char[] newText = new char[newLength]; |
| - System.arraycopy(text, 0, newText, 0, textLength); |
| - text = newText; |
| + private final void setBytesLength(int newLength) { |
| + if (bytes.length < newLength) { |
| + byte[] newBytes = new byte[newLength]; |
| + System.arraycopy(bytes, 0, newBytes, 0, bytesLength); |
| + bytes = newBytes; |
| } |
| - textLength = newLength; |
| + bytesLength = newLength; |
| } |
| |
| public final void read(IndexInput input, FieldInfos fieldInfos) |
| @@ -62,28 +63,29 @@ |
| int start = input.readVInt(); |
| int length = input.readVInt(); |
| int totalLength = start + length; |
| - setTextLength(totalLength); |
| - input.readChars(this.text, start, length); |
| + setBytesLength(totalLength); |
| + input.readBytes(this.bytes, start, length); |
| this.field = fieldInfos.fieldName(input.readVInt()); |
| } |
| |
| - public final void set(Term term) { |
| - if (term == null) { |
| + public final void set(Term t) { |
| + if (t == null) { |
| reset(); |
| return; |
| } |
| |
| - // copy text into the buffer |
| - setTextLength(term.text().length()); |
| - term.text().getChars(0, term.text().length(), text, 0); |
| - |
| - this.field = term.field(); |
| - this.term = term; |
| + // convert chars into UTF-8 bytes, store in buffer |
| + try { |
| + bytes = t.text().getBytes("UTF-8"); |
| + } catch (java.io.UnsupportedEncodingException e) { } |
| + setBytesLength(bytes.length); |
| + this.field = t.field(); |
| + this.term = t; |
| } |
| |
| public final void set(TermBuffer other) { |
| - setTextLength(other.textLength); |
| - System.arraycopy(other.text, 0, text, 0, textLength); |
| + setBytesLength(other.bytesLength); |
| + System.arraycopy(other.bytes, 0, bytes, 0, bytesLength); |
| |
| this.field = other.field; |
| this.term = other.term; |
| @@ -91,7 +93,7 @@ |
| |
| public void reset() { |
| this.field = null; |
| - this.textLength = 0; |
| + this.bytesLength = 0; |
| this.term = null; |
| } |
| |
| @@ -100,7 +102,10 @@ |
| return null; |
| |
| if (term == null) |
| - term = new Term(field, new String(text, 0, textLength), false); |
| + try { |
| + term = new Term(field, |
| + new String(bytes, 0, bytesLength, "UTF-8"), false ); |
| + } catch (java.io.UnsupportedEncodingException e) { } |
| |
| return term; |
| } |
| @@ -111,8 +116,8 @@ |
| clone = (TermBuffer)super.clone(); |
| } catch (CloneNotSupportedException e) {} |
| |
| - clone.text = new char[text.length]; |
| - System.arraycopy(text, 0, clone.text, 0, textLength); |
| + clone.bytes = new byte[bytes.length]; |
| + System.arraycopy(bytes, 0, clone.bytes, 0, bytesLength); |
| |
| return clone; |
| } |
| Index: src/java/org/apache/lucene/index/TermInfosWriter.java |
| =================================================================== |
| --- src/java/org/apache/lucene/index/TermInfosWriter.java (revision 405159) |
| +++ src/java/org/apache/lucene/index/TermInfosWriter.java (working copy) |
| @@ -33,6 +33,8 @@ |
| private IndexOutput output; |
| private Term lastTerm = new Term("", ""); |
| private TermInfo lastTi = new TermInfo(); |
| + private static final byte[] NO_BYTES = new byte[0]; |
| + private byte[] lastBytes = NO_BYTES; |
| private long size = 0; |
| |
| // TODO: the default values for these two parameters should be settable from |
| @@ -121,15 +123,21 @@ |
| |
| private final void writeTerm(Term term) |
| throws IOException { |
| - int start = StringHelper.stringDifference(lastTerm.text, term.text); |
| - int length = term.text.length() - start; |
| + byte[] bytes = term.text().getBytes("UTF-8"); |
| + int totalLength = bytes.length; |
| |
| + int start = StringHelper.bytesDifference(lastBytes, bytes); |
| + int diffLength = totalLength - start; |
| + |
| output.writeVInt(start); // write shared prefix length |
| - output.writeVInt(length); // write delta length |
| - output.writeChars(term.text, start, length); // write delta chars |
| + output.writeVInt(diffLength); // write delta length |
| + for (int i = start; i < totalLength; i++) { |
| + output.writeByte(bytes[i]); // write delta UTF-8 bytes |
| + } |
| |
| output.writeVInt(fieldInfos.fieldNumber(term.field)); // write field num |
| |
| + lastBytes = bytes; |
| lastTerm = term; |
| } |
| |
| Index: src/java/org/apache/lucene/index/TermVectorsWriter.java |
| =================================================================== |
| --- src/java/org/apache/lucene/index/TermVectorsWriter.java (revision 405159) |
| +++ src/java/org/apache/lucene/index/TermVectorsWriter.java (working copy) |
| @@ -282,16 +282,23 @@ |
| bits |= STORE_OFFSET_WITH_TERMVECTOR; |
| tvf.writeByte(bits); |
| |
| - String lastTermText = ""; |
| + byte[] lastTermBytes = new byte[0]; |
| for (int i = 0; i < size; i++) { |
| TVTerm term = (TVTerm) terms.elementAt(i); |
| - int start = StringHelper.stringDifference(lastTermText, term.termText); |
| - int length = term.termText.length() - start; |
| + byte[] termBytes = term.termText.getBytes("UTF-8"); |
| + int totalLength = termBytes.length; |
| + |
| + int start = StringHelper.bytesDifference(lastTermBytes, termBytes); |
| + int diffLength = totalLength - start; |
| + |
| tvf.writeVInt(start); // write shared prefix length |
| - tvf.writeVInt(length); // write delta length |
| - tvf.writeChars(term.termText, start, length); // write delta chars |
| + tvf.writeVInt(diffLength); // write delta length |
| + for (int k = start; k < totalLength; k++) { |
| + tvf.writeByte(termBytes[k]); // write delta UTF-8 bytes |
| + } |
| + |
| tvf.writeVInt(term.freq); |
| - lastTermText = term.termText; |
| + lastTermBytes = termBytes; |
| |
| if(storePositions){ |
| if(term.positions == null) |
| Index: src/java/org/apache/lucene/store/IndexInput.java |
| =================================================================== |
| --- src/java/org/apache/lucene/store/IndexInput.java (revision 405159) |
| +++ src/java/org/apache/lucene/store/IndexInput.java (working copy) |
| @@ -17,13 +17,14 @@ |
| */ |
| |
| import java.io.IOException; |
| +import org.apache.lucene.util.StringHelper; |
| |
| /** Abstract base class for input from a file in a {@link Directory}. A |
| * random-access input stream. Used for all Lucene index input operations. |
| * @see Directory |
| */ |
| public abstract class IndexInput implements Cloneable { |
| - private char[] chars; // used by readString() |
| + private byte[] bytes; // used by readString() |
| |
| /** Reads and returns a single byte. |
| * @see IndexOutput#writeByte(byte) |
| @@ -87,10 +88,10 @@ |
| */ |
| public String readString() throws IOException { |
| int length = readVInt(); |
| - if (chars == null || length > chars.length) |
| - chars = new char[length]; |
| - readChars(chars, 0, length); |
| - return new String(chars, 0, length); |
| + if (bytes == null || length > bytes.length) |
| + bytes = new byte[length]; |
| + readBytes(bytes, 0, length); |
| + return new String(bytes, 0, length, "UTF-8"); |
| } |
| |
| /** Reads UTF-8 encoded characters into an array. |
| @@ -104,15 +105,29 @@ |
| final int end = start + length; |
| for (int i = start; i < end; i++) { |
| byte b = readByte(); |
| - if ((b & 0x80) == 0) |
| - buffer[i] = (char)(b & 0x7F); |
| - else if ((b & 0xE0) != 0xE0) { |
| - buffer[i] = (char)(((b & 0x1F) << 6) |
| - | (readByte() & 0x3F)); |
| - } else |
| - buffer[i] = (char)(((b & 0x0F) << 12) |
| - | ((readByte() & 0x3F) << 6) |
| - | (readByte() & 0x3F)); |
| + switch (StringHelper.TRAILING_BYTES_FOR_UTF8[b & 0xFF]) { |
| + case 0: |
| + buffer[i] = (char)(b & 0x7F); |
| + break; |
| + case 1: |
| + buffer[i] = (char)(((b & 0x1F) << 6) |
| + | (readByte() & 0x3F)); |
| + break; |
| + case 2: |
| + buffer[i] = (char)(((b & 0x0F) << 12) |
| + | ((readByte() & 0x3F) << 6) |
| + | (readByte() & 0x3F)); |
| + break; |
| + case 3: |
| + int utf32 = (((b & 0x0F) << 18) |
| + | ((readByte() & 0x3F) << 12) |
| + | ((readByte() & 0x3F) << 6) |
| + | (readByte() & 0x3F)); |
| + buffer[i] = (char)((utf32 >> 10) + 0xD7C0); |
| + i++; |
| + buffer[i] = (char)((utf32 & 0x03FF) + 0xDC00); |
| + break; |
| + } |
| } |
| } |
| |
| @@ -148,7 +163,7 @@ |
| clone = (IndexInput)super.clone(); |
| } catch (CloneNotSupportedException e) {} |
| |
| - clone.chars = null; |
| + clone.bytes = null; |
| |
| return clone; |
| } |
| Index: src/java/org/apache/lucene/store/IndexOutput.java |
| =================================================================== |
| --- src/java/org/apache/lucene/store/IndexOutput.java (revision 405159) |
| +++ src/java/org/apache/lucene/store/IndexOutput.java (working copy) |
| @@ -17,6 +17,7 @@ |
| */ |
| |
| import java.io.IOException; |
| +import org.apache.lucene.util.StringHelper; |
| |
| /** Abstract base class for output to a file in a Directory. A random-access |
| * output stream. Used for all Lucene index output operations. |
| @@ -85,9 +86,9 @@ |
| * @see IndexInput#readString() |
| */ |
| public void writeString(String s) throws IOException { |
| - int length = s.length(); |
| - writeVInt(length); |
| - writeChars(s, 0, length); |
| + int byteCount = StringHelper.countUTF8Bytes(s); |
| + writeVInt(byteCount); |
| + writeChars(s, 0, s.length()); |
| } |
| |
| /** Writes a sequence of UTF-8 encoded characters from a string. |
| @@ -101,15 +102,37 @@ |
| final int end = start + length; |
| for (int i = start; i < end; i++) { |
| final int code = (int)s.charAt(i); |
| - if (code >= 0x01 && code <= 0x7F) |
| - writeByte((byte)code); |
| - else if (((code >= 0x80) && (code <= 0x7FF)) || code == 0) { |
| - writeByte((byte)(0xC0 | (code >> 6))); |
| - writeByte((byte)(0x80 | (code & 0x3F))); |
| + if (code < 0x80) |
| + writeByte((byte)code); |
| + else if (code < 0x800) { |
| + writeByte((byte)(0xC0 | (code >> 6))); |
| + writeByte((byte)(0x80 | (code & 0x3F))); |
| + } else if (code < 0xD800 || code > 0xDFFF) { |
| + writeByte((byte)(0xE0 | (code >> 12))); |
| + writeByte((byte)(0x80 | ((code >> 6) & 0x3F))); |
| + writeByte((byte)(0x80 | (code & 0x3F))); |
| } else { |
| - writeByte((byte)(0xE0 | (code >>> 12))); |
| - writeByte((byte)(0x80 | ((code >> 6) & 0x3F))); |
| - writeByte((byte)(0x80 | (code & 0x3F))); |
| + // surrogate pair |
| + int utf32; |
| + // confirm valid high surrogate |
| + if (code < 0xDC00 && (i < end-1)) { |
| + utf32 = ((int)s.charAt(i+1)); |
| + // confirm valid low surrogate and write pair |
| + if (utf32 >= 0xDC00 && utf32 <= 0xDFFF) { |
| + utf32 = ((code - 0xD7C0) << 10) + (utf32 & 0x3FF); |
| + i++; |
| + writeByte((byte)(0xF0 | (utf32 >> 18))); |
| + writeByte((byte)(0x80 | ((utf32 >> 12) & 0x3F))); |
| + writeByte((byte)(0x80 | ((utf32 >> 6) & 0x3F))); |
| + writeByte((byte)(0x80 | (utf32 & 0x3F))); |
| + continue; |
| + } |
| + } |
| + // replace unpaired surrogate or out-of-order low surrogate |
| + // with substitution character |
| + writeByte((byte)0xEF); |
| + writeByte((byte)0xBF); |
| + writeByte((byte)0xBD); |
| } |
| } |
| } |
| Index: src/java/org/apache/lucene/util/StringHelper.java |
| =================================================================== |
| --- src/java/org/apache/lucene/util/StringHelper.java (revision 405159) |
| +++ src/java/org/apache/lucene/util/StringHelper.java (working copy) |
| @@ -16,6 +16,8 @@ |
| * limitations under the License. |
| */ |
| |
| +import java.nio.ByteBuffer; |
| +import java.nio.CharBuffer; |
| |
| /** |
| * Methods for manipulating strings. |
| @@ -25,6 +27,87 @@ |
| public abstract class StringHelper { |
| |
| /** |
| + * Compares two byte[] arrays, element by element, and returns the |
| + * number of elements common to both arrays. |
| + * |
| + * @param bytes1 The first byte[] to compare |
| + * @param bytes2 The second byte[] to compare |
| + * @return The number of common elements. |
| + */ |
| + public static final int bytesDifference(byte[] bytes1, byte[] bytes2) { |
| + int len1 = bytes1.length; |
| + int len2 = bytes2.length; |
| + int len = len1 < len2 ? len1 : len2; |
| + for (int i = 0; i < len; i++) { |
| + if (bytes1[i] != bytes2[i]) { |
| + return i; |
| + } |
| + } |
| + return len; |
| + } |
| + |
| + public static final byte[] TRAILING_BYTES_FOR_UTF8 = { |
| + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, |
| + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, |
| + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, |
| + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, |
| + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, |
| + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, |
| + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, |
| + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, |
| + |
| + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, |
| + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, |
| + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, |
| + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, |
| + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, |
| + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, |
| + 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, |
| + 3,3,3,3,3,3,3,3 |
| + }; |
| + |
| + /** |
| + * Count the number of bytes which would be occupied by this string |
| + * were it to be converted to UTF-8. |
| + * |
| + * @param s The string to operate against |
| + * @return The number of UTF-8 bytes |
| + */ |
| + public static final int countUTF8Bytes(String s) { |
| + int end = s.length(); |
| + int byteCount = end; // start with 1 byte per char |
| + for (int i = 0; i < end; i++) { |
| + // add the number of trailing bytes for each char |
| + final int code = (int)s.charAt(i); |
| + if (code < 0x80) |
| + continue; |
| + else if (code < 0x800) { |
| + byteCount += 1; |
| + } else if (code < 0xD800 || code > 0xDFFF) { |
| + byteCount += 2; |
| + } else { |
| + // surrogate pair |
| + int utf32; |
| + // confirm valid high surrogate |
| + if (code < 0xDC00 && (i < end-1)) { |
| + utf32 = ((int)s.charAt(i+1)); |
| + // confirm valid low surrogate |
| + if (utf32 >= 0xDC00 && utf32 <= 0xDFFF) { |
| + byteCount += 2; // not 3; compensate for extra char |
| + i++; |
| + continue; |
| + } |
| + } |
| + // replace unpaired surrogate or out-of-order low surrogate |
| + // with substitution character, which is 3 bytes in UTF-8 |
| + byteCount += 2; |
| + } |
| + } |
| + return byteCount; |
| + } |
| + |
| + |
| + /** |
| * Compares two strings, character by character, and returns the |
| * first position where the two strings differ from one another. |
| * |