docs/attachments/LUCENE-510/strings.diff - lucene-jira-archive - Git at Google

 Index: src/test/org/apache/lucene/index/MockIndexOutput.java
 ===================================================================
 --- src/test/org/apache/lucene/index/MockIndexOutput.java	(revision 0)
 +++ src/test/org/apache/lucene/index/MockIndexOutput.java	(revision 0)
 @@ -0,0 +1,64 @@
 +package org.apache.lucene.index;
 +
 +/**
 + * Copyright 2004 The Apache Software Foundation
 + *
 + * Licensed under the Apache License, Version 2.0 (the "License");
 + * you may not use this file except in compliance with the License.
 + * You may obtain a copy of the License at
 + *
 + *     http://www.apache.org/licenses/LICENSE-2.0
 + *
 + * Unless required by applicable law or agreed to in writing, software
 + * distributed under the License is distributed on an "AS IS" BASIS,
 + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 + * See the License for the specific language governing permissions and
 + * limitations under the License.
 + */
 +
 +import org.apache.lucene.store.IndexOutput;
 +import java.io.ByteArrayOutputStream;
 +import java.io.IOException;
 +
 +public class MockIndexOutput extends IndexOutput {
 +  /** Test double for IndexOutput that keeps every written byte in memory. */
 +  public MockIndexOutput() { }
 +  // Sink for all bytes written through this output.
 +  private final ByteArrayOutputStream baos = new ByteArrayOutputStream();
 +  /** Returns a copy of the bytes written since creation or the last reset(). */
 +  public byte[] getBuf() {
 +    return baos.toByteArray();
 +  }
 +  /** Discards everything written so far. */
 +  public void reset() {
 +    baos.reset();
 +  }
 +  /** Appends a single byte. */
 +  public void writeByte(byte b) {
 +    baos.write(b);
 +  }
 +  /** Appends the first <code>length</code> bytes of <code>b</code>. */
 +  public void writeBytes(byte[] b, int length) {
 +    baos.write(b, 0, length);  // offset indexes the source array b, so it is 0
 +  }
 +  /** No-op. */
 +  public void close() {
 +    // ignore
 +  }
 +  /** Not supported by this mock: always throws IOException. */
 +  public void flush() throws IOException {
 +    throw new IOException();
 +  }
 +  /** Not supported by this mock: always throws IOException. */
 +  public void seek(long pos) throws IOException {
 +    throw new IOException();
 +  }
 +  /** Returns the number of bytes written so far. */
 +  public long getFilePointer() {
 +    return (long) baos.size();
 +  }
 +  /** Returns the number of bytes written so far. */
 +  public long length() {
 +    return (long) baos.size();
 +  }
 +}
 Index: src/test/org/apache/lucene/index/TestIndexInput.java
 ===================================================================
 --- src/test/org/apache/lucene/index/TestIndexInput.java	(revision 405159)
 +++ src/test/org/apache/lucene/index/TestIndexInput.java	(working copy)
 @@ -22,16 +22,70 @@
  import java.io.IOException;

  public class TestIndexInput extends TestCase {
 -    public void testRead() throws IOException {
 -        IndexInput is = new MockIndexInput(new byte[] { (byte) 0x80, 0x01,
 -                                                        (byte) 0xFF, 0x7F,
 -                                                        (byte) 0x80, (byte) 0x80, 0x01,
 -                                                        (byte) 0x81, (byte) 0x80, 0x01,
 -                                                        0x06, 'L', 'u', 'c', 'e', 'n', 'e'});
 -        assertEquals(128,is.readVInt());
 -        assertEquals(16383,is.readVInt());
 -        assertEquals(16384,is.readVInt());
 -        assertEquals(16385,is.readVInt());
 -        assertEquals("Lucene",is.readString());
 -    }
 +  public void testRead() throws IOException {
 +    IndexInput is = new MockIndexInput(new byte[] {
 +      (byte) 0x80, 0x01,
 +      (byte) 0xFF, 0x7F,
 +      (byte) 0x80, (byte) 0x80, 0x01,
 +      (byte) 0x81, (byte) 0x80, 0x01,
 +      0x06, 'L', 'u', 'c', 'e', 'n', 'e',
 +
 +      // 2-byte UTF-8 (U+00BF "INVERTED QUESTION MARK")
 +      0x02, (byte) 0xC2, (byte) 0xBF,
 +      0x0A, 'L', 'u', (byte) 0xC2, (byte) 0xBF,
 +            'c', 'e', (byte) 0xC2, (byte) 0xBF,
 +            'n', 'e',
 +
 +      // 3-byte UTF-8 (U+2620 "SKULL AND CROSSBONES")
 +      0x03, (byte) 0xE2, (byte) 0x98, (byte) 0xA0,
 +      0x0C, 'L', 'u', (byte) 0xE2, (byte) 0x98, (byte) 0xA0,
 +            'c', 'e', (byte) 0xE2, (byte) 0x98, (byte) 0xA0,
 +            'n', 'e',
 +
 +      // 4-byte UTF-8, expected to decode to UTF-16 surrogate pairs
 +      // (U+1D11E "MUSICAL SYMBOL G CLEF")
 +      // (U+1D160 "MUSICAL SYMBOL EIGHTH NOTE")
 +      0x04, (byte) 0xF0, (byte) 0x9D, (byte) 0x84, (byte) 0x9E,
 +      0x08, (byte) 0xF0, (byte) 0x9D, (byte) 0x84, (byte) 0x9E,
 +            (byte) 0xF0, (byte) 0x9D, (byte) 0x85, (byte) 0xA0,
 +      0x0E, 'L', 'u',
 +            (byte) 0xF0, (byte) 0x9D, (byte) 0x84, (byte) 0x9E,
 +            'c', 'e',
 +            (byte) 0xF0, (byte) 0x9D, (byte) 0x85, (byte) 0xA0,
 +            'n', 'e',
 +
 +      // null bytes
 +      0x01, 0x00,
 +      0x08, 'L', 'u', 0x00, 'c', 'e', 0x00, 'n', 'e',
 +
 +      // Modified UTF-8 null bytes (overlong 0xC0 0x80). NOTE(review): a strict UTF-8 decoder may reject this form -- confirm
 +      0x02, (byte) 0xC0, (byte) 0x80,
 +      0x0A, 'L', 'u', (byte) 0xC0, (byte) 0x80,
 +            'c', 'e', (byte) 0xC0, (byte) 0x80,
 +            'n', 'e',
 +
 +    });
 +    // four VInts first, then the strings, read back in the order laid out above
 +    assertEquals(128,is.readVInt());
 +    assertEquals(16383,is.readVInt());
 +    assertEquals(16384,is.readVInt());
 +    assertEquals(16385,is.readVInt());
 +    assertEquals("Lucene",is.readString());
 +
 +    assertEquals("\u00BF",is.readString());
 +    assertEquals("Lu\u00BFce\u00BFne",is.readString());
 +
 +    assertEquals("\u2620",is.readString());
 +    assertEquals("Lu\u2620ce\u2620ne",is.readString());
 +
 +    assertEquals("\uD834\uDD1E",is.readString());
 +    assertEquals("\uD834\uDD1E\uD834\uDD60",is.readString());
 +    assertEquals("Lu\uD834\uDD1Ece\uD834\uDD60ne",is.readString());
 +
 +    assertEquals("\u0000",is.readString());
 +    assertEquals("Lu\u0000ce\u0000ne",is.readString());
 +    // the overlong 0xC0 0x80 form is expected to decode to U+0000 as well
 +    assertEquals("\u0000",is.readString());
 +    assertEquals("Lu\u0000ce\u0000ne",is.readString());
 +  }
  }
 Index: src/test/org/apache/lucene/index/TestIndexOutput.java
 ===================================================================
 --- src/test/org/apache/lucene/index/TestIndexOutput.java	(revision 0)
 +++ src/test/org/apache/lucene/index/TestIndexOutput.java	(revision 0)
 @@ -0,0 +1,103 @@
 +package org.apache.lucene.index;
 +
 +/**
 + * Copyright 2004 The Apache Software Foundation
 + *
 + * Licensed under the Apache License, Version 2.0 (the "License");
 + * you may not use this file except in compliance with the License.
 + * You may obtain a copy of the License at
 + *
 + *     http://www.apache.org/licenses/LICENSE-2.0
 + *
 + * Unless required by applicable law or agreed to in writing, software
 + * distributed under the License is distributed on an "AS IS" BASIS,
 + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 + * See the License for the specific language governing permissions and
 + * limitations under the License.
 + */
 +
 +import junit.framework.TestCase;
 +import org.apache.lucene.store.IndexOutput;
 +import java.util.Arrays;
 +import java.io.IOException;
 +
 +public class TestIndexOutput extends TestCase {
 +  /** Asserts that MockIndexOutput.writeString(s) emits exactly correctBytes. */
 +  private void checkStringToBytes(String s, byte[] correctBytes)
 +                                     throws IOException {
 +    MockIndexOutput mockio = new MockIndexOutput();
 +    mockio.writeString(s);
 +    byte[] writtenBytes = mockio.getBuf();
 +    assertEquals("byte count", correctBytes.length, writtenBytes.length);
 +    for (int i = 0; i < writtenBytes.length; i++) {
 +      assertEquals("byte at index " + i, correctBytes[i], writtenBytes[i]);
 +    }
 +  }
 +  /** Checks the bytes produced by writeVInt and writeString for a range of inputs. */
 +  public void testWrite() throws IOException {
 +    MockIndexOutput mockio = new MockIndexOutput();
 +    mockio.writeVInt(128);
 +    mockio.writeVInt(16383);
 +    mockio.writeVInt(16384);
 +    mockio.writeVInt(16385);
 +    mockio.writeString("Lucene");
 +    boolean check = Arrays.equals(
 +      mockio.getBuf(),
 +      new byte[] {
 +        (byte) 0x80, (byte) 0x01,
 +        (byte) 0xFF, (byte) 0x7F,
 +        (byte) 0x80, (byte) 0x80, (byte) 0x01,
 +        (byte) 0x81, (byte) 0x80, (byte) 0x01,
 +        (byte) 0x06, 'L', 'u', 'c', 'e', 'n', 'e'
 +      }
 +    );
 +    assertTrue("VInt/String bytes differ from the expected encoding", check);
 +
 +    // 2-byte UTF-8 (U+00BF "INVERTED QUESTION MARK")
 +    checkStringToBytes("\u00BF", new byte[] {
 +      0x02, (byte) 0xC2, (byte) 0xBF });
 +    checkStringToBytes("Lu\u00BFce\u00BFne", new byte[] {
 +      0x0A, 'L', 'u', (byte) 0xC2, (byte) 0xBF,
 +            'c', 'e', (byte) 0xC2, (byte) 0xBF,
 +            'n', 'e' });
 +
 +    // 3-byte UTF-8 (U+2620 "SKULL AND CROSSBONES")
 +    checkStringToBytes("\u2620", new byte[] {
 +        0x03, (byte) 0xE2, (byte) 0x98, (byte) 0xA0 });
 +    checkStringToBytes("Lu\u2620ce\u2620ne", new byte[] {
 +        0x0C, 'L', 'u', (byte) 0xE2, (byte) 0x98, (byte) 0xA0,
 +              'c', 'e', (byte) 0xE2, (byte) 0x98, (byte) 0xA0,
 +              'n', 'e' });
 +
 +    // surrogate pairs
 +    // (U+1D11E "MUSICAL SYMBOL G CLEF")
 +    // (U+1D160 "MUSICAL SYMBOL EIGHTH NOTE")
 +    checkStringToBytes("\uD834\uDD1E", new byte[] {
 +      0x04, (byte) 0xF0, (byte) 0x9D, (byte) 0x84, (byte) 0x9E });
 +    checkStringToBytes("Lu\uD834\uDD1Ece\uD834\uDD60ne", new byte[] {
 +      0x0E, 'L', 'u', (byte) 0xF0, (byte) 0x9D, (byte) 0x84, (byte) 0x9E,
 +            'c', 'e', (byte) 0xF0, (byte) 0x9D, (byte) 0x85, (byte) 0xA0,
 +            'n', 'e' });
 +
 +    // null bytes
 +    checkStringToBytes("\u0000", new byte[] {
 +      0x01, (byte) 0x00 });
 +    checkStringToBytes("Lu\u0000ce\u0000ne", new byte[] {
 +        0x08, 'L', 'u', 0x00, 'c', 'e', 0x00, 'n', 'e' });
 +
 +    // illegal unpaired high surrogate gets replaced
 +    checkStringToBytes("\uD834", new byte[] {
 +      0x03, (byte) 0xEF, (byte) 0xBF, (byte) 0xBD });
 +    checkStringToBytes("Lu\uD834ce\uD834ne", new byte[] {
 +      0x0C, 'L', 'u', (byte) 0xEF, (byte) 0xBF, (byte) 0xBD,
 +            'c', 'e', (byte) 0xEF, (byte) 0xBF, (byte) 0xBD, 'n', 'e' });
 +
 +    // illegal unpaired low surrogate gets replaced
 +    checkStringToBytes("\u0061\uDD1E", new byte[] {
 +      0x04, (byte) 0x61, (byte) 0xEF, (byte) 0xBF, (byte) 0xBD });
 +    checkStringToBytes("Lu\u0061\uDD1Ece\u0061\uDD60ne", new byte[] {
 +        0x0E, 'L', 'u', (byte) 0x61, (byte) 0xEF, (byte) 0xBF, (byte) 0xBD,
 +              'c', 'e', (byte) 0x61, (byte) 0xEF, (byte) 0xBF, (byte) 0xBD,
 +              'n', 'e' });
 +  }
 +}
 Index: src/java/org/apache/lucene/index/TermVectorsReader.java
 ===================================================================
 --- src/java/org/apache/lucene/index/TermVectorsReader.java	(revision 405159)
 +++ src/java/org/apache/lucene/index/TermVectorsReader.java	(working copy)
 @@ -241,8 +241,8 @@
      int start = 0;
      int deltaLength = 0;
      int totalLength = 0;
 -    char [] buffer = new char[10];    // init the buffer with a length of 10 character
 -    char[] previousBuffer = {};
 +    byte[] buffer = new byte[20];    // init the buffer with a length of 20 bytes
 +    byte[] previousBuffer = {};

      for (int i = 0; i < numTerms; i++) {
        start = tvf.readVInt();
 @@ -250,14 +250,14 @@
        totalLength = start + deltaLength;
        if (buffer.length < totalLength) {  // increase buffer
          buffer = null;    // give a hint to garbage collector
 -        buffer = new char[totalLength];
 +        buffer = new byte[totalLength];

          if (start > 0)  // just copy if necessary
            System.arraycopy(previousBuffer, 0, buffer, 0, start);
        }

 -      tvf.readChars(buffer, start, deltaLength);
 -      terms[i] = new String(buffer, 0, totalLength);
 +      tvf.readBytes(buffer, start, deltaLength);
 +      terms[i] = new String(buffer, 0, totalLength, "UTF-8");
        previousBuffer = buffer;
        int freq = tvf.readVInt();
        termFreqs[i] = freq;
 Index: src/java/org/apache/lucene/index/TermBuffer.java
 ===================================================================
 --- src/java/org/apache/lucene/index/TermBuffer.java	(revision 405159)
 +++ src/java/org/apache/lucene/index/TermBuffer.java	(working copy)
 @@ -18,42 +18,43 @@

  import java.io.IOException;
  import org.apache.lucene.store.IndexInput;
 +import org.apache.lucene.util.StringHelper;

  final class TermBuffer implements Cloneable {
 -  private static final char[] NO_CHARS = new char[0];
 +  private static final byte[] NO_BYTES = new byte[0];

    private String field;
 -  private char[] text = NO_CHARS;
 -  private int textLength;
 +  private byte[] bytes = NO_BYTES;
 +  private int bytesLength;
    private Term term;                            // cached

    public final int compareTo(TermBuffer other) {
      if (field == other.field)			  // fields are interned
 -      return compareChars(text, textLength, other.text, other.textLength);
 +      return compareBytes(bytes, bytesLength, other.bytes, other.bytesLength);
      else
        return field.compareTo(other.field);
    }

 -  private static final int compareChars(char[] v1, int len1,
 -                                        char[] v2, int len2) {
 +  private static final int compareBytes(byte[] bytes1, int len1,
 +                                        byte[] bytes2, int len2) {
      int end = Math.min(len1, len2);
      for (int k = 0; k < end; k++) {
 -      char c1 = v1[k];
 -      char c2 = v2[k];
 -      if (c1 != c2) {
 -        return c1 - c2;
 +      int b1 = (bytes1[k] & 0xFF);
 +      int b2 = (bytes2[k] & 0xFF);
 +      if (b1 != b2) {
 +        return b1 - b2;
        }
      }
      return len1 - len2;
    }

 -  private final void setTextLength(int newLength) {
 -    if (text.length < newLength) {
 -      char[] newText = new char[newLength];
 -      System.arraycopy(text, 0, newText, 0, textLength);
 -      text = newText;
 +  private final void setBytesLength(int newLength) {
 +    if (bytes.length < newLength) {
 +      byte[] newBytes = new byte[newLength];
 +      System.arraycopy(bytes, 0, newBytes, 0, bytesLength);
 +      bytes = newBytes;
      }
 -    textLength = newLength;
 +    bytesLength = newLength;
    }

    public final void read(IndexInput input, FieldInfos fieldInfos)
 @@ -62,28 +63,29 @@
      int start = input.readVInt();
      int length = input.readVInt();
      int totalLength = start + length;
 -    setTextLength(totalLength);
 -    input.readChars(this.text, start, length);
 +    setBytesLength(totalLength);
 +    input.readBytes(this.bytes, start, length);
      this.field = fieldInfos.fieldName(input.readVInt());
    }

 -  public final void set(Term term) {
 -    if (term == null) {
 +  public final void set(Term t) {
 +    if (t == null) {
        reset();
        return;
      }

 -    // copy text into the buffer
 -    setTextLength(term.text().length());
 -    term.text().getChars(0, term.text().length(), text, 0);
 -
 -    this.field = term.field();
 -    this.term = term;
 +    // convert chars into UTF-8 bytes, store in buffer
 +    try {
 +        bytes = t.text().getBytes("UTF-8");
 +    } catch (java.io.UnsupportedEncodingException e) { }
 +    setBytesLength(bytes.length);
 +    this.field = t.field();
 +    this.term = t;
    }

    public final void set(TermBuffer other) {
 -    setTextLength(other.textLength);
 -    System.arraycopy(other.text, 0, text, 0, textLength);
 +    setBytesLength(other.bytesLength);
 +    System.arraycopy(other.bytes, 0, bytes, 0, bytesLength);

      this.field = other.field;
      this.term = other.term;
 @@ -91,7 +93,7 @@

    public void reset() {
      this.field = null;
 -    this.textLength = 0;
 +    this.bytesLength = 0;
      this.term = null;
    }

 @@ -100,7 +102,10 @@
        return null;

      if (term == null)
 -      term = new Term(field, new String(text, 0, textLength), false);
 +      try {
 +        term = new Term(field,
 +            new String(bytes, 0, bytesLength, "UTF-8"), false );
 +    } catch (java.io.UnsupportedEncodingException e) { }

      return term;
    }
 @@ -111,8 +116,8 @@
        clone = (TermBuffer)super.clone();
      } catch (CloneNotSupportedException e) {}

 -    clone.text = new char[text.length];
 -    System.arraycopy(text, 0, clone.text, 0, textLength);
 +    clone.bytes = new byte[bytes.length];
 +    System.arraycopy(bytes, 0, clone.bytes, 0, bytesLength);

      return clone;
    }
 Index: src/java/org/apache/lucene/index/TermInfosWriter.java
 ===================================================================
 --- src/java/org/apache/lucene/index/TermInfosWriter.java	(revision 405159)
 +++ src/java/org/apache/lucene/index/TermInfosWriter.java	(working copy)
 @@ -33,6 +33,8 @@
    private IndexOutput output;
    private Term lastTerm = new Term("", "");
    private TermInfo lastTi = new TermInfo();
 +  private static final byte[] NO_BYTES = new byte[0];
 +  private byte[] lastBytes = NO_BYTES;
    private long size = 0;

    // TODO: the default values for these two parameters should be settable from
 @@ -121,15 +123,21 @@

    private final void writeTerm(Term term)
         throws IOException {
 -    int start = StringHelper.stringDifference(lastTerm.text, term.text);
 -    int length = term.text.length() - start;
 +    byte[] bytes = term.text().getBytes("UTF-8");
 +    int totalLength = bytes.length;

 +    int start = StringHelper.bytesDifference(lastBytes, bytes);
 +    int diffLength = totalLength - start;
 +
      output.writeVInt(start);                   // write shared prefix length
 -    output.writeVInt(length);                  // write delta length
 -    output.writeChars(term.text, start, length);  // write delta chars
 +    output.writeVInt(diffLength);                  // write delta length
 +    for (int i = start; i < totalLength; i++) {
 +      output.writeByte(bytes[i]);              // write delta UTF-8 bytes
 +    }

      output.writeVInt(fieldInfos.fieldNumber(term.field)); // write field num

 +    lastBytes = bytes;
      lastTerm = term;
    }

 Index: src/java/org/apache/lucene/index/TermVectorsWriter.java
 ===================================================================
 --- src/java/org/apache/lucene/index/TermVectorsWriter.java	(revision 405159)
 +++ src/java/org/apache/lucene/index/TermVectorsWriter.java	(working copy)
 @@ -282,16 +282,23 @@
        bits |= STORE_OFFSET_WITH_TERMVECTOR;
      tvf.writeByte(bits);

 -    String lastTermText = "";
 +    byte[] lastTermBytes = new byte[0];
      for (int i = 0; i < size; i++) {
        TVTerm term = (TVTerm) terms.elementAt(i);
 -      int start = StringHelper.stringDifference(lastTermText, term.termText);
 -      int length = term.termText.length() - start;
 +      byte[] termBytes = term.termText.getBytes("UTF-8");
 +      int totalLength = termBytes.length;
 +
 +      int start = StringHelper.bytesDifference(lastTermBytes, termBytes);
 +      int diffLength = totalLength - start;
 +
        tvf.writeVInt(start);       // write shared prefix length
 -      tvf.writeVInt(length);        // write delta length
 -      tvf.writeChars(term.termText, start, length);  // write delta chars
 +      tvf.writeVInt(diffLength);        // write delta length
 +      for (int k = start; k < totalLength; k++) {
 +        tvf.writeByte(termBytes[k]);              // write delta UTF-8 bytes
 +      }
 +
        tvf.writeVInt(term.freq);
 -      lastTermText = term.termText;
 +      lastTermBytes = termBytes;

        if(storePositions){
          if(term.positions == null)
 Index: src/java/org/apache/lucene/store/IndexInput.java
 ===================================================================
 --- src/java/org/apache/lucene/store/IndexInput.java	(revision 405159)
 +++ src/java/org/apache/lucene/store/IndexInput.java	(working copy)
 @@ -17,13 +17,14 @@
   */

  import java.io.IOException;
 +import org.apache.lucene.util.StringHelper;

  /** Abstract base class for input from a file in a {@link Directory}.  A
   * random-access input stream.  Used for all Lucene index input operations.
   * @see Directory
   */
  public abstract class IndexInput implements Cloneable {
 -  private char[] chars;                           // used by readString()
 +  private byte[] bytes;                           // used by readString()

    /** Reads and returns a single byte.
     * @see IndexOutput#writeByte(byte)
 @@ -87,10 +88,10 @@
     */
    public String readString() throws IOException {
      int length = readVInt();
 -    if (chars == null || length > chars.length)
 -      chars = new char[length];
 -    readChars(chars, 0, length);
 -    return new String(chars, 0, length);
 +    if (bytes == null || length > bytes.length)
 +      bytes = new byte[length];
 +    readBytes(bytes, 0, length);
 +    return new String(bytes, 0, length, "UTF-8");
    }

    /** Reads UTF-8 encoded characters into an array.
 @@ -104,15 +105,29 @@
      final int end = start + length;
      for (int i = start; i < end; i++) {
        byte b = readByte();
 -      if ((b & 0x80) == 0)
 -	buffer[i] = (char)(b & 0x7F);
 -      else if ((b & 0xE0) != 0xE0) {
 -	buffer[i] = (char)(((b & 0x1F) << 6)
 -		 | (readByte() & 0x3F));
 -      } else
 -	buffer[i] = (char)(((b & 0x0F) << 12)
 -		| ((readByte() & 0x3F) << 6)
 -	        |  (readByte() & 0x3F));
 +      switch (StringHelper.TRAILING_BYTES_FOR_UTF8[b & 0xFF]) {
 +        case 0:
 +          buffer[i] = (char)(b & 0x7F);
 +          break;
 +        case 1:
 +          buffer[i] = (char)(((b & 0x1F) << 6)
 +            | (readByte() & 0x3F));
 +          break;
 +        case 2:
 +          buffer[i] = (char)(((b & 0x0F) << 12)
 +            | ((readByte() & 0x3F) << 6)
 +            |  (readByte() & 0x3F));
 +          break;
 +        case 3:
 +          int utf32 = (((b & 0x0F) << 18)
 +            | ((readByte() & 0x3F) << 12)
 +            | ((readByte() & 0x3F) << 6)
 +            |  (readByte() & 0x3F));
 +          buffer[i] = (char)((utf32 >> 10) + 0xD7C0);
 +          i++;
 +          buffer[i] = (char)((utf32 & 0x03FF) + 0xDC00);
 +          break;
 +      }
      }
    }

 @@ -148,7 +163,7 @@
        clone = (IndexInput)super.clone();
      } catch (CloneNotSupportedException e) {}

 -    clone.chars = null;
 +    clone.bytes = null;

      return clone;
    }
 Index: src/java/org/apache/lucene/store/IndexOutput.java
 ===================================================================
 --- src/java/org/apache/lucene/store/IndexOutput.java	(revision 405159)
 +++ src/java/org/apache/lucene/store/IndexOutput.java	(working copy)
 @@ -17,6 +17,7 @@
   */

  import java.io.IOException;
 +import org.apache.lucene.util.StringHelper;

  /** Abstract base class for output to a file in a Directory.  A random-access
   * output stream.  Used for all Lucene index output operations.
 @@ -85,9 +86,9 @@
     * @see IndexInput#readString()
     */
    public void writeString(String s) throws IOException {
 -    int length = s.length();
 -    writeVInt(length);
 -    writeChars(s, 0, length);
 +    int byteCount = StringHelper.countUTF8Bytes(s);
 +    writeVInt(byteCount);
 +    writeChars(s, 0, s.length());
    }

    /** Writes a sequence of UTF-8 encoded characters from a string.
 @@ -101,15 +102,37 @@
      final int end = start + length;
      for (int i = start; i < end; i++) {
        final int code = (int)s.charAt(i);
 -      if (code >= 0x01 && code <= 0x7F)
 -	writeByte((byte)code);
 -      else if (((code >= 0x80) && (code <= 0x7FF)) || code == 0) {
 -	writeByte((byte)(0xC0 | (code >> 6)));
 -	writeByte((byte)(0x80 | (code & 0x3F)));
 +      if (code < 0x80)
 +        writeByte((byte)code);
 +      else if (code < 0x800) {
 +        writeByte((byte)(0xC0 | (code >> 6)));
 +        writeByte((byte)(0x80 | (code & 0x3F)));
 +      } else if (code < 0xD800 || code > 0xDFFF) {
 +        writeByte((byte)(0xE0 | (code >> 12)));
 +        writeByte((byte)(0x80 | ((code >> 6) & 0x3F)));
 +        writeByte((byte)(0x80 | (code & 0x3F)));
        } else {
 -	writeByte((byte)(0xE0 | (code >>> 12)));
 -	writeByte((byte)(0x80 | ((code >> 6) & 0x3F)));
 -	writeByte((byte)(0x80 | (code & 0x3F)));
 +        // surrogate pair
 +        int utf32;
 +        // confirm valid high surrogate
 +        if (code < 0xDC00 && (i < end-1)) {
 +          utf32 = ((int)s.charAt(i+1));
 +          // confirm valid low surrogate and write pair
 +          if (utf32 >= 0xDC00 && utf32 <= 0xDFFF) {
 +            utf32 = ((code - 0xD7C0) << 10) + (utf32 & 0x3FF);
 +            i++;
 +            writeByte((byte)(0xF0 | (utf32 >> 18)));
 +            writeByte((byte)(0x80 | ((utf32 >> 12) & 0x3F)));
 +            writeByte((byte)(0x80 | ((utf32 >> 6) & 0x3F)));
 +            writeByte((byte)(0x80 | (utf32 & 0x3F)));
 +            continue;
 +          }
 +        }
 +        // replace unpaired surrogate or out-of-order low surrogate
 +        // with substitution character
 +        writeByte((byte)0xEF);
 +        writeByte((byte)0xBF);
 +        writeByte((byte)0xBD);
        }
      }
    }
 Index: src/java/org/apache/lucene/util/StringHelper.java
 ===================================================================
 --- src/java/org/apache/lucene/util/StringHelper.java	(revision 405159)
 +++ src/java/org/apache/lucene/util/StringHelper.java	(working copy)
 @@ -16,6 +16,8 @@
   * limitations under the License.
   */

 +import java.nio.ByteBuffer;
 +import java.nio.CharBuffer;

  /**
   * Methods for manipulating strings.
 @@ -25,6 +27,87 @@
  public abstract class StringHelper {

    /**
 +   * Compares two byte[] arrays, element by element from index 0, and
 +   * returns the length of their common prefix, i.e. the index of the
 +   * first position where they differ (or the shorter length if none).
 +   * @param bytes1 The first byte[] to compare
 +   * @param bytes2 The second byte[] to compare
 +   * @return The number of leading elements the two arrays share.
 +   */
 +  public static final int bytesDifference(byte[] bytes1, byte[] bytes2) {
 +    int len1 = bytes1.length;
 +    int len2 = bytes2.length;
 +    int len = len1 < len2 ? len1 : len2;
 +    for (int i = 0; i < len; i++) {
 +      if (bytes1[i] != bytes2[i]) {
 +        return i;
 +      }
 +    }
 +    return len;
 +  }
 +
 +  public static final byte[] TRAILING_BYTES_FOR_UTF8 = {
 +    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
 +    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
 +    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
 +    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
 +    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
 +    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
 +    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
 +    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
 +    // 256 entries, indexed by (leadByte & 0xFF): bytes that follow the lead byte
 +    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
 +    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
 +    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
 +    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
 +    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
 +    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
 +    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
 +    3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5  // 0xF8-0xFF: not valid UTF-8; present so every index exists
 +  };
 +
 +  /**
 +   * Count the number of bytes which would be occupied by this string were it
 +   * converted to UTF-8, counting each unpaired surrogate as 3 bytes.
 +   *
 +   * @param s The string to operate against
 +   * @return The number of UTF-8 bytes
 +   */
 +  public static final int countUTF8Bytes(String s) {
 +    int end = s.length();
 +    int byteCount = end;    // start with 1 byte per char
 +    for (int i = 0; i < end; i++) {
 +      // add the number of extra bytes (beyond the first) for each char
 +      final int code = (int)s.charAt(i);
 +      if (code < 0x80)
 +        continue;
 +      else if (code < 0x800) {
 +        byteCount += 1;
 +      } else if (code < 0xD800 || code > 0xDFFF) {
 +        byteCount += 2;
 +      } else {
 +        // code is in the surrogate range 0xD800-0xDFFF
 +        int utf32;
 +        // a high surrogate (< 0xDC00) needs a following char to pair with
 +        if (code < 0xDC00 && (i < end-1)) {
 +          utf32 = ((int)s.charAt(i+1));
 +          // utf32 holds the next char here; a low surrogate completes the pair
 +          if (utf32 >= 0xDC00 && utf32 <= 0xDFFF) {
 +            byteCount += 2;  // pair = 4 bytes; both chars were already counted once
 +            i++;
 +            continue;
 +          }
 +        }
 +        // replace unpaired surrogate or out-of-order low surrogate
 +        // with substitution character, which is 3 bytes in UTF-8
 +        byteCount += 2;
 +      }
 +    }
 +    return byteCount;
 +  }
 +
 +
 +  /**
     * Compares two strings, character by character, and returns the
     * first position where the two strings differ from one another.
     *