branch-2.0.4-alpha/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/io/TestUTF8.java - hadoop - Git at Google

 /**
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
  * regarding copyright ownership.  The ASF licenses this file
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 package org.apache.hadoop.io;

 import junit.framework.TestCase;
 import java.io.IOException;
 import java.io.UTFDataFormatException;
 import java.util.Random;

 import org.apache.hadoop.test.GenericTestUtils;
 import org.apache.hadoop.util.StringUtils;

 /**
  * Unit tests for the deprecated {@link UTF8} writable: round trips through
  * its own encoder/decoder, agreement with the JDK decoders, and rejection of
  * malformed byte sequences.
  */
 @SuppressWarnings("deprecation")
 public class TestUTF8 extends TestCase {
   /** How many random strings each randomized test exercises. */
   private static final int ROUNDS = 10000;

   /**
    * Number of leading bytes skipped before handing serialized output to the
    * JDK "UTF-8" decoder (presumably the length prefix -- confirm in UTF8).
    */
   private static final int PAYLOAD_OFFSET = 2;

   private static final Random RANDOM = new Random();

   public TestUTF8(String name) {
     super(name);
   }

   /**
    * Builds a random string for use as test input.
    *
    * The length is drawn from [0, 100) and every char from
    * [0, Character.MAX_VALUE), so U+FFFF is never produced and surrogate
    * code units may appear unpaired.
    *
    * @return the generated string, possibly empty
    */
   public static String getTestString() throws Exception {
     final int charCount = RANDOM.nextInt(100);
     final char[] chars = new char[charCount];
     for (int pos = 0; pos < charCount; pos++) {
       chars[pos] = (char) RANDOM.nextInt(Character.MAX_VALUE);
     }
     return new String(chars);
   }

   /** Runs the shared Writable check over many random UTF8 instances. */
   public void testWritable() throws Exception {
     for (int round = 0; round < ROUNDS; round++) {
       UTF8 candidate = new UTF8(getTestString());
       TestWritable.testWritable(candidate);
     }
   }

   /**
    * Checks that the bytes produced by UTF8.getBytes decode back to the
    * original string with the JDK "UTF-8" charset.
    */
   public void testGetBytes() throws Exception {
     for (int round = 0; round < ROUNDS; round++) {
       String original = getTestString();
       byte[] encodedForm = UTF8.getBytes(original);
       String decoded = new String(encodedForm, "UTF-8");
       assertEquals(original, decoded);
     }
   }

   /**
    * Writes random strings with UTF8.writeString and reads them back three
    * ways: UTF8.readString, DataInput.readUTF, and the JDK "UTF-8" decoder.
    */
   public void testIO() throws Exception {
     DataOutputBuffer sink = new DataOutputBuffer();
     DataInputBuffer source = new DataInputBuffer();

     for (int round = 0; round < ROUNDS; round++) {
       String original = getTestString();

       // serialize into a freshly reset buffer
       sink.reset();
       UTF8.writeString(sink, original);

       // UTF8's own reader must give the string back
       source.reset(sink.getData(), sink.getLength());
       assertEquals(original, UTF8.readString(source));

       // so must the DataInput reader, starting again from the first byte
       source.reset(sink.getData(), sink.getLength());
       assertEquals(original, source.readUTF());

       // and so must the JDK decoder applied to the bytes after the offset
       String viaJdk = new String(
           sink.getData(), PAYLOAD_OFFSET, sink.getLength() - PAYLOAD_OFFSET,
           "UTF-8");
       assertEquals(original, viaJdk);
     }
   }

   /**
    * Checks that a string holding only the NUL char is written in a form the
    * JDK "UTF-8" decoder turns back into that same string.
    */
   public void testNullEncoding() throws Exception {
     String nul = String.valueOf('\0');

     DataOutputBuffer sink = new DataOutputBuffer();
     UTF8 writable = new UTF8(nul);
     writable.write(sink);

     String decoded = new String(
         sink.getData(), PAYLOAD_OFFSET, sink.getLength() - PAYLOAD_OFFSET,
         "UTF-8");
     assertEquals(nul, decoded);
   }

   /**
    * Test encoding and decoding of UTF8 outside the basic multilingual plane.
    *
    * This is a regression test for HADOOP-9103.
    */
   public void testNonBasicMultilingualPlane() throws Exception {
     // "CAT FACE" (U+1F431), written as its surrogate pair.
     // See http://www.fileformat.info/info/unicode/char/1f431/index.htm
     final String catFace = "\uD83D\uDC31";

     // The JDK encoder yields a single 4-byte sequence for it.
     final byte[] utf8Bytes = catFace.getBytes("UTF-8");
     assertEquals(4, utf8Bytes.length);
     assertEquals("f09f90b1", StringUtils.byteToHexString(utf8Bytes));

     // Our own decoder must map those bytes back to the same string.
     assertEquals(catFace, UTF8.fromBytes(utf8Bytes));
   }

   /**
    * Test that decoding invalid UTF8 throws an appropriate error message.
    */
   public void testInvalidUTF8() throws Exception {
     byte[] garbage = {
         0x01, 0x02, (byte) 0xff, (byte) 0xff, 0x01, 0x02, 0x03, 0x04, 0x05 };
     assertDecodeFails(garbage, "Invalid UTF8 at ffff01020304");
   }

   /**
    * Test for a 5-byte UTF8 sequence, which is now considered illegal.
    */
   public void test5ByteUtf8Sequence() throws Exception {
     byte[] fiveByteForm = {
         0x01, 0x02, (byte) 0xf8, (byte) 0x88, (byte) 0x80,
         (byte) 0x80, (byte) 0x80, 0x04, 0x05 };
     assertDecodeFails(fiveByteForm, "Invalid UTF8 at f88880808004");
   }

   /**
    * Test that decoding invalid UTF8 due to truncation yields the correct
    * exception type.
    */
   public void testInvalidUTF8Truncated() throws Exception {
     // First three bytes of the 4-byte CAT FACE sequence; the last is missing.
     byte[] cutShort = { (byte) 0xF0, (byte) 0x9F, (byte) 0x90 };
     assertDecodeFails(cutShort, "Truncated UTF8 at f09f90");
   }

   /**
    * Asserts that UTF8.fromBytes rejects the given bytes with a
    * UTFDataFormatException containing the given text.
    *
    * @param input bytes expected to be rejected
    * @param expectedText text the exception must contain
    */
   private static void assertDecodeFails(byte[] input, String expectedText)
       throws Exception {
     try {
       UTF8.fromBytes(input);
     } catch (UTFDataFormatException rejected) {
       GenericTestUtils.assertExceptionContains(expectedText, rejected);
       return;
     }
     fail("did not throw an exception");
   }
 }
	/**
	* Licensed to the Apache Software Foundation (ASF) under one
	* or more contributor license agreements. See the NOTICE file
	* distributed with this work for additional information
	* regarding copyright ownership. The ASF licenses this file
	* to you under the Apache License, Version 2.0 (the
	* "License"); you may not use this file except in compliance
	* with the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	package org.apache.hadoop.io;

	import junit.framework.TestCase;
	import java.io.IOException;
	import java.io.UTFDataFormatException;
	import java.util.Random;

	import org.apache.hadoop.test.GenericTestUtils;
	import org.apache.hadoop.util.StringUtils;

	/**
	 * Unit tests for the deprecated {@link UTF8} writable: round trips through
	 * its own encoder/decoder, agreement with the JDK decoders, and rejection of
	 * malformed byte sequences.
	 */
	@SuppressWarnings("deprecation")
	public class TestUTF8 extends TestCase {
	  /** How many random strings each randomized test exercises. */
	  private static final int ROUNDS = 10000;

	  /**
	   * Number of leading bytes skipped before handing serialized output to the
	   * JDK "UTF-8" decoder (presumably the length prefix -- confirm in UTF8).
	   */
	  private static final int PAYLOAD_OFFSET = 2;

	  private static final Random RANDOM = new Random();

	  public TestUTF8(String name) {
	    super(name);
	  }

	  /**
	   * Builds a random string for use as test input.
	   *
	   * The length is drawn from [0, 100) and every char from
	   * [0, Character.MAX_VALUE), so U+FFFF is never produced and surrogate
	   * code units may appear unpaired.
	   *
	   * @return the generated string, possibly empty
	   */
	  public static String getTestString() throws Exception {
	    final int charCount = RANDOM.nextInt(100);
	    final char[] chars = new char[charCount];
	    for (int pos = 0; pos < charCount; pos++) {
	      chars[pos] = (char) RANDOM.nextInt(Character.MAX_VALUE);
	    }
	    return new String(chars);
	  }

	  /** Runs the shared Writable check over many random UTF8 instances. */
	  public void testWritable() throws Exception {
	    for (int round = 0; round < ROUNDS; round++) {
	      UTF8 candidate = new UTF8(getTestString());
	      TestWritable.testWritable(candidate);
	    }
	  }

	  /**
	   * Checks that the bytes produced by UTF8.getBytes decode back to the
	   * original string with the JDK "UTF-8" charset.
	   */
	  public void testGetBytes() throws Exception {
	    for (int round = 0; round < ROUNDS; round++) {
	      String original = getTestString();
	      byte[] encodedForm = UTF8.getBytes(original);
	      String decoded = new String(encodedForm, "UTF-8");
	      assertEquals(original, decoded);
	    }
	  }

	  /**
	   * Writes random strings with UTF8.writeString and reads them back three
	   * ways: UTF8.readString, DataInput.readUTF, and the JDK "UTF-8" decoder.
	   */
	  public void testIO() throws Exception {
	    DataOutputBuffer sink = new DataOutputBuffer();
	    DataInputBuffer source = new DataInputBuffer();

	    for (int round = 0; round < ROUNDS; round++) {
	      String original = getTestString();

	      // serialize into a freshly reset buffer
	      sink.reset();
	      UTF8.writeString(sink, original);

	      // UTF8's own reader must give the string back
	      source.reset(sink.getData(), sink.getLength());
	      assertEquals(original, UTF8.readString(source));

	      // so must the DataInput reader, starting again from the first byte
	      source.reset(sink.getData(), sink.getLength());
	      assertEquals(original, source.readUTF());

	      // and so must the JDK decoder applied to the bytes after the offset
	      String viaJdk = new String(
	          sink.getData(), PAYLOAD_OFFSET, sink.getLength() - PAYLOAD_OFFSET,
	          "UTF-8");
	      assertEquals(original, viaJdk);
	    }
	  }

	  /**
	   * Checks that a string holding only the NUL char is written in a form the
	   * JDK "UTF-8" decoder turns back into that same string.
	   */
	  public void testNullEncoding() throws Exception {
	    String nul = String.valueOf('\0');

	    DataOutputBuffer sink = new DataOutputBuffer();
	    UTF8 writable = new UTF8(nul);
	    writable.write(sink);

	    String decoded = new String(
	        sink.getData(), PAYLOAD_OFFSET, sink.getLength() - PAYLOAD_OFFSET,
	        "UTF-8");
	    assertEquals(nul, decoded);
	  }

	  /**
	   * Test encoding and decoding of UTF8 outside the basic multilingual plane.
	   *
	   * This is a regression test for HADOOP-9103.
	   */
	  public void testNonBasicMultilingualPlane() throws Exception {
	    // "CAT FACE" (U+1F431), written as its surrogate pair.
	    // See http://www.fileformat.info/info/unicode/char/1f431/index.htm
	    final String catFace = "\uD83D\uDC31";

	    // The JDK encoder yields a single 4-byte sequence for it.
	    final byte[] utf8Bytes = catFace.getBytes("UTF-8");
	    assertEquals(4, utf8Bytes.length);
	    assertEquals("f09f90b1", StringUtils.byteToHexString(utf8Bytes));

	    // Our own decoder must map those bytes back to the same string.
	    assertEquals(catFace, UTF8.fromBytes(utf8Bytes));
	  }

	  /**
	   * Test that decoding invalid UTF8 throws an appropriate error message.
	   */
	  public void testInvalidUTF8() throws Exception {
	    byte[] garbage = {
	        0x01, 0x02, (byte) 0xff, (byte) 0xff, 0x01, 0x02, 0x03, 0x04, 0x05 };
	    assertDecodeFails(garbage, "Invalid UTF8 at ffff01020304");
	  }

	  /**
	   * Test for a 5-byte UTF8 sequence, which is now considered illegal.
	   */
	  public void test5ByteUtf8Sequence() throws Exception {
	    byte[] fiveByteForm = {
	        0x01, 0x02, (byte) 0xf8, (byte) 0x88, (byte) 0x80,
	        (byte) 0x80, (byte) 0x80, 0x04, 0x05 };
	    assertDecodeFails(fiveByteForm, "Invalid UTF8 at f88880808004");
	  }

	  /**
	   * Test that decoding invalid UTF8 due to truncation yields the correct
	   * exception type.
	   */
	  public void testInvalidUTF8Truncated() throws Exception {
	    // First three bytes of the 4-byte CAT FACE sequence; the last is missing.
	    byte[] cutShort = { (byte) 0xF0, (byte) 0x9F, (byte) 0x90 };
	    assertDecodeFails(cutShort, "Truncated UTF8 at f09f90");
	  }

	  /**
	   * Asserts that UTF8.fromBytes rejects the given bytes with a
	   * UTFDataFormatException containing the given text.
	   *
	   * @param input bytes expected to be rejected
	   * @param expectedText text the exception must contain
	   */
	  private static void assertDecodeFails(byte[] input, String expectedText)
	      throws Exception {
	    try {
	      UTF8.fromBytes(input);
	    } catch (UTFDataFormatException rejected) {
	      GenericTestUtils.assertExceptionContains(expectedText, rejected);
	      return;
	    }
	    fail("did not throw an exception");
	  }
	}