src/java/org/apache/cassandra/serializers/UTF8Serializer.java - cassandra - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
  * regarding copyright ownership.  The ASF licenses this file
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.cassandra.serializers;

 import java.nio.ByteBuffer;
 import java.nio.charset.StandardCharsets;

 /**
  * Text serializer bound to {@link StandardCharsets#UTF_8}, with a byte-level
  * validator that checks a buffer without decoding it into a {@code String}.
  */
 public class UTF8Serializer extends AbstractTextSerializer
 {
     public static final UTF8Serializer instance = new UTF8Serializer();

     private UTF8Serializer()
     {
         super(StandardCharsets.UTF_8);
     }

     /**
      * Checks that {@code bytes} is accepted by {@link UTF8Validator#validate(ByteBuffer)}.
      *
      * @param bytes buffer to check; its position and limit are not modified
      * @throws MarshalException if {@code bytes} is null or the validator rejects its content
      */
     public void validate(ByteBuffer bytes) throws MarshalException
     {
         if (!UTF8Validator.validate(bytes))
             throw new MarshalException("String didn't validate.");
     }

     /**
      * Small state machine that walks the buffer one byte at a time.
      *
      * Accepted sequences:
      * <ul>
      *   <li>0x00-0x7f</li>
      *   <li>0xc2-0xdf, 0x80-0xbf</li>
      *   <li>0xc0, 0x80 (the "modified UTF-8" encoding of NUL)</li>
      *   <li>0xe0, 0xa0-0xbf, 0x80-0xbf</li>
      *   <li>0xe1-0xef, 0x80-0xbf, 0x80-0xbf</li>
      *   <li>0xf0, 0x90-0xbf, 0x80-0xbf, 0x80-0xbf</li>
      *   <li>0xf1-0xf7, 0x80-0xbf, 0x80-0xbf, 0x80-0xbf</li>
      * </ul>
      *
      * NOTE(review): lead bytes 0xf5-0xf7, 0xf4 followed by 0x90-0xbf, and 0xed followed by
      * 0xa0-0xbf (surrogates) are accepted here exactly as before, although RFC 3629 excludes
      * them. Confirm whether existing data relies on that before tightening further.
      */
     static class UTF8Validator
     {
         /** Names describe which byte range(s) are still expected to finish the current character. */
         enum State
         {
             START,
             TWO,
             TWO_80,
             THREE_a0bf,
             THREE_80bf_1,
             THREE_80bf_2,
             FOUR_90bf,
             FOUR_80bf_3,
         }

         /**
          * Validates the remaining bytes of {@code buf}.
          *
          * Since nothing is converted to a Java string, surrogates are not given special treatment.
          *
          * @param buf buffer to check; it is sliced internally, so the caller's position is untouched
          * @return {@code true} if every byte belongs to a complete accepted sequence (an empty
          *         buffer is valid); {@code false} if {@code buf} is null, a byte is out of range,
          *         or the buffer ends in the middle of a multi-byte character
          */
         static boolean validate(ByteBuffer buf)
         {
             if (buf == null)
                 return false;

             // Iterate over a slice so the relative get() calls never move the caller's position.
             ByteBuffer view = buf.slice();
             State state = State.START;
             while (view.hasRemaining())
             {
                 // Sign-extended byte: 0x80-0xff show up as negative ints, which is what the
                 // arithmetic shifts below rely on.
                 int b = view.get();
                 switch (state)
                 {
                     case START:
                         if (b >= 0)
                             break; // ascii, state stays start.

                         if ((b >> 5) == -2)
                         {
                             // first byte of a 2-byte char, 0xc0-0xdf
                             if (b == (byte) 0xc0)
                                 // special case: modified utf8 null is 0xc080.
                                 state = State.TWO_80;
                             else if ((b & 0x1e) == 0)
                                 return false; // 0xc1 can only start an overlong encoding.
                             else
                                 state = State.TWO;
                         }
                         else if ((b >> 4) == -2)
                         {
                             // 3 bytes: 0xe0,0xa0-0xbf,0x80-0xbf or 0xe1-0xef,0x80-0xbf,0x80-0xbf.
                             state = (b == (byte) 0xe0) ? State.THREE_a0bf : State.THREE_80bf_2;
                         }
                         else if ((b >> 3) == -2)
                         {
                             // 4 bytes: 0xf0,0x90-0xbf,... or 0xf1-0xf7,0x80-0xbf,...
                             state = (b == (byte) 0xf0) ? State.FOUR_90bf : State.FOUR_80bf_3;
                         }
                         else
                         {
                             return false; // stray continuation byte (0x80-0xbf) or 0xf8-0xff.
                         }
                         break;
                     case TWO:
                         // second byte of 2-byte char, 0x80-0xbf
                         if (!isContinuation(b))
                             return false;
                         state = State.START;
                         break;
                     case TWO_80:
                         if (b != (byte) 0x80)
                             return false;
                         state = State.START;
                         break;
                     case THREE_a0bf:
                         // expecting exactly 0xa0-0xbf. The previous test only excluded 0x80-0x9f,
                         // which let non-continuation bytes (e.g. 0x41 or 0xc0) slip through.
                         if ((b & 0xe0) != 0xa0)
                             return false;
                         state = State.THREE_80bf_1;
                         break;
                     case THREE_80bf_1:
                         // expecting 0x80-0xbf, which completes the character.
                         if (!isContinuation(b))
                             return false;
                         state = State.START;
                         break;
                     case THREE_80bf_2:
                         // expecting 0x80-0xbf and then another of the same.
                         if (!isContinuation(b))
                             return false;
                         state = State.THREE_80bf_1;
                         break;
                     case FOUR_90bf:
                         // expecting 0x90-0xbf: a continuation byte that is not 0x80-0x8f. The
                         // previous test looked at bits 0x30 only, so bytes such as 0x50 or 0xd0
                         // were accepted. After this it degrades to 80-bf,80-bf (like a 3-byte seq).
                         if (!isContinuation(b) || (b & 0x30) == 0)
                             return false;
                         state = State.THREE_80bf_2;
                         break;
                     case FOUR_80bf_3:
                         // expecting 0x80-0xbf three times; degenerates to THREE_80bf_2.
                         if (!isContinuation(b))
                             return false;
                         state = State.THREE_80bf_2;
                         break;
                     default:
                         return false; // invalid state.
                 }
             }
             // if state != start, we've got underflow. that's an error.
             return state == State.START;
         }

         /** Returns true when {@code b} (a sign-extended byte) lies in 0x80-0xbf. */
         private static boolean isContinuation(int b)
         {
             return (b & 0xc0) == 0x80;
         }
     }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one
	* or more contributor license agreements. See the NOTICE file
	* distributed with this work for additional information
	* regarding copyright ownership. The ASF licenses this file
	* to you under the Apache License, Version 2.0 (the
	* "License"); you may not use this file except in compliance
	* with the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package org.apache.cassandra.serializers;

	import java.nio.ByteBuffer;
	import java.nio.charset.StandardCharsets;

	/**
	 * Text serializer bound to {@link StandardCharsets#UTF_8}, with a byte-level
	 * validator that checks a buffer without decoding it into a {@code String}.
	 */
	public class UTF8Serializer extends AbstractTextSerializer
	{
	    public static final UTF8Serializer instance = new UTF8Serializer();

	    private UTF8Serializer()
	    {
	        super(StandardCharsets.UTF_8);
	    }

	    /**
	     * Checks that {@code bytes} is accepted by {@link UTF8Validator#validate(ByteBuffer)}.
	     *
	     * @param bytes buffer to check; its position and limit are not modified
	     * @throws MarshalException if {@code bytes} is null or the validator rejects its content
	     */
	    public void validate(ByteBuffer bytes) throws MarshalException
	    {
	        if (!UTF8Validator.validate(bytes))
	            throw new MarshalException("String didn't validate.");
	    }

	    /**
	     * Small state machine that walks the buffer one byte at a time.
	     *
	     * Accepted sequences:
	     * <ul>
	     *   <li>0x00-0x7f</li>
	     *   <li>0xc2-0xdf, 0x80-0xbf</li>
	     *   <li>0xc0, 0x80 (the "modified UTF-8" encoding of NUL)</li>
	     *   <li>0xe0, 0xa0-0xbf, 0x80-0xbf</li>
	     *   <li>0xe1-0xef, 0x80-0xbf, 0x80-0xbf</li>
	     *   <li>0xf0, 0x90-0xbf, 0x80-0xbf, 0x80-0xbf</li>
	     *   <li>0xf1-0xf7, 0x80-0xbf, 0x80-0xbf, 0x80-0xbf</li>
	     * </ul>
	     *
	     * NOTE(review): lead bytes 0xf5-0xf7, 0xf4 followed by 0x90-0xbf, and 0xed followed by
	     * 0xa0-0xbf (surrogates) are accepted here exactly as before, although RFC 3629 excludes
	     * them. Confirm whether existing data relies on that before tightening further.
	     */
	    static class UTF8Validator
	    {
	        /** Names describe which byte range(s) are still expected to finish the current character. */
	        enum State
	        {
	            START,
	            TWO,
	            TWO_80,
	            THREE_a0bf,
	            THREE_80bf_1,
	            THREE_80bf_2,
	            FOUR_90bf,
	            FOUR_80bf_3,
	        }

	        /**
	         * Validates the remaining bytes of {@code buf}.
	         *
	         * Since nothing is converted to a Java string, surrogates are not given special treatment.
	         *
	         * @param buf buffer to check; it is sliced internally, so the caller's position is untouched
	         * @return {@code true} if every byte belongs to a complete accepted sequence (an empty
	         *         buffer is valid); {@code false} if {@code buf} is null, a byte is out of range,
	         *         or the buffer ends in the middle of a multi-byte character
	         */
	        static boolean validate(ByteBuffer buf)
	        {
	            if (buf == null)
	                return false;

	            // Iterate over a slice so the relative get() calls never move the caller's position.
	            ByteBuffer view = buf.slice();
	            State state = State.START;
	            while (view.hasRemaining())
	            {
	                // Sign-extended byte: 0x80-0xff show up as negative ints, which is what the
	                // arithmetic shifts below rely on.
	                int b = view.get();
	                switch (state)
	                {
	                    case START:
	                        if (b >= 0)
	                            break; // ascii, state stays start.

	                        if ((b >> 5) == -2)
	                        {
	                            // first byte of a 2-byte char, 0xc0-0xdf
	                            if (b == (byte) 0xc0)
	                                // special case: modified utf8 null is 0xc080.
	                                state = State.TWO_80;
	                            else if ((b & 0x1e) == 0)
	                                return false; // 0xc1 can only start an overlong encoding.
	                            else
	                                state = State.TWO;
	                        }
	                        else if ((b >> 4) == -2)
	                        {
	                            // 3 bytes: 0xe0,0xa0-0xbf,0x80-0xbf or 0xe1-0xef,0x80-0xbf,0x80-0xbf.
	                            state = (b == (byte) 0xe0) ? State.THREE_a0bf : State.THREE_80bf_2;
	                        }
	                        else if ((b >> 3) == -2)
	                        {
	                            // 4 bytes: 0xf0,0x90-0xbf,... or 0xf1-0xf7,0x80-0xbf,...
	                            state = (b == (byte) 0xf0) ? State.FOUR_90bf : State.FOUR_80bf_3;
	                        }
	                        else
	                        {
	                            return false; // stray continuation byte (0x80-0xbf) or 0xf8-0xff.
	                        }
	                        break;
	                    case TWO:
	                        // second byte of 2-byte char, 0x80-0xbf
	                        if (!isContinuation(b))
	                            return false;
	                        state = State.START;
	                        break;
	                    case TWO_80:
	                        if (b != (byte) 0x80)
	                            return false;
	                        state = State.START;
	                        break;
	                    case THREE_a0bf:
	                        // expecting exactly 0xa0-0xbf. The previous test only excluded 0x80-0x9f,
	                        // which let non-continuation bytes (e.g. 0x41 or 0xc0) slip through.
	                        if ((b & 0xe0) != 0xa0)
	                            return false;
	                        state = State.THREE_80bf_1;
	                        break;
	                    case THREE_80bf_1:
	                        // expecting 0x80-0xbf, which completes the character.
	                        if (!isContinuation(b))
	                            return false;
	                        state = State.START;
	                        break;
	                    case THREE_80bf_2:
	                        // expecting 0x80-0xbf and then another of the same.
	                        if (!isContinuation(b))
	                            return false;
	                        state = State.THREE_80bf_1;
	                        break;
	                    case FOUR_90bf:
	                        // expecting 0x90-0xbf: a continuation byte that is not 0x80-0x8f. The
	                        // previous test looked at bits 0x30 only, so bytes such as 0x50 or 0xd0
	                        // were accepted. After this it degrades to 80-bf,80-bf (like a 3-byte seq).
	                        if (!isContinuation(b) || (b & 0x30) == 0)
	                            return false;
	                        state = State.THREE_80bf_2;
	                        break;
	                    case FOUR_80bf_3:
	                        // expecting 0x80-0xbf three times; degenerates to THREE_80bf_2.
	                        if (!isContinuation(b))
	                            return false;
	                        state = State.THREE_80bf_2;
	                        break;
	                    default:
	                        return false; // invalid state.
	                }
	            }
	            // if state != start, we've got underflow. that's an error.
	            return state == State.START;
	        }

	        /** Returns true when {@code b} (a sign-extended byte) lies in 0x80-0xbf. */
	        private static boolean isContinuation(int b)
	        {
	            return (b & 0xc0) == 0x80;
	        }
	    }
	}