tools/gnmsg/modified_utf8.py - geode-native - Git at Google

 #!/usr/local/bin/python3

 # Licensed to the Apache Software Foundation (ASF) under one or more
 # contributor license agreements.  See the NOTICE file distributed with
 # this work for additional information regarding copyright ownership.
 # The ASF licenses this file to You under the Apache License, Version 2.0
 # (the "License"); you may not use this file except in compliance with
 # the License.  You may obtain a copy of the License at
 #
 #      http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # translated from: http://hg.openjdk.java.net/jdk8/jdk8/jdk/file/94cc251d0c45/src/share/npt/utf.c
 # Source:  https://gist.github.com/BarelyAliveMau5/000e7e453b6d4ebd0cb06f39bc2e7aec

 def utf8s_to_utf8m(string):
     """
     Re-encode standard UTF-8 bytes as modified UTF-8.

     The input is walked one sequence at a time:

     * a NUL byte is written as the two-byte pair 0xC0 0x80;
     * any other one-byte value, and every two- or three-byte sequence,
       is copied through unchanged;
     * a four-byte sequence is rewritten as two three-byte sequences
       (0xED 0xAx 0x.. followed by 0xED 0xBx 0x..);
     * a byte matching none of the lead-byte patterns tested below is
       skipped without producing output.

     Continuation bytes are not validated.  A multi-byte sequence that is
     cut off by the end of the input raises ``IndexError``.

     :param string: utf8 encoded string (indexing it must yield ints)
     :return: modified utf8 encoded string, as ``bytes``
     """
     out = []
     total = len(string)
     pos = 0
     while pos < total:
         lead = string[pos]

         if (lead & 0x80) == 0:
             # 0xxxxxxx: plain single byte, except NUL which gets the
             # overlong two-byte form.
             out.extend((0xC0, 0x80) if lead == 0 else (lead,))
             pos += 1

         elif (lead & 0xE0) == 0xC0:
             # 110xxxxx 10xxxxxx: copied verbatim.
             out.extend((lead, string[pos + 1]))
             pos += 2

         elif (lead & 0xF0) == 0xE0:
             # 1110xxxx 10xxxxxx 10xxxxxx: copied verbatim.
             out.extend((lead, string[pos + 1], string[pos + 2]))
             pos += 3

         elif (lead & 0xF8) == 0xF0:
             # 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx: split into two
             # three-byte sequences.
             second = string[pos + 1]
             third = string[pos + 2]
             fourth = string[pos + 3]

             # The four payload fields occupy disjoint bit ranges, so they
             # can simply be OR-ed together into the 21-bit value.
             code = (
                 ((lead & 0x07) << 18)
                 | ((second & 0x3F) << 12)
                 | ((third & 0x3F) << 6)
                 | (fourth & 0x3F)
             )

             out.extend((
                 # First half: 11101101 1010xxxx 10xxxxxx
                 0xED,
                 0xA0 | (((code >> 16) - 1) & 0x0F),
                 0x80 | ((code >> 10) & 0x3F),
                 # Second half: 11101101 1011xxxx 10xxxxxx
                 0xED,
                 0xB0 | ((code >> 6) & 0x0F),
                 # The last input byte is emitted as-is.
                 fourth,
             ))
             pos += 4

         else:
             # Not a recognised lead byte: drop it and move on.
             pos += 1

     return bytes(out)


 def utf8m_to_utf8s(string):
     """
     Convert modified UTF-8 bytes back into standard UTF-8 bytes.

     The input is walked one sequence at a time:

     * a one-byte value is copied through;
     * the two-byte pair 0xC0 0x80 becomes a single NUL byte, any other
       two-byte sequence is copied through;
     * two consecutive three-byte sequences of the form
       0xED 0xAx 0x.. / 0xED 0xBx 0x.. (a UTF-16 surrogate pair) are merged
       into one four-byte sequence;
     * any other three-byte sequence is copied through;
     * a byte matching none of the lead-byte patterns tested below
       (including 0xF0 and above) is skipped without producing output.

     Continuation bytes are not validated.

     :param string: modified utf8 encoded string (indexing it must yield ints)
     :return: utf8 encoded string, as ``bytes``
     :raises IndexError: if a two- or three-byte sequence is cut off by the
         end of the input
     """
     new_string = []
     length = len(string)
     i = 0
     while i < length:
         byte1 = string[i]
         if (byte1 & 0x80) == 0:  # 1byte encoding
             new_string.append(byte1)
         elif (byte1 & 0xE0) == 0xC0:  # 2byte encoding
             i += 1
             byte2 = string[i]
             if byte1 == 0xC0 and byte2 == 0x80:
                 # Overlong encoding of NUL used by modified UTF-8
                 new_string.append(0)
             else:
                 new_string.append(byte1)
                 new_string.append(byte2)
         elif (byte1 & 0xF0) == 0xE0:  # 3byte encoding
             i += 1
             byte2 = string[i]
             i += 1
             byte3 = string[i]

             # See if this is a pair of 3byte encodings; only look ahead
             # when three more bytes are actually available.
             is_pair = False
             if i + 3 < length and byte1 == 0xED and (byte2 & 0xF0) == 0xA0:
                 byte4 = string[i + 1]
                 byte5 = string[i + 2]
                 byte6 = string[i + 3]
                 is_pair = byte4 == 0xED and (byte5 & 0xF0) == 0xB0

             if is_pair:
                 # Bits in: 11101101 1010xxxx 10xxxxxx
                 # Bits in: 11101101 1011xxxx 10xxxxxx

                 # Reconstruct 21 bit code
                 u21 = ((byte2 & 0x0F) + 1) << 16
                 u21 += (byte3 & 0x3F) << 10
                 u21 += (byte5 & 0x0F) << 6
                 u21 += byte6 & 0x3F

                 # Bits out: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
                 new_string.append(0xF0 + ((u21 >> 18) & 0x07))
                 new_string.append(0x80 + ((u21 >> 12) & 0x3F))
                 new_string.append(0x80 + ((u21 >> 6) & 0x3F))
                 new_string.append(0x80 + (u21 & 0x3F))

                 # Move onto the last byte of the second sequence.  No
                 # `continue` here: the shared increment below must still
                 # run, otherwise that byte would be examined again as a
                 # lead byte on the next iteration.
                 i += 3
             else:
                 # Normal 3byte encoding
                 new_string.append(byte1)
                 new_string.append(byte2)
                 new_string.append(byte3)
         i += 1
     return bytes(new_string)
	#!/usr/local/bin/python3

	# Licensed to the Apache Software Foundation (ASF) under one or more
	# contributor license agreements. See the NOTICE file distributed with
	# this work for additional information regarding copyright ownership.
	# The ASF licenses this file to You under the Apache License, Version 2.0
	# (the "License"); you may not use this file except in compliance with
	# the License. You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.
	# translated from: http://hg.openjdk.java.net/jdk8/jdk8/jdk/file/94cc251d0c45/src/share/npt/utf.c
	# Source: https://gist.github.com/BarelyAliveMau5/000e7e453b6d4ebd0cb06f39bc2e7aec

	def utf8s_to_utf8m(string):
	    """
	    Convert standard UTF-8 bytes into modified UTF-8.

	    NUL bytes are written as the two-byte pair 0xC0 0x80 and four-byte
	    sequences are rewritten as two three-byte sequences; other one-,
	    two- and three-byte sequences are copied through unchanged.  A byte
	    matching none of the lead-byte patterns tested below is skipped
	    without producing output.  Continuation bytes are not validated.

	    :param string: utf8 encoded string (indexing it must yield ints)
	    :return: modified utf8 encoded string, as ``bytes``
	    :raises IndexError: if a multi-byte sequence is cut off by the end
	        of the input
	    """
	    new_str = []
	    i = 0
	    while i < len(string):
	        byte1 = string[i]
	        # NULL bytes and bytes starting with 11110xxx are special
	        if (byte1 & 0x80) == 0:
	            if byte1 == 0:
	                # Overlong two-byte form of NUL
	                new_str.append(0xC0)
	                new_str.append(0x80)
	            else:
	                # Single byte
	                new_str.append(byte1)

	        elif (byte1 & 0xE0) == 0xC0:  # 2byte encoding
	            new_str.append(byte1)
	            i += 1
	            new_str.append(string[i])

	        elif (byte1 & 0xF0) == 0xE0:  # 3byte encoding
	            new_str.append(byte1)
	            i += 1
	            new_str.append(string[i])
	            i += 1
	            new_str.append(string[i])

	        elif (byte1 & 0xF8) == 0xF0:  # 4byte encoding
	            # Beginning of 4byte encoding, turn into 2 3byte encodings
	            # Bits in: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
	            i += 1
	            byte2 = string[i]
	            i += 1
	            byte3 = string[i]
	            i += 1
	            byte4 = string[i]

	            # Reconstruct full 21bit value
	            u21 = (byte1 & 0x07) << 18
	            u21 += (byte2 & 0x3F) << 12
	            u21 += (byte3 & 0x3F) << 6
	            u21 += byte4 & 0x3F

	            # Bits out: 11101101 1010xxxx 10xxxxxx
	            new_str.append(0xED)
	            new_str.append((0xA0 + (((u21 >> 16) - 1) & 0x0F)))
	            new_str.append((0x80 + ((u21 >> 10) & 0x3F)))

	            # Bits out: 11101101 1011xxxx 10xxxxxx
	            new_str.append(0xED)
	            new_str.append((0xB0 + ((u21 >> 6) & 0x0F)))
	            # The last input byte is emitted as-is
	            new_str.append(byte4)
	        # Step past the last byte consumed by this iteration
	        i += 1
	    return bytes(new_str)


	def utf8m_to_utf8s(string):
	    """
	    Convert modified UTF-8 bytes back into standard UTF-8 bytes.

	    The input is walked one sequence at a time:

	    * a one-byte value is copied through;
	    * the two-byte pair 0xC0 0x80 becomes a single NUL byte, any other
	      two-byte sequence is copied through;
	    * two consecutive three-byte sequences of the form
	      0xED 0xAx 0x.. / 0xED 0xBx 0x.. (a UTF-16 surrogate pair) are merged
	      into one four-byte sequence;
	    * any other three-byte sequence is copied through;
	    * a byte matching none of the lead-byte patterns tested below
	      (including 0xF0 and above) is skipped without producing output.

	    Continuation bytes are not validated.

	    :param string: modified utf8 encoded string (indexing it must yield ints)
	    :return: utf8 encoded string, as ``bytes``
	    :raises IndexError: if a two- or three-byte sequence is cut off by the
	        end of the input
	    """
	    new_string = []
	    length = len(string)
	    i = 0
	    while i < length:
	        byte1 = string[i]
	        if (byte1 & 0x80) == 0:  # 1byte encoding
	            new_string.append(byte1)
	        elif (byte1 & 0xE0) == 0xC0:  # 2byte encoding
	            i += 1
	            byte2 = string[i]
	            if byte1 == 0xC0 and byte2 == 0x80:
	                # Overlong encoding of NUL used by modified UTF-8
	                new_string.append(0)
	            else:
	                new_string.append(byte1)
	                new_string.append(byte2)
	        elif (byte1 & 0xF0) == 0xE0:  # 3byte encoding
	            i += 1
	            byte2 = string[i]
	            i += 1
	            byte3 = string[i]

	            # See if this is a pair of 3byte encodings; only look ahead
	            # when three more bytes are actually available.
	            is_pair = False
	            if i + 3 < length and byte1 == 0xED and (byte2 & 0xF0) == 0xA0:
	                byte4 = string[i + 1]
	                byte5 = string[i + 2]
	                byte6 = string[i + 3]
	                is_pair = byte4 == 0xED and (byte5 & 0xF0) == 0xB0

	            if is_pair:
	                # Bits in: 11101101 1010xxxx 10xxxxxx
	                # Bits in: 11101101 1011xxxx 10xxxxxx

	                # Reconstruct 21 bit code
	                u21 = ((byte2 & 0x0F) + 1) << 16
	                u21 += (byte3 & 0x3F) << 10
	                u21 += (byte5 & 0x0F) << 6
	                u21 += byte6 & 0x3F

	                # Bits out: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
	                new_string.append(0xF0 + ((u21 >> 18) & 0x07))
	                new_string.append(0x80 + ((u21 >> 12) & 0x3F))
	                new_string.append(0x80 + ((u21 >> 6) & 0x3F))
	                new_string.append(0x80 + (u21 & 0x3F))

	                # Move onto the last byte of the second sequence.  No
	                # `continue` here: the shared increment below must still
	                # run, otherwise that byte would be examined again as a
	                # lead byte on the next iteration.
	                i += 3
	            else:
	                # Normal 3byte encoding
	                new_string.append(byte1)
	                new_string.append(byte2)
	                new_string.append(byte3)
	        i += 1
	    return bytes(new_string)