src/main/java/org/apache/xml/security/c14n/implementations/UtfHelpper.java - santuario-java - Git at Google

 /**
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements. See the NOTICE file
  * distributed with this work for additional information
  * regarding copyright ownership. The ASF licenses this file
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License. You may obtain a copy of the License at
  *
  * http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing,
  * software distributed under the License is distributed on an
  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  * KIND, either express or implied. See the License for the
  * specific language governing permissions and limitations
  * under the License.
  */
 package org.apache.xml.security.c14n.implementations;

 import java.io.IOException;
 import java.io.OutputStream;
 import java.security.AccessController;
 import java.security.PrivilegedAction;
 import java.util.Map;

 public final class UtfHelpper {

     /**
      * Reverts to the old behavior (version 2 or before), i.e. each surrogate pair becomes
      * '??' in the output. Set the system property org.apache.xml.security.c14n.oldUtf8=true
      * if you want to verify signatures generated by version 2 or before over XML documents
      * that contain characters outside the Basic Multilingual Plane.
      */
     private static final boolean OLD_UTF8 =
         AccessController.doPrivileged((PrivilegedAction<Boolean>)
             () -> Boolean.getBoolean("org.apache.xml.security.c14n.oldUtf8"));

     /** Byte ('?') emitted in place of anything that cannot be encoded. */
     private static final int REPLACEMENT = 0x3f;

     private UtfHelpper() {
         // static utility class, never instantiated
     }

     /**
      * Writes the UTF-8 form of {@code str} to {@code out}, consulting {@code cache} first.
      * On a cache miss the string is encoded with {@link #getStringInUtf8(String)} and the
      * resulting array is stored in {@code cache} under {@code str} before being written.
      *
      * @param str the string to write
      * @param out the stream receiving the encoded bytes
      * @param cache map from string to its already encoded bytes; it is read and updated
      * @throws IOException if writing to {@code out} fails
      */
     public static void writeByte(
         final String str,
         final OutputStream out,
         Map<String, byte[]> cache
     ) throws IOException {
         byte[] result = cache.get(str);
         if (result == null) {
             result = getStringInUtf8(str);
             cache.put(str, result);
         }

         out.write(result);
     }

     /**
      * Writes a single code point to {@code out} as UTF-8.
      * <p>
      * A value that is not a valid code point, or that lies in the surrogate range
      * (0xD800 - 0xDFFF), is written as a single '?'. When the legacy switch
      * {@code OLD_UTF8} is set, a supplementary code point is written as '??'.
      *
      * @param c the code point to encode
      * @param out the stream receiving the encoded bytes
      * @throws IOException if writing to {@code out} fails
      */
     public static void writeCodePointToUtf8(final int c, final OutputStream out) throws IOException {
         // NOTE: keep this ladder in sync with encode(int, byte[], int) and encodedLength(int).
         if (!isEncodable(c)) {
             out.write(REPLACEMENT);
         } else if (c < 0x80) {
             // 0x0000 - 0x007F: 0xxxxxxx
             out.write(c);
         } else if (c < 0x800) {
             // 0x0080 - 0x07FF: 110xxxxx 10xxxxxx
             out.write(0xC0 | (c >>> 6));
             out.write(0x80 | (c & 0x3F));
         } else if (c < Character.MIN_SUPPLEMENTARY_CODE_POINT) {
             // 0x0800 - 0xFFFF: 1110xxxx 10xxxxxx 10xxxxxx
             out.write(0xE0 | (c >>> 12));
             out.write(0x80 | ((c >>> 6) & 0x3F));
             out.write(0x80 | (c & 0x3F));
         } else if (OLD_UTF8) {
             // version 2 or before wrote two question marks for a supplementary character
             out.write(REPLACEMENT);
             out.write(REPLACEMENT);
         } else {
             // 0x10000 - 0x10FFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
             // isEncodable() already capped c at 0x10FFFF, so no longer form can occur.
             out.write(0xF0 | (c >>> 18));
             out.write(0x80 | ((c >>> 12) & 0x3F));
             out.write(0x80 | ((c >>> 6) & 0x3F));
             out.write(0x80 | (c & 0x3F));
         }
     }

     /**
      * Writes {@code str} to {@code out} as UTF-8, one code point at a time, applying the
      * same rules as {@link #writeCodePointToUtf8(int, OutputStream)}. In particular an
      * unpaired surrogate in {@code str} is written as '?'.
      *
      * @param str the string to encode
      * @param out the stream receiving the encoded bytes
      * @throws IOException if writing to {@code out} fails
      */
     public static void writeStringToUtf8(
         final String str, final OutputStream out
     ) throws IOException {
         final int length = str.length();
         int i = 0;
         while (i < length) {
             final int c = str.codePointAt(i);
             i += Character.charCount(c);
             writeCodePointToUtf8(c, out);
         }
     }

     /**
      * Encodes {@code str} as UTF-8 and returns the bytes in a new, exactly sized array.
      * The output is byte-for-byte what {@link #writeStringToUtf8(String, OutputStream)}
      * would write for the same string.
      *
      * @param str the string to encode
      * @return the encoded bytes; an empty array for an empty string
      * @throws ArithmeticException if the encoded form would be longer than
      *         {@link Integer#MAX_VALUE} bytes
      */
     public static byte[] getStringInUtf8(final String str) {
         final int length = str.length();

         // First pass: determine the exact size, so that no oversized scratch array
         // (and no overflowing size computation) is needed.
         int size = 0;
         for (int i = 0; i < length;) {
             final int c = str.codePointAt(i);
             i += Character.charCount(c);
             size = Math.addExact(size, encodedLength(c));
         }

         // Second pass: encode straight into the final array.
         final byte[] result = new byte[size];
         int pos = 0;
         for (int i = 0; i < length;) {
             final int c = str.codePointAt(i);
             i += Character.charCount(c);
             pos = encode(c, result, pos);
         }
         return result;
     }

     /** Tells whether {@code c} is a valid code point outside the surrogate range. */
     private static boolean isEncodable(final int c) {
         return Character.isValidCodePoint(c)
             && (c < Character.MIN_SURROGATE || c > Character.MAX_SURROGATE);
     }

     /** Returns how many bytes this class emits for {@code c}, replacements included. */
     private static int encodedLength(final int c) {
         if (!isEncodable(c) || c < 0x80) {
             return 1;
         }
         if (c < 0x800) {
             return 2;
         }
         if (c < Character.MIN_SUPPLEMENTARY_CODE_POINT) {
             return 3;
         }
         return OLD_UTF8 ? 2 : 4;
     }

     /** Encodes {@code c} into {@code dst} starting at {@code offset}; returns the next free index. */
     private static int encode(final int c, final byte[] dst, final int offset) {
         int pos = offset;
         if (!isEncodable(c)) {
             dst[pos++] = (byte) REPLACEMENT;
         } else if (c < 0x80) {
             dst[pos++] = (byte) c;
         } else if (c < 0x800) {
             dst[pos++] = (byte) (0xC0 | (c >>> 6));
             dst[pos++] = (byte) (0x80 | (c & 0x3F));
         } else if (c < Character.MIN_SUPPLEMENTARY_CODE_POINT) {
             dst[pos++] = (byte) (0xE0 | (c >>> 12));
             dst[pos++] = (byte) (0x80 | ((c >>> 6) & 0x3F));
             dst[pos++] = (byte) (0x80 | (c & 0x3F));
         } else if (OLD_UTF8) {
             dst[pos++] = (byte) REPLACEMENT;
             dst[pos++] = (byte) REPLACEMENT;
         } else {
             dst[pos++] = (byte) (0xF0 | (c >>> 18));
             dst[pos++] = (byte) (0x80 | ((c >>> 12) & 0x3F));
             dst[pos++] = (byte) (0x80 | ((c >>> 6) & 0x3F));
             dst[pos++] = (byte) (0x80 | (c & 0x3F));
         }
         return pos;
     }
 }
	/**
	* Licensed to the Apache Software Foundation (ASF) under one
	* or more contributor license agreements. See the NOTICE file
	* distributed with this work for additional information
	* regarding copyright ownership. The ASF licenses this file
	* to you under the Apache License, Version 2.0 (the
	* "License"); you may not use this file except in compliance
	* with the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing,
	* software distributed under the License is distributed on an
	* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	* KIND, either express or implied. See the License for the
	* specific language governing permissions and limitations
	* under the License.
	*/
	package org.apache.xml.security.c14n.implementations;

	import java.io.IOException;
	import java.io.OutputStream;
	import java.security.AccessController;
	import java.security.PrivilegedAction;
	import java.util.Map;

	public final class UtfHelpper {

	    /**
	     * Reverts to the old behavior (version 2 or before), i.e. each surrogate pair becomes
	     * '??' in the output. Set the system property org.apache.xml.security.c14n.oldUtf8=true
	     * if you want to verify signatures generated by version 2 or before over XML documents
	     * that contain characters outside the Basic Multilingual Plane.
	     */
	    private static final boolean OLD_UTF8 =
	        AccessController.doPrivileged((PrivilegedAction<Boolean>)
	            () -> Boolean.getBoolean("org.apache.xml.security.c14n.oldUtf8"));

	    /** Byte ('?') emitted in place of anything that cannot be encoded. */
	    private static final int REPLACEMENT = 0x3f;

	    private UtfHelpper() {
	        // static utility class, never instantiated
	    }

	    /**
	     * Writes the UTF-8 form of {@code str} to {@code out}, consulting {@code cache} first.
	     * On a cache miss the string is encoded with {@link #getStringInUtf8(String)} and the
	     * resulting array is stored in {@code cache} under {@code str} before being written.
	     *
	     * @param str the string to write
	     * @param out the stream receiving the encoded bytes
	     * @param cache map from string to its already encoded bytes; it is read and updated
	     * @throws IOException if writing to {@code out} fails
	     */
	    public static void writeByte(
	        final String str,
	        final OutputStream out,
	        Map<String, byte[]> cache
	    ) throws IOException {
	        byte[] result = cache.get(str);
	        if (result == null) {
	            result = getStringInUtf8(str);
	            cache.put(str, result);
	        }

	        out.write(result);
	    }

	    /**
	     * Writes a single code point to {@code out} as UTF-8.
	     * <p>
	     * A value that is not a valid code point, or that lies in the surrogate range
	     * (0xD800 - 0xDFFF), is written as a single '?'. When the legacy switch
	     * {@code OLD_UTF8} is set, a supplementary code point is written as '??'.
	     *
	     * @param c the code point to encode
	     * @param out the stream receiving the encoded bytes
	     * @throws IOException if writing to {@code out} fails
	     */
	    public static void writeCodePointToUtf8(final int c, final OutputStream out) throws IOException {
	        // NOTE: keep this ladder in sync with encode(int, byte[], int) and encodedLength(int).
	        if (!isEncodable(c)) {
	            out.write(REPLACEMENT);
	        } else if (c < 0x80) {
	            // 0x0000 - 0x007F: 0xxxxxxx
	            out.write(c);
	        } else if (c < 0x800) {
	            // 0x0080 - 0x07FF: 110xxxxx 10xxxxxx
	            out.write(0xC0 | (c >>> 6));
	            out.write(0x80 | (c & 0x3F));
	        } else if (c < Character.MIN_SUPPLEMENTARY_CODE_POINT) {
	            // 0x0800 - 0xFFFF: 1110xxxx 10xxxxxx 10xxxxxx
	            out.write(0xE0 | (c >>> 12));
	            out.write(0x80 | ((c >>> 6) & 0x3F));
	            out.write(0x80 | (c & 0x3F));
	        } else if (OLD_UTF8) {
	            // version 2 or before wrote two question marks for a supplementary character
	            out.write(REPLACEMENT);
	            out.write(REPLACEMENT);
	        } else {
	            // 0x10000 - 0x10FFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
	            // isEncodable() already capped c at 0x10FFFF, so no longer form can occur.
	            out.write(0xF0 | (c >>> 18));
	            out.write(0x80 | ((c >>> 12) & 0x3F));
	            out.write(0x80 | ((c >>> 6) & 0x3F));
	            out.write(0x80 | (c & 0x3F));
	        }
	    }

	    /**
	     * Writes {@code str} to {@code out} as UTF-8, one code point at a time, applying the
	     * same rules as {@link #writeCodePointToUtf8(int, OutputStream)}. In particular an
	     * unpaired surrogate in {@code str} is written as '?'.
	     *
	     * @param str the string to encode
	     * @param out the stream receiving the encoded bytes
	     * @throws IOException if writing to {@code out} fails
	     */
	    public static void writeStringToUtf8(
	        final String str, final OutputStream out
	    ) throws IOException {
	        final int length = str.length();
	        int i = 0;
	        while (i < length) {
	            final int c = str.codePointAt(i);
	            i += Character.charCount(c);
	            writeCodePointToUtf8(c, out);
	        }
	    }

	    /**
	     * Encodes {@code str} as UTF-8 and returns the bytes in a new, exactly sized array.
	     * The output is byte-for-byte what {@link #writeStringToUtf8(String, OutputStream)}
	     * would write for the same string.
	     *
	     * @param str the string to encode
	     * @return the encoded bytes; an empty array for an empty string
	     * @throws ArithmeticException if the encoded form would be longer than
	     *         {@link Integer#MAX_VALUE} bytes
	     */
	    public static byte[] getStringInUtf8(final String str) {
	        final int length = str.length();

	        // First pass: determine the exact size, so that no oversized scratch array
	        // (and no overflowing size computation) is needed.
	        int size = 0;
	        for (int i = 0; i < length;) {
	            final int c = str.codePointAt(i);
	            i += Character.charCount(c);
	            size = Math.addExact(size, encodedLength(c));
	        }

	        // Second pass: encode straight into the final array.
	        final byte[] result = new byte[size];
	        int pos = 0;
	        for (int i = 0; i < length;) {
	            final int c = str.codePointAt(i);
	            i += Character.charCount(c);
	            pos = encode(c, result, pos);
	        }
	        return result;
	    }

	    /** Tells whether {@code c} is a valid code point outside the surrogate range. */
	    private static boolean isEncodable(final int c) {
	        return Character.isValidCodePoint(c)
	            && (c < Character.MIN_SURROGATE || c > Character.MAX_SURROGATE);
	    }

	    /** Returns how many bytes this class emits for {@code c}, replacements included. */
	    private static int encodedLength(final int c) {
	        if (!isEncodable(c) || c < 0x80) {
	            return 1;
	        }
	        if (c < 0x800) {
	            return 2;
	        }
	        if (c < Character.MIN_SUPPLEMENTARY_CODE_POINT) {
	            return 3;
	        }
	        return OLD_UTF8 ? 2 : 4;
	    }

	    /** Encodes {@code c} into {@code dst} starting at {@code offset}; returns the next free index. */
	    private static int encode(final int c, final byte[] dst, final int offset) {
	        int pos = offset;
	        if (!isEncodable(c)) {
	            dst[pos++] = (byte) REPLACEMENT;
	        } else if (c < 0x80) {
	            dst[pos++] = (byte) c;
	        } else if (c < 0x800) {
	            dst[pos++] = (byte) (0xC0 | (c >>> 6));
	            dst[pos++] = (byte) (0x80 | (c & 0x3F));
	        } else if (c < Character.MIN_SUPPLEMENTARY_CODE_POINT) {
	            dst[pos++] = (byte) (0xE0 | (c >>> 12));
	            dst[pos++] = (byte) (0x80 | ((c >>> 6) & 0x3F));
	            dst[pos++] = (byte) (0x80 | (c & 0x3F));
	        } else if (OLD_UTF8) {
	            dst[pos++] = (byte) REPLACEMENT;
	            dst[pos++] = (byte) REPLACEMENT;
	        } else {
	            dst[pos++] = (byte) (0xF0 | (c >>> 18));
	            dst[pos++] = (byte) (0x80 | ((c >>> 12) & 0x3F));
	            dst[pos++] = (byte) (0x80 | ((c >>> 6) & 0x3F));
	            dst[pos++] = (byte) (0x80 | (c & 0x3F));
	        }
	        return pos;
	    }
	}