jena-base/src/main/java/org/apache/jena/atlas/lib/EscapeStr.java - jena - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
  * regarding copyright ownership.  The ASF licenses this file
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 package org.apache.jena.atlas.lib;

 import org.apache.jena.atlas.AtlasException ;
 import org.apache.jena.atlas.io.AWriter ;
 import org.apache.jena.atlas.io.OutputUtils ;
 import org.apache.jena.atlas.io.StringWriterI ;

 /**
  * String escape utilities: writing strings with backslash escapes and
  * removing such escapes again.
  */
 public class EscapeStr
 {
     /**
      * Escape characters in a string according to Turtle rules.
      * The string is handled as a single-line string delimited by
      * {@code Chars.CH_QUOTE2}; characters outside ASCII are written unchanged.
      *
      * @param s the string to escape
      * @return the escaped form of {@code s}
      */
     public static String stringEsc(String s) {
         AWriter w = new StringWriterI() ;
         stringEsc(w, s, Chars.CH_QUOTE2, true, CharSpace.UTF8) ;
         return w.toString() ;
     }

     /**
      * Write a string - basic escaping, no quote escaping.
      * Backslash, newline, tab, carriage return and form feed are written as
      * two-character backslash escapes.
      *
      * @param out       destination writer
      * @param s         the string to write
      * @param asciiOnly if true, every other character is written via
      *                  {@link #writeCharAsASCII}; if false it is written unchanged
      */
     public static void stringEsc(AWriter out, String s, boolean asciiOnly) {
         int len = s.length() ;
         for (int i = 0; i < len; i++) {
             char c = s.charAt(i);
             // \\ Escape always possible.
             if (c == '\\') {
                 out.print('\\') ;
                 out.print(c) ;
                 continue ;
             }
             switch(c) {
                 case '\n':  out.print("\\n"); continue;
                 case '\t':  out.print("\\t"); continue;
                 case '\r':  out.print("\\r"); continue;
                 case '\f':  out.print("\\f"); continue;
                 default:    // Drop through
             }
             if ( !asciiOnly )
                 out.print(c);
             else
                 writeCharAsASCII(out, c) ;
         }
     }

     /**
      * Write a string with escapes for the given quote character, leaving
      * characters outside ASCII unchanged ({@code CharSpace.UTF8}).
      *
      * @param out              destination writer
      * @param s                the string to write
      * @param quoteChar        the quote character delimiting the string
      * @param singleLineString true for a single-line string, false for a multi-line string
      */
     public static void stringEsc(AWriter out, String s, char quoteChar, boolean singleLineString) {
         stringEsc(out, s, quoteChar, singleLineString, CharSpace.UTF8);
     }

     /**
      * Write a string with escapes for the given quote character.
      * <p>
      * Backslash is always escaped. For a single-line string every occurrence
      * of the quote character, and newline, tab, carriage return and form feed,
      * are written as backslash escapes. For a multi-line string only quotes
      * are escaped, and only where needed: every third quote in a run, and a
      * quote that is the last character of the string.
      * The delimiters themselves are not written.
      *
      * @param out              destination writer
      * @param s                the string to write
      * @param quoteChar        the quote character delimiting the string
      * @param singleLineString true for a single-line string, false for a multi-line string
      * @param charSpace        {@code CharSpace.ASCII} to write other characters via
      *                         {@link #writeCharAsASCII}; otherwise they are written unchanged
      */
     public static void stringEsc(AWriter out, String s, char quoteChar, boolean singleLineString, CharSpace charSpace) {
         boolean ascii = ( CharSpace.ASCII == charSpace ) ;
         int len = s.length() ;
         int quotesInARow = 0 ;
         for (int i = 0; i < len; i++) {
             char c = s.charAt(i);
             // \\ Escape always possible.
             if (c == '\\') {
                 out.print('\\') ;
                 out.print(c) ;
                 continue ;
             }
             if ( ! singleLineString ) {
                 // Multiline string.
                 if ( c == quoteChar ) {
                     quotesInARow++ ;
                     // Three quotes in a row would read as the closing delimiter,
                     // and so would a final quote followed by the delimiter the
                     // caller writes next: escape the quote in both cases.
                     if ( quotesInARow == 3 || i == len - 1 ) {
                         out.print("\\");
                         out.print(quoteChar);
                         quotesInARow = 0;
                         continue;
                     }
                 } else {
                     quotesInARow = 0 ;
                 }
             } else {
                 if ( c == quoteChar ) {
                     out.print("\\"); out.print(c) ; continue ;
                 }
                 switch(c) {
                     case '\n':  out.print("\\n"); continue;
                     case '\t':  out.print("\\t"); continue;
                     case '\r':  out.print("\\r"); continue;
                     case '\f':  out.print("\\f"); continue;
                     default:    // Drop through
                 }
             }

             if ( !ascii )
                 out.print(c);
             else
                 writeCharAsASCII(out, c) ;
         }
     }

     /**
      * Write a string with Unicode to ASCII conversion using \-u escapes.
      * Each char of the string is passed to {@link #writeCharAsASCII}.
      *
      * @param out destination writer
      * @param s   the string to write
      */
     public static void writeASCII(AWriter out, String s) {
         int len = s.length() ;
         for (int i = 0; i < len; i++) {
             char c = s.charAt(i);
             writeCharAsASCII(out, c);
         }
     }

     /**
      * Write a character with Unicode to ASCII conversion using \-u escapes.
      * Chars in the range 32 (inclusive) to 127 (exclusive) are written as-is.
      *
      * @param out destination writer
      * @param c   the char to write
      */
     public static void writeCharAsASCII(AWriter out, char c) {
         if ( c >= 32 && c < 127 )
             out.print(c);
         else {
             // Outside the charset range.
             // Code points beyond 16 bits are not written as \U escapes:
             // Java holds them as surrogate pairs, so each surrogate char
             // arrives here on its own and gets its own \-u escape.
             out.print("\\u") ;
             // Presumably writes c as 4 hex digits - see OutputUtils.printHex.
             OutputUtils.printHex(out, c, 4) ;
         }
     }

     // Utilities to remove escapes

     /** Replace \ escapes (\\u, \t, \n etc) in a string */
     public static String unescapeStr(String s)
     { return unescapeStr(s, '\\') ; }

     /** Replace escapes (\\u, \t, \n etc) in a string, using {@code escapeChar} as the escape character */
     public static String unescapeStr(String s, char escapeChar)
     { return unescape(s, escapeChar, false) ; }


     /** Unicode escapes  \-u and \-U only */
     public static String unescapeUnicode(String s) {
         return unescape(s, '\\', true) ;
     }

     /**
      * Main worker function for unescaping strings.
      * <p>
      * The escape character followed by {@code u} and 4 hex digits, or by
      * {@code U} and 8 hex digits, is replaced by that code point. Any other
      * escape is either copied through unchanged ({@code pointCodeOnly} true)
      * or must be one of n, t, r, b, f, single quote, double quote or
      * backslash ({@code pointCodeOnly} false).
      *
      * @param s             the string to process
      * @param escape        the character that introduces an escape
      * @param pointCodeOnly if true, only the code point escapes are replaced
      * @return the string with escapes replaced; {@code s} itself if it
      *         does not contain {@code escape}
      * @throws AtlasException if an escape is incomplete, is unknown (when
      *         {@code pointCodeOnly} is false) or names an illegal code point
      */
     public static String unescape(String s, char escape, boolean pointCodeOnly) {
         int i = s.indexOf(escape) ;

         if ( i == -1 )
             return s ;

         // Dump the initial part straight into the string buffer
         StringBuilder sb = new StringBuilder(s.length()) ;
         sb.append(s, 0, i) ;

         for ( ; i < s.length() ; i++ )
         {
             char ch = s.charAt(i) ;

             if ( ch != escape )
             {
                 sb.append(ch) ;
                 continue ;
             }

             // Escape
             if ( i >= s.length()-1 )
                 throw new AtlasException("Illegal escape at end of string") ;
             char ch2 = s.charAt(i+1) ;
             i = i + 1 ;

             // \\u and \\U
             if ( ch2 == 'u' )
             {
                 // i is at the 'u'; the digits are at i+1 .. i+4
                 if ( i+4 >= s.length() )
                     throw new AtlasException("\\u escape too short") ;
                 int x4 = Hex.hexStringToInt(s, i+1, 4) ;
                 sb.append((char)x4) ;
                 // Jump 1 2 3 4 -- already skipped \ and u
                 i = i+4 ;
                 continue ;
             }
             if ( ch2 == 'U' )
             {
                 // i is at the 'U'; the digits are at i+1 .. i+8
                 if ( i+8 >= s.length() )
                     throw new AtlasException("\\U escape too short") ;
                 int ch8 = Hex.hexStringToInt(s, i+1, 8) ;
                 // See also TokenerText.insertCodepoint and TokenerText.readUnicodeEscape
                 // Reject anything outside 0 .. Character.MAX_CODE_POINT. 8 hex
                 // digits can exceed the code point range, and may even come back
                 // negative if the hex conversion overflows an int (TODO confirm
                 // against Hex.hexStringToInt); such a value must not be silently
                 // truncated to a char.
                 if ( !Character.isValidCodePoint(ch8) )
                     throw new AtlasException(String.format("Illegal codepoint: 0x%04X", ch8));
                 // Convert to UTF-16 (one char, or a surrogate pair). Note that the
                 // rest of any system this is used in must also respect codepoints
                 // and surrogate pairs.
                 sb.appendCodePoint(ch8) ;
                 // Jump 1 2 3 4 5 6 7 8 -- already skipped \ and U
                 i = i+8 ;
                 continue ;
             }

             // Are we doing just point code escapes?
             // If so, escape-anything else is legal and is copied through as
             // the escape character followed by "X".

             if ( pointCodeOnly )
             {
                 // Copy the caller's escape character, not a hard-coded backslash,
                 // so the input is preserved when escape is not '\\'.
                 sb.append(escape) ;
                 sb.append(ch2) ;
                 continue ;
             }

             // Not just codepoints.  Must be a legal escape.
             char ch3 = 0 ;
             switch (ch2)
             {
                 case 'n': ch3 = '\n' ;  break ;
                 case 't': ch3 = '\t' ;  break ;
                 case 'r': ch3 = '\r' ;  break ;
                 case 'b': ch3 = '\b' ;  break ;
                 case 'f': ch3 = '\f' ;  break ;
                 case '\'': ch3 = '\'' ; break ;
                 case '\"': ch3 = '\"' ; break ;
                 case '\\': ch3 = '\\' ; break ;
                 default:
                     throw new AtlasException("Unknown escape: \\"+ch2) ;
             }
             sb.append(ch3) ;
         }
         return sb.toString() ;
     }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one
	* or more contributor license agreements. See the NOTICE file
	* distributed with this work for additional information
	* regarding copyright ownership. The ASF licenses this file
	* to you under the Apache License, Version 2.0 (the
	* "License"); you may not use this file except in compliance
	* with the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	package org.apache.jena.atlas.lib;

	import org.apache.jena.atlas.AtlasException ;
	import org.apache.jena.atlas.io.AWriter ;
	import org.apache.jena.atlas.io.OutputUtils ;
	import org.apache.jena.atlas.io.StringWriterI ;

	/**
	 * String escape utilities: writing strings with backslash escapes and
	 * removing such escapes again.
	 */
	public class EscapeStr
	{
	    /**
	     * Escape characters in a string according to Turtle rules.
	     * The string is handled as a single-line string delimited by
	     * {@code Chars.CH_QUOTE2}; characters outside ASCII are written unchanged.
	     *
	     * @param s the string to escape
	     * @return the escaped form of {@code s}
	     */
	    public static String stringEsc(String s) {
	        AWriter w = new StringWriterI() ;
	        stringEsc(w, s, Chars.CH_QUOTE2, true, CharSpace.UTF8) ;
	        return w.toString() ;
	    }

	    /**
	     * Write a string - basic escaping, no quote escaping.
	     * Backslash, newline, tab, carriage return and form feed are written as
	     * two-character backslash escapes.
	     *
	     * @param out       destination writer
	     * @param s         the string to write
	     * @param asciiOnly if true, every other character is written via
	     *                  {@link #writeCharAsASCII}; if false it is written unchanged
	     */
	    public static void stringEsc(AWriter out, String s, boolean asciiOnly) {
	        int len = s.length() ;
	        for (int i = 0; i < len; i++) {
	            char c = s.charAt(i);
	            // \\ Escape always possible.
	            if (c == '\\') {
	                out.print('\\') ;
	                out.print(c) ;
	                continue ;
	            }
	            switch(c) {
	                case '\n': out.print("\\n"); continue;
	                case '\t': out.print("\\t"); continue;
	                case '\r': out.print("\\r"); continue;
	                case '\f': out.print("\\f"); continue;
	                default: // Drop through
	            }
	            if ( !asciiOnly )
	                out.print(c);
	            else
	                writeCharAsASCII(out, c) ;
	        }
	    }

	    /**
	     * Write a string with escapes for the given quote character, leaving
	     * characters outside ASCII unchanged ({@code CharSpace.UTF8}).
	     *
	     * @param out              destination writer
	     * @param s                the string to write
	     * @param quoteChar        the quote character delimiting the string
	     * @param singleLineString true for a single-line string, false for a multi-line string
	     */
	    public static void stringEsc(AWriter out, String s, char quoteChar, boolean singleLineString) {
	        stringEsc(out, s, quoteChar, singleLineString, CharSpace.UTF8);
	    }

	    /**
	     * Write a string with escapes for the given quote character.
	     * <p>
	     * Backslash is always escaped. For a single-line string every occurrence
	     * of the quote character, and newline, tab, carriage return and form feed,
	     * are written as backslash escapes. For a multi-line string only quotes
	     * are escaped, and only where needed: every third quote in a run, and a
	     * quote that is the last character of the string.
	     * The delimiters themselves are not written.
	     *
	     * @param out              destination writer
	     * @param s                the string to write
	     * @param quoteChar        the quote character delimiting the string
	     * @param singleLineString true for a single-line string, false for a multi-line string
	     * @param charSpace        {@code CharSpace.ASCII} to write other characters via
	     *                         {@link #writeCharAsASCII}; otherwise they are written unchanged
	     */
	    public static void stringEsc(AWriter out, String s, char quoteChar, boolean singleLineString, CharSpace charSpace) {
	        boolean ascii = ( CharSpace.ASCII == charSpace ) ;
	        int len = s.length() ;
	        int quotesInARow = 0 ;
	        for (int i = 0; i < len; i++) {
	            char c = s.charAt(i);
	            // \\ Escape always possible.
	            if (c == '\\') {
	                out.print('\\') ;
	                out.print(c) ;
	                continue ;
	            }
	            if ( ! singleLineString ) {
	                // Multiline string.
	                if ( c == quoteChar ) {
	                    quotesInARow++ ;
	                    // Three quotes in a row would read as the closing delimiter,
	                    // and so would a final quote followed by the delimiter the
	                    // caller writes next: escape the quote in both cases.
	                    if ( quotesInARow == 3 || i == len - 1 ) {
	                        out.print("\\");
	                        out.print(quoteChar);
	                        quotesInARow = 0;
	                        continue;
	                    }
	                } else {
	                    quotesInARow = 0 ;
	                }
	            } else {
	                if ( c == quoteChar ) {
	                    out.print("\\"); out.print(c) ; continue ;
	                }
	                switch(c) {
	                    case '\n': out.print("\\n"); continue;
	                    case '\t': out.print("\\t"); continue;
	                    case '\r': out.print("\\r"); continue;
	                    case '\f': out.print("\\f"); continue;
	                    default: // Drop through
	                }
	            }

	            if ( !ascii )
	                out.print(c);
	            else
	                writeCharAsASCII(out, c) ;
	        }
	    }

	    /**
	     * Write a string with Unicode to ASCII conversion using \-u escapes.
	     * Each char of the string is passed to {@link #writeCharAsASCII}.
	     *
	     * @param out destination writer
	     * @param s   the string to write
	     */
	    public static void writeASCII(AWriter out, String s) {
	        int len = s.length() ;
	        for (int i = 0; i < len; i++) {
	            char c = s.charAt(i);
	            writeCharAsASCII(out, c);
	        }
	    }

	    /**
	     * Write a character with Unicode to ASCII conversion using \-u escapes.
	     * Chars in the range 32 (inclusive) to 127 (exclusive) are written as-is.
	     *
	     * @param out destination writer
	     * @param c   the char to write
	     */
	    public static void writeCharAsASCII(AWriter out, char c) {
	        if ( c >= 32 && c < 127 )
	            out.print(c);
	        else {
	            // Outside the charset range.
	            // Code points beyond 16 bits are not written as \U escapes:
	            // Java holds them as surrogate pairs, so each surrogate char
	            // arrives here on its own and gets its own \-u escape.
	            out.print("\\u") ;
	            // Presumably writes c as 4 hex digits - see OutputUtils.printHex.
	            OutputUtils.printHex(out, c, 4) ;
	        }
	    }

	    // Utilities to remove escapes

	    /** Replace \ escapes (\\u, \t, \n etc) in a string */
	    public static String unescapeStr(String s)
	    { return unescapeStr(s, '\\') ; }

	    /** Replace escapes (\\u, \t, \n etc) in a string, using {@code escapeChar} as the escape character */
	    public static String unescapeStr(String s, char escapeChar)
	    { return unescape(s, escapeChar, false) ; }


	    /** Unicode escapes \-u and \-U only */
	    public static String unescapeUnicode(String s) {
	        return unescape(s, '\\', true) ;
	    }

	    /**
	     * Main worker function for unescaping strings.
	     * <p>
	     * The escape character followed by {@code u} and 4 hex digits, or by
	     * {@code U} and 8 hex digits, is replaced by that code point. Any other
	     * escape is either copied through unchanged ({@code pointCodeOnly} true)
	     * or must be one of n, t, r, b, f, single quote, double quote or
	     * backslash ({@code pointCodeOnly} false).
	     *
	     * @param s             the string to process
	     * @param escape        the character that introduces an escape
	     * @param pointCodeOnly if true, only the code point escapes are replaced
	     * @return the string with escapes replaced; {@code s} itself if it
	     *         does not contain {@code escape}
	     * @throws AtlasException if an escape is incomplete, is unknown (when
	     *         {@code pointCodeOnly} is false) or names an illegal code point
	     */
	    public static String unescape(String s, char escape, boolean pointCodeOnly) {
	        int i = s.indexOf(escape) ;

	        if ( i == -1 )
	            return s ;

	        // Dump the initial part straight into the string buffer
	        StringBuilder sb = new StringBuilder(s.length()) ;
	        sb.append(s, 0, i) ;

	        for ( ; i < s.length() ; i++ )
	        {
	            char ch = s.charAt(i) ;

	            if ( ch != escape )
	            {
	                sb.append(ch) ;
	                continue ;
	            }

	            // Escape
	            if ( i >= s.length()-1 )
	                throw new AtlasException("Illegal escape at end of string") ;
	            char ch2 = s.charAt(i+1) ;
	            i = i + 1 ;

	            // \\u and \\U
	            if ( ch2 == 'u' )
	            {
	                // i is at the 'u'; the digits are at i+1 .. i+4
	                if ( i+4 >= s.length() )
	                    throw new AtlasException("\\u escape too short") ;
	                int x4 = Hex.hexStringToInt(s, i+1, 4) ;
	                sb.append((char)x4) ;
	                // Jump 1 2 3 4 -- already skipped \ and u
	                i = i+4 ;
	                continue ;
	            }
	            if ( ch2 == 'U' )
	            {
	                // i is at the 'U'; the digits are at i+1 .. i+8
	                if ( i+8 >= s.length() )
	                    throw new AtlasException("\\U escape too short") ;
	                int ch8 = Hex.hexStringToInt(s, i+1, 8) ;
	                // See also TokenerText.insertCodepoint and TokenerText.readUnicodeEscape
	                // Reject anything outside 0 .. Character.MAX_CODE_POINT. 8 hex
	                // digits can exceed the code point range, and may even come back
	                // negative if the hex conversion overflows an int (TODO confirm
	                // against Hex.hexStringToInt); such a value must not be silently
	                // truncated to a char.
	                if ( !Character.isValidCodePoint(ch8) )
	                    throw new AtlasException(String.format("Illegal codepoint: 0x%04X", ch8));
	                // Convert to UTF-16 (one char, or a surrogate pair). Note that the
	                // rest of any system this is used in must also respect codepoints
	                // and surrogate pairs.
	                sb.appendCodePoint(ch8) ;
	                // Jump 1 2 3 4 5 6 7 8 -- already skipped \ and U
	                i = i+8 ;
	                continue ;
	            }

	            // Are we doing just point code escapes?
	            // If so, escape-anything else is legal and is copied through as
	            // the escape character followed by "X".

	            if ( pointCodeOnly )
	            {
	                // Copy the caller's escape character, not a hard-coded backslash,
	                // so the input is preserved when escape is not '\\'.
	                sb.append(escape) ;
	                sb.append(ch2) ;
	                continue ;
	            }

	            // Not just codepoints. Must be a legal escape.
	            char ch3 = 0 ;
	            switch (ch2)
	            {
	                case 'n': ch3 = '\n' ; break ;
	                case 't': ch3 = '\t' ; break ;
	                case 'r': ch3 = '\r' ; break ;
	                case 'b': ch3 = '\b' ; break ;
	                case 'f': ch3 = '\f' ; break ;
	                case '\'': ch3 = '\'' ; break ;
	                case '\"': ch3 = '\"' ; break ;
	                case '\\': ch3 = '\\' ; break ;
	                default:
	                    throw new AtlasException("Unknown escape: \\"+ch2) ;
	            }
	            sb.append(ch3) ;
	        }
	        return sb.toString() ;
	    }
	}