| /* |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| package org.apache.jena.util; |
| |
| import java.util.Objects; |
| |
| import org.apache.jena.atlas.lib.StrUtils; |
| import org.apache.jena.graph.Node; |
| //Has copies of import org.apache.jena.riot.system.RiotChars; |
| |
| /** |
| * Code to split an URI or IRI into prefix and local part. |
| * This code does <em>not</em> consider any prefix mapping. |
| * The split is based on finding the last {@code /} or {@code #} |
| * character. |
| * <p> |
| * Historically, 'prefix' is referred to as 'namespace' |
| * reflecting RDF/XML history. |
| * <p> |
| * For display, use {@link #localname} and {@link #namespace}. |
 * These are pragmatic and follow Turtle (e.g. localname allows a plain trailing dot)
 * but do not escape any characters.
| * |
 * These operations are for display (c.f. {@link Object#toString}) and do not guarantee a round-trip;
 * namespace+localname may not be the exact input IRI.
 * A URI is never split before the last {@code /} or {@code #}, if present.
| * See {@link #splitpoint} for more details. |
| * <p> |
| * The functions |
| * {@link #namespaceTTL} and {@link #localnameTTL} |
| * provide a strict Turtle split, if possible; |
| * the local name is escaped if necessary. |
| * {@link #namespaceTTL} can be used to build a set of prefix mappings. |
| * {@link #localnameTTLNoEsc} is the same as {@link #localnameTTL} |
| * without the escaping applied. |
| * <p> |
| * The functions {@link #namespaceXML} and {@link #localnameXML} |
| * apply the rules for XML qnames. |
| * <p> |
 * This code forms the machinery behind {@link Node#getLocalName} and
 * {@link Node#getNameSpace} for URI Nodes following the XML rules.
| */ |
| public class SplitIRI |
| { |
| /** Return the 'namespace' (prefix) for a URI string. |
| * Use with {@link #localname}. |
| * Return the input string if there is no splitpoint. |
| */ |
| public static String namespace(String string) { |
| Objects.requireNonNull(string, "string argument is null"); |
| int i = splitpoint(string); |
| if ( i < 0 ) |
| return string; |
| return string.substring(0, i); |
| } |
| |
| /** Calculate a localname - do not escape PN_LOCAL_ESC. |
| * This is not guaranteed to be legal Turtle. |
| * Use with {@link #namespace} |
| * Return an empty string if there is no split point. |
| */ |
| public static String localname(String string) { |
| Objects.requireNonNull(string, "string argument is null"); |
| int i = splitpoint(string); |
| if ( i < 0 ) |
| return ""; |
| return string.substring(i); |
| } |
| |
| /** |
| * Return the 'namespace' (prefix) for a URI string, |
| * legal for Turtle and goes with {@link #localnameTTL}. |
| * This operation does not guaratee that the argument has a legal localname. |
| */ |
| public static String namespaceTTL(String string) { |
| int i = splitpoint(string); |
| if ( i < 0 ) |
| return string; |
| String ns = string.substring(0, i); |
| return ns; |
| } |
| |
| /** |
| * Calculate a localname - enforce legal Turtle |
| * escapes for localnames (rule PN_LOCAL_ESC), |
| * A final '.' is escaped. |
| * Return "" for "no split". |
| * Use with {@link #namespaceTTL} |
| */ |
| public static String localnameTTL(String string) { |
| String x = localname(string); |
| if ( x.isEmpty()) |
| return x; |
| // This will escape the final DOT but leave internal dots alone. |
| return escape_PN_LOCAL_ESC(x); |
| } |
| |
| /** |
| * Calculate a localname - enforce legal Turtle |
| * without applying the escape PN_LOCAL_ESC. |
| * Return "" for 'no split' |
| * Check for final '.' - if present, return "". |
| */ |
| public static String localnameTTLNoEsc(String string) { |
| String x = localname(string); |
| if ( x.isEmpty()) |
| return x; |
| char lastChar = StrUtils.lastChar(x); |
| if ( lastChar == '.' ) |
| // No legal localname. |
| return ""; |
| return x; |
| } |
| |
| private static String escape_PN_LOCAL_ESC(String x) { |
| // Assume that escapes are rare so scan once to make sure there |
| // is work to do then scan again doing the work. |
| //'\' ('_' | '~' | '.' | '-' | '!' | '$' | '&' | "'" | '(' | ')' | '*' | '+' | ',' | ';' | '=' | '/' | '?' | '#' | '@' | '%') |
| |
| int N = x.length(); |
| boolean escchar = false; |
| for ( int i = 0; i < N; i++ ) { |
| char ch = x.charAt(i); |
| if ( needsEscape(ch, (i==N-1)) ) { |
| escchar = true; |
| break; |
| } |
| } |
| if ( ! escchar ) |
| return x; |
| StringBuilder sb = new StringBuilder(N+10); |
| for ( int i = 0; i < N; i++ ) { |
| char ch = x.charAt(i); |
| // DOT only needs escaping at the end |
| if ( needsEscape(ch, (i==N-1) ) ) |
| sb.append('\\'); |
| sb.append(ch); |
| } |
| return sb.toString(); |
| } |
| |
| /** |
| * Scan for chars needing escape. |
| * Return the index of the first. |
| * Return -1 for no escape needed. |
| * |
| */ |
| private static int needsEscape(String x) { |
| int N = x.length(); |
| for ( int i = 0; i < N; i++ ) { |
| char ch = x.charAt(i); |
| if ( needsEscape(ch, (i==N-1)) ) { |
| return i; |
| } |
| } |
| return -1; |
| } |
| |
| private static boolean needsEscape(char ch, boolean finalChar) { |
| if ( ch == '.' ) |
| // Only needed at the end. |
| return finalChar; |
| return isPN_LOCAL_ESC(ch); |
| } |
| |
| // @formatter:off |
| /* From the RDF 1.1 Turtle specification: |
| [136s] PrefixedName ::= PNAME_LN | PNAME_NS |
| Productions for terminals |
| |
| [163s] PN_CHARS_BASE ::= [A-Z] | [a-z] | [#x00C0-#x00D6] | [#x00D8-#x00F6] | [#x00F8-#x02FF] | [#x0370-#x037D] | [#x037F-#x1FFF] | [#x200C-#x200D] | [#x2070-#x218F] | [#x2C00-#x2FEF] | [#x3001-#xD7FF] | [#xF900-#xFDCF] | [#xFDF0-#xFFFD] | [#x10000-#xEFFFF] |
| [164s] PN_CHARS_U ::= PN_CHARS_BASE | '_' |
| [166s] PN_CHARS ::= PN_CHARS_U | '-' | [0-9] | #x00B7 | [#x0300-#x036F] | [#x203F-#x2040] |
| [167s] PN_PREFIX ::= PN_CHARS_BASE ((PN_CHARS | '.')* PN_CHARS)? |
| |
| [168s] PN_LOCAL ::= (PN_CHARS_U | ':' | [0-9] | PLX) ((PN_CHARS | '.' | ':' | PLX)* (PN_CHARS | ':' | PLX))? |
| [169s] PLX ::= PERCENT | PN_LOCAL_ESC |
| [170s] PERCENT ::= '%' HEX HEX |
| [171s] HEX ::= [0-9] | [A-F] | [a-f] |
| [172s] PN_LOCAL_ESC ::= '\' ('_' | '~' | '.' | '-' | '!' | '$' | '&' | "'" | '(' | ')' | '*' | '+' | ',' | ';' | '=' | '/' | '?' | '#' | '@' | '%') |
| */ |
| // @formatter:on |
| |
| /** Find the URI split point, return the index into the string that is the |
| * first character of a legal Turtle local name. |
| * <p> |
| * This is a pragmatic choice, not just finding the maximal point. |
| * For example, with escaping '/' can be included but that means |
| * {@code http://example/path/abc} could split to give {@code http://example/} |
| * and {@code path/abc} . |
| * <p> |
| * Split URN's after ':'. |
| * <p> |
| * This function does not enforce the Turtle rule that the final character can not be '.'. |
| * |
| * @param uri URI string |
| * @return The split point, or -1 for "not found". |
| */ |
| public static int splitpoint(String uri) { |
| boolean isURN = uri.startsWith("urn:"); |
| // Fast track. Still need to check validity of the prefix part. |
| int idx1 = uri.lastIndexOf('#'); |
| // Not so simple - \/ in local names |
| int idx2 = isURN ? uri.lastIndexOf(':') : uri.lastIndexOf('/'); |
| |
| // If absolute. |
| int idx3 = uri.indexOf(':'); |
| |
| // Note: local names can't end in "." in Turtle. |
| // This is handled by escape_PN_LOCAL_ESC which will escape it as "\." |
| |
| // Cases |
| // "abc#def" |
| // "/abc" |
| // "/" |
| // "/path/path#frag |
| // "/path/path#abc/def" :: / in fragment, split is at the "#". |
| |
| int limit; |
| if ( idx1 >= 0 && idx2 < 0 ) { |
| // No path "/" (or ":" if a URN) |
| limit = idx1; |
| } else if ( idx1 < 0 && idx2 >= 0 ) { |
| // No fragment |
| limit = idx2; |
| } else if ( idx1 >= 0 && idx2 >= 0 ) { |
| // Fragment and path. Use fragment. |
| // If "/" is in the fragment, it is not the split point. |
| limit = idx1; |
| } else { |
| limit = -1; |
| } |
| |
| // At least idx3, the case of no "/" and no "#" in an absolute IRI |
| if ( idx3 >= 0 ) |
| limit = Math.max(limit, idx3); |
| |
| // Limit is our guess. |
| // Now search end of URI to this guess checking the characters found. |
| |
| int splitPoint = -1; |
| // Work backwards, checking for |
| // ((PN_CHARS | '.' | ':' | PLX)* |
| for ( int i = uri.length()-1; i > limit; i-- ) { |
| char ch = uri.charAt(i); |
| if ( /*RiotChars.*/isPNChars_U_N(ch) || /*RiotChars.*/isPN_LOCAL_ESC(ch) || ch == ':' || ch == '-' || ch == '.' ) |
| continue; |
| splitPoint = i+1; |
| break; |
| } |
| // limit was at the end. No split point (we could escape the limit point) |
| if ( splitPoint == -1 ) |
| splitPoint = limit+1; |
| // No split point. |
| if ( splitPoint >= uri.length() ) |
| return -1; |
| |
| // Check the first character of the local name. |
| // All characters are legal localname name characters but may not satisfy the additional |
| // first character rule. Move forward to first legal first character. |
| int ch = uri.charAt(splitPoint); |
| while ( ch == '.' || ch == '-' ) { |
| splitPoint++; |
| if ( splitPoint >= uri.length() ) |
| return -1; |
| ch = uri.charAt(splitPoint); |
| } |
| |
| // Checking the final '.' is done when checking for escapes. |
| return splitPoint; |
| } |
| |
| private static boolean checkhex(String uri, int i) { |
| return /*RiotChars.*/isHexChar(uri.charAt(i)); |
| } |
| |
| // Assuming legal URIs, there is no work to be done |
| // for %XX. If illegal (e.g. %X), the best we can do |
| // is not mess them up. |
| /* |
| // % - just need to check that it is followed by two hex. |
| if ( ch == '%' ) { |
| if ( i+2 >= uri.length() ) { |
| // Too short |
| return -1; |
| } |
| if ( ! checkhex(uri, i+1) || ! checkhex(uri, i+2) ) |
| return -1; |
| } |
| |
| */ |
| /** |
| * Split point, according to XML qname rules. |
| * This is the longest NCName at the end of the uri. |
| * Return a split at the end of the string if there is no match |
| * (e.g. the URI string ends in '/' or '#'). |
| */ |
| public static int splitXML(String string) { return splitNamespaceXML(string); } |
| |
| /** |
| * Namespace, according to XML qname rules. |
| * Use with {@link #localnameXML}. |
| */ |
| public static String namespaceXML(String string) { |
| int i = splitXML(string); |
| return string.substring(0, i); |
| } |
| |
| /** Localname, according to XML qname rules. */ |
| public static String localnameXML(String string) { |
| int i = splitXML(string); |
| return string.substring(i); |
| } |
| |
| // Extracted from RiotChars |
| // When/if RIOT becomes accessible to this code, then refactor |
| |
| private static boolean /*RiotChars.*/isPN_LOCAL_ESC(char ch) { |
| switch (ch) { |
| case '\\': case '_': case '~': case '.': case '-': case '!': case '$': |
| case '&': case '\'': case '(': case ')': case '*': case '+': case ',': |
| case ';': case '=': case '/': case '?': case '#': case '@': case '%': |
| return true; |
| default: |
| return false; |
| } |
| } |
| |
| /** ASCII 0-9 */ |
| private static boolean isDigit(int ch) { |
| return range(ch, '0', '9'); |
| } |
| |
| private static boolean isPNCharsBase(int ch) { |
| // PN_CHARS_BASE ::= [A-Z] | [a-z] | [#x00C0-#x00D6] | [#x00D8-#x00F6] | [#x00F8-#x02FF] | |
| // [#x0370-#x037D] | [#x037F-#x1FFF] | [#x200C-#x200D] | [#x2070-#x218F] | |
| // [#x2C00-#x2FEF] | [#x3001-#xD7FF] | [#xF900-#xFDCF] | [#xFDF0-#xFFFD] | |
| // [#x10000-#xEFFFF] |
| return |
| r(ch, 'a', 'z') || r(ch, 'A', 'Z') || r(ch, 0x00C0, 0x00D6) || r(ch, 0x00D8, 0x00F6) || r(ch, 0x00F8, 0x02FF) || |
| r(ch, 0x0370, 0x037D) || r(ch, 0x037F, 0x1FFF) || r(ch, 0x200C, 0x200D) || r(ch, 0x2070, 0x218F) || |
| r(ch, 0x2C00, 0x2FEF) || r(ch, 0x3001, 0xD7FF) || |
| // Surrogate pairs |
| r(ch, 0xD800, 0xDFFF) || |
| r(ch, 0xF900, 0xFDCF) || r(ch, 0xFDF0, 0xFFFD) || |
| r(ch, 0x10000, 0xEFFFF); // Outside the basic plane. |
| } |
| |
| private static boolean isPNChars_U(int ch) { |
| //PN_CHARS_BASE | '_' |
| return isPNCharsBase(ch) || ( ch == '_' ); |
| } |
| |
| private static boolean isPNChars_U_N(int ch) { |
| // PN_CHARS_U | [0-9] |
| return isPNCharsBase(ch) || ( ch == '_' ) || isDigit(ch); |
| } |
| |
| private static boolean isPNChars(int ch) { |
| // PN_CHARS ::= PN_CHARS_U | '-' | [0-9] | #x00B7 | [#x0300-#x036F] | [#x203F-#x2040] |
| return isPNChars_U(ch) || isDigit(ch) || ( ch == '-' ) || ch == 0x00B7 || r(ch, 0x300, 0x036F) || r(ch, 0x203F, 0x2040); |
| } |
| |
| /** Hexadecimal character */ |
| private static boolean isHexChar(int ch) { |
| return range(ch, '0', '9') || range(ch, 'a', 'f') || range(ch, 'A', 'F'); |
| } |
| |
| private static int valHexChar(int ch) { |
| if ( range(ch, '0', '9') ) |
| return ch - '0'; |
| if ( range(ch, 'a', 'f') ) |
| return ch - 'a' + 10; |
| if ( range(ch, 'A', 'F') ) |
| return ch - 'A' + 10; |
| return -1; |
| } |
| |
| private static boolean r(int ch, int a, int b) { return ( ch >= a && ch <= b ); } |
| |
| private static boolean range(int ch, char a, char b) { |
| return (ch >= a && ch <= b); |
| } |
| |
| // -------- -------- |
| /** |
| * Given an absolute URI, determine the split point between the namespace |
| * part and the localname part. If there is no valid localname part then the |
| * length of the string is returned. The algorithm tries to find the longest |
| * NCName at the end of the uri, not immediately preceeded by the first |
| * colon in the string. |
| * <p> |
| * This operation follows XML QName rules which are more complicated than |
| * needed for Turtle and TriG. For example, QName can't start with a digit. |
| * |
| * @param uri |
| * @return the index of the first character of the localname |
| * @see SplitIRI |
| */ |
| private static int splitNamespaceXML(String uri) { |
| |
| // XML Namespaces 1.0: |
| // A qname name is NCName ':' NCName |
| // NCName ::= NCNameStartChar NCNameChar* |
| // NCNameChar ::= NameChar - ':' |
| // NCNameStartChar ::= Letter | '_' |
| // |
| // XML 1.0 |
| // NameStartChar ::= ":" | [A-Z] | "_" | [a-z] | [#xC0-#xD6] | |
| // [#xD8-#xF6] | [#xF8-#x2FF] | |
| // [#x370-#x37D] | [#x37F-#x1FFF] | |
| // [#x200C-#x200D] | [#x2070-#x218F] | |
| // [#x2C00-#x2FEF] | [#x3001-#xD7FF] | |
| // [#xF900-#xFDCF] | [#xFDF0-#xFFFD] | [#x10000-#xEFFFF] |
| // NameChar ::= NameStartChar | "-" | "." | [0-9] | #xB7 | |
| // [#x0300-#x036F] | [#x203F-#x2040] |
| // Name ::= NameStartChar (NameChar)* |
| |
| char ch; |
| int lg = uri.length(); |
| if (lg == 0) |
| return 0; |
| int i = lg-1; |
| for (; i >= 1; i--) { |
| ch = uri.charAt(i); |
| if ( !XMLChar.isNCName(ch) ) |
| break; |
| } |
| |
| int j = i + 1; |
| |
| if ( j >= lg ) |
| return lg; |
| |
| // Check we haven't split up a %-encoding. |
| if ( j >= 2 && uri.charAt(j-2) == '%' ) |
| j = j+1; |
| if ( j >= 1 && uri.charAt(j-1) == '%' ) { |
| j = j+2; |
| if ( j > lg ) |
| // JENA-1941: Protect against overshoot in the case of "%x" |
| // at end of a (bad) URI. |
| return lg; |
| } |
| |
| // Have found the leftmost NCNameChar from the |
| // end of the URI string. |
| // Now scan forward for an NCNameStartChar |
| // The split must start with NCNameStart. |
| for (; j < lg; j++) { |
| ch = uri.charAt(j); |
| // if (XMLChar.isNCNameStart(ch)) |
| // break; |
| if (XMLChar.isNCNameStart(ch)) |
| { |
| // "mailto:" is special. |
| // split "mailto:me" as "mailto:m" and "e" ! |
| // Keep part after mailto: with at least one character. |
| if ( j == 7 && uri.startsWith("mailto:")) |
| // Don't split at "mailto:" |
| continue; |
| else |
| break; |
| } |
| } |
| return j; |
| } |
| } |
| |