| /* |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| // TODO e-mail uri list about . at end of domain name |
| // TODO e-mail uri list about IPv4 vs host: |
| // If host matches the rule for IPv4address, then it should be considered an IPv4 address literal and not a reg-name. |
| |
| package buildlexer; |
| |
| import java.io.FileReader; |
| import java.io.FileWriter; |
| import java.io.IOException; |
| import java.io.Reader; |
| import java.io.Writer; |
| import java.lang.reflect.Field; |
| |
| import org.apache.jena.iri.ViolationCodes ; |
| |
| |
| public class PatternCompilerBuilder implements ViolationCodes { |
| |
| private static final class ExpandAndOutput extends Expansion { |
| int exc[]; |
| int sub[]; |
| boolean incExc; |
| /** |
| * output those for which no errors in exclude, |
| * and all errors in sub[] occur |
| * or the inverse: at least one error in exclude |
| * occurs, and at least one error in sub doesn't |
| * @param exclude |
| */ |
| ExpandAndOutput(int exclude[], int subset[], boolean incExc ) { |
| exc = exclude; |
| sub = subset; |
| this.incExc = incExc; |
| } |
| int ruleCount = 1; |
| |
| @Override |
| public void doIt(String regex, int eCount, int[] eCodes, int cCount, |
| String c[]) { |
| |
| if (incExc == |
| ( (!overlap(exc,eCount, eCodes)) && |
| subset(sub,eCount, eCodes) ) ) |
| try { |
| out.write("/*\n"); |
| for (int j = 0; j < cCount; j++) { |
| out.write(c[j]); |
| out.write('\n'); |
| } |
| out.write("*/\n"); |
| |
| out.write(regex); |
| out.write(" {\n"); |
| count++; |
| out.write("rule("+count+"); "); |
| for (int i = 0; i < eCount; i++) |
| out.write("error(" + errorCodeName(eCodes[i]) + ");"); |
| out.write("}\n"); |
| } catch (IOException e) { |
| throw new RuntimeException(e); |
| } |
| } |
| private boolean subset(int ee[], int el, int[]eCodes) { |
| for (int i=0;i<ee.length;i++) |
| if (!in(ee[i],el,eCodes)) |
| return false; |
| return true; |
| } |
| private boolean overlap(int ee[], int el, int[]eCodes) { |
| for (int i=0;i<ee.length;i++) |
| if (in(ee[i],el,eCodes)) |
| return true; |
| return false; |
| } |
| private boolean in(int e0, int eCount, int[] eCodes) { |
| for (int i=0; i<eCount; i++) |
| if (eCodes[i]==e0) |
| return true; |
| return false; |
| } |
| } |
| |
| static long start; |
| |
| static public void main(String args[]) throws IOException { |
| start = System.currentTimeMillis(); |
| // out = new FileWriter("src/main/java/org/apache/jena/iri/impl/iri2.jflex"); |
| // copy("src/main/java/org/apache/jena/iri/impl/iri.jflex"); |
| // outRules("scheme"); |
| // outRules("userinfo"); |
| outRules("host"); |
| // outRules("port"); |
| // outRules("path"); |
| // outRules("query"); |
| // outRules("fragment"); |
| // out.close(); |
| // |
| // JFlex.Main.main(new |
| // String[]{"src/main/java/com/hp/hpl/jena/iri/impl/iri2.jflex"}); |
| System.out.println(System.currentTimeMillis() - start); |
| } |
| |
| private static void copy(String fname) throws IOException { |
| Reader in = new FileReader(fname); |
| char buf[] = new char[2048]; |
| while (true) { |
| int sz = in.read(buf); |
| if (sz == -1) |
| break; |
| out.write(buf, 0, sz); |
| } |
| in.close(); |
| } |
| |
| static String eCodeNames[]; |
| |
| static String errorCodeName(int j) { |
| if (eCodeNames == null) { |
| eCodeNames = constantsFromClass(ViolationCodes.class, 200); |
| } |
| return eCodeNames[j]; |
| } |
| |
| static String[] constantsFromClass(Class<?> cl, int cnt) { |
| String[] names; |
| names = new String[cnt]; |
| Field f[] = cl.getDeclaredFields(); |
| for (int i = 0; i < f.length; i++) |
| try { |
| names[f[i].getInt(null)] = f[i].getName(); |
| } catch (IllegalArgumentException e) { |
| e.printStackTrace(); |
| } catch (IllegalAccessException e) { |
| e.printStackTrace(); |
| } |
| return names; |
| } |
| |
| static int count; |
| |
| static Writer out; |
| |
| static private void outRules(String name) throws IOException { |
| count = 0; |
| // if (true) throw new RuntimeException(); |
| out = new FileWriter("src/main/jflex/org/apache/jena/iri/impl/"+name+".jflex"); |
| copy("src/main/jflex/org/apache/jena/iri/impl/iri.jflex"); |
| out.write("%class Lexer"); |
| out.write(name.substring(0, 1).toUpperCase()); |
| out.write(name.substring(1)); |
| out.write("\n%%\n"); |
| int exc1[]= |
| new int[]{DOUBLE_DASH_IN_REG_NAME,NOT_DNS_NAME}; |
| int empty[]= new int[0]; |
| int sub1[] = new int[]{ACE_PREFIX}; |
| // int sub2[] = new int[]{DOUBLE_DASH_IN_REG_NAME,ACE_PREFIX}; |
| int sub4[] = new int[]{DOUBLE_DASH_IN_REG_NAME}; |
| int sub3[] = new int[]{NOT_DNS_NAME}; |
| |
| new ExpandAndOutput(exc1,empty,true).expand("@{" + name + "}"); |
| // new ExpandAndOutput(empty,sub2,true).expand("@{" + name + "}"); |
| new ExpandAndOutput(sub1,sub4,true).expand("@{" + name + "}"); |
| new ExpandAndOutput(empty,sub3,true).expand("@{" + name + "}"); |
| |
| out.write("\n"); |
| System.out.println(name + ": " + count + " expansions"); |
| out.close(); |
| |
| MainGenerateLexers.runJFlex(new String[] { "-d", "src/main/java/org/apache/jena/iri/impl", "src/main/jflex/org/apache/jena/iri/impl/"+name+".jflex" }); |
| System.out.println(System.currentTimeMillis() - start); |
| |
| } |
| /* |
| * |
| * Unicode LTR stuff: |
| * |
| * 200E LEFT-TO-RIGHT MARK 200F RIGHT-TO-LEFT MARK 202A LEFT-TO-RIGHT |
| * EMBEDDING 202B RIGHT-TO-LEFT EMBEDDING 202C POP DIRECTIONAL FORMATTING |
| * 202D LEFT-TO-RIGHT OVERRIDE 202E RIGHT-TO-LEFT OVERRIDE |
| * |
| * XSD preserve No normalization is done, the value is not changed (this is |
| * the behavior required by [XML 1.0 (Second Edition)] for element content) |
| * replace All occurrences of #x9 (tab), #xA (line feed) and #xD (carriage |
| * return) are replaced with #x20 (space) collapse After the processing |
| * implied by replace, contiguous sequences of #x20's are collapsed to a |
| * single #x20, and leading and trailing #x20's are removed. |
| * |
| * |
| * <xs:simpleType name="anyURI" id="anyURI"> <xs:annotation> <xs:appinfo> |
| * <hfp:hasFacet name="length"/> <hfp:hasFacet name="minLength"/> |
| * <hfp:hasFacet name="maxLength"/> <hfp:hasFacet name="pattern"/> |
| * <hfp:hasFacet name="enumeration"/> <hfp:hasFacet name="whiteSpace"/> |
| * <hfp:hasProperty name="ordered" value="false"/> <hfp:hasProperty |
| * name="bounded" value="false"/> <hfp:hasProperty name="cardinality" |
| * value="countably infinite"/> <hfp:hasProperty name="numeric" |
| * value="false"/> </xs:appinfo> <xs:documentation |
| * source="http://www.w3.org/TR/xmlschema-2/#anyURI"/> </xs:annotation> |
| * <xs:restriction base="xs:anySimpleType"> <xs:whiteSpace fixed="true" |
| * value="collapse" id="anyURI.whiteSpace"/> </xs:restriction> |
| * </xs:simpleType> |
| * |
| * XML 1.0 |
| * |
| * [2] Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | |
| * [#x10000-#x10FFFF] /* any Unicode character, excluding the surrogate |
| * blocks, FFFE, and FFFF. |
| * |
| * |
| * Note: |
| * |
| * Document authors are encouraged to avoid "compatibility characters", as |
| * defined in section 6.8 of [Unicode] (see also D21 in section 3.6 of |
| * [Unicode3]). The characters defined in the following ranges are also |
| * discouraged. They are either control characters or permanently undefined |
| * Unicode characters: |
| * |
| * [#x7F-#x84], [#x86-#x9F], [#xFDD0-#xFDDF], [#1FFFE-#x1FFFF], |
| * [#2FFFE-#x2FFFF], [#3FFFE-#x3FFFF], [#4FFFE-#x4FFFF], [#5FFFE-#x5FFFF], |
| * [#6FFFE-#x6FFFF], [#7FFFE-#x7FFFF], [#8FFFE-#x8FFFF], [#9FFFE-#x9FFFF], |
| * [#AFFFE-#xAFFFF], [#BFFFE-#xBFFFF], [#CFFFE-#xCFFFF], [#DFFFE-#xDFFFF], |
| * [#EFFFE-#xEFFFF], [#FFFFE-#xFFFFF], [#10FFFE-#x10FFFF]. |
| * |
| * |
| * XML 1.1 [Definition: A parsed entity contains text, a sequence of |
| * characters, which may represent markup or character data.] [Definition: A |
| * character is an atomic unit of text as specified by ISO/IEC 10646 |
| * [ISO/IEC 10646]. Legal characters are tab, carriage return, line feed, |
| * and the legal characters of Unicode and ISO/IEC 10646. The versions of |
| * these standards cited in A.1 Normative References were current at the |
| * time this document was prepared. New characters may be added to these |
| * standards by amendments or new editions. Consequently, XML processors |
| * MUST accept any character in the range specified for Char.] Character |
| * Range [2] Char ::= [#x1-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF] /* |
| * any Unicode character, excluding the surrogate blocks, FFFE, and FFFF. * / |
| * [2a] RestrictedChar ::= [#x1-#x8] | [#xB-#xC] | [#xE-#x1F] | [#x7F-#x84] | |
| * [#x86-#x9F] |
| * |
| * The mechanism for encoding character code points into bit patterns MAY |
| * vary from entity to entity. All XML processors MUST accept the UTF-8 and |
| * UTF-16 encodings of Unicode [Unicode]; the mechanisms for signaling which |
| * of the two is in use, or for bringing other encodings into play, are |
| * discussed later, in 4.3.3 Character Encoding in Entities. |
| * |
| * Note: |
| * |
| * Document authors are encouraged to avoid "compatibility characters", as |
| * defined in Unicode [Unicode]. The characters defined in the following |
| * ranges are also discouraged. They are either control characters or |
| * permanently undefined Unicode characters: |
| * |
| * [#x7F-#x84], [#x86-#x9F], [#xFDD0-#xFDDF], [#1FFFE-#x1FFFF], |
| * [#2FFFE-#x2FFFF], [#3FFFE-#x3FFFF], [#4FFFE-#x4FFFF], [#5FFFE-#x5FFFF], |
| * [#6FFFE-#x6FFFF], [#7FFFE-#x7FFFF], [#8FFFE-#x8FFFF], [#9FFFE-#x9FFFF], |
| * [#AFFFE-#xAFFFF], [#BFFFE-#xBFFFF], [#CFFFE-#xCFFFF], [#DFFFE-#xDFFFF], |
| * [#EFFFE-#xEFFFF], [#FFFFE-#xFFFFF], [#10FFFE-#x10FFFF]. |
| */ |
| |
| } |