jena-iri/src-dev/buildlexer/PatternCompilerBuilder.java - jena - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
  * regarding copyright ownership.  The ASF licenses this file
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 // TODO e-mail uri list about . at end of domain name
 // TODO e-mail uri list about IPv4 vs host:
 // If host matches the rule for IPv4address, then it should be considered an IPv4 address literal and not a reg-name.

 package buildlexer;

 import java.io.FileReader;
 import java.io.FileWriter;
 import java.io.IOException;
 import java.io.Reader;
 import java.io.Writer;
 import java.lang.reflect.Field;

 import org.apache.jena.iri.ViolationCodes ;


 public class PatternCompilerBuilder implements ViolationCodes {

     private static final class ExpandAndOutput extends Expansion {
         int exc[];
         int sub[];
         boolean incExc;
         /**
          * output those for which no errors in exclude,
          * and all errors in sub[] occur
          * or the inverse: at least one error in exclude
          * occurs, and at least one error in sub doesn't
          * @param exclude
          */
         ExpandAndOutput(int exclude[], int subset[], boolean incExc ) {
            exc = exclude;
            sub = subset;
            this.incExc = incExc;
         }
         int ruleCount = 1;

         @Override
         public void doIt(String regex, int eCount, int[] eCodes, int cCount,
                 String c[]) {

             if (incExc ==
                 ( (!overlap(exc,eCount, eCodes)) &&
                   subset(sub,eCount, eCodes) ) )
             try {
                 out.write("/*\n");
                 for (int j = 0; j < cCount; j++) {
                     out.write(c[j]);
                     out.write('\n');
                 }
                 out.write("*/\n");

                 out.write(regex);
                 out.write(" {\n");
                 count++;
                 out.write("rule("+count+"); ");
                 for (int i = 0; i < eCount; i++)
                     out.write("error(" + errorCodeName(eCodes[i]) + ");");
                 out.write("}\n");
             } catch (IOException e) {
                 throw new RuntimeException(e);
             }
         }
         private boolean subset(int ee[], int el, int[]eCodes) {
             for (int i=0;i<ee.length;i++)
                 if (!in(ee[i],el,eCodes))
                     return false;
             return true;
         }
         private boolean overlap(int ee[], int el, int[]eCodes) {
             for (int i=0;i<ee.length;i++)
                 if (in(ee[i],el,eCodes))
                     return true;
             return false;
         }
         private boolean in(int e0, int eCount, int[] eCodes) {
             for (int i=0; i<eCount; i++)
                 if (eCodes[i]==e0)
                      return true;
             return false;
         }
     }

     static long start;

     /**
      * Regenerates the lexer for the {@code host} component and prints the
      * total elapsed time in milliseconds.
      * <p>
      * Only {@code host} is built here; no calls are made for scheme,
      * userinfo, port, path, query or fragment.
      *
      * @param args ignored
      * @throws IOException propagated from {@code outRules}
      */
     public static void main(String[] args) throws IOException {
         start = System.currentTimeMillis();
         outRules("host");
         long elapsedMillis = System.currentTimeMillis() - start;
         System.out.println(elapsedMillis);
     }

     /**
      * Appends the full contents of a text file to the shared {@code out} writer.
      * <p>
      * The file is read with {@link FileReader}, i.e. using the platform
      * default charset. The reader is closed even when reading or writing
      * fails.
      *
      * @param fname path of the file to copy
      * @throws IOException if the file cannot be opened or read, or if writing
      *         to {@code out} fails
      */
     private static void copy(String fname) throws IOException {
         Reader in = new FileReader(fname);
         try {
             char buf[] = new char[2048];
             int sz;
             while ((sz = in.read(buf)) != -1) {
                 out.write(buf, 0, sz);
             }
         } finally {
             // Release the file handle on both the normal and the failure path.
             in.close();
         }
     }

     static String eCodeNames[];

     /**
      * Returns the name of the {@link ViolationCodes} constant whose value is
      * {@code j}, or {@code null} if no constant has that value.
      * <p>
      * The lookup table (200 slots) is built on first use and cached in
      * {@code eCodeNames}.
      *
      * @param j violation code, used directly as an index into the table
      */
     static String errorCodeName(int j) {
         String[] table = eCodeNames;
         if (table == null) {
             table = constantsFromClass(ViolationCodes.class, 200);
             eCodeNames = table;
         }
         return table[j];
     }

 	/**
 	 * Builds a value-to-name table from the static integer constants declared
 	 * directly by a class or interface.
 	 * <p>
 	 * Instance fields are ignored (they cannot be read without an object), and
 	 * static fields whose value is not readable as an {@code int} are skipped
 	 * silently. When two constants share a value, the one returned later by
 	 * {@link Class#getDeclaredFields()} wins.
 	 *
 	 * @param cl class or interface whose declared fields are inspected
 	 * @param cnt size of the returned table; every constant value must lie in
 	 *        {@code [0, cnt)}
 	 * @return array in which index {@code v} holds the name of the constant
 	 *         with value {@code v}, or {@code null} if there is none
 	 * @throws ArrayIndexOutOfBoundsException if a constant's value is outside
 	 *         {@code [0, cnt)}
 	 */
 	static String[] constantsFromClass(Class<?> cl, int cnt) {
 		String[] names = new String[cnt];
 		for (Field field : cl.getDeclaredFields()) {
 			// getInt(null) is only legal for static fields; on an instance
 			// field it would throw a NullPointerException.
 			if (!java.lang.reflect.Modifier.isStatic(field.getModifiers())) {
 				continue;
 			}
 			int value;
 			try {
 				value = field.getInt(null);
 			} catch (IllegalArgumentException notAnInt) {
 				// Static field of a type that does not widen to int: not a code.
 				continue;
 			} catch (IllegalAccessException e) {
 				// Best effort: report and carry on with the remaining fields.
 				System.err.println("Skipping inaccessible field "
 						+ cl.getName() + "." + field.getName() + ": " + e);
 				continue;
 			}
 			if (value < 0 || value >= cnt) {
 				throw new ArrayIndexOutOfBoundsException("Constant "
 						+ cl.getName() + "." + field.getName() + " = " + value
 						+ " does not fit a table of size " + cnt);
 			}
 			names[value] = field.getName();
 		}
 		return names;
 	}

     static int count;

     static Writer out;

     /**
      * Generates the JFlex specification for one component and runs JFlex on it.
      * <p>
      * The output file {@code <name>.jflex} starts with a copy of
      * {@code iri.jflex}, followed by a {@code %class Lexer<Name>} directive
      * and the rules produced by three filtered expansions of
      * {@code @{<name>}}. The static {@code count} is reset first and ends up
      * holding the number of rules written. The writer is closed even if
      * generation fails, and JFlex is only run after a successful write.
      *
      * @param name component name, e.g. {@code "host"}; must be non-empty
      * @throws IllegalArgumentException if {@code name} is null or empty
      * @throws IOException if the specification cannot be read or written
      */
     private static void outRules(String name) throws IOException {
         if (name == null || name.length() == 0) {
             // Fail before an output file named ".jflex" gets created.
             throw new IllegalArgumentException("component name must be non-empty");
         }
         final String jflexDir = "src/main/jflex/org/apache/jena/iri/impl/";
         final String jflexFile = jflexDir + name + ".jflex";
         final String pattern = "@{" + name + "}";

         count = 0;
         out = new FileWriter(jflexFile);
         try {
             copy(jflexDir + "iri.jflex");
             out.write("%class Lexer");
             // Locale.ROOT keeps the generated class name independent of the
             // default locale (e.g. Turkish upper-casing of 'i').
             out.write(name.substring(0, 1).toUpperCase(java.util.Locale.ROOT));
             out.write(name.substring(1));
             out.write("\n%%\n");

             int none[] = new int[0];
             int doubleDashOrNotDns[] =
                 new int[]{DOUBLE_DASH_IN_REG_NAME,NOT_DNS_NAME};
             int acePrefix[] = new int[]{ACE_PREFIX};
             int doubleDash[] = new int[]{DOUBLE_DASH_IN_REG_NAME};
             int notDns[] = new int[]{NOT_DNS_NAME};

             // Rules appear in the file in this call order:
             // 1. patterns with neither DOUBLE_DASH_IN_REG_NAME nor NOT_DNS_NAME,
             // 2. patterns with DOUBLE_DASH_IN_REG_NAME but without ACE_PREFIX,
             // 3. patterns with NOT_DNS_NAME.
             new ExpandAndOutput(doubleDashOrNotDns,none,true).expand(pattern);
             new ExpandAndOutput(acePrefix,doubleDash,true).expand(pattern);
             new ExpandAndOutput(none,notDns,true).expand(pattern);

             out.write("\n");
         } finally {
             // Always release the file handle, including on failure.
             out.close();
         }
         System.out.println(name + ": " + count + " expansions");

         MainGenerateLexers.runJFlex(new String[] { "-d", "src/main/java/org/apache/jena/iri/impl", jflexFile });
         System.out.println(System.currentTimeMillis() - start);
     }
     /*
      *
      * Unicode LTR stuff:
      *
      * 200E LEFT-TO-RIGHT MARK 200F RIGHT-TO-LEFT MARK 202A LEFT-TO-RIGHT
      * EMBEDDING 202B RIGHT-TO-LEFT EMBEDDING 202C POP DIRECTIONAL FORMATTING
      * 202D LEFT-TO-RIGHT OVERRIDE 202E RIGHT-TO-LEFT OVERRIDE
      *
      * XSD preserve No normalization is done, the value is not changed (this is
      * the behavior required by [XML 1.0 (Second Edition)] for element content)
      * replace All occurrences of #x9 (tab), #xA (line feed) and #xD (carriage
      * return) are replaced with #x20 (space) collapse After the processing
      * implied by replace, contiguous sequences of #x20's are collapsed to a
      * single #x20, and leading and trailing #x20's are removed.
      *
      *
      * <xs:simpleType name="anyURI" id="anyURI"> <xs:annotation> <xs:appinfo>
      * <hfp:hasFacet name="length"/> <hfp:hasFacet name="minLength"/>
      * <hfp:hasFacet name="maxLength"/> <hfp:hasFacet name="pattern"/>
      * <hfp:hasFacet name="enumeration"/> <hfp:hasFacet name="whiteSpace"/>
      * <hfp:hasProperty name="ordered" value="false"/> <hfp:hasProperty
      * name="bounded" value="false"/> <hfp:hasProperty name="cardinality"
      * value="countably infinite"/> <hfp:hasProperty name="numeric"
      * value="false"/> </xs:appinfo> <xs:documentation
      * source="http://www.w3.org/TR/xmlschema-2/#anyURI"/> </xs:annotation>
      * <xs:restriction base="xs:anySimpleType"> <xs:whiteSpace fixed="true"
      * value="collapse" id="anyURI.whiteSpace"/> </xs:restriction>
      * </xs:simpleType>
      *
      * XML 1.0
      *
      * [2] Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] |
      * [#x10000-#x10FFFF] /* any Unicode character, excluding the surrogate
      * blocks, FFFE, and FFFF.
      *
      *
      * Note:
      *
      * Document authors are encouraged to avoid "compatibility characters", as
      * defined in section 6.8 of [Unicode] (see also D21 in section 3.6 of
      * [Unicode3]). The characters defined in the following ranges are also
      * discouraged. They are either control characters or permanently undefined
      * Unicode characters:
      *
      * [#x7F-#x84], [#x86-#x9F], [#xFDD0-#xFDDF], [#1FFFE-#x1FFFF],
      * [#2FFFE-#x2FFFF], [#3FFFE-#x3FFFF], [#4FFFE-#x4FFFF], [#5FFFE-#x5FFFF],
      * [#6FFFE-#x6FFFF], [#7FFFE-#x7FFFF], [#8FFFE-#x8FFFF], [#9FFFE-#x9FFFF],
      * [#AFFFE-#xAFFFF], [#BFFFE-#xBFFFF], [#CFFFE-#xCFFFF], [#DFFFE-#xDFFFF],
      * [#EFFFE-#xEFFFF], [#FFFFE-#xFFFFF], [#10FFFE-#x10FFFF].
      *
      *
      * XML 1.1 [Definition: A parsed entity contains text, a sequence of
      * characters, which may represent markup or character data.] [Definition: A
      * character is an atomic unit of text as specified by ISO/IEC 10646
      * [ISO/IEC 10646]. Legal characters are tab, carriage return, line feed,
      * and the legal characters of Unicode and ISO/IEC 10646. The versions of
      * these standards cited in A.1 Normative References were current at the
      * time this document was prepared. New characters may be added to these
      * standards by amendments or new editions. Consequently, XML processors
      * MUST accept any character in the range specified for Char.] Character
      * Range [2] Char ::= [#x1-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF] /*
      * any Unicode character, excluding the surrogate blocks, FFFE, and FFFF. * /
      * [2a] RestrictedChar ::= [#x1-#x8] | [#xB-#xC] | [#xE-#x1F] | [#x7F-#x84] |
      * [#x86-#x9F]
      *
      * The mechanism for encoding character code points into bit patterns MAY
      * vary from entity to entity. All XML processors MUST accept the UTF-8 and
      * UTF-16 encodings of Unicode [Unicode]; the mechanisms for signaling which
      * of the two is in use, or for bringing other encodings into play, are
      * discussed later, in 4.3.3 Character Encoding in Entities.
      *
      * Note:
      *
      * Document authors are encouraged to avoid "compatibility characters", as
      * defined in Unicode [Unicode]. The characters defined in the following
      * ranges are also discouraged. They are either control characters or
      * permanently undefined Unicode characters:
      *
      * [#x7F-#x84], [#x86-#x9F], [#xFDD0-#xFDDF], [#1FFFE-#x1FFFF],
      * [#2FFFE-#x2FFFF], [#3FFFE-#x3FFFF], [#4FFFE-#x4FFFF], [#5FFFE-#x5FFFF],
      * [#6FFFE-#x6FFFF], [#7FFFE-#x7FFFF], [#8FFFE-#x8FFFF], [#9FFFE-#x9FFFF],
      * [#AFFFE-#xAFFFF], [#BFFFE-#xBFFFF], [#CFFFE-#xCFFFF], [#DFFFE-#xDFFFF],
      * [#EFFFE-#xEFFFF], [#FFFFE-#xFFFFF], [#10FFFE-#x10FFFF].
      */

 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one
	* or more contributor license agreements. See the NOTICE file
	* distributed with this work for additional information
	* regarding copyright ownership. The ASF licenses this file
	* to you under the Apache License, Version 2.0 (the
	* "License"); you may not use this file except in compliance
	* with the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	// TODO e-mail uri list about . at end of domain name
	// TODO e-mail uri list about IPv4 vs host:
	// If host matches the rule for IPv4address, then it should be considered an IPv4 address literal and not a reg-name.

	package buildlexer;

	import java.io.FileReader;
	import java.io.FileWriter;
	import java.io.IOException;
	import java.io.Reader;
	import java.io.Writer;
	import java.lang.reflect.Field;

	import org.apache.jena.iri.ViolationCodes ;


	public class PatternCompilerBuilder implements ViolationCodes {

	private static final class ExpandAndOutput extends Expansion {
	int exc[];
	int sub[];
	boolean incExc;
	/**
	* output those for which no errors in exclude,
	* and all errors in sub[] occur
	* or the inverse: at least one error in exclude
	* occurs, and at least one error in sub doesn't
	* @param exclude
	*/
	ExpandAndOutput(int exclude[], int subset[], boolean incExc ) {
	exc = exclude;
	sub = subset;
	this.incExc = incExc;
	}
	int ruleCount = 1;

	@Override
	public void doIt(String regex, int eCount, int[] eCodes, int cCount,
	String c[]) {

	if (incExc ==
	( (!overlap(exc,eCount, eCodes)) &&
	subset(sub,eCount, eCodes) ) )
	try {
	out.write("/*\n");
	for (int j = 0; j < cCount; j++) {
	out.write(c[j]);
	out.write('\n');
	}
	out.write("*/\n");

	out.write(regex);
	out.write(" {\n");
	count++;
	out.write("rule("+count+"); ");
	for (int i = 0; i < eCount; i++)
	out.write("error(" + errorCodeName(eCodes[i]) + ");");
	out.write("}\n");
	} catch (IOException e) {
	throw new RuntimeException(e);
	}
	}
	private boolean subset(int ee[], int el, int[]eCodes) {
	for (int i=0;i<ee.length;i++)
	if (!in(ee[i],el,eCodes))
	return false;
	return true;
	}
	private boolean overlap(int ee[], int el, int[]eCodes) {
	for (int i=0;i<ee.length;i++)
	if (in(ee[i],el,eCodes))
	return true;
	return false;
	}
	private boolean in(int e0, int eCount, int[] eCodes) {
	for (int i=0; i<eCount; i++)
	if (eCodes[i]==e0)
	return true;
	return false;
	}
	}

	static long start;

	/**
	 * Regenerates the lexer for the {@code host} component and prints the
	 * total elapsed time in milliseconds.
	 * <p>
	 * Only {@code host} is built here; no calls are made for scheme,
	 * userinfo, port, path, query or fragment.
	 *
	 * @param args ignored
	 * @throws IOException propagated from {@code outRules}
	 */
	public static void main(String[] args) throws IOException {
		start = System.currentTimeMillis();
		outRules("host");
		long elapsedMillis = System.currentTimeMillis() - start;
		System.out.println(elapsedMillis);
	}

	/**
	 * Appends the full contents of a text file to the shared {@code out} writer.
	 * <p>
	 * The file is read with {@link FileReader}, i.e. using the platform
	 * default charset. The reader is closed even when reading or writing
	 * fails.
	 *
	 * @param fname path of the file to copy
	 * @throws IOException if the file cannot be opened or read, or if writing
	 *         to {@code out} fails
	 */
	private static void copy(String fname) throws IOException {
		Reader in = new FileReader(fname);
		try {
			char buf[] = new char[2048];
			int sz;
			while ((sz = in.read(buf)) != -1) {
				out.write(buf, 0, sz);
			}
		} finally {
			// Release the file handle on both the normal and the failure path.
			in.close();
		}
	}

	static String eCodeNames[];

	/**
	 * Returns the name of the {@link ViolationCodes} constant whose value is
	 * {@code j}, or {@code null} if no constant has that value.
	 * <p>
	 * The lookup table (200 slots) is built on first use and cached in
	 * {@code eCodeNames}.
	 *
	 * @param j violation code, used directly as an index into the table
	 */
	static String errorCodeName(int j) {
		String[] table = eCodeNames;
		if (table == null) {
			table = constantsFromClass(ViolationCodes.class, 200);
			eCodeNames = table;
		}
		return table[j];
	}

	/**
	 * Builds a value-to-name table from the static integer constants declared
	 * directly by a class or interface.
	 * <p>
	 * Instance fields are ignored (they cannot be read without an object), and
	 * static fields whose value is not readable as an {@code int} are skipped
	 * silently. When two constants share a value, the one returned later by
	 * {@link Class#getDeclaredFields()} wins.
	 *
	 * @param cl class or interface whose declared fields are inspected
	 * @param cnt size of the returned table; every constant value must lie in
	 *        {@code [0, cnt)}
	 * @return array in which index {@code v} holds the name of the constant
	 *         with value {@code v}, or {@code null} if there is none
	 * @throws ArrayIndexOutOfBoundsException if a constant's value is outside
	 *         {@code [0, cnt)}
	 */
	static String[] constantsFromClass(Class<?> cl, int cnt) {
		String[] names = new String[cnt];
		for (Field field : cl.getDeclaredFields()) {
			// getInt(null) is only legal for static fields; on an instance
			// field it would throw a NullPointerException.
			if (!java.lang.reflect.Modifier.isStatic(field.getModifiers())) {
				continue;
			}
			int value;
			try {
				value = field.getInt(null);
			} catch (IllegalArgumentException notAnInt) {
				// Static field of a type that does not widen to int: not a code.
				continue;
			} catch (IllegalAccessException e) {
				// Best effort: report and carry on with the remaining fields.
				System.err.println("Skipping inaccessible field "
						+ cl.getName() + "." + field.getName() + ": " + e);
				continue;
			}
			if (value < 0 || value >= cnt) {
				throw new ArrayIndexOutOfBoundsException("Constant "
						+ cl.getName() + "." + field.getName() + " = " + value
						+ " does not fit a table of size " + cnt);
			}
			names[value] = field.getName();
		}
		return names;
	}

	static int count;

	static Writer out;

	/**
	 * Generates the JFlex specification for one component and runs JFlex on it.
	 * <p>
	 * The output file {@code <name>.jflex} starts with a copy of
	 * {@code iri.jflex}, followed by a {@code %class Lexer<Name>} directive
	 * and the rules produced by three filtered expansions of
	 * {@code @{<name>}}. The static {@code count} is reset first and ends up
	 * holding the number of rules written. The writer is closed even if
	 * generation fails, and JFlex is only run after a successful write.
	 *
	 * @param name component name, e.g. {@code "host"}; must be non-empty
	 * @throws IllegalArgumentException if {@code name} is null or empty
	 * @throws IOException if the specification cannot be read or written
	 */
	private static void outRules(String name) throws IOException {
		if (name == null || name.length() == 0) {
			// Fail before an output file named ".jflex" gets created.
			throw new IllegalArgumentException("component name must be non-empty");
		}
		final String jflexDir = "src/main/jflex/org/apache/jena/iri/impl/";
		final String jflexFile = jflexDir + name + ".jflex";
		final String pattern = "@{" + name + "}";

		count = 0;
		out = new FileWriter(jflexFile);
		try {
			copy(jflexDir + "iri.jflex");
			out.write("%class Lexer");
			// Locale.ROOT keeps the generated class name independent of the
			// default locale (e.g. Turkish upper-casing of 'i').
			out.write(name.substring(0, 1).toUpperCase(java.util.Locale.ROOT));
			out.write(name.substring(1));
			out.write("\n%%\n");

			int none[] = new int[0];
			int doubleDashOrNotDns[] =
				new int[]{DOUBLE_DASH_IN_REG_NAME,NOT_DNS_NAME};
			int acePrefix[] = new int[]{ACE_PREFIX};
			int doubleDash[] = new int[]{DOUBLE_DASH_IN_REG_NAME};
			int notDns[] = new int[]{NOT_DNS_NAME};

			// Rules appear in the file in this call order:
			// 1. patterns with neither DOUBLE_DASH_IN_REG_NAME nor NOT_DNS_NAME,
			// 2. patterns with DOUBLE_DASH_IN_REG_NAME but without ACE_PREFIX,
			// 3. patterns with NOT_DNS_NAME.
			new ExpandAndOutput(doubleDashOrNotDns,none,true).expand(pattern);
			new ExpandAndOutput(acePrefix,doubleDash,true).expand(pattern);
			new ExpandAndOutput(none,notDns,true).expand(pattern);

			out.write("\n");
		} finally {
			// Always release the file handle, including on failure.
			out.close();
		}
		System.out.println(name + ": " + count + " expansions");

		MainGenerateLexers.runJFlex(new String[] { "-d", "src/main/java/org/apache/jena/iri/impl", jflexFile });
		System.out.println(System.currentTimeMillis() - start);
	}
	/*
	*
	* Unicode LTR stuff:
	*
	* 200E LEFT-TO-RIGHT MARK 200F RIGHT-TO-LEFT MARK 202A LEFT-TO-RIGHT
	* EMBEDDING 202B RIGHT-TO-LEFT EMBEDDING 202C POP DIRECTIONAL FORMATTING
	* 202D LEFT-TO-RIGHT OVERRIDE 202E RIGHT-TO-LEFT OVERRIDE
	*
	* XSD preserve No normalization is done, the value is not changed (this is
	* the behavior required by [XML 1.0 (Second Edition)] for element content)
	* replace All occurrences of #x9 (tab), #xA (line feed) and #xD (carriage
	* return) are replaced with #x20 (space) collapse After the processing
	* implied by replace, contiguous sequences of #x20's are collapsed to a
	* single #x20, and leading and trailing #x20's are removed.
	*
	*
	* <xs:simpleType name="anyURI" id="anyURI"> <xs:annotation> <xs:appinfo>
	* <hfp:hasFacet name="length"/> <hfp:hasFacet name="minLength"/>
	* <hfp:hasFacet name="maxLength"/> <hfp:hasFacet name="pattern"/>
	* <hfp:hasFacet name="enumeration"/> <hfp:hasFacet name="whiteSpace"/>
	* <hfp:hasProperty name="ordered" value="false"/> <hfp:hasProperty
	* name="bounded" value="false"/> <hfp:hasProperty name="cardinality"
	* value="countably infinite"/> <hfp:hasProperty name="numeric"
	* value="false"/> </xs:appinfo> <xs:documentation
	* source="http://www.w3.org/TR/xmlschema-2/#anyURI"/> </xs:annotation>
	* <xs:restriction base="xs:anySimpleType"> <xs:whiteSpace fixed="true"
	* value="collapse" id="anyURI.whiteSpace"/> </xs:restriction>
	* </xs:simpleType>
	*
	* XML 1.0
	*
	* [2] Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] |
	* [#x10000-#x10FFFF] /* any Unicode character, excluding the surrogate
	* blocks, FFFE, and FFFF.
	*
	*
	* Note:
	*
	* Document authors are encouraged to avoid "compatibility characters", as
	* defined in section 6.8 of [Unicode] (see also D21 in section 3.6 of
	* [Unicode3]). The characters defined in the following ranges are also
	* discouraged. They are either control characters or permanently undefined
	* Unicode characters:
	*
	* [#x7F-#x84], [#x86-#x9F], [#xFDD0-#xFDDF], [#1FFFE-#x1FFFF],
	* [#2FFFE-#x2FFFF], [#3FFFE-#x3FFFF], [#4FFFE-#x4FFFF], [#5FFFE-#x5FFFF],
	* [#6FFFE-#x6FFFF], [#7FFFE-#x7FFFF], [#8FFFE-#x8FFFF], [#9FFFE-#x9FFFF],
	* [#AFFFE-#xAFFFF], [#BFFFE-#xBFFFF], [#CFFFE-#xCFFFF], [#DFFFE-#xDFFFF],
	* [#EFFFE-#xEFFFF], [#FFFFE-#xFFFFF], [#10FFFE-#x10FFFF].
	*
	*
	* XML 1.1 [Definition: A parsed entity contains text, a sequence of
	* characters, which may represent markup or character data.] [Definition: A
	* character is an atomic unit of text as specified by ISO/IEC 10646
	* [ISO/IEC 10646]. Legal characters are tab, carriage return, line feed,
	* and the legal characters of Unicode and ISO/IEC 10646. The versions of
	* these standards cited in A.1 Normative References were current at the
	* time this document was prepared. New characters may be added to these
	* standards by amendments or new editions. Consequently, XML processors
	* MUST accept any character in the range specified for Char.] Character
	* Range [2] Char ::= [#x1-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF] /*
	* any Unicode character, excluding the surrogate blocks, FFFE, and FFFF. * /
	* [2a] RestrictedChar ::= [#x1-#x8] | [#xB-#xC] | [#xE-#x1F] | [#x7F-#x84] |
	* [#x86-#x9F]
	*
	* The mechanism for encoding character code points into bit patterns MAY
	* vary from entity to entity. All XML processors MUST accept the UTF-8 and
	* UTF-16 encodings of Unicode [Unicode]; the mechanisms for signaling which
	* of the two is in use, or for bringing other encodings into play, are
	* discussed later, in 4.3.3 Character Encoding in Entities.
	*
	* Note:
	*
	* Document authors are encouraged to avoid "compatibility characters", as
	* defined in Unicode [Unicode]. The characters defined in the following
	* ranges are also discouraged. They are either control characters or
	* permanently undefined Unicode characters:
	*
	* [#x7F-#x84], [#x86-#x9F], [#xFDD0-#xFDDF], [#1FFFE-#x1FFFF],
	* [#2FFFE-#x2FFFF], [#3FFFE-#x3FFFF], [#4FFFE-#x4FFFF], [#5FFFE-#x5FFFF],
	* [#6FFFE-#x6FFFF], [#7FFFE-#x7FFFF], [#8FFFE-#x8FFFF], [#9FFFE-#x9FFFF],
	* [#AFFFE-#xAFFFF], [#BFFFE-#xBFFFF], [#CFFFE-#xCFFFF], [#DFFFE-#xDFFFF],
	* [#EFFFE-#xEFFFF], [#FFFFE-#xFFFFF], [#10FFFE-#x10FFFF].
	*/

	}