blob: 1eb177def78341c4f67a360b89818f3dec6a08dd [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// TODO: e-mail the URI list about a trailing "." at the end of a domain name.
// TODO: e-mail the URI list about IPv4 vs. host, regarding the rule:
//   "If host matches the rule for IPv4address, then it should be considered
//   an IPv4 address literal and not a reg-name."
package buildlexer;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.Reader;
import java.io.Writer;
import java.lang.reflect.Field;
import java.lang.reflect.Modifier;
import org.apache.jena.iri.ViolationCodes ;
public class PatternCompilerBuilder implements ViolationCodes {
private static final class ExpandAndOutput extends Expansion {
int exc[];
int sub[];
boolean incExc;
/**
* output those for which no errors in exclude,
* and all errors in sub[] occur
* or the inverse: at least one error in exclude
* occurs, and at least one error in sub doesn't
* @param exclude
*/
ExpandAndOutput(int exclude[], int subset[], boolean incExc ) {
exc = exclude;
sub = subset;
this.incExc = incExc;
}
int ruleCount = 1;
@Override
public void doIt(String regex, int eCount, int[] eCodes, int cCount,
String c[]) {
if (incExc ==
( (!overlap(exc,eCount, eCodes)) &&
subset(sub,eCount, eCodes) ) )
try {
out.write("/*\n");
for (int j = 0; j < cCount; j++) {
out.write(c[j]);
out.write('\n');
}
out.write("*/\n");
out.write(regex);
out.write(" {\n");
count++;
out.write("rule("+count+"); ");
for (int i = 0; i < eCount; i++)
out.write("error(" + errorCodeName(eCodes[i]) + ");");
out.write("}\n");
} catch (IOException e) {
throw new RuntimeException(e);
}
}
private boolean subset(int ee[], int el, int[]eCodes) {
for (int i=0;i<ee.length;i++)
if (!in(ee[i],el,eCodes))
return false;
return true;
}
private boolean overlap(int ee[], int el, int[]eCodes) {
for (int i=0;i<ee.length;i++)
if (in(ee[i],el,eCodes))
return true;
return false;
}
private boolean in(int e0, int eCount, int[] eCodes) {
for (int i=0; i<eCount; i++)
if (eCodes[i]==e0)
return true;
return false;
}
}
/** Wall-clock time in milliseconds captured when main starts; also read by outRules. */
static long start;

/**
 * Entry point: regenerates the lexer rules for the "host" component and
 * prints the total elapsed time in milliseconds.
 *
 * @param args ignored
 * @throws IOException if generating the rules fails
 */
static public void main(String args[]) throws IOException {
    start = System.currentTimeMillis();
    // Only "host" is generated. The combined iri2.jflex build and the other
    // components are currently disabled:
    //   outRules("scheme"); outRules("userinfo"); outRules("port");
    //   outRules("path"); outRules("query"); outRules("fragment");
    outRules("host");
    long elapsedMillis = System.currentTimeMillis() - start;
    System.out.println(elapsedMillis);
}
/**
 * Appends the whole content of the file {@code fname} to the shared writer
 * {@code out}.
 *
 * The reader is opened in a try-with-resources statement so it is closed
 * even when reading the file or writing to {@code out} fails. {@code out}
 * itself is left open for the caller.
 *
 * @param fname path of the file to copy
 * @throws IOException if the file cannot be opened or read, or if writing fails
 */
private static void copy(String fname) throws IOException {
    try (Reader in = new FileReader(fname)) {
        char buf[] = new char[2048];
        int sz;
        // read() returns -1 at end of stream
        while ((sz = in.read(buf)) != -1) {
            out.write(buf, 0, sz);
        }
    }
}
/** Code-to-name table for ViolationCodes constants; null until the first lookup. */
static String eCodeNames[];

/**
 * Returns the name of the {@code ViolationCodes} constant with value {@code j}.
 *
 * The lookup table (200 slots) is built on the first call and reused
 * afterwards. A slot with no corresponding constant yields {@code null}.
 *
 * @param j violation code, used directly as an index into the table
 * @return the constant's name, or null if no constant has that value
 */
static String errorCodeName(int j) {
    String[] table = eCodeNames;
    if (table == null) {
        table = constantsFromClass(ViolationCodes.class, 200);
        eCodeNames = table;
    }
    return table[j];
}
/**
 * Builds a table mapping the value of each static integer constant declared
 * by {@code cl} to that constant's name.
 *
 * Only static fields whose type can be read with {@code Field.getInt}
 * (int, short, byte, char) are considered; instance fields and fields of any
 * other type are skipped. Fields that cannot be accessed are reported on
 * standard error and skipped. If two constants share a value, the one
 * visited last wins.
 *
 * @param cl  class or interface whose declared fields are inspected
 * @param cnt size of the returned table; every constant value must lie in
 *            {@code [0, cnt)}
 * @return array of length {@code cnt}; entry {@code v} is the name of the
 *         constant with value {@code v}, or null if there is none
 * @throws ArrayIndexOutOfBoundsException if a constant's value is outside
 *         {@code [0, cnt)}
 */
static String[] constantsFromClass(Class<?> cl, int cnt) {
    String[] names = new String[cnt];
    for (Field field : cl.getDeclaredFields()) {
        if (!isStaticIntConstant(field)) {
            continue;
        }
        int value;
        try {
            // static field: the receiver argument is ignored, so null is fine
            value = field.getInt(null);
        } catch (IllegalAccessException e) {
            System.err.println("Skipping inaccessible constant " + field + ": " + e);
            continue;
        }
        if (value < 0 || value >= cnt) {
            throw new ArrayIndexOutOfBoundsException(
                    "Constant " + field.getName() + " = " + value
                    + " does not fit a table of size " + cnt);
        }
        names[value] = field.getName();
    }
    return names;
}

/** True for static fields whose value {@code Field.getInt} can read without conversion errors. */
private static boolean isStaticIntConstant(Field field) {
    if (!Modifier.isStatic(field.getModifiers())) {
        return false;
    }
    Class<?> type = field.getType();
    return type == int.class || type == short.class
            || type == byte.class || type == char.class;
}
/** Number of rules written for the component currently being generated. */
static int count;
/** Writer for the .jflex file being generated; shared with copy and ExpandAndOutput. */
static Writer out;

/**
 * Generates {@code <name>.jflex} for one IRI component and runs JFlex on it.
 *
 * The file starts with a copy of iri.jflex, followed by a
 * {@code %class Lexer<Name>} directive and the rules produced by three
 * filtered passes over the pattern {@code @{name}}:
 * <ol>
 * <li>expansions with neither DOUBLE_DASH_IN_REG_NAME nor NOT_DNS_NAME;</li>
 * <li>expansions with DOUBLE_DASH_IN_REG_NAME but without ACE_PREFIX;</li>
 * <li>expansions with NOT_DNS_NAME.</li>
 * </ol>
 * The writer is closed before JFlex is started, and is also closed if any
 * step of the generation throws.
 *
 * @param name component name, e.g. "host"; used for the file name, the
 *             lexer class name and the pattern reference
 * @throws IOException if the template cannot be read or the output cannot
 *                     be written
 */
static private void outRules(String name) throws IOException {
    count = 0;
    String jflexFile = "src/main/jflex/org/apache/jena/iri/impl/" + name + ".jflex";
    String pattern = "@{" + name + "}";
    try (Writer writer = new FileWriter(jflexFile)) {
        out = writer;
        copy("src/main/jflex/org/apache/jena/iri/impl/iri.jflex");
        out.write("%class Lexer");
        out.write(name.substring(0, 1).toUpperCase());
        out.write(name.substring(1));
        out.write("\n%%\n");

        int exc1[] = new int[]{DOUBLE_DASH_IN_REG_NAME, NOT_DNS_NAME};
        int empty[] = new int[0];
        int sub1[] = new int[]{ACE_PREFIX};
        int sub4[] = new int[]{DOUBLE_DASH_IN_REG_NAME};
        int sub3[] = new int[]{NOT_DNS_NAME};

        new ExpandAndOutput(exc1, empty, true).expand(pattern);
        // A pass requiring both DOUBLE_DASH_IN_REG_NAME and ACE_PREFIX is
        // intentionally not run (it was disabled in the original code).
        new ExpandAndOutput(sub1, sub4, true).expand(pattern);
        new ExpandAndOutput(empty, sub3, true).expand(pattern);
        out.write("\n");
    }
    System.out.println(name + ": " + count + " expansions");
    // The file must be closed (flushed) before JFlex reads it.
    MainGenerateLexers.runJFlex(new String[] { "-d", "src/main/java/org/apache/jena/iri/impl", jflexFile });
    System.out.println(System.currentTimeMillis() - start);
}
/*
*
* Unicode LTR stuff:
*
* 200E LEFT-TO-RIGHT MARK
* 200F RIGHT-TO-LEFT MARK
* 202A LEFT-TO-RIGHT EMBEDDING
* 202B RIGHT-TO-LEFT EMBEDDING
* 202C POP DIRECTIONAL FORMATTING
* 202D LEFT-TO-RIGHT OVERRIDE
* 202E RIGHT-TO-LEFT OVERRIDE
*
* XSD whiteSpace facet values:
* preserve: No normalization is done, the value is not changed (this is
* the behavior required by [XML 1.0 (Second Edition)] for element content).
* replace: All occurrences of #x9 (tab), #xA (line feed) and #xD (carriage
* return) are replaced with #x20 (space).
* collapse: After the processing implied by replace, contiguous sequences
* of #x20's are collapsed to a single #x20, and leading and trailing
* #x20's are removed.
*
*
* <xs:simpleType name="anyURI" id="anyURI"> <xs:annotation> <xs:appinfo>
* <hfp:hasFacet name="length"/> <hfp:hasFacet name="minLength"/>
* <hfp:hasFacet name="maxLength"/> <hfp:hasFacet name="pattern"/>
* <hfp:hasFacet name="enumeration"/> <hfp:hasFacet name="whiteSpace"/>
* <hfp:hasProperty name="ordered" value="false"/> <hfp:hasProperty
* name="bounded" value="false"/> <hfp:hasProperty name="cardinality"
* value="countably infinite"/> <hfp:hasProperty name="numeric"
* value="false"/> </xs:appinfo> <xs:documentation
* source="http://www.w3.org/TR/xmlschema-2/#anyURI"/> </xs:annotation>
* <xs:restriction base="xs:anySimpleType"> <xs:whiteSpace fixed="true"
* value="collapse" id="anyURI.whiteSpace"/> </xs:restriction>
* </xs:simpleType>
*
* XML 1.0
*
* [2] Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] |
* [#x10000-#x10FFFF] /* any Unicode character, excluding the surrogate
* blocks, FFFE, and FFFF. * /
*
*
* Note:
*
* Document authors are encouraged to avoid "compatibility characters", as
* defined in section 6.8 of [Unicode] (see also D21 in section 3.6 of
* [Unicode3]). The characters defined in the following ranges are also
* discouraged. They are either control characters or permanently undefined
* Unicode characters:
*
* [#x7F-#x84], [#x86-#x9F], [#xFDD0-#xFDDF], [#1FFFE-#x1FFFF],
* [#2FFFE-#x2FFFF], [#3FFFE-#x3FFFF], [#4FFFE-#x4FFFF], [#5FFFE-#x5FFFF],
* [#6FFFE-#x6FFFF], [#7FFFE-#x7FFFF], [#8FFFE-#x8FFFF], [#9FFFE-#x9FFFF],
* [#AFFFE-#xAFFFF], [#BFFFE-#xBFFFF], [#CFFFE-#xCFFFF], [#DFFFE-#xDFFFF],
* [#EFFFE-#xEFFFF], [#FFFFE-#xFFFFF], [#10FFFE-#x10FFFF].
*
*
* XML 1.1 [Definition: A parsed entity contains text, a sequence of
* characters, which may represent markup or character data.] [Definition: A
* character is an atomic unit of text as specified by ISO/IEC 10646
* [ISO/IEC 10646]. Legal characters are tab, carriage return, line feed,
* and the legal characters of Unicode and ISO/IEC 10646. The versions of
* these standards cited in A.1 Normative References were current at the
* time this document was prepared. New characters may be added to these
* standards by amendments or new editions. Consequently, XML processors
* MUST accept any character in the range specified for Char.] Character
* Range [2] Char ::= [#x1-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF] /*
* any Unicode character, excluding the surrogate blocks, FFFE, and FFFF. * /
* [2a] RestrictedChar ::= [#x1-#x8] | [#xB-#xC] | [#xE-#x1F] | [#x7F-#x84] |
* [#x86-#x9F]
*
* The mechanism for encoding character code points into bit patterns MAY
* vary from entity to entity. All XML processors MUST accept the UTF-8 and
* UTF-16 encodings of Unicode [Unicode]; the mechanisms for signaling which
* of the two is in use, or for bringing other encodings into play, are
* discussed later, in 4.3.3 Character Encoding in Entities.
*
* Note:
*
* Document authors are encouraged to avoid "compatibility characters", as
* defined in Unicode [Unicode]. The characters defined in the following
* ranges are also discouraged. They are either control characters or
* permanently undefined Unicode characters:
*
* [#x7F-#x84], [#x86-#x9F], [#xFDD0-#xFDDF], [#1FFFE-#x1FFFF],
* [#2FFFE-#x2FFFF], [#3FFFE-#x3FFFF], [#4FFFE-#x4FFFF], [#5FFFE-#x5FFFF],
* [#6FFFE-#x6FFFF], [#7FFFE-#x7FFFF], [#8FFFE-#x8FFFF], [#9FFFE-#x9FFFF],
* [#AFFFE-#xAFFFF], [#BFFFE-#xBFFFF], [#CFFFE-#xCFFFF], [#DFFFE-#xDFFFF],
* [#EFFFE-#xEFFFF], [#FFFFE-#xFFFFF], [#10FFFE-#x10FFFF].
*/
}