| # Licensed to the Apache Software Foundation (ASF) under one or more |
| # contributor license agreements. See the NOTICE file distributed with |
| # this work for additional information regarding copyright ownership. |
| # The ASF licenses this file to You under the Apache License, Version 2.0 |
| # (the "License"); you may not use this file except in compliance with |
| # the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, software |
| # distributed under the License is distributed on an "AS IS" BASIS, |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| # See the License for the specific language governing permissions and |
| # limitations under the License. |
| |
| import re |
| |
# A simple Python script that prints a regex alternation and an HTML entity map
# for inclusion in HTMLStripCharFilter.jflex.
| |
# Entities whose all-uppercase spelling (e.g. "&QUOT;") is accepted as well.
# Single source of truth for both the regex alternation and the Java map, so
# the two generated fragments cannot drift apart.
_UPPER_CASE_VARIANTS = ('quot', 'copy', 'gt', 'lt', 'reg', 'amp')

# A generated line is flushed before it would reach this many characters.
_MAX_LINE_WIDTH = 80


def _parse_entity_codes(entity_text):
    """Map each entity name declared in ``entity_text`` to a Java literal body.

    Only lines of the form ``<!ENTITY name "&#NNN;"`` (the value may also be
    written ``"&#38;#NNN;"``) are recognized; every other line is ignored.
    """
    declaration = re.compile(r'\s*<!ENTITY\s+(\S+)\s+"&(?:#38;)?#(\d+);"')
    codes = {}
    for line in entity_text.split('\n'):
        found = declaration.match(line)
        if not found:
            continue
        name = found.group(1)
        if name == 'quot':
            # Emitted as an escaped quote so the generated Java string
            # literal stays well-formed.
            codes[name] = r'\"'
        elif name == 'nbsp':
            # nbsp gets a literal one-character value instead of a \u escape.
            codes[name] = ' '
        else:
            codes[name] = r'\u%04X' % int(found.group(2))
    return codes


def _alternation_entries(keys):
    """Yield the quoted alternatives, in order, for the CharacterEntities rule."""
    separator = ''
    for key in keys:
        yield '%s"%s"' % (separator, key)
        separator = ' | '
        if key in _UPPER_CASE_VARIANTS:
            yield ' | "%s"' % key.upper()


def _wrap(entries, first_prefix, continuation_prefix):
    """Concatenate ``entries`` into lines shorter than ``_MAX_LINE_WIDTH``.

    The first line starts with ``first_prefix`` and every later line with
    ``continuation_prefix``.  An entry is never split, so a single over-long
    entry still yields an over-long line.  At least one line is returned,
    even when ``entries`` is empty.
    """
    lines = []
    current = first_prefix
    for entry in entries:
        if len(current) + len(entry) >= _MAX_LINE_WIDTH:
            lines.append(current)
            current = continuation_prefix
        current += entry
    lines.append(current)
    return lines


def main():
    """Print the generated fragment to standard output.

    The output consists of the license header, the ``CharacterEntities``
    regex alternation, and a ``%{ ... %}`` Java section that builds the
    entity-name-to-character map.
    """
    print(get_apache_license())
    codes = _parse_entity_codes(get_entity_text())
    keys = sorted(codes)

    alternation = _wrap(_alternation_entries(keys),
                        'CharacterEntities = ( ', ' ')
    alternation[-1] += ' )'
    print('\n'.join(alternation))

    print('%{')
    print(' private static final Map<String,String> upperCaseVariantsAccepted')
    print(' = new HashMap<>();')
    print(' static {')
    for key in _UPPER_CASE_VARIANTS:
        print(' upperCaseVariantsAccepted.put("%s", "%s");'
              % (key, key.upper()))
    print(' }')
    print(' private static final CharArrayMap<Character> entityValues')
    print(' = new CharArrayMap<>(%i, false);' % len(keys))
    print(' static {')
    print(' String[] entities = {')
    pairs = _wrap((' "%s", "%s",' % (key, codes[key]) for key in keys),
                  ' ', ' ')
    # Drop the final character: the comma after the last pair (or the bare
    # prefix when there are no entities at all).
    pairs[-1] = pairs[-1][:-1]
    print('\n'.join(pairs))
    print(' };')
    print(' for (int i = 0 ; i < entities.length ; i += 2) {')
    print(' Character value = entities[i + 1].charAt(0);')
    print(' entityValues.put(entities[i], value);')
    print(' String upperCaseVariant = upperCaseVariantsAccepted.get(entities[i]);')
    print(' if (upperCaseVariant != null) {')
    print(' entityValues.put(upperCaseVariant, value);')
    print(' }')
    print(' }')
    print(" }")
    print("%}")
| |
def get_entity_text():
    """Return the W3C XHTML character entity declarations as one string.

    The text holds the three entity sets (Latin 1, Special, Symbols) from
    section F.1 "XHTML Character Entities" of the W3C XHTML Modularization
    recommendation, as identified by the download URLs quoted in the text
    itself.  Every entity value is a decimal numeric character reference
    (``"&#NNN;"``, or ``"&#38;#NNN;"`` for ``lt`` and ``amp``), which is the
    only form the regex in main() recognizes.
    """
    # Keep the values as numeric references.  A literal character is not
    # matched by main(), is vulnerable to Unicode normalization (U+2329 and
    # U+232A silently become U+3008 and U+3009), and a literal quotation mark
    # for "quot" would terminate this triple-quoted string.
    return r"""
F.1. XHTML Character Entities

XHTML DTDs make available a standard collection of named character entities. Those entities are defined in this section.
F.1.1. XHTML Latin 1 Character Entities

You can download this version of this file from http://www.w3.org/TR/2010/REC-xhtml-modularization/DTD/xhtml-lat1.ent. The latest version is available at http://www.w3.org/MarkUp/DTD/xhtml-lat1.ent.

<!-- ...................................................................... -->
<!-- XML-compatible ISO Latin 1 Character Entity Set for XHTML ............ -->
<!-- file: xhtml-lat1.ent

Typical invocation:

<!ENTITY % xhtml-lat1
PUBLIC "-//W3C//ENTITIES Latin 1 for XHTML//EN"
"xhtml-lat1.ent" >
%xhtml-lat1;

This DTD module is identified by the PUBLIC and SYSTEM identifiers:

PUBLIC "-//W3C//ENTITIES Latin 1 for XHTML//EN"
SYSTEM "http://www.w3.org/MarkUp/DTD/xhtml-lat1.ent"

Revision: Id: xhtml-lat1.ent,v 4.1 2001/04/10 09:34:14 altheim Exp $ SMI

Portions (C) International Organization for Standardization 1986:
Permission to copy in any form is granted for use with conforming
SGML systems and applications as defined in ISO 8879, provided
this notice is included in all copies.
-->

<!ENTITY nbsp "&#160;" ><!-- no-break space = non-breaking space, U+00A0 ISOnum -->
<!ENTITY iexcl "&#161;" ><!-- inverted exclamation mark, U+00A1 ISOnum -->
<!ENTITY cent "&#162;" ><!-- cent sign, U+00A2 ISOnum -->
<!ENTITY pound "&#163;" ><!-- pound sign, U+00A3 ISOnum -->
<!ENTITY curren "&#164;" ><!-- currency sign, U+00A4 ISOnum -->
<!ENTITY yen "&#165;" ><!-- yen sign = yuan sign, U+00A5 ISOnum -->
<!ENTITY brvbar "&#166;" ><!-- broken bar = broken vertical bar, U+00A6 ISOnum -->
<!ENTITY sect "&#167;" ><!-- section sign, U+00A7 ISOnum -->
<!ENTITY uml "&#168;" ><!-- diaeresis = spacing diaeresis, U+00A8 ISOdia -->
<!ENTITY copy "&#169;" ><!-- copyright sign, U+00A9 ISOnum -->
<!ENTITY ordf "&#170;" ><!-- feminine ordinal indicator, U+00AA ISOnum -->
<!ENTITY laquo "&#171;" ><!-- left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum -->
<!ENTITY not "&#172;" ><!-- not sign, U+00AC ISOnum -->
<!ENTITY shy "&#173;" ><!-- soft hyphen = discretionary hyphen, U+00AD ISOnum -->
<!ENTITY reg "&#174;" ><!-- registered sign = registered trade mark sign, U+00AE ISOnum -->
<!ENTITY macr "&#175;" ><!-- macron = spacing macron = overline = APL overbar, U+00AF ISOdia -->
<!ENTITY deg "&#176;" ><!-- degree sign, U+00B0 ISOnum -->
<!ENTITY plusmn "&#177;" ><!-- plus-minus sign = plus-or-minus sign, U+00B1 ISOnum -->
<!ENTITY sup2 "&#178;" ><!-- superscript two = superscript digit two = squared, U+00B2 ISOnum -->
<!ENTITY sup3 "&#179;" ><!-- superscript three = superscript digit three = cubed, U+00B3 ISOnum -->
<!ENTITY acute "&#180;" ><!-- acute accent = spacing acute, U+00B4 ISOdia -->
<!ENTITY micro "&#181;" ><!-- micro sign, U+00B5 ISOnum -->
<!ENTITY para "&#182;" ><!-- pilcrow sign = paragraph sign, U+00B6 ISOnum -->
<!ENTITY middot "&#183;" ><!-- middle dot = Georgian comma = Greek middle dot, U+00B7 ISOnum -->
<!ENTITY cedil "&#184;" ><!-- cedilla = spacing cedilla, U+00B8 ISOdia -->
<!ENTITY sup1 "&#185;" ><!-- superscript one = superscript digit one, U+00B9 ISOnum -->
<!ENTITY ordm "&#186;" ><!-- masculine ordinal indicator, U+00BA ISOnum -->
<!ENTITY raquo "&#187;" ><!-- right-pointing double angle quotation mark = right pointing guillemet, U+00BB ISOnum -->
<!ENTITY frac14 "&#188;" ><!-- vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum -->
<!ENTITY frac12 "&#189;" ><!-- vulgar fraction one half = fraction one half, U+00BD ISOnum -->
<!ENTITY frac34 "&#190;" ><!-- vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum -->
<!ENTITY iquest "&#191;" ><!-- inverted question mark = turned question mark, U+00BF ISOnum -->
<!ENTITY Agrave "&#192;" ><!-- latin capital A with grave = latin capital A grave, U+00C0 ISOlat1 -->
<!ENTITY Aacute "&#193;" ><!-- latin capital A with acute, U+00C1 ISOlat1 -->
<!ENTITY Acirc "&#194;" ><!-- latin capital A with circumflex, U+00C2 ISOlat1 -->
<!ENTITY Atilde "&#195;" ><!-- latin capital A with tilde, U+00C3 ISOlat1 -->
<!ENTITY Auml "&#196;" ><!-- latin capital A with diaeresis, U+00C4 ISOlat1 -->
<!ENTITY Aring "&#197;" ><!-- latin capital A with ring above = latin capital A ring, U+00C5 ISOlat1 -->
<!ENTITY AElig "&#198;" ><!-- latin capital AE = latin capital ligature AE, U+00C6 ISOlat1 -->
<!ENTITY Ccedil "&#199;" ><!-- latin capital C with cedilla, U+00C7 ISOlat1 -->
<!ENTITY Egrave "&#200;" ><!-- latin capital E with grave, U+00C8 ISOlat1 -->
<!ENTITY Eacute "&#201;" ><!-- latin capital E with acute, U+00C9 ISOlat1 -->
<!ENTITY Ecirc "&#202;" ><!-- latin capital E with circumflex, U+00CA ISOlat1 -->
<!ENTITY Euml "&#203;" ><!-- latin capital E with diaeresis, U+00CB ISOlat1 -->
<!ENTITY Igrave "&#204;" ><!-- latin capital I with grave, U+00CC ISOlat1 -->
<!ENTITY Iacute "&#205;" ><!-- latin capital I with acute, U+00CD ISOlat1 -->
<!ENTITY Icirc "&#206;" ><!-- latin capital I with circumflex, U+00CE ISOlat1 -->
<!ENTITY Iuml "&#207;" ><!-- latin capital I with diaeresis, U+00CF ISOlat1 -->
<!ENTITY ETH "&#208;" ><!-- latin capital ETH, U+00D0 ISOlat1 -->
<!ENTITY Ntilde "&#209;" ><!-- latin capital N with tilde, U+00D1 ISOlat1 -->
<!ENTITY Ograve "&#210;" ><!-- latin capital O with grave, U+00D2 ISOlat1 -->
<!ENTITY Oacute "&#211;" ><!-- latin capital O with acute, U+00D3 ISOlat1 -->
<!ENTITY Ocirc "&#212;" ><!-- latin capital O with circumflex, U+00D4 ISOlat1 -->
<!ENTITY Otilde "&#213;" ><!-- latin capital O with tilde, U+00D5 ISOlat1 -->
<!ENTITY Ouml "&#214;" ><!-- latin capital O with diaeresis, U+00D6 ISOlat1 -->
<!ENTITY times "&#215;" ><!-- multiplication sign, U+00D7 ISOnum -->
<!ENTITY Oslash "&#216;" ><!-- latin capital O with stroke = latin capital O slash, U+00D8 ISOlat1 -->
<!ENTITY Ugrave "&#217;" ><!-- latin capital U with grave, U+00D9 ISOlat1 -->
<!ENTITY Uacute "&#218;" ><!-- latin capital U with acute, U+00DA ISOlat1 -->
<!ENTITY Ucirc "&#219;" ><!-- latin capital U with circumflex, U+00DB ISOlat1 -->
<!ENTITY Uuml "&#220;" ><!-- latin capital U with diaeresis, U+00DC ISOlat1 -->
<!ENTITY Yacute "&#221;" ><!-- latin capital Y with acute, U+00DD ISOlat1 -->
<!ENTITY THORN "&#222;" ><!-- latin capital THORN, U+00DE ISOlat1 -->
<!ENTITY szlig "&#223;" ><!-- latin small sharp s = ess-zed, U+00DF ISOlat1 -->
<!ENTITY agrave "&#224;" ><!-- latin small a with grave = latin small a grave, U+00E0 ISOlat1 -->
<!ENTITY aacute "&#225;" ><!-- latin small a with acute, U+00E1 ISOlat1 -->
<!ENTITY acirc "&#226;" ><!-- latin small a with circumflex, U+00E2 ISOlat1 -->
<!ENTITY atilde "&#227;" ><!-- latin small a with tilde, U+00E3 ISOlat1 -->
<!ENTITY auml "&#228;" ><!-- latin small a with diaeresis, U+00E4 ISOlat1 -->
<!ENTITY aring "&#229;" ><!-- latin small a with ring above = latin small a ring, U+00E5 ISOlat1 -->
<!ENTITY aelig "&#230;" ><!-- latin small ae = latin small ligature ae, U+00E6 ISOlat1 -->
<!ENTITY ccedil "&#231;" ><!-- latin small c with cedilla, U+00E7 ISOlat1 -->
<!ENTITY egrave "&#232;" ><!-- latin small e with grave, U+00E8 ISOlat1 -->
<!ENTITY eacute "&#233;" ><!-- latin small e with acute, U+00E9 ISOlat1 -->
<!ENTITY ecirc "&#234;" ><!-- latin small e with circumflex, U+00EA ISOlat1 -->
<!ENTITY euml "&#235;" ><!-- latin small e with diaeresis, U+00EB ISOlat1 -->
<!ENTITY igrave "&#236;" ><!-- latin small i with grave, U+00EC ISOlat1 -->
<!ENTITY iacute "&#237;" ><!-- latin small i with acute, U+00ED ISOlat1 -->
<!ENTITY icirc "&#238;" ><!-- latin small i with circumflex, U+00EE ISOlat1 -->
<!ENTITY iuml "&#239;" ><!-- latin small i with diaeresis, U+00EF ISOlat1 -->
<!ENTITY eth "&#240;" ><!-- latin small eth, U+00F0 ISOlat1 -->
<!ENTITY ntilde "&#241;" ><!-- latin small n with tilde, U+00F1 ISOlat1 -->
<!ENTITY ograve "&#242;" ><!-- latin small o with grave, U+00F2 ISOlat1 -->
<!ENTITY oacute "&#243;" ><!-- latin small o with acute, U+00F3 ISOlat1 -->
<!ENTITY ocirc "&#244;" ><!-- latin small o with circumflex, U+00F4 ISOlat1 -->
<!ENTITY otilde "&#245;" ><!-- latin small o with tilde, U+00F5 ISOlat1 -->
<!ENTITY ouml "&#246;" ><!-- latin small o with diaeresis, U+00F6 ISOlat1 -->
<!ENTITY divide "&#247;" ><!-- division sign, U+00F7 ISOnum -->
<!ENTITY oslash "&#248;" ><!-- latin small o with stroke, = latin small o slash, U+00F8 ISOlat1 -->
<!ENTITY ugrave "&#249;" ><!-- latin small u with grave, U+00F9 ISOlat1 -->
<!ENTITY uacute "&#250;" ><!-- latin small u with acute, U+00FA ISOlat1 -->
<!ENTITY ucirc "&#251;" ><!-- latin small u with circumflex, U+00FB ISOlat1 -->
<!ENTITY uuml "&#252;" ><!-- latin small u with diaeresis, U+00FC ISOlat1 -->
<!ENTITY yacute "&#253;" ><!-- latin small y with acute, U+00FD ISOlat1 -->
<!ENTITY thorn "&#254;" ><!-- latin small thorn with, U+00FE ISOlat1 -->
<!ENTITY yuml "&#255;" ><!-- latin small y with diaeresis, U+00FF ISOlat1 -->
<!-- end of xhtml-lat1.ent -->

F.1.2. XHTML Special Characters

You can download this version of this file from http://www.w3.org/TR/2010/REC-xhtml-modularization/DTD/xhtml-special.ent. The latest version is available at http://www.w3.org/MarkUp/DTD/xhtml-special.ent.

<!-- ...................................................................... -->
<!-- XML-compatible ISO Special Character Entity Set for XHTML ............ -->
<!-- file: xhtml-special.ent

Typical invocation:

<!ENTITY % xhtml-special
PUBLIC "-//W3C//ENTITIES Special for XHTML//EN"
"xhtml-special.ent" >
%xhtml-special;

This DTD module is identified by the PUBLIC and SYSTEM identifiers:

PUBLIC "-//W3C//ENTITIES Special for XHTML//EN"
SYSTEM "http://www.w3.org/MarkUp/DTD/xhtml-special.ent"

Revision: Id: xhtml-special.ent,v 4.1 2001/04/10 09:34:14 altheim Exp $ SMI

Portions (C) International Organization for Standardization 1986:
Permission to copy in any form is granted for use with conforming
SGML systems and applications as defined in ISO 8879, provided
this notice is included in all copies.

Revisions:
2000-10-28: added &apos; and altered XML Predefined Entities for compatibility
-->

<!-- Relevant ISO entity set is given unless names are newly introduced.
New names (i.e., not in ISO 8879 [SGML] list) do not clash with
any existing ISO 8879 entity names. ISO 10646 [ISO10646] character
numbers are given for each character, in hex. Entity values are
decimal conversions of the ISO 10646 values and refer to the
document character set. Names are Unicode [UNICODE] names.
-->

<!-- C0 Controls and Basic Latin -->
<!ENTITY lt "&#38;#60;" ><!-- less-than sign, U+003C ISOnum -->
<!ENTITY gt "&#62;" ><!-- greater-than sign, U+003E ISOnum -->
<!ENTITY amp "&#38;#38;" ><!-- ampersand, U+0026 ISOnum -->
<!ENTITY apos "&#39;" ><!-- The Apostrophe (Apostrophe Quote, APL Quote), U+0027 ISOnum -->
<!ENTITY quot "&#34;" ><!-- quotation mark (Quote Double), U+0022 ISOnum -->

<!-- Latin Extended-A -->
<!ENTITY OElig "&#338;" ><!-- latin capital ligature OE, U+0152 ISOlat2 -->
<!ENTITY oelig "&#339;" ><!-- latin small ligature oe, U+0153 ISOlat2 -->

<!-- ligature is a misnomer, this is a separate character in some languages -->
<!ENTITY Scaron "&#352;" ><!-- latin capital letter S with caron, U+0160 ISOlat2 -->
<!ENTITY scaron "&#353;" ><!-- latin small letter s with caron, U+0161 ISOlat2 -->
<!ENTITY Yuml "&#376;" ><!-- latin capital letter Y with diaeresis, U+0178 ISOlat2 -->

<!-- Spacing Modifier Letters -->
<!ENTITY circ "&#710;" ><!-- modifier letter circumflex accent, U+02C6 ISOpub -->
<!ENTITY tilde "&#732;" ><!-- small tilde, U+02DC ISOdia -->

<!-- General Punctuation -->
<!ENTITY ensp "&#8194;" ><!-- en space, U+2002 ISOpub -->
<!ENTITY emsp "&#8195;" ><!-- em space, U+2003 ISOpub -->
<!ENTITY thinsp "&#8201;" ><!-- thin space, U+2009 ISOpub -->
<!ENTITY zwnj "&#8204;" ><!-- zero width non-joiner, U+200C NEW RFC 2070 -->
<!ENTITY zwj "&#8205;" ><!-- zero width joiner, U+200D NEW RFC 2070 -->
<!ENTITY lrm "&#8206;" ><!-- left-to-right mark, U+200E NEW RFC 2070 -->
<!ENTITY rlm "&#8207;" ><!-- right-to-left mark, U+200F NEW RFC 2070 -->
<!ENTITY ndash "&#8211;" ><!-- en dash, U+2013 ISOpub -->
<!ENTITY mdash "&#8212;" ><!-- em dash, U+2014 ISOpub -->
<!ENTITY lsquo "&#8216;" ><!-- left single quotation mark, U+2018 ISOnum -->
<!ENTITY rsquo "&#8217;" ><!-- right single quotation mark, U+2019 ISOnum -->
<!ENTITY sbquo "&#8218;" ><!-- single low-9 quotation mark, U+201A NEW -->
<!ENTITY ldquo "&#8220;" ><!-- left double quotation mark, U+201C ISOnum -->
<!ENTITY rdquo "&#8221;" ><!-- right double quotation mark, U+201D ISOnum -->
<!ENTITY bdquo "&#8222;" ><!-- double low-9 quotation mark, U+201E NEW -->
<!ENTITY dagger "&#8224;" ><!-- dagger, U+2020 ISOpub -->
<!ENTITY Dagger "&#8225;" ><!-- double dagger, U+2021 ISOpub -->
<!ENTITY permil "&#8240;" ><!-- per mille sign, U+2030 ISOtech -->

<!-- lsaquo is proposed but not yet ISO standardized -->
<!ENTITY lsaquo "&#8249;" ><!-- single left-pointing angle quotation mark, U+2039 ISO proposed -->
<!-- rsaquo is proposed but not yet ISO standardized -->
<!ENTITY rsaquo "&#8250;" ><!-- single right-pointing angle quotation mark, U+203A ISO proposed -->
<!ENTITY euro "&#8364;" ><!-- euro sign, U+20AC NEW -->

<!-- end of xhtml-special.ent -->

F.1.3. XHTML Mathematical, Greek, and Symbolic Characters

You can download this version of this file from http://www.w3.org/TR/2010/REC-xhtml-modularization/DTD/xhtml-symbol.ent. The latest version is available at http://www.w3.org/MarkUp/DTD/xhtml-symbol.ent.

<!-- ...................................................................... -->
<!-- ISO Math, Greek and Symbolic Character Entity Set for XHTML .......... -->
<!-- file: xhtml-symbol.ent

Typical invocation:

<!ENTITY % xhtml-symbol
PUBLIC "-//W3C//ENTITIES Symbols for XHTML//EN"
"xhtml-symbol.ent" >
%xhtml-symbol;

This DTD module is identified by the PUBLIC and SYSTEM identifiers:

PUBLIC "-//W3C//ENTITIES Symbols for XHTML//EN"
SYSTEM "http://www.w3.org/MarkUp/DTD/xhtml-symbol.ent"

Revision: Id: xhtml-symbol.ent,v 4.1 2001/04/10 09:34:14 altheim Exp $ SMI

Portions (C) International Organization for Standardization 1986:
Permission to copy in any form is granted for use with conforming
SGML systems and applications as defined in ISO 8879, provided
this notice is included in all copies.
-->

<!-- Relevant ISO entity set is given unless names are newly introduced.
New names (i.e., not in ISO 8879 [SGML] list) do not clash with
any existing ISO 8879 entity names. ISO 10646 [ISO10646] character
numbers are given for each character, in hex. Entity values are
decimal conversions of the ISO 10646 values and refer to the
document character set. Names are Unicode [UNICODE] names.
-->

<!-- Latin Extended-B -->
<!ENTITY fnof "&#402;" ><!-- latin small f with hook = function
= florin, U+0192 ISOtech -->

<!-- Greek -->
<!ENTITY Alpha "&#913;" ><!-- greek capital letter alpha, U+0391 -->
<!ENTITY Beta "&#914;" ><!-- greek capital letter beta, U+0392 -->
<!ENTITY Gamma "&#915;" ><!-- greek capital letter gamma, U+0393 ISOgrk3 -->
<!ENTITY Delta "&#916;" ><!-- greek capital letter delta, U+0394 ISOgrk3 -->
<!ENTITY Epsilon "&#917;" ><!-- greek capital letter epsilon, U+0395 -->
<!ENTITY Zeta "&#918;" ><!-- greek capital letter zeta, U+0396 -->
<!ENTITY Eta "&#919;" ><!-- greek capital letter eta, U+0397 -->
<!ENTITY Theta "&#920;" ><!-- greek capital letter theta, U+0398 ISOgrk3 -->
<!ENTITY Iota "&#921;" ><!-- greek capital letter iota, U+0399 -->
<!ENTITY Kappa "&#922;" ><!-- greek capital letter kappa, U+039A -->
<!ENTITY Lambda "&#923;" ><!-- greek capital letter lambda, U+039B ISOgrk3 -->
<!ENTITY Mu "&#924;" ><!-- greek capital letter mu, U+039C -->
<!ENTITY Nu "&#925;" ><!-- greek capital letter nu, U+039D -->
<!ENTITY Xi "&#926;" ><!-- greek capital letter xi, U+039E ISOgrk3 -->
<!ENTITY Omicron "&#927;" ><!-- greek capital letter omicron, U+039F -->
<!ENTITY Pi "&#928;" ><!-- greek capital letter pi, U+03A0 ISOgrk3 -->
<!ENTITY Rho "&#929;" ><!-- greek capital letter rho, U+03A1 -->
<!-- there is no Sigmaf, and no U+03A2 character either -->
<!ENTITY Sigma "&#931;" ><!-- greek capital letter sigma, U+03A3 ISOgrk3 -->
<!ENTITY Tau "&#932;" ><!-- greek capital letter tau, U+03A4 -->
<!ENTITY Upsilon "&#933;" ><!-- greek capital letter upsilon,
U+03A5 ISOgrk3 -->
<!ENTITY Phi "&#934;" ><!-- greek capital letter phi, U+03A6 ISOgrk3 -->
<!ENTITY Chi "&#935;" ><!-- greek capital letter chi, U+03A7 -->
<!ENTITY Psi "&#936;" ><!-- greek capital letter psi, U+03A8 ISOgrk3 -->
<!ENTITY Omega "&#937;" ><!-- greek capital letter omega, U+03A9 ISOgrk3 -->
<!ENTITY alpha "&#945;" ><!-- greek small letter alpha, U+03B1 ISOgrk3 -->
<!ENTITY beta "&#946;" ><!-- greek small letter beta, U+03B2 ISOgrk3 -->
<!ENTITY gamma "&#947;" ><!-- greek small letter gamma, U+03B3 ISOgrk3 -->
<!ENTITY delta "&#948;" ><!-- greek small letter delta, U+03B4 ISOgrk3 -->
<!ENTITY epsilon "&#949;" ><!-- greek small letter epsilon, U+03B5 ISOgrk3 -->
<!ENTITY zeta "&#950;" ><!-- greek small letter zeta, U+03B6 ISOgrk3 -->
<!ENTITY eta "&#951;" ><!-- greek small letter eta, U+03B7 ISOgrk3 -->
<!ENTITY theta "&#952;" ><!-- greek small letter theta, U+03B8 ISOgrk3 -->
<!ENTITY iota "&#953;" ><!-- greek small letter iota, U+03B9 ISOgrk3 -->
<!ENTITY kappa "&#954;" ><!-- greek small letter kappa, U+03BA ISOgrk3 -->
<!ENTITY lambda "&#955;" ><!-- greek small letter lambda, U+03BB ISOgrk3 -->
<!ENTITY mu "&#956;" ><!-- greek small letter mu, U+03BC ISOgrk3 -->
<!ENTITY nu "&#957;" ><!-- greek small letter nu, U+03BD ISOgrk3 -->
<!ENTITY xi "&#958;" ><!-- greek small letter xi, U+03BE ISOgrk3 -->
<!ENTITY omicron "&#959;" ><!-- greek small letter omicron, U+03BF NEW -->
<!ENTITY pi "&#960;" ><!-- greek small letter pi, U+03C0 ISOgrk3 -->
<!ENTITY rho "&#961;" ><!-- greek small letter rho, U+03C1 ISOgrk3 -->
<!ENTITY sigmaf "&#962;" ><!-- greek small letter final sigma, U+03C2 ISOgrk3 -->
<!ENTITY sigma "&#963;" ><!-- greek small letter sigma, U+03C3 ISOgrk3 -->
<!ENTITY tau "&#964;" ><!-- greek small letter tau, U+03C4 ISOgrk3 -->
<!ENTITY upsilon "&#965;" ><!-- greek small letter upsilon, U+03C5 ISOgrk3 -->
<!ENTITY phi "&#966;" ><!-- greek small letter phi, U+03C6 ISOgrk3 -->
<!ENTITY chi "&#967;" ><!-- greek small letter chi, U+03C7 ISOgrk3 -->
<!ENTITY psi "&#968;" ><!-- greek small letter psi, U+03C8 ISOgrk3 -->
<!ENTITY omega "&#969;" ><!-- greek small letter omega, U+03C9 ISOgrk3 -->
<!ENTITY thetasym "&#977;" ><!-- greek small letter theta symbol, U+03D1 NEW -->
<!ENTITY upsih "&#978;" ><!-- greek upsilon with hook symbol, U+03D2 NEW -->
<!ENTITY piv "&#982;" ><!-- greek pi symbol, U+03D6 ISOgrk3 -->

<!-- General Punctuation -->
<!ENTITY bull "&#8226;" ><!-- bullet = black small circle, U+2022 ISOpub -->
<!-- bullet is NOT the same as bullet operator, U+2219 -->
<!ENTITY hellip "&#8230;" ><!-- horizontal ellipsis = three dot leader, U+2026 ISOpub -->
<!ENTITY prime "&#8242;" ><!-- prime = minutes = feet, U+2032 ISOtech -->
<!ENTITY Prime "&#8243;" ><!-- double prime = seconds = inches, U+2033 ISOtech -->
<!ENTITY oline "&#8254;" ><!-- overline = spacing overscore, U+203E NEW -->
<!ENTITY frasl "&#8260;" ><!-- fraction slash, U+2044 NEW -->

<!-- Letterlike Symbols -->
<!ENTITY weierp "&#8472;" ><!-- script capital P = power set = Weierstrass p, U+2118 ISOamso -->
<!ENTITY image "&#8465;" ><!-- blackletter capital I = imaginary part, U+2111 ISOamso -->
<!ENTITY real "&#8476;" ><!-- blackletter capital R = real part symbol, U+211C ISOamso -->
<!ENTITY trade "&#8482;" ><!-- trade mark sign, U+2122 ISOnum -->
<!ENTITY alefsym "&#8501;" ><!-- alef symbol = first transfinite cardinal, U+2135 NEW -->
<!-- alef symbol is NOT the same as hebrew letter alef, U+05D0 although
the same glyph could be used to depict both characters -->

<!-- Arrows -->
<!ENTITY larr "&#8592;" ><!-- leftwards arrow, U+2190 ISOnum -->
<!ENTITY uarr "&#8593;" ><!-- upwards arrow, U+2191 ISOnum-->
<!ENTITY rarr "&#8594;" ><!-- rightwards arrow, U+2192 ISOnum -->
<!ENTITY darr "&#8595;" ><!-- downwards arrow, U+2193 ISOnum -->
<!ENTITY harr "&#8596;" ><!-- left right arrow, U+2194 ISOamsa -->
<!ENTITY crarr "&#8629;" ><!-- downwards arrow with corner leftwards
= carriage return, U+21B5 NEW -->
<!ENTITY lArr "&#8656;" ><!-- leftwards double arrow, U+21D0 ISOtech -->
<!-- Unicode does not say that lArr is the same as the 'is implied by' arrow
but also does not have any other character for that function. So ? lArr can
be used for 'is implied by' as ISOtech suggests -->
<!ENTITY uArr "&#8657;" ><!-- upwards double arrow, U+21D1 ISOamsa -->
<!ENTITY rArr "&#8658;" ><!-- rightwards double arrow, U+21D2 ISOtech -->
<!-- Unicode does not say this is the 'implies' character but does not have
another character with this function so ?
rArr can be used for 'implies' as ISOtech suggests -->
<!ENTITY dArr "&#8659;" ><!-- downwards double arrow, U+21D3 ISOamsa -->
<!ENTITY hArr "&#8660;" ><!-- left right double arrow, U+21D4 ISOamsa -->

<!-- Mathematical Operators -->
<!ENTITY forall "&#8704;" ><!-- for all, U+2200 ISOtech -->
<!ENTITY part "&#8706;" ><!-- partial differential, U+2202 ISOtech -->
<!ENTITY exist "&#8707;" ><!-- there exists, U+2203 ISOtech -->
<!ENTITY empty "&#8709;" ><!-- empty set = null set, U+2205 ISOamso -->
<!ENTITY nabla "&#8711;" ><!-- nabla = backward difference, U+2207 ISOtech -->
<!ENTITY isin "&#8712;" ><!-- element of, U+2208 ISOtech -->
<!ENTITY notin "&#8713;" ><!-- not an element of, U+2209 ISOtech -->
<!ENTITY ni "&#8715;" ><!-- contains as member, U+220B ISOtech -->
<!-- should there be a more memorable name than 'ni'? -->
<!ENTITY prod "&#8719;" ><!-- n-ary product = product sign, U+220F ISOamsb -->
<!-- prod is NOT the same character as U+03A0 'greek capital letter pi' though
the same glyph might be used for both -->
<!ENTITY sum "&#8721;" ><!-- n-ary sumation, U+2211 ISOamsb -->
<!-- sum is NOT the same character as U+03A3 'greek capital letter sigma'
though the same glyph might be used for both -->
<!ENTITY minus "&#8722;" ><!-- minus sign, U+2212 ISOtech -->
<!ENTITY lowast "&#8727;" ><!-- asterisk operator, U+2217 ISOtech -->
<!ENTITY radic "&#8730;" ><!-- square root = radical sign, U+221A ISOtech -->
<!ENTITY prop "&#8733;" ><!-- proportional to, U+221D ISOtech -->
<!ENTITY infin "&#8734;" ><!-- infinity, U+221E ISOtech -->
<!ENTITY ang "&#8736;" ><!-- angle, U+2220 ISOamso -->
<!ENTITY and "&#8743;" ><!-- logical and = wedge, U+2227 ISOtech -->
<!ENTITY or "&#8744;" ><!-- logical or = vee, U+2228 ISOtech -->
<!ENTITY cap "&#8745;" ><!-- intersection = cap, U+2229 ISOtech -->
<!ENTITY cup "&#8746;" ><!-- union = cup, U+222A ISOtech -->
<!ENTITY int "&#8747;" ><!-- integral, U+222B ISOtech -->
<!ENTITY there4 "&#8756;" ><!-- therefore, U+2234 ISOtech -->
<!ENTITY sim "&#8764;" ><!-- tilde operator = varies with = similar to, U+223C ISOtech -->
<!-- tilde operator is NOT the same character as the tilde, U+007E,
although the same glyph might be used to represent both -->
<!ENTITY cong "&#8773;" ><!-- approximately equal to, U+2245 ISOtech -->
<!ENTITY asymp "&#8776;" ><!-- almost equal to = asymptotic to, U+2248 ISOamsr -->
<!ENTITY ne "&#8800;" ><!-- not equal to, U+2260 ISOtech -->
<!ENTITY equiv "&#8801;" ><!-- identical to, U+2261 ISOtech -->
<!ENTITY le "&#8804;" ><!-- less-than or equal to, U+2264 ISOtech -->
<!ENTITY ge "&#8805;" ><!-- greater-than or equal to, U+2265 ISOtech -->
<!ENTITY sub "&#8834;" ><!-- subset of, U+2282 ISOtech -->
<!ENTITY sup "&#8835;" ><!-- superset of, U+2283 ISOtech -->
<!-- note that nsup, 'not a superset of, U+2283' is not covered by the Symbol
font encoding and is not included. Should it be, for symmetry?
It is in ISOamsn -->
<!ENTITY nsub "&#8836;" ><!-- not a subset of, U+2284 ISOamsn -->
<!ENTITY sube "&#8838;" ><!-- subset of or equal to, U+2286 ISOtech -->
<!ENTITY supe "&#8839;" ><!-- superset of or equal to, U+2287 ISOtech -->
<!ENTITY oplus "&#8853;" ><!-- circled plus = direct sum, U+2295 ISOamsb -->
<!ENTITY otimes "&#8855;" ><!-- circled times = vector product, U+2297 ISOamsb -->
<!ENTITY perp "&#8869;" ><!-- up tack = orthogonal to = perpendicular, U+22A5 ISOtech -->
<!ENTITY sdot "&#8901;" ><!-- dot operator, U+22C5 ISOamsb -->
<!-- dot operator is NOT the same character as U+00B7 middle dot -->

<!-- Miscellaneous Technical -->
<!ENTITY lceil "&#8968;" ><!-- left ceiling = apl upstile, U+2308 ISOamsc -->
<!ENTITY rceil "&#8969;" ><!-- right ceiling, U+2309 ISOamsc -->
<!ENTITY lfloor "&#8970;" ><!-- left floor = apl downstile, U+230A ISOamsc -->
<!ENTITY rfloor "&#8971;" ><!-- right floor, U+230B ISOamsc -->
<!ENTITY lang "&#9001;" ><!-- left-pointing angle bracket = bra, U+2329 ISOtech -->
<!-- lang is NOT the same character as U+003C 'less than'
or U+2039 'single left-pointing angle quotation mark' -->
<!ENTITY rang "&#9002;" ><!-- right-pointing angle bracket = ket, U+232A ISOtech -->
<!-- rang is NOT the same character as U+003E 'greater than'
or U+203A 'single right-pointing angle quotation mark' -->

<!-- Geometric Shapes -->
<!ENTITY loz "&#9674;" ><!-- lozenge, U+25CA ISOpub -->

<!-- Miscellaneous Symbols -->
<!ENTITY spades "&#9824;" ><!-- black spade suit, U+2660 ISOpub -->
<!-- black here seems to mean filled as opposed to hollow -->
<!ENTITY clubs "&#9827;" ><!-- black club suit = shamrock, U+2663 ISOpub -->
<!ENTITY hearts "&#9829;" ><!-- black heart suit = valentine, U+2665 ISOpub -->
<!ENTITY diams "&#9830;" ><!-- black diamond suit, U+2666 ISOpub -->

<!-- end of xhtml-symbol.ent -->
"""
| |
def get_apache_license():
    """Return the Apache License 2.0 notice formatted as a block comment.

    The text opens with ``/*``, closes with ``*/`` and ends with an empty
    line, so output printed after it is separated from the header.
    """
    # Returned directly: the former local name ``license`` shadowed the
    # builtin of the same name.
    return r"""/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

"""
| |
# Generate the output only when executed as a script, so that importing this
# module (e.g. to reuse get_entity_text()) has no side effects.
if __name__ == '__main__':
    main()