lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/htmlentity.py - lucene-solr - Git at Google

 # Licensed to the Apache Software Foundation (ASF) under one or more
 # contributor license agreements.  See the NOTICE file distributed with
 # this work for additional information regarding copyright ownership.
 # The ASF licenses this file to You under the Apache License, Version 2.0
 # (the "License"); you may not use this file except in compliance with
 # the License.  You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.

 import re

 # A simple python script to generate an HTML entity map and a regex alternation
 # for inclusion in HTMLStripCharFilter.jflex.

 def main():
   print(get_apache_license())
   codes = {}
   regex = re.compile(r'\s*<!ENTITY\s+(\S+)\s+"&(?:#38;)?#(\d+);"')
   for line in get_entity_text().split('\n'):
     match = regex.match(line)
     if match:
       key = match.group(1)
       if   key == 'quot': codes[key] = r'\"'
       elif key == 'nbsp': codes[key] = ' ';
       else              : codes[key] = r'\u%04X' % int(match.group(2))

   keys = sorted(codes)

   first_entry = True
   output_line = 'CharacterEntities = ( '
   for key in keys:
     new_entry = ('"%s"' if first_entry else ' | "%s"') % key
     first_entry = False
     if len(output_line) + len(new_entry) >= 80:
       print(output_line)
       output_line = '                   '
     output_line += new_entry
     if key in ('quot','copy','gt','lt','reg','amp'):
       new_entry = ' | "%s"' % key.upper()
       if len(output_line) + len(new_entry) >= 80:
         print(output_line)
         output_line = '                   '
       output_line += new_entry
   print(output_line, ')')

   print('%{')
   print('  private static final Map<String,String> upperCaseVariantsAccepted')
   print('      = new HashMap<>();')
   print('  static {')
   print('    upperCaseVariantsAccepted.put("quot", "QUOT");')
   print('    upperCaseVariantsAccepted.put("copy", "COPY");')
   print('    upperCaseVariantsAccepted.put("gt", "GT");')
   print('    upperCaseVariantsAccepted.put("lt", "LT");')
   print('    upperCaseVariantsAccepted.put("reg", "REG");')
   print('    upperCaseVariantsAccepted.put("amp", "AMP");')
   print('  }')
   print('  private static final CharArrayMap<Character> entityValues')
   print('      = new CharArrayMap<>(%i, false);' % len(keys))
   print('  static {')
   print('    String[] entities = {')
   output_line = '     '
   for key in keys:
     new_entry = ' "%s", "%s",' % (key, codes[key])
     if len(output_line) + len(new_entry) >= 80:
       print(output_line)
       output_line = '     '
     output_line += new_entry
   print(output_line[:-1])
   print('    };')
   print('    for (int i = 0 ; i < entities.length ; i += 2) {')
   print('      Character value = entities[i + 1].charAt(0);')
   print('      entityValues.put(entities[i], value);')
   print('      String upperCaseVariant = upperCaseVariantsAccepted.get(entities[i]);')
   print('      if (upperCaseVariant != null) {')
   print('        entityValues.put(upperCaseVariant, value);')
   print('      }')
   print('    }')
   print("  }")
   print("%}")

 def get_entity_text():
 # The text below is taken verbatim from
 # <http://www.w3.org/TR/REC-html40/sgml/entities.html>:
   text = r"""
 F.1. XHTML Character Entities

 XHTML DTDs make available a standard collection of named character entities. Those entities are defined in this section.
 F.1.1. XHTML Latin 1 Character Entities

 You can download this version of this file from http://www.w3.org/TR/2010/REC-xhtml-modularization/DTD/xhtml-lat1.ent. The latest version is available at http://www.w3.org/MarkUp/DTD/xhtml-lat1.ent.

 <!-- ...................................................................... -->
 <!-- XML-compatible ISO Latin 1 Character Entity Set for XHTML ............ -->
 <!-- file: xhtml-lat1.ent

      Typical invocation:

        <!ENTITY % xhtml-lat1
            PUBLIC "-//W3C//ENTITIES Latin 1 for XHTML//EN"
                   "xhtml-lat1.ent" >
        %xhtml-lat1;

      This DTD module is identified by the PUBLIC and SYSTEM identifiers:

        PUBLIC "-//W3C//ENTITIES Latin 1 for XHTML//EN"
        SYSTEM "http://www.w3.org/MarkUp/DTD/xhtml-lat1.ent"

      Revision:  Id: xhtml-lat1.ent,v 4.1 2001/04/10 09:34:14 altheim Exp $ SMI

      Portions (C) International Organization for Standardization 1986:
      Permission to copy in any form is granted for use with conforming
      SGML systems and applications as defined in ISO 8879, provided
      this notice is included in all copies.
 -->

 <!ENTITY nbsp   "&#160;" ><!-- no-break space = non-breaking space, U+00A0 ISOnum -->
 <!ENTITY iexcl  "&#161;" ><!-- inverted exclamation mark, U+00A1 ISOnum -->
 <!ENTITY cent   "&#162;" ><!-- cent sign, U+00A2 ISOnum -->
 <!ENTITY pound  "&#163;" ><!-- pound sign, U+00A3 ISOnum -->
 <!ENTITY curren "&#164;" ><!-- currency sign, U+00A4 ISOnum -->
 <!ENTITY yen    "&#165;" ><!-- yen sign = yuan sign, U+00A5 ISOnum -->
 <!ENTITY brvbar "&#166;" ><!-- broken bar = broken vertical bar, U+00A6 ISOnum -->
 <!ENTITY sect   "&#167;" ><!-- section sign, U+00A7 ISOnum -->
 <!ENTITY uml    "&#168;" ><!-- diaeresis = spacing diaeresis, U+00A8 ISOdia -->
 <!ENTITY copy   "&#169;" ><!-- copyright sign, U+00A9 ISOnum -->
 <!ENTITY ordf   "&#170;" ><!-- feminine ordinal indicator, U+00AA ISOnum -->
 <!ENTITY laquo  "&#171;" ><!-- left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum -->
 <!ENTITY not    "&#172;" ><!-- not sign, U+00AC ISOnum -->
 <!ENTITY shy    "&#173;" ><!-- soft hyphen = discretionary hyphen, U+00AD ISOnum -->
 <!ENTITY reg    "&#174;" ><!-- registered sign = registered trade mark sign, U+00AE ISOnum -->
 <!ENTITY macr   "&#175;" ><!-- macron = spacing macron = overline = APL overbar, U+00AF ISOdia -->
 <!ENTITY deg    "&#176;" ><!-- degree sign, U+00B0 ISOnum -->
 <!ENTITY plusmn "&#177;" ><!-- plus-minus sign = plus-or-minus sign, U+00B1 ISOnum -->
 <!ENTITY sup2   "&#178;" ><!-- superscript two = superscript digit two = squared, U+00B2 ISOnum -->
 <!ENTITY sup3   "&#179;" ><!-- superscript three = superscript digit three = cubed, U+00B3 ISOnum -->
 <!ENTITY acute  "&#180;" ><!-- acute accent = spacing acute, U+00B4 ISOdia -->
 <!ENTITY micro  "&#181;" ><!-- micro sign, U+00B5 ISOnum -->
 <!ENTITY para   "&#182;" ><!-- pilcrow sign = paragraph sign, U+00B6 ISOnum -->
 <!ENTITY middot "&#183;" ><!-- middle dot = Georgian comma = Greek middle dot, U+00B7 ISOnum -->
 <!ENTITY cedil  "&#184;" ><!-- cedilla = spacing cedilla, U+00B8 ISOdia -->
 <!ENTITY sup1   "&#185;" ><!-- superscript one = superscript digit one, U+00B9 ISOnum -->
 <!ENTITY ordm   "&#186;" ><!-- masculine ordinal indicator, U+00BA ISOnum -->
 <!ENTITY raquo  "&#187;" ><!-- right-pointing double angle quotation mark = right pointing guillemet, U+00BB ISOnum -->
 <!ENTITY frac14 "&#188;" ><!-- vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum -->
 <!ENTITY frac12 "&#189;" ><!-- vulgar fraction one half = fraction one half, U+00BD ISOnum -->
 <!ENTITY frac34 "&#190;" ><!-- vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum -->
 <!ENTITY iquest "&#191;" ><!-- inverted question mark = turned question mark, U+00BF ISOnum -->
 <!ENTITY Agrave "&#192;" ><!-- latin capital A with grave = latin capital A grave, U+00C0 ISOlat1 -->
 <!ENTITY Aacute "&#193;" ><!-- latin capital A with acute, U+00C1 ISOlat1 -->
 <!ENTITY Acirc  "&#194;" ><!-- latin capital A with circumflex, U+00C2 ISOlat1 -->
 <!ENTITY Atilde "&#195;" ><!-- latin capital A with tilde, U+00C3 ISOlat1 -->
 <!ENTITY Auml   "&#196;" ><!-- latin capital A with diaeresis, U+00C4 ISOlat1 -->
 <!ENTITY Aring  "&#197;" ><!-- latin capital A with ring above = latin capital A ring, U+00C5 ISOlat1 -->
 <!ENTITY AElig  "&#198;" ><!-- latin capital AE = latin capital ligature AE, U+00C6 ISOlat1 -->
 <!ENTITY Ccedil "&#199;" ><!-- latin capital C with cedilla, U+00C7 ISOlat1 -->
 <!ENTITY Egrave "&#200;" ><!-- latin capital E with grave, U+00C8 ISOlat1 -->
 <!ENTITY Eacute "&#201;" ><!-- latin capital E with acute, U+00C9 ISOlat1 -->
 <!ENTITY Ecirc  "&#202;" ><!-- latin capital E with circumflex, U+00CA ISOlat1 -->
 <!ENTITY Euml   "&#203;" ><!-- latin capital E with diaeresis, U+00CB ISOlat1 -->
 <!ENTITY Igrave "&#204;" ><!-- latin capital I with grave, U+00CC ISOlat1 -->
 <!ENTITY Iacute "&#205;" ><!-- latin capital I with acute, U+00CD ISOlat1 -->
 <!ENTITY Icirc  "&#206;" ><!-- latin capital I with circumflex, U+00CE ISOlat1 -->
 <!ENTITY Iuml   "&#207;" ><!-- latin capital I with diaeresis, U+00CF ISOlat1 -->
 <!ENTITY ETH    "&#208;" ><!-- latin capital ETH, U+00D0 ISOlat1 -->
 <!ENTITY Ntilde "&#209;" ><!-- latin capital N with tilde, U+00D1 ISOlat1 -->
 <!ENTITY Ograve "&#210;" ><!-- latin capital O with grave, U+00D2 ISOlat1 -->
 <!ENTITY Oacute "&#211;" ><!-- latin capital O with acute, U+00D3 ISOlat1 -->
 <!ENTITY Ocirc  "&#212;" ><!-- latin capital O with circumflex, U+00D4 ISOlat1 -->
 <!ENTITY Otilde "&#213;" ><!-- latin capital O with tilde, U+00D5 ISOlat1 -->
 <!ENTITY Ouml   "&#214;" ><!-- latin capital O with diaeresis, U+00D6 ISOlat1 -->
 <!ENTITY times  "&#215;" ><!-- multiplication sign, U+00D7 ISOnum -->
 <!ENTITY Oslash "&#216;" ><!-- latin capital O with stroke = latin capital O slash, U+00D8 ISOlat1 -->
 <!ENTITY Ugrave "&#217;" ><!-- latin capital U with grave, U+00D9 ISOlat1 -->
 <!ENTITY Uacute "&#218;" ><!-- latin capital U with acute, U+00DA ISOlat1 -->
 <!ENTITY Ucirc  "&#219;" ><!-- latin capital U with circumflex, U+00DB ISOlat1 -->
 <!ENTITY Uuml   "&#220;" ><!-- latin capital U with diaeresis, U+00DC ISOlat1 -->
 <!ENTITY Yacute "&#221;" ><!-- latin capital Y with acute, U+00DD ISOlat1 -->
 <!ENTITY THORN  "&#222;" ><!-- latin capital THORN, U+00DE ISOlat1 -->
 <!ENTITY szlig  "&#223;" ><!-- latin small sharp s = ess-zed, U+00DF ISOlat1 -->
 <!ENTITY agrave "&#224;" ><!-- latin small a with grave = latin small a grave, U+00E0 ISOlat1 -->
 <!ENTITY aacute "&#225;" ><!-- latin small a with acute, U+00E1 ISOlat1 -->
 <!ENTITY acirc  "&#226;" ><!-- latin small a with circumflex, U+00E2 ISOlat1 -->
 <!ENTITY atilde "&#227;" ><!-- latin small a with tilde, U+00E3 ISOlat1 -->
 <!ENTITY auml   "&#228;" ><!-- latin small a with diaeresis, U+00E4 ISOlat1 -->
 <!ENTITY aring  "&#229;" ><!-- latin small a with ring above = latin small a ring, U+00E5 ISOlat1 -->
 <!ENTITY aelig  "&#230;" ><!-- latin small ae = latin small ligature ae, U+00E6 ISOlat1 -->
 <!ENTITY ccedil "&#231;" ><!-- latin small c with cedilla, U+00E7 ISOlat1 -->
 <!ENTITY egrave "&#232;" ><!-- latin small e with grave, U+00E8 ISOlat1 -->
 <!ENTITY eacute "&#233;" ><!-- latin small e with acute, U+00E9 ISOlat1 -->
 <!ENTITY ecirc  "&#234;" ><!-- latin small e with circumflex, U+00EA ISOlat1 -->
 <!ENTITY euml   "&#235;" ><!-- latin small e with diaeresis, U+00EB ISOlat1 -->
 <!ENTITY igrave "&#236;" ><!-- latin small i with grave, U+00EC ISOlat1 -->
 <!ENTITY iacute "&#237;" ><!-- latin small i with acute, U+00ED ISOlat1 -->
 <!ENTITY icirc  "&#238;" ><!-- latin small i with circumflex, U+00EE ISOlat1 -->
 <!ENTITY iuml   "&#239;" ><!-- latin small i with diaeresis, U+00EF ISOlat1 -->
 <!ENTITY eth    "&#240;" ><!-- latin small eth, U+00F0 ISOlat1 -->
 <!ENTITY ntilde "&#241;" ><!-- latin small n with tilde, U+00F1 ISOlat1 -->
 <!ENTITY ograve "&#242;" ><!-- latin small o with grave, U+00F2 ISOlat1 -->
 <!ENTITY oacute "&#243;" ><!-- latin small o with acute, U+00F3 ISOlat1 -->
 <!ENTITY ocirc  "&#244;" ><!-- latin small o with circumflex, U+00F4 ISOlat1 -->
 <!ENTITY otilde "&#245;" ><!-- latin small o with tilde, U+00F5 ISOlat1 -->
 <!ENTITY ouml   "&#246;" ><!-- latin small o with diaeresis, U+00F6 ISOlat1 -->
 <!ENTITY divide "&#247;" ><!-- division sign, U+00F7 ISOnum -->
 <!ENTITY oslash "&#248;" ><!-- latin small o with stroke, = latin small o slash, U+00F8 ISOlat1 -->
 <!ENTITY ugrave "&#249;" ><!-- latin small u with grave, U+00F9 ISOlat1 -->
 <!ENTITY uacute "&#250;" ><!-- latin small u with acute, U+00FA ISOlat1 -->
 <!ENTITY ucirc  "&#251;" ><!-- latin small u with circumflex, U+00FB ISOlat1 -->
 <!ENTITY uuml   "&#252;" ><!-- latin small u with diaeresis, U+00FC ISOlat1 -->
 <!ENTITY yacute "&#253;" ><!-- latin small y with acute, U+00FD ISOlat1 -->
 <!ENTITY thorn  "&#254;" ><!-- latin small thorn with, U+00FE ISOlat1 -->
 <!ENTITY yuml   "&#255;" ><!-- latin small y with diaeresis, U+00FF ISOlat1 -->
 <!-- end of xhtml-lat1.ent -->

 F.1.2. XHTML Special Characters

 You can download this version of this file from http://www.w3.org/TR/2010/REC-xhtml-modularization/DTD/xhtml-special.ent. The latest version is available at http://www.w3.org/MarkUp/DTD/xhtml-special.ent.

 <!-- ...................................................................... -->
 <!-- XML-compatible ISO Special Character Entity Set for XHTML ............ -->
 <!-- file: xhtml-special.ent

      Typical invocation:

        <!ENTITY % xhtml-special
            PUBLIC "-//W3C//ENTITIES Special for XHTML//EN"
                   "xhtml-special.ent" >
        %xhtml-special;

      This DTD module is identified by the PUBLIC and SYSTEM identifiers:

        PUBLIC "-//W3C//ENTITIES Special for XHTML//EN"
        SYSTEM "http://www.w3.org/MarkUp/DTD/xhtml-special.ent"

      Revision:  Id: xhtml-special.ent,v 4.1 2001/04/10 09:34:14 altheim Exp $ SMI

      Portions (C) International Organization for Standardization 1986:
      Permission to copy in any form is granted for use with conforming
      SGML systems and applications as defined in ISO 8879, provided
      this notice is included in all copies.

      Revisions:
 2000-10-28: added &apos; and altered XML Predefined Entities for compatibility
 -->

 <!-- Relevant ISO entity set is given unless names are newly introduced.
      New names (i.e., not in ISO 8879 [SGML] list) do not clash with
      any existing ISO 8879 entity names. ISO 10646 [ISO10646] character
      numbers are given for each character, in hex. Entity values are
      decimal conversions of the ISO 10646 values and refer to the
      document character set. Names are Unicode [UNICODE] names.
 -->

 <!-- C0 Controls and Basic Latin -->
 <!ENTITY lt      "&#38;#60;" ><!-- less-than sign, U+003C ISOnum -->
 <!ENTITY gt      "&#62;" ><!-- greater-than sign, U+003E ISOnum -->
 <!ENTITY amp     "&#38;#38;" ><!-- ampersand, U+0026 ISOnum -->
 <!ENTITY apos    "&#39;" ><!-- The Apostrophe (Apostrophe Quote, APL Quote), U+0027 ISOnum -->
 <!ENTITY quot    "&#34;" ><!-- quotation mark (Quote Double), U+0022 ISOnum -->

 <!-- Latin Extended-A -->
 <!ENTITY OElig   "&#338;" ><!-- latin capital ligature OE, U+0152 ISOlat2 -->
 <!ENTITY oelig   "&#339;" ><!-- latin small ligature oe, U+0153 ISOlat2 -->

 <!-- ligature is a misnomer, this is a separate character in some languages -->
 <!ENTITY Scaron  "&#352;" ><!-- latin capital letter S with caron, U+0160 ISOlat2 -->
 <!ENTITY scaron  "&#353;" ><!-- latin small letter s with caron, U+0161 ISOlat2 -->
 <!ENTITY Yuml    "&#376;" ><!-- latin capital letter Y with diaeresis, U+0178 ISOlat2 -->

 <!-- Spacing Modifier Letters -->
 <!ENTITY circ    "&#710;" ><!-- modifier letter circumflex accent, U+02C6 ISOpub -->
 <!ENTITY tilde   "&#732;" ><!-- small tilde, U+02DC ISOdia -->

 <!-- General Punctuation -->
 <!ENTITY ensp    "&#8194;" ><!-- en space, U+2002 ISOpub -->
 <!ENTITY emsp    "&#8195;" ><!-- em space, U+2003 ISOpub -->
 <!ENTITY thinsp  "&#8201;" ><!-- thin space, U+2009 ISOpub -->
 <!ENTITY zwnj    "&#8204;" ><!-- zero width non-joiner, U+200C NEW RFC 2070 -->
 <!ENTITY zwj     "&#8205;" ><!-- zero width joiner, U+200D NEW RFC 2070 -->
 <!ENTITY lrm     "&#8206;" ><!-- left-to-right mark, U+200E NEW RFC 2070 -->
 <!ENTITY rlm     "&#8207;" ><!-- right-to-left mark, U+200F NEW RFC 2070 -->
 <!ENTITY ndash   "&#8211;" ><!-- en dash, U+2013 ISOpub -->
 <!ENTITY mdash   "&#8212;" ><!-- em dash, U+2014 ISOpub -->
 <!ENTITY lsquo   "&#8216;" ><!-- left single quotation mark, U+2018 ISOnum -->
 <!ENTITY rsquo   "&#8217;" ><!-- right single quotation mark, U+2019 ISOnum -->
 <!ENTITY sbquo   "&#8218;" ><!-- single low-9 quotation mark, U+201A NEW -->
 <!ENTITY ldquo   "&#8220;" ><!-- left double quotation mark, U+201C ISOnum -->
 <!ENTITY rdquo   "&#8221;" ><!-- right double quotation mark, U+201D ISOnum -->
 <!ENTITY bdquo   "&#8222;" ><!-- double low-9 quotation mark, U+201E NEW -->
 <!ENTITY dagger  "&#8224;" ><!-- dagger, U+2020 ISOpub -->
 <!ENTITY Dagger  "&#8225;" ><!-- double dagger, U+2021 ISOpub -->
 <!ENTITY permil  "&#8240;" ><!-- per mille sign, U+2030 ISOtech -->

 <!-- lsaquo is proposed but not yet ISO standardized -->
 <!ENTITY lsaquo  "&#8249;" ><!-- single left-pointing angle quotation mark, U+2039 ISO proposed -->
 <!-- rsaquo is proposed but not yet ISO standardized -->
 <!ENTITY rsaquo  "&#8250;" ><!-- single right-pointing angle quotation mark, U+203A ISO proposed -->
 <!ENTITY euro    "&#8364;" ><!-- euro sign, U+20AC NEW -->

 <!-- end of xhtml-special.ent -->

 F.1.3. XHTML Mathematical, Greek, and Symbolic Characters

 You can download this version of this file from http://www.w3.org/TR/2010/REC-xhtml-modularization/DTD/xhtml-symbol.ent. The latest version is available at http://www.w3.org/MarkUp/DTD/xhtml-symbol.ent.

 <!-- ...................................................................... -->
 <!-- ISO Math, Greek and Symbolic Character Entity Set for XHTML .......... -->
 <!-- file: xhtml-symbol.ent

      Typical invocation:

        <!ENTITY % xhtml-symbol
            PUBLIC "-//W3C//ENTITIES Symbols for XHTML//EN"
                   "xhtml-symbol.ent" >
        %xhtml-symbol;

      This DTD module is identified by the PUBLIC and SYSTEM identifiers:

        PUBLIC "-//W3C//ENTITIES Symbols for XHTML//EN"
        SYSTEM "http://www.w3.org/MarkUp/DTD/xhtml-symbol.ent"

      Revision:  Id: xhtml-symbol.ent,v 4.1 2001/04/10 09:34:14 altheim Exp $ SMI

      Portions (C) International Organization for Standardization 1986:
      Permission to copy in any form is granted for use with conforming
      SGML systems and applications as defined in ISO 8879, provided
      this notice is included in all copies.
 -->

 <!-- Relevant ISO entity set is given unless names are newly introduced.
      New names (i.e., not in ISO 8879 [SGML] list) do not clash with
      any existing ISO 8879 entity names. ISO 10646 [ISO10646] character
      numbers are given for each character, in hex. Entity values are
      decimal conversions of the ISO 10646 values and refer to the
      document character set. Names are Unicode [UNICODE] names.
 -->

 <!-- Latin Extended-B -->
 <!ENTITY fnof     "&#402;" ><!-- latin small f with hook = function
                               = florin, U+0192 ISOtech -->

 <!-- Greek -->
 <!ENTITY Alpha    "&#913;" ><!-- greek capital letter alpha, U+0391 -->
 <!ENTITY Beta     "&#914;" ><!-- greek capital letter beta, U+0392 -->
 <!ENTITY Gamma    "&#915;" ><!-- greek capital letter gamma, U+0393 ISOgrk3 -->
 <!ENTITY Delta    "&#916;" ><!-- greek capital letter delta, U+0394 ISOgrk3 -->
 <!ENTITY Epsilon  "&#917;" ><!-- greek capital letter epsilon, U+0395 -->
 <!ENTITY Zeta     "&#918;" ><!-- greek capital letter zeta, U+0396 -->
 <!ENTITY Eta      "&#919;" ><!-- greek capital letter eta, U+0397 -->
 <!ENTITY Theta    "&#920;" ><!-- greek capital letter theta, U+0398 ISOgrk3 -->
 <!ENTITY Iota     "&#921;" ><!-- greek capital letter iota, U+0399 -->
 <!ENTITY Kappa    "&#922;" ><!-- greek capital letter kappa, U+039A -->
 <!ENTITY Lambda   "&#923;" ><!-- greek capital letter lambda, U+039B ISOgrk3 -->
 <!ENTITY Mu       "&#924;" ><!-- greek capital letter mu, U+039C -->
 <!ENTITY Nu       "&#925;" ><!-- greek capital letter nu, U+039D -->
 <!ENTITY Xi       "&#926;" ><!-- greek capital letter xi, U+039E ISOgrk3 -->
 <!ENTITY Omicron  "&#927;" ><!-- greek capital letter omicron, U+039F -->
 <!ENTITY Pi       "&#928;" ><!-- greek capital letter pi, U+03A0 ISOgrk3 -->
 <!ENTITY Rho      "&#929;" ><!-- greek capital letter rho, U+03A1 -->
 <!-- there is no Sigmaf, and no U+03A2 character either -->
 <!ENTITY Sigma    "&#931;" ><!-- greek capital letter sigma, U+03A3 ISOgrk3 -->
 <!ENTITY Tau      "&#932;" ><!-- greek capital letter tau, U+03A4 -->
 <!ENTITY Upsilon  "&#933;" ><!-- greek capital letter upsilon,
                               U+03A5 ISOgrk3 -->
 <!ENTITY Phi      "&#934;" ><!-- greek capital letter phi, U+03A6 ISOgrk3 -->
 <!ENTITY Chi      "&#935;" ><!-- greek capital letter chi, U+03A7 -->
 <!ENTITY Psi      "&#936;" ><!-- greek capital letter psi, U+03A8 ISOgrk3 -->
 <!ENTITY Omega    "&#937;" ><!-- greek capital letter omega, U+03A9 ISOgrk3 -->
 <!ENTITY alpha    "&#945;" ><!-- greek small letter alpha, U+03B1 ISOgrk3 -->
 <!ENTITY beta     "&#946;" ><!-- greek small letter beta, U+03B2 ISOgrk3 -->
 <!ENTITY gamma    "&#947;" ><!-- greek small letter gamma, U+03B3 ISOgrk3 -->
 <!ENTITY delta    "&#948;" ><!-- greek small letter delta, U+03B4 ISOgrk3 -->
 <!ENTITY epsilon  "&#949;" ><!-- greek small letter epsilon, U+03B5 ISOgrk3 -->
 <!ENTITY zeta     "&#950;" ><!-- greek small letter zeta, U+03B6 ISOgrk3 -->
 <!ENTITY eta      "&#951;" ><!-- greek small letter eta, U+03B7 ISOgrk3 -->
 <!ENTITY theta    "&#952;" ><!-- greek small letter theta, U+03B8 ISOgrk3 -->
 <!ENTITY iota     "&#953;" ><!-- greek small letter iota, U+03B9 ISOgrk3 -->
 <!ENTITY kappa    "&#954;" ><!-- greek small letter kappa, U+03BA ISOgrk3 -->
 <!ENTITY lambda   "&#955;" ><!-- greek small letter lambda, U+03BB ISOgrk3 -->
 <!ENTITY mu       "&#956;" ><!-- greek small letter mu, U+03BC ISOgrk3 -->
 <!ENTITY nu       "&#957;" ><!-- greek small letter nu, U+03BD ISOgrk3 -->
 <!ENTITY xi       "&#958;" ><!-- greek small letter xi, U+03BE ISOgrk3 -->
 <!ENTITY omicron  "&#959;" ><!-- greek small letter omicron, U+03BF NEW -->
 <!ENTITY pi       "&#960;" ><!-- greek small letter pi, U+03C0 ISOgrk3 -->
 <!ENTITY rho      "&#961;" ><!-- greek small letter rho, U+03C1 ISOgrk3 -->
 <!ENTITY sigmaf   "&#962;" ><!-- greek small letter final sigma, U+03C2 ISOgrk3 -->
 <!ENTITY sigma    "&#963;" ><!-- greek small letter sigma, U+03C3 ISOgrk3 -->
 <!ENTITY tau      "&#964;" ><!-- greek small letter tau, U+03C4 ISOgrk3 -->
 <!ENTITY upsilon  "&#965;" ><!-- greek small letter upsilon, U+03C5 ISOgrk3 -->
 <!ENTITY phi      "&#966;" ><!-- greek small letter phi, U+03C6 ISOgrk3 -->
 <!ENTITY chi      "&#967;" ><!-- greek small letter chi, U+03C7 ISOgrk3 -->
 <!ENTITY psi      "&#968;" ><!-- greek small letter psi, U+03C8 ISOgrk3 -->
 <!ENTITY omega    "&#969;" ><!-- greek small letter omega, U+03C9 ISOgrk3 -->
 <!ENTITY thetasym "&#977;" ><!-- greek small letter theta symbol, U+03D1 NEW -->
 <!ENTITY upsih    "&#978;" ><!-- greek upsilon with hook symbol, U+03D2 NEW -->
 <!ENTITY piv      "&#982;" ><!-- greek pi symbol, U+03D6 ISOgrk3 -->

 <!-- General Punctuation -->
 <!ENTITY bull     "&#8226;" ><!-- bullet = black small circle, U+2022 ISOpub  -->
 <!-- bullet is NOT the same as bullet operator, U+2219 -->
 <!ENTITY hellip   "&#8230;" ><!-- horizontal ellipsis = three dot leader, U+2026 ISOpub  -->
 <!ENTITY prime    "&#8242;" ><!-- prime = minutes = feet, U+2032 ISOtech -->
 <!ENTITY Prime    "&#8243;" ><!-- double prime = seconds = inches, U+2033 ISOtech -->
 <!ENTITY oline    "&#8254;" ><!-- overline = spacing overscore, U+203E NEW -->
 <!ENTITY frasl    "&#8260;" ><!-- fraction slash, U+2044 NEW -->

 <!-- Letterlike Symbols -->
 <!ENTITY weierp   "&#8472;" ><!-- script capital P = power set = Weierstrass p, U+2118 ISOamso -->
 <!ENTITY image    "&#8465;" ><!-- blackletter capital I = imaginary part, U+2111 ISOamso -->
 <!ENTITY real     "&#8476;" ><!-- blackletter capital R = real part symbol, U+211C ISOamso -->
 <!ENTITY trade    "&#8482;" ><!-- trade mark sign, U+2122 ISOnum -->
 <!ENTITY alefsym  "&#8501;" ><!-- alef symbol = first transfinite cardinal, U+2135 NEW -->
 <!-- alef symbol is NOT the same as hebrew letter alef, U+05D0 although
      the same glyph could be used to depict both characters -->

 <!-- Arrows -->
 <!ENTITY larr     "&#8592;" ><!-- leftwards arrow, U+2190 ISOnum -->
 <!ENTITY uarr     "&#8593;" ><!-- upwards arrow, U+2191 ISOnum-->
 <!ENTITY rarr     "&#8594;" ><!-- rightwards arrow, U+2192 ISOnum -->
 <!ENTITY darr     "&#8595;" ><!-- downwards arrow, U+2193 ISOnum -->
 <!ENTITY harr     "&#8596;" ><!-- left right arrow, U+2194 ISOamsa -->
 <!ENTITY crarr    "&#8629;" ><!-- downwards arrow with corner leftwards
                                = carriage return, U+21B5 NEW -->
 <!ENTITY lArr     "&#8656;" ><!-- leftwards double arrow, U+21D0 ISOtech -->
 <!-- Unicode does not say that lArr is the same as the 'is implied by' arrow
     but also does not have any other character for that function. So ? lArr can
     be used for 'is implied by' as ISOtech suggests -->
 <!ENTITY uArr     "&#8657;" ><!-- upwards double arrow, U+21D1 ISOamsa -->
 <!ENTITY rArr     "&#8658;" ><!-- rightwards double arrow, U+21D2 ISOtech -->
 <!-- Unicode does not say this is the 'implies' character but does not have
      another character with this function so ?
      rArr can be used for 'implies' as ISOtech suggests -->
 <!ENTITY dArr     "&#8659;" ><!-- downwards double arrow, U+21D3 ISOamsa -->
 <!ENTITY hArr     "&#8660;" ><!-- left right double arrow, U+21D4 ISOamsa -->

 <!-- Mathematical Operators -->
 <!ENTITY forall   "&#8704;" ><!-- for all, U+2200 ISOtech -->
 <!ENTITY part     "&#8706;" ><!-- partial differential, U+2202 ISOtech  -->
 <!ENTITY exist    "&#8707;" ><!-- there exists, U+2203 ISOtech -->
 <!ENTITY empty    "&#8709;" ><!-- empty set = null set, U+2205 ISOamso -->
 <!ENTITY nabla    "&#8711;" ><!-- nabla = backward difference, U+2207 ISOtech -->
 <!ENTITY isin     "&#8712;" ><!-- element of, U+2208 ISOtech -->
 <!ENTITY notin    "&#8713;" ><!-- not an element of, U+2209 ISOtech -->
 <!ENTITY ni       "&#8715;" ><!-- contains as member, U+220B ISOtech -->
 <!-- should there be a more memorable name than 'ni'? -->
 <!ENTITY prod     "&#8719;" ><!-- n-ary product = product sign, U+220F ISOamsb -->
 <!-- prod is NOT the same character as U+03A0 'greek capital letter pi' though
      the same glyph might be used for both -->
 <!ENTITY sum      "&#8721;" ><!-- n-ary sumation, U+2211 ISOamsb -->
 <!-- sum is NOT the same character as U+03A3 'greek capital letter sigma'
      though the same glyph might be used for both -->
 <!ENTITY minus    "&#8722;" ><!-- minus sign, U+2212 ISOtech -->
 <!ENTITY lowast   "&#8727;" ><!-- asterisk operator, U+2217 ISOtech -->
 <!ENTITY radic    "&#8730;" ><!-- square root = radical sign, U+221A ISOtech -->
 <!ENTITY prop     "&#8733;" ><!-- proportional to, U+221D ISOtech -->
 <!ENTITY infin    "&#8734;" ><!-- infinity, U+221E ISOtech -->
 <!ENTITY ang      "&#8736;" ><!-- angle, U+2220 ISOamso -->
 <!ENTITY and      "&#8743;" ><!-- logical and = wedge, U+2227 ISOtech -->
 <!ENTITY or       "&#8744;" ><!-- logical or = vee, U+2228 ISOtech -->
 <!ENTITY cap      "&#8745;" ><!-- intersection = cap, U+2229 ISOtech -->
 <!ENTITY cup      "&#8746;" ><!-- union = cup, U+222A ISOtech -->
 <!ENTITY int      "&#8747;" ><!-- integral, U+222B ISOtech -->
 <!ENTITY there4   "&#8756;" ><!-- therefore, U+2234 ISOtech -->
 <!ENTITY sim      "&#8764;" ><!-- tilde operator = varies with = similar to, U+223C ISOtech -->
 <!-- tilde operator is NOT the same character as the tilde, U+007E,
      although the same glyph might be used to represent both  -->
 <!ENTITY cong     "&#8773;" ><!-- approximately equal to, U+2245 ISOtech -->
 <!ENTITY asymp    "&#8776;" ><!-- almost equal to = asymptotic to, U+2248 ISOamsr -->
 <!ENTITY ne       "&#8800;" ><!-- not equal to, U+2260 ISOtech -->
 <!ENTITY equiv    "&#8801;" ><!-- identical to, U+2261 ISOtech -->
 <!ENTITY le       "&#8804;" ><!-- less-than or equal to, U+2264 ISOtech -->
 <!ENTITY ge       "&#8805;" ><!-- greater-than or equal to, U+2265 ISOtech -->
 <!ENTITY sub      "&#8834;" ><!-- subset of, U+2282 ISOtech -->
 <!ENTITY sup      "&#8835;" ><!-- superset of, U+2283 ISOtech -->
 <!-- note that nsup, 'not a superset of, U+2283' is not covered by the Symbol
      font encoding and is not included. Should it be, for symmetry?
      It is in ISOamsn  -->
 <!ENTITY nsub     "&#8836;" ><!-- not a subset of, U+2284 ISOamsn -->
 <!ENTITY sube     "&#8838;" ><!-- subset of or equal to, U+2286 ISOtech -->
 <!ENTITY supe     "&#8839;" ><!-- superset of or equal to, U+2287 ISOtech -->
 <!ENTITY oplus    "&#8853;" ><!-- circled plus = direct sum, U+2295 ISOamsb -->
 <!ENTITY otimes   "&#8855;" ><!-- circled times = vector product, U+2297 ISOamsb -->
 <!ENTITY perp     "&#8869;" ><!-- up tack = orthogonal to = perpendicular, U+22A5 ISOtech -->
 <!ENTITY sdot     "&#8901;" ><!-- dot operator, U+22C5 ISOamsb -->
 <!-- dot operator is NOT the same character as U+00B7 middle dot -->

 <!-- Miscellaneous Technical -->
 <!ENTITY lceil    "&#8968;" ><!-- left ceiling = apl upstile, U+2308 ISOamsc  -->
 <!ENTITY rceil    "&#8969;" ><!-- right ceiling, U+2309 ISOamsc  -->
 <!ENTITY lfloor   "&#8970;" ><!-- left floor = apl downstile, U+230A ISOamsc  -->
 <!ENTITY rfloor   "&#8971;" ><!-- right floor, U+230B ISOamsc  -->
 <!ENTITY lang     "&#9001;" ><!-- left-pointing angle bracket = bra, U+2329 ISOtech -->
 <!-- lang is NOT the same character as U+003C 'less than'
      or U+2039 'single left-pointing angle quotation mark' -->
 <!ENTITY rang     "&#9002;" ><!-- right-pointing angle bracket = ket, U+232A ISOtech -->
 <!-- rang is NOT the same character as U+003E 'greater than'
      or U+203A 'single right-pointing angle quotation mark' -->

 <!-- Geometric Shapes -->
 <!ENTITY loz      "&#9674;" ><!-- lozenge, U+25CA ISOpub -->

 <!-- Miscellaneous Symbols -->
 <!ENTITY spades   "&#9824;" ><!-- black spade suit, U+2660 ISOpub -->
 <!-- black here seems to mean filled as opposed to hollow -->
 <!ENTITY clubs    "&#9827;" ><!-- black club suit = shamrock, U+2663 ISOpub -->
 <!ENTITY hearts   "&#9829;" ><!-- black heart suit = valentine, U+2665 ISOpub -->
 <!ENTITY diams    "&#9830;" ><!-- black diamond suit, U+2666 ISOpub -->

 <!-- end of xhtml-symbol.ent -->
 """
   return text

 def get_apache_license():
   license = r"""/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 """
   return license

 main()