| #!/usr/bin/awk -f |
| # ************************************************************* |
| # |
| # Licensed to the Apache Software Foundation (ASF) under one |
| # or more contributor license agreements. See the NOTICE file |
| # distributed with this work for additional information |
| # regarding copyright ownership. The ASF licenses this file |
| # to you under the Apache License, Version 2.0 (the |
| # "License"); you may not use this file except in compliance |
| # with the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, |
| # software distributed under the License is distributed on an |
| # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| # KIND, either express or implied. See the License for the |
| # specific language governing permissions and limitations |
| # under the License. |
| # |
| # ************************************************************* |
| # |
| # Utility to compare MS-LANGID definitions with those defined in ../../inc/i18npool/lang.h |
| # Run in i18npool/source/isolang |
| # |
| # outputs new #define LANGUAGE_... 0x... and also some commented out substrings |
| # that were matched in already existing defines. |
| # |
| # ATTENTION! The sed filter in the command line examples below assures that a |
| # '|' border is drawn by html2text in data tables, and nowhere else, on which |
| # this awk script relies. This script also heavily relies on the column layout |
| # encountered. Should MS decide to change their layout or their CSS names |
| # ("data..."), this would probably break. Should html2text decide that the last |
| # border="..." attribute encountered wins instead of the first, this may break |
| # also. |
| # |
| # sed -e 's/|/,/g; s/<TABLE/<table/g; /<table/\!b; s/\(<table[^>]*\)\(border\|BORDER\)="[0-9]*"/\1/g; s/\(<table\)\([^>]*\(class\|CLASS\)="data\)/\1 border="1"\2/g' |
| # |
# After html2text it is best to clean up the file so that it contains _only_
# the table entries. This is not strictly necessary, because lines that do not
# look like table entries are filtered out, but do check the output.
| # |
| # Expects input from the saved page of one of |
| # |
| # (1) |
| # http://www.microsoft.com/globaldev/reference/lcid-all.mspx |
| # filtered through ``html2text -nobs ...'', generated table: |
| # blank,name,hex,dec,blank fields: |
| # |Afrikaans_-_South_Africa___|0436___|1078___| |
| # |
| # complete command line: |
| # lynx -dump -source http://www.microsoft.com/globaldev/reference/lcid-all.mspx | sed -e 's/|/,/g; s/<TABLE/<table/g; /<table/\!b; s/\(<table[^>]*\)\(border\|BORDER\)="[0-9]*"/\1/g; s/\(<table\)\([^>]*\(class\|CLASS\)="data\)/\1 border="1"\2/g' | html2text -nobs -width 234 | awk -f lcid.awk >outfile |
| # |
| # |
| # (2) |
| # http://www.microsoft.com/globaldev/reference/winxp/xp-lcid.mspx |
| # filtered through ``html2text -nobs ...'', generated table: |
| # blank,name,hex,dec,inputlocales,collection,blank fields: |
| # |Afrikaans |0436 |1078 |0436:00000409, |Basic | |
| # |
| # complete command line: |
| # lynx -dump -source http://www.microsoft.com/globaldev/reference/winxp/xp-lcid.mspx | sed -e 's/|/,/g; s/<TABLE/<table/g; /<table/\!b; s/\(<table[^>]*\)\(border\|BORDER\)="[0-9]*"/\1/g; s/\(<table\)\([^>]*\(class\|CLASS\)="data\)/\1 border="1"\2/g' | html2text -nobs -width 234 | awk -f lcid.awk >outfile |
| # |
| # |
| # (3) |
| # http://msdn.microsoft.com/library/en-us/intl/nls_238z.asp |
| # filtered through ``html2text -nobs ...'', generated table: |
| # blank,hex,locale,name,blank fields: |
| # |0x0436___|af-ZA___|Afrikaans_(South_Africa)___| |
| # |
| # complete command line: |
| # lynx -dump -source http://msdn.microsoft.com/library/en-us/intl/nls_238z.asp | sed -e 's/|/,/g; s/<TABLE/<table/g; /<table/\!b; s/\(<table[^>]*\)\(border\|BORDER\)="[0-9]*"/\1/g; s/\(<table\)\([^>]*\(class\|CLASS\)="data\)/\1 border="1"\2/g' | html2text -nobs -width 234 | awk -f lcid.awk >outfile |
| # |
| # Author: Eike Rathke <erack@sun.com>, <er@openoffice.org> |
| # |
| |
# Set-up: load the LANGUAGE_* definitions that already exist in lang.h and
# describe the column layout of each supported input table.
BEGIN {
    langh = "../../inc/i18npool/lang.h"
    # getline yields 1 per line read, 0 at end of file and -1 on error.
    while ((status = (getline < langh)) > 0)
    {
        if ($0 ~ /^#define[ ]*LANGUAGE_[_A-Za-z0-9]*[ ]*0x[0-9a-fA-F]/)
        {
            # lang[HEX]=NAME, the key is the hex value without its 0x prefix
            lang[toupper(substr($3,3))] = toupper($2)
        }
    }
    if (status < 0)
    {
        # Without lang.h every input entry would be reported as new, so say
        # so instead of silently producing misleading output.
        print "Warning: cannot read " langh ", all entries will be reported as new." >>"/dev/stderr"
    }
    else
        close(langh)

    # html2text table follows; a single-character FS is taken literally,
    # so no escaping is needed (and "\|" is not a defined string escape).
    FS = "|"

    # Input table types; 0 means "not recognized yet".
    filetype = 0
    lcid_all = 1
    xp_lcid = 2
    nls_238z = 3
    filetypename[filetype] = "unknown"
    filetypename[lcid_all] = "lcid_all"
    filetypename[xp_lcid] = "xp_lcid"
    filetypename[nls_238z] = "nls_238z"

    # Field number of the language name column per table type.
    namefield[lcid_all] = 2
    namefield[xp_lcid] = 2
    namefield[nls_238z] = 4

    # Field number of the hexadecimal code column per table type.
    hexfield[lcid_all] = 3
    hexfield[xp_lcid] = 3
    hexfield[nls_238z] = 2

    # Field number of the locale column per table type; 0 means no such column.
    locfield[lcid_all] = 0
    locfield[xp_lcid] = 0
    locfield[nls_238z] = 3
}
| |
# A table row has at least five '|'-separated fields; ignore anything shorter.
NF < 5 {
    next
}
| |
# Runs only while the input format is still unknown. Decides the format from
# the field count and the content of the second field, then remembers which
# field numbers hold the name, the hex code and the locale. Rows seen before a
# format could be determined are skipped.
!filetype {
    if (NF == 7)
        filetype = xp_lcid
    else if (NF == 5 && $2 ~ /^0x/)
        filetype = nls_238z
    else if (NF == 5 && $2 ~ /^Afrikaans/)
        filetype = lcid_all

    # still undecided: drop this row and try again with the next one
    if (!filetype)
        next

    loc = locfield[filetype]
    hex = hexfield[filetype]
    name = namefield[filetype]
}
| |
# Tidy up the columns of interest before they are examined.
{
    # hex column: strip leading and trailing blanks and underscore padding
    gsub( /(^[ _]+)|([ _]+$)/, "", $hex)

    # name column: remove everything up to and including the first colon,
    # cut off everything from the first dot onwards, then strip the padding
    sub( /^[^:]*:/, "", $name)
    sub( /\..*/, "", $name)
    gsub( /(^[ _]+)|([ _]+$)/, "", $name)

    # locale column, only for formats that have one
    if (loc)
        gsub( /(^[ _]+)|([ _]+$)/, "", $loc)
}
| |
# Strip a leading hexadecimal prefix (0x or 0X) from the code column.
($hex ~ /^0[xX]/) { $hex = substr( $hex, 3) }

# Left-pad short codes with zeros to four digits (464 -> 0464, 7F -> 007F) so
# that they match the four-digit values used in lang.h. An empty column is
# deliberately left empty: the hexadecimal check that follows then rejects the
# row instead of recording it as code "0".
(length($hex) > 0 && length($hex) < 4) { $hex = substr( "0000" $hex, length($hex) + 1) }
| |
# Rows whose code column is not made up of hexadecimal digits only are not
# table entries: remember them for the report at the end and stop here.
$hex !~ /^[0-9a-fA-F][0-9a-fA-F]*$/ {
    filtered[$hex] = $0
    next
}
| |
# Book-keeping for an accepted row, keyed by the upper-cased hex code:
#   all[HEX]     = language name of every entry seen in the input
#   comment[HEX] = C comment holding the locale, if the format has that column
#   newlang[HEX] = language name of entries whose code is not yet in lang.h
{
    key = toupper($hex)
    all[key] = $name
    if (loc)
        comment[key] = " /* " $loc " */"
    if (!(key in lang))
        newlang[key] = $name
}
| |
# Final report: proposed #defines for codes missing from lang.h, hints about
# existing definitions that contain the same words, the reverse check of
# lang.h against the input, and the rows that were filtered out.
END {
    if (!filetype)
    {
        print "No file type recognized." >>"/dev/stderr"
        exit(1)
    }
    print "// assuming " filetypename[filetype] " file"

    # one group of lines per code that lang.h does not know yet
    for (code in newlang)
    {
        # the name exactly as found in the input, plus the locale comment
        printf( "xxxxxxx LANGUAGE_%-26s 0x%s%s\n", newlang[code], code, comment[code])

        # build an identifier from the alphanumeric words of the name
        nwords = split(newlang[code], words, /[^A-Za-z0-9]/)
        ident = ""
        for (w = 1; w <= nwords; w++)
        {
            if (length(words[w]) == 0)
                continue
            upper = toupper(words[w])
            ident = (ident ? ident "_" : "") upper

            # list every existing definition whose name contains this word
            for (known in lang)
            {
                if (lang[known] ~ upper)
                    printf( "// %-50s %s\n", words[w] ": " lang[known], known)
            }
        }
        printf( "#define LANGUAGE_%-26s 0x%s\n", ident, code)
    }

    print "\n// --- reverse check follows ----------------------------------\n"
    for (code in lang)
    {
        if (code in all)
            continue
        print "// not in input file: " code " " lang[code]
    }

    print "\n// --- filtered table entries follow (if any) -----------------\n"
    for (code in filtered)
        print "// filtered: " code " " filtered[code]
}