AOO410/main/i18npool/source/isolang/lcid.awk - openoffice - Git at Google

 #!/usr/bin/awk -f
 # *************************************************************
 #
 #  Licensed to the Apache Software Foundation (ASF) under one
 #  or more contributor license agreements.  See the NOTICE file
 #  distributed with this work for additional information
 #  regarding copyright ownership.  The ASF licenses this file
 #  to you under the Apache License, Version 2.0 (the
 #  "License"); you may not use this file except in compliance
 #  with the License.  You may obtain a copy of the License at
 #
 #    http://www.apache.org/licenses/LICENSE-2.0
 #
 #  Unless required by applicable law or agreed to in writing,
 #  software distributed under the License is distributed on an
 #  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 #  KIND, either express or implied.  See the License for the
 #  specific language governing permissions and limitations
 #  under the License.
 #
 # *************************************************************
 #
 # Utility to compare MS-LANGID definitions with those defined in ../../inc/i18npool/lang.h
 # Run in i18npool/source/isolang
 #
 # outputs new #define LANGUAGE_... 0x... and also some commented out substrings
 # that were matched in already existing defines.
 #
 # ATTENTION! The sed filter in the command line examples below assures that a
 # '|' border is drawn by html2text in data tables, and nowhere else, on which
 # this awk script relies. This script also heavily relies on the column layout
 # encountered. Should MS decide to change their layout or their CSS names
 # ("data..."), this would probably break. Should html2text decide that the last
 # border="..." attribute encountered wins instead of the first, this may break
 # also.
 #
 # sed -e 's/|/,/g; s/<TABLE/<table/g; /<table/\!b; s/\(<table[^>]*\)\(border\|BORDER\)="[0-9]*"/\1/g; s/\(<table\)\([^>]*\(class\|CLASS\)="data\)/\1 border="1"\2/g'
 #
 # After html2text best if file cleaned up to _only_ contain the table entries,
 # but not necessary, entries are filtered. Check output.
 #
 # Expects input from the saved page of one of
 #
 # (1)
 # http://www.microsoft.com/globaldev/reference/lcid-all.mspx
 # filtered through ``html2text -nobs ...'', generated table:
 # blank,name,hex,dec,blank fields:
 #    |Afrikaans_-_South_Africa___|0436___|1078___|
 #
 # complete command line:
 # lynx -dump -source http://www.microsoft.com/globaldev/reference/lcid-all.mspx | sed -e 's/|/,/g; s/<TABLE/<table/g; /<table/\!b; s/\(<table[^>]*\)\(border\|BORDER\)="[0-9]*"/\1/g; s/\(<table\)\([^>]*\(class\|CLASS\)="data\)/\1 border="1"\2/g' | html2text -nobs -width 234 | awk -f lcid.awk >outfile
 #
 #
 # (2)
 # http://www.microsoft.com/globaldev/reference/winxp/xp-lcid.mspx
 # filtered through ``html2text -nobs ...'', generated table:
 # blank,name,hex,dec,inputlocales,collection,blank fields:
 #    |Afrikaans   |0436   |1078   |0436:00000409,   |Basic   |
 #
 # complete command line:
 # lynx -dump -source http://www.microsoft.com/globaldev/reference/winxp/xp-lcid.mspx | sed -e 's/|/,/g; s/<TABLE/<table/g; /<table/\!b; s/\(<table[^>]*\)\(border\|BORDER\)="[0-9]*"/\1/g; s/\(<table\)\([^>]*\(class\|CLASS\)="data\)/\1 border="1"\2/g' | html2text -nobs -width 234 | awk -f lcid.awk >outfile
 #
 #
 # (3)
 # http://msdn.microsoft.com/library/en-us/intl/nls_238z.asp
 # filtered through ``html2text -nobs ...'', generated table:
 # blank,hex,locale,name,blank  fields:
 #   |0x0436___|af-ZA___|Afrikaans_(South_Africa)___|
 #
 # complete command line:
 # lynx -dump -source http://msdn.microsoft.com/library/en-us/intl/nls_238z.asp | sed -e 's/|/,/g; s/<TABLE/<table/g; /<table/\!b; s/\(<table[^>]*\)\(border\|BORDER\)="[0-9]*"/\1/g; s/\(<table\)\([^>]*\(class\|CLASS\)="data\)/\1 border="1"\2/g' | html2text -nobs -width 234 | awk -f lcid.awk >outfile
 #
 # Author: Eike Rathke <erack@sun.com>, <er@openoffice.org>
 #

 BEGIN {
     while ((getline < "../../inc/i18npool/lang.h") > 0)
     {
         if ($0 ~ /^#define[ ]*LANGUAGE_[_A-Za-z0-9]*[ ]*0x[0-9a-fA-F]/)
         {
             # lang[HEX]=NAME
             lang[toupper(substr($3,3))] = toupper($2)
             #print substr($3,3) "=" $2
         }
     }
     # html2text table follows
     FS = "\|"
     filetype = 0
     lcid_all = 1
     xp_lcid  = 2
     nls_238z = 3
     filetypename[filetype] = "unknown"
     filetypename[lcid_all] = "lcid_all"
     filetypename[xp_lcid]  = "xp_lcid"
     filetypename[nls_238z] = "nls_238z"
     namefield[lcid_all] = 2
     namefield[xp_lcid]  = 2
     namefield[nls_238z] = 4
     hexfield[lcid_all]  = 3
     hexfield[xp_lcid]   = 3
     hexfield[nls_238z]  = 2
     locfield[lcid_all]  = 0
     locfield[xp_lcid]   = 0
     locfield[nls_238z]  = 3
 }

 (NF < 5) { next }

 !filetype {
     if (NF == 5)
     {
         if ($2 ~ /^0x/)
             filetype = nls_238z
         else if ($2 ~ /^Afrikaans/)
             filetype = lcid_all
     }
     else if (NF == 7)
         filetype = xp_lcid
     if (!filetype)
         next
     name = namefield[filetype]
     hex = hexfield[filetype]
     loc = locfield[filetype]
 }

 {
     gsub( /^[^:]*:/, "", $name)
     gsub( /\..*/, "", $name)
     gsub( /(^[ _]+)|([ _]+$)/, "", $hex)
     gsub( /(^[ _]+)|([ _]+$)/, "", $name)
     if (loc)
         gsub( /(^[ _]+)|([ _]+$)/, "", $loc)
 }

 ($hex ~ /^0x/) { $hex = substr( $hex, 3) }

 # if only 464 instead of 0464, make it match lang.h
 (length($hex) < 4) { $hex = "0" $hex }

 ($hex !~ /^[0-9a-fA-F][0-9a-fA-F]*$/) { filtered[$hex] = $0; next }

 # all[HEX]=string
 { all[toupper($hex)] = $name }

 (loc) { comment[toupper($hex)] = "  /* " $loc " */" }

 # new hex: newlang[HEX]=string
 !(toupper($hex) in lang) { newlang[toupper($hex)] = $name }

 END {
     if (!filetype)
     {
         print "No file type recognized." >>"/dev/stderr"
         exit(1)
     }
     print "// assuming " filetypename[filetype] " file"
     # every new language
     for (x in newlang)
     {
         printf( "xxxxxxx LANGUAGE_%-26s 0x%s%s\n", newlang[x], x, comment[x])
         n = split(newlang[x],arr,/[^A-Za-z0-9]/)
         def = ""
         for (i=1; i<=n; ++i)
         {
             if (length(arr[i]))
             {
                 # each identifier word of the language name
                 if (def)
                     def = def "_"
                 aup = toupper(arr[i])
                 def = def aup
                 for (l in lang)
                 {
                     #  contained in already existing definitions?
                     if (lang[l] ~ aup)
                         printf( "// %-50s %s\n", arr[i] ": " lang[l], l)
                 }
             }
         }
         printf( "#define LANGUAGE_%-26s 0x%s\n", def, x)
     }
     print "\n// --- reverse check follows ----------------------------------\n"
     for (x in lang)
     {
         if (!(x in all))
             print "// not in input file:   " x "  " lang[x]
     }
     print "\n// --- filtered table entries follow (if any) -----------------\n"
     for (x in filtered)
         print "// filtered:   " x "  " filtered[x]
 }
	#!/usr/bin/awk -f
	# *************************************************************
	#
	# Licensed to the Apache Software Foundation (ASF) under one
	# or more contributor license agreements. See the NOTICE file
	# distributed with this work for additional information
	# regarding copyright ownership. The ASF licenses this file
	# to you under the Apache License, Version 2.0 (the
	# "License"); you may not use this file except in compliance
	# with the License. You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing,
	# software distributed under the License is distributed on an
	# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	# KIND, either express or implied. See the License for the
	# specific language governing permissions and limitations
	# under the License.
	#
	# *************************************************************
	#
	# Utility to compare MS-LANGID definitions with those defined in ../../inc/i18npool/lang.h
	# Run in i18npool/source/isolang
	#
	# outputs new #define LANGUAGE_... 0x... and also some commented out substrings
	# that were matched in already existing defines.
	#
	# ATTENTION! The sed filter in the command line examples below assures that a
	# '\|' border is drawn by html2text in data tables, and nowhere else, on which
	# this awk script relies. This script also heavily relies on the column layout
	# encountered. Should MS decide to change their layout or their CSS names
	# ("data..."), this would probably break. Should html2text decide that the last
	# border="..." attribute encountered wins instead of the first, this may break
	# also.
	#
	# sed -e 's/\|/,/g; s/<TABLE/<table/g; /<table/\!b; s/\(<table[^>]\)\(border\\|BORDER\)="[0-9]"/\1/g; s/\(<table\)\([^>]*\(class\\|CLASS\)="data\)/\1 border="1"\2/g'
	#
	# After html2text best if file cleaned up to _only_ contain the table entries,
	# but not necessary, entries are filtered. Check output.
	#
	# Expects input from the saved page of one of
	#
	# (1)
	# http://www.microsoft.com/globaldev/reference/lcid-all.mspx
	# filtered through ``html2text -nobs ...'', generated table:
	# blank,name,hex,dec,blank fields:
	# \|Afrikaans_-_South_Africa___\|0436___\|1078___\|
	#
	# complete command line:
	# lynx -dump -source http://www.microsoft.com/globaldev/reference/lcid-all.mspx \| sed -e 's/\|/,/g; s/<TABLE/<table/g; /<table/\!b; s/\(<table[^>]\)\(border\\|BORDER\)="[0-9]"/\1/g; s/\(<table\)\([^>]*\(class\\|CLASS\)="data\)/\1 border="1"\2/g' \| html2text -nobs -width 234 \| awk -f lcid.awk >outfile
	#
	#
	# (2)
	# http://www.microsoft.com/globaldev/reference/winxp/xp-lcid.mspx
	# filtered through ``html2text -nobs ...'', generated table:
	# blank,name,hex,dec,inputlocales,collection,blank fields:
	# \|Afrikaans \|0436 \|1078 \|0436:00000409, \|Basic \|
	#
	# complete command line:
	# lynx -dump -source http://www.microsoft.com/globaldev/reference/winxp/xp-lcid.mspx \| sed -e 's/\|/,/g; s/<TABLE/<table/g; /<table/\!b; s/\(<table[^>]\)\(border\\|BORDER\)="[0-9]"/\1/g; s/\(<table\)\([^>]*\(class\\|CLASS\)="data\)/\1 border="1"\2/g' \| html2text -nobs -width 234 \| awk -f lcid.awk >outfile
	#
	#
	# (3)
	# http://msdn.microsoft.com/library/en-us/intl/nls_238z.asp
	# filtered through ``html2text -nobs ...'', generated table:
	# blank,hex,locale,name,blank fields:
	# \|0x0436___\|af-ZA___\|Afrikaans_(South_Africa)___\|
	#
	# complete command line:
	# lynx -dump -source http://msdn.microsoft.com/library/en-us/intl/nls_238z.asp \| sed -e 's/\|/,/g; s/<TABLE/<table/g; /<table/\!b; s/\(<table[^>]\)\(border\\|BORDER\)="[0-9]"/\1/g; s/\(<table\)\([^>]*\(class\\|CLASS\)="data\)/\1 border="1"\2/g' \| html2text -nobs -width 234 \| awk -f lcid.awk >outfile
	#
	# Author: Eike Rathke <erack@sun.com>, <er@openoffice.org>
	#

	BEGIN {
	while ((getline < "../../inc/i18npool/lang.h") > 0)
	{
	if ($0 ~ /^#define[ ]LANGUAGE_[_A-Za-z0-9][ ]*0x[0-9a-fA-F]/)
	{
	# lang[HEX]=NAME
	lang[toupper(substr($3,3))] = toupper($2)
	#print substr($3,3) "=" $2
	}
	}
	# html2text table follows
	FS = "\\|"
	filetype = 0
	lcid_all = 1
	xp_lcid = 2
	nls_238z = 3
	filetypename[filetype] = "unknown"
	filetypename[lcid_all] = "lcid_all"
	filetypename[xp_lcid] = "xp_lcid"
	filetypename[nls_238z] = "nls_238z"
	namefield[lcid_all] = 2
	namefield[xp_lcid] = 2
	namefield[nls_238z] = 4
	hexfield[lcid_all] = 3
	hexfield[xp_lcid] = 3
	hexfield[nls_238z] = 2
	locfield[lcid_all] = 0
	locfield[xp_lcid] = 0
	locfield[nls_238z] = 3
	}

	(NF < 5) { next }

	!filetype {
	if (NF == 5)
	{
	if ($2 ~ /^0x/)
	filetype = nls_238z
	else if ($2 ~ /^Afrikaans/)
	filetype = lcid_all
	}
	else if (NF == 7)
	filetype = xp_lcid
	if (!filetype)
	next
	name = namefield[filetype]
	hex = hexfield[filetype]
	loc = locfield[filetype]
	}

	{
	gsub( /^[^:]*:/, "", $name)
	gsub( /\..*/, "", $name)
	gsub( /(^[ _]+)\|([ _]+$)/, "", $hex)
	gsub( /(^[ _]+)\|([ _]+$)/, "", $name)
	if (loc)
	gsub( /(^[ _]+)\|([ _]+$)/, "", $loc)
	}

	($hex ~ /^0x/) { $hex = substr( $hex, 3) }

	# if only 464 instead of 0464, make it match lang.h
	(length($hex) < 4) { $hex = "0" $hex }

	($hex !~ /^[0-9a-fA-F][0-9a-fA-F]*$/) { filtered[$hex] = $0; next }

	# all[HEX]=string
	{ all[toupper($hex)] = $name }

	(loc) { comment[toupper($hex)] = " /* " $loc " */" }

	# new hex: newlang[HEX]=string
	!(toupper($hex) in lang) { newlang[toupper($hex)] = $name }

	END {
	if (!filetype)
	{
	print "No file type recognized." >>"/dev/stderr"
	exit(1)
	}
	print "// assuming " filetypename[filetype] " file"
	# every new language
	for (x in newlang)
	{
	printf( "xxxxxxx LANGUAGE_%-26s 0x%s%s\n", newlang[x], x, comment[x])
	n = split(newlang[x],arr,/[^A-Za-z0-9]/)
	def = ""
	for (i=1; i<=n; ++i)
	{
	if (length(arr[i]))
	{
	# each identifier word of the language name
	if (def)
	def = def "_"
	aup = toupper(arr[i])
	def = def aup
	for (l in lang)
	{
	# contained in already existing definitions?
	if (lang[l] ~ aup)
	printf( "// %-50s %s\n", arr[i] ": " lang[l], l)
	}
	}
	}
	printf( "#define LANGUAGE_%-26s 0x%s\n", def, x)
	}
	print "\n// --- reverse check follows ----------------------------------\n"
	for (x in lang)
	{
	if (!(x in all))
	print "// not in input file: " x " " lang[x]
	}
	print "\n// --- filtered table entries follow (if any) -----------------\n"
	for (x in filtered)
	print "// filtered: " x " " filtered[x]
	}