blob: a2a1069fa79dd5bdad12332107e3a683688a2866 [file] [log] [blame]
#!/usr/bin/perl
# <@LICENSE>
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to you under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at:
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# </@LICENSE>
#
# RUN AND EDIT THIS SCRIPT IN ISO-8859-1 LOCALE
#
# Usage: lm_to_utf8.pl <xx.lm> [xx.utf8.lm]
#
# Helper to convert to latin1 based .lm into utf8.
#
# Run without output parameter to see what happens:
# lm_to_utf8.pl fi.lm
#
# Check that no obvious unknown chars are found (add to %conv_utf8 as needed)
# When satisfied, give output file:
# lm_to_utf8.pl fi.lm fi.utf8.lm
#
die "Missing lm" unless -f $ARGV[0];
my %conv_utf8 = (
'«' => "\xC3\xAB",
'»' => "\xC3\xBB",
'Ý' => "\xC3\x9D",
'Þ' => "\xC3\x9E",
'ß' => "\xC3\x9F",
'à' => "\xC3\xA0",
'á' => "\xC3\xA1",
'â' => "\xC3\xA2",
'ã' => "\xC3\xA3",
'ä' => "\xC3\xA4",
'å' => "\xC3\xA5",
'æ' => "\xC3\xA6",
'ç' => "\xC3\xA7",
'è' => "\xC3\xA8",
'é' => "\xC3\xA9",
'ê' => "\xC3\xAA",
'ì' => "\xC3\xAC",
'í' => "\xC3\xAD",
'î' => "\xC3\xAE",
'ï' => "\xC3\xAF",
'ð' => "\xC3\xB0",
'ñ' => "\xC3\xB1",
'ò' => "\xC3\xB2",
'ó' => "\xC3\xB3",
'ô' => "\xC3\xB4",
'õ' => "\xC3\xB5",
'ö' => "\xC3\xB6",
'÷' => "\xC3\xB7",
'ø' => "\xC3\xB8",
'ù' => "\xC3\xB9",
'ú' => "\xC3\xBA",
'û' => "\xC3\xBB",
'ü' => "\xC3\xBC",
'ý' => "\xC3\xBD",
'þ' => "\xC3\xBE",
'ÿ' => "\xC3\xB8",
);
my $conv_chars = join('', keys %conv_utf8);
load_models($ARGV[0]);
sub load_models {
my ($indir) = @_;
if ($ARGV[1]) {
open(OUT, ">$ARGV[1]") or die;
binmode OUT or die;
}
open(IN, $ARGV[0]) or die;
binmode IN or die;
my $changes = 0;
while (<IN>) {
s/\r?\n$//;
/^([^0-9\s]+)/ or die;
my $orig_g = $1;
my $g = $orig_g;
unless ($g =~ /^[a-zA-Z${conv_chars},.':;!`*"?()\-\x00]+$/) {
print "UNKNOWN CHAR FOUND: $g\n"
}
foreach (keys %conv_utf8) {
if ($g =~ s/$_/$conv_utf8{$_}/g) {
$changes++;
print "Changing: $orig_g -> $g\n" or die;
}
}
if ($ARGV[1]) {
print OUT "$g\n" or die;
}
}
close OUT if $ARGV[1];
close IN or die;
print "Total changes: $changes\n";
}