blob: cda226385df4d6974818d8db41170ee6fe659d5d [file] [log] [blame]
#!/usr/bin/perl
# Matt Post <2011-02-04>
# Inserts an OOV tag above singleton nodes, using a modified
# definition of "singleton" in which a singleton is defined to be any
# word occuring the minumum number of times any word occurred. Works
# by taking a stream of Treebank-style parse trees on STDIN, together
# with a lexicon (which contains vocabulary counts). Writes the
# modified trees to STDOUT.
# Usage:
# cat TREES | add-OOVs.pl LEXICON > TREES.OOV
use strict;
use warnings;
my %SINGLETONS;
my $lexicon_file = shift or die "need lexicon";
# If a second argument is passed, replace the preterminal instead of inserting below it.
my $REPLACE = shift || 0;
# pass over once to find the minimum count of any item in the lexicon
my $min_count = 0;
open LEX, $lexicon_file or die "can't read lexicon '$lexicon_file'";
while (my $line = <LEX>) {
my ($id,$word,$count) = split(' ',$line);
$min_count = $count if ($min_count == 0 or $count < $min_count);
# short-circuit if we find a 1
last if $min_count == 1;
}
close(LEX);
# pass over again to mark as "singletons" anything appearing min_count times
open LEX, $lexicon_file or die "can't read lexicon '$lexicon_file'";
while (my $line = <LEX>) {
my ($id,$word,$count) = split(' ',$line);
$SINGLETONS{$word} = 1 if ($count == $min_count);
}
close(LEX);
while (<>) {
s/\((\S+)\s+(\S+)\)/maybe_add_OOV($1,$2)/ge;
print;
}
sub maybe_add_OOV {
my ($pos,$word) = @_;
if (exists $SINGLETONS{$word}) {
return $REPLACE ? "(OOV $word)" : "($pos (OOV $word))";
} else {
return "($pos $word)";
}
}