blob: 2248c0a0641c5ad7fb33e19f92139ab3d55ea42a [file] [log] [blame]
#!/usr/bin/perl
# Matt Post <post@cs.jhu.edu>
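# Takes a list of tab-separated strings on STDIN and a single argument N, a threshold (default 100).
# If either of the first two fields has more than N tokens, the line is skipped. Lines with fewer
# than two fields, or where either of the first two fields contains no tokens, are skipped as well.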
# e.g.,
# paste corpus.en corpus.fr | trim_parallel_corpus.pl 40 | split2files.pl en.trimmed.40 fr.trimmed.40
use strict;
use warnings;
use Scalar::Util qw(looks_like_number);

# Maximum number of tokens allowed in each of the first two fields. Falls back
# to 100 when no argument (or a false value such as 0) is supplied.
my $thresh = shift || 100;

# A non-numeric threshold (for example a file name passed by mistake) would
# compare as 0 and silently discard every line, so reject it up front.
die "Usage: $0 [max_tokens] < tab-separated-input\n"
    . "max_tokens must be a positive number, got '$thresh'\n"
    unless looks_like_number($thresh) && $thresh > 0;

# Copy through, unmodified, every input line that passes the length check.
while (my $line = <>) {
    print $line if keep_pair($line, $thresh);
}

# keep_pair($line, $max_tokens)
#
# Returns true when $line has at least two tab-separated fields and each of
# the first two fields contains between 1 and $max_tokens whitespace-separated
# tokens; returns false otherwise. Any fields after the second are ignored.
sub keep_pair {
    my ($line, $max_tokens) = @_;

    # The limit of 3 stops the second field from absorbing later fields.
    my ($side1, $side2) = split(/\t/, $line, 3);

    # A line without a tab yields no second field.
    return unless defined $side1 && defined $side2;

    for my $side ($side1, $side2) {
        # split ' ' discards leading and trailing whitespace, so a trailing
        # newline on the last field never counts as a token.
        my @tokens = split(' ', $side);
        return if @tokens == 0 || @tokens > $max_tokens;
    }

    return 1;
}