scripts/training/build-vocab.pl - joshua - Git at Google

 #!/usr/bin/perl

 # Takes a corpus of words on STDIN and builds a vocabulary with word
 # counts, writing them to STDOUT in the format
 #
 # ID WORD COUNT

 use utf8;
 use warnings;
 use strict;

 binmode(STDIN,  ":utf8");
 binmode(STDOUT, ":utf8");

 my %count;
 while (my $line = <>) {
   chomp($line);
   my @tokens = split(' ', $line);
   map { $count{$_}++ } @tokens;
 }

 my $id = 1;
 map { print $id++ . " $_ $count{$_}\n" } (sort { $count{$b} <=> $count{$a} } keys %count);
	#!/usr/bin/perl

	# Takes a corpus of words on STDIN and builds a vocabulary with word
	# counts, writing them to STDOUT in the format
	#
	# ID WORD COUNT

	use utf8;
	use warnings;
	use strict;

	binmode(STDIN, ":utf8");
	binmode(STDOUT, ":utf8");

	my %count;
	while (my $line = <>) {
	chomp($line);
	my @tokens = split(' ', $line);
	map { $count{$_}++ } @tokens;
	}

	my $id = 1;
	map { print $id++ . " $_ $count{$_}\n" } (sort { $count{$b} <=> $count{$a} } keys %count);