blob: fd26e6fcb70fdb3aeaa0644a3bd7f70eeb8303d8 [file] [log] [blame]
#!/usr/bin/perl -w
#
# freqdiff - print frequency difference between two inputs
#
# <@LICENSE>
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to you under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at:
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# </@LICENSE>
# Option flags; each $opt_X is set by getopts() below when -X is given
# ($opt_l receives the argument of -l).
our ($opt_a, $opt_b, $opt_c, $opt_d, $opt_h, $opt_p, $opt_r, $opt_l);
use Getopt::Std;

# Program name with any leading directory stripped; used in the usage text
# and in error messages.  It is computed before the options are parsed so
# that usage() can print it when parsing fails.
my $prog = $0;
$prog =~ s@.*/@@;

# getopts() returns false when it meets an invalid option (it has already
# reported the option on STDERR); stop with the usage text instead of
# silently continuing with a partially understood command line.
getopts("abcdhprl:") or usage(1);
# usage($status)
#
# Print the command-line help and terminate the program with exit code
# $status.  The text is written to STDOUT when $status is false (help was
# requested) and to STDERR otherwise.  This sub never returns.
sub usage {
    my ($exit_code) = @_;

    # Pick the output stream from the exit code.
    my $handle = \*STDOUT;
    $handle = \*STDERR if $exit_code;

    print {$handle} <<EOF;
usage: $prog [options] [first input] [second input]
$prog [options] [first input] [second input] [scores file]
-a show all elements (only used for -d)
-b show both count and percentage
-c remove comments from inputs
-d print delta of frequencies (second - first)
-h print this help
-p print percentage of frequencies ((first / (first + second)) * 100.0)
-r use relative frequencies (account for size of inputs, useful for -p)
-l N only read first N lines
default is -d if input line counts are within 1% of each other and -p otherwise
EOF

    exit($exit_code);
}
# ---------------------------------------------------------------------------
# Main program: read both inputs, combine their frequencies per element and
# print one line per element, sorted by the computed value (descending).
# ---------------------------------------------------------------------------

usage(0) if $opt_h;
usage(1) unless @ARGV >= 2;     # at least two input files are required

# State shared with read_argv(): $line[N] is the number of lines counted in
# input N, $type is the numeric format code of the data read so far.  Both
# must stay file-scoped lexicals because read_argv() refers to them.
my @line;
my $type;

my %one = read_argv(0);
my %two = read_argv(1);

# Two empty inputs leave $type unset; use 0 (matches no format) so the
# numeric comparisons below do not warn about an uninitialized value.
$type = 0 unless defined $type;

# Default mode: delta (-d) when the inputs' line counts are within 1% of
# each other, percentage otherwise.  An empty second input must not be used
# as a divisor; in that case the counts only count as "close" when the first
# input is empty as well.
if (!$opt_p) {
    my $gap = abs($line[0] - $line[1]);
    my $similar = $line[1] ? ($gap / $line[1] < 0.01) : ($gap == 0);
    $opt_d = 1 if $similar;
}
$opt_d = 1 if ($type == 3);     # "scores" inputs are always compared by delta
$opt_a = 1 if ($type == 4);     # line-number inputs always show every element

# Optional third argument: a scores file.  Lines of the form
# "score NAME VALUE" (after '#' comments are removed) supply the label that
# is printed in front of each element name.
my %score;
if (@ARGV > 2) {
    open(my $scores, '<', $ARGV[2]) or die "open failed: $ARGV[2]: $!";
    while (my $text = <$scores>) {
        chomp $text;
        $text =~ s/#.*//;
        my @field = split(' ', $text);
        if (@field >= 3 && $field[0] eq "score") {
            $score{$field[1]} = $field[2];
        }
    }
    close($scores);
}

# Union of the element names found in either input.
my @all = keys %one;
push(@all, grep { !exists $one{$_} } keys %two);

my %out;        # element => delta or percentage
my %count;      # element => combined count (only filled for -b without -d)
foreach my $elem (@all) {
    # An element missing from one input counts as 0 there, except for
    # "scores" inputs where a missing score is taken to be 1.0.
    my $missing = ($type == 3) ? 1.0 : 0;
    my $one = exists $one{$elem} ? $one{$elem} : $missing;
    my $two = exists $two{$elem} ? $two{$elem} : $missing;

    if ($opt_d) {
        $out{$elem} = $two - $one;
    }
    else {
        my $total = $one + $two;
        $count{$elem} = $total if $opt_b;
        # A zero total (e.g. both counts are 0) has no meaningful
        # percentage; report 0 rather than dying on a division by zero.
        $out{$elem} = $total ? $one / $total * 100.0 : 0;
    }
}

# Sort by value (descending), then by combined count (descending) when
# counts were collected, then by name.  The emptiness test is done once here;
# the old "defined %count" form is a fatal error on Perl 5.22 and later.
my $have_counts = %count ? 1 : 0;
my @ordered = sort {
    $out{$b} <=> $out{$a}
        || ($have_counts && ($count{$b} <=> $count{$a}))
        || $a cmp $b
} keys %out;

foreach my $elem (@ordered) {
    my $name = $elem;

    # With a scores file, prefix the name with its score; elements the file
    # does not mention are labelled with the default "1.0".
    if (%score) {
        if (exists($score{$elem})) {
            $name = "$score{$elem}\t$name";
        }
        else {
            $name = "1.0\t$name";
        }
    }

    if ($type == 3) {
        printf "%.3f\t%s\n", $out{$elem}, $name;
    }
    elsif ($opt_d) {
        # Unchanged elements (delta 0) are only shown with -a.
        print "$out{$elem}\t$name\n" if ($out{$elem} || $opt_a);
    }
    else {
        if ($opt_b) {
            printf "%.2f\t%d\t%s\n", $out{$elem}, $count{$elem}, $name;
        }
        else {
            printf "%.2f\t%s\n", $out{$elem}, $name;
        }
    }
}
# read_argv($input)
#
# Read the file named by $ARGV[$input] ("-" means standard input) and return
# a hash mapping each element to its frequency.
#
# Every line must be in the same one of four formats, recorded in the shared
# $type variable:
#   1  "sort | uniq -c" output: a (possibly fractional) number, then the element
#   2  "mass-check" log line: the fourth field is a comma-separated rule
#      list, each rule optionally followed by "(hitcount)"
#   3  "scores" line: "score NAME VALUE"
#   4  anything else: the whole line is the element and its position in the
#      file determines its frequency (earlier lines get higher values)
#
# Side effects: sets $type and stores the number of lines counted in
# $line[$input].  Honours -c (strip '#' comments and skip blank lines),
# -l N (ignore lines after the Nth) and -r (divide frequencies by the line
# count).
#
# Dies when the file cannot be opened or when the format changes between
# lines (including between the first and the second input, because $type is
# carried over from one call to the next).
sub read_argv {
    my ($input) = @_;
    my $path = $ARGV[$input];
    my %freq;
    my $last = 0;

    # Three-argument open on a lexical handle: the file name is never
    # interpreted as an open mode or a command.  "-" keeps its meaning of
    # standard input.
    my $fh;
    if ($path eq '-') {
        open($fh, '<&', \*STDIN) or die "open failed: $path: $!";
    }
    else {
        open($fh, '<', $path) or die "open failed: $path: $!";
    }

    my $line = 0;
    while (my $text = <$fh>) {
        if ($opt_c) {
            $text =~ s/#.*//;
            next unless $text =~ /\S/;
        }
        next if ($text =~ /^OVERALL/);  # hit-frequencies header line

        # Lines past the -l limit are skipped but still counted, so $line
        # ends up as the number of lines in the whole input.
        # NOTE(review): confirm that counting the skipped lines is intended.
        $line++;
        next if ($opt_l && $line > $opt_l);

        $last = $type;

        # "sort | uniq -c" format
        if ($text =~ /^\s*(-?\d+|-?\d+\.\d+)\s+(.*)/) {
            $type = 1;
            $freq{$2} = $1;
        }
        # "mass-check" format
        elsif ($text =~ /^[Y.]\s+-?\d+\s+\S+\s+(\S+)/) {
            my $test_str = $1;
            $type = 2;
            foreach my $r (split(/,/, $test_str)) {
                my $hits = 1;
                # Support compacted RULE(hitcount) format
                if ($r =~ s/\((\d+)\)$//) {
                    $hits = $1;
                }
                $freq{$r} += $hits;
            }
        }
        # "scores" format
        elsif ($text =~ /^score\s+(\S+)\s+(-?[\d.]+)/) {
            $type = 3;
            $freq{$1} = $2;
        }
        # line number is frequency
        else {
            $type = 4;
            chomp $text;
            $freq{$text} = $line;
        }

        if ($last && $last != $type) {
            die "$prog: inconsistent format in $ARGV[$input] (format $last then format $type)\n";
        }
    }
    close($fh);

    foreach my $key (keys %freq) {
        # Turn the stored line number into a rank: the first line gets the
        # highest value, the last counted line gets 1.
        if ($type == 4) {
            $freq{$key} = $line - $freq{$key} + 1;
        }
        if ($opt_r) {
            $freq{$key} /= $line;
        }
    }

    $line[$input] = $line;
    return %freq;
}