| #!/usr/bin/perl -w |
| # |
| # freqdiff - print frequency difference between two inputs |
| # |
| # <@LICENSE> |
| # Licensed to the Apache Software Foundation (ASF) under one or more |
| # contributor license agreements. See the NOTICE file distributed with |
| # this work for additional information regarding copyright ownership. |
| # The ASF licenses this file to you under the Apache License, Version 2.0 |
| # (the "License"); you may not use this file except in compliance with |
| # the License. You may obtain a copy of the License at: |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, software |
| # distributed under the License is distributed on an "AS IS" BASIS, |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| # See the License for the specific language governing permissions and |
| # limitations under the License. |
| # </@LICENSE> |
| |
use Getopt::Std;

# Command-line switches.  getopts() stores each switch it finds in the
# package variable $opt_<letter>, so these are declared with "our".
our ($opt_a, $opt_b, $opt_c, $opt_d, $opt_h, $opt_p, $opt_r, $opt_l);

# Program name with any leading directory removed, used in messages.  It is
# computed before the options are parsed so that usage() can print it when
# parsing fails.
my $prog = $0;
$prog =~ s{.*/}{};

# getopts() returns false after complaining about an unknown switch or a
# missing argument to -l.  Show the usage text and exit non-zero in that
# case instead of carrying on with a partially parsed command line.
getopts("abcdhprl:") or usage(1);
| |
# usage($status)
#
# Print the command-line synopsis and option summary, then terminate.
#
#   $status - exit code for the process.  A true value sends the text to
#             STDERR; a false value sends it to STDOUT.
#
# Never returns: the process exits with $status.
sub usage {
    my ($status) = @_;

    # Requested help goes to STDOUT, usage errors go to STDERR.
    my $out = \*STDOUT;
    $out = \*STDERR if $status;

    print {$out} <<EOF;
usage: $prog [options] [first input] [second input]
$prog [options] [first input] [second input] [scores file]

-a show all elements (only used for -d)
-b show both count and percentage
-c remove comments from inputs
-d print delta of frequencies (second - first)
-h print this help
-p print percentage of frequencies ((first / (first + second)) * 100.0)
-r use relative frequencies (account for size of inputs, useful for -p)
-l N only read first N lines

default is -d if input line counts are within 1% of each other and -p otherwise
EOF

    exit($status);
}
| |
# -h asks for the help text explicitly; fewer than two inputs is an error.
usage(0) if $opt_h;
usage(1) unless @ARGV >= 2;

# File-scoped state filled in by read_argv():
#   @line - number of lines counted in each input, indexed by argument position
#   $type - numeric code (1-4) of the input format recognised most recently
my @line;
my $type;
my %one = read_argv(0);
my %two = read_argv(1);

# When both inputs are empty no line ever sets the format.  Use 0 ("no
# format") so the numeric comparisons below do not operate on undef.
$type = 0 unless defined $type;

# Default mode selection: unless -p was requested, switch to delta output
# when the two line counts differ by less than 1% of the second count.
if (!$opt_p) {
    my $gap = abs($line[0] - $line[1]);

    # An empty second input would make the ratio a division by zero; the
    # counts are then only "close" if the first input is empty as well.
    my $is_close = $line[1] ? ($gap / $line[1] < 0.01) : ($gap == 0);
    $opt_d = 1 if $is_close;
}

# Format 3 ("scores") is always reported as deltas; format 4 (line number
# is frequency) always shows every element.
$opt_d = 1 if ($type == 3);
$opt_a = 1 if ($type == 4);
| |
# Optional third argument: a scores file.  After "#" comments are removed,
# every line of the form "score NAME VALUE" is recorded as
# $score{NAME} = VALUE; all other lines are ignored.
my %score;
if (@ARGV > 2) {
    my $scores_file = $ARGV[2];

    # Three-argument open with a lexical handle: the file name is taken
    # literally (no mode characters are parsed out of it) and the failure
    # message reports the system error.
    open(my $scores_fh, '<', $scores_file)
        or die "open failed: $scores_file: $!";

    while (my $entry = <$scores_fh>) {
        $entry =~ s/#.*//;

        # split(' ') discards leading whitespace and the trailing newline.
        my @field = split(' ', $entry);
        next unless @field >= 3 && $field[0] eq "score";
        $score{$field[1]} = $field[2];
    }
    close($scores_fh);
}
| |
# Every element name that occurs in either input: all keys of the first,
# followed by the keys that only the second one has.
my @all = keys %one;
push @all, grep { !exists $one{$_} } keys %two;

# %out   - value reported for each element (delta or percentage)
# %count - combined frequency of each element; filled only for -b in
#          percentage mode
my %out;
my %count;
foreach my $elem (@all) {
    # In the "scores" format (type 3) an element missing from one input
    # counts as 1.0 there; in every other format it counts as 0.
    my $missing = ($type == 3) ? 1.0 : 0;

    # Entries are removed from the input hashes as they are consumed.
    my $one = exists $one{$elem} ? delete $one{$elem} : $missing;
    my $two = exists $two{$elem} ? delete $two{$elem} : $missing;

    if ($opt_d) {
        $out{$elem} = $two - $one;
    }
    else {
        my $total = $one + $two;
        $count{$elem} = $total if $opt_b;

        # An element whose frequencies sum to zero has no meaningful
        # share; report 0 instead of dividing by zero.
        $out{$elem} = $total ? $one / $total * 100.0 : 0;
    }
}
| |
# Report order: largest value first; ties are broken by the larger combined
# count when any counts were collected (-b), and finally by element name.
# The hash is tested in boolean context because defined(%hash) is a fatal
# error on Perl 5.22 and later.
my $have_counts = %count ? 1 : 0;
my @ordered = sort {
    $out{$b} <=> $out{$a}
        || ($have_counts && ($count{$b} <=> $count{$a}))
        || $a cmp $b
} keys %out;

foreach my $elem (@ordered) {
    my $name = $elem;

    # When a scores file was loaded, prefix each name with its score;
    # elements without an entry are shown with "1.0".
    if (%score) {
        my $weight = exists $score{$elem} ? $score{$elem} : "1.0";
        $name = "$weight\t$name";
    }

    if ($type == 3) {
        # "scores" input: value with three decimals.
        printf "%.3f\t%s\n", $out{$elem}, $name;
    }
    elsif ($opt_d) {
        # Delta mode: zero deltas are hidden unless -a is in effect.
        print "$out{$elem}\t$name\n" if ($out{$elem} || $opt_a);
    }
    elsif ($opt_b) {
        # Percentage mode with -b: percentage, combined count, name.
        printf "%.2f\t%d\t%s\n", $out{$elem}, $count{$elem}, $name;
    }
    else {
        # Percentage mode: percentage and name.
        printf "%.2f\t%s\n", $out{$elem}, $name;
    }
}
| |
# read_argv($input)
#
# Read the input named by $ARGV[$input] and return a hash that maps each
# element found to its frequency.  The name "-" means standard input.
#
# Each counted line must be in one of four formats; the code of the format
# seen is stored in the file-level $type:
#   1 - "sort | uniq -c" output: a number followed by the element
#   2 - "mass-check" output: the 4th field is a comma-separated element list
#   3 - "scores" lines: "score NAME VALUE"
#   4 - anything else: the whole line is the element and its position in
#       the file determines its frequency
#
# Options honoured: -c strips "#" comments and skips lines left blank,
# -l N stops after N counted lines, -r divides every frequency by the
# number of counted lines.  Lines starting with "OVERALL" are never counted.
#
# Side effects: sets $type and stores the number of counted lines in
# $line[$input].
#
# Dies if the input cannot be opened, or if a line's format differs from
# the format recorded in $type by the previous line (including the last
# line of an input read earlier).
sub read_argv {
    my ($input) = @_;
    my $path = $ARGV[$input];
    my %freq;

    # Three-argument open takes the name literally, so "-" is mapped to
    # standard input explicitly.
    my $fh;
    if ($path eq '-') {
        $fh = \*STDIN;
    }
    else {
        open($fh, '<', $path) or die "open failed: $path: $!";
    }

    my $line = 0;
    while (my $text = <$fh>) {
        if ($opt_c) {
            $text =~ s/#.*//;
            next unless $text =~ /\S/;
        }
        next if $text =~ /^OVERALL/;    # hit-frequencies header line

        # Stop once the -l limit is reached so that $line is the number of
        # lines actually used, not the length of the whole file.
        last if $opt_l && $line >= $opt_l;
        $line++;

        my $previous = $type;

        # "sort | uniq -c" format
        if ($text =~ /^\s*(-?\d+|-?\d+\.\d+)\s+(.*)/) {
            $type = 1;
            $freq{$2} = $1;
        }
        # "mass-check" format
        elsif ($text =~ /^[Y.]\s+-?\d+\s+\S+\s+(\S+)/) {
            $type = 2;
            $freq{$_}++ for split(/,/, $1);
        }
        # "scores" format
        elsif ($text =~ /^score\s+(\S+)\s+(-?[\d.]+)/) {
            $type = 3;
            $freq{$1} = $2;
        }
        # line number is frequency
        else {
            $type = 4;
            chomp $text;
            $freq{$text} = $line;
        }

        if ($previous && $previous != $type) {
            die "$prog: inconsistent format in $ARGV[$input] (format $previous then format $type)\n";
        }
    }
    close($fh) unless $path eq '-';

    foreach my $key (keys %freq) {
        # Format 4 stored the line number; invert it so that the first
        # line receives the highest frequency.
        $freq{$key} = $line - $freq{$key} + 1 if $type == 4;

        # -r: express the frequency relative to the number of lines read.
        $freq{$key} /= $line if $opt_r;
    }

    $line[$input] = $line;
    return %freq;
}