| #!/usr/bin/perl -w |
| |
| # <@LICENSE> |
| # Licensed to the Apache Software Foundation (ASF) under one or more |
| # contributor license agreements. See the NOTICE file distributed with |
| # this work for additional information regarding copyright ownership. |
| # The ASF licenses this file to you under the Apache License, Version 2.0 |
| # (the "License"); you may not use this file except in compliance with |
| # the License. You may obtain a copy of the License at: |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, software |
| # distributed under the License is distributed on an "AS IS" BASIS, |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| # See the License for the specific language governing permissions and |
| # limitations under the License. |
| # </@LICENSE> |
| |
| use strict; |
| use warnings; |
| |
# Command-line switches, filled in by GetOptions below:
#   $opt_a - also print every pair a second time with A and B swapped
#   $opt_t - drop tests whose names start with "T_" before counting
our ($opt_a, $opt_t);
use Getopt::Long qw(:config auto_help bundling);

# GetOptions returns false when the command line contains something it does
# not recognise (it has already written a diagnostic to STDERR by then).
# Stop here rather than carry on with a partially parsed command line.
GetOptions(
    'a|all'    => \$opt_a,
    't|ignore' => \$opt_t,
) or die "Error in command line arguments\n";
| |
| =head1 NAME |
| |
| overlap - Tool to help determine which tests overlap significantly |
| |
| =head1 SYNOPSIS |
| |
 overlap [options] [log file ...]

 Options:
   -a,--all     Show all entries (including reverses of pairs)
   -t,--ignore  Ignore T_ tests (rules under testing)
| |
| =head1 DESCRIPTION |
| |
B<overlap> reads the mass-check results logs named on the command line
(or standard input if none is given) and outputs pairs of tests together
with how frequently they hit together, both in absolute terms and
relative to their individual hit rates.
| |
| The output is of the form: |
| |
  COUNT   PAIR/A  PAIR/B  A,B
| |
where C<COUNT> is the number of times the tests hit on the same
message, C<PAIR/A> is the ratio of times that both tests hit to the
number of times test A hits, C<PAIR/B> is the ratio of pair hits to B
hits, and the C<A,B> column shows the names of the two tests.  Unless
B<-a> is given, each pair is listed once, with the test that has the
higher ratio shown as A.
| |
| Do not abuse this tool. Just because a test highly correlates with |
| another test does not mean you can simply remove one or merge them |
| without further consideration. You need to also look at hit rates, |
| false positives, false negatives, and actually compare the tests. |
| Some overlap is often good, especially if the tests have different |
| characteristics. |
| |
| =cut |
| |
| |
# Name of this script without its directory part; used in error messages.
my $prog = $0;
$prog =~ s@.*/@@;

# With no file arguments, fall back to "-" (standard input).
if (!@ARGV) {
    push(@ARGV, "-");
}

# Both tables are filled in by read_file().
my %solo;    # test name       => number of times the test was listed
my %pair;    # "NAME_A NAME_B" => number of times both were listed together

foreach my $file (@ARGV) {
    read_file($file);
}

print "COUNT\tPAIR/A\tPAIR/B\tA,B\n";

# One report row: pair count, the two ratios, then the two test names.
my $row_format = "%d\t%.3f\t%.3f\t%s,%s\n";

# Most frequent pairs first.  Pairs with the same count are ordered by key so
# the report is identical from run to run instead of following hash order.
# The sort block is the only place $a/$b are used, so they are the sort
# specials and not shadowed by lexicals.
foreach my $key (sort { $pair{$b} <=> $pair{$a} || $a cmp $b } keys %pair) {
    my ($first, $second) = split(/ /, $key);
    my $together     = $pair{$key};
    my $first_ratio  = $together / $solo{$first};
    my $second_ratio = $together / $solo{$second};

    if ($opt_a) {
        # -a: show the pair in both orientations.
        printf($row_format, $together, $first_ratio, $second_ratio, $first, $second);
        printf($row_format, $together, $second_ratio, $first_ratio, $second, $first);
    }
    elsif ($first_ratio > $second_ratio
        || ($first_ratio == $second_ratio && $first lt $second))
    {
        # Show the test with the higher ratio as A; on equal ratios the name
        # that sorts first goes first.
        printf($row_format, $together, $first_ratio, $second_ratio, $first, $second);
    }
    else {
        printf($row_format, $together, $second_ratio, $first_ratio, $second, $first);
    }
}
| |
# read_file($input)
#
# Reads one results log and adds its hits to the file-level tables
# %solo (count per test name) and %pair (count per "A B" key).
#
#   $input - path of the log to read; "-" means standard input.
#
# Lines starting with "#" are skipped.  Every other line has to match
#   <Y or .> <integer> <non-blank field> <comma-separated test names> ...
# When $opt_t is set, names beginning with "T_" are dropped before counting.
#
# Dies if the file cannot be opened or if a line does not match that layout.
sub read_file {
    my ($input) = @_;

    my $from_stdin = ($input eq '-');
    my $fh;
    if ($from_stdin) {
        # Keep "-" meaning standard input, as the two-argument open did.
        $fh = \*STDIN;
    }
    else {
        # Three-argument open: the name is only ever a file to read, never a
        # mode prefix or a command to run.
        open($fh, '<', $input) or die "open failed: $input: $!";
    }

    # Read into a lexical so the caller's $_ is left alone.
    while (my $record = <$fh>) {
        next if $record =~ /^#/;
        if ($record =~ /^[Y.]\s+-?\d+\s+\S+\s+(\S+)/) {
            my @tests = split(/,/, $1);
            @tests = grep { !/^T_/ } @tests if $opt_t;
            for my $i (0 .. $#tests) {
                my $name = $tests[$i];
                $solo{$name}++;
                # Pair each test only with the ones listed after it, so a
                # given pair is counted once per line and not once per order.
                $pair{"$name $_"}++ for @tests[$i + 1 .. $#tests];
            }
        }
        else {
            die "$prog: error in input format in $input\n";
        }
    }

    # Standard input is not ours to close.
    close($fh) unless $from_stdin;
}