| /* This program uses a genetic algorithm to optimize a phrase-based rule such as |
| * the NIGERIAN or ADVANCE_FEE rule. |
| * |
| * <@LICENSE> |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to you under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at: |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| * </@LICENSE> |
| */ |
| |
| #include <stdio.h> |
| #include <stdlib.h> |
| #include <assert.h> |
| #include <time.h> |
| #include <unistd.h> |
| #include <strings.h> |
| |
| /* GAUL: Genetic Algorithm Utility Library. http://gaul.sourceforge.net/ */ |
| #include <gaul.h> |
| |
| /* Config files */ |
| char * hits_file = "hits.dat"; /* The data file containing the matrix. */ |
| char * rules_file = "rules.dat"; /* The data file containing rule names. */ |
| |
| /* Fitness function parameters */ |
| int maximum_relevant_hits = 4; /* How many hits is the rule going to look for. */ |
| int target_num_rules = 50; /* How many sub-rules would we like the meta rule to use? */ |
| double target_flex_rules = 5; /* How flexible the GA should be. Half-life of the fitness function. */ |
| |
| /* The fitness function is based on: |
| * min(num_hits, maximum_relevant_hits)^hits_exponent * count * ... |
| * (if ham: -num_hits ^ penalty_exponent) */ |
| double hits_exponent = 3.0; |
| double penalty_exponent = 9.0; |
| |
| /* GA parameters */ |
| /* The number of individuals in the population */ |
| int population_size = 100; |
| /* The maximal number of generations that the simluation is to run for. */ |
| int max_generations = 10000; |
| /* The probability of a cross-over. */ |
| double crossover_prob = 1.0; |
| /* The probability of one boolean allele being switched. */ |
| double mutation_prob = 0.1; |
| |
| int num_rules; /* The number of rules being optimized. */ |
| int max_hits; /* The maximal number of hits in a unique pattern. */ |
| int num_patterns; /* The number of unique patterns. */ |
| int * pattern_data; /* A compressed matrix containing the patterns. */ |
| int * pattern_size_data; /* The width of each row in the above matrix. */ |
| int * pattern_count_data; /* The number of occurrences of the pattern. */ |
| int * class_data; /* The class that each pattern belongs to (ham, spam). */ |
| |
| /* Some #defines for fast access to the compressed matrix. */ |
| #define pattern(i,j) (pattern_data[(i)*max_hits + (j)]) |
| #define pattern_size(i) (pattern_size_data[(i)]) |
| #define pattern_count(i) (pattern_count_data[(i)]) |
| #define class(i) (class_data[(i)]) |
| |
| #define max(x,y) ((x)>(y)?(x):(y)) |
| #define min(x,y) ((x)<(y)?(x):(y)) |
| |
| /* Loads the compressed matrix into memory. */ |
| void load_patterns () { |
| FILE * pfile; |
| int p; |
| |
| pfile = fopen(hits_file, "r"); |
| if ( ! pfile ) { |
| perror (hits_file); |
| exit(1); |
| } |
| |
| if ( fscanf (pfile, "%d %d %d", &num_rules, &max_hits, &num_patterns) != 3 ) { |
| fprintf (stderr, "%s missing header.\n", hits_file); |
| exit (1); |
| } |
| |
| assert(pattern_data = (int*)malloc(sizeof(int) * max_hits * num_patterns)); |
| assert(pattern_size_data = (int*)malloc(sizeof(int) * num_patterns)); |
| assert(pattern_count_data = (int*)malloc(sizeof(int) * num_patterns)); |
| assert(class_data = (int*)malloc(sizeof(int) * num_patterns)); |
| |
| for (p = 0; p < num_patterns; p++) { |
| int i; |
| |
| if (fscanf(pfile, "%d %d %d", class_data+p, pattern_count_data+p, pattern_size_data+p) != 3) { |
| fprintf (stderr, "%s truncated (entry %d)\n", hits_file, p); |
| exit (1); |
| } |
| assert(pattern_size_data[p] <= max_hits); |
| for (i = 0; i < pattern_size(p); i++) { |
| if (fscanf(pfile, "%d", pattern_data+p*max_hits+i) != 1) { |
| fprintf (stderr, "%s truncated (entry %d)\n", hits_file, p); |
| exit(1); |
| } |
| } |
| } |
| } |
| |
| /* The fitness function is: |
| * sum_{all individuals} |
| * min(num_hits, maximum_relevant_hits)^hits_exponent * count * ... |
| * (if ham: -num_hits ^ penalty_exponent) |
| * / exp(abs(target_num_rules - num_rules_present) * log(2) / target_flex_rules) |
| * */ |
| static boolean pattern_score(population *pop, entity *entity) { |
| int i, j, num_hits, num_rules_present; |
| |
| entity->fitness = 0; |
| |
| /* Count up the number of rules present in this individual's |
| * chromosome. */ |
| num_rules_present = 0; |
| for (i = 0; i < num_rules; i++) { |
| if (((boolean*)entity->chromosome[0])[i]) { |
| num_rules_present++; |
| } |
| } |
| |
| /* An individual with no rules present in its chromosome has 0 fitness, |
| * so we take a short cut. */ |
| if (num_rules_present == 0) { |
| return true; |
| } |
| |
| /* Compute the fitness function as described above. */ |
| for (i = 0; i < num_patterns; i++) { |
| num_hits = 0; |
| /* This counts how many rules in the chromosome also hit the |
| * pattern */ |
| for (j = 0; j < pattern_size(i); j++) { |
| if (((boolean*)entity->chromosome[0])[pattern(i,j)]) { |
| num_hits++; |
| } |
| } |
| |
| /* See above for a description of what this does and why it |
| * does it. */ |
| entity->fitness += pow(min(num_hits,maximum_relevant_hits),hits_exponent) * pattern_count(i) * (class(i) ? 1 : -pow(num_hits,penalty_exponent)); |
| } |
| |
| /* This divisor is bound to 1, to prevent overflow. exp(0) is undefined |
| * so we just skip this part (it's unnecessary). */ |
| if ( target_num_rules - num_rules_present ) { |
| entity->fitness /= max(exp(fabs(target_num_rules - num_rules_present) * log(2) / target_flex_rules),1); |
| } |
| |
| /* Negative fitnesses make roulette wheels go owwie. */ |
| if ( entity->fitness < 0 ) { |
| entity->fitness = 0; |
| } |
| |
| return true; |
| } |
| |
| /* This is to print out the final result. */ |
| void print_entity (entity * entity) { |
| FILE * rfile; |
| char buf[BUFSIZ]; |
| int i; |
| int count = 0; |
| int histogram[2][maximum_relevant_hits+1], num_hits; |
| |
| assert(rfile = fopen (rules_file, "r")); |
| |
| /* The rules in rules.dat are supposed to correspond to column |
| * numbers. */ |
| for (i = 0; i < num_rules; i++) { |
| if ( ! fgets(buf, BUFSIZ, rfile) ) { |
| perror ("fgets"); |
| exit (1); |
| } |
| |
| /* Print out the rule name if the corresponding allele is |
| * present on the chromosome. */ |
| if ( ((boolean *)entity->chromosome[0])[i] == 1 ) { |
| count++; |
| printf ("%s", buf); |
| } |
| } |
| |
| fprintf (stderr, "fitness: %f\n", entity->fitness); |
| fprintf (stderr, "rule count: %d\n", count); |
| |
| /* Zero the histogram, just in case the compiler han't done it for us. */ |
| bzero (histogram, sizeof(histogram)); |
| |
| /* Compute the histogram by scanning through the training data. */ |
| for (i = 0; i < num_patterns; i++) { |
| int j; |
| |
| num_hits = 0; |
| for (j = 0; j < pattern_size(i); j++) { |
| if (((boolean*)entity->chromosome[0])[pattern(i,j)]) { |
| num_hits++; |
| if ( num_hits == maximum_relevant_hits ) { |
| break; |
| } |
| } |
| } |
| for (j = 0; j <= num_hits; j++) { |
| histogram[class(i)][j] += pattern_count(i); |
| } |
| } |
| |
| /* Print the histogram. */ |
| fprintf (stderr, "\t %8s %8s %8s %8s %8s\n", |
| "HAM", |
| "HAM%", |
| "SPAM", |
| "SPAM%", |
| "S/O"); |
| for (i = 0; i <= maximum_relevant_hits; i++) { |
| fprintf (stderr, ">=%d hits:%8d %8.4f %8d %8.4f %8.4f\n", i, |
| histogram[0][i], |
| 100.0 * histogram[0][i] / histogram[0][0], |
| histogram[1][i], |
| 100.0 * histogram[1][i] / histogram[1][0], |
| ((double)histogram[1][i] / histogram[1][0]) / ((double)histogram[1][i] / histogram[1][0] + (double)histogram[0][i] / histogram[0][0])); |
| } |
| } |
| |
| void usage () { |
| printf ("usage: evolve_metarule [args]\n" |
| "\n" |
| "Config parameters:\n" |
| " -h hits_file\n" |
| " -r rules_fule\n" |
| "\nFitness function parameters:\n" |
| " -m maximum_relevant_hits\n" |
| " -t target_num_rules\n" |
| " -l target_flex_rules\n" |
| " -e hits_exponent\n" |
| " -p penalty_exponent\n" |
| "\nGA parameters:\n" |
| " -s population_size\n" |
| " -g max_generations\n" |
| " -x crossover_prob\n" |
| " -u mutation_prob\n" |
| "\n -? = print this help\n" |
| "\n"); |
| |
| exit(0); |
| } |
| |
| int main (int argc, char ** argv) { |
| population *pop = 0; |
| char arg; |
| |
| while ((arg = getopt (argc, argv, "h:r:m:t:l:e:p:s:g:x:u:?")) != -1) { |
| switch (arg) { |
| case 'h': |
| hits_file = optarg; |
| break; |
| case 'r': |
| rules_file = optarg; |
| break; |
| case 'm': |
| maximum_relevant_hits = atoi(optarg); |
| break; |
| case 't': |
| target_num_rules = atoi(optarg); |
| break; |
| case 'l': |
| target_flex_rules = atoi(optarg); |
| break; |
| case 'e': |
| hits_exponent = atof(optarg); |
| break; |
| case 'p': |
| penalty_exponent = atof(optarg); |
| break; |
| case 's': |
| population_size = atoi(optarg); |
| break; |
| case 'g': |
| max_generations = atoi(optarg); |
| break; |
| case 'x': |
| crossover_prob = atof(optarg); |
| break; |
| case 'u': |
| mutation_prob = atof(optarg); |
| break; |
| case '?': |
| usage (); |
| } |
| } |
| |
| load_patterns(); |
| |
| random_init(); |
| random_seed(time(0)); |
| |
| pop = ga_genesis_boolean( |
| population_size, /* const int population_size */ |
| 1, /* const int num_chromo */ |
| num_rules, /* const int len_chromo */ |
| NULL, /* GAgeneration_hook generation_hook */ |
| NULL, /* GAiteration_hook iteration_hook */ |
| NULL, /* GAdata_destructor data_destructor */ |
| NULL, /* GAdata_ref_incrementor data_ref_incrementor */ |
| pattern_score, /* GAevaluate evaluate */ |
| ga_seed_boolean_random, /* GAseed seed */ |
| NULL, /* GAadapt adapt */ |
| ga_select_one_roulette, /* GAselect_one select_one */ |
| ga_select_two_roulette, /* GAselect_two select_two */ |
| ga_mutate_boolean_singlepoint, /* GAmutate mutate */ |
| ga_crossover_boolean_allele_mixing, /* GAcrossover crossover */ |
| ga_replace_by_fitness, /* GAreplace replace */ |
| NULL /* vpointer User data */ |
| ); |
| |
| ga_population_set_parameters( |
| pop, /* population *pop */ |
| GA_SCHEME_DARWIN, /* const ga_scheme_type scheme */ |
| GA_ELITISM_NULL, /* const ga_elitism_type elitism */ |
| crossover_prob, /* double crossover */ |
| mutation_prob, /* double mutation */ |
| 0.0 /* double migration */ |
| ); |
| |
| ga_evolution_steady_state( |
| pop, /* population *pop */ |
| max_generations /* const int max_generations */ |
| ); |
| |
| print_entity(ga_get_entity_from_rank(pop, 0)); |
| |
| return 0; |
| } |