| /* This program uses stochastic gradient descent to learn a scoreset for |
| * SpamAssassin. Run logs-to-c from spamassassin/masses first to generate |
| * the data files included from tmp/. |
| * |
| * <@LICENSE> |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to you under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at: |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| * </@LICENSE> |
| */ |
| |
| #include <stdio.h> |
| #include <stdlib.h> |
| #include <assert.h> |
| #include <sys/time.h> |
| #include <math.h> |
| #include <unistd.h> |
| |
| #include "tmp/scores.h" |
| #include "tmp/tests.h" |
| |
| /* Ensure that multiple error functions have not been chosen. */ |
| #ifdef ENTROPIC_ERROR |
| #ifdef LEAST_SQUARES_ERROR |
| #warning Both entropic error and least squares error chosen. Using least squares. |
| #endif |
| #endif |
| |
| /* Choose the least squares error function by default. */ |
| #ifndef ENTROPIC_ERROR |
| #ifndef LEAST_SQUARES_ERROR |
| #define LEAST_SQUARES_ERROR |
| #endif |
| #endif |
| |
| #define OUTPUT_FILE "perceptron.scores" |
| /* #define IGNORE_SCORE_RANGES 1 */ |
| |
| void init_wheel (); |
| void destroy_wheel (); |
| int get_random_test (); |
| void init_weights(); |
| void destroy_weights (); |
| void write_weights (FILE * fp); |
| void scale_scores (double old_threshold, double new_threshold); |
| double evaluate_test (int test); |
| double evaluate_test_nogain (int test); |
| void train (int num_epochs, double learning_rate); |
| void usage (); |
| |
| /* Convert between perceptron weight space and SpamAssassin score space. */ |
| #define weight_to_score(x) (-threshold*(x)/bias) |
| #define score_to_weight(x) (-(x)*bias/threshold) |
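| /* For illustration only (the numbers below are assumed, not taken from any |
| * real run): if the learned bias were -1.0 at the default threshold of 5.0, |
| * then weight_to_score(0.2) == -5.0 * 0.2 / -1.0 == 1.0 and |
| * score_to_weight(1.0) == -(1.0) * -1.0 / 5.0 == 0.2; the two macros are |
| * exact inverses of each other. */ |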
| |
| int wheel_size; /* The number of entries in the roulette wheel (NOT THE |
| SIZE OF THE ROULETTE_WHEEL ARRAY!). */ |
| int * roulette_wheel; /* Used for roulette wheel selection. */ |
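| /* roulette_wheel[i] holds the cumulative slot count of entries 0..i-1, so |
| * entry i "owns" the draws r with roulette_wheel[i] <= r < roulette_wheel[i+1]. |
| * For illustration: if the first two entries ended up with slot sizes 3 and 1, |
| * the array would begin {0, 3, 4} and a draw of r == 3 would select entry 1. */ |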
| double ham_preference = 2.0; |
| |
| #define DEFAULT_THRESHOLD 5.0 |
| double threshold = DEFAULT_THRESHOLD; |
| |
| double * weights; /* The weights of the single-layer perceptron. */ |
| double bias; /* The network bias for the single-layer perceptron. */ |
| |
| int num_epochs = 15; |
| double learning_rate = 2.0; |
| double weight_decay = 1.0; |
| |
| /* Initialize the roulette wheel and populate it, replicating harder-to-classify hams. */ |
| void init_wheel () { |
| int i; |
| int spam = 0, ham = 0; |
| |
| /* cache locations on the roulette wheel for fast lookups */ |
| roulette_wheel = (int*)calloc(num_nondup, sizeof(int)); |
| wheel_size = 0; |
| |
| roulette_wheel[0] = 0; |
| |
| for (i = 0; i < num_nondup - 1; i++) { |
| int slot_size = 1; |
| |
| /* Hams with more tests are rare and harder to classify but are the |
| * most important to classify correctly. They are thus replicated in the |
| * training set proportionally to their difficulty. */ |
| if ( ! is_spam[i] ) { |
| slot_size += (int)(num_tests_hit[i] * ham_preference * tests_count[i]); |
| } else { |
| slot_size = tests_count[i]; |
| } |
| |
| /* Each entry stands for tests_count[i] identical (deduplicated) instances, so grow the wheel by the whole slot. */ |
| wheel_size += slot_size; |
| |
| if ( ! is_spam[i] ) { |
| ham += slot_size; |
| } else { |
| spam += slot_size; |
| } |
| |
| roulette_wheel[i+1] = roulette_wheel[i] + slot_size; |
| } |
| |
| printf ("Modified training set statistics: %d spam, %d ham.\n", spam, ham); |
| } |
| |
| /* Free the resources for the roulette wheel selector. */ |
| void destroy_wheel () { |
| if ( roulette_wheel ) { |
| free (roulette_wheel); |
| roulette_wheel = 0; |
| wheel_size = 0; |
| } |
| } |
| |
| /* Get a random test using roulette wheel selection. No longer used: train() now iterates over a shuffled copy of the training set instead. */ |
| int get_random_test () { |
| int r; |
| int bottom, middle, top; |
| |
| r = lrand48() % wheel_size; |
| |
| bottom = 0; |
| top = num_nondup - 1; |
| middle = top / 2; |
| |
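| /* Binary search over the cumulative slot counts: the winning index i is the |
| * one with roulette_wheel[i] <= r < roulette_wheel[i+1], and it always lies |
| * in [bottom, top]; each pass either returns or narrows that interval. */ |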
| while (1) { |
| if ( r < roulette_wheel[bottom+1] ) { |
| return bottom; |
| } else if ( r >= roulette_wheel[top] ) { |
| return top; |
| } else if ( r >= roulette_wheel[middle] && r < roulette_wheel[middle+1] ) { |
| return middle; |
| } else if ( r < roulette_wheel[middle] ) { |
| top = middle-1; |
| bottom++; |
| middle = bottom + (top-bottom)/2; |
| } else { |
| bottom = middle+1; |
| top--; |
| middle = bottom + (top-bottom)/2; |
| } |
| } |
| } |
| |
| /* Allocate and initialize the weights over the range [-0.5..0.5] */ |
| void init_weights () { |
| int i; |
| |
| weights = (double*)calloc(num_scores, sizeof(double)); |
| |
| bias = drand48() - 0.5; |
| for (i = 0; i < num_scores; i++) { |
| weights[i] = drand48() - 0.5; |
| } |
| } |
| |
| /* Free the resources allocated for the weights. */ |
| void destroy_weights () { |
| if ( weights ) { |
| free(weights); |
| weights = 0; |
| } |
| } |
| |
| /* Writes out the weights in SpamAssassin score space. */ |
| void write_weights (FILE * fp) { |
| int i; |
| int ga_nn, ga_yy, ga_ny, ga_yn; |
| double nnscore, yyscore, nyscore, ynscore; |
| |
| ga_nn = ga_yy = ga_ny = ga_yn = 0; |
| nnscore = yyscore = nyscore = ynscore = 0; |
| |
| /* Run through all of the instances in the training set and tally up |
| * the scores. */ |
| for (i = 0; i < num_nondup; i++) { |
| double score = weight_to_score(evaluate_test_nogain(i)) + threshold; |
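| /* weight_to_score() is linear and weight_to_score(bias) == -threshold, so |
| * adding threshold back cancels the bias term: a net input of exactly zero |
| * lands exactly on the spam threshold, and the remainder is the sum of the |
| * per-test scores (including the immutable contribution). */ |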
| |
| if ( score >= threshold && is_spam[i] ) { |
| ga_yy += tests_count[i]; |
| yyscore += tests_count[i] * score; |
| } else if ( score < threshold && !is_spam[i] ) { |
| ga_nn += tests_count[i]; |
| nnscore += tests_count[i] * score; |
| } else if ( score >= threshold && !is_spam[i] ) { |
| ga_ny += tests_count[i]; |
| nyscore += tests_count[i] * score; |
| } else { |
| ga_yn += tests_count[i]; |
| ynscore += tests_count[i] * score; |
| } |
| } |
| |
| /* This is copied from the dump() function in craig-evolve.c. It |
| * outputs some nice statistics about the learned classifier. */ |
| fprintf (fp,"\n# SUMMARY for threshold %3.1f:\n", threshold); |
| fprintf (fp, |
| "# Correctly non-spam: %6d %4.2f%%\n", |
| ga_nn, |
| (ga_nn / (float) num_ham) * 100.0); |
| fprintf (fp, |
| "# Correctly spam: %6d %4.2f%%\n", |
| ga_yy, |
| (ga_yy / (float) num_spam) * 100.0); |
| fprintf (fp, |
| "# False positives: %6d %4.2f%%\n", |
| ga_ny, |
| (ga_ny / (float) num_ham) * 100.0); |
| fprintf (fp, |
| "# False negatives: %6d %4.2f%%\n", |
| ga_yn, |
| (ga_yn / (float) num_spam) * 100.0); |
| |
| fprintf (fp,"# Average score for spam: %3.3f ham: %3.1f\n",(ynscore+yyscore)/((double)(ga_yn+ga_yy)),(nyscore+nnscore)/((double)(ga_nn+ga_ny))); |
| fprintf (fp,"# Average for false-pos: %3.3f false-neg: %3.1f\n",(nyscore/(double)ga_ny),(ynscore/(double)ga_yn)); |
| |
| fprintf (fp,"# TOTAL: %6d %3.2f%%\n\n", num_tests, 100.0); |
| |
| |
| for (i = 0; i < num_scores; i++) { |
| if ( is_mutable[i] ) { |
| fprintf(fp, "score %-30s %2.3f # [%2.3f..%2.3f]\n", score_names[i], weight_to_score(weights[i]), range_lo[i], range_hi[i]); |
| } else { |
| fprintf(fp, "score %-30s %2.3f # not mutable\n", score_names[i], range_lo[i]); |
| } |
| } |
| } |
| |
| /* Rescale the score ranges when a non-default spam threshold is requested (Daniel's threshold feature), so the learned scores stay consistent with that threshold. */ |
| void scale_scores (double old_threshold, double new_threshold) { |
| int i; |
| |
| /* No need to scale something to itself. */ |
| if ( old_threshold == new_threshold ) { |
| return; |
| } |
| |
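| /* For example (purely illustrative): running with -t 10 doubles every |
| * mutable range, so a rule constrained to [0.0 .. 1.0] at the default |
| * threshold of 5.0 may then score anywhere in [0.0 .. 2.0]. */ |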
| for (i = 0; i < num_scores; i++) { |
| if ( is_mutable[i] ) { |
| range_lo[i] = range_lo[i] * new_threshold / old_threshold; |
| range_hi[i] = range_hi[i] * new_threshold / old_threshold; |
| } |
| } |
| |
| /* Maybe we don't want this bit. This prescaling stuff makes my |
| * brain hurt.*/ |
| /* |
| for (i = 0; i < num_nondup; i++) { |
| scores[i] = scores[i] * new_threshold / old_threshold; |
| } |
| */ |
| } |
| |
| /* Computes the value of the activation function of the perceptron for |
| * a given input. */ |
| double evaluate_test (int test) { |
| #ifdef LEAST_SQUARES_ERROR |
| return 1/(1+exp(-evaluate_test_nogain(test))); |
| #else |
| #ifdef ENTROPIC_ERROR |
| return tanh(evaluate_test_nogain(test)); |
| #endif |
| #endif |
| } |
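| |
| /* The two pairings above are deliberate: the logistic activation maps the |
| * net input into (0,1) to match the {0,1} targets used with least squares |
| * error in train(), while tanh maps into (-1,1) to match the +/-1 targets |
| * (2*is_spam - 1) used with the entropic error. */ |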
| |
| /* Computes the value of the transfer function (in this case, linear) for |
| * the input defined by tests_hit[test]. */ |
| double evaluate_test_nogain (int test) { |
| double sum; |
| int i; |
| |
| sum = bias; |
| |
| for (i = 0; i < num_tests_hit[test]; i++) { |
| sum += weights[tests_hit[test][i]]; |
| } |
| |
| /* Add the contribution of the immutable tests, translated to weight space. */ |
| sum += score_to_weight(scores[test]); |
| |
| return sum; |
| } |
| |
| /* Trains the perceptron using stochastic gradient descent. */ |
| void train (int num_epochs, double learning_rate) { |
| int epoch, random_test; |
| int i, j; |
| int * tests; |
| double y_out, error, delta; |
| |
| /* Initialize and populate an array containing indices of training |
| * instances. This is shuffled on every epoch and then iterated |
| * through. I had originally planned to use roulette wheel selection, |
| * but shuffled selection seems to work better. */ |
| tests = (int*)calloc(wheel_size, sizeof(int)); |
| |
| for (i = 0, j = 0; i < num_nondup-1; i++) { |
| for (; j < roulette_wheel[i+1]; j++) { |
| tests[j] = i; |
| } |
| } |
| for (; j < wheel_size; j++) { |
| tests[j] = num_nondup-1; |
| } |
| |
| for (epoch = 0; epoch < num_epochs; epoch++) { |
| /* decay the weights on every epoch to smooth out statistical |
| * anomalies */ |
| if ( weight_decay != 1.0 ) { |
| bias *= weight_decay; |
| for (i = 0; i < num_scores; i++) { |
| weights[i] *= weight_decay; |
| } |
| } |
| |
| /* shuffle the training instances (a Fisher-Yates shuffle) */ |
| for (i = 0; i < wheel_size-1; i++) { |
| int tmp; |
| int r = lrand48 () % (wheel_size - i); |
| |
| tmp = tests[i]; |
| tests[i] = tests[r+i]; |
| tests[r+i] = tmp; |
| } |
| |
| for (j = 0; j < wheel_size; j++) { |
| |
| /* select a random test (they have been randomized above) */ |
| random_test = tests[j]; |
| |
| /* compute the output of the network */ |
| y_out = evaluate_test(random_test); |
| |
| /* compute the error gradient for the logsig node with least squares error */ |
| #ifdef LEAST_SQUARES_ERROR |
| error = is_spam[random_test] - y_out; |
| delta = y_out * (1-y_out) * error / (num_tests_hit[random_test]+1) * learning_rate; |
| #else |
| /* compute the error gradient for the tanh node with entropic error */ |
| #ifdef ENTROPIC_ERROR |
| error = (2.0*is_spam[random_test]-1) - y_out; |
| delta = error / (num_tests_hit[random_test]+1) * learning_rate; |
| #endif |
| #endif |
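| |
| /* Why the delta has this form (least squares case): with error |
| * E = (t - y)^2 / 2 and y = logsig(net), the downhill direction is |
| * -dE/dnet = (t - y) * y * (1 - y), which is the error * y_out * (1 - y_out) |
| * factor above. Dividing by (num_tests_hit[random_test] + 1) spreads the |
| * step over the bias plus every weight about to be updated, and |
| * learning_rate scales the step. In the tanh / entropic case the activation |
| * derivative cancels against the error derivative, leaving just the raw |
| * error term. */ |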
| |
| /* adjust the weights to descend the steepest part of the error gradient */ |
| bias += delta; |
| for (i = 0; i < num_tests_hit[random_test]; i++) { |
| int idx = tests_hit[random_test][i]; |
| weights[idx] += delta; |
| |
| #ifdef IGNORE_SCORE_RANGES |
| /* Constrain the weights so that nice rules are always <= 0 etc. */ |
| if ( range_lo[idx] >= 0 && weights[idx] < 0 ) { |
| weights[idx] = 0; |
| } else if ( range_hi[idx] <= 0 && weights[idx] > 0 ) { |
| weights[idx] = 0; |
| } |
| #else |
| if ( weights[idx] < score_to_weight(range_lo[idx]) ) { |
| weights[idx] = score_to_weight(range_lo[idx]); |
| } else if ( weights[idx] > score_to_weight(range_hi[idx]) ) { |
| weights[idx] = score_to_weight(range_hi[idx]); |
| } |
| #endif |
| } |
| } |
| } |
| |
| free(tests); |
| } |
| |
| void usage () { |
| printf ("usage: perceptron [args]\n" |
| "\n" |
| " -p ham_preference = adds extra ham to training set multiplied by number of\n" |
| " tests hit (2.0 default)\n" |
| " -e num_epochs = number of epochs to train (15 default)\n" |
| " -l learning_rate = learning rate for gradient descent (2.0 default)\n" |
| " -t threshold = minimum threshold for spam (5.0 default)\n" |
| " -w weight_decay = per-epoch decay of learned weight and bias (1.0 default)\n" |
| " -h = print this help\n" |
| "\n"); |
| exit(30); |
| } |
| |
| int main (int argc, char ** argv) { |
| struct timeval tv, tv_start, tv_end; |
| long long int t_usec; |
| FILE * fp; |
| int arg; |
| |
| /* Read the command line options */ |
| while ((arg = getopt (argc, argv, "p:e:l:t:w:h?")) != -1) { |
| switch (arg) { |
| case 'p': |
| ham_preference = atof(optarg); |
| break; |
| |
| case 'e': |
| num_epochs = atoi(optarg); |
| break; |
| |
| case 'l': |
| learning_rate = atof(optarg); |
| break; |
| |
| case 't': |
| threshold = atof(optarg); |
| break; |
| |
| case 'w': |
| weight_decay = atof(optarg); |
| break; |
| |
| case 'h': |
| case '?': |
| usage(); |
| break; |
| } |
| } |
| |
| /* Seed the PRNG */ |
| gettimeofday (&tv, 0); |
| t_usec = tv.tv_sec * 1000000 + tv.tv_usec; |
| srand48 ((int)t_usec); |
| |
| /* Load the instances and score constraints generated by logs-to-c. */ |
| loadtests(); |
| loadscores(); |
| |
| /* If the threshold has been changed, the ranges and scores need to be |
| * scaled so that the output of the program will not be affected. |
| */ |
| scale_scores (DEFAULT_THRESHOLD, threshold); |
| |
| /* Replicate instances from the training set to bias against false positives. */ |
| init_wheel (); |
| |
| /* Allocate and initialize the weight vector. */ |
| init_weights (); |
| |
| /* Train the network using stochastic gradient descent. */ |
| gettimeofday(&tv_start, 0); |
| train(num_epochs, learning_rate); |
| gettimeofday(&tv_end, 0); |
| |
| t_usec = tv_end.tv_sec * 1000000 + tv_end.tv_usec - |
| (tv_start.tv_sec *1000000 + tv_start.tv_usec); |
| printf ("Training time = %fs.\n", t_usec / 1000000.0f); |
| |
| /* Translate the learned weights to SA score space and output to a file. */ |
| fp = fopen (OUTPUT_FILE, "w"); |
| if ( fp ) { |
| write_weights(fp); |
| fclose(fp); |
| } else { |
| perror (OUTPUT_FILE); |
| } |
| |
| /* Free resources */ |
| destroy_weights (); |
| destroy_wheel (); |
| return 0; |
| } |