/* masses/garescorer.c - spamassassin - Git at Google */

 /* @COPYRIGHT */
 /*
  *  This program uses PGAPack to do its GA stuff.
  *  ftp://ftp.mcs.anl.gov/pub/pgapack/pgapack.tar.Z
  *  I used this one instead of galib because it uses MPI
  *  to spread load around.  It also seems like the API is a little
  *  cleaner.
  */

 #include "pgapack.h"

 #include <unistd.h>
 #include <sys/time.h>
 #include <math.h>
 #include "tmp/scores.h"
 #include "tmp/tests.h"

 /* Corpus sizes; used below for the FP/FN bias and for percentage output. */
 extern int num_spam, num_ham;	/* in tmp/tests.h */


 /* Use score ranges derived from hit-frequencies S/O ratio,
  * and numbers of mails hit.
  */
 #define USE_SCORE_RANGES

 /* Enables the custom mutation/crossover/string callbacks registered in
  * main().  With this set, each chromosome is allocated with 2*num_scores
  * doubles (see CreateString); myMutation uses the second half as a per-gene
  * mutation standard deviation. */
 #define USE_VARIABLE_MUTATIONS

 /* Lamarckian evolution? (Jean-Baptiste Lamarck) */
 /* inheritance of acquired characters / soft inheritance / Lamarckism */
 /* Enables adapt(), which edits an individual's scores directly based on
  * which tests were hit by misclassified messages. */
 #define LAMARCK

 /* Fitness callback handed to PGARun(); lower is better (PGA_MINIMIZE). */
 double evaluate(PGAContext *, int, int);
 /* NOTE(review): declared here but neither defined nor called in the part of
  * the file visible to this review. */
 int    GetIntegerParameter(char *query);
 void dump(FILE *);
 void WriteString(PGAContext *ctx, FILE *fp, int p, int pop);
 void showSummary(PGAContext *ctx);

 #if defined(USE_VARIABLE_MUTATIONS) || (! defined(USE_SCORE_RANGES))
 int    myMutation(PGAContext *, int, int, double);
 # ifdef LAMARCK
 /* args: ctx, p, pop, done_eval, threshold, repeat */
 int adapt(PGAContext *, int, int, int, int,int);
 # endif
 #endif

 #ifdef USE_VARIABLE_MUTATIONS
 /* User-defined string operations registered with PGASetUserFunction(). */
 void         CreateString     (PGAContext *, int, int, int);
 void         Crossover        (PGAContext *, int, int, int, int, int, int);
 void         CopyString       (PGAContext *, int, int, int, int);
 int          DuplicateString  (PGAContext *, int, int, int, int);
 MPI_Datatype BuildDT          (PGAContext *, int, int);
 #endif

 /* NOTE(review): the three prototypes below repeat the ones declared above;
  * harmless, but redundant. */
 void dump(FILE *);
 void WriteString(PGAContext *ctx, FILE *fp, int p, int pop);
 void showSummary(PGAContext *ctx);

 /* Computes the fitness from the global counters filled in by score_msg(). */
 double evaluate_inner();


 /* Spam/non-spam decision threshold compared against each message score
  * in score_msg(); set with -t. */
 double threshold = 5.0;
 /* Weight applied to false positives in evaluate_inner(); set with -b and
  * then scaled by num_spam/num_ham in init_data(). */
 double nybias = 10.0;
 /* Target FP percentage (-f).  evaluate_inner() switches to the alternate
  * fitness function when this is >= 0. */
 double fptarget = -1.0;   /* -1 means unused */
 /* showSummary() writes garescorer.scores when the iteration count is a
  * multiple of this value. */
 int save_every_n_generations = 50;
 /* Passed to PGASetMaxNoChangeValue() in main(). */
 int no_change_val = 300;
 /* Population size (-s), replacement count (-r) and iteration cap (-e). */
 int pop_size = 50;
 int replace_num = 33;
 int maxiter = 30000;

 /* Timestamp and iteration number of the previous performance report,
  * used by showSummary() to print iterations/s. */
 struct timeval t0 = { 0, 0 };
 int t0_iter = 0;

 #ifdef USE_VARIABLE_MUTATIONS
 /* Current mutation probability; main() resets it to 0.15/sqrt(num_mutable)
  * and showSummary() adjusts it every generation. */
 double mutation_rate = 0.03;
 /* Copy of the starting mutation rate; myMutation() only calls adapt() when
  * the rate it is given is below this value. */
 double base_mutation_rate = 0.03;
 #ifdef LAMARCK
 /* Counts of adapt() adjustments made while false negatives (yn) or false
  * positives (ny) dominated; printed and reset in showSummary(). */
 int adapt_yn = 0;
 int adapt_ny = 0;
 #endif
 /* Factor used by showSummary() to raise (divide) or lower (multiply) the
  * mutation rate; init_data() replaces it with its (1/num_mutable)-th power. */
 double mutation_rate_modifier = 0.85;
 /* Per-generation tallies of mutation outcomes, updated in myMutation() and
  * compared and reset in showSummary(). */
 int num_better_same = 0;
 int num_worse = 0;
 #else
 const double mutation_rate = 0.03;
 #endif

 /* Standard deviation of mutation noise; also the initial per-gene SD stored
  * by CreateString(). */
 const double mutation_noise = 0.5;
 #ifdef USE_VARIABLE_MUTATIONS
 /* Lower clamp for the per-gene SD in myMutation(). */
 const double min_mutation_noise = 0.1;
 #endif
 /* Weight of the individual's own allele when regressing toward the
  * population mean (non-variable-mutation variant of myMutation()). */
 const double regression_coefficient = 0.75;
 #ifndef USE_SCORE_RANGES
 /* Fixed upper/lower score limits used when per-test ranges are disabled. */
 const double SCORE_CAP = 4.0;
 const double NEG_SCORE_CAP = -9.0;
 #endif

 /* Passed to PGASetCrossoverProb() in main(). */
 #ifdef USE_VARIABLE_MUTATIONS
 const double crossover_rate = 0.5;
 #else
 const double crossover_rate = 0.65;
 #endif

 /* Set by -C: evaluate() prints the summary for the stored scores and exits
  * instead of evolving. */
 int justCount = 0;

 /*
  * Print the command-line help on stdout and terminate with exit status 30.
  * When built with USE_MPI only rank 0 prints the text; every rank exits.
  */
 void usage()
 {
   static const char help_text[] =
      "usage: evolve [-s size] [args]\n"
      "\n"
      "  -s size = population size (50 recommended)\n"
      "  -e num_epochs = number of epochs (generations) to run (30000 default)\n"
      "  -r replace = number of individuals to replace each generation (20 recommended)\n"
      "  -b nybias = bias towards false negatives (10.0 default)\n"
      "  -f fptarget = target FP percentage (alt fitness function, off by default)\n"
      "  -t threshold = threshold for spam/nonspam decision (5 default)\n"
      "\n"
      "  -C = just count hits and exit, no evolution\n\n";
 #ifdef USE_MPI
   int my_rank;

   MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
   if (my_rank == 0)
 #endif
     fputs(help_text, stdout);

   exit (30);
 }

 /*
  * Load the test hits and scores and derive the run-time constants that
  * depend on them.
  *
  * Without USE_MPI this simply calls loadtests()/loadscores() and rescales
  * nybias and mutation_rate_modifier.  With USE_MPI only rank 0 does the
  * loading; the resulting arrays and scalars are then broadcast from rank 0.
  * The MPI_Bcast calls are collective, so their order must stay identical on
  * every rank.
  */
 void init_data()
 {
 #ifdef USE_MPI
   int rank;

   MPI_Comm_rank(MPI_COMM_WORLD, &rank);

   if (rank == 0) {
 #endif

     loadtests();
     loadscores();
     /* Scale the FP bias by the spam:ham ratio of the corpus.
      * NOTE(review): no guard against num_ham == 0 here. */
     nybias = nybias*((double)num_spam)/((double)num_ham);
 #ifdef USE_VARIABLE_MUTATIONS
     /* Replace the modifier with its (1/num_mutable)-th power.
      * NOTE(review): no guard against num_mutable == 0 here. */
     mutation_rate_modifier = (double)pow(mutation_rate_modifier,
 					(double)1/num_mutable);
 #endif

 #ifdef USE_MPI
   }

   /* NOTE(review): the MPI datatypes below (MPI_CHAR, MPI_SHORT, MPI_DOUBLE)
    * must match the element types declared in tmp/tests.h and tmp/scores.h,
    * which are not visible here -- confirm.  Values such as num_spam, num_ham
    * and tests_count are not broadcast in this function. */
   MPI_Bcast(num_tests_hit, num_nondup, MPI_CHAR, 0, MPI_COMM_WORLD);
   MPI_Bcast(&nybias, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);
   MPI_Bcast(is_spam, num_nondup, MPI_CHAR, 0, MPI_COMM_WORLD);
   MPI_Bcast(tests_hit, num_nondup*max_hits_per_msg, MPI_SHORT, 0,
 	    MPI_COMM_WORLD);
 #ifdef USE_VARIABLE_MUTATIONS
   MPI_Bcast(&mutation_rate_modifier, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);
 #endif
   MPI_Bcast(is_mutable, num_scores, MPI_CHAR, 0, MPI_COMM_WORLD);
   MPI_Bcast(range_lo, num_scores, MPI_DOUBLE, 0, MPI_COMM_WORLD);
   MPI_Bcast(range_hi, num_scores, MPI_DOUBLE, 0, MPI_COMM_WORLD);
   MPI_Bcast(bestscores, num_scores, MPI_DOUBLE, 0, MPI_COMM_WORLD);
   MPI_Bcast(scores, num_scores, MPI_DOUBLE, 0, MPI_COMM_WORLD);
 #endif
 }

 /*
  * Copy the mutable alleles of individual p in population pop into the
  * global lookup[] table so score_msg() can read them with plain array
  * indexing; per the original author's measurements this is about 35% faster
  * than calling PGAGetRealAllele() directly inside score_msg().
  * Under LAMARCK the per-test yn_hit[]/ny_hit[] flags are cleared as well.
  */
 void
 load_scores_into_lookup(PGAContext *ctx, int p, int pop)
 {
   int gene = 0;

   while (gene < num_mutable) {
     lookup[gene] = PGAGetRealAllele(ctx, p, pop, gene);
 #ifdef LAMARCK
     ny_hit[gene] = 0;
     yn_hit[gene] = 0;
 #endif
     gene++;
   }
 }

 /*
  * Program entry point.
  *
  * Parses the command line (getopt string "b:r:s:e:t:f:C"), loads the data
  * via init_data(), configures a real-valued PGAPack minimisation run over
  * num_scores genes and runs it with evaluate() as the fitness function.
  * An unrecognised option calls usage(), which exits.  Returns 0 once
  * PGARun() has finished.
  */
 int main(int argc, char **argv) {
     PGAContext *ctx;
     int i,p;
     int arg;

 #ifdef USE_MPI
     MPI_Init(&argc, &argv);
 #endif

     /* NOTE(review): atof()/atoi() give no error reporting, so malformed
      * option values are not rejected here. */
     while ((arg = getopt (argc, argv, "b:r:s:e:t:f:C")) != -1) {
       switch (arg) {
         case 'b':
           nybias = atof(optarg);
           break;

         case 'f':
           fptarget = atof(optarg);
           break;

          case 't':
            threshold = (double) atof(optarg);
            break;

         case 's':
           pop_size = atoi(optarg);
           break;

         case 'e':
           maxiter = atoi(optarg);
           break;

 	case 'r':
 	  replace_num = atoi(optarg);
 	  break;

         case 'C':
           justCount = 1;
           break;

         case '?':
           usage();
           break;
       }
     }

      /* Must run after option parsing: init_data() rescales nybias. */
      init_data();

      ctx = PGACreate(&argc, argv, PGA_DATATYPE_REAL, num_scores, PGA_MINIMIZE);

      PGASetUserFunction(ctx, PGA_USERFUNCTION_PRINTSTRING,
 			(void *)WriteString);
      PGASetUserFunction(ctx, PGA_USERFUNCTION_ENDOFGEN, (void *)showSummary);

      /* use a tiny population - just want to get into the evaluate function */
      if (justCount) {
        pop_size = 2;
        replace_num = 1;
      }

      PGASetPopSize(ctx, pop_size);

      PGASetRealInitRange (ctx, range_lo, range_hi);

      PGASetMutationBoundedFlag(ctx, PGA_FALSE);

      PGASetNumReplaceValue(ctx, replace_num);

      /* Defaults to this - Allen */
      /* PGASetMutationOrCrossoverFlag(ctx, PGA_TRUE); */

      if (justCount) {           /* don't allow any mutation or crossover */
        PGASetMutationType(ctx, PGA_MUTATION_CONSTANT);
        /* Collapse the init range so every allele starts at bestscores[]. */
        PGASetRealInitRange (ctx, bestscores, bestscores);
        PGASetCrossoverProb(ctx, 0.0);
        /* NOTE(review): these alleles are written before PGASetUp() is
         * called below -- confirm PGAPack permits that. */
        for(i=0; i<num_scores; i++) {
 	 for(p=0; p<pop_size; p++) {
 	   /* just counting?  score[i] = defaultscore[i] in that case */
            PGASetRealAllele(ctx, p, PGA_NEWPOP, i, bestscores[i]);
 	 }
        }
      } else {
 #if (! defined(USE_SCORE_RANGES)) || defined(USE_VARIABLE_MUTATIONS)
        PGASetUserFunction(ctx, PGA_USERFUNCTION_MUTATION, (void *)myMutation);
 #else
      PGASetMutationType(ctx, PGA_MUTATION_RANGE);
 #endif

        /* PGASetCrossoverType(ctx, PGA_CROSSOVER_ONEPT); */
      PGASetCrossoverProb(ctx, crossover_rate);

 #ifdef USE_VARIABLE_MUTATIONS
        /* Starting mutation probability depends on the number of mutable
         * genes; base_mutation_rate remembers this starting value. */
        mutation_rate = 0.15/sqrt(num_mutable);
        base_mutation_rate = mutation_rate;
        PGASetMutationProb(ctx, mutation_rate);
        /* Custom string callbacks, needed because the chromosome carries
         * extra per-gene data (see CreateString). */
        PGASetUserFunction(ctx, PGA_USERFUNCTION_CROSSOVER,
 			  (void *)Crossover);
        PGASetUserFunction(ctx, PGA_USERFUNCTION_CREATESTRING,
 			  (void *)CreateString);
        PGASetUserFunction(ctx, PGA_USERFUNCTION_COPYSTRING,
 			  (void *)CopyString);
        PGASetUserFunction(ctx, PGA_USERFUNCTION_DUPLICATE,
 			  (void *)DuplicateString);
        PGASetUserFunction(ctx, PGA_USERFUNCTION_BUILDDATATYPE,
 			  (void *)BuildDT);
 #endif
      }

      PGASetPrintFrequencyValue(ctx,300);
      PGASetPrintOptions(ctx, PGA_REPORT_AVERAGE);

      /* Stopping configuration: no-change rule with no_change_val, plus the
       * iteration cap maxiter. */
      PGASetStoppingRuleType(ctx, PGA_STOP_NOCHANGE);
      PGASetMaxNoChangeValue(ctx, no_change_val);
      PGASetMaxGAIterValue(ctx, maxiter);

      PGASetUp(ctx);

 #ifndef USE_VARIABLE_MUTATIONS
      /* Without the custom CreateString, seed every individual with
       * bestscores[] by hand (clamped to the caps when ranges are off). */
      if (! justCount) {
        /* Now initialize the scores */
        for(i=0; i<num_scores; i++) {
 	 for(p=0; p<pop_size; p++) {

 #ifndef USE_SCORE_RANGES
 	 if (is_mutable[i]) {
             if(bestscores[i] > SCORE_CAP) bestscores[i] = SCORE_CAP;
 	     else if(bestscores[i] < NEG_SCORE_CAP) bestscores[i] =
 						      NEG_SCORE_CAP;
 	 }
 #endif
 	 PGASetRealAllele(ctx, p, PGA_NEWPOP, i, bestscores[i]);
        }
      }
      }
 #endif /* ! USE_VARIABLE_MUTATIONS */

      /* Start of the first timing interval reported by showSummary(). */
      (void)gettimeofday(&t0, (struct timezone *)NULL);
      PGARun(ctx, evaluate);

      PGADestroy(ctx);

 #ifdef USE_MPI
      MPI_Finalize();
 #endif

      return(0);
 }

 /* Message counts from the most recent scoring pass, accumulated by
  * score_msg(): yy = spam scored as spam, yn = spam scored below threshold
  * (false negative), ny = ham scored at/above threshold (false positive),
  * nn = ham scored below threshold. */
 int ga_yy,ga_yn,ga_ny,ga_nn;
 #ifdef USE_VARIABLE_MUTATIONS
 /* Mutation statistics: var_mutated counts myMutation() calls that changed
  * at least one gene, num_mutated counts its second-phase retries; both are
  * reset in showSummary().  iters_same_passed is the "2nd phase" flag set by
  * showSummary() and tested in myMutation(). */
 int num_mutated = 0;
 int var_mutated = 0;
 int iters_same_passed = 0;
 #ifdef LAMARCK
 /* Sign of the current FN-vs-FP imbalance: set to -1/0/1 by
  * evaluate_inner() and further adjusted in adapt(). */
 int weight_balance;
 /* adapt() statistics printed and reset by showSummary().
  * NOTE(review): adapt_crossover, adapt_fp_add and adapt_fn_add are not
  * updated anywhere in the part of the file visible to this review. */
 int adapt_times = 0;
 int adapt_crossover = 0;
 int adapt_repeat = 0;
 int adapt_overshot = 0;
 int adapt_fp_add = 0;
 int adapt_fn_add = 0;
 #endif
 #endif
 /* Sums of (message score * tests_count) for the four outcome classes above,
  * accumulated by score_msg(). */
 double ynscore,nyscore,yyscore,nnscore;

 /*
  * Score message i against the values currently held in lookup[] and fold
  * the outcome into the global counters.
  *
  * The message score is the sum of lookup[] over every test the message hit
  * plus scores[i] (the contribution that is not looked up per gene).  It is
  * classified against the global threshold and the matching ga_* count and
  * *score sum are increased, weighted by tests_count[i].  Under LAMARCK the
  * tests hit by a misclassified message are flagged in yn_hit[] (false
  * negative) or ny_hit[] (false positive).
  *
  * ctx, p and pop are not used; the caller must have filled lookup[] first.
  * Returns the message score multiplied by tests_count[i].
  */
 double score_msg(PGAContext *ctx, int p, int pop, int i)
 {
   double total = 0.0;
   int k;

   /* Walk the hit list from last to first; the summation order is kept
    * fixed so the floating-point result does not change. */
   k = num_tests_hit[i];
   while (--k >= 0)
     total += lookup[tests_hit[i][k]];

   total += scores[i];	/* base from non-mutable */

   if (total >= threshold)
   {
     if (is_spam[i])
     {
       /* True positive */
       ga_yy += tests_count[i];
       yyscore += total*tests_count[i];
     }
     else
     {
       /* False positive */
       ga_ny += tests_count[i];
       nyscore += total*tests_count[i];
 #ifdef LAMARCK
       for (k = num_tests_hit[i]-1; k >= 0; k--)
 	ny_hit[tests_hit[i][k]] = 1;
 #endif
     }
   }
   else
   {
     if (is_spam[i])
     {
       /* False negative */
       ga_yn += tests_count[i];
       ynscore += total*tests_count[i];
 #ifdef LAMARCK
       for (k = num_tests_hit[i]-1; k >= 0; k--)
 	yn_hit[tests_hit[i][k]] = 1;
 #endif
     }
     else
     {
       /* True negative */
       ga_nn += tests_count[i];
       nnscore += total*tests_count[i];
     }
   }

   return total*tests_count[i];
 }

 /*
  * PGAPack fitness callback: score every message with individual p of
  * population pop and return the resulting fitness (lower is better).
  *
  * Resets the global outcome counters, loads the individual's alleles into
  * lookup[], scores all num_nondup messages and returns evaluate_inner().
  * In justCount mode it prints the summary to stdout and exits with status 0
  * instead of returning.
  */
 double evaluate(PGAContext *ctx, int p, int pop)
 {
   double tot_score = 0.0;
   int msg;

   ga_yy = 0;
   ga_yn = 0;
   ga_ny = 0;
   ga_nn = 0;
   yyscore = 0.0;
   ynscore = 0.0;
   nyscore = 0.0;
   nnscore = 0.0;

   load_scores_into_lookup(ctx, p, pop);

   /* Messages are visited from the last index down to 0. */
   for (msg = num_nondup; msg-- > 0; )
     tot_score += score_msg(ctx, p, pop, msg);

   if (justCount) {
     dump(stdout);
     exit (0);
   }

   return evaluate_inner();
 }

 /*
  * Fitness computation split out of evaluate() so that callers which have
  * already filled the global counters (for example adapt()) can obtain the
  * fitness without re-running the scoring loop. - Allen
  *
  * Reads ga_yn, ga_ny, ynscore, nyscore, threshold, nybias, fptarget and
  * num_ham; under LAMARCK it also sets weight_balance to -1, 0 or 1.
  * Returns the value to minimise.
  */
 double evaluate_inner() {
   double fn_dist, fp_dist;
   double fp_rate_penalty;

   /* total distance of the misclassified messages from the threshold */
   fn_dist = (ga_yn * threshold) - ynscore;
   fp_dist = nyscore - (ga_ny * threshold);

 #ifdef LAMARCK
   weight_balance = (fn_dist > (fp_dist*nybias)) ? -1
                  : ((fn_dist < (fp_dist*nybias)) ? 1 : 0);
 #endif

   /* Default fitness: FN distance plus biased FP distance. */
   if (!(fptarget >= 0.0))
     return  fn_dist + fp_dist*nybias;

   /* Alternate fitness.  Start from |FP rate (percent) - target|, then make
    * it a multiplier that is always >= 1.0. */
   fp_rate_penalty = fabs(((ga_ny / (float) num_ham) * 100.0) - fptarget);
   fp_rate_penalty = (fp_rate_penalty * 10) + 1.0;

   /* Priority order: number of misclassified messages, distance from the
    * target FP rate, then the score distances as the least important term. */
   return    ((100 * (ga_yn + ga_ny)) * fp_rate_penalty)
             + (fn_dist + fp_dist*nybias);
 }

 #ifdef LAMARCK
 /*
  * Lamarckian adjustment of individual p in population pop: moves the scores
  * of tests that were hit by the dominant kind of misclassification a small
  * step towards zero, writing the result into the chromosome.
  *
  *   done_eval  non-zero: reuse the stored evaluation if PGAPack reports it
  *              as up to date; otherwise evaluate() is called first.
  *   threshold  width of the weight_balance band inside which nothing is
  *              done.  NOTE: this int parameter shadows the global double
  *              `threshold` for the whole function body.
  *   repeat     non-zero: keep stepping while the fitness does not get worse
  *              and store the new evaluation; zero: apply a single step and
  *              leave the evaluation untouched.
  *
  * Returns 0 if nothing was attempted (justCount mode, balance inside the
  * band, or no gene qualified) and 1 otherwise.  In repeat mode 1 is returned
  * even when the first step overshot and was undone.
  *
  * NOTE(review): when the cached evaluation is reused, ga_yn/ga_ny,
  * weight_balance and yn_hit[]/ny_hit[] are whatever the most recent scoring
  * pass left behind -- presumably for this same individual; verify callers.
  */
 int adapt(PGAContext *ctx, int p, int pop, int done_eval, int threshold,
 	  int repeat) {
   double *myscores;
   int i;
   int changed = 0;
   double tmp,old_evaluation,new_evaluation;

   if (justCount) {
     return 0;
   }

   adapt_times++;

   if (done_eval && PGAGetEvaluationUpToDateFlag(ctx, p, pop))
     old_evaluation = PGAGetEvaluation(ctx, p, pop);
   else {
     old_evaluation = evaluate(ctx, p, pop);
     PGASetEvaluation(ctx, p, pop, old_evaluation);
     PGASetEvaluationUpToDateFlag(ctx, p, pop, PGA_TRUE);
   }

   /* Combine the score-distance balance (set by evaluate_inner) with a
    * message-count balance: negative means false negatives dominate. */
   if ((double)ga_yn > ((double)ga_ny*nybias))
     weight_balance--;
   else if ((double)ga_yn < ((double)ga_ny*nybias))
     weight_balance++;

   /* Not lopsided enough to act on. */
   if ((weight_balance < (threshold-1)) &&
       (weight_balance > -threshold))
     return 0;

   myscores = PGAGetIndividual(ctx, p, pop)->chrom;

   if (repeat) {
     /* Pass 1: pick the genes to adjust and their step size.  A gene
      * qualifies if it was hit by the dominant error type and its score has
      * the sign that contributes to that error (negative for FN, positive
      * for FP).  tmp_scores[i][0] holds the step (same sign as the score). */
     for (i = 0; i < num_mutable; i++) {
       if ((yn_hit[i] && (weight_balance < 0)) ||
 	  (ny_hit[i] && (weight_balance > 0))) {
 	if (((weight_balance < 0) &&
 #ifdef USE_SCORE_RANGES
 	     (myscores[i] < range_hi[i]) &&
 #endif
 	     (myscores[i] < -(double)0.01)) ||
 	    ((weight_balance > 0) &&
 #ifdef USE_SCORE_RANGES
 	     (myscores[i] > range_lo[i]) &&
 #endif
 	     (myscores[i] > (double)0.01))) {
           tmp_scores[i][0] = (double)0.001*rint(myscores[i]); /* reducing */
 #ifdef USE_SCORE_RANGES
 	  /* Drop the step if it would push the score out of its range. */
 	  if (((myscores[i] < -(double)0.01) && ((myscores[i] - tmp_scores[i][0]) > range_hi[i])) ||
 	      ((myscores[i] > (double)0.01) && ((myscores[i] - tmp_scores[i][0]) < range_lo[i]))) {
 	    tmp_scores[i][0] = 0;
 	  }
 #endif
 	  if (tmp_scores[i][0]) {
 	    changed = 1;
 	    /* Zero the selected gene so the pass below scores each message
 	     * without it. */
 	    lookup[i] = 0;
 	  }
 	} else
 	  tmp_scores[i][0] = 0;
       } else
 	tmp_scores[i][0] = 0;
     }

     if (! changed)		/* if can't reduce, don't do anything - safe */
       return 0;

     /* Temporarily replace scores[] with each message's score excluding the
      * selected genes; the originals are kept in tmp_total[].  score_msg()
      * also updates the global counters here; they are reset before being
      * used again inside the loop below. */
     /* For every message */
     for (i=num_nondup-1; i>=0; i--) {
       tmp_total[i] = scores[i];
       scores[i] =
 	score_msg(ctx,p,pop,i)/tests_count[i]; /* score sans ones modifying */
     }

     /* Now invert lookup[]: only the selected genes carry a value.
      * tmp_scores[i][1] marks selected genes for the final write-back, and
      * the hit flags are primed so the first loop iteration accepts them. */
     for (i = 0; i < num_mutable; i++) {
       if (tmp_scores[i][0]) {
 	lookup[i] = myscores[i];
 	tmp_scores[i][1] = 1;
 	if (weight_balance < 0) {
 	  yn_hit[i] = 1;
 	  ny_hit[i] = 0;
 	} else {
 	  ny_hit[i] = 1;
 	  yn_hit[i] = 0;
 	}
       } else {
 	lookup[i] = 0;
 	tmp_scores[i][1] = 0;
 	yn_hit[i] = ny_hit[i] = 0;
       }
     }

     new_evaluation = old_evaluation;  /* avoid a warning */
     while (1) {
       changed = 0;
       /* Apply one step to every gene that still qualifies; genes that no
        * longer qualify have their step cleared for good. */
       for (i = 0; i < num_mutable; i++) {
 	if (((tmp_scores[i][0] < 0) && yn_hit[i] && /* going up */
 #ifdef USE_SCORE_RANGES
              ((lookup[i] - tmp_scores[i][0]) < range_hi[i]) &&
 #endif
              (weight_balance < 0) && (lookup[i] < -(double)0.01)) ||
 	    ((tmp_scores[i][0] > 0) && ny_hit[i] && /* going down */
 #ifdef USE_SCORE_RANGES
              ((lookup[i] - tmp_scores[i][0]) > range_lo[i]) &&
 #endif
 	     (weight_balance > 0) &&
              (lookup[i] > (double)0.01))) {
 	  lookup[i] -= tmp_scores[i][0];
 	  changed = 1;
 	} else
 	  tmp_scores[i][0] = 0;
 	/* Cleared here; the rescoring below sets them again. */
 	yn_hit[i] = ny_hit[i] = 0;
       }

       if (changed) {
 	if (weight_balance > 0)
 	  adapt_ny++;
 	else
 	  adapt_yn++;
 	adapt_repeat++;
       } else
 	break;

       /* Re-score all messages with the stepped values. */
       yyscore = ynscore = nyscore = nnscore = 0.0;
       ga_yy=ga_yn=ga_ny=ga_nn=0;

       for (i=num_nondup-1; i>=0; i--)
 	(void)score_msg(ctx,p,pop,i);

       new_evaluation = evaluate_inner();

       if (new_evaluation > old_evaluation) {
 	/* Overshot: undo the step just taken and stop. */
 	for (i = 0; i < num_mutable; i++) {
 	  if (tmp_scores[i][0])
 	    lookup[i] += tmp_scores[i][0];
 	}
 	new_evaluation = old_evaluation;
 	adapt_overshot++;
 	break;
       } else
 	old_evaluation = new_evaluation;

       /* evaluate_inner() just reset weight_balance; re-apply the count
        * adjustment and stop once the balance is back inside the band. */
       if ((double)ga_yn > ((double)ga_ny*nybias))
 	weight_balance--;
       else if ((double)ga_yn < ((double)ga_ny*nybias))
 	weight_balance++;

       if ((weight_balance < (threshold-1)) &&
 	  (weight_balance > -threshold))
 	break;
     }
     /* Restore the real base scores. */
     for (i=num_nondup-1; i>=0; i--)
       scores[i] = tmp_total[i];

     /* Write the adjusted values back into the chromosome. */
     for (i=0; i < num_mutable; i++) {
       if (tmp_scores[i][1])
 	myscores[i] = lookup[i];
     }

     PGASetEvaluation(ctx, p, pop, new_evaluation);
     PGASetEvaluationUpToDateFlag(ctx, p, pop, PGA_TRUE);

     return 1;
   } else {
     /* Single-step mode: nudge every gene hit by the dominant error type
      * once, directly in the chromosome.  The stored evaluation is not
      * refreshed here. */
     for (i = 0; i < num_mutable; i++) {
       if ((yn_hit[i] && (weight_balance < 0)) ||
 	  (ny_hit[i] && (weight_balance > 0))) {
         tmp = (double)0.001*rint(myscores[i]);
 	/* Scores that round to zero still get a minimal step if they are
 	 * more than 0.01 away from zero. */
 	if (! tmp) {
 	  if (myscores[i] > (double)0.01)
             tmp = (double)0.001;
 	  else if (myscores[i] < -(double)0.01)
             tmp = -(double)0.001;
 	}
 #ifdef USE_SCORE_RANGES
 	/* Skip the step if it would leave the allowed range. */
 	if (tmp && (((myscores[i] > 0) &&
 		     ((myscores[i] - tmp) < range_lo[i])) ||
 		    ((myscores[i] < 0) &&
 		     ((myscores[i] - tmp) > range_hi[i]))))
 	  tmp = 0;
 #endif
 	if (tmp) {
 	  myscores[i] -= tmp;
 	  changed = 1;
 	}
       }
     }

     if (changed) {
       if (weight_balance > 0)
 	adapt_ny++;
       else
 	adapt_yn++;
       return 1;
     } else
       return 0;
   }
 }
 #endif

 /*
  * This mutation function tosses a weighted coin for each allele.
  * If the allele is to be mutated, then the way it's mutated is to regress it
  * toward the mean of the population for that allele, then add a little
  * gaussian noise.
  *
  * [To the _mean_? Weird... - Allen]
  *
  * Aug 21 2002 jm: we now use ranges and allow PGA to take care of it, if
  * USE_SCORE_RANGES is defined.
  *
  * Modified for variable mutations - 9/26/02 - Allen
  *
  */
 #if defined(USE_VARIABLE_MUTATIONS) || (! defined(USE_SCORE_RANGES))
 /*
  * PGAPack mutation callback for individual p in population pop; mr is the
  * per-gene mutation probability.  Returns the value PGAPack receives as the
  * mutation count (see the "Hack" notes below for when it is forced to 0).
  *
  * USE_VARIABLE_MUTATIONS variant: every mutable gene i carries its own
  * mutation standard deviation at chrom[i+num_scores].  A selected gene first
  * has that SD rescaled (log-normal step, clamped to
  * [min_mutation_noise, (max-min)/4]) and is then moved by a non-zero
  * Gaussian step drawn with that SD, kept inside (min_score, max_score).
  * The individual is re-evaluated; if it got worse and the run is in its
  * "2nd phase" (iters_same_passed), smaller steps in the same direction are
  * tried and, under LAMARCK, adapt() may be invoked.  num_better_same and
  * num_worse are tallied for showSummary().
  *
  * Other variant: a selected gene is regressed toward the mean of the rest
  * of the population, capped, and replaced by a Gaussian sample around that
  * value.
  *
  * Fix relative to the previous version: without USE_SCORE_RANGES the bounds
  * were assigned the wrong way round (min_score = SCORE_CAP,
  * max_score = NEG_SCORE_CAP), which inverted the clamp and produced a
  * negative SD ceiling.
  */
 int myMutation(PGAContext *ctx, int p, int pop, double mr) {
     int         count=0;
     int i;
 # ifdef USE_VARIABLE_MUTATIONS
     double *myscores;
     double old_evaluation,new_evaluation,min_score,max_score;

     myscores = PGAGetIndividual(ctx, p, pop)->chrom;
     /* Fitness before mutating: reuse the cached value when valid. */
     if (PGAGetEvaluationUpToDateFlag(ctx, p, pop))
       old_evaluation = PGAGetEvaluation(ctx, p, pop);
     else {
       old_evaluation = evaluate(ctx, p, pop);
       PGASetEvaluation(ctx, p, pop, old_evaluation);
       PGASetEvaluationUpToDateFlag(ctx, p, pop, PGA_TRUE);
     }

     for (i=0; i<num_mutable; i++) {
       /* tmp_scores[i][0] is the step applied to gene i (0 = untouched). */
       tmp_scores[i][0] = 0;
       if (PGARandomFlip(ctx, mr)) {
 #ifdef USE_SCORE_RANGES
 	min_score = range_lo[i];
 	max_score = range_hi[i];
 #else
 	min_score = NEG_SCORE_CAP;
 	max_score = SCORE_CAP;
 #endif
 	if (myscores[i] > max_score)
 	  myscores[i] = max_score;
 	else if (myscores[i] < min_score)
 	  myscores[i] = min_score;

 	/* Upper limit for this gene's SD: a quarter of its range. */
 	tmp_scores[i][1] = (max_score - min_score)/4;

 	/* Self-adapt the SD multiplicatively. */
 	myscores[i+num_scores] *=
 	  pow(2,(PGARandomGaussian(ctx,0,mutation_noise*2)));

 	if (myscores[i+num_scores] < min_mutation_noise)
 	  myscores[i+num_scores] = min_mutation_noise;
 	else if (myscores[i+num_scores] > tmp_scores[i][1])
 	  myscores[i+num_scores] = tmp_scores[i][1];

 	/* Draw until a non-zero step is obtained. */
 	while (! tmp_scores[i][0]) {
 	  tmp_scores[i][0] = PGARandomGaussian(ctx,0,
 					       myscores[i+num_scores]);
 #ifdef USE_SCORE_RANGES
 	  if (((double)(myscores[i] + tmp_scores[i][0]) >= max_score) ||
 	      ((double)(myscores[i] + tmp_scores[i][0]) <= min_score)) {
 	    if (myscores[i+num_scores] > mutation_noise) {
 	      /* Out of range with a large SD: shrink the SD toward
 	       * mutation_noise and redraw. */
 	      myscores[i+num_scores] =
 		(myscores[i+num_scores] + mutation_noise)/2;
 	      tmp_scores[i][0] = 0;
 	    } else if ((double)(myscores[i] + tmp_scores[i][0]) >= max_score) {
 	      /* Otherwise land just inside the violated bound. */
 	      tmp_scores[i][0] = max_score - myscores[i] - (double)0.001;
 	      break;
 	    } else {
 	      tmp_scores[i][0] = min_score - myscores[i] + (double)0.001;
 	      break;
 	    }
 	  }
 #endif
 	}
 	myscores[i] += tmp_scores[i][0];
 	count++;
       }
     }

     if (count > 0) {
       var_mutated++;
       new_evaluation = evaluate(ctx, p, pop);

       if (new_evaluation > old_evaluation) {
 	/* Did previous try go too far away? */
 	if (iters_same_passed) { /* in 2nd phase */
 	  /* Replace each step by a fresh one in the same direction, drawn
 	   * with at most mutation_noise as SD.  count now counts the genes
 	   * whose SD was above mutation_noise. */
 	  count = 0;
 	  for (i=0; i<num_mutable; i++) {
 	    if (tmp_scores[i][0]) {
 	      if (myscores[i+num_scores] > mutation_noise) {
 		tmp_scores[i][1] = PGARandomGaussian(ctx,0,mutation_noise);
 		count++;
 	      } else
 		tmp_scores[i][1] =
 		  PGARandomGaussian(ctx,0,myscores[i+num_scores]);
 	      tmp_scores[i][1] = copysign(tmp_scores[i][1],tmp_scores[i][0]);
 #ifdef USE_SCORE_RANGES
               if ((double)(myscores[i] + tmp_scores[i][1]
                            - tmp_scores[i][0]) >= range_hi[i]) {
                 tmp_scores[i][1] = range_hi[i] - myscores[i] +
                   tmp_scores[i][0] - (double)0.001;
               } else if ((double)(myscores[i] + tmp_scores[i][1]
                                   - tmp_scores[i][0]) <= range_lo[i]) {
                 tmp_scores[i][1] = range_lo[i] - myscores[i] +
                   tmp_scores[i][0] + (double)0.001;
               }
 #endif
 	      /* Undo the first step and apply the replacement. */
 	      myscores[i] += tmp_scores[i][1] - tmp_scores[i][0];
 	    }
 	  }

 	  if (count > 0) {
 	    num_mutated++;
 	    new_evaluation = evaluate(ctx, p, pop);
 	    if (PGAGetNoDuplicatesFlag(ctx) == PGA_FALSE) {
 	      /* Hack to avoid redoing evaluation without need - Allen */
 	      count = 0;
 	      PGASetEvaluation(ctx, p, pop, new_evaluation);
 	      PGASetEvaluationUpToDateFlag(ctx, p, pop, PGA_TRUE);
 	    }
 	    if (new_evaluation <= old_evaluation) {
 	      /* Previous try went too far away */
 	      if (mr < base_mutation_rate)
 		num_better_same++;
 	      /* Pull the SD toward mutation_noise for genes where the
 	       * smaller step was the one that worked. */
 	      for (i=0; i<num_mutable; i++) {
 		if (tmp_scores[i][0] &&
 		    (myscores[i+num_scores] > mutation_noise)
 		    && (fabs(tmp_scores[i][1]) < fabs(tmp_scores[i][0])))
 		  myscores[i+num_scores] =
 		    (myscores[i+num_scores] + mutation_noise)/2;
 	      }
 	    } else {
 #ifdef LAMARCK
 	      /* Still worse: try a single adapt() step, then the repeating
 	       * variant if the first one changed something. */
 	      if (mr < base_mutation_rate) {
 		count = adapt(ctx,p,pop,1,1,0);
 		if (count) {
 		  count = adapt(ctx,p,pop,0,2,1);
 		  if (count)
 		    new_evaluation = PGAGetEvaluation(ctx, p, pop);
 		  else
 		    new_evaluation = evaluate(ctx, p, pop);
 		  if (new_evaluation > old_evaluation)
 		    num_worse++;
 		  else
 		    num_better_same++; /* only had to adapt once */

 		  if (PGAGetNoDuplicatesFlag(ctx) == PGA_FALSE) {
 		    /* Hack to avoid redoing evaluation without need - Allen */
 		    count = 0;
 		    PGASetEvaluation(ctx, p, pop, new_evaluation);
 		    PGASetEvaluationUpToDateFlag(ctx, p, pop, PGA_TRUE);
 		  }
 		} else
 		  num_worse++;
 	      } else
 #endif
 		num_worse++;
 	    }
 	  } else {		/* didn't decrease mutation SD */
 #ifdef LAMARCK
 	    if (mr < base_mutation_rate) {
 	      count = adapt(ctx,p,pop,0,1,0);
 	      new_evaluation = evaluate(ctx, p, pop);
 	    }
 #endif
 	    if (new_evaluation > old_evaluation) {
 #ifdef LAMARCK
 	      if ((mr < base_mutation_rate) && count) {
 		count = adapt(ctx,p,pop,1,2,1);
 		if (count) {
 		  new_evaluation = PGAGetEvaluation(ctx, p, pop);
 		  if (new_evaluation > old_evaluation)
 		    num_worse++;
 		  else
 		    num_better_same++;
 		} else
 		  num_worse++;
 	      } else
 #endif
 		num_worse++;
 	    } else
 	      num_better_same++;
 	    if (PGAGetNoDuplicatesFlag(ctx) == PGA_FALSE) {
 	      /* Hack to avoid redoing evaluation without need - Allen */
 	      count = 0;
 	      PGASetEvaluation(ctx, p, pop, new_evaluation);
 	      PGASetEvaluationUpToDateFlag(ctx, p, pop, PGA_TRUE);
 	    }
 	  }

 	  /* With duplicate checking on, always report a change. */
 	  if ((! count) &&
 	      (PGAGetNoDuplicatesFlag(ctx) == PGA_TRUE))
 	    count++;

 	} else
 	  num_worse++;
       } else {
 	if (PGAGetNoDuplicatesFlag(ctx) == PGA_FALSE) {
 	  /* Hack to avoid redoing evaluation without need - Allen */
 	  count = 0;
 	  PGASetEvaluation(ctx, p, pop, new_evaluation);
 	  PGASetEvaluationUpToDateFlag(ctx, p, pop, PGA_TRUE);
 	}
 	num_better_same++;
       }
     }
 #ifdef LAMARCK
     /* No gene was selected for mutation: fall back to adapt(). */
     else if (mr < base_mutation_rate) {
       count = adapt(ctx,p,pop,1,2,0);
       if (! count)
 	num_better_same++;	/* adapt not working, use mutation */
     }
 #endif

 # else /* USE_VARIABLE_MUTATIONS */
     int j;

     for (i=0; i<num_mutable; i++)
     {
       if(PGARandomFlip(ctx, mr))
       {
 	double gene_sum=0.0;
 	/* Find the mean */
 	for(j=0; j<pop_size; j++) {
 	  if(p!=j)
 	    gene_sum += PGAGetRealAllele(ctx, j, pop, i);
 	}
 	gene_sum /= (double)(pop_size-1);
 	/* Regress towards it... */
 	gene_sum = (1.0-regression_coefficient)*gene_sum+regression_coefficient*PGAGetRealAllele(ctx, p, pop, i);
 	/* Set this gene in this allele to be the average, plus some gaussian noise */
 	if(gene_sum > SCORE_CAP)
 	  gene_sum = SCORE_CAP;
 	else if(gene_sum < NEG_SCORE_CAP)
 	  gene_sum = NEG_SCORE_CAP;
 	PGASetRealAllele(ctx, p, pop, i,
 			 PGARandomGaussian(ctx, gene_sum, mutation_noise));
 	count++;
       }
     }
 # endif /* !USE_VARIABLE_MUTATIONS */
     return count;
 }
 #endif /* USE_VARIABLE_MUTATIONS || !USE_SCORE_RANGES */

 /*
  * Write the classification summary of the most recent scoring pass to fp:
  * counts and percentages for the four outcome classes, average scores and
  * the corpus total.  Reads only the global counters and score sums.
  */
 void dump(FILE *fp)
 {
   /* Denominators stay float so every ratio is computed as int/float before
    * being scaled by 100.0. */
   const float total_msgs = (float) num_tests;
   const float ham_msgs = (float) num_ham;
   const float spam_msgs = (float) num_spam;

   fprintf (fp,"\n# SUMMARY for threshold %3.1f:\n", threshold);

   fprintf (fp,
 	   "# Correctly non-spam: %6d  %4.3f%%  (%4.3f%% of non-spam corpus)\n",
 	   ga_nn,
 	   (ga_nn / total_msgs) * 100.0,
 	   (ga_nn / ham_msgs) * 100.0);

   fprintf (fp,
 	   "# Correctly spam:     %6d  %4.3f%%  (%4.3f%% of spam corpus)\n",
 	   ga_yy,
 	   (ga_yy / total_msgs) * 100.0,
 	   (ga_yy / spam_msgs) * 100.0);

   fprintf (fp,
 	   "# False positives:    %6d  %4.3f%%  (%4.3f%% of nonspam, %6.0f weighted)\n",
 	   ga_ny,
 	   (ga_ny / total_msgs) * 100.0,
 	   (ga_ny / ham_msgs) * 100.0,
 	   nyscore*nybias);

   fprintf (fp,
 	   "# False negatives:    %6d  %4.3f%%  (%4.3f%% of spam, %6.0f weighted)\n",
 	   ga_yn,
 	   (ga_yn / total_msgs) * 100.0,
 	   (ga_yn / spam_msgs) * 100.0,
 	   ynscore);

   /* Averages per class; a class with zero messages divides by zero here,
    * exactly as before. */
   fprintf (fp,"# Average score for spam:  %3.1f    nonspam: %3.1f\n",
 	   (ynscore+yyscore)/((double)(ga_yn+ga_yy)),
 	   (nyscore+nnscore)/((double)(ga_nn+ga_ny)));
   fprintf (fp,"# Average for false-pos:   %3.1f  false-neg: %3.1f\n",
 	   (nyscore/(double)ga_ny),
 	   (ynscore/(double)ga_yn));

   fprintf (fp,"# TOTAL:              %6d  %3.2f%%\n\n", num_tests, 100.0);
 }

 /*****************************************************************************
  * WriteString sends a visual representation of the chromosome out to fp     *
  *****************************************************************************/
 /*
  * PGAPack print-string callback: evaluates individual p of population pop,
  * writes the summary via dump() and then one "score NAME VALUE" line per
  * gene, followed by a blank line.  With USE_MPI only rank 0 does anything.
  */
 void WriteString(PGAContext *ctx, FILE *fp, int p, int pop)
 {
   int gene;

 #ifdef USE_MPI
   int my_rank;

   MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
   if (my_rank != 0)
     return;
 #endif

   /* Refresh the global counters for this individual before dumping. */
   evaluate(ctx,p,pop);
   dump(fp);

   for (gene = 0; gene < num_scores; gene++)
     fprintf(fp,"score %-30s %2.3f\n",
 	    score_names[gene], PGAGetRealAllele(ctx, p, pop, gene));

   fputs("\n", fp);
 }

 #ifdef USE_VARIABLE_MUTATIONS
 /* Best evaluation recorded at the previous save point; showSummary()
  * compares it with the current best to detect slow progress.  0 means no
  * value has been recorded yet. */
 double last_best = 0;
 #endif

 /*
  * PGAPack end-of-generation callback (rank 0 only under USE_MPI).
  *
  * Every save_every_n_generations iterations it writes the best individual
  * of the old population to "garescorer.scores", prints run statistics and
  * the iteration rate since the previous report, and dumps the summary to
  * stdout.  On other iterations that are multiples of 5 it prints a
  * single progress digit.  With USE_VARIABLE_MUTATIONS it then adapts
  * mutation_rate from the num_better_same/num_worse tallies and resets them.
  *
  * Fixes relative to the previous version:
  *  - the timing baseline now takes tv_usec from the new timestamp (it used
  *    to assign t0.tv_usec to itself, skewing every later rate);
  *  - a failed fopen() is reported with perror() and the save is skipped
  *    instead of passing a NULL stream to WriteString()/fclose().
  */
 void showSummary(PGAContext *ctx)
 {
 #ifdef USE_MPI
   int rank;

   MPI_Comm_rank(MPI_COMM_WORLD, &rank);

   if(0 == rank)
   {
 #endif
     if(0 == PGAGetGAIterValue(ctx) % save_every_n_generations)
     {
       int genome = PGAGetBestIndex(ctx,PGA_OLDPOP);
       FILE *scores_file = NULL;
       /* Refresh the global counters for the best individual; dump(stdout)
        * below relies on them even if the file cannot be written. */
       (void)evaluate(ctx, genome, PGA_OLDPOP);
       PGAGetEvaluation(ctx, genome, PGA_OLDPOP);
       scores_file = fopen("garescorer.scores","w");
       if (scores_file != NULL) {
 	WriteString(ctx, scores_file, genome, PGA_OLDPOP);
 	fclose(scores_file);
       } else {
 	perror("garescorer.scores");
       }
 #ifdef USE_VARIABLE_MUTATIONS
       if (! justCount) {
 	printf("\nPop size, replacement: %d %d\n",
 	       pop_size, replace_num);
         /*
 	printf("\nMutations (rate, good, bad, var, num): %3.7f %d %d %d %d\n",
 	       mutation_rate, num_better_same, num_worse, var_mutated,
 	       num_mutated);
         */
 	var_mutated = 0;
 	num_mutated = 0;
 	/* Enter the "2nd phase" once the best value has changed by less
 	 * than 0.1% since the previous save point. */
 	if (! iters_same_passed) {
 	  if (! last_best)
 	    last_best = ctx->rep.Best;
 	  else if ((last_best*0.999) < ctx->rep.Best) /* too slow! */
 	    iters_same_passed = 1;
 	  else
 	    last_best = ctx->rep.Best;
 	}
 #ifdef LAMARCK
 	printf("\n");
 	printf("Adapt (t, fneg, fneg_add, fpos, fpos_add): %d %d %d %d %d\n",
 	       adapt_times,adapt_yn,adapt_fn_add,adapt_ny,adapt_fp_add);
 	printf("Adapt (over, cross, repeat): %d %d %d\n",
 	       adapt_overshot,adapt_crossover,adapt_repeat);
 	adapt_times = adapt_overshot = adapt_crossover = adapt_repeat =
 	  adapt_yn = adapt_ny = adapt_fn_add = adapt_fp_add = 0;
 #endif
       }
 #endif

       { struct timeval t1;
 	if (gettimeofday(&t1, (struct timezone *)NULL) == 0) {
 	  double dt = (t1.tv_sec + t1.tv_usec * 1.0e-6) -
 	              (t0.tv_sec + t0.tv_usec * 1.0e-6);
 	  int iter = PGAGetGAIterValue(ctx);
 	  if (dt < 1e-6) dt = 1e-6;
 	  printf("Performance: %.3f iterations/s, iteration no. %d\n",
 	         (iter-t0_iter)/dt, iter);
 	  /* Start the next measurement interval at this timestamp. */
 	  t0 = t1;
 	  t0_iter = iter;
 	}
       }

       dump(stdout);

     }
     else if(0 == PGAGetGAIterValue(ctx) % 5)
     {
       printf("%d",(PGAGetGAIterValue(ctx)/5)%10);
     }

 #ifdef USE_VARIABLE_MUTATIONS
     if (! justCount) {
       /* Raise the rate while at least one in five mutations is not worse;
        * otherwise lower it, but below base_mutation_rate only in (or when
        * entering) the 2nd phase. */
       if ((num_better_same*4) >= num_worse)
 	mutation_rate /= mutation_rate_modifier;
       else if ((num_better_same*4) < num_worse) {
 	if ((mutation_rate > base_mutation_rate) || iters_same_passed)
 	  mutation_rate *= mutation_rate_modifier;
 	else if (ctx->ga.ItersOfSame >= (no_change_val/2)) {
 	  iters_same_passed = 1;
 	  mutation_rate *= mutation_rate_modifier;
 	  printf("\nMutation rate %3.7f (ItersOfSame %d)\n",
 		 mutation_rate,ctx->ga.ItersOfSame);
 	} else
 	  return;	/* leave rate and tallies untouched */
       }

       /* Clamp to [0.05/sqrt(num_mutable), mutation_rate_modifier]. */
       if (mutation_rate > mutation_rate_modifier) {
 	mutation_rate = mutation_rate_modifier;
 	printf("\nMutation rate max: %3.7f\n",mutation_rate);
       } else if (mutation_rate < 0.05/sqrt(num_mutable)) {
 	mutation_rate = 0.05/sqrt(num_mutable);
 	printf("\nMutation rate min: %3.7f\n",mutation_rate);
       }

       PGASetMutationProb(ctx, mutation_rate);

       num_better_same = 0;
       num_worse = 0;
     }
 #endif

 #ifdef USE_MPI
   }
 #endif
 }

 #ifdef USE_VARIABLE_MUTATIONS
 /*****************************************************************************
  * CreateString allocates and initializes a chromosome.  If InitFlag is      *
  * set to true, then it will initialize the chromosome using the best known  *
  * values; otherwise, it sets each double to 0.0 and each int to 0.          *
  *****************************************************************************/
 /*
  * PGAPack create-string callback: allocates the chromosome of individual p
  * in population pop as 2*num_scores doubles.  The first half holds the
  * scores, the second half the per-gene mutation standard deviations.
  * With InitFlag set the scores start at bestscores[] and every SD at
  * mutation_noise; otherwise everything is zeroed.  Exits with status 1 if
  * the allocation fails.
  */
 void CreateString(PGAContext *ctx, int p, int pop, int InitFlag) {
     PGAIndividual *indiv = PGAGetIndividual(ctx, p, pop);
     double *genes;
     int k;

     indiv->chrom = malloc(sizeof(double)*num_scores*2);
     if (indiv->chrom == NULL) {
         fprintf(stderr, "No room for new->chrom");
         exit(1);
     }
     genes = indiv->chrom;

     if (!InitFlag) {
       for (k = 0; k < num_scores*2; k++)
 	genes[k] = 0.0;
       return;
     }

     for (k = 0; k < num_scores; k++) {
       genes[k] = bestscores[k];
       genes[k+num_scores] = mutation_noise;
     }
 }


 /*****************************************************************************
  * Crossover implements uniform crossover on the chromosome.                 *
  *****************************************************************************/
 /* Choose the noise value passed on for a gene taken from one parent.
  * `own` is that parent's noise value, `other` is the other parent's value
  * for the same gene, and `noise` is the reference mutation noise.  If `own`
  * is further from `noise` than from `other`, the two parents are averaged;
  * otherwise `own` is averaged with `noise`. */
 static double crossover_blend_noise(double own, double other, double noise) {
     if (fabs(own - noise) > fabs(own - other))
         return (own + other)/2;
     return (own + noise)/2;
 }

 /* Uniform crossover of parents (p1, pop1) and (p2, pop1) into children
  * (t1, pop2) and (t2, pop2).
  *
  * Each chromosome holds num_scores gene values followed by num_scores
  * per-gene noise values.  For the first num_mutable genes a random flip with
  * the uniform crossover probability decides whether the children swap
  * parents for that gene; the matching noise values are copied, blended or
  * averaged depending on that probability and on num_mutated.  Genes from
  * num_mutable up to num_scores are copied straight through with their noise
  * values (child1 from parent1, child2 from parent2).
  *
  * With LAMARCK defined, one randomly chosen child may additionally be
  * evaluated right away and passed to adapt(). */
 void Crossover(PGAContext *ctx, int p1, int p2, int pop1, int t1, int t2,
                int pop2) {
     int g;
     double *mom, *dad, *kid1, *kid2;
     double *mom_noise, *dad_noise;
     double swap_prob;
     int keep_noise_grouped;

     mom  = PGAGetIndividual(ctx, p1, pop1)->chrom;
     dad  = PGAGetIndividual(ctx, p2, pop1)->chrom;
     kid1 = PGAGetIndividual(ctx, t1, pop2)->chrom;
     kid2 = PGAGetIndividual(ctx, t2, pop2)->chrom;

     /* The noise values live in the second half of each chromosome. */
     mom_noise = mom + num_scores;
     dad_noise = dad + num_scores;

     swap_prob = PGAGetUniformCrossoverProb(ctx);
     /* Below 0.5, unswapped genes keep their own parent's noise untouched. */
     keep_noise_grouped = (swap_prob < 0.5);

     for (g = 0; g < num_mutable; g++) {
         /* heir_mom receives mom's gene, heir_dad receives dad's gene. */
         double *heir_mom, *heir_dad;
         int swapped = PGARandomFlip(ctx, swap_prob);

         if (swapped) {
             heir_mom = kid2;
             heir_dad = kid1;
         } else {
             heir_mom = kid1;
             heir_dad = kid2;
         }

         heir_mom[g] = mom[g];
         heir_dad[g] = dad[g];

         if (!swapped && keep_noise_grouped) {
             heir_mom[g+num_scores] = mom_noise[g];
             heir_dad[g+num_scores] = dad_noise[g];
         } else if (num_mutated > 0) {
             heir_mom[g+num_scores] =
                 crossover_blend_noise(mom_noise[g], dad_noise[g],
                                       mutation_noise);
             heir_dad[g+num_scores] =
                 crossover_blend_noise(dad_noise[g], mom_noise[g],
                                       mutation_noise);
         } else {
             /* Intermediate recombination, because mutation uses
              * exponential multiplication - Allen */
             heir_mom[g+num_scores] = heir_dad[g+num_scores] =
                 (mom_noise[g] + dad_noise[g])/2;
         }
     }

     /* Genes from num_mutable onwards are inherited unchanged. */
     for (g = num_mutable; g < num_scores; g++) {
         kid1[g] = mom[g];
         kid2[g] = dad[g];
         kid1[g+num_scores] = mom_noise[g];
         kid2[g+num_scores] = dad_noise[g];
     }

 #ifdef LAMARCK
     if ((PGAGetMutationAndCrossoverFlag(ctx) == PGA_FALSE) &&
         (mutation_rate < base_mutation_rate) &&
         (PGAGetEvaluationUpToDateFlag(ctx, p1, pop1) == PGA_TRUE) &&
         (PGAGetEvaluationUpToDateFlag(ctx, p2, pop1) == PGA_TRUE)) {
         double mom_eval = PGAGetEvaluation(ctx, p1, pop1);
         double dad_eval = PGAGetEvaluation(ctx, p2, pop1);
         double probe_eval;
         int probe, sibling;

         /* Pick one child at random to evaluate; the other is its sibling. */
         if (PGARandomFlip(ctx, (double)0.5)) {
             probe = t1;
             sibling = t2;
         } else {
             probe = t2;
             sibling = t1;
         }

         probe_eval = evaluate(ctx, probe, pop2);

         if ((probe_eval > mom_eval) && (probe_eval > dad_eval)) {
             /* The child's evaluation exceeds both parents' (presumably
              * worse; confirm the optimization direction): hand it to
              * adapt() and add the return values to adapt_crossover. */
             if (PGARandomFlip(ctx,
                               (double)(mutation_rate/base_mutation_rate))) {
                 adapt_crossover += adapt(ctx, probe, pop2, 1, 2, 0);
             } else {                /* low mutation rate */
                 if (adapt(ctx, probe, pop2, 1, 1, 0))
                     adapt_crossover += adapt(ctx, probe, pop2, 0, 2, 1) + 1;
                 adapt_crossover += adapt(ctx, sibling, pop2, 0, 2, 0);
             }
         } else {
             /* Keep the evaluation just computed so it is not redone. */
             PGASetEvaluation(ctx, probe, pop2, probe_eval);
             PGASetEvaluationUpToDateFlag(ctx, probe, pop2, PGA_TRUE);
         }
     }

 #endif
 }


 /*****************************************************************************
  * CopyString makes a copy of the chromosome at (p1, pop1) and puts it at    *
  * (p2, pop2).                                                               *
  *****************************************************************************/
 void CopyString(PGAContext *ctx, int p1, int pop1, int p2, int pop2) {
     /* Byte-copy all 2*num_scores doubles of the chromosome from
      * (p1, pop1) into the chromosome of (p2, pop2). */
     const size_t nbytes = sizeof(double)*num_scores*2;
     PGAIndividual *src = PGAGetIndividual(ctx, p1, pop1);
     PGAIndividual *dst = PGAGetIndividual(ctx, p2, pop2);

     memcpy(dst->chrom, src->chrom, nbytes);
 }


 /*****************************************************************************
  * DuplicateString compares two chromosomes and returns 1 if they are the    *
  * same and 0 if they are different.                                         *
  *****************************************************************************/
 int DuplicateString(PGAContext *ctx, int p1, int pop1, int p2, int pop2) {
     /* Byte-wise comparison of the two chromosomes (2*num_scores doubles
      * each): returns 1 when every byte matches, 0 otherwise. */
     const void *lhs = PGAGetIndividual(ctx, p1, pop1)->chrom;
     const void *rhs = PGAGetIndividual(ctx, p2, pop2)->chrom;

     if (memcmp(lhs, rhs, sizeof(double)*num_scores*2) != 0)
         return 0;
     return 1;
 }

 /*****************************************************************************
  * BuildDT builds an MPI datatype for sending strings to other               *
  * processors.  Consult your favorite MPI manual for more information.       *
  *****************************************************************************/
 MPI_Datatype BuildDT(PGAContext *ctx, int p, int pop) {
   /*  Builds and commits an MPI struct datatype describing individual
    *  (p, pop) and returns its handle.
    *
    *  When USE_MPI is not defined no datatype is constructed, so nothing
    *  is committed and a zero handle is returned.  The commit used to sit
    *  outside the USE_MPI guard, where it was applied to an uninitialized
    *  handle that was then returned.
    */
   MPI_Datatype    DT_PGAIndividual = (MPI_Datatype)0;
 #ifdef USE_MPI
   int             counts[3];
   MPI_Aint        displs[3];
   MPI_Datatype    types[3];
   PGAIndividual  *P;

   P = PGAGetIndividual(ctx, p, pop);

   /*  Build the MPI datatype.  Every user defined function needs these.
    *  The first two entries are stuff that is internal to PGAPack, but
    *  the user still must include it.  See pgapack.h for details on the
    *  fields (under PGAIndividual).
    */

   /*  Two consecutive doubles starting at evalfunc.  */
   MPI_Address(&P->evalfunc, &displs[0]);
   counts[0] = 2;
   types[0]  = MPI_DOUBLE;

   /*  Next, we have an integer, evaluptodate.  */
   MPI_Address(&P->evaluptodate, &displs[1]);
   counts[1] = 1;
   types[1]  = MPI_INT;

   /*  Finally, we have the actual user-defined string: num_scores*2
    *  doubles, the same length CreateString allocates.  */
   MPI_Address(P->chrom, &displs[2]);
   counts[2] = num_scores*2;
   types[2]  = MPI_DOUBLE;

   MPI_Type_struct(3, counts, displs, types, &DT_PGAIndividual);
   /*  Only commit a datatype that was actually constructed above.  */
   MPI_Type_commit(&DT_PGAIndividual);
 #else
   (void)ctx;
   (void)p;
   (void)pop;
 #endif /* defined(USE_MPI) */
   return(DT_PGAIndividual);
 }
 #endif /* defined(USE_VARIABLE_MUTATIONS) */