blob: 8aa1c8a3576ede3d6d175c26789481600fac79ec [file] [log] [blame]
/* @COPYRIGHT */
/*
* This program uses PGAPack to do its GA stuff.
* ftp://ftp.mcs.anl.gov/pub/pgapack/pgapack.tar.Z
* I used this one instead of galib because it uses MPI
* to spread load around. It also seems like the API is a little
* cleaner.
*/
#include "pgapack.h"
#include <unistd.h>
#include <sys/time.h>
#include <math.h>
#include "tmp/scores.h"
#include "tmp/tests.h"
extern int num_spam, num_ham; /* in tmp/tests.h */
/* Use score ranges derived from hit-frequencies S/O ratio,
* and numbers of mails hit.
*/
#define USE_SCORE_RANGES
#define USE_VARIABLE_MUTATIONS
/* Lamarckian evolution? (Jean-Baptiste Lamarck) */
/* inheritance of acquired characters / soft inheritance / Lamarckism */
#define LAMARCK
double evaluate(PGAContext *, int, int);
int GetIntegerParameter(char *query);
void dump(FILE *);
void WriteString(PGAContext *ctx, FILE *fp, int p, int pop);
void showSummary(PGAContext *ctx);
#if defined(USE_VARIABLE_MUTATIONS) || (! defined(USE_SCORE_RANGES))
int myMutation(PGAContext *, int, int, double);
# ifdef LAMARCK
int adapt(PGAContext *, int, int, int, int,int);
# endif
#endif
#ifdef USE_VARIABLE_MUTATIONS
void CreateString (PGAContext *, int, int, int);
void Crossover (PGAContext *, int, int, int, int, int, int);
void CopyString (PGAContext *, int, int, int, int);
int DuplicateString (PGAContext *, int, int, int, int);
MPI_Datatype BuildDT (PGAContext *, int, int);
#endif
void dump(FILE *);
void WriteString(PGAContext *ctx, FILE *fp, int p, int pop);
void showSummary(PGAContext *ctx);
double evaluate_inner();
double threshold = 5.0;
double nybias = 10.0;
double fptarget = -1.0; /* -1 means unused */
int save_every_n_generations = 50;
int no_change_val = 300;
int pop_size = 50;
int replace_num = 33;
int maxiter = 30000;
struct timeval t0 = { 0, 0 };
int t0_iter = 0;
#ifdef USE_VARIABLE_MUTATIONS
double mutation_rate = 0.03;
double base_mutation_rate = 0.03;
#ifdef LAMARCK
int adapt_yn = 0;
int adapt_ny = 0;
#endif
double mutation_rate_modifier = 0.85;
int num_better_same = 0;
int num_worse = 0;
#else
const double mutation_rate = 0.03;
#endif
const double mutation_noise = 0.5;
#ifdef USE_VARIABLE_MUTATIONS
const double min_mutation_noise = 0.1;
#endif
const double regression_coefficient = 0.75;
#ifndef USE_SCORE_RANGES
const double SCORE_CAP = 4.0;
const double NEG_SCORE_CAP = -9.0;
#endif
#ifdef USE_VARIABLE_MUTATIONS
const double crossover_rate = 0.5;
#else
const double crossover_rate = 0.65;
#endif
int justCount = 0;
void usage()
{
#ifdef USE_MPI
int rank;
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
if(rank == 0) {
#endif
printf("usage: evolve [-s size] [args]\n"
"\n"
" -s size = population size (50 recommended)\n"
" -e num_epochs = number of epochs (generations) to run (30000 default)\n"
" -r replace = number of individuals to replace each generation (20 recommended)\n"
" -b nybias = bias towards false negatives (10.0 default)\n"
" -f fptarget = target FP percentage (alt fitness function, off by default)\n"
" -t threshold = threshold for spam/nonspam decision (5 default)\n"
"\n"
" -C = just count hits and exit, no evolution\n\n");
#ifdef USE_MPI
}
#endif
exit (30);
}
void init_data()
{
#ifdef USE_MPI
int rank;
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
if (rank == 0) {
#endif
loadtests();
loadscores();
nybias = nybias*((double)num_spam)/((double)num_ham);
#ifdef USE_VARIABLE_MUTATIONS
mutation_rate_modifier = (double)pow(mutation_rate_modifier,
(double)1/num_mutable);
#endif
#ifdef USE_MPI
}
MPI_Bcast(num_tests_hit, num_nondup, MPI_CHAR, 0, MPI_COMM_WORLD);
MPI_Bcast(&nybias, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);
MPI_Bcast(is_spam, num_nondup, MPI_CHAR, 0, MPI_COMM_WORLD);
MPI_Bcast(tests_hit, num_nondup*max_hits_per_msg, MPI_SHORT, 0,
MPI_COMM_WORLD);
#ifdef USE_VARIABLE_MUTATIONS
MPI_Bcast(&mutation_rate_modifier, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);
#endif
MPI_Bcast(is_mutable, num_scores, MPI_CHAR, 0, MPI_COMM_WORLD);
MPI_Bcast(range_lo, num_scores, MPI_DOUBLE, 0, MPI_COMM_WORLD);
MPI_Bcast(range_hi, num_scores, MPI_DOUBLE, 0, MPI_COMM_WORLD);
MPI_Bcast(bestscores, num_scores, MPI_DOUBLE, 0, MPI_COMM_WORLD);
MPI_Bcast(scores, num_scores, MPI_DOUBLE, 0, MPI_COMM_WORLD);
#endif
}
/* this is about 35% faster than calling PGAGetRealAllele() directly inside
* score_msg(), in my tests. */
void
load_scores_into_lookup(PGAContext *ctx, int p, int pop)
{
int i;
for (i = 0; i < num_mutable; i++) {
lookup[i] = PGAGetRealAllele(ctx, p, pop, i);
#ifdef LAMARCK
yn_hit[i] = ny_hit[i] = 0;
#endif
}
}
int main(int argc, char **argv) {
PGAContext *ctx;
int i,p;
int arg;
#ifdef USE_MPI
MPI_Init(&argc, &argv);
#endif
while ((arg = getopt (argc, argv, "b:r:s:e:t:f:C")) != -1) {
switch (arg) {
case 'b':
nybias = atof(optarg);
break;
case 'f':
fptarget = atof(optarg);
break;
case 't':
threshold = (double) atof(optarg);
break;
case 's':
pop_size = atoi(optarg);
break;
case 'e':
maxiter = atoi(optarg);
break;
case 'r':
replace_num = atoi(optarg);
break;
case 'C':
justCount = 1;
break;
case '?':
usage();
break;
}
}
init_data();
ctx = PGACreate(&argc, argv, PGA_DATATYPE_REAL, num_scores, PGA_MINIMIZE);
PGASetUserFunction(ctx, PGA_USERFUNCTION_PRINTSTRING,
(void *)WriteString);
PGASetUserFunction(ctx, PGA_USERFUNCTION_ENDOFGEN, (void *)showSummary);
/* use a tiny population - just want to get into the evaluate function */
if (justCount) {
pop_size = 2;
replace_num = 1;
}
PGASetPopSize(ctx, pop_size);
PGASetRealInitRange (ctx, range_lo, range_hi);
PGASetMutationBoundedFlag(ctx, PGA_FALSE);
PGASetNumReplaceValue(ctx, replace_num);
/* Defaults to this - Allen */
/* PGASetMutationOrCrossoverFlag(ctx, PGA_TRUE); */
if (justCount) { /* don't allow any mutation or crossover */
PGASetMutationType(ctx, PGA_MUTATION_CONSTANT);
PGASetRealInitRange (ctx, bestscores, bestscores);
PGASetCrossoverProb(ctx, 0.0);
for(i=0; i<num_scores; i++) {
for(p=0; p<pop_size; p++) {
/* just counting? score[i] = defaultscore[i] in that case */
PGASetRealAllele(ctx, p, PGA_NEWPOP, i, bestscores[i]);
}
}
} else {
#if (! defined(USE_SCORE_RANGES)) || defined(USE_VARIABLE_MUTATIONS)
PGASetUserFunction(ctx, PGA_USERFUNCTION_MUTATION, (void *)myMutation);
#else
PGASetMutationType(ctx, PGA_MUTATION_RANGE);
#endif
/* PGASetCrossoverType(ctx, PGA_CROSSOVER_ONEPT); */
PGASetCrossoverProb(ctx, crossover_rate);
#ifdef USE_VARIABLE_MUTATIONS
mutation_rate = 0.15/sqrt(num_mutable);
base_mutation_rate = mutation_rate;
PGASetMutationProb(ctx, mutation_rate);
PGASetUserFunction(ctx, PGA_USERFUNCTION_CROSSOVER,
(void *)Crossover);
PGASetUserFunction(ctx, PGA_USERFUNCTION_CREATESTRING,
(void *)CreateString);
PGASetUserFunction(ctx, PGA_USERFUNCTION_COPYSTRING,
(void *)CopyString);
PGASetUserFunction(ctx, PGA_USERFUNCTION_DUPLICATE,
(void *)DuplicateString);
PGASetUserFunction(ctx, PGA_USERFUNCTION_BUILDDATATYPE,
(void *)BuildDT);
#endif
}
PGASetPrintFrequencyValue(ctx,300);
PGASetPrintOptions(ctx, PGA_REPORT_AVERAGE);
PGASetStoppingRuleType(ctx, PGA_STOP_NOCHANGE);
PGASetMaxNoChangeValue(ctx, no_change_val);
PGASetMaxGAIterValue(ctx, maxiter);
PGASetUp(ctx);
#ifndef USE_VARIABLE_MUTATIONS
if (! justCount) {
/* Now initialize the scores */
for(i=0; i<num_scores; i++) {
for(p=0; p<pop_size; p++) {
#ifndef USE_SCORE_RANGES
if (is_mutable[i]) {
if(bestscores[i] > SCORE_CAP) bestscores[i] = SCORE_CAP;
else if(bestscores[i] < NEG_SCORE_CAP) bestscores[i] =
NEG_SCORE_CAP;
}
#endif
PGASetRealAllele(ctx, p, PGA_NEWPOP, i, bestscores[i]);
}
}
}
#endif /* ! USE_VARIABLE_MUTATIONS */
(void)gettimeofday(&t0, (struct timezone *)NULL);
PGARun(ctx, evaluate);
PGADestroy(ctx);
#ifdef USE_MPI
MPI_Finalize();
#endif
return(0);
}
int ga_yy,ga_yn,ga_ny,ga_nn;
#ifdef USE_VARIABLE_MUTATIONS
int num_mutated = 0;
int var_mutated = 0;
int iters_same_passed = 0;
#ifdef LAMARCK
int weight_balance;
int adapt_times = 0;
int adapt_crossover = 0;
int adapt_repeat = 0;
int adapt_overshot = 0;
int adapt_fp_add = 0;
int adapt_fn_add = 0;
#endif
#endif
double ynscore,nyscore,yyscore,nnscore;
double score_msg(PGAContext *ctx, int p, int pop, int i)
{
double msg_score = 0.0;
int j;
/* For every test the message hit on */
for(j=num_tests_hit[i]-1; j>=0; j--)
{
/* Up the message score by the allele for this test in the genome
* msg_score += PGAGetRealAllele(ctx, p, pop, tests_hit[i][j]); */
msg_score += lookup[tests_hit[i][j]];
}
msg_score += scores[i]; /* base from non-mutable */
/* Ok, now we know the score for this message.
* Let's see how this genome did... */
if(is_spam[i])
{
if(msg_score >= threshold)
{
/* Good positive */
ga_yy += tests_count[i];
yyscore += msg_score*tests_count[i];
/* Each true positive means yyscore += at least 5 */
}
else
{
/* False negative */
ga_yn += tests_count[i];
ynscore += msg_score*tests_count[i];
/* Each false negative means that ynscore += less than 5 */
#ifdef LAMARCK
for(j=num_tests_hit[i]-1; j>=0; j--)
yn_hit[tests_hit[i][j]] = 1;
#endif
}
}
else
{
if(msg_score >= threshold)
{
/* False positive */
ga_ny += tests_count[i];
nyscore += msg_score*tests_count[i];
/* Each false positive means nyscore += more than 5 */
#ifdef LAMARCK
for(j=num_tests_hit[i]-1; j>=0; j--)
ny_hit[tests_hit[i][j]] = 1;
#endif
}
else
{
/* Good negative */
ga_nn += tests_count[i];
nnscore += msg_score*tests_count[i];
/* Each good negative means nnscore += less than 5 */
}
}
return msg_score*tests_count[i];
}
double evaluate(PGAContext *ctx, int p, int pop)
{
double tot_score = 0.0;
int i;
yyscore = ynscore = nyscore = nnscore = 0.0;
ga_yy=ga_yn=ga_ny=ga_nn=0;
load_scores_into_lookup(ctx, p, pop);
/* For every message */
for (i=num_nondup-1; i>=0; i--)
{
tot_score += score_msg(ctx,p,pop,i);
}
if (justCount) {
dump(stdout);
exit (0);
}
return evaluate_inner();
}
/* So can figure out how would evaluate without above - Allen */
double evaluate_inner() {
double dist_from_target_fp_rate_multiplier;
double ynweight,nyweight;
/* just count how far they were from the threshold, in each case */
ynweight = (ga_yn * threshold) - ynscore;
nyweight = nyscore - (ga_ny * threshold);
#ifdef LAMARCK
if (ynweight > (nyweight*nybias))
weight_balance = -1;
else if (ynweight < (nyweight*nybias))
weight_balance = 1;
else
weight_balance = 0;
#endif
if (fptarget >= 0.0) {
/* abs((FP rate as percentage) - (target FP rate)) */
dist_from_target_fp_rate_multiplier =
fabs(((ga_ny / (float) num_ham) * 100.0) - fptarget);
/* now ensure it's >= 1.0 and a large multiplier */
dist_from_target_fp_rate_multiplier =
(dist_from_target_fp_rate_multiplier * 10) + 1.0;
/* criteria, in order of priority: FP/FN rate, in number of messages;
* distance from target FP rate; then the distance of FP and FN scores
* from the threshold (as the least important criterion) */
return ((100 * (ga_yn + ga_ny)) * dist_from_target_fp_rate_multiplier)
+ (ynweight + nyweight*nybias);
} else {
return ynweight + /* all FNs' points from threshold */
nyweight*nybias; /* all FPs' points from threshold */
}
}
#ifdef LAMARCK
int adapt(PGAContext *ctx, int p, int pop, int done_eval, int threshold,
int repeat) {
double *myscores;
int i;
int changed = 0;
double tmp,old_evaluation,new_evaluation;
if (justCount) {
return 0;
}
adapt_times++;
if (done_eval && PGAGetEvaluationUpToDateFlag(ctx, p, pop))
old_evaluation = PGAGetEvaluation(ctx, p, pop);
else {
old_evaluation = evaluate(ctx, p, pop);
PGASetEvaluation(ctx, p, pop, old_evaluation);
PGASetEvaluationUpToDateFlag(ctx, p, pop, PGA_TRUE);
}
if ((double)ga_yn > ((double)ga_ny*nybias))
weight_balance--;
else if ((double)ga_yn < ((double)ga_ny*nybias))
weight_balance++;
if ((weight_balance < (threshold-1)) &&
(weight_balance > -threshold))
return 0;
myscores = PGAGetIndividual(ctx, p, pop)->chrom;
if (repeat) {
for (i = 0; i < num_mutable; i++) {
if ((yn_hit[i] && (weight_balance < 0)) ||
(ny_hit[i] && (weight_balance > 0))) {
if (((weight_balance < 0) &&
#ifdef USE_SCORE_RANGES
(myscores[i] < range_hi[i]) &&
#endif
(myscores[i] < -(double)0.01)) ||
((weight_balance > 0) &&
#ifdef USE_SCORE_RANGES
(myscores[i] > range_lo[i]) &&
#endif
(myscores[i] > (double)0.01))) {
tmp_scores[i][0] = (double)0.001*rint(myscores[i]); /* reducing */
#ifdef USE_SCORE_RANGES
if (((myscores[i] < -(double)0.01) && ((myscores[i] - tmp_scores[i][0]) > range_hi[i])) ||
((myscores[i] > (double)0.01) && ((myscores[i] - tmp_scores[i][0]) < range_lo[i]))) {
tmp_scores[i][0] = 0;
}
#endif
if (tmp_scores[i][0]) {
changed = 1;
lookup[i] = 0;
}
} else
tmp_scores[i][0] = 0;
} else
tmp_scores[i][0] = 0;
}
if (! changed) /* if can't reduce, don't do anything - safe */
return 0;
/* For every message */
for (i=num_nondup-1; i>=0; i--) {
tmp_total[i] = scores[i];
scores[i] =
score_msg(ctx,p,pop,i)/tests_count[i]; /* score sans ones modifying */
}
for (i = 0; i < num_mutable; i++) {
if (tmp_scores[i][0]) {
lookup[i] = myscores[i];
tmp_scores[i][1] = 1;
if (weight_balance < 0) {
yn_hit[i] = 1;
ny_hit[i] = 0;
} else {
ny_hit[i] = 1;
yn_hit[i] = 0;
}
} else {
lookup[i] = 0;
tmp_scores[i][1] = 0;
yn_hit[i] = ny_hit[i] = 0;
}
}
new_evaluation = old_evaluation; /* avoid a warning */
while (1) {
changed = 0;
for (i = 0; i < num_mutable; i++) {
if (((tmp_scores[i][0] < 0) && yn_hit[i] && /* going up */
#ifdef USE_SCORE_RANGES
((lookup[i] - tmp_scores[i][0]) < range_hi[i]) &&
#endif
(weight_balance < 0) && (lookup[i] < -(double)0.01)) ||
((tmp_scores[i][0] > 0) && ny_hit[i] && /* going down */
#ifdef USE_SCORE_RANGES
((lookup[i] - tmp_scores[i][0]) > range_lo[i]) &&
#endif
(weight_balance > 0) &&
(lookup[i] > (double)0.01))) {
lookup[i] -= tmp_scores[i][0];
changed = 1;
} else
tmp_scores[i][0] = 0;
yn_hit[i] = ny_hit[i] = 0;
}
if (changed) {
if (weight_balance > 0)
adapt_ny++;
else
adapt_yn++;
adapt_repeat++;
} else
break;
yyscore = ynscore = nyscore = nnscore = 0.0;
ga_yy=ga_yn=ga_ny=ga_nn=0;
for (i=num_nondup-1; i>=0; i--)
(void)score_msg(ctx,p,pop,i);
new_evaluation = evaluate_inner();
if (new_evaluation > old_evaluation) {
for (i = 0; i < num_mutable; i++) {
if (tmp_scores[i][0])
lookup[i] += tmp_scores[i][0];
}
new_evaluation = old_evaluation;
adapt_overshot++;
break;
} else
old_evaluation = new_evaluation;
if ((double)ga_yn > ((double)ga_ny*nybias))
weight_balance--;
else if ((double)ga_yn < ((double)ga_ny*nybias))
weight_balance++;
if ((weight_balance < (threshold-1)) &&
(weight_balance > -threshold))
break;
}
for (i=num_nondup-1; i>=0; i--)
scores[i] = tmp_total[i];
for (i=0; i < num_mutable; i++) {
if (tmp_scores[i][1])
myscores[i] = lookup[i];
}
PGASetEvaluation(ctx, p, pop, new_evaluation);
PGASetEvaluationUpToDateFlag(ctx, p, pop, PGA_TRUE);
return 1;
} else {
for (i = 0; i < num_mutable; i++) {
if ((yn_hit[i] && (weight_balance < 0)) ||
(ny_hit[i] && (weight_balance > 0))) {
tmp = (double)0.001*rint(myscores[i]);
if (! tmp) {
if (myscores[i] > (double)0.01)
tmp = (double)0.001;
else if (myscores[i] < -(double)0.01)
tmp = -(double)0.001;
}
#ifdef USE_SCORE_RANGES
if (tmp && (((myscores[i] > 0) &&
((myscores[i] - tmp) < range_lo[i])) ||
((myscores[i] < 0) &&
((myscores[i] - tmp) > range_hi[i]))))
tmp = 0;
#endif
if (tmp) {
myscores[i] -= tmp;
changed = 1;
}
}
}
if (changed) {
if (weight_balance > 0)
adapt_ny++;
else
adapt_yn++;
return 1;
} else
return 0;
}
}
#endif
/*
* This mutation function tosses a weighted coin for each allele.
* If the allele is to be mutated, then the way it's mutated is to regress it
* toward the mean of the population for that allele, then add a little
* gaussian noise.
*
* [To the _mean_? Weird... - Allen]
*
* Aug 21 2002 jm: we now use ranges and allow PGA to take care of it, if
* USE_SCORE_RANGES is defined.
*
* Modified for variable mutations - 9/26/02 - Allen
*
*/
#if defined(USE_VARIABLE_MUTATIONS) || (! defined(USE_SCORE_RANGES))
int myMutation(PGAContext *ctx, int p, int pop, double mr) {
int count=0;
int i;
# ifdef USE_VARIABLE_MUTATIONS
double *myscores;
double old_evaluation,new_evaluation,min_score,max_score;
myscores = PGAGetIndividual(ctx, p, pop)->chrom;
if (PGAGetEvaluationUpToDateFlag(ctx, p, pop))
old_evaluation = PGAGetEvaluation(ctx, p, pop);
else {
old_evaluation = evaluate(ctx, p, pop);
PGASetEvaluation(ctx, p, pop, old_evaluation);
PGASetEvaluationUpToDateFlag(ctx, p, pop, PGA_TRUE);
}
for (i=0; i<num_mutable; i++) {
tmp_scores[i][0] = 0;
if (PGARandomFlip(ctx, mr)) {
#ifdef USE_SCORE_RANGES
min_score = range_lo[i];
max_score = range_hi[i];
#else
min_score = SCORE_CAP;
max_score = NEG_SCORE_CAP;
#endif
if (myscores[i] > max_score)
myscores[i] = max_score;
else if (myscores[i] < min_score)
myscores[i] = min_score;
tmp_scores[i][1] = (max_score - min_score)/4;
myscores[i+num_scores] *=
pow(2,(PGARandomGaussian(ctx,0,mutation_noise*2)));
if (myscores[i+num_scores] < min_mutation_noise)
myscores[i+num_scores] = min_mutation_noise;
else if (myscores[i+num_scores] > tmp_scores[i][1])
myscores[i+num_scores] = tmp_scores[i][1];
while (! tmp_scores[i][0]) {
tmp_scores[i][0] = PGARandomGaussian(ctx,0,
myscores[i+num_scores]);
#ifdef USE_SCORE_RANGES
if (((double)(myscores[i] + tmp_scores[i][0]) >= max_score) ||
((double)(myscores[i] + tmp_scores[i][0]) <= min_score)) {
if (myscores[i+num_scores] > mutation_noise) {
myscores[i+num_scores] =
(myscores[i+num_scores] + mutation_noise)/2;
tmp_scores[i][0] = 0;
} else if ((double)(myscores[i] + tmp_scores[i][0]) >= max_score) {
tmp_scores[i][0] = max_score - myscores[i] - (double)0.001;
break;
} else {
tmp_scores[i][0] = min_score - myscores[i] + (double)0.001;
break;
}
}
#endif
}
myscores[i] += tmp_scores[i][0];
count++;
}
}
if (count > 0) {
var_mutated++;
new_evaluation = evaluate(ctx, p, pop);
if (new_evaluation > old_evaluation) {
/* Did previous try go too far away? */
if (iters_same_passed) { /* in 2nd phase */
count = 0;
for (i=0; i<num_mutable; i++) {
if (tmp_scores[i][0]) {
if (myscores[i+num_scores] > mutation_noise) {
tmp_scores[i][1] = PGARandomGaussian(ctx,0,mutation_noise);
count++;
} else
tmp_scores[i][1] =
PGARandomGaussian(ctx,0,myscores[i+num_scores]);
tmp_scores[i][1] = copysign(tmp_scores[i][1],tmp_scores[i][0]);
#ifdef USE_SCORE_RANGES
if ((double)(myscores[i] + tmp_scores[i][1]
- tmp_scores[i][0]) >= range_hi[i]) {
tmp_scores[i][1] = range_hi[i] - myscores[i] +
tmp_scores[i][0] - (double)0.001;
} else if ((double)(myscores[i] + tmp_scores[i][1]
- tmp_scores[i][0]) <= range_lo[i]) {
tmp_scores[i][1] = range_lo[i] - myscores[i] +
tmp_scores[i][0] + (double)0.001;
}
#endif
myscores[i] += tmp_scores[i][1] - tmp_scores[i][0];
}
}
if (count > 0) {
num_mutated++;
new_evaluation = evaluate(ctx, p, pop);
if (PGAGetNoDuplicatesFlag(ctx) == PGA_FALSE) {
/* Hack to avoid redoing evaluation without need - Allen */
count = 0;
PGASetEvaluation(ctx, p, pop, new_evaluation);
PGASetEvaluationUpToDateFlag(ctx, p, pop, PGA_TRUE);
}
if (new_evaluation <= old_evaluation) {
/* Previous try went too far away */
if (mr < base_mutation_rate)
num_better_same++;
for (i=0; i<num_mutable; i++) {
if (tmp_scores[i][0] &&
(myscores[i+num_scores] > mutation_noise)
&& (fabs(tmp_scores[i][1]) < fabs(tmp_scores[i][0])))
myscores[i+num_scores] =
(myscores[i+num_scores] + mutation_noise)/2;
}
} else {
#ifdef LAMARCK
if (mr < base_mutation_rate) {
count = adapt(ctx,p,pop,1,1,0);
if (count) {
count = adapt(ctx,p,pop,0,2,1);
if (count)
new_evaluation = PGAGetEvaluation(ctx, p, pop);
else
new_evaluation = evaluate(ctx, p, pop);
if (new_evaluation > old_evaluation)
num_worse++;
else
num_better_same++; /* only had to adapt once */
if (PGAGetNoDuplicatesFlag(ctx) == PGA_FALSE) {
/* Hack to avoid redoing evaluation without need - Allen */
count = 0;
PGASetEvaluation(ctx, p, pop, new_evaluation);
PGASetEvaluationUpToDateFlag(ctx, p, pop, PGA_TRUE);
}
} else
num_worse++;
} else
#endif
num_worse++;
}
} else { /* didn't decrease mutation SD */
#ifdef LAMARCK
if (mr < base_mutation_rate) {
count = adapt(ctx,p,pop,0,1,0);
new_evaluation = evaluate(ctx, p, pop);
}
#endif
if (new_evaluation > old_evaluation) {
#ifdef LAMARCK
if ((mr < base_mutation_rate) && count) {
count = adapt(ctx,p,pop,1,2,1);
if (count) {
new_evaluation = PGAGetEvaluation(ctx, p, pop);
if (new_evaluation > old_evaluation)
num_worse++;
else
num_better_same++;
} else
num_worse++;
} else
#endif
num_worse++;
} else
num_better_same++;
if (PGAGetNoDuplicatesFlag(ctx) == PGA_FALSE) {
/* Hack to avoid redoing evaluation without need - Allen */
count = 0;
PGASetEvaluation(ctx, p, pop, new_evaluation);
PGASetEvaluationUpToDateFlag(ctx, p, pop, PGA_TRUE);
}
}
if ((! count) &&
(PGAGetNoDuplicatesFlag(ctx) == PGA_TRUE))
count++;
} else
num_worse++;
} else {
if (PGAGetNoDuplicatesFlag(ctx) == PGA_FALSE) {
/* Hack to avoid redoing evaluation without need - Allen */
count = 0;
PGASetEvaluation(ctx, p, pop, new_evaluation);
PGASetEvaluationUpToDateFlag(ctx, p, pop, PGA_TRUE);
}
num_better_same++;
}
}
#ifdef LAMARCK
else if (mr < base_mutation_rate) {
count = adapt(ctx,p,pop,1,2,0);
if (! count)
num_better_same++; /* adapt not working, use mutation */
}
#endif
# else /* USE_VARIABLE_MUTATIONS */
int j;
for (i=0; i<num_mutable; i++)
{
if(PGARandomFlip(ctx, mr))
{
double gene_sum=0.0;
/* Find the mean */
for(j=0; j<pop_size; j++) {
if(p!=j)
gene_sum += PGAGetRealAllele(ctx, j, pop, i);
}
gene_sum /= (double)(pop_size-1);
/* Regress towards it... */
gene_sum = (1.0-regression_coefficient)*gene_sum+regression_coefficient*PGAGetRealAllele(ctx, p, pop, i);
/* Set this gene in this allele to be the average, plus some gaussian noise */
if(gene_sum > SCORE_CAP)
gene_sum = SCORE_CAP;
else if(gene_sum < NEG_SCORE_CAP)
gene_sum = NEG_SCORE_CAP;
PGASetRealAllele(ctx, p, pop, i,
PGARandomGaussian(ctx, gene_sum, mutation_noise));
count++;
}
}
# endif /* !USE_VARIABLE_MUTATIONS */
return count;
}
#endif /* USE_VARIABLE_MUTATIONS || !USE_SCORE_RANGES */
void dump(FILE *fp)
{
fprintf (fp,"\n# SUMMARY for threshold %3.1f:\n", threshold);
fprintf (fp,
"# Correctly non-spam: %6d %4.3f%% (%4.3f%% of non-spam corpus)\n",
ga_nn,
(ga_nn / (float) num_tests) * 100.0,
(ga_nn / (float) num_ham) * 100.0);
fprintf (fp,
"# Correctly spam: %6d %4.3f%% (%4.3f%% of spam corpus)\n",
ga_yy,
(ga_yy / (float) num_tests) * 100.0,
(ga_yy / (float) num_spam) * 100.0);
fprintf (fp,
"# False positives: %6d %4.3f%% (%4.3f%% of nonspam, %6.0f weighted)\n",
ga_ny,
(ga_ny / (float) num_tests) * 100.0,
(ga_ny / (float) num_ham) * 100.0,
nyscore*nybias);
fprintf (fp,
"# False negatives: %6d %4.3f%% (%4.3f%% of spam, %6.0f weighted)\n",
ga_yn,
(ga_yn / (float) num_tests) * 100.0,
(ga_yn / (float) num_spam) * 100.0,
ynscore);
fprintf (fp,"# Average score for spam: %3.1f nonspam: %3.1f\n",(ynscore+yyscore)/((double)(ga_yn+ga_yy)),(nyscore+nnscore)/((double)(ga_nn+ga_ny)));
fprintf (fp,"# Average for false-pos: %3.1f false-neg: %3.1f\n",(nyscore/(double)ga_ny),(ynscore/(double)ga_yn));
fprintf (fp,"# TOTAL: %6d %3.2f%%\n\n", num_tests, 100.0);
}
/*****************************************************************************
* WriteString sends a visual representation of the chromosome out to fp *
*****************************************************************************/
void WriteString(PGAContext *ctx, FILE *fp, int p, int pop)
{
int i;
#ifdef USE_MPI
int rank;
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
if(0 == rank)
{
#endif
evaluate(ctx,p,pop);
dump(fp);
for(i=0; i<num_scores; i++)
{
fprintf(fp,"score %-30s %2.3f\n",
score_names[i],PGAGetRealAllele(ctx, p, pop, i));
}
fprintf ( fp,"\n" );
#ifdef USE_MPI
}
#endif
}
#ifdef USE_VARIABLE_MUTATIONS
double last_best = 0;
#endif
void showSummary(PGAContext *ctx)
{
#ifdef USE_MPI
int rank;
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
if(0 == rank)
{
#endif
if(0 == PGAGetGAIterValue(ctx) % save_every_n_generations)
{
int genome = PGAGetBestIndex(ctx,PGA_OLDPOP);
FILE *scores_file = NULL;
(void)evaluate(ctx, genome, PGA_OLDPOP);
PGAGetEvaluation(ctx, genome, PGA_OLDPOP);
scores_file = fopen("garescorer.scores","w");
WriteString(ctx, scores_file, genome, PGA_OLDPOP);
fclose(scores_file);
#ifdef USE_VARIABLE_MUTATIONS
if (! justCount) {
printf("\nPop size, replacement: %d %d\n",
pop_size, replace_num);
/*
printf("\nMutations (rate, good, bad, var, num): %3.7f %d %d %d %d\n",
mutation_rate, num_better_same, num_worse, var_mutated,
num_mutated);
*/
var_mutated = 0;
num_mutated = 0;
if (! iters_same_passed) {
if (! last_best)
last_best = ctx->rep.Best;
else if ((last_best*0.999) < ctx->rep.Best) /* too slow! */
iters_same_passed = 1;
else
last_best = ctx->rep.Best;
}
#ifdef LAMARCK
printf("\n");
printf("Adapt (t, fneg, fneg_add, fpos, fpos_add): %d %d %d %d %d\n",
adapt_times,adapt_yn,adapt_fn_add,adapt_ny,adapt_fp_add);
printf("Adapt (over, cross, repeat): %d %d %d\n",
adapt_overshot,adapt_crossover,adapt_repeat);
adapt_times = adapt_overshot = adapt_crossover = adapt_repeat =
adapt_yn = adapt_ny = adapt_fn_add = adapt_fp_add = 0;
#endif
}
#endif
{ struct timeval t1;
if (gettimeofday(&t1, (struct timezone *)NULL) == 0) {
double dt = (t1.tv_sec + t1.tv_usec * 1.0e-6) -
(t0.tv_sec + t0.tv_usec * 1.0e-6);
int iter = PGAGetGAIterValue(ctx);
if (dt < 1e-6) dt = 1e-6;
printf("Performance: %.3f iterations/s, iteration no. %d\n",
(iter-t0_iter)/dt, iter);
t0.tv_sec = t1.tv_sec; t0.tv_usec = t0.tv_usec; t0_iter = iter;
}
}
dump(stdout);
}
else if(0 == PGAGetGAIterValue(ctx) % 5)
{
printf("%d",(PGAGetGAIterValue(ctx)/5)%10);
}
#ifdef USE_VARIABLE_MUTATIONS
if (! justCount) {
if ((num_better_same*4) >= num_worse)
mutation_rate /= mutation_rate_modifier;
else if ((num_better_same*4) < num_worse) {
if ((mutation_rate > base_mutation_rate) || iters_same_passed)
mutation_rate *= mutation_rate_modifier;
else if (ctx->ga.ItersOfSame >= (no_change_val/2)) {
iters_same_passed = 1;
mutation_rate *= mutation_rate_modifier;
printf("\nMutation rate %3.7f (ItersOfSame %d)\n",
mutation_rate,ctx->ga.ItersOfSame);
} else
return;
}
if (mutation_rate > mutation_rate_modifier) {
mutation_rate = mutation_rate_modifier;
printf("\nMutation rate max: %3.7f\n",mutation_rate);
} else if (mutation_rate < 0.05/sqrt(num_mutable)) {
mutation_rate = 0.05/sqrt(num_mutable);
printf("\nMutation rate min: %3.7f\n",mutation_rate);
}
PGASetMutationProb(ctx, mutation_rate);
num_better_same = 0;
num_worse = 0;
}
#endif
#ifdef USE_MPI
}
#endif
}
#ifdef USE_VARIABLE_MUTATIONS
/*****************************************************************************
* CreateString allocates and initializes a chromosome. If InitFlag is *
* set to true, then it will initialize the chromosome using the best known *
* values; otherwise, it sets each double to 0.0 and each int to 0. *
*****************************************************************************/
void CreateString(PGAContext *ctx, int p, int pop, int InitFlag) {
int i;
double *myscore;
PGAIndividual *new;
new = PGAGetIndividual(ctx, p, pop);
if (!(new->chrom = malloc(sizeof(double)*num_scores*2))) {
fprintf(stderr, "No room for new->chrom");
exit(1);
}
myscore = new->chrom;
if (InitFlag) {
for(i=0; i<num_scores; i++)
myscore[i] = bestscores[i];
for(i=num_scores; i<num_scores*2; i++)
myscore[i] = mutation_noise;
} else {
for(i=0; i<num_scores*2; i++)
myscore[i] = 0.0;
}
}
/*****************************************************************************
* Crossover implements uniform crossover on the chromosome. *
*****************************************************************************/
void Crossover(PGAContext *ctx, int p1, int p2, int pop1, int t1, int t2,
int pop2) {
int i;
double *parent1, *parent2, *child1, *child2;
double pu;
#ifdef LAMARCK
double parent1_eval, parent2_eval, child1_eval, child2_eval;
#endif
parent1 = PGAGetIndividual(ctx, p1, pop1)->chrom;
parent2 = PGAGetIndividual(ctx, p2, pop1)->chrom;
child1 = PGAGetIndividual(ctx, t1, pop2)->chrom;
child2 = PGAGetIndividual(ctx, t2, pop2)->chrom;
pu = PGAGetUniformCrossoverProb(ctx);
for (i = 0; i < num_mutable; i++) {
if (PGARandomFlip(ctx, pu)) {
child1[i] = parent2[i];
child2[i] = parent1[i];
if (num_mutated > 0) {
if (fabs(parent1[i+num_scores] - mutation_noise) >
fabs(parent1[i+num_scores] - parent2[i+num_scores]))
child2[i+num_scores] =
(parent1[i+num_scores] + parent2[i+num_scores])/2;
else
child2[i+num_scores] =
(parent1[i+num_scores] + mutation_noise)/2;
if (fabs(parent2[i+num_scores] - mutation_noise) >
fabs(parent2[i+num_scores] - parent1[i+num_scores]))
child1[i+num_scores] =
(parent2[i+num_scores] + parent1[i+num_scores])/2;
else
child1[i+num_scores] =
(parent2[i+num_scores] + mutation_noise)/2;
} else {
/* Doing intermediate recombination due to usage
* of exponential multiplication in mutation - Allen */
child1[i+num_scores] = child2[i+num_scores] =
(parent1[i+num_scores] + parent2[i+num_scores])/2;
}
} else {
child1[i] = parent1[i];
child2[i] = parent2[i];
if (pu < 0.5) { /* more grouped */
child1[i+num_scores] = parent1[i+num_scores];
child2[i+num_scores] = parent2[i+num_scores];
} else {
if (num_mutated > 0) {
if (fabs(parent1[i+num_scores] - mutation_noise) >
fabs(parent1[i+num_scores] - parent2[i+num_scores]))
child1[i+num_scores] =
(parent1[i+num_scores] + parent2[i+num_scores])/2;
else
child1[i+num_scores] =
(parent1[i+num_scores] + mutation_noise)/2;
if (fabs(parent2[i+num_scores] - mutation_noise) >
fabs(parent2[i+num_scores] - parent1[i+num_scores]))
child2[i+num_scores] =
(parent2[i+num_scores] + parent1[i+num_scores])/2;
else
child2[i+num_scores] =
(parent2[i+num_scores] + mutation_noise)/2;
} else {
/* Doing intermediate recombination due to usage
* of exponential multiplication in mutation - Allen */
child1[i+num_scores] = child2[i+num_scores] =
(parent1[i+num_scores] + parent2[i+num_scores])/2;
}
}
}
}
for (i = num_mutable; i < num_scores; i++) {
child1[i] = parent1[i];
child2[i] = parent2[i];
child1[i+num_scores] = parent1[i+num_scores];
child2[i+num_scores] = parent2[i+num_scores];
}
#ifdef LAMARCK
if ((PGAGetMutationAndCrossoverFlag(ctx) == PGA_FALSE) &&
(mutation_rate < base_mutation_rate) &&
(PGAGetEvaluationUpToDateFlag(ctx, p1, pop1) == PGA_TRUE) &&
(PGAGetEvaluationUpToDateFlag(ctx, p2, pop1) == PGA_TRUE)) {
parent1_eval = PGAGetEvaluation(ctx, p1, pop1);
parent2_eval = PGAGetEvaluation(ctx, p2, pop1);
if (PGARandomFlip(ctx, (double)0.5)) {
child1_eval = evaluate(ctx, t1, pop2);
if ((child1_eval > parent1_eval) &&
(child1_eval > parent2_eval)) {
/* Urk! */
if (PGARandomFlip(ctx, (double)(mutation_rate/base_mutation_rate)))
adapt_crossover += adapt(ctx, t1, pop2, 1, 2, 0);
else { /* low mr */
if (adapt(ctx, t1, pop2, 1, 1, 0))
adapt_crossover += adapt(ctx, t1, pop2, 0, 2, 1) + 1;
adapt_crossover += adapt(ctx, t2, pop2, 0, 2, 0);
}
} else {
PGASetEvaluation(ctx, t1, pop2, child1_eval);
PGASetEvaluationUpToDateFlag(ctx, t1, pop2, PGA_TRUE);
}
} else {
child2_eval = evaluate(ctx, t2, pop2);
if ((child2_eval > parent1_eval) &&
(child2_eval > parent2_eval)) {
/* Urk! */
if (PGARandomFlip(ctx, (double)(mutation_rate/base_mutation_rate)))
adapt_crossover += adapt(ctx, t2, pop2, 1, 2, 0);
else { /* low mr */
if (adapt(ctx, t2, pop2, 1, 1, 0))
adapt_crossover += adapt(ctx, t2, pop2, 0, 2, 1) + 1;
adapt_crossover += adapt(ctx, t1, pop2, 0, 2, 0);
}
} else {
PGASetEvaluation(ctx, t2, pop2, child2_eval);
PGASetEvaluationUpToDateFlag(ctx, t2, pop2, PGA_TRUE);
}
}
}
#endif
}
/*****************************************************************************
* CopyString makes a copy of the chromosome at (p1, pop1) and puts it at *
* (p2, pop2). *
*****************************************************************************/
void CopyString(PGAContext *ctx, int p1, int pop1, int p2, int pop2) {
void *d, *s;
s = PGAGetIndividual(ctx, p1, pop1)->chrom;
d = PGAGetIndividual(ctx, p2, pop2)->chrom;
memcpy(d, s, sizeof(double)*num_scores*2);
}
/*****************************************************************************
* DuplicateString compares two chromosomes and returns 1 if they are the *
* same and 0 if they are different. *
*****************************************************************************/
int DuplicateString(PGAContext *ctx, int p1, int pop1, int p2, int pop2) {
void *a, *b;
a = PGAGetIndividual(ctx, p1, pop1)->chrom;
b = PGAGetIndividual(ctx, p2, pop2)->chrom;
return (!memcmp(a, b, sizeof(double)*num_scores*2));
}
/*****************************************************************************
* BuildDatattype builds an MPI datatype for sending strings to other *
* processors. Consult your favorite MPI manual for more information. *
*****************************************************************************/
MPI_Datatype BuildDT(PGAContext *ctx, int p, int pop) {
MPI_Datatype DT_PGAIndividual;
#ifdef USE_MPI
int counts[3];
MPI_Aint displs[3];
MPI_Datatype types[3];
PGAIndividual *P;
P = PGAGetIndividual(ctx, p, pop);
/* Build the MPI datatype. Every user defined function needs these.
* The first two calls are stuff that is internal to PGAPack, but
* the user still must include it. See pgapack.h for details one the
* fields (under PGAIndividual)
*/
MPI_Address(&P->evalfunc, &displs[0]);
counts[0] = 2;
types[0] = MPI_DOUBLE;
/* Next, we have an integer, evaluptodate. */
MPI_Address(&P->evaluptodate, &displs[1]);
counts[1] = 1;
types[1] = MPI_INT;
/* Finally, we have the actual user-defined string. */
MPI_Address(P->chrom, &displs[2]);
counts[2] = num_scores*2;
types[2] = MPI_DOUBLE;
MPI_Type_struct(3, counts, displs, types, &DT_PGAIndividual);
#endif /* defined(USE_MPI) */
MPI_Type_commit(&DT_PGAIndividual);
return(DT_PGAIndividual);
}
#endif /* defined(USE_VARIABLE_MUTATIONS) */