/****************************************************************
* Licensed to the Apache Software Foundation (ASF) under one *
* or more contributor license agreements. See the NOTICE file *
* distributed with this work for additional information *
* regarding copyright ownership. The ASF licenses this file *
* to you under the Apache License, Version 2.0 (the *
* "License"); you may not use this file except in compliance *
* with the License. You may obtain a copy of the License at *
* *
* http://www.apache.org/licenses/LICENSE-2.0 *
* *
* Unless required by applicable law or agreed to in writing, *
* software distributed under the License is distributed on an *
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY *
* KIND, either express or implied. See the License for the *
* specific language governing permissions and limitations *
* under the License. *
****************************************************************/
package org.apache.james.ai.classic;
import java.io.IOException;
import java.io.Reader;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.SortedSet;
import java.util.TreeSet;
/**
* <p>
 * Determines the probability that a text contains spam.
* </p>
*
* <p>
 * Based upon Paul Graham's <a href="http://www.paulgraham.com/spam.html">A Plan
 * for Spam</a>, extended with the ideas from Paul Graham's <a
 * href="http://paulgraham.com/better.html">Better Bayesian Filtering</a>.
* </p>
*
* <p>
* Sample method usage:
* </p>
*
* <p>
* Use: void addHam(Reader) and void addSpam(Reader)
*
 * methods to build up the Maps of ham and spam tokens/occurrences. Both addHam
 * and addSpam assume they are reading one message at a time; if you feed more
 * than one message per call, be sure to adjust the appropriate message counter:
 * hamMessageCount or spamMessageCount.
*
* Then...
* </p>
*
* <p>
* Use: void buildCorpus()
*
* to build the final token/probabilities Map.
*
 * Use your own methods for persistent storage of the individual ham/spam
 * token counts and message counts, and/or of the final corpus.
*
* Then you can...
* </p>
*
* <p>
* Use: double computeSpamProbability(Reader)
*
 * to determine the probability that a particular text contains spam. A returned
 * value of 0.9 or above indicates that the text is spam.
* </p>
*
* <p>
* If you use persistent storage, use: void setCorpus(Map)
*
* before calling computeSpamProbability.
* </p>
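 *
 * <p>
 * A minimal end-to-end usage sketch (the message texts are illustrative):
 * </p>
 *
 * <pre>{@code
 * BayesianAnalyzer analyzer = new BayesianAnalyzer();
 * // Train with one message per call.
 * analyzer.addHam(new java.io.StringReader("Meeting agenda attached for Friday"));
 * analyzer.addSpam(new java.io.StringReader("Cheap meds!!! Click here to buy now"));
 * analyzer.buildCorpus();
 * // Classify new text; a result of 0.9 or above is treated as spam.
 * double probability = analyzer.computeSpamProbability(
 *         new java.io.StringReader("Buy cheap meds now"));
 * }</pre>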
*
* @since 2.3.0
*/
public class BayesianAnalyzer {
/**
* Number of "interesting" tokens to use to compute overall spamminess
* probability.
*/
private static final int MAX_INTERESTING_TOKENS = 15;
/**
* Minimum probability distance from 0.5 to consider a token "interesting"
* to use to compute overall spamminess probability.
*/
private static final double INTERESTINGNESS_THRESHOLD = 0.46;
/**
* Default token probability to use when a token has not been encountered
* before.
*/
static final double DEFAULT_TOKEN_PROBABILITY = 0.4;
/** Map of ham tokens and their occurrences. */
private Map<String, Integer> hamTokenCounts = new HashMap<>();
/** Map of spam tokens and their occurrences. */
private Map<String, Integer> spamTokenCounts = new HashMap<>();
/** Number of ham messages analyzed. */
private int hamMessageCount = 0;
/** Number of spam messages analyzed. */
private int spamMessageCount = 0;
/** Final token/probability corpus. */
private Map<String, Double> corpus = new HashMap<>();
/**
* Basic class constructor.
*/
public BayesianAnalyzer() {
}
/**
* Public setter for the hamTokenCounts Map.
*
* @param hamTokenCounts
* The new ham Token counts Map.
*/
public void setHamTokenCounts(Map<String, Integer> hamTokenCounts) {
this.hamTokenCounts = hamTokenCounts;
}
/**
* Public getter for the hamTokenCounts Map.
*/
public Map<String, Integer> getHamTokenCounts() {
return this.hamTokenCounts;
}
/**
* Public setter for the spamTokenCounts Map.
*
* @param spamTokenCounts
* The new spam Token counts Map.
*/
public void setSpamTokenCounts(Map<String, Integer> spamTokenCounts) {
this.spamTokenCounts = spamTokenCounts;
}
/**
* Public getter for the spamTokenCounts Map.
*/
public Map<String, Integer> getSpamTokenCounts() {
return this.spamTokenCounts;
}
/**
* Public setter for spamMessageCount.
*
* @param spamMessageCount
* The new spam message count.
*/
public void setSpamMessageCount(int spamMessageCount) {
this.spamMessageCount = spamMessageCount;
}
/**
* Public getter for spamMessageCount.
*/
public int getSpamMessageCount() {
return this.spamMessageCount;
}
/**
* Public setter for hamMessageCount.
*
* @param hamMessageCount
* The new ham message count.
*/
public void setHamMessageCount(int hamMessageCount) {
this.hamMessageCount = hamMessageCount;
}
/**
* Public getter for hamMessageCount.
*/
public int getHamMessageCount() {
return this.hamMessageCount;
}
/**
* Clears all analysis repositories and counters.
*/
public void clear() {
corpus.clear();
tokenCountsClear();
hamMessageCount = 0;
spamMessageCount = 0;
}
/**
* Clears token counters.
*/
public void tokenCountsClear() {
hamTokenCounts.clear();
spamTokenCounts.clear();
}
/**
* Public setter for corpus.
*
* @param corpus
* The new corpus.
*/
public void setCorpus(Map<String, Double> corpus) {
this.corpus = corpus;
}
/**
* Public getter for corpus.
*/
public Map<String, Double> getCorpus() {
return this.corpus;
}
/**
     * Builds the corpus from the existing ham and spam counts.
*/
public void buildCorpus() {
// Combine the known ham & spam tokens.
Set<String> set = new HashSet<>(hamTokenCounts.size() + spamTokenCounts.size());
set.addAll(hamTokenCounts.keySet());
set.addAll(spamTokenCounts.keySet());
Map<String, Double> tempCorpus = new HashMap<>(set.size());
// Iterate through all the tokens and compute their new
// individual probabilities.
for (String token : set) {
tempCorpus.put(token, computeProbability(token));
}
setCorpus(tempCorpus);
}
/**
* Adds a message to the ham list.
*
* @param stream
* A reader stream on the ham message to analyze
* @throws IOException
* If any error occurs
*/
    public void addHam(Reader stream) throws IOException {
addTokenOccurrences(stream, hamTokenCounts);
hamMessageCount++;
}
/**
* Adds a message to the spam list.
*
* @param stream
* A reader stream on the spam message to analyze
* @throws IOException
* If any error occurs
*/
    public void addSpam(Reader stream) throws IOException {
addTokenOccurrences(stream, spamTokenCounts);
spamMessageCount++;
}
/**
* Computes the probability that the stream contains SPAM.
*
* @param stream
* The text to be analyzed for Spamminess.
* @return A 0.0 - 1.0 probability
* @throws IOException
* If any error occurs
*/
    public double computeSpamProbability(Reader stream) throws IOException {
// Build a set of the tokens in the Stream.
Set<String> tokens = parse(stream);
        // Get the corpus to use in this run;
        // a new corpus may be built concurrently in the meantime.
Map<String, Double> workCorpus = getCorpus();
// Assign their probabilities from the Corpus (using an additional
// calculation to determine spamminess).
SortedSet<TokenProbabilityStrength> tokenProbabilityStrengths = getTokenProbabilityStrengths(tokens, workCorpus);
// Compute and return the overall probability that the
// stream is SPAM.
return computeOverallProbability(tokenProbabilityStrengths, workCorpus);
}
/**
* Parses a stream into tokens, and updates the target Map with the
* token/counts.
*
     * @param stream
     *            the reader to parse into tokens
     * @param target
     *            the Map to update with the token counts
*/
    private void addTokenOccurrences(Reader stream, Map<String, Integer> target) throws IOException {
new TokenCounter(target).count(stream);
}
/**
* Parses a stream into tokens, and returns a Set of the unique tokens
* encountered.
*
     * @param stream
     *            the reader to parse into tokens
     * @return the Set of unique tokens encountered
*/
    private Set<String> parse(Reader stream) throws IOException {
Set<String> tokens = new HashSet<>();
new TokenCollector(tokens).collect(stream);
// Return the unique set of tokens encountered.
return tokens;
}
/**
* Compute the probability that "token" is SPAM.
*
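     * <p>
     * Implements Graham's per-token estimate: ham occurrences are doubled to
     * bias against false positives, each count is normalized by the
     * corresponding message count, and the resulting
     * spamFreq / (hamFreq + spamFreq) is clamped to [0.01, 0.99], with the
     * bound relaxed toward 0 or 1 when the token was seen only in ham or
     * only in spam. Tokens seen too few times fall back to the default
     * probability.
     * </p>
     *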
     * @param token
     *            the token to evaluate
* @return The probability that the token occurs within spam.
*/
private double computeProbability(String token) {
double hamFactor = 0;
double spamFactor = 0;
boolean foundInHam = false;
boolean foundInSpam = false;
double minThreshold = 0.01;
double maxThreshold = 0.99;
if (hamTokenCounts.containsKey(token)) {
foundInHam = true;
}
if (spamTokenCounts.containsKey(token)) {
foundInSpam = true;
}
if (foundInHam) {
hamFactor = 2 * hamTokenCounts.get(token).doubleValue();
if (!foundInSpam) {
minThreshold = (hamFactor > 20) ? 0.0001 : 0.0002;
}
}
if (foundInSpam) {
spamFactor = spamTokenCounts.get(token).doubleValue();
if (!foundInHam) {
maxThreshold = (spamFactor > 10) ? 0.9999 : 0.9998;
}
}
if ((hamFactor + spamFactor) < 5) {
            // This token hasn't been seen often enough; fall back to the default.
            return DEFAULT_TOKEN_PROBABILITY;
}
double spamFreq = Math.min(1.0, spamFactor / spamMessageCount);
double hamFreq = Math.min(1.0, hamFactor / hamMessageCount);
return Math.max(minThreshold, Math.min(maxThreshold, (spamFreq / (hamFreq + spamFreq))));
}
/**
* Returns a SortedSet of TokenProbabilityStrength built from the Corpus and
* the tokens passed in the "tokens" Set. The ordering is from the highest
* strength to the lowest strength.
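     * <p>
     * Tokens absent from the corpus start at DEFAULT_TOKEN_PROBABILITY; if a
     * degenerated form of the token (see buildDegenerated) has a stronger
     * probability in the corpus, that probability is adopted and cached in
     * the corpus under the original token.
     * </p>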
*
     * @param tokens
     *            the tokens to look up
     * @param workCorpus
     *            the corpus holding the token probabilities
* @return SortedSet of TokenProbabilityStrength objects.
*/
private SortedSet<TokenProbabilityStrength> getTokenProbabilityStrengths(Set<String> tokens, Map<String, Double> workCorpus) {
// Convert to a SortedSet of token probability strengths.
SortedSet<TokenProbabilityStrength> tokenProbabilityStrengths = new TreeSet<>();
for (String token : tokens) {
TokenProbabilityStrength tps = new TokenProbabilityStrength();
tps.token = token;
if (workCorpus.containsKey(tps.token)) {
tps.strength = Math.abs(0.5 - workCorpus.get(tps.token));
} else {
                // This token has never been seen before; start it at
                // the default probability.
Double corpusProbability = DEFAULT_TOKEN_PROBABILITY;
tps.strength = Math.abs(0.5 - DEFAULT_TOKEN_PROBABILITY);
                boolean isTokenDegeneratedFound = false;
                // Try the "degenerated" (simplified) forms of the token
                // and keep the strongest probability found among them.
                for (String tokenDegenerated : buildDegenerated(tps.token)) {
                    Double probabilityTemp = workCorpus.get(tokenDegenerated);
                    if (probabilityTemp != null) {
                        double strengthDegenerated = Math.abs(0.5 - probabilityTemp);
                        if (strengthDegenerated > tps.strength) {
                            isTokenDegeneratedFound = true;
                            tps.strength = strengthDegenerated;
                            corpusProbability = probabilityTemp;
                        }
                    }
                }
// to reduce memory usage, put in the corpus only if the
// probability is different from (stronger than) the default
if (isTokenDegeneratedFound) {
synchronized (workCorpus) {
workCorpus.put(tps.token, corpusProbability);
}
}
}
tokenProbabilityStrengths.add(tps);
}
return tokenProbabilityStrengths;
}
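    /**
     * Builds the "degenerated" forms of a token, following Graham's
     * "Better Bayesian Filtering": lower-cased variants, a variant keeping
     * only the leading capital, and variants with trailing '!' characters
     * stripped, each produced both with and without the header prefix (the
     * part of the token before a ':').
     *
     * @param fullToken
     *            the token to degenerate
     * @return the degenerated forms of the token
     */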
private Collection<String> buildDegenerated(String fullToken) {
ArrayList<String> tokens = new ArrayList<>();
String header;
String token;
String tokenLower;
// look for a header string termination
int headerEnd = fullToken.indexOf(':');
if (headerEnd >= 0) {
header = fullToken.substring(0, headerEnd);
token = fullToken.substring(headerEnd);
} else {
header = "";
token = fullToken;
}
// prepare a version of the token containing all lower case (for
// performance reasons)
tokenLower = token.toLowerCase(Locale.US);
int end = token.length();
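        // Repeatedly emit the case-degenerated variants, then strip one
        // trailing '!' and try again until no trailing '!' remains.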
do {
if (!token.substring(0, end).equals(tokenLower.substring(0, end))) {
tokens.add(header + tokenLower.substring(0, end));
if (header.length() > 0) {
tokens.add(tokenLower.substring(0, end));
}
}
if (end > 1 && token.charAt(0) >= 'A' && token.charAt(0) <= 'Z') {
tokens.add(header + token.charAt(0) + tokenLower.substring(1, end));
if (header.length() > 0) {
tokens.add(token.charAt(0) + tokenLower.substring(1, end));
}
}
if (token.charAt(end - 1) != '!') {
break;
}
end--;
tokens.add(header + token.substring(0, end));
if (header.length() > 0) {
tokens.add(token.substring(0, end));
}
} while (end > 0);
return tokens;
}
/**
* Compute the spamminess probability of the interesting tokens in the
* tokenProbabilities SortedSet.
*
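     * <p>
     * Combines the strongest token probabilities with Graham's formula
     * {@code P = (p1*...*pn) / (p1*...*pn + (1-p1)*...*(1-pn))}, using the
     * MAX_INTERESTING_TOKENS strongest tokens plus any further tokens whose
     * strength stays at or above INTERESTINGNESS_THRESHOLD.
     * </p>
     *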
     * @param tokenProbabilityStrengths
     *            the tokens, ordered by decreasing strength
     * @param workCorpus
     *            the corpus holding the token probabilities
* @return Computed spamminess.
*/
private double computeOverallProbability(SortedSet<TokenProbabilityStrength> tokenProbabilityStrengths, Map<String, Double> workCorpus) {
double p = 1.0;
double np = 1.0;
double tempStrength = 0.5;
int count = MAX_INTERESTING_TOKENS;
Iterator<TokenProbabilityStrength> iterator = tokenProbabilityStrengths.iterator();
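        // Consume the strongest tokens first: take up to MAX_INTERESTING_TOKENS,
        // then keep going only while the last strength seen stays at or above
        // INTERESTINGNESS_THRESHOLD.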
while ((iterator.hasNext()) && (count-- > 0 || tempStrength >= INTERESTINGNESS_THRESHOLD)) {
TokenProbabilityStrength tps = iterator.next();
tempStrength = tps.strength;
double theDoubleValue = DEFAULT_TOKEN_PROBABILITY; // initialize it to the default
Double theDoubleObject = workCorpus.get(tps.token);
// if either the original token or a degeneration was found use the
// double value, otherwise use the default
if (theDoubleObject != null) {
theDoubleValue = theDoubleObject;
}
p *= theDoubleValue;
np *= (1.0 - theDoubleValue);
// System.out.println("Token " + tps + ", p=" + theDoubleValue +
// ", overall p=" + p / (p + np));
}
return (p / (p + np));
}
}