// blob: 8fa2050f9692233a8e9217b7ce50591dff00b5ff [file] [log] [blame]
package org.apache.sling.mailarchiveserver.util;
import java.io.PrintStream;
import org.apache.felix.scr.annotations.Component;
import org.apache.felix.scr.annotations.Reference;
import org.apache.sling.api.resource.LoginException;
import org.apache.sling.api.resource.Resource;
import org.apache.sling.api.resource.ResourceResolver;
import org.apache.sling.api.resource.ResourceResolverFactory;
import org.apache.sling.api.resource.ValueMap;
/**
 * Utility component that calculates, for every character position of the
 * message subject, the entropy of the distribution of the letters a-z found
 * at that position across all archived messages.
 *
 * <p>Only the first {@code SAMPLE_LENGTH} characters of a subject are
 * inspected and only characters that lower-case to {@code a}-{@code z} are
 * counted. Entropy is computed with a base-10 logarithm, so the theoretical
 * maximum for a position is {@code -log10(1/26)}.
 *
 * <p>The statistics are kept in instance fields that are reset at the start
 * of every {@link #calculateEntropyAndPrint(PrintStream)} call. No
 * synchronization is performed.
 *
 * @author bogomolo
 */
@Component
public class SubjectLettersEntropy {

    /** Number of leading subject characters that are analysed. */
    private static final int SAMPLE_LENGTH = 300;
    /** Number of distinct letters counted per position (a-z). */
    private static final int ALPHABET_LENGTH = 26;
    /** Fraction of the maximum entropy / message count a position must reach to be reported. */
    private static final double THRESHOLD = 0.9;
    /** Entropy (base 10) of a uniform distribution over the alphabet. */
    private static final double MAX_THEOR_ENTROPY = -Math.log10(1. / ALPHABET_LENGTH);
    /** Root of the archive; messages live at /domain/project/list/t/th/thread/message below it. */
    private static final String ARCHIVE_ROOT = "/content/mailarchiveserver/archive";
    /** Number of intermediate levels (domain ... thread) between the root and the message resources. */
    private static final int MESSAGE_PARENT_DEPTH = 6;

    @Reference
    ResourceResolverFactory resourceResolverFactory;

    /** Resolver used during a single calculation; null outside of a running calculation. */
    private ResourceResolver resolver = null;

    /** First instance created of this class. */
    public static SubjectLettersEntropy instance = null;

    public SubjectLettersEntropy() {
        if (instance == null) {
            instance = this;
        }
    }

    /** count[i][j]: number of subjects whose i-th character is the j-th letter of the alphabet. */
    int[][] count = new int[SAMPLE_LENGTH][ALPHABET_LENGTH];
    /** entropy[i]: entropy of the letter distribution at position i. */
    double[] entropy = new double[SAMPLE_LENGTH];
    /** Number of message resources visited by the last calculation. */
    int messages = 0;

    /**
     * Walks all messages below the archive root, computes the per-position
     * letter entropy of their subjects and prints a tab separated report.
     *
     * <p>A position is listed when its entropy reaches {@code THRESHOLD} of
     * the theoretical maximum, or when at least {@code THRESHOLD} of all
     * messages have a letter at that position.
     *
     * @param out stream that receives the report and any diagnostic message
     */
    public void calculateEntropyAndPrint(PrintStream out) {
        if (resourceResolverFactory == null) {
            // Without the factory nothing can be read; report and stop instead of failing with an NPE.
            out.println("resourceResolverFactory is NULL");
            return;
        }

        // Start from a clean slate so repeated invocations do not accumulate old results.
        resetStatistics();

        try {
            resolver = resourceResolverFactory.getAdministrativeResourceResolver(null);
            Resource main = resolver.getResource(ARCHIVE_ROOT);
            if (main == null) {
                out.println("Archive root not found: " + ARCHIVE_ROOT);
                return;
            }
            iterate(main, MESSAGE_PARENT_DEPTH);
        } catch (LoginException e) {
            e.printStackTrace();
            out.println("Unable to log in to the repository: " + e.getMessage());
            return;
        } finally {
            // Always release the administrative session, also on early return or failure.
            if (resolver != null) {
                resolver.close();
                resolver = null;
            }
        }

        int[] sum = computeEntropy();
        printReport(out, sum);
    }

    /** Clears the counters and results of a previous calculation. */
    private void resetStatistics() {
        count = new int[SAMPLE_LENGTH][ALPHABET_LENGTH];
        entropy = new double[SAMPLE_LENGTH];
        messages = 0;
    }

    /**
     * Fills {@link #entropy} from {@link #count}.
     *
     * @return for every position the total number of letters counted there
     */
    private int[] computeEntropy() {
        int[] sum = new int[SAMPLE_LENGTH];
        for (int i = 0; i < SAMPLE_LENGTH; i++) {
            for (int j = 0; j < ALPHABET_LENGTH; j++) {
                sum[i] += count[i][j];
            }
        }
        for (int i = 0; i < SAMPLE_LENGTH; i++) {
            for (int j = 0; j < ALPHABET_LENGTH; j++) {
                // A positive count implies sum[i] > 0, so the division is safe;
                // zero counts contribute nothing to the entropy.
                if (count[i][j] > 0) {
                    double num = (double) count[i][j] / sum[i];
                    entropy[i] += -num * Math.log10(num);
                }
            }
        }
        return sum;
    }

    /** Prints the table of noteworthy positions followed by the message total. */
    private void printReport(PrintStream out, int[] sum) {
        out.println(String.format("%s\t%s\t%s", "charAt", "entropy", "sum"));
        for (int i = 0; i < sum.length; i++) {
            if (entropy[i] >= MAX_THEOR_ENTROPY * THRESHOLD || sum[i] >= messages * THRESHOLD) {
                out.println(String.format("%d\t%.3f\t%d", i, entropy[i], sum[i]));
            }
        }
        out.println("Messages #: " + messages);
    }

    /**
     * Counts the letters of the {@code Subject} property of one message.
     * The message is counted even when it has no subject.
     */
    private void countLetters(Resource r) {
        messages++;
        ValueMap properties = r.adaptTo(ValueMap.class);
        if (properties == null) {
            return;
        }
        String subj = properties.get("Subject", (String) null);
        if (subj == null) {
            return;
        }
        int limit = Math.min(subj.length(), SAMPLE_LENGTH);
        for (int i = 0; i < limit; i++) {
            char c = Character.toLowerCase(subj.charAt(i));
            if (c >= 'a' && c <= 'z') {
                count[i][c - 'a']++;
            }
        }
    }

    /**
     * Descends {@code lvl} levels below {@code r} and counts the letters of
     * every child found at the level after that.
     */
    private void iterate(Resource r, int lvl) {
        for (Resource child : r.getChildren()) {
            if (lvl == 0) {
                countLetters(child);
            } else {
                iterate(child, lvl - 1);
            }
        }
    }
}