/* | |
* Licensed to the Apache Software Foundation (ASF) under one or more | |
* contributor license agreements. See the NOTICE file distributed with | |
* this work for additional information regarding copyright ownership. | |
* The ASF licenses this file to You under the Apache License, Version 2.0 | |
* (the "License"); you may not use this file except in compliance with | |
* the License. You may obtain a copy of the License at | |
* | |
* http://www.apache.org/licenses/LICENSE-2.0 | |
* | |
* Unless required by applicable law or agreed to in writing, software | |
* distributed under the License is distributed on an "AS IS" BASIS, | |
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
* See the License for the specific language governing permissions and | |
* limitations under the License. | |
*/ | |
package opennlp.tools.jsmlearning; | |
import java.util.ArrayList; | |
import java.util.HashMap; | |
import java.util.List; | |
import java.util.Map; | |
/* | |
* | |
* The rule is in the form | |
The report also shows how many positive cases are covered by this rule (should be 0) and how many negative cases | |
are covered by this rule (should be above 1) | |
The rule | |
{plugin_number=3, service_type=all, mime_type_number=11, review_status=pass} 0 192 | |
should be read as | |
plugin_number=3 & service_type=all & mime_type_number=11 & review_status=pass | |
For a single-value attribute, its value in the case should equal the value from the rule. For a multi-value attribute, the set of values in the case
should INCLUDE the set of values from the rule.
The rule checking that a case belongs to the negative set is a disjunction of all rules in the result file. | |
input: two data files, one is negative set and another is positive set. | |
in the argument, just the negative file needs to be specified: | |
".../negativeSet1.csv", | |
then the system assumes that the filename for negative is obtained by replacing 'negative' with 'positive' | |
".../positiveSet1.csv", | |
The set of attributes used in the analysis is hard-coded.
*/ | |
public class IntersectionSetBuilder{ | |
private FeatureSpaceCoverageProcessor distProcessorPos, distProcessorNeg; | |
private float percentageOfAllowedSetCover = 0.001f; | |
//The set of attribute in analysis is hard coded | |
String[] fieldsToAggr = new String[]{ | |
"reason_code", "risk_rating", "service_type", "device_match_result", "device_result", "http_referer", "device_id_reason_code", | |
"review_status", "tcp_os_sig_ttl", "tcp_connection_type", | |
"mime_type_number", "plugin_number", "http_connection_type", "device_last_event", "http_connection_type" | |
}; | |
public IntersectionSetBuilder() {}; | |
/* | |
* Takes a file generated by public String ruleFormer(String dataFile) | |
* and performs verification of coverage for positive and negative set, as well as dedupe of rules | |
* The input for negative positive data set is the same as the above function. | |
* The second argument is the rule file generated by the above. | |
* Outputs the verified rule file. | |
*/ | |
public void ruleVerifier(String dataFile, String ruleFile){ | |
List<String[]> negativeSet = ProfileReaderWriter.readProfiles(dataFile); | |
List<String[]> positiveSet = ProfileReaderWriter.readProfiles(dataFile.replace("negative", "positive")); | |
distProcessorPos = new FeatureSpaceCoverageProcessor(); distProcessorNeg = new FeatureSpaceCoverageProcessor(); | |
distProcessorNeg.initParamMap( fieldsToAggr, negativeSet.get(0)); | |
distProcessorPos.initParamMap(fieldsToAggr, positiveSet.get(0)); | |
negativeSet.remove(0); positiveSet.remove(0); | |
List<String[]> ruleStrings = ProfileReaderWriter.readProfiles(ruleFile); | |
List<Map<String, String>> rules = new ArrayList<Map<String, String>>(), dedupedRules = new ArrayList<Map<String, String>>() ; | |
for(String[] l : ruleStrings){ | |
Map<String, String> rule = new HashMap<String, String>(); | |
String lstr = l[0].substring(1, l[0].length()-1); | |
String[] ruleStr= lstr.split(","); | |
for(String attr_valueStr: ruleStr){ | |
String[] attr_value = attr_valueStr.split("="); | |
if (attr_value.length==2) | |
rule.put(attr_value[0].trim(), attr_value[1].trim()); | |
else if (attr_value.length==1) | |
rule.put(attr_value[0].trim(),""); | |
else | |
System.err.println("Problem parsing rule file "+lstr); | |
} | |
rules.add(rule); | |
} | |
for(int i=0; i<rules.size(); i++){ | |
boolean bCovered = false; | |
for(int j=i+1; j<rules.size(); j++){ | |
if (distProcessorNeg.ruleCoversRule(rules.get(j), rules.get(i))){ | |
bCovered = true; | |
} | |
} | |
if (!bCovered) | |
dedupedRules.add(rules.get(i)); | |
} | |
rules = dedupedRules; | |
List<String[]> output = new ArrayList<String[]>(); | |
output.add(new String[]{"rule", "# covers positive", "# covers negative"}); | |
for(Map<String, String> rule: rules){ | |
int countCoverNeg = 0, countCoverPos=0; | |
for(String[] line: positiveSet){ | |
if (distProcessorPos.ruleCoversCase(rule, line)){ | |
countCoverPos++; | |
} | |
} | |
for(String[] line: negativeSet){ | |
if (distProcessorNeg.ruleCoversCase(rule, line)){ | |
countCoverNeg++; | |
} | |
} | |
output.add(new String[]{rule.toString(), new Integer(countCoverPos).toString(), new Integer(countCoverNeg).toString()}); | |
} | |
ProfileReaderWriter.writeReport(output, ruleFile+"Verif1.csv"); | |
} | |
/* | |
* Takes one argument for negative training set file, assumes the positive filename is formed by replacing 'negative'->'positive' | |
* Outputs the filename with generated rules | |
* | |
*/ | |
public String ruleFormer(String dataFile){ | |
List<String[]> negativeSet = ProfileReaderWriter.readProfiles(dataFile); | |
List<String[]> positiveSet = ProfileReaderWriter.readProfiles(dataFile.replace("negative", "positive")); | |
distProcessorPos = new FeatureSpaceCoverageProcessor(); distProcessorNeg = new FeatureSpaceCoverageProcessor(); | |
distProcessorNeg.initParamMap( fieldsToAggr, negativeSet.get(0)); | |
distProcessorPos.initParamMap(fieldsToAggr, positiveSet.get(0)); | |
negativeSet.remove(0); positiveSet.remove(0); | |
List<Map<String, String>> intersections = formIntersectionAmongMembersOfTrainingSetAndVerifyThatDoesNotCoverOppositeTrainingS(negativeSet, positiveSet); | |
List<Map<String, String>> superIntersections = formIntersections(intersections, negativeSet, positiveSet); | |
List<String[]> output = new ArrayList<String[]>(); | |
for(Map<String, String> rule: superIntersections){ | |
int countCover = 0; | |
for(String[] line: positiveSet){ | |
if (distProcessorPos.ruleCoversCase(rule, line)){ | |
countCover++; | |
} | |
} | |
output.add(new String[]{rule.toString(), new Integer(countCover).toString()}); | |
} | |
String outputFile = "learnedRulesForNegativeSetJune23-1.csv"; | |
ProfileReaderWriter.writeReport(output, outputFile); | |
return outputFile; | |
} | |
private List<Map<String, String>> formIntersections(List<Map<String, String>> intersectionsIn, List<String[]> negativeSet, List<String[]> positiveSet) { | |
List<Map<String, String>> intersectionsNew = new ArrayList<Map<String, String>>(); | |
for(int i=0; i<intersectionsIn.size(); i++){ | |
for(int j=i+1; j<intersectionsIn.size(); j++){ | |
Map<String, String> intersection = distProcessorNeg.computeIntersection(intersectionsIn.get(i), intersectionsIn.get(j)); | |
if (intersection.isEmpty()) | |
continue; | |
int countCover = 0; | |
for(String[] line: positiveSet){ | |
if (distProcessorPos.ruleCoversCase(intersection, line)){ | |
//countCover++; | |
countCover = 10000000; | |
break; | |
} | |
} | |
float cover = (float)countCover/(float)positiveSet.size(); | |
if (!(cover<this.percentageOfAllowedSetCover)) | |
continue; | |
List<Map<String, String>> rulesToBeRemoved = new ArrayList<Map<String, String>>(); | |
boolean nothingCoversThisRule = true; | |
for(Map<String, String> intersChecker: intersectionsIn){ // more general rule covers more specific | |
if (distProcessorNeg.ruleCoversRule(intersChecker, intersection)){ | |
nothingCoversThisRule = false; | |
break; | |
} // now check if this new rule defeats built rules | |
if (distProcessorNeg.ruleCoversRule( intersection, intersChecker)){ | |
rulesToBeRemoved.add(intersChecker); | |
} | |
} | |
if(nothingCoversThisRule){ | |
intersectionsNew.add(intersection); | |
intersectionsNew.removeAll(rulesToBeRemoved); | |
} | |
} | |
} | |
intersectionsNew.addAll(intersectionsIn); | |
return intersectionsNew; | |
} | |
private List<Map<String, String>> formIntersectionAmongMembersOfTrainingSetAndVerifyThatDoesNotCoverOppositeTrainingS(List<String[]> negativeSet, List<String[]> positiveSet){ | |
List<Map<String, String>> intersections = new ArrayList<Map<String, String>>(); | |
for(int i=0; i<negativeSet.size() && i<1000; i++){ | |
for(int j=i+1; j<negativeSet.size(); j++){ | |
Map<String, String> intersection = distProcessorNeg.computeIntersection(negativeSet.get(i), negativeSet.get(j)); | |
if (intersection.isEmpty()) | |
continue; | |
/* temporary code that formed rule covers at least 2 cases | |
int countCoverNeg=0; | |
for(String[] line: negativeSet){ | |
if (distProcessorNeg.ruleCoversCase(intersection, line)){ | |
countCoverNeg++; | |
} | |
} | |
if (countCoverNeg<2){ | |
System.err.println("A rule formed but it does not cover its origin! "+intersection); | |
distProcessorNeg.ruleCoversCase(intersection, negativeSet.get(i)); | |
distProcessorNeg.ruleCoversCase(intersection, negativeSet.get(j)); | |
} */ | |
int countCover = 0; | |
for(String[] line: positiveSet){ | |
if (distProcessorPos.ruleCoversCase(intersection, line)){ | |
//countCover++; | |
countCover = 10000000; | |
break; | |
} | |
} | |
float cover = (float)countCover/(float)positiveSet.size(); | |
if (!(cover<this.percentageOfAllowedSetCover)) | |
continue; | |
List<Map<String, String>> rulesToBeRemoved = new ArrayList<Map<String, String>>(); | |
boolean nothingCoversThisRule = true; | |
for(Map<String, String> intersChecker: intersections){ // more general rule covers more specific | |
if (distProcessorNeg.ruleCoversRule(intersChecker, intersection)){ | |
nothingCoversThisRule = false; | |
break; | |
} // now check if this new rule defeats built rules | |
if (distProcessorNeg.ruleCoversRule( intersection, intersChecker)){ | |
rulesToBeRemoved.add(intersChecker); | |
} | |
} | |
if(nothingCoversThisRule){ | |
intersections.add(intersection); | |
intersections.removeAll(rulesToBeRemoved); | |
} | |
} | |
} | |
return intersections; | |
} | |
private List<Map<String, String>> filterIntersectionsByOppositeTrainingSet(List<Map<String, String>> intersections, List<String[]> positiveSet){ | |
List<Map<String, String>> filteredIntersections = new ArrayList<Map<String, String>>(); | |
for(Map<String, String> rule: intersections){ | |
int countCover = 0; | |
for(String[] line: positiveSet){ | |
if (!distProcessorPos.ruleCoversCase(rule, line)) | |
countCover++; | |
} | |
if ((float)countCover/(float)positiveSet.size()<this.percentageOfAllowedSetCover) | |
filteredIntersections.add(rule); | |
} | |
return filteredIntersections; | |
} | |
public boolean applyRule(String[] sample){ | |
return true; | |
// todo implement singleton which reads rule file and applies them | |
} | |
public static void main(String[] args){ | |
IntersectionSetBuilder iBuilder = new IntersectionSetBuilder (); | |
// builds the set of rules | |
String resFile = iBuilder.ruleFormer("C:/workspace/relevanceEngine/src/test/resources/maps/anomaly/negativeSet1.csv"); | |
// verifies and cleans the rules | |
iBuilder.ruleVerifier("C:/workspace/relevanceEngine/src/test/resources/maps/anomaly/negativeSet1.csv", | |
"C:/workspace/relevanceEngine/learnedRulesForNegativeSetJune23-1.csv"); | |
} | |
} | |
/* | |
* | |
* datetime | |
browser_language | |
browser_string | |
device_first_seen | |
device_match_result | |
http_os_signature | |
http_os_sig_raw | |
os | |
device_id_reason_code | |
true_ip | |
proxy_ip | |
http_os_sig_adv_mss | |
http_os_sig_snd_mss | |
http_os_sig_rcv_mss | |
http_os_sig_ttl | |
http_connection_type | |
device_last_event | |
flash_lang | |
flash_os | |
flash_version | |
os_fonts_number | |
plugin_adobe_acrobat | |
plugin_flash | |
plugin_silverlight | |
plugin_windows_media_player | |
profiling_datetime | |
screen_res | |
tcp_os_signature | |
tcp_os_sig_raw | |
time_zone | |
time_zone_dst_offset | |
profile_api_timedelta | |
mime_type_number | |
plugin_number | |
plugin_quicktime | |
plugin_java | |
fuzzy_device_id_confidence | |
fuzzy_device_match_result | |
fuzzy_device_last_event | |
fuzzy_device_first_seen | |
true_ip_city | |
true_ip_first_seen | |
true_ip_geo | |
true_ip_latitude | |
true_ip_longitude | |
account_email_first_seen | |
shipping_address_first_seen | |
tcp_os_sig_ttl
tcp_connection_type | |
page_time_on | |
policy_score | |
reason_code | |
review_status | |
risk_rating | |
*/ |