/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package opennlp.tools.jsmlearning;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

/*
 * Builds and verifies rules that separate a negative training set from a positive one.
 *
 * A rule is a conjunction of attribute=value conditions. The report also shows how many positive cases
 * are covered by each rule (should be 0) and how many negative cases are covered by it (should be above 1).
 * The rule
 *   {plugin_number=3, service_type=all, mime_type_number=11, review_status=pass} 0 192
 * should be read as
 *   plugin_number=3 & service_type=all & mime_type_number=11 & review_status=pass
 * For a single-valued attribute, the value in the case must equal the one in the rule. For a multi-valued
 * attribute, the set of values in the case must INCLUDE the set of values in the rule.
 * The check that a case belongs to the negative set is a disjunction of all rules in the result file.
 *
 * Input: two data files, one for the negative set and one for the positive set.
 * Only the negative file needs to be passed as an argument:
 *   ".../negativeSet1.csv"
 * The system then assumes that the positive filename is obtained by replacing 'negative' with 'positive':
 *   ".../positiveSet1.csv"
 * The set of attributes used in the analysis is hard-coded.
 */
public class IntersectionSetBuilder {
private FeatureSpaceCoverageProcessor distProcessorPos, distProcessorNeg;
private float percentageOfAllowedSetCover = 0.001f;
// The set of attributes used in the analysis is hard-coded
String[] fieldsToAggr = new String[]{
"reason_code", "risk_rating", "service_type", "device_match_result", "device_result", "http_referer", "device_id_reason_code",
"review_status", "tcp_os_sig_ttl", "tcp_connection_type",
"mime_type_number", "plugin_number", "http_connection_type", "device_last_event"
};
public IntersectionSetBuilder() {}
/*
 * Takes a file generated by ruleFormer(String dataFile) and verifies rule coverage against the
 * positive and negative sets, removing rules that are covered by another, more general rule (dedupe).
 * The first argument is the negative data file, as in ruleFormer; the second argument is the rule
 * file produced by ruleFormer. Writes the verified rules to a report file.
 */
public void ruleVerifier(String dataFile, String ruleFile){
List<String[]> negativeSet = ProfileReaderWriter.readProfiles(dataFile);
List<String[]> positiveSet = ProfileReaderWriter.readProfiles(dataFile.replace("negative", "positive"));
distProcessorPos = new FeatureSpaceCoverageProcessor();
distProcessorNeg = new FeatureSpaceCoverageProcessor();
distProcessorNeg.initParamMap(fieldsToAggr, negativeSet.get(0));
distProcessorPos.initParamMap(fieldsToAggr, positiveSet.get(0));
// the first row is the header; drop it from both sets
negativeSet.remove(0);
positiveSet.remove(0);
List<String[]> ruleStrings = ProfileReaderWriter.readProfiles(ruleFile);
List<Map<String, String>> rules = new ArrayList<Map<String, String>>(), dedupedRules = new ArrayList<Map<String, String>>() ;
for(String[] l : ruleStrings){
Map<String, String> rule = new HashMap<String, String>();
// strip the enclosing braces from the rule's map representation, e.g. "{a=1, b=2}"
String lstr = l[0].substring(1, l[0].length() - 1);
String[] ruleStr = lstr.split(",");
for(String attr_valueStr: ruleStr){
String[] attr_value = attr_valueStr.split("=");
if (attr_value.length==2)
rule.put(attr_value[0].trim(), attr_value[1].trim());
else if (attr_value.length==1)
rule.put(attr_value[0].trim(),"");
else
System.err.println("Problem parsing rule file "+lstr);
}
rules.add(rule);
}
// dedupe: drop rule i if another rule j covers it
for (int i = 0; i < rules.size(); i++) {
boolean bCovered = false;
for (int j = i + 1; j < rules.size(); j++) {
if (distProcessorNeg.ruleCoversRule(rules.get(j), rules.get(i))) {
bCovered = true;
break;
}
}
if (!bCovered)
dedupedRules.add(rules.get(i));
}
rules = dedupedRules;
List<String[]> output = new ArrayList<String[]>();
output.add(new String[]{"rule", "# covers positive", "# covers negative"});
for(Map<String, String> rule: rules){
int countCoverNeg = 0, countCoverPos=0;
for(String[] line: positiveSet){
if (distProcessorPos.ruleCoversCase(rule, line)){
countCoverPos++;
}
}
for(String[] line: negativeSet){
if (distProcessorNeg.ruleCoversCase(rule, line)){
countCoverNeg++;
}
}
output.add(new String[]{rule.toString(), Integer.toString(countCoverPos), Integer.toString(countCoverNeg)});
}
ProfileReaderWriter.writeReport(output, ruleFile+"Verif1.csv");
}
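/*
 * Illustrative only, not called by this class: a minimal sketch of the coverage check described
 * in the class comment, restricted to single-valued attributes (the production check, including
 * the INCLUDE test for multi-valued attributes, lives in FeatureSpaceCoverageProcessor).
 * The case is assumed here to be given as an attribute->value map rather than a raw CSV row.
 */
private static boolean ruleCoversCaseSketch(Map<String, String> rule, Map<String, String> caseAttrs) {
for (Map.Entry<String, String> cond : rule.entrySet()) {
// every attribute=value condition of the rule must be satisfied by the case
if (!cond.getValue().equals(caseAttrs.get(cond.getKey())))
return false;
}
return true;
}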
/*
 * Takes the negative training set file as its only argument and assumes the positive filename is
 * obtained by replacing 'negative' with 'positive'.
 * Returns the name of the file with the generated rules.
 */
public String ruleFormer(String dataFile){
List<String[]> negativeSet = ProfileReaderWriter.readProfiles(dataFile);
List<String[]> positiveSet = ProfileReaderWriter.readProfiles(dataFile.replace("negative", "positive"));
distProcessorPos = new FeatureSpaceCoverageProcessor();
distProcessorNeg = new FeatureSpaceCoverageProcessor();
distProcessorNeg.initParamMap(fieldsToAggr, negativeSet.get(0));
distProcessorPos.initParamMap(fieldsToAggr, positiveSet.get(0));
// the first row is the header; drop it from both sets
negativeSet.remove(0);
positiveSet.remove(0);
List<Map<String, String>> intersections = formIntersectionAmongMembersOfTrainingSetAndVerifyThatDoesNotCoverOppositeTrainingS(negativeSet, positiveSet);
List<Map<String, String>> superIntersections = formIntersections(intersections, negativeSet, positiveSet);
List<String[]> output = new ArrayList<String[]>();
for(Map<String, String> rule: superIntersections){
int countCover = 0;
for(String[] line: positiveSet){
if (distProcessorPos.ruleCoversCase(rule, line)){
countCover++;
}
}
output.add(new String[]{rule.toString(), Integer.toString(countCover)});
}
String outputFile = "learnedRulesForNegativeSetJune23-1.csv";
ProfileReaderWriter.writeReport(output, outputFile);
return outputFile;
}
private List<Map<String, String>> formIntersections(List<Map<String, String>> intersectionsIn, List<String[]> negativeSet, List<String[]> positiveSet) {
List<Map<String, String>> intersectionsNew = new ArrayList<Map<String, String>>();
for(int i=0; i<intersectionsIn.size(); i++){
for(int j=i+1; j<intersectionsIn.size(); j++){
Map<String, String> intersection = distProcessorNeg.computeIntersection(intersectionsIn.get(i), intersectionsIn.get(j));
if (intersection.isEmpty())
continue;
int countCover = 0;
for (String[] line : positiveSet) {
if (distProcessorPos.ruleCoversCase(intersection, line)) {
// a single covered positive case disqualifies this intersection, so use a large sentinel count
countCover = 10000000;
break;
}
}
float cover = (float) countCover / (float) positiveSet.size();
// keep only intersections whose positive-set coverage stays below the allowed threshold
if (!(cover < this.percentageOfAllowedSetCover))
continue;
List<Map<String, String>> rulesToBeRemoved = new ArrayList<Map<String, String>>();
boolean nothingCoversThisRule = true;
for (Map<String, String> intersChecker : intersectionsIn) {
// a more general rule covers a more specific one; if an existing rule covers the new one, skip it
if (distProcessorNeg.ruleCoversRule(intersChecker, intersection)) {
nothingCoversThisRule = false;
break;
}
// conversely, if the new rule covers an existing one, mark the existing rule for removal
if (distProcessorNeg.ruleCoversRule(intersection, intersChecker)) {
rulesToBeRemoved.add(intersChecker);
}
}
if(nothingCoversThisRule){
intersectionsNew.add(intersection);
intersectionsNew.removeAll(rulesToBeRemoved);
}
}
}
intersectionsNew.addAll(intersectionsIn);
return intersectionsNew;
}
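/*
 * Illustrative only, not called by this class: a minimal sketch of the generalization step that
 * computeIntersection is expected to perform for single-valued attributes, keeping only the
 * attribute=value conditions two rules agree on (the production code in FeatureSpaceCoverageProcessor
 * may also handle multi-valued attributes).
 */
private static Map<String, String> intersectRulesSketch(Map<String, String> ruleA, Map<String, String> ruleB) {
Map<String, String> intersection = new HashMap<String, String>();
for (Map.Entry<String, String> cond : ruleA.entrySet()) {
// a condition survives only if the other rule has the same value for the same attribute
if (cond.getValue().equals(ruleB.get(cond.getKey())))
intersection.put(cond.getKey(), cond.getValue());
}
return intersection;
}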
private List<Map<String, String>> formIntersectionAmongMembersOfTrainingSetAndVerifyThatDoesNotCoverOppositeTrainingS(List<String[]> negativeSet, List<String[]> positiveSet){
List<Map<String, String>> intersections = new ArrayList<Map<String, String>>();
for(int i=0; i<negativeSet.size() && i<1000; i++){
for(int j=i+1; j<negativeSet.size(); j++){
Map<String, String> intersection = distProcessorNeg.computeIntersection(negativeSet.get(i), negativeSet.get(j));
if (intersection.isEmpty())
continue;
/* temporary code that formed rule covers at least 2 cases
int countCoverNeg=0;
for(String[] line: negativeSet){
if (distProcessorNeg.ruleCoversCase(intersection, line)){
countCoverNeg++;
}
}
if (countCoverNeg<2){
System.err.println("A rule formed but it does not cover its origin! "+intersection);
distProcessorNeg.ruleCoversCase(intersection, negativeSet.get(i));
distProcessorNeg.ruleCoversCase(intersection, negativeSet.get(j));
} */
int countCover = 0;
for (String[] line : positiveSet) {
if (distProcessorPos.ruleCoversCase(intersection, line)) {
// a single covered positive case disqualifies this intersection, so use a large sentinel count
countCover = 10000000;
break;
}
}
float cover = (float) countCover / (float) positiveSet.size();
// keep only intersections whose positive-set coverage stays below the allowed threshold
if (!(cover < this.percentageOfAllowedSetCover))
continue;
List<Map<String, String>> rulesToBeRemoved = new ArrayList<Map<String, String>>();
boolean nothingCoversThisRule = true;
for (Map<String, String> intersChecker : intersections) {
// a more general rule covers a more specific one; if an existing rule covers the new one, skip it
if (distProcessorNeg.ruleCoversRule(intersChecker, intersection)) {
nothingCoversThisRule = false;
break;
}
// conversely, if the new rule covers an existing one, mark the existing rule for removal
if (distProcessorNeg.ruleCoversRule(intersection, intersChecker)) {
rulesToBeRemoved.add(intersChecker);
}
}
if(nothingCoversThisRule){
intersections.add(intersection);
intersections.removeAll(rulesToBeRemoved);
}
}
}
return intersections;
}
private List<Map<String, String>> filterIntersectionsByOppositeTrainingSet(List<Map<String, String>> intersections, List<String[]> positiveSet){
List<Map<String, String>> filteredIntersections = new ArrayList<Map<String, String>>();
for (Map<String, String> rule : intersections) {
// count how many positive cases this rule covers; it should cover (almost) none
int countCover = 0;
for (String[] line : positiveSet) {
if (distProcessorPos.ruleCoversCase(rule, line))
countCover++;
}
if ((float) countCover / (float) positiveSet.size() < this.percentageOfAllowedSetCover)
filteredIntersections.add(rule);
}
}
return filteredIntersections;
}
public boolean applyRule(String[] sample) {
// TODO implement a singleton that reads the rule file and applies the rules to the sample
return true;
}
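/*
 * Illustrative only, not called by this class: the class comment states that membership in the
 * negative set is a disjunction of all learned rules, i.e. a case is flagged as soon as any rule
 * covers it. A minimal sketch of that check, assuming the rules have already been parsed into
 * attribute->value maps and distProcessorNeg has been initialized.
 */
private boolean anyRuleCoversCase(List<Map<String, String>> rules, String[] caseLine) {
for (Map<String, String> rule : rules) {
if (distProcessorNeg.ruleCoversCase(rule, caseLine))
return true; // one matching rule is enough (disjunction)
}
return false;
}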
public static void main(String[] args){
IntersectionSetBuilder iBuilder = new IntersectionSetBuilder();
// builds the set of rules
String resFile = iBuilder.ruleFormer("C:/workspace/relevanceEngine/src/test/resources/maps/anomaly/negativeSet1.csv");
// verifies and cleans the rules
iBuilder.ruleVerifier("C:/workspace/relevanceEngine/src/test/resources/maps/anomaly/negativeSet1.csv",
"C:/workspace/relevanceEngine/learnedRulesForNegativeSetJune23-1.csv");
}
}
/*
 * Attributes available in the input data files:
 * datetime
browser_language
browser_string
device_first_seen
device_match_result
http_os_signature
http_os_sig_raw
os
device_id_reason_code
true_ip
proxy_ip
http_os_sig_adv_mss
http_os_sig_snd_mss
http_os_sig_rcv_mss
http_os_sig_ttl
http_connection_type
device_last_event
flash_lang
flash_os
flash_version
os_fonts_number
plugin_adobe_acrobat
plugin_flash
plugin_silverlight
plugin_windows_media_player
profiling_datetime
screen_res
tcp_os_signature
tcp_os_sig_raw
time_zone
time_zone_dst_offset
profile_api_timedelta
mime_type_number
plugin_number
plugin_quicktime
plugin_java
fuzzy_device_id_confidence
fuzzy_device_match_result
fuzzy_device_last_event
fuzzy_device_first_seen
true_ip_city
true_ip_first_seen
true_ip_geo
true_ip_latitude
true_ip_longitude
account_email_first_seen
shipping_address_first_seen
tcp_os_sig_ttl
tcp_connection_type
page_time_on
policy_score
reason_code
review_status
risk_rating
*/