blob: 5df0dee523598fb4f98de16e59055ea242f3e900 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package opennlp.tools.parse_thicket.matching;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import opennlp.tools.jsmlearning.ProfileReaderWriter;
import opennlp.tools.parse_thicket.VerbNetProcessor;
import org.apache.commons.io.FileUtils;
import org.apache.tika.Tika;
import org.apache.tika.exception.TikaException;
/**
 * Command-line driver that runs a {@link FrameQueryBasedIExtractor} over a single text file
 * or over every file found under a directory, using sample sentences as extraction templates.
 *
 * <p>Instances keep a mutable queue of pending files and are therefore not safe for
 * concurrent use.
 */
public class PersonalInformationExtractor {

    /** Separator between individual template samples inside a template file. */
    private static final String SAMPLE_SEPARATOR = "&";

    /** Name of the CSV report written by {@link #processDirectory(String, String)}. */
    private static final String REPORT_FILE = "PII_report.csv";

    FrameQueryBasedIExtractor extractor = new FrameQueryBasedIExtractor();

    /** Files collected by {@link #addFiles(File)} and awaiting processing. */
    private final List<File> queue = new ArrayList<File>();

    private final Tika tika = new Tika();

    /**
     * Runs the extractor on one file using a built-in set of sample sentences as templates.
     * If the file cannot be read, the failure is reported and nothing is extracted.
     *
     * @param filename path of the text file to analyze
     */
    public void runExtractor(String filename) {
        String content;
        try {
            content = FileUtils.readFileToString(new File(filename));
        } catch (IOException e) {
            e.printStackTrace();
            // Without content there is nothing to extract from; do not pass null downstream.
            return;
        }

        extractor.buildTemplates(new String[] { "John Doe send his California driver license 1234567 . "
                + "Jill Jones received her Ohio license 4567456. ",
                " Mary Poppins got her identification 8765. Jorge Malony sold his identification 9876. ",
                //" President Jorge Smith of Microsoft used his id 4567. Manager John Smith of Google used his id 8765. "
                " Johh Doe 123. Don Joe 1323. "
        });
        // NOTE(review): the extraction result is not consumed here, same as before.
        extractor.doIE(content);
    }

    /**
     * Recursively collects regular files under {@code file} into {@link #queue}.
     * Paths that do not exist, or directories that cannot be listed, are reported and skipped.
     *
     * @param file a file or directory to scan
     */
    private void addFiles(File file) {
        try {
            if (!file.exists()) {
                System.out.println(file + " does not exist.");
                // Previously fell through and queued the missing path anyway.
                return;
            }
            if (!file.isDirectory()) {
                queue.add(file);
                return;
            }
            // listFiles() returns null when the directory cannot be read.
            File[] children = file.listFiles();
            if (children == null) {
                System.out.println("Cannot list directory " + file);
                return;
            }
            for (File child : children) {
                addFiles(child);
            }
        } catch (SecurityException e) {
            // Best effort: keep scanning siblings, but say what was skipped.
            System.out.println("Access denied to " + file + ": " + e.getMessage());
        }
    }

    /**
     * Extracts from every file under {@code filename} and writes a CSV report
     * ({@value #REPORT_FILE}) with one row per generalization result.
     *
     * <p>The report is rewritten after each processed file, so partial results
     * are available even if a later file fails.
     *
     * @param filename file or directory to scan (directories are traversed recursively)
     * @param template path of a file holding template samples separated by {@code &}
     * @throws IOException if the template file cannot be read
     */
    public void processDirectory(String filename, String template) throws IOException {
        List<String[]> report = new ArrayList<String[]>();
        report.add(new String[] { "filename", "text", "generalization", "fired?" });

        // Let a failed template read propagate: without templates nothing can be extracted.
        String templateStr = FileUtils.readFileToString(new File(template));
        String[] samples = templateStr.split(SAMPLE_SEPARATOR);
        extractor.buildTemplates(samples);

        try {
            addFiles(new File(filename));

            for (File f : queue) {
                try {
                    String content = tika.parseToString(f);
                    List<GeneralizationResult> res = extractor.doIE(content);
                    for (GeneralizationResult gr : res) {
                        // Record the file actually processed, not the directory argument.
                        report.add(new String[] { f.getAbsolutePath(), gr.getText(),
                                gr.getGen().toString(), gr.getbFire().toString() });
                    }
                } catch (TikaException e) {
                    System.out.println("Tika problem with file" + f.getAbsolutePath());
                } catch (Exception ee) {
                    // A failure on one file must not stop the rest of the batch.
                    ee.printStackTrace();
                }
                ProfileReaderWriter.writeReport(report, REPORT_FILE);
            }
        } finally {
            // Always reset, so a failed run cannot leak files into the next call.
            queue.clear();
        }
    }

    /**
     * Runs the extractor on one file using templates loaded from {@code template}.
     * If either file cannot be read, the failure is reported and nothing is extracted.
     *
     * @param filename path of the text file to analyze
     * @param template path of a file holding template samples separated by {@code &}
     */
    public void runExtractor(String filename, String template) {
        String content;
        String templateStr;
        try {
            content = FileUtils.readFileToString(new File(filename));
            templateStr = FileUtils.readFileToString(new File(template));
        } catch (IOException e) {
            e.printStackTrace();
            // Previously continued and dereferenced the null template string.
            return;
        }

        String[] samples = templateStr.split(SAMPLE_SEPARATOR);
        extractor.buildTemplates(samples);
        List<GeneralizationResult> res = extractor.doIE(content);

        // NOTE(review): these rows are assembled but never written or returned, same as
        // before; confirm whether they should be persisted like in processDirectory.
        List<String[]> report = new ArrayList<String[]>();
        for (GeneralizationResult gr : res) {
            report.add(new String[] { filename, gr.getText(), gr.getGen().toString(),
                    gr.getbFire().toString() });
        }
    }

    /**
     * Entry point. Expects exactly three arguments: path-to-resources,
     * path-to-file-to-analyze and path-to-file-with_samples.
     *
     * @param args command-line arguments
     */
    public static void main(String[] args) {
        //String filename = "/Users/borisgalitsky/Documents/workspace/deepContentInspection/src/test/resources/pii/agreement.txt";
        if (args == null || args.length != 3) {
            System.err.println("Usage: java -Xmx10g -jar *.jar path-to-resources path-to-file-to-analyze path-to-file-with_samples\n");
            // Stop here; indexing args below would throw.
            return;
        }
        try {
            VerbNetProcessor.getInstance(args[0]);
            new PersonalInformationExtractor().processDirectory(args[1], args[2]);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}