// blob: 6de318047cd87b9249a01dea08e542da72e3be74 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package opennlp.tools.parse_thicket.opinion_processor;
import java.io.File;
import java.io.IOException;
import java.lang.reflect.Array;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import org.apache.commons.io.FileUtils;
import org.apache.commons.lang3.StringUtils;
import au.com.bytecode.opencsv.CSVWriter;
import opennlp.tools.jsmlearning.ProfileReaderWriter;
import opennlp.tools.parse_thicket.ParseTreeNode;
import opennlp.tools.textsimilarity.ParseTreeChunk;
/**
 * Command-line runner that feeds tweets read from a profile file through a
 * {@link TwitterFilter} and writes two reports for the given run number:
 * <ul>
 * <li>{@code phrasesExtractedFromTweets3_<nRun>.csv} - per tweet: the tweet itself, the
 * extracted named-entity words, the extracted phrases as words and as parse-tree nodes;</li>
 * <li>{@code ful_lessTweets3_<nRun>.csv} - tweets classified as meaningless (first column)
 * listed side by side with tweets classified as meaningful (second column).</li>
 * </ul>
 */
public class TwitterEngineRunner {
    /** Only referenced by the commented-out directory-processing code below the class. */
    private List<File> queue;

    /** Default input file, used when the caller does not supply a source path. */
    private final static String twSource = "/Users/bgalitsky/Documents/workspace/TwitterMiner/data/TwitterArtistsDynamicsTot12_07.csv";

    /** Extractor applied to every tweet. */
    TwitterFilter neExtractor = new TwitterFilter();

    /**
     * Window size of the disabled batching mode, in which run {@code nRun} would start at
     * row {@code iWind * nRun}. The active code processes every row and does not read it.
     */
    private static int iWind = 80;

    /** Index of the first column treated as a tweet; earlier columns are not analysed. */
    private static final int FIRST_TWEET_COLUMN = 3;

    /** Tweets longer than this many characters are skipped. */
    private static final int MAX_TWEET_LENGTH = 180;

    /**
     * Processes the default input file.
     *
     * @param nRun run number, used only as a suffix of the report file names
     */
    public void processTweetFile(int nRun) {
        processTweetFile(nRun, twSource);
    }

    /**
     * Processes every row of {@code sourcePath}: each distinct tweet of a row (columns from
     * {@link #FIRST_TWEET_COLUMN} on) is analysed, reported and classified.
     *
     * @param nRun run number, used only as a suffix of the report file names
     * @param sourcePath path of the file handed to {@code ProfileReaderWriter.readProfiles}
     */
    public void processTweetFile(int nRun, String sourcePath) {
        String phrasesReportName = "phrasesExtractedFromTweets3_" + nRun + ".csv";
        String fulLessReportName = "ful_lessTweets3_" + nRun + ".csv";

        List<String[]> report = new ArrayList<String[]>();
        List<String> meaningLESS = new ArrayList<String>();
        List<String> meaningFUL = new ArrayList<String>();
        report.add(new String[] { "text", "phrases of potential interest list" });

        List<String[]> texts = ProfileReaderWriter.readProfiles(sourcePath);
        for (String[] text : texts) {
            // The null/length guard must run before the row is touched in any way.
            if (text == null || text.length <= FIRST_TWEET_COLUMN) {
                continue;
            }
            // Duplicates are dropped while walking the columns in their original order, so
            // the leading columns are skipped reliably and the reported tweet is the one
            // that was actually analysed.
            Set<String> seenTweets = new HashSet<String>();
            for (int nInLine = FIRST_TWEET_COLUMN; nInLine < text.length; nInLine++) {
                String tweet = text[nInLine];
                if (tweet == null || tweet.length() > MAX_TWEET_LENGTH || !seenTweets.add(tweet)) {
                    continue;
                }
                // NOTE(review): String.replace matches its first argument literally, so this
                // JavaScript-style regex literal practically never matches and the tweet is
                // passed on unchanged. The intended pattern is unclear; behavior is kept.
                String cleanedTweet = tweet.replace("/\\bs\\@+/ig", "");

                EntityExtractionResult result;
                try {
                    result = neExtractor.extractEntities(cleanedTweet);
                } catch (Exception e) {
                    // Best effort: a tweet that cannot be analysed is logged and skipped.
                    e.printStackTrace();
                    continue;
                }
                if (result == null) {
                    continue;
                }

                report.add(new String[] { text[0], tweet });
                report.add((String[]) result.extractedNERWords.toArray(new String[0]));

                List<String> stringPhrases = new ArrayList<String>();
                List<String> nodePhrases = new ArrayList<String>();
                for (List<ParseTreeNode> phrase : result.extractedSentimentPhrases) {
                    StringBuilder words = new StringBuilder();
                    StringBuilder nodes = new StringBuilder();
                    for (ParseTreeNode node : phrase) {
                        words.append(node.getWord()).append(' ');
                        nodes.append(node.toString()).append(' ');
                    }
                    stringPhrases.add(words.toString().trim());
                    nodePhrases.add(nodes.toString().trim());
                }
                report.add(stringPhrases.toArray(new String[0]));
                report.add(nodePhrases.toArray(new String[0]));

                if (isMeaningful(nodePhrases)) {
                    report.add(new String[] { "===", "MEANINGFUL tweet" });
                    if (!meaningFUL.contains(cleanedTweet)) {
                        meaningFUL.add(cleanedTweet);
                    }
                } else if (!meaningLESS.contains(cleanedTweet)) {
                    meaningLESS.add(cleanedTweet);
                }
                report.add(new String[] { "-----------------------------------------------------" });

                // Both reports are rewritten after every analysed tweet, as in the original
                // code; presumably so partial results survive an interrupted run.
                ProfileReaderWriter.writeReport(report, phrasesReportName);
                ProfileReaderWriter.writeReport(pairUp(meaningLESS, meaningFUL), fulLessReportName);
            }
        }
    }

    /**
     * A tweet counts as meaningful when it has at least two extracted phrases and each of
     * the first two contains a {@code >VP'} or {@code >NNP'} marker in its node rendering.
     */
    private static boolean isMeaningful(List<String> nodePhrases) {
        return nodePhrases.size() > 1
                && hasVerbOrProperNounMarker(nodePhrases.get(0))
                && hasVerbOrProperNounMarker(nodePhrases.get(1));
    }

    /** Tells whether the node rendering of a phrase contains a {@code >VP'} or {@code >NNP'} marker. */
    private static boolean hasVerbOrProperNounMarker(String nodePhrase) {
        return nodePhrase.contains(">VP'") || nodePhrase.contains(">NNP'");
    }

    /**
     * Builds the side-by-side rows {meaningless, meaningful}. The shorter list is padded
     * with empty strings so that no tweet of the longer list is lost.
     */
    private static List<String[]> pairUp(List<String> meaningless, List<String> meaningful) {
        int rowCount = Math.max(meaningless.size(), meaningful.size());
        List<String[]> rows = new ArrayList<String[]>(rowCount);
        for (int i = 0; i < rowCount; i++) {
            String less = i < meaningless.size() ? meaningless.get(i) : "";
            String ful = i < meaningful.size() ? meaningful.get(i) : "";
            rows.add(new String[] { less, ful });
        }
        return rows;
    }

    /**
     * Entry point.
     *
     * @param args {@code args[0]} is the integer run number; the optional {@code args[1]}
     *        is the path of the input file (the built-in default path is used otherwise)
     * @throws IllegalArgumentException if the run number is missing or not an integer
     */
    public static void main(String[] args) {
        String usage = "Usage: TwitterEngineRunner <nRun> [sourcePath]";
        if (args == null || args.length < 1) {
            throw new IllegalArgumentException(usage);
        }
        int nRun;
        try {
            nRun = Integer.parseInt(args[0]);
        } catch (NumberFormatException e) {
            throw new IllegalArgumentException(
                    "nRun must be an integer but was '" + args[0] + "'. " + usage, e);
        }
        TwitterEngineRunner runner = new TwitterEngineRunner();
        if (args.length > 1) {
            runner.processTweetFile(nRun, args[1]);
        } else {
            runner.processTweetFile(nRun);
        }
    }
}
/*
public void processDirectory(String path){
List<String[]> report = new ArrayList<String[]>();
report.add(new String[] { "filename", "named entity list", "phrases of potential interest list" });
List<String> allNamedEntities = new ArrayList<String>();
addFiles(new File(path));
for(File f: queue){
List<String> entities = (List<String>) extractEntities(f.getAbsolutePath()).getFirst();
List<String> opinions = (List<String>) extractEntities(f.getAbsolutePath()).getSecond();
report.add(new String[]{ f.getName(), entities.toString(), opinions.toString()});
ProfileReaderWriter.writeReport(report, "nameEntitiesExtracted.csv");
allNamedEntities.addAll(entities);
allNamedEntities = new ArrayList<String>(new HashSet<String> (allNamedEntities ));
}
ProfileReaderWriter.writeReport(report, "nameEntitiesTopicsOfInterestExtracted.csv");
}
} */