blob: fd989ba8ce3ac9d4aaca35793fc72f48a7335bee [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package opennlp.tools.parse_thicket.pattern_structure;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Set;
import org.apache.commons.collections.ListUtils;
import junit.framework.TestCase;
import opennlp.tools.fca.ConceptLattice;
import opennlp.tools.fca.FcaWriter;
import opennlp.tools.fca.FormalConcept;
import opennlp.tools.similarity.apps.BingWebQueryRunner;
import opennlp.tools.similarity.apps.HitBase;
import opennlp.tools.similarity.apps.utils.Pair;
import opennlp.tools.textsimilarity.ParseTreeChunk;
import opennlp.tools.textsimilarity.ParseTreeChunkListScorer;
import opennlp.tools.textsimilarity.ParseTreeMatcherDeterministic;
import opennlp.tools.textsimilarity.chunker2matcher.ParserChunker2MatcherProcessor;
/**
 * Builds one pattern structure from positive example paragraphs and one from
 * negative example paragraphs, intersects the intents of their concepts with
 * an unseen paragraph, and prints the chunks that end up specific to each side.
 *
 * <p>NOTE(review): every chunk comparison in this class calls
 * {@code ParseTreeChunk.equals} directly on statically typed arguments and
 * keeps the original receiver/argument order. The original code de-duplicated
 * again after going through a {@code HashSet}, which suggests the chunk
 * equality is not the one hash-based collections use - confirm before
 * replacing these loops with {@code contains}/{@code removeAll}.
 */
public class JSMLearnerOnLatticeTest extends TestCase{

  /** Chunks whose score does not exceed this value are dropped by {@link #reduceList(List)}. */
  private static final float MIN_CHUNK_SCORE = 1.3f;

  ParserChunker2MatcherProcessor chunk_maker = ParserChunker2MatcherProcessor.getInstance();
  LinguisticPatternStructure psPos = new LinguisticPatternStructure(0,0), psNeg = new LinguisticPatternStructure(0,0);
  ParseTreeMatcherDeterministic md = new ParseTreeMatcherDeterministic();

  /**
   * Runs the whole positive/negative comparison for one unseen paragraph and
   * prints the two resulting chunk lists.
   */
  public void testJSMLearner() {
    String[] positiveTexts = {
      "I rent an office space. This office is for my business. I can deduct office rental expense from my business profit to calculate net income. ",
      "To run my business, I have to rent an office. The net business profit is calculated as follows. Rental expense needs to be subtracted from revenue. ",
      "To store goods for my retail business I rent some space. When I calculate the net income, I take revenue and subtract business expenses such as office rent. ",
      "I rent some space for my business. To calculate my net income, I subtract from revenue my rental business expense."
    };
    String[] negativeTexts = {
      "I rent out a first floor unit of my house to a travel business. I need to add the rental income to my profit. However, when I repair my house, I can deduct the repair expense from my rental income. ",
      "I receive rental income from my office. I have to claim it as a profit in my tax forms. I need to add my rental income to my profits, but subtract rental expenses such as repair from it. ",
      "I advertised my property as a business rental. Advertisement and repair expenses can be subtracted from the rental income. Remaining rental income needs to be added to my profit and be reported as taxable profit. ",
      "I showed my property to a business owner to rent. Expenses on my time spent on advertisement are subtracted from the rental income. My rental profits are added to my taxable income. "
    };

    addParagraphs(psPos, positiveTexts);
    addParagraphs(psNeg, negativeTexts);

    String unknown = "I do not want to rent anything to anyone. I just want to rent a space for myself. I neither calculate deduction of individual or business tax. I subtract my tax from my income";
    List<List<ParseTreeChunk>> chunksUnknown = chunk_maker.formGroupedPhrasesFromChunksForPara(unknown);
    List<List<List<ParseTreeChunk>>> unknownOnly = new ArrayList<List<List<ParseTreeChunk>>>();
    unknownOnly.add(chunksUnknown);

    // Each lattice is walked over its OWN concept list. The original shared a
    // single loop bounded by the positive lattice size, which could run past
    // the end of the negative concept list or leave part of it unvisited.
    List<List<List<ParseTreeChunk>>> posIntersections = intersectConcepts(psPos, unknownOnly);
    List<List<List<ParseTreeChunk>>> negIntersections = intersectConcepts(psNeg, unknownOnly);

    Pair<List<List<List<ParseTreeChunk>>>, List<List<List<ParseTreeChunk>>>> pair =
        removeInconsistenciesFromPosNegIntersections(posIntersections, negIntersections);
    posIntersections = pair.getFirst();
    negIntersections = pair.getSecond();

    // NOTE(review): the pairing below is kept exactly as in the original - the
    // list named "pos...UnderNeg" is built from the negative lattice and the
    // negative intersections, and vice versa. The names suggest the pairing
    // might have been meant to be crossed; confirm with the algorithm's author.
    List<List<List<ParseTreeChunk>>> posIntersectionsUnderNeg = intersectConcepts(psNeg, negIntersections);
    List<List<List<ParseTreeChunk>>> negIntersectionsUnderPos = intersectConcepts(psPos, posIntersections);

    List<ParseTreeChunk> posAll = flattenParseTreeChunkLst(posIntersectionsUnderNeg);
    List<ParseTreeChunk> negAll = flattenParseTreeChunkLst(negIntersectionsUnderPos);

    // Both differences are taken against the untouched lists. The original
    // subtracted the already-reduced first list from the second, which could
    // never remove anything.
    List<ParseTreeChunk> posIntersectionsUnderNegLst = subtract(posAll, negAll);
    List<ParseTreeChunk> negIntersectionsUnderPosLst = subtract(negAll, posAll);

    System.out.println("Pos - neg inters = "+posIntersectionsUnderNegLst);
    System.out.println("Neg - pos inters = "+negIntersectionsUnderPosLst);

    // Sanity checks: after the two subtractions neither list may still
    // contain a chunk that matches one in the other list.
    assertTrue("pos-only chunks must not match neg-only chunks",
        intesectParseTreeChunkLists(posIntersectionsUnderNegLst, negIntersectionsUnderPosLst).isEmpty());
    assertTrue("neg-only chunks must not match pos-only chunks",
        intesectParseTreeChunkLists(negIntersectionsUnderPosLst, posIntersectionsUnderNegLst).isEmpty());
  }

  /**
   * Chunks every paragraph and adds it to {@code ps} as its own object; the
   * object id is the paragraph's index in the array.
   */
  private void addParagraphs(LinguisticPatternStructure ps, String[] paragraphs) {
    for (int objectId = 0; objectId < paragraphs.length; objectId++) {
      List<List<ParseTreeChunk>> chunks =
          chunk_maker.formGroupedPhrasesFromChunksForPara(paragraphs[objectId]);
      LinkedHashSet<Integer> extent = new LinkedHashSet<Integer>();
      extent.add(objectId);
      ps.AddIntent(chunks, extent, 0);
    }
  }

  /**
   * Matches the intent of every concept of {@code ps} against every element of
   * {@code targets} and collects the non-empty, score-filtered results in
   * concept-major order. Concepts with a null or empty intent are skipped.
   */
  private List<List<List<ParseTreeChunk>>> intersectConcepts(LinguisticPatternStructure ps,
      List<List<List<ParseTreeChunk>>> targets) {
    List<List<List<ParseTreeChunk>>> collected = new ArrayList<List<List<ParseTreeChunk>>>();
    for (int iConcept = 0; iConcept < ps.conceptList.size(); iConcept++) {
      if (ps.conceptList.get(iConcept).intent == null
          || ps.conceptList.get(iConcept).intent.size() == 0) {
        continue;
      }
      for (List<List<ParseTreeChunk>> target : targets) {
        // Filter once and reuse the result for both the emptiness check and the add.
        List<List<ParseTreeChunk>> reduced = reduceList(
            md.matchTwoSentencesGroupedChunksDeterministic(ps.conceptList.get(iConcept).intent, target));
        if (reduced.size() > 0) {
          collected.add(reduced);
        }
      }
    }
    return collected;
  }

  /**
   * Keeps only the chunks whose score is strictly greater than
   * {@link #MIN_CHUNK_SCORE}; groups left without any chunk are dropped.
   *
   * @param list grouped chunks to filter
   * @return a new list of new groups; the argument is not modified
   */
  public List<List<ParseTreeChunk>> reduceList(List<List<ParseTreeChunk>> list){
    List<List<ParseTreeChunk>> newList = new ArrayList<List<ParseTreeChunk>>();
    ParseTreeChunkListScorer scorer = new ParseTreeChunkListScorer();
    for (List<ParseTreeChunk> group : list) {
      List<ParseTreeChunk> newGroup = new ArrayList<ParseTreeChunk>();
      for (ParseTreeChunk ch : group) {
        if (scorer.getScore(ch) > MIN_CHUNK_SCORE) {
          newGroup.add(ch);
        }
      }
      if (newGroup.size() > 0) {
        newList.add(newGroup);
      }
    }
    return newList;
  }

  /**
   * Collapses the groups of every member into a single list per member.
   * A member always yields one (possibly empty) list. Chunks the set treats
   * as equal are stored once; the rest keep first-seen order.
   *
   * @param listOfLists members, each a list of chunk groups
   * @return one flattened chunk list per member, in member order
   */
  public List<List<ParseTreeChunk>> flattenParseTreeChunkListList(List<List<List<ParseTreeChunk>>> listOfLists){
    List<List<ParseTreeChunk>> newList = new ArrayList<List<ParseTreeChunk>>();
    for (List<List<ParseTreeChunk>> member : listOfLists) {
      // LinkedHashSet instead of HashSet so the result order is reproducible.
      Set<ParseTreeChunk> seen = new LinkedHashSet<ParseTreeChunk>();
      for (List<ParseTreeChunk> group : member) {
        seen.addAll(group);
      }
      newList.add(new ArrayList<ParseTreeChunk>(seen));
    }
    return newList;
  }

  /**
   * Collapses all groups of all members into one chunk list, then removes
   * remaining duplicates with {@link #removeDuplicates(List)}.
   *
   * @param listOfLists members, each a list of chunk groups
   * @return the de-duplicated chunks in first-seen order
   */
  public List<ParseTreeChunk> flattenParseTreeChunkLst(List<List<List<ParseTreeChunk>>> listOfLists){
    // LinkedHashSet instead of HashSet so the result order is reproducible.
    Set<ParseTreeChunk> seen = new LinkedHashSet<ParseTreeChunk>();
    for (List<List<ParseTreeChunk>> member : listOfLists) {
      for (List<ParseTreeChunk> group : member) {
        seen.addAll(group);
      }
    }
    return removeDuplicates(new ArrayList<ParseTreeChunk>(seen));
  }

  /**
   * Returns the chunks of {@code dupes} without any element that an earlier
   * element reports as equal to it.
   *
   * @param dupes chunks that may contain duplicates; not modified
   * @return a new list preserving the order of the surviving chunks
   */
  public List<ParseTreeChunk> removeDuplicates(List<ParseTreeChunk> dupes){
    boolean[] duplicate = new boolean[dupes.size()];
    for (int i = 0; i < dupes.size(); i++) {
      for (int j = i + 1; j < dupes.size(); j++) {
        if (dupes.get(i).equals(dupes.get(j))) {
          duplicate[j] = true;
        }
      }
    }
    List<ParseTreeChunk> cleaned = new ArrayList<ParseTreeChunk>();
    for (int i = 0; i < dupes.size(); i++) {
      if (!duplicate[i]) {
        cleaned.add(dupes.get(i));
      }
    }
    return cleaned;
  }

  /**
   * Returns the chunks of {@code main} that do not equal any chunk of
   * {@code toSubtract}.
   *
   * @param main       chunks to filter; not modified
   * @param toSubtract chunks to remove from {@code main}
   * @return a new list preserving the order of {@code main}
   */
  public List<ParseTreeChunk> subtract(List<ParseTreeChunk> main, List<ParseTreeChunk> toSubtract){
    List<ParseTreeChunk> cleaned = new ArrayList<ParseTreeChunk>();
    for (ParseTreeChunk candidate : main) {
      if (!equalsAny(candidate, toSubtract)) {
        cleaned.add(candidate);
      }
    }
    return cleaned;
  }

  /**
   * Returns the chunks of {@code a} that equal at least one chunk of {@code b}.
   *
   * @param a chunks to filter; not modified
   * @param b chunks to look for
   * @return a new list preserving the order of {@code a}
   */
  public List<ParseTreeChunk> intesectParseTreeChunkLists(List<ParseTreeChunk> a, List<ParseTreeChunk> b){
    List<ParseTreeChunk> cleaned = new ArrayList<ParseTreeChunk>();
    for (ParseTreeChunk candidate : a) {
      if (equalsAny(candidate, b)) {
        cleaned.add(candidate);
      }
    }
    return cleaned;
  }

  /** Tells whether {@code candidate.equals(other)} holds for at least one element of {@code others}. */
  private boolean equalsAny(ParseTreeChunk candidate, List<ParseTreeChunk> others) {
    for (ParseTreeChunk other : others) {
      if (candidate.equals(other)) {
        return true;
      }
    }
    return false;
  }

  /**
   * Removes from both inputs every chunk that occurs on both the positive and
   * the negative side. Groups and members left empty are dropped.
   *
   * @param pos positive intersections; not modified
   * @param neg negative intersections; not modified
   * @return a pair holding the cleaned positive list first and the cleaned
   *         negative list second
   */
  public Pair<List<List<List<ParseTreeChunk>>>, List<List<List<ParseTreeChunk>>>>
      removeInconsistenciesFromPosNegIntersections(List<List<List<ParseTreeChunk>>> pos,
          List<List<List<ParseTreeChunk>>> neg ){
    List<ParseTreeChunk> posIntersectionsFl = flattenParseTreeChunkLst(pos);
    List<ParseTreeChunk> negIntersectionsFl = flattenParseTreeChunkLst(neg);
    List<ParseTreeChunk> intersParseTreeChunkLists =
        intesectParseTreeChunkLists(posIntersectionsFl, negIntersectionsFl);

    List<List<List<ParseTreeChunk>>> cleanedFromInconsPos = dropConflictingChunks(pos, intersParseTreeChunkLists);
    List<List<List<ParseTreeChunk>>> cleanedFromInconsNeg = dropConflictingChunks(neg, intersParseTreeChunkLists);

    return new Pair<List<List<List<ParseTreeChunk>>>, List<List<List<ParseTreeChunk>>>>(
        cleanedFromInconsPos, cleanedFromInconsNeg);
  }

  /**
   * Copies {@code members} while leaving out every chunk that some element of
   * {@code conflicting} reports as equal to it; empty groups and empty members
   * are not copied.
   */
  private List<List<List<ParseTreeChunk>>> dropConflictingChunks(List<List<List<ParseTreeChunk>>> members,
      List<ParseTreeChunk> conflicting) {
    List<List<List<ParseTreeChunk>>> cleaned = new ArrayList<List<List<ParseTreeChunk>>>();
    for (List<List<ParseTreeChunk>> member : members) {
      List<List<ParseTreeChunk>> memberList = new ArrayList<List<ParseTreeChunk>>();
      for (List<ParseTreeChunk> group : member) {
        List<ParseTreeChunk> newGroup = new ArrayList<ParseTreeChunk>();
        for (ParseTreeChunk ch : group) {
          boolean bSkip = false;
          for (ParseTreeChunk check : conflicting) {
            // Receiver/argument order kept as in the original comparison.
            if (check.equals(ch)) {
              bSkip = true;
              break;
            }
          }
          if (!bSkip) {
            newGroup.add(ch);
          }
        }
        if (newGroup.size() > 0) {
          memberList.add(newGroup);
        }
      }
      if (memberList.size() > 0) {
        cleaned.add(memberList);
      }
    }
    return cleaned;
  }
}