/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package opennlp.tools.parse_thicket.pattern_structure;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Set;

import org.apache.commons.collections.ListUtils;

import junit.framework.TestCase;

import opennlp.tools.fca.ConceptLattice;
import opennlp.tools.fca.FcaWriter;
import opennlp.tools.fca.FormalConcept;
import opennlp.tools.similarity.apps.BingWebQueryRunner;
import opennlp.tools.similarity.apps.HitBase;
import opennlp.tools.similarity.apps.utils.Pair;
import opennlp.tools.textsimilarity.ParseTreeChunk;
import opennlp.tools.textsimilarity.ParseTreeChunkListScorer;
import opennlp.tools.textsimilarity.ParseTreeMatcherDeterministic;
import opennlp.tools.textsimilarity.chunker2matcher.ParserChunker2MatcherProcessor;

public class JSMLearnerOnLatticeTest extends TestCase{ | |
ParserChunker2MatcherProcessor chunk_maker = ParserChunker2MatcherProcessor.getInstance(); | |
LinguisticPatternStructure psPos = new LinguisticPatternStructure(0,0), psNeg = new LinguisticPatternStructure(0,0); | |
ParseTreeMatcherDeterministic md = new ParseTreeMatcherDeterministic(); | |
public void testJSMLearner() { | |
String text1p = "I rent an office space. This office is for my business. I can deduct office rental expense from my business profit to calculate net income. "; | |
String text2p = "To run my business, I have to rent an office. The net business profit is calculated as follows. Rental expense needs to be subtracted from revenue. "; | |
String text3p = "To store goods for my retail business I rent some space. When I calculate the net income, I take revenue and subtract business expenses such as office rent. "; | |
String text4p = "I rent some space for my business. To calculate my net income, I subtract from revenue my rental business expense."; | |
String text1n = "I rent out a first floor unit of my house to a travel business. I need to add the rental income to my profit. However, when I repair my house, I can deduct the repair expense from my rental income. "; | |
String text2n = "I receive rental income from my office. I have to claim it as a profit in my tax forms. I need to add my rental income to my profits, but subtract rental expenses such as repair from it. "; | |
String text3n = "I advertised my property as a business rental. Advertisement and repair expenses can be subtracted from the rental income. Remaining rental income needs to be added to my profit and be reported as taxable profit. "; | |
String text4n = "I showed my property to a business owner to rent. Expenses on my time spent on advertisement are subtracted from the rental income. My rental profits are added to my taxable income. "; | |
List<List<ParseTreeChunk>> chunks1p = chunk_maker.formGroupedPhrasesFromChunksForPara(text1p); | |
List<List<ParseTreeChunk>> chunks2p = chunk_maker.formGroupedPhrasesFromChunksForPara(text2p); | |
List<List<ParseTreeChunk>> chunks3p = chunk_maker.formGroupedPhrasesFromChunksForPara(text3p); | |
List<List<ParseTreeChunk>> chunks4p = chunk_maker.formGroupedPhrasesFromChunksForPara(text4p); | |
List<List<ParseTreeChunk>> chunks1n = chunk_maker.formGroupedPhrasesFromChunksForPara(text1n); | |
List<List<ParseTreeChunk>> chunks2n = chunk_maker.formGroupedPhrasesFromChunksForPara(text2n); | |
List<List<ParseTreeChunk>> chunks3n = chunk_maker.formGroupedPhrasesFromChunksForPara(text3n); | |
List<List<ParseTreeChunk>> chunks4n = chunk_maker.formGroupedPhrasesFromChunksForPara(text4n); | |
LinkedHashSet<Integer> obj = null; | |
obj = new LinkedHashSet<Integer>(); | |
obj.add(0); | |
psPos.AddIntent(chunks1p, obj, 0); | |
obj = new LinkedHashSet<Integer>(); | |
obj.add(1); | |
psPos.AddIntent(chunks2p, obj, 0); | |
obj = new LinkedHashSet<Integer>(); | |
obj.add(2); | |
psPos.AddIntent(chunks3p, obj, 0); | |
obj = new LinkedHashSet<Integer>(); | |
obj.add(3); | |
psPos.AddIntent(chunks4p, obj, 0); | |
obj = new LinkedHashSet<Integer>(); | |
obj.add(0); | |
psNeg.AddIntent(chunks1n, obj, 0); | |
obj = new LinkedHashSet<Integer>(); | |
obj.add(1); | |
psNeg.AddIntent(chunks2n, obj, 0); | |
obj = new LinkedHashSet<Integer>(); | |
obj.add(2); | |
psNeg.AddIntent(chunks3n, obj, 0); | |
obj = new LinkedHashSet<Integer>(); | |
obj.add(3); | |
psNeg.AddIntent(chunks4n, obj, 0); | |
String unknown = "I do not want to rent anything to anyone. I just want to rent a space for myself. I neither calculate deduction of individual or business tax. I subtract my tax from my income"; | |
List<List<ParseTreeChunk>> chunksUnknown = chunk_maker.formGroupedPhrasesFromChunksForPara(unknown); | |
List<List<List<ParseTreeChunk>>> posIntersections = new ArrayList<List<List<ParseTreeChunk>>>(), | |
negIntersections = new ArrayList<List<List<ParseTreeChunk>>>(); | |
List<List<ParseTreeChunk>> intersection = null; | |
for(int iConcept = 0; iConcept<psPos.conceptList.size(); iConcept++){ | |
if (psPos.conceptList.get(iConcept).intent!=null && psPos.conceptList.get(iConcept).intent.size()>0){ | |
intersection = md | |
.matchTwoSentencesGroupedChunksDeterministic(psPos.conceptList.get(iConcept).intent, chunksUnknown); | |
if (reduceList(intersection).size()>0) | |
posIntersections.add(reduceList(intersection)); | |
} | |
if (psNeg.conceptList.get(iConcept).intent!=null && psNeg.conceptList.get(iConcept).intent.size()>0){ | |
intersection = md | |
.matchTwoSentencesGroupedChunksDeterministic(psNeg.conceptList.get(iConcept).intent, chunksUnknown); | |
if (reduceList(intersection).size()>0) | |
negIntersections.add(reduceList(intersection)); | |
} | |
} | |
Pair<List<List<List<ParseTreeChunk>>>, List<List<List<ParseTreeChunk>>>> pair = | |
removeInconsistenciesFromPosNegIntersections( posIntersections, | |
negIntersections); | |
posIntersections = pair.getFirst(); | |
negIntersections = pair.getSecond(); | |
List<List<List<ParseTreeChunk>>> posIntersectionsUnderNeg = new ArrayList<List<List<ParseTreeChunk>>>(), | |
negIntersectionsUnderPos = new ArrayList<List<List<ParseTreeChunk>>>(); | |
for(int iConcept = 0; iConcept<psNeg.conceptList.size(); iConcept++){ | |
for(int iConceptJ = 0; iConceptJ<negIntersections.size(); iConceptJ++){ | |
intersection = md | |
.matchTwoSentencesGroupedChunksDeterministic(psNeg.conceptList.get(iConcept).intent, negIntersections.get(iConceptJ)); | |
if (reduceList(intersection).size()>0) | |
posIntersectionsUnderNeg.add(reduceList(intersection)); | |
} | |
} | |
for(int iConcept = 0; iConcept<psPos.conceptList.size(); iConcept++){ | |
for(int iConceptJ = 0; iConceptJ<posIntersections.size(); iConceptJ++){ | |
intersection = md | |
.matchTwoSentencesGroupedChunksDeterministic(psPos.conceptList.get(iConcept).intent, posIntersections.get(iConceptJ)); | |
if (reduceList(intersection).size()>0) | |
negIntersectionsUnderPos.add(reduceList(intersection)); | |
} | |
} | |
List<ParseTreeChunk>posIntersectionsUnderNegLst = flattenParseTreeChunkLst(posIntersectionsUnderNeg); | |
List<ParseTreeChunk>negIntersectionsUnderPosLst=flattenParseTreeChunkLst(negIntersectionsUnderPos); | |
posIntersectionsUnderNegLst = subtract(posIntersectionsUnderNegLst, negIntersectionsUnderPosLst); | |
negIntersectionsUnderPosLst= subtract(negIntersectionsUnderPosLst, posIntersectionsUnderNegLst); | |
System.out.println("Pos - neg inters = "+posIntersectionsUnderNegLst); | |
System.out.println("Neg - pos inters = "+negIntersectionsUnderPosLst); | |
} | |
public List<List<ParseTreeChunk>> reduceList(List<List<ParseTreeChunk>> list){ | |
float minScore = 1.3f; | |
List<List<ParseTreeChunk>> newList = new ArrayList<List<ParseTreeChunk>>(); | |
ParseTreeChunkListScorer scorer = new ParseTreeChunkListScorer(); | |
for( List<ParseTreeChunk> group: list){ | |
List<ParseTreeChunk> newGroup = new ArrayList<ParseTreeChunk>(); | |
for(ParseTreeChunk ch: group){ | |
if (scorer.getScore(ch) > minScore) | |
newGroup.add(ch); | |
} | |
if (newGroup.size()>0) | |
newList.add(newGroup); | |
} | |
return newList; | |
} | |
public List<List<ParseTreeChunk>> flattenParseTreeChunkListList(List<List<List<ParseTreeChunk>>> listOfLists){ | |
List<List<ParseTreeChunk>> newList = new ArrayList<List<ParseTreeChunk>>(); | |
for( List<List<ParseTreeChunk>> member: listOfLists){ | |
Set<ParseTreeChunk> newSet= new HashSet<ParseTreeChunk>(); | |
for( List<ParseTreeChunk> group: member){ | |
if (group.size()>0) | |
newSet.addAll(group); | |
} | |
newList.add(new ArrayList<ParseTreeChunk>(newSet)); | |
} | |
return newList; | |
} | |
public List<ParseTreeChunk> flattenParseTreeChunkLst(List<List<List<ParseTreeChunk>>> listOfLists){ | |
List<ParseTreeChunk> newList = new ArrayList<ParseTreeChunk>(); | |
Set<ParseTreeChunk> newSetAll = new HashSet<ParseTreeChunk>(); | |
for( List<List<ParseTreeChunk>> member: listOfLists){ | |
Set<ParseTreeChunk> newSet= new HashSet<ParseTreeChunk>(); | |
for( List<ParseTreeChunk> group: member){ | |
if (group.size()>0) | |
newSet.addAll(group); | |
} | |
newSetAll.addAll(newSet); | |
} | |
return removeDuplicates(new ArrayList<ParseTreeChunk>(newSetAll)); | |
} | |
public List<ParseTreeChunk> removeDuplicates(List<ParseTreeChunk> dupes){ | |
List<Integer> toDelete = new ArrayList<Integer>(); | |
for(int i=0; i<dupes.size(); i++) | |
for(int j=i+1; j<dupes.size(); j++){ | |
if (dupes.get(i).equals(dupes.get(j))){ | |
toDelete.add(j); | |
} | |
} | |
List<ParseTreeChunk> cleaned = new ArrayList<ParseTreeChunk>(); | |
for(int i=0; i<dupes.size(); i++){ | |
if (!toDelete.contains(i)) | |
cleaned.add(dupes.get(i)); | |
} | |
return cleaned; | |
} | |
public List<ParseTreeChunk> subtract(List<ParseTreeChunk> main, List<ParseTreeChunk> toSubtract){ | |
List<Integer> toDelete = new ArrayList<Integer>(); | |
for(int i=0; i<main.size(); i++) | |
for(int j=0; j<toSubtract.size(); j++){ | |
if (main.get(i).equals(toSubtract.get(j))){ | |
toDelete.add(i); | |
} | |
} | |
List<ParseTreeChunk> cleaned = new ArrayList<ParseTreeChunk>(); | |
for(int i=0; i<main.size(); i++){ | |
if (!toDelete.contains(i)) | |
cleaned.add(main.get(i)); | |
} | |
return cleaned; | |
} | |
public List<ParseTreeChunk> intesectParseTreeChunkLists(List<ParseTreeChunk> a, List<ParseTreeChunk> b){ | |
List<Integer> inters = new ArrayList<Integer>(); | |
for(int i=0; i<a.size(); i++) | |
for(int j=0; j<b.size(); j++){ | |
if (a.get(i).equals(b.get(j))){ | |
inters.add(i); | |
} | |
} | |
List<ParseTreeChunk> cleaned = new ArrayList<ParseTreeChunk>(); | |
for(int i=0; i<a.size(); i++){ | |
if (inters.contains(i)) | |
cleaned.add(a.get(i)); | |
} | |
return cleaned; | |
} | |
public Pair<List<List<List<ParseTreeChunk>>>, List<List<List<ParseTreeChunk>>>> | |
removeInconsistenciesFromPosNegIntersections(List<List<List<ParseTreeChunk>>> pos, | |
List<List<List<ParseTreeChunk>>> neg ){ | |
List<ParseTreeChunk> posIntersectionsFl = flattenParseTreeChunkLst(pos); | |
List<ParseTreeChunk> negIntersectionsFl = flattenParseTreeChunkLst(neg); | |
List<ParseTreeChunk> intersParseTreeChunkLists = intesectParseTreeChunkLists(posIntersectionsFl, negIntersectionsFl); | |
List<List<List<ParseTreeChunk>>> cleanedFromInconsPos = new ArrayList<List<List<ParseTreeChunk>>>(), | |
cleanedFromInconsNeg = new ArrayList<List<List<ParseTreeChunk>>>(); | |
/* | |
System.out.println("pos = "+ pos); | |
System.out.println("neg = "+ neg); | |
System.out.println("pos flat = "+ posIntersectionsFl); | |
System.out.println("neg flat = "+ negIntersectionsFl); | |
System.out.println("inters = "+ intersParseTreeChunkLists); | |
*/ | |
for( List<List<ParseTreeChunk>> member: pos){ | |
List<List<ParseTreeChunk>> memberList = new ArrayList<List<ParseTreeChunk>>(); | |
for( List<ParseTreeChunk> group: member){ | |
List<ParseTreeChunk> newGroup = new ArrayList<ParseTreeChunk>(); | |
for(ParseTreeChunk ch: group){ | |
boolean bSkip = false; | |
for(ParseTreeChunk check: intersParseTreeChunkLists){ | |
if (check.equals(ch)) | |
bSkip=true; | |
} | |
if (!bSkip) | |
newGroup.add(ch); | |
} | |
if (newGroup.size()>0) | |
memberList.add(newGroup); | |
} | |
if (memberList.size()>0) | |
cleanedFromInconsPos.add(memberList); | |
} | |
for( List<List<ParseTreeChunk>> member: neg){ | |
List<List<ParseTreeChunk>> memberList = new ArrayList<List<ParseTreeChunk>>(); | |
for( List<ParseTreeChunk> group: member){ | |
List<ParseTreeChunk> newGroup = new ArrayList<ParseTreeChunk>(); | |
for(ParseTreeChunk ch: group){ | |
boolean bSkip = false; | |
for(ParseTreeChunk check: intersParseTreeChunkLists){ | |
if (check.equals(ch)) | |
bSkip=true; | |
} | |
if (!bSkip) | |
newGroup.add(ch); | |
} | |
if (newGroup.size()>0) | |
memberList.add(newGroup); | |
} | |
if (memberList.size()>0) | |
cleanedFromInconsNeg.add(memberList); | |
} | |
return new Pair(cleanedFromInconsPos, cleanedFromInconsNeg); | |
} | |
} |