/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package opennlp.tools.parse_thicket.rhetoric_structure; | |
import java.util.ArrayList; | |
import java.util.Arrays; | |
import java.util.List; | |
import opennlp.tools.parse_thicket.IGeneralizer; | |
import opennlp.tools.parse_thicket.Pair; | |
import opennlp.tools.parse_thicket.ParseTreeNode; | |
public class RhetoricStructureMarker implements IGeneralizer<Integer[]> { | |
//private static String rstRelations[] = {"antithesis", "concession", "contrast", "elaboration"}; | |
List<Pair<String, ParseTreeNode[]>> rstMarkers = new ArrayList<Pair<String, ParseTreeNode[]>>(); | |
public RhetoricStructureMarker(){ | |
rstMarkers.add(new Pair<String, ParseTreeNode[]>("contrast", new ParseTreeNode[]{new ParseTreeNode(",",","), new ParseTreeNode("than",",") })); | |
rstMarkers.add(new Pair<String, ParseTreeNode[]>( "antithesis", new ParseTreeNode[]{new ParseTreeNode("although",","), new ParseTreeNode("*","*") })); | |
rstMarkers.add(new Pair<String, ParseTreeNode[]>( "contrast", new ParseTreeNode[]{new ParseTreeNode(",",","), new ParseTreeNode("however","*") })); | |
rstMarkers.add(new Pair<String, ParseTreeNode[]>( "contrast", new ParseTreeNode[]{new ParseTreeNode("however","*"), new ParseTreeNode(",",","), | |
new ParseTreeNode("*","prp"), })); | |
rstMarkers.add(new Pair<String, ParseTreeNode[]>( "elaboration", new ParseTreeNode[]{new ParseTreeNode(",",","), new ParseTreeNode("*","NN") })); | |
rstMarkers.add(new Pair<String, ParseTreeNode[]>( "elaboration", new ParseTreeNode[]{new ParseTreeNode("as","*"), new ParseTreeNode("a","*") })); | |
rstMarkers.add(new Pair<String, ParseTreeNode[]>("explanation", new ParseTreeNode[]{new ParseTreeNode(",",","), new ParseTreeNode("because",",") })); | |
rstMarkers.add(new Pair<String, ParseTreeNode[]>( "example", new ParseTreeNode[]{new ParseTreeNode("for","IN"), new ParseTreeNode("example","NN") })); | |
rstMarkers.add(new Pair<String, ParseTreeNode[]>( "contrast", new ParseTreeNode[]{new ParseTreeNode(",",","), new ParseTreeNode("ye","*") })); | |
rstMarkers.add(new Pair<String, ParseTreeNode[]>( "contrast", new ParseTreeNode[]{new ParseTreeNode("yet","*"), new ParseTreeNode(",",","), | |
new ParseTreeNode("*","prp"), })); | |
rstMarkers.add(new Pair<String, ParseTreeNode[]>( "contrast", new ParseTreeNode[]{new ParseTreeNode("yet","*"), new ParseTreeNode("i","*"), | |
})); | |
rstMarkers.add(new Pair<String, ParseTreeNode[]>( "explanation", new ParseTreeNode[]{new ParseTreeNode(",",","), new ParseTreeNode("where","*") })); | |
//as long as | |
rstMarkers.add(new Pair<String, ParseTreeNode[]>( "temp_sequence", new ParseTreeNode[]{/*new ParseTreeNode("as","*"),*/ new ParseTreeNode("*","RB"), | |
new ParseTreeNode("as","IN"),})); | |
rstMarkers.add(new Pair<String, ParseTreeNode[]>( "temp_sequence", new ParseTreeNode[]{/*new ParseTreeNode("as","*"),*/ new ParseTreeNode("*","VB*"), | |
new ParseTreeNode("until","IN"),})); | |
} | |
/* For a sentence, we obtain a list of markers with the CA word and position in the sentence | |
* Output span is an integer array with start/end occurrence of an RST marker in a sentence | |
* */ | |
public List<Pair<String, Integer[]>> extractRSTrelationInSentenceGetBoundarySpan(List<ParseTreeNode> sentence){ | |
List<Pair<String, Integer[]>> results = new ArrayList<Pair<String, Integer[]>> (); | |
for(Pair<String, ParseTreeNode[]> template: rstMarkers){ | |
List<Integer[]> spanList = generalize(sentence,template.getSecond() ); | |
if (!spanList.isEmpty()) | |
results.add(new Pair<String, Integer[]>(template.getFirst(), spanList.get(0))); | |
} | |
return results; | |
} | |
/* Rule application in the form of generalization | |
* Generalizing a sentence with a rule (a template), we obtain the occurrence of rhetoric marker | |
* | |
* o1 - sentence | |
* o2 - rule/template, specifying lemmas and/or POS, including punctuation | |
* @see opennlp.tools.parse_thicket.IGeneralizer#generalize(java.lang.Object, java.lang.Object) | |
* returns the span Integer[] | |
*/ | |
@Override | |
public List<Integer[]> generalize(Object o1, Object o2) { | |
List<Integer[]> result = new ArrayList<Integer[]>(); | |
List<ParseTreeNode> sentence = (List<ParseTreeNode> )o1; | |
ParseTreeNode[] template = (ParseTreeNode[]) o2; | |
boolean bBeingMatched = false; | |
for(int wordIndexInSentence=0; wordIndexInSentence<sentence.size(); wordIndexInSentence++){ | |
ParseTreeNode word = sentence.get(wordIndexInSentence); | |
int wordIndexInSentenceEnd = wordIndexInSentence; //init iterators for internal loop | |
int templateIterator=0; | |
while (wordIndexInSentenceEnd<sentence.size() && templateIterator< template.length){ | |
ParseTreeNode tword = template[templateIterator]; | |
ParseTreeNode currWord=sentence.get(wordIndexInSentenceEnd); | |
List<ParseTreeNode> gRes = tword.generalize(tword, currWord); | |
if (gRes.isEmpty()|| gRes.get(0)==null || ( gRes.get(0).getWord().equals("*") | |
&& gRes.get(0).getPos().equals("*") )){ | |
bBeingMatched = false; | |
break; | |
} else { | |
bBeingMatched = true; | |
} | |
wordIndexInSentenceEnd++; | |
templateIterator++; | |
} | |
// template iteration is done | |
// the only condition for successful match is IF we are at the end of template | |
if (templateIterator == template.length){ | |
result.add(new Integer[]{wordIndexInSentence, wordIndexInSentenceEnd-1}); | |
return result; | |
} | |
// no match for current sentence word: proceed to the next | |
} | |
return result; | |
} | |
public String markerToString(List<Pair<String, Integer[]>> res){ | |
StringBuffer buf = new StringBuffer(); | |
buf.append("["); | |
for(Pair<String, Integer[]> marker: res){ | |
buf.append(marker.getFirst()+":"); | |
for(int a: marker.getSecond()){ | |
buf.append(a+" "); | |
} | |
buf.append (" | "); | |
} | |
buf.append("]"); | |
return buf.toString(); | |
} | |
public static void main(String[] args){ | |
ParseTreeNode[] sent = | |
new ParseTreeNode[]{new ParseTreeNode("he","prn"), new ParseTreeNode("was","vbz"), new ParseTreeNode("more","jj"), | |
new ParseTreeNode(",",","), new ParseTreeNode("than",","), new ParseTreeNode("little","jj"), new ParseTreeNode("boy","nn"), | |
new ParseTreeNode(",",","), new ParseTreeNode("however","*"), new ParseTreeNode(",",","), | |
new ParseTreeNode("he","prp"), new ParseTreeNode("was","vbz"), new ParseTreeNode("adult","jj") | |
}; | |
List<Pair<String, Integer[]>> res = new RhetoricStructureMarker().extractRSTrelationInSentenceGetBoundarySpan(Arrays.asList(sent)); | |
System.out.println( new RhetoricStructureMarker().markerToString(res)); | |
} | |
} |