blob: 387e96b62b91f978ea18c9f335f6a66ba4be46b5 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.stanbol.enhancer.nlp.model;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.EnumSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.NoSuchElementException;
import java.util.Set;
import java.util.SortedMap;
import java.util.SortedSet;
import java.util.TreeSet;

import org.apache.clerezza.rdf.core.UriRef;
import org.apache.stanbol.enhancer.nlp.model.Span.SpanTypeEnum;
import org.apache.stanbol.enhancer.nlp.model.impl.SectionImpl;
import org.apache.stanbol.enhancer.nlp.model.impl.SpanImpl;
import org.apache.stanbol.enhancer.servicesapi.Blob;
import org.apache.stanbol.enhancer.servicesapi.ContentItem;
import org.apache.stanbol.enhancer.servicesapi.EngineException;
import org.apache.stanbol.enhancer.servicesapi.EnhancementEngine;
import org.apache.stanbol.enhancer.servicesapi.NoSuchPartException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.slf4j.helpers.SubstituteLoggerFactory;

import com.ibm.icu.lang.UCharacter.SentenceBreak;
public class AnalysedTextUtils {

    // Within this class the logger is only referenced by the commented-out
    // helpers kept at the end of the file.
    private static final Logger log = LoggerFactory.getLogger(AnalysedTextUtils.class);

    /**
     * Getter for the {@link AnalysedText} content part of the parsed
     * ContentItem.<p>
     * This assumes that the AnalysedText is registered by using
     * {@link AnalysedText#ANALYSED_TEXT_URI}. Otherwise it will not find it.<p>
     * The lookup is performed while holding the read lock of the parsed
     * ContentItem; the lock is released before this method returns.
     * @param ci The {@link ContentItem}
     * @return the {@link AnalysedText} or <code>null</code> if not present.
     * @throws ClassCastException if a content part is registered with
     * {@link AnalysedText#ANALYSED_TEXT_URI} but its type is not compatible
     * to {@link AnalysedText}.
     */
    public static AnalysedText getAnalysedText(ContentItem ci){
        ci.getLock().readLock().lock();
        try {
            return ci.getPart(AnalysedText.ANALYSED_TEXT_URI, AnalysedText.class);
        } catch (NoSuchPartException e) {
            // no AnalysedText registered with the ContentItem
            return null;
        } finally {
            ci.getLock().readLock().unlock();
        }
    }

    /**
     * Copies the elements of the parsed iterator to a list.<p>
     * Note that for a <code>null</code> or exhausted iterator the shared,
     * immutable {@link Collections#emptyList()} is returned, while in all
     * other cases a new, modifiable {@link ArrayList} is returned.
     * @param it the iterator (may be <code>null</code>)
     * @return the List with all spans of the Iterator
     */
    public static <T extends Span> List<T> asList(Iterator<T> it){
        if(it == null || !it.hasNext()){
            return Collections.emptyList();
        } else {
            List<T> spans = new ArrayList<T>();
            appandToList(it, spans);
            return spans;
        }
    }

    /**
     * Appends the elements provided by the parsed Iterator to the list.
     * A <code>null</code> Iterator is ignored.<p>
     * The (misspelled) method name is kept as it is part of the public API.
     * @param it the Iterator (may be <code>null</code>)
     * @param list the List
     * @throws NullPointerException if the parsed List is <code>null</code>
     * and the Iterator provides at least one element
     */
    public static <T extends Span> void appandToList(Iterator<T> it, List<? super T> list){
        if(it != null){
            while(it.hasNext()){
                list.add(it.next());
            }
        }
    }

    /**
     * Copies the elements of the parsed iterator to a {@link SortedSet}. As
     * {@link Span} implements {@link Comparable} the Spans within the resulting
     * set will have the same order as returned by the methods of {@link AnalysedText}
     * @param it the iterator (may be <code>null</code>, resulting in an empty set)
     * @return a new {@link SortedSet} containing all Spans of the iterator
     */
    public static <T extends Span> SortedSet<T> asSet(Iterator<T> it){
        SortedSet<T> spans = new TreeSet<T>();
        addToSet(it, spans);
        return spans;
    }

    /**
     * Adds the Spans of the parsed Iterator to the parsed Set.
     * A <code>null</code> Iterator is ignored.
     * @param it the Iterator (may be <code>null</code>)
     * @param set the set
     * @throws NullPointerException if the parsed Set is <code>null</code>
     * and the Iterator provides at least one element
     */
    public static <T extends Span> void addToSet(Iterator<T> it,Set<? super T> set){
        if(it != null){
            while(it.hasNext()){
                set.add(it.next());
            }
        }
    }

    /**
     * Iterates over two levels of the Span hierarchy (e.g. all Tokens of a
     * Sentence that are within a Chunk). The returned Iterator is a live
     * view on the {@link AnalysedText} (being the context of the enclosing
     * Span).<p>
     * Usage Example
     * <code><pre>
     *     Sentence sentence; //The currently processed Sentence
     *     Iterator&lt;Span&gt; tokens = AnalysedTextUtils.getSpansInSpans(
     *         sentence,
     *         {@link SpanTypeEnum#Chunk SpanTypeEnum.Chunk},
     *         {@link SpanTypeEnum#Token SpanTypeEnum.Token});
     *     while(tokens.hasNext()){
     *         Token token = (Token)tokens.next();
     *         // process only tokens within a chunk
     *     }
     * </pre></code>
     * The returned Iterator follows the {@link Iterator} contract:
     * {@link Iterator#next()} throws a {@link NoSuchElementException} when
     * no further span is available and {@link Iterator#remove()} is applied
     * to the span returned by the last call to <code>next()</code>, even if
     * <code>hasNext()</code> was called in between.
     * @param section the {@link Section} providing the level1 spans
     * @param level1 the {@link SpanTypeEnum} for the first Level. MUST be
     * a Type that is a {@link Section} (e.g. Chunk or Sentence).
     * @param level2 the {@link SpanTypeEnum} of the spans that are returned
     * for every level1 span
     * @return the Iterator over all level2 spans enclosed by the level1
     * spans of the parsed section
     * @throws IllegalArgumentException if {@link SpanTypeEnum#Token} is parsed
     * as <code>level1</code> span type.
     * @throws NullPointerException if the parsed section is <code>null</code>
     */
    public static Iterator<Span> getSpansInSpans(Section section, SpanTypeEnum level1, final SpanTypeEnum level2){
        if(level1 == SpanTypeEnum.Token){
            throw new IllegalArgumentException("The SpanType for level1 MUST refer to a Section "
                + "(Chunk, Sentence, TextSection or Text)");
        }
        final Iterator<Span> level1It = section.getEnclosed(EnumSet.of(level1));
        return new Iterator<Span>(){
            /** Iterator over the level2 spans of the current level1 span. */
            private Iterator<Span> level2It = null;
            /**
             * The Iterator that provided the span returned by the last call
             * to next(). Tracked separately because hasNext() may already
             * have switched level2It to the next level1 span.
             */
            private Iterator<Span> lastReturnedIt = null;

            @Override
            public boolean hasNext() {
                if(level2It != null && level2It.hasNext()) {
                    return true;
                }
                // current level1 span is exhausted: search the next one
                // that encloses at least a single level2 span
                while(level1It.hasNext()){
                    level2It = ((Section)level1It.next()).getEnclosed(EnumSet.of(level2));
                    if(level2It.hasNext()){
                        return true;
                    }
                }
                return false;
            }

            @Override
            public Span next() {
                // hasNext() also positions level2It, so that multiple calls
                // to next() without hasNext() in between work as expected
                if(!hasNext()){
                    throw new NoSuchElementException();
                }
                lastReturnedIt = level2It;
                return level2It.next();
            }

            @Override
            public void remove() {
                if(lastReturnedIt == null){
                    throw new IllegalStateException("next() was not yet called or the "
                        + "last returned Span was already removed");
                }
                // NOTE(review): if the iterators returned by getEnclosed(..)
                // are fail-fast, removing after hasNext() has already moved
                // on to the next level1 span may invalidate that newer
                // iterator - confirm against the Section implementation.
                lastReturnedIt.remove();
                lastReturnedIt = null;
            }
        };
    }

    // NOTE: No longer used ... keep for now in case that we need this functionality.
    // public static Set<Span> getEnclosed(SortedSet<Span> sortedSet, Span span){
    // if(span.getType() == SpanTypeEnum.Token){
    // log.warn("Span {} with SpanType {} parsed to getEnclosing(..). Returned Set will "
    // + "contain the parsed span!");
    // }
    // return sortedSet.subSet(new SubSetHelperSpan(span.getStart(), span.getEnd()),
    // new SubSetHelperSpan(span.getEnd()));
    // }
    // public static <T> Map<Span,T> getEnclosed(SortedMap<Span,T> sortedSet, Span span){
    // if(span.getType() == SpanTypeEnum.Token){
    // log.warn("Span {} with SpanType {} parsed to getEnclosing(..). Returned Set will "
    // + "contain the parsed span!");
    // }
    // return sortedSet.subMap(new SubSetHelperSpan(span.getStart(), span.getEnd()),
    // new SubSetHelperSpan(span.getEnd()));
    // }
    //
    // /**
    // * Internal helper class used for building {@link SortedSet#subSet(Object, Object)}.
    // *
    // * @author Rupert Westenthaler
    // *
    // */
    // private static class SubSetHelperSpan extends SpanImpl implements Span {
    // /**
    // * Create the start constraint for {@link SortedSet#subSet(Object, Object)}
    // * @param start
    // * @param end
    // */
    // protected SubSetHelperSpan(int start,int end){
    // super(SpanTypeEnum.Text, //lowest pos type
    // start,end);
    // }
    // /**
    // * Creates the end constraint for {@link SortedSet#subSet(Object, Object)}
    // * @param pos
    // */
    // protected SubSetHelperSpan(int pos){
    // super(SpanTypeEnum.Token, //highest pos type,
    // pos,Integer.MAX_VALUE);
    // }
    // }
}