enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/model/AnalysedTextUtils.java - stanbol - Git at Google

 /*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 package org.apache.stanbol.enhancer.nlp.model;

 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.Collections;
 import java.util.EnumSet;
 import java.util.Iterator;
 import java.util.List;
 import java.util.Map;
 import java.util.Set;
 import java.util.SortedMap;
 import java.util.SortedSet;
 import java.util.TreeSet;
 import java.util.Map.Entry;

 import org.apache.clerezza.rdf.core.UriRef;
 import org.apache.stanbol.enhancer.nlp.model.Span.SpanTypeEnum;
 import org.apache.stanbol.enhancer.nlp.model.impl.SectionImpl;
 import org.apache.stanbol.enhancer.nlp.model.impl.SpanImpl;
 import org.apache.stanbol.enhancer.servicesapi.Blob;
 import org.apache.stanbol.enhancer.servicesapi.ContentItem;
 import org.apache.stanbol.enhancer.servicesapi.EngineException;
 import org.apache.stanbol.enhancer.servicesapi.EnhancementEngine;
 import org.apache.stanbol.enhancer.servicesapi.NoSuchPartException;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import org.slf4j.helpers.SubstituteLoggerFactory;

 import com.ibm.icu.lang.UCharacter.SentenceBreak;

 public class AnalysedTextUtils {

     /**
      * Logger of this utility class. Within the visible part of this file it
      * is only referenced from commented-out code.
      */
     private static final Logger log = LoggerFactory.getLogger(AnalysedTextUtils.class);

     /**
      * Looks up the {@link AnalysedText} content part of the parsed
      * {@link ContentItem}.<p>
      * The part is requested under {@link AnalysedText#ANALYSED_TEXT_URI}; an
      * AnalysedText registered under any other URI is not found by this method.
      * The lookup is performed while holding the read lock of the ContentItem.
      * @param ci the {@link ContentItem}
      * @return the {@link AnalysedText} or <code>null</code> if the ContentItem
      * has no part registered for {@link AnalysedText#ANALYSED_TEXT_URI}
      * @throws ClassCastException if a content part is registered with
      * {@link AnalysedText#ANALYSED_TEXT_URI} but its type is not compatible
      * to {@link AnalysedText}.
      */
     public static AnalysedText getAnalysedText(ContentItem ci){
         AnalysedText analysedText;
         ci.getLock().readLock().lock();
         try {
             analysedText = ci.getPart(AnalysedText.ANALYSED_TEXT_URI, AnalysedText.class);
         } catch (NoSuchPartException noSuchPart) {
             //no AnalysedText is registered with this ContentItem
             analysedText = null;
         } finally {
             //always release the read lock, also if getPart(..) fails
             ci.getLock().readLock().unlock();
         }
         return analysedText;
     }

     /**
      * Copies the remaining elements of the parsed Iterator to a List.
      * @param it the Iterator (may be <code>null</code>)
      * @return a new {@link ArrayList} holding the Spans in iteration order, or
      * the unmodifiable {@link Collections#emptyList()} if the parsed Iterator
      * is <code>null</code> or does not provide any element
      */
     public static <T extends Span> List<T> asList(Iterator<T> it){
         if(it != null && it.hasNext()){
             List<T> copied = new ArrayList<T>();
             while(it.hasNext()){
                 copied.add(it.next());
             }
             return copied;
         }
         //null or exhausted Iterator
         return Collections.emptyList();
     }
     /**
      * Appends every remaining element of the parsed Iterator to the end of the
      * parsed List. A <code>null</code> Iterator is treated like an empty one.
      * @param it the Iterator (may be <code>null</code>)
      * @param list the List the elements are appended to
      * @throws NullPointerException if the parsed List is <code>null</code>
      * and the Iterator provides at least one element
      */
     public static <T extends Span> void appandToList(Iterator<T> it, List<? super T> list){
         if(it == null){
             return; //nothing to append
         }
         while(it.hasNext()){
             T span = it.next();
             list.add(span);
         }
     }

     /**
      * Copies the remaining elements of the parsed Iterator to a new
      * {@link SortedSet}. The set is a {@link TreeSet} created without an
      * explicit comparator, so the Spans are sorted by their natural ordering.
      * A <code>null</code> Iterator results in an empty set.
      * @param it the Iterator (may be <code>null</code>)
      * @return a new, modifiable {@link SortedSet} containing all Spans
      * provided by the Iterator
      */
     public static <T extends Span> SortedSet<T> asSet(Iterator<T> it){
         final SortedSet<T> sorted = new TreeSet<T>();
         if(it != null){
             while(it.hasNext()){
                 sorted.add(it.next());
             }
         }
         return sorted;
     }
     /**
      * Adds every remaining Span of the parsed Iterator to the parsed Set. A
      * <code>null</code> Iterator is treated like an empty one.
      * @param it the Iterator (may be <code>null</code>)
      * @param set the Set the Spans are added to
      * @throws NullPointerException if the parsed Set is <code>null</code>
      * and the Iterator provides at least one element
      */
     public static <T extends Span> void addToSet(Iterator<T> it,Set<? super T> set){
         if(it == null){
             return; //nothing to add
         }
         while(it.hasNext()){
             T span = it.next();
             set.add(span);
         }
     }
     /**
      * Iterates over two levels of the Span hierarchy (e.g. all Tokens of a
      * Sentence that are within a Chunk). The returned Iterator is a live
      * view on the {@link AnalysedText} (being the context of the enclosing
      * Span).<p>
      * Usage Example
      * <code><pre>
      *     Sentence sentence; //The currently processed Sentence
      *     Iterator&lt;Span&gt; tokens = AnalysedTextUtils.getSpansInSpans(
      *         sentence,
      *         {@link SpanTypeEnum#Chunk SpanTypeEnum.Chunk},
      *         {@link SpanTypeEnum#Token SpanTypeEnum.Token});
      *     while(tokens.hasNext()){
      *         Token token = (Token)tokens.next();
      *         // process only tokens within a chunk
      *     }
      * </pre></code>
      * The returned Iterator follows the {@link Iterator} contract:
      * <code>next()</code> throws a {@link java.util.NoSuchElementException}
      * if no further Span is available and <code>remove()</code> is delegated
      * to the level2 Iterator that provided the Span last returned by
      * <code>next()</code>.
      * @param section the Section that is searched for Spans of the
      * <code>level1</code> type
      * @param level1 the {@link SpanTypeEnum} for the first Level. MUST be
      * a Type that is a {@link Section} (e.g. Chunk or Sentence).
      * @param level2 the {@link SpanTypeEnum} of the Spans that are requested
      * from every <code>level1</code> Section and returned by the Iterator
      * @return the Iterator over all <code>level2</code> Spans enclosed by
      * <code>level1</code> Spans that are enclosed by the parsed Section
      * @throws IllegalArgumentException if {@link SpanTypeEnum#Token} is parsed
      * as <code>level1</code> span type.
      */
     public static Iterator<Span> getSpansInSpans(Section section, SpanTypeEnum level1, final SpanTypeEnum level2){
         if(level1 == SpanTypeEnum.Token){
             throw new IllegalArgumentException("The SpanType for level1 MUST refer to a Section "
                 + "(Chunk, Sentence, TextSection or Text)");
         }
         final Iterator<Span> level1It = section.getEnclosed(EnumSet.of(level1));
         return new Iterator<Span>(){
             /** level2 Iterator the next Span is taken from */
             private Iterator<Span> level2It = null;
             /** level2 Iterator that provided the Span last returned by next() */
             private Iterator<Span> lastReturnedIt = null;

             @Override
             public boolean hasNext() {
                 //advance over level1 Sections until one provides a level2 Span
                 while(level2It == null || !level2It.hasNext()){
                     if(!level1It.hasNext()){
                         return false;
                     }
                     level2It = ((Section)level1It.next()).getEnclosed(EnumSet.of(level2));
                 }
                 return true;
             }

             @Override
             public Span next() {
                 //hasNext() positions level2It; this also covers multiple calls
                 //to next() without calls to hasNext() in-between
                 if(!hasNext()){
                     throw new java.util.NoSuchElementException();
                 }
                 lastReturnedIt = level2It;
                 return level2It.next();
             }

             @Override
             public void remove() {
                 if(lastReturnedIt == null){
                     throw new IllegalStateException("next() has not been called yet");
                 }
                 //hasNext() may already have moved level2It to the next level1
                 //Section. Therefore remove via the Iterator that returned the
                 //last Span.
                 //NOTE(review): whether removing via an earlier level2 Iterator
                 //affects the current one depends on the Section
                 //implementation - confirm.
                 lastReturnedIt.remove();
             }
         };
     }
 // NOTE: No longer used; kept for now in case this functionality is needed again.
 //    public static Set<Span> getEnclosed(SortedSet<Span> sortedSet, Span span){
 //        if(span.getType() == SpanTypeEnum.Token){
 //            log.warn("Span {} with SpanType {} parsed to getEnclosing(..). Returned Set will "
 //                    + "contain the parsed span!");
 //        }
 //        return sortedSet.subSet(new SubSetHelperSpan(span.getStart(), span.getEnd()),
 //            new SubSetHelperSpan(span.getEnd()));
 //    }
 //    public static <T> Map<Span,T> getEnclosed(SortedMap<Span,T> sortedSet, Span span){
 //        if(span.getType() == SpanTypeEnum.Token){
 //            log.warn("Span {} with SpanType {} parsed to getEnclosing(..). Returned Set will "
 //                    + "contain the parsed span!");
 //        }
 //        return sortedSet.subMap(new SubSetHelperSpan(span.getStart(), span.getEnd()),
 //            new SubSetHelperSpan(span.getEnd()));
 //    }
 //
 //    /**
 //     * Internal helper class used for building {@link SortedSet#subSet(Object, Object)}.
 //     *
 //     * @author Rupert Westenthaler
 //     *
 //     */
 //    private static class SubSetHelperSpan extends SpanImpl implements Span {
 //        /**
 //         * Create the start constraint for {@link SortedSet#subSet(Object, Object)}
 //         * @param start
 //         * @param end
 //         */
 //        protected SubSetHelperSpan(int start,int end){
 //            super(SpanTypeEnum.Text, //lowest pos type
 //                start,end);
 //        }
 //        /**
 //         * Creates the end constraint for {@link SortedSet#subSet(Object, Object)}
 //         * @param pos
 //         */
 //        protected SubSetHelperSpan(int pos){
 //            super(SpanTypeEnum.Token, //highest pos type,
 //                pos,Integer.MAX_VALUE);
 //        }
 //    }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package org.apache.stanbol.enhancer.nlp.model;

	import java.io.IOException;
	import java.util.ArrayList;
	import java.util.Collections;
	import java.util.EnumSet;
	import java.util.Iterator;
	import java.util.List;
	import java.util.Map;
	import java.util.Set;
	import java.util.SortedMap;
	import java.util.SortedSet;
	import java.util.TreeSet;
	import java.util.Map.Entry;

	import org.apache.clerezza.rdf.core.UriRef;
	import org.apache.stanbol.enhancer.nlp.model.Span.SpanTypeEnum;
	import org.apache.stanbol.enhancer.nlp.model.impl.SectionImpl;
	import org.apache.stanbol.enhancer.nlp.model.impl.SpanImpl;
	import org.apache.stanbol.enhancer.servicesapi.Blob;
	import org.apache.stanbol.enhancer.servicesapi.ContentItem;
	import org.apache.stanbol.enhancer.servicesapi.EngineException;
	import org.apache.stanbol.enhancer.servicesapi.EnhancementEngine;
	import org.apache.stanbol.enhancer.servicesapi.NoSuchPartException;
	import org.slf4j.Logger;
	import org.slf4j.LoggerFactory;
	import org.slf4j.helpers.SubstituteLoggerFactory;

	import com.ibm.icu.lang.UCharacter.SentenceBreak;

	public class AnalysedTextUtils {

	/**
	 * Logger of this utility class. Within the visible part of this file it
	 * is only referenced from commented-out code.
	 */
	private static final Logger log = LoggerFactory.getLogger(AnalysedTextUtils.class);

	/**
	 * Looks up the {@link AnalysedText} content part of the parsed
	 * {@link ContentItem}.<p>
	 * The part is requested under {@link AnalysedText#ANALYSED_TEXT_URI}; an
	 * AnalysedText registered under any other URI is not found by this method.
	 * The lookup is performed while holding the read lock of the ContentItem.
	 * @param ci the {@link ContentItem}
	 * @return the {@link AnalysedText} or <code>null</code> if the ContentItem
	 * has no part registered for {@link AnalysedText#ANALYSED_TEXT_URI}
	 * @throws ClassCastException if a content part is registered with
	 * {@link AnalysedText#ANALYSED_TEXT_URI} but its type is not compatible
	 * to {@link AnalysedText}.
	 */
	public static AnalysedText getAnalysedText(ContentItem ci){
	    AnalysedText analysedText;
	    ci.getLock().readLock().lock();
	    try {
	        analysedText = ci.getPart(AnalysedText.ANALYSED_TEXT_URI, AnalysedText.class);
	    } catch (NoSuchPartException noSuchPart) {
	        //no AnalysedText is registered with this ContentItem
	        analysedText = null;
	    } finally {
	        //always release the read lock, also if getPart(..) fails
	        ci.getLock().readLock().unlock();
	    }
	    return analysedText;
	}

	/**
	* Copies the elements of the parsed iterator to a list.
	* @param iterator the iterator
	* @return the List with all spans of the Iterators
	*/
	public static <T extends Span> List<T> asList(Iterator<T> it){
	if(it == null \|\| !it.hasNext()){
	return Collections.emptyList();
	} else {
	List<T> spans = new ArrayList<T>();
	appandToList(it, spans);
	return spans;
	}
	}
	/**
	 * Appends every remaining element of the parsed Iterator to the end of the
	 * parsed List. A <code>null</code> Iterator is treated like an empty one.
	 * @param it the Iterator (may be <code>null</code>)
	 * @param list the List the elements are appended to
	 * @throws NullPointerException if the parsed List is <code>null</code>
	 * and the Iterator provides at least one element
	 */
	public static <T extends Span> void appandToList(Iterator<T> it, List<? super T> list){
	    if(it == null){
	        return; //nothing to append
	    }
	    while(it.hasNext()){
	        T span = it.next();
	        list.add(span);
	    }
	}

	/**
	 * Copies the remaining elements of the parsed Iterator to a new
	 * {@link SortedSet}. The set is a {@link TreeSet} created without an
	 * explicit comparator, so the Spans are sorted by their natural ordering.
	 * A <code>null</code> Iterator results in an empty set.
	 * @param it the Iterator (may be <code>null</code>)
	 * @return a new, modifiable {@link SortedSet} containing all Spans
	 * provided by the Iterator
	 */
	public static <T extends Span> SortedSet<T> asSet(Iterator<T> it){
	    final SortedSet<T> sorted = new TreeSet<T>();
	    if(it != null){
	        while(it.hasNext()){
	            sorted.add(it.next());
	        }
	    }
	    return sorted;
	}
	/**
	 * Adds every remaining Span of the parsed Iterator to the parsed Set. A
	 * <code>null</code> Iterator is treated like an empty one.
	 * @param it the Iterator (may be <code>null</code>)
	 * @param set the Set the Spans are added to
	 * @throws NullPointerException if the parsed Set is <code>null</code>
	 * and the Iterator provides at least one element
	 */
	public static <T extends Span> void addToSet(Iterator<T> it,Set<? super T> set){
	    if(it == null){
	        return; //nothing to add
	    }
	    while(it.hasNext()){
	        T span = it.next();
	        set.add(span);
	    }
	}
	/**
	 * Iterates over two levels of the Span hierarchy (e.g. all Tokens of a
	 * Sentence that are within a Chunk). The returned Iterator is a live
	 * view on the {@link AnalysedText} (being the context of the enclosing
	 * Span).<p>
	 * Usage Example
	 * <code><pre>
	 *     Sentence sentence; //The currently processed Sentence
	 *     Iterator&lt;Span&gt; tokens = AnalysedTextUtils.getSpansInSpans(
	 *         sentence,
	 *         {@link SpanTypeEnum#Chunk SpanTypeEnum.Chunk},
	 *         {@link SpanTypeEnum#Token SpanTypeEnum.Token});
	 *     while(tokens.hasNext()){
	 *         Token token = (Token)tokens.next();
	 *         // process only tokens within a chunk
	 *     }
	 * </pre></code>
	 * The returned Iterator follows the {@link Iterator} contract:
	 * <code>next()</code> throws a {@link java.util.NoSuchElementException}
	 * if no further Span is available and <code>remove()</code> is delegated
	 * to the level2 Iterator that provided the Span last returned by
	 * <code>next()</code>.
	 * @param section the Section that is searched for Spans of the
	 * <code>level1</code> type
	 * @param level1 the {@link SpanTypeEnum} for the first Level. MUST be
	 * a Type that is a {@link Section} (e.g. Chunk or Sentence).
	 * @param level2 the {@link SpanTypeEnum} of the Spans that are requested
	 * from every <code>level1</code> Section and returned by the Iterator
	 * @return the Iterator over all <code>level2</code> Spans enclosed by
	 * <code>level1</code> Spans that are enclosed by the parsed Section
	 * @throws IllegalArgumentException if {@link SpanTypeEnum#Token} is parsed
	 * as <code>level1</code> span type.
	 */
	public static Iterator<Span> getSpansInSpans(Section section, SpanTypeEnum level1, final SpanTypeEnum level2){
	    if(level1 == SpanTypeEnum.Token){
	        throw new IllegalArgumentException("The SpanType for level1 MUST refer to a Section "
	            + "(Chunk, Sentence, TextSection or Text)");
	    }
	    final Iterator<Span> level1It = section.getEnclosed(EnumSet.of(level1));
	    return new Iterator<Span>(){
	        /** level2 Iterator the next Span is taken from */
	        private Iterator<Span> level2It = null;
	        /** level2 Iterator that provided the Span last returned by next() */
	        private Iterator<Span> lastReturnedIt = null;

	        @Override
	        public boolean hasNext() {
	            //advance over level1 Sections until one provides a level2 Span
	            while(level2It == null || !level2It.hasNext()){
	                if(!level1It.hasNext()){
	                    return false;
	                }
	                level2It = ((Section)level1It.next()).getEnclosed(EnumSet.of(level2));
	            }
	            return true;
	        }

	        @Override
	        public Span next() {
	            //hasNext() positions level2It; this also covers multiple calls
	            //to next() without calls to hasNext() in-between
	            if(!hasNext()){
	                throw new java.util.NoSuchElementException();
	            }
	            lastReturnedIt = level2It;
	            return level2It.next();
	        }

	        @Override
	        public void remove() {
	            if(lastReturnedIt == null){
	                throw new IllegalStateException("next() has not been called yet");
	            }
	            //hasNext() may already have moved level2It to the next level1
	            //Section. Therefore remove via the Iterator that returned the
	            //last Span.
	            //NOTE(review): whether removing via an earlier level2 Iterator
	            //affects the current one depends on the Section
	            //implementation - confirm.
	            lastReturnedIt.remove();
	        }
	    };
	}
	// NOTE: No longer used; kept for now in case this functionality is needed again.
	// public static Set<Span> getEnclosed(SortedSet<Span> sortedSet, Span span){
	// if(span.getType() == SpanTypeEnum.Token){
	// log.warn("Span {} with SpanType {} parsed to getEnclosing(..). Returned Set will "
	// + "contain the parsed span!");
	// }
	// return sortedSet.subSet(new SubSetHelperSpan(span.getStart(), span.getEnd()),
	// new SubSetHelperSpan(span.getEnd()));
	// }
	// public static <T> Map<Span,T> getEnclosed(SortedMap<Span,T> sortedSet, Span span){
	// if(span.getType() == SpanTypeEnum.Token){
	// log.warn("Span {} with SpanType {} parsed to getEnclosing(..). Returned Set will "
	// + "contain the parsed span!");
	// }
	// return sortedSet.subMap(new SubSetHelperSpan(span.getStart(), span.getEnd()),
	// new SubSetHelperSpan(span.getEnd()));
	// }
	//
	// /**
	// * Internal helper class used for building {@link SortedSet#subSet(Object, Object)}.
	// *
	// * @author Rupert Westenthaler
	// *
	// */
	// private static class SubSetHelperSpan extends SpanImpl implements Span {
	// /**
	// * Create the start constraint for {@link SortedSet#subSet(Object, Object)}
	// * @param start
	// * @param end
	// */
	// protected SubSetHelperSpan(int start,int end){
	// super(SpanTypeEnum.Text, //lowest pos type
	// start,end);
	// }
	// /**
	// * Creates the end constraint for {@link SortedSet#subSet(Object, Object)}
	// * @param pos
	// */
	// protected SubSetHelperSpan(int pos){
	// super(SpanTypeEnum.Token, //highest pos type,
	// pos,Integer.MAX_VALUE);
	// }
	// }
	}