| /* |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, |
| * software distributed under the License is distributed on an |
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| * KIND, either express or implied. See the License for the |
| * specific language governing permissions and limitations |
| * under the License. |
| */ |
| |
| package org.apache.uima.examples.tokenizer; |
| |
| import java.text.BreakIterator; |
| import java.text.ParsePosition; |
| import java.util.Locale; |
| |
| import org.apache.uima.analysis_component.JCasAnnotator_ImplBase; |
| import org.apache.uima.analysis_engine.AnalysisEngineProcessException; |
| import org.apache.uima.jcas.JCas; |
| import org.apache.uima.jcas.tcas.Annotation; |
| |
| /** |
| * An example annotator that annotates Tokens and Sentences. |
| */ |
| public class SimpleTokenAndSentenceAnnotator extends JCasAnnotator_ImplBase { |
| |
| static abstract class Maker { |
| abstract Annotation newAnnotation(JCas jcas, int start, int end); |
| } |
| |
| JCas jcas; |
| |
| String input; |
| |
| ParsePosition pp = new ParsePosition(0); |
| |
| // **************************************** |
| // * Static vars holding break iterators |
| // **************************************** |
| static final BreakIterator sentenceBreak = BreakIterator.getSentenceInstance(Locale.US); |
| |
| static final BreakIterator wordBreak = BreakIterator.getWordInstance(Locale.US); |
| |
| // ********************************************* |
| // * function pointers for new instances * |
| // ********************************************* |
| static final Maker sentenceAnnotationMaker = new Maker() { |
| Annotation newAnnotation(JCas jcas, int start, int end) { |
| return new Sentence(jcas, start, end); |
| } |
| }; |
| |
| static final Maker tokenAnnotationMaker = new Maker() { |
| Annotation newAnnotation(JCas jcas, int start, int end) { |
| return new Token(jcas, start, end); |
| } |
| }; |
| |
| // ************************************************************* |
| // * process * |
| // ************************************************************* |
| public void process(JCas aJCas) throws AnalysisEngineProcessException { |
| jcas = aJCas; |
| input = jcas.getDocumentText(); |
| |
| // Create Annotations |
| makeAnnotations(sentenceAnnotationMaker, sentenceBreak); |
| makeAnnotations(tokenAnnotationMaker, wordBreak); |
| } |
| |
| // ************************************************************* |
| // * Helper Methods * |
| // ************************************************************* |
| void makeAnnotations(Maker m, BreakIterator b) { |
| b.setText(input); |
| for (int end = b.next(), start = b.first(); end != BreakIterator.DONE; start = end, end = b |
| .next()) { |
| // eliminate all-whitespace tokens |
| boolean isWhitespace = true; |
| for (int i = start; i < end; i++) { |
| if (!Character.isWhitespace(input.charAt(i))) { |
| isWhitespace = false; |
| break; |
| } |
| } |
| if (!isWhitespace) { |
| m.newAnnotation(jcas, start, end).addToIndexes(); |
| } |
| } |
| } |
| } |