blob: 57df5efa865cb2d49e7cf9949bebc073b90fed93 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.uima.tutorial.ex3;
import java.text.BreakIterator;
import java.text.DateFormat;
import java.text.NumberFormat;
import java.text.ParsePosition;
import java.util.Date;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.uima.analysis_component.JCasAnnotator_ImplBase;
import org.apache.uima.analysis_engine.ResultSpecification;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.tcas.Annotation;
import org.apache.uima.tutorial.DateAnnot;
import org.apache.uima.tutorial.DateTimeAnnot;
import org.apache.uima.tutorial.TimeAnnot;
/**
 * Simple Date/Time annotator.
 * <p>
 * Scans the document text with a small set of regular expressions and creates a
 * {@code TimeAnnot} or {@code DateAnnot} for every match. Each match is additionally parsed with
 * the {@link DateFormat} that belongs to its pattern; when parsing succeeds, the parsed value is
 * formatted with the short US date format and stored on the annotation.
 */
public class TutorialDateTime extends JCasAnnotator_ImplBase {

  /** Factory for the concrete annotation type that a pattern should produce. */
  static abstract class Maker {
    /**
     * Creates (but does not index) an annotation covering {@code [start, end)}.
     *
     * @param jcas the CAS the annotation is created in
     * @param start begin offset in the document text
     * @param end end offset in the document text
     * @return the new annotation
     */
    abstract Annotation newAnnotation(JCas jcas, int start, int end);
  }

  /** CAS currently being processed; assigned at the start of {@link #process(JCas)}. */
  JCas jcas;

  /** Document text of {@link #jcas}. */
  String input;

  /** Reusable parse position shared by the number and date parsing calls in this class. */
  ParsePosition pp = new ParsePosition(0);

  // Static vars holding patterns, and function pointers

  // NOTE(review): the DateFormat/NumberFormat Javadoc states these classes are not synchronized.
  // The instances below are static and therefore shared by all annotator instances in the JVM -
  // confirm that process() is never run concurrently, or make them per-instance.

  // Time of day: n:nn or nn:nn, optionally followed by AM or PM
  //   \b                      word boundary
  //   [0-2]?                  optional first hour digit: 0, 1, or 2
  //   \d:[0-5]\d              hour digit, the colon char, and the minutes
  //   (\s*(AM|PM|am|pm)\b)?   optional white space and AM or PM, ending at a word boundary
  // Group 1 is the annotated text. It ends at the last character of the time, so white space
  // or punctuation following the time is not part of the annotation.
  static final Pattern hoursMinutesPattern = Pattern
          .compile("(?s)\\b([0-2]?\\d:[0-5]\\d(\\s*(AM|PM|am|pm)\\b)?)");

  static final DateFormat dfTimeShort = DateFormat.getTimeInstance(DateFormat.SHORT, Locale.US);

  // Numeric date: m/d, m/d/yy or m/d/yyyy
  //   \b                          word boundary
  //   [0-1]?                      optional first digit of month
  //   \d                          digit of month
  //   /
  //   [0-3]?                      optional first digit of day of month
  //   \d                          digit of day of month
  //   ((/[1-2]\d\d\d)|(/\d\d))?   year is /nnnn or /nn or missing
  //   \b                          date must end at a word boundary (this includes end of text)
  static final Pattern numericDatePattern = Pattern
          .compile("(?s)\\b([0-1]?\\d/[0-3]?\\d((/[1-2]\\d\\d\\d)|(/\\d\\d))?)\\b");

  static final DateFormat dfDateShort = DateFormat.getDateInstance(DateFormat.SHORT, Locale.US);

  // Date with abbreviated month name, e.g. "Mar 4", "Mar. 4, 2003", "Mar 4, 03"
  //   \b                          word boundary
  //   (Jan|Feb|...|Dec)           abbreviated month name
  //   \.?                         optional period
  //   \s                          one white space char
  //   [0-3]?                      optional first digit of day of month
  //   \d                          digit of day of month
  //   (((,\s+)?[1-2]\d\d\d)|((,\s+)?\d\d))?
  //                               year is nnnn or nn, optionally preceded by a comma and
  //                               white space, or missing
  //   \b                          date must end at a word boundary (this includes end of text)
  static final String shortMonthNames = "(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Sept|Oct|Nov|Dec)";

  static final Pattern mediumDatePattern = Pattern.compile("(?s)\\b(" + shortMonthNames
          + "\\.?\\s[0-3]?\\d(((,\\s+)?[1-2]\\d\\d\\d)|((,\\s+)?\\d\\d))?)\\b");

  static final DateFormat dfDateMedium = DateFormat.getDateInstance(DateFormat.MEDIUM, Locale.US);

  // Date with full month name; same shape as the abbreviated form but without the optional
  // period. For long month names, exclude May since it is covered by short month names.
  static final String longMonthNames = "(January|February|March|April|June|July|August|September|October|November|December)";

  static final Pattern longDatePattern = Pattern.compile("(?s)\\b(" + longMonthNames
          + "\\s[0-3]?\\d(((,\\s+)?[1-2]\\d\\d\\d)|((,\\s+)?\\d\\d))?)\\b");

  static final DateFormat dfDateLong = DateFormat.getDateInstance(DateFormat.LONG, Locale.US);

  /** Used to read the leading hour out of a matched time string. */
  static final NumberFormat numberFormat = NumberFormat.getInstance(Locale.US);

  // function pointers for new instances
  static final Maker dateAnnotationMaker = new Maker() {
    @Override
    Annotation newAnnotation(JCas jcas, int start, int end) {
      return new DateAnnot(jcas, start, end);
    }
  };

  static final Maker timeAnnotationMaker = new Maker() {
    @Override
    Annotation newAnnotation(JCas jcas, int start, int end) {
      return new TimeAnnot(jcas, start, end);
    }
  };

  /** Year appended to a matched date before parsing, in case the text carries no year. */
  static final String defaultYear = "2003";

  // PROCESS

  /**
   * Annotates times and dates in the document text of the given CAS.
   * <p>
   * The ResultSpecification controls what gets produced. For example, to only produce
   * DateAnnotations, change the descriptor for this component to specify it outputs only that
   * type.
   *
   * @param aJCas the CAS to annotate
   */
  public void process(JCas aJCas) {
    jcas = aJCas;
    input = jcas.getDocumentText();

    // Create Annotations
    ResultSpecification resultSpec = getResultSpecification();
    if (resultSpec.containsType("org.apache.uima.tutorial.TimeAnnot")) {
      makeAnnotations(timeAnnotationMaker, hoursMinutesPattern, dfTimeShort);
    }
    if (resultSpec.containsType("org.apache.uima.tutorial.DateAnnot")) {
      makeAnnotations(dateAnnotationMaker, numericDatePattern, dfDateShort);
      makeAnnotations(dateAnnotationMaker, mediumDatePattern, dfDateMedium);
      makeAnnotations(dateAnnotationMaker, longDatePattern, dfDateLong);
    }
  }

  // HELPER METHODS

  /**
   * Creates and indexes one annotation for every segment the break iterator finds in
   * {@link #input}, skipping segments that consist only of white space.
   *
   * @param m factory for the annotation type to create
   * @param b break iterator that defines the segments; its text is replaced with {@link #input}
   */
  void makeAnnotations(Maker m, BreakIterator b) {
    b.setText(input);
    // first() must be called before next(): calling it afterwards would rewind the iterator
    // and make the first boundary come back a second time.
    int start = b.first();
    for (int end = b.next(); end != BreakIterator.DONE; start = end, end = b.next()) {
      // eliminate all-whitespace tokens
      if (!isAllWhitespace(start, end)) {
        m.newAnnotation(jcas, start, end).addToIndexes();
      }
    }
  }

  /** Returns true when every char of {@link #input} in {@code [start, end)} is white space. */
  private boolean isAllWhitespace(int start, int end) {
    for (int i = start; i < end; i++) {
      if (!Character.isWhitespace(input.charAt(i))) {
        return false;
      }
    }
    return true;
  }

  /**
   * Creates and indexes one annotation for every match of {@code pattern} in {@link #input}.
   * The annotation covers capturing group 1 of the match. The matched text is normalized with
   * {@link #fixUpDateTimeStrings(String)} and parsed with {@code dateFormat}; if that succeeds,
   * the result, formatted with {@link #dfDateShort}, is stored as the annotation's short date
   * string.
   *
   * @param m factory for the annotation type to create; must produce a {@code DateTimeAnnot}
   * @param pattern pattern whose group 1 delimits the date or time text
   * @param dateFormat format used to parse the normalized match
   */
  void makeAnnotations(Maker m, Pattern pattern, DateFormat dateFormat) {
    Matcher matcher = pattern.matcher(input);
    while (matcher.find()) {
      String normalized = fixUpDateTimeStrings(matcher.group(1));
      DateTimeAnnot dtAnnot = (DateTimeAnnot) m.newAnnotation(jcas, matcher.start(1),
              matcher.end(1));
      pp.setIndex(0);
      Date dtSpec = dateFormat.parse(normalized, pp);
      if (dtSpec != null) {
        dtAnnot.setShortDateString(dfDateShort.format(dtSpec));
      }
      dtAnnot.addToIndexes();
    }
  }

  /**
   * Completes a matched date or time string so that the matching {@link DateFormat} can parse
   * it.
   * <ul>
   * <li>A time (contains a colon) that does not end in AM/PM gets a guessed marker appended:
   * hours 1 through 8 are taken to be PM, everything else AM.</li>
   * <li>A numeric date (contains a slash) without a year gets {@code "/" + defaultYear}
   * appended; one that already has a year is returned unchanged.</li>
   * <li>Any other date gets {@code ", " + defaultYear} appended. If the text already has a
   * year, parsing stops after it and the appended year is ignored.</li>
   * </ul>
   *
   * @param s text of capturing group 1 of one of the date/time patterns
   * @return the string to hand to the date/time parser
   */
  String fixUpDateTimeStrings(String s) {
    if (s.indexOf(':') >= 0) { // have time string
      if (s.endsWith("AM") || s.endsWith("PM") || s.endsWith("am") || s.endsWith("pm")) {
        return s;
      }
      pp.setIndex(0);
      Number parsedHour = numberFormat.parse(s, pp);
      if (parsedHour == null) {
        return s; // no leading hour to base a guess on
      }
      int hour = parsedHour.intValue();
      String av = (hour != 0 && hour < 9) ? " PM" : " AM"; // append value
      return s + av;
    }

    // have date string
    int firstSlash = s.indexOf('/');
    if (firstSlash >= 0) {
      // numeric date: a second slash means the year is already there
      boolean hasYear = s.indexOf('/', firstSlash + 1) >= 0;
      return hasYear ? s : s + "/" + defaultYear;
    }
    return s + ", " + defaultYear; // in case no year available
  }
}