blob: 82c1c7fd3d39be68ce6dc2386cf20490fa076808 [file] [log] [blame]
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
PACKAGE uima.ruta.example;
SCRIPT uima.ruta.example.Base;
ENGINE utils.PlainTextAnnotator;
TYPESYSTEM utils.PlainTextTypeSystem;
WORDLIST EditorMarkerList = 'EditorMarker.txt';
WORDLIST EnglishStopWordList = 'EnglishStopWords.txt';
WORDLIST FirstNameList = 'FirstNames.txt';
WORDLIST JournalVolumeMarkerList = 'JournalVolumeMarker.txt';
WORDLIST MonthList = 'Months.txt';
WORDLIST PagesMarkerList = 'PagesMarker.txt';
WORDLIST PublisherList = 'Publishers.txt';
DECLARE EditorMarker, EnglishStopWord, FirstName, JournalVolumeMarker, Month, PagesMarker, PublisherInd;
Document{ -> MARKFAST(EditorMarker, EditorMarkerList)};
Document{ -> MARKFAST(EnglishStopWord,EnglishStopWordList)};
Document{ -> MARKFAST(FirstName, FirstNameList)};
Document{ -> MARKFAST(JournalVolumeMarker, JournalVolumeMarkerList)};
Document{ -> MARKFAST(Month, MonthList)};
Document{ -> MARKFAST(PagesMarker, PagesMarkerList)};
Document{ -> MARKFAST(PublisherInd, PublisherList)};
DECLARE Reference;
Document{-> EXEC(PlainTextAnnotator, {Line, Paragraph})};
Document{-> RETAINTYPE(SPACE, BREAK)};
Line{-REGEXP("CORA:.*") -> MARK(Reference)};
Reference{-> TRIM(SPACE, BREAK)};
Document{-> RETAINTYPE};
DECLARE LParen, RParen;
SPECIAL{REGEXP("[(]") -> MARK(LParen)};
SPECIAL{REGEXP("[)]") -> MARK(RParen)};
DECLARE YearInd;
NUM{REGEXP("19..|20..") -> MARK(YearInd, 1, 2)} SW?{REGEXP("a|b|c|d", true)};
Document{-> RETAINTYPE(SPACE)};
CAP YearInd{-> UNMARK(YearInd)};
Document{-> RETAINTYPE};
DECLARE NameLinker;
W{-PARTOF(NameLinker), REGEXP("and", true) -> MARK(NameLinker)};
COMMA{-PARTOF(NameLinker) -> MARK(NameLinker)};
SEMICOLON{-PARTOF(NameLinker) -> MARK(NameLinker)};
SPECIAL{-PARTOF(NameLinker), REGEXP("&") -> MARK(NameLinker)};
DECLARE FirstNameInd, FirstNameInitial, SingleChar;
CW{-PARTOF(FirstNameInitial), REGEXP(".")} SPECIAL{-PARTOF(FirstNameInitial), REGEXP("-")} CW{REGEXP(".") -> MARK(FirstNameInitial,1,2,3,4)} PERIOD;
SPECIAL{-PARTOF(FirstNameInitial), REGEXP("-")} CW{REGEXP(".") -> MARK(FirstNameInitial,1,2,3)} PERIOD;
CW{-PARTOF(FirstNameInitial), REGEXP(".") -> MARK(FirstNameInitial,1,2)} PERIOD;
CW{-PARTOF(FirstNameInitial), REGEXP(".") -> MARK(FirstNameInitial)} COMMA;
CW{-PARTOF(FirstNameInitial), REGEXP(".") -> MARK(SingleChar)};
DECLARE Quote, QuotedStuff;
SPECIAL[1,2]{REGEXP("[\"'ยด`โ€˜โ€™โ€œ]"), -PARTOF(Quote) -> MARK(Quote)};
Document{-> RETAINTYPE(SPACE)};
W Quote{-> UNMARK(Quote)} W;
Document{-> RETAINTYPE};
BLOCK(InRef) Reference{}{
Quote ANY+{-PARTOF(Quote) -> MARK(QuotedStuff, 1, 2, 3)} Quote;
}
DECLARE InInd;
W{REGEXP("In", true)-> MARK(InInd)};
DECLARE FirstToken, LastToken;
BLOCK(InRef) Reference{}{
ANY{POSITION(Reference,1) -> MARK(FirstToken)};
Document{-> MARKLAST(LastToken)};
}
DECLARE NumPeriod, NumComma, NumColon;
Document{-> RETAINTYPE(SPACE, BREAK)};
NUM PERIOD{-> MARKONCE(NumPeriod)} NUM;
NUM COMMA{-> MARKONCE(NumComma)} NUM;
NUM COLON{-> MARKONCE(NumColon)} NUM;
Document{-> RETAINTYPE};
DECLARE PeriodSep, CommaSep, ColonSep;
PERIOD{-PARTOF(FirstNameInitial), -PARTOF(NumPeriod), -PARTOF(FirstToken) -> MARKONCE (PeriodSep)};
COMMA{-PARTOF(FirstNameInitial), -PARTOF(NumComma), -PARTOF(FirstToken) -> MARKONCE (CommaSep)};
COLON{-PARTOF(FirstNameInitial), -PARTOF(NumColon), -PARTOF(FirstToken) -> MARKONCE (ColonSep)};