<?xml version="1.0" encoding="UTF-8"?>
<!--
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
-->
<conceptSet xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
            xmlns="http://incubator.apache.org/uima/regex"
            xsi:schemaLocation="http://incubator.apache.org/uima/regex concept.xsd">

  <!--
    Concept set with four detectors: e-mail addresses, ISBN numbers,
    credit card numbers and money amounts. Every rule is matched against
    uima.tcas.DocumentAnnotation with matchStrategy="matchAll".
  -->

  <!-- E-mail addresses: group 1 is the local part, group 2 the domain part. -->
  <concept name="emailAddressDetection">
    <rules>
      <!-- \x2D and \x26 are the hex escapes for '-' and '&amp;' inside the character class. -->
      <rule regEx="([a-zA-Z0-9!#$%*+'/=?^_\x2D`{|}~.\x26]+)@([a-zA-Z0-9._-]+[a-zA-Z]{2,4})"
            matchStrategy="matchAll"
            matchType="uima.tcas.DocumentAnnotation" />
    </rules>
    <createAnnotations>
      <annotation id="emailAnnot" type="org.apache.uima.EmailAddress">
        <begin group="0" />
        <end group="0" />
        <!--
          The group placeholders are kept on the same line as their tags so
          that no indentation whitespace becomes part of the feature value.
        -->
        <setFeature name="localPart" type="String" normalization="ToLowerCase">$1</setFeature>
        <setFeature name="domainPart" type="String" normalization="ToLowerCase">$2</setFeature>
        <setFeature name="normalizedEmail" type="String" normalization="ToLowerCase">$0</setFeature>
      </annotation>
    </createAnnotations>
  </concept>

  <!--
    ISBN numbers: optional 978/979 prefix, then either nine digits or eleven
    digit-or-hyphen characters, then a final digit or 'X'.
    The annotation names ISBNNumberValidator in its validate attribute.
  -->
  <concept name="isbnNumberDetection">
    <rules>
      <rule regEx="(97(8|9))?-?(\d{9}|(\d|-){11})-?(\d|X)"
            matchStrategy="matchAll"
            matchType="uima.tcas.DocumentAnnotation"
            confidence="1.0" />
    </rules>
    <createAnnotations>
      <annotation id="isbnNumber"
                  type="org.apache.uima.ISBNNumber"
                  validate="org.apache.uima.annotator.regex.extension.impl.ISBNNumberValidator">
        <begin group="0" />
        <end group="0" />
        <setFeature name="confidence" type="Confidence" />
      </annotation>
    </createAnnotations>
  </concept>

  <!--
    Credit card numbers: one rule per card brand plus a generic "unknown"
    rule. processAllRules="true" is set on the concept, and the annotation
    names CreditCardNumberValidator in its validate attribute.
  -->
  <concept name="creditCardNumberDetection" processAllRules="true">
    <rules>
      <!-- Starts with 34 or 37; digits grouped 4-6-5 with optional '-' or ' ' separators. -->
      <rule ruleId="AmericanExpress"
            regEx="(((34|37)\d{2}[- ]?)(\d{6}[- ]?)\d{5})"
            matchStrategy="matchAll"
            matchType="uima.tcas.DocumentAnnotation"
            confidence="1.0" />
      <!-- Starts with 4; digits grouped 4-4-4-4 with optional separators. -->
      <rule ruleId="Visa"
            regEx="((4\d{3}[- ]?)(\d{4}[- ]?){2}\d{4})"
            matchStrategy="matchAll"
            matchType="uima.tcas.DocumentAnnotation"
            confidence="1.0" />
      <!-- Starts with 51 to 55; digits grouped 4-4-4-4 with optional separators. -->
      <rule ruleId="MasterCard"
            regEx="((5[1-5]\d{2}[- ]?)(\d{4}[- ]?){2}\d{4})"
            matchStrategy="matchAll"
            matchType="uima.tcas.DocumentAnnotation"
            confidence="1.0" />
      <!--
        Any number starting with 1 to 6: separated 4-4-4-4, an unseparated
        run of 14 to 19 digits, or separated 4-6-5.
      -->
      <rule ruleId="unknown"
            regEx="(([1-6]\d{3}[- ])(\d{4}[- ]){2}\d{4})|([1-6]\d{13,18})|([1-6]\d{3}[- ]\d{6}[- ]\d{5})"
            matchStrategy="matchAll"
            matchType="uima.tcas.DocumentAnnotation"
            confidence="1.0" />
    </rules>
    <createAnnotations>
      <annotation id="creditCardNumber"
                  type="org.apache.uima.CreditCardNumber"
                  validate="org.apache.uima.annotator.regex.extension.impl.CreditCardNumberValidator">
        <begin group="0" />
        <end group="0" />
        <setFeature name="confidence" type="Confidence" />
        <!-- Presumably receives the ruleId of the rule that matched; confirm against the annotator documentation. -->
        <setFeature name="cardType" type="RuleId" />
      </annotation>
    </createAnnotations>
  </concept>

  <!--
    Money amounts: the first rule expects the currency before the amount,
    the second rule expects it after the amount.
  -->
  <concept name="MoneyAmountDetection" processAllRules="true">
    <!-- \p{Sc} : currency symbol -->
    <!-- (?i)   : case-insensitive match -->
    <!-- \s     : whitespace character -->
    <!-- \m{name} precedes a capturing group; the features below refer to it as ${name}. -->
    <!-- NOTE(review): the first rule accepts up to two decimal digits, the second up to three; confirm this is intended. -->
    <rules>
      <rule regEx="\m{currency}(\p{Sc}\s?|(?i)USD\s?|(?i)Dollars\s?|(?i)Dollar\s?|(?i)CNY\s?|(?i)CAD\s?|(?i)GBP\s?|(?i)Pounds\s?|(?i)Pound\s?|(?i)Euros\s?|(?i)Euro\s?|(?i)Yen\s?|(?i)EUR\s?)\m{amount}(\d+(,\d\d\d)*(\.\d\d?)?)\m{amountText}(\s?(?i)million|\s?(?i)billion)?"
            matchStrategy="matchAll"
            matchType="uima.tcas.DocumentAnnotation" />
      <rule regEx="\m{amount}(\d+(,\d\d\d)*(\.\d\d?\d?)?)\m{amountText}(\s?(?i)million|\s?(?i)billion)?\m{currency}(\s?\p{Sc}|\s?(?i)USD\b|\s?(?i)Dollars\b|\s?(?i)Dollar\b|\s?(?i)CNY\b|\s?(?i)CAD\b|\s?(?i)GBP\b|\s?(?i)Pounds\b|\s?(?i)Pound\b|\s?(?i)Euros\b|\s?(?i)Euro\b|\s?(?i)Yen\b|\s?(?i)EUR\b)"
            matchStrategy="matchAll"
            matchType="uima.tcas.DocumentAnnotation" />
    </rules>
    <createAnnotations>
      <!-- The id follows the convention of the other annotations in this file. -->
      <annotation id="moneyAmount" type="org.apache.uima.MoneyAmount">
        <begin group="0" />
        <end group="0" />
        <setFeature name="currency" type="String" normalization="Trim">${currency}</setFeature>
        <!-- NOTE(review): the amount group can contain ',' thousands separators; confirm the Float conversion handles them. -->
        <setFeature name="amount" type="Float">${amount}</setFeature>
        <setFeature name="amountText" type="String" normalization="Trim">${amountText}</setFeature>
      </annotation>
    </createAnnotations>
  </concept>
</conceptSet>