<?xml version="1.0" encoding="UTF-8"?> | |
<!-- | |
* Licensed to the Apache Software Foundation (ASF) under one | |
* or more contributor license agreements. See the NOTICE file | |
* distributed with this work for additional information | |
* regarding copyright ownership. The ASF licenses this file | |
* to you under the Apache License, Version 2.0 (the | |
* "License"); you may not use this file except in compliance | |
* with the License. You may obtain a copy of the License at | |
* | |
* http://www.apache.org/licenses/LICENSE-2.0 | |
* | |
* Unless required by applicable law or agreed to in writing, | |
* software distributed under the License is distributed on an | |
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | |
* KIND, either express or implied. See the License for the | |
* specific language governing permissions and limitations | |
* under the License. | |
--> | |
<conceptSet xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" | |
xmlns="http://incubator.apache.org/uima/regex" | |
xsi:schemaLocation="concept.xsd"> | |
<concept name="emailAddressDetection">
  <!--
    Detects e-mail addresses of the form local@domain.

    Capture groups of the rule below:
      group 0 : the whole match (used as the annotation span)
      group 1 : the part before '@'
      group 2 : the part after '@'

    Inside the character class, \x2D is '-' and \x26 is the ampersand;
    presumably they are written as hex escapes so that the hyphen is not read
    as a range operator and the ampersand needs no XML entity.

    NOTE(review): the domain pattern only requires the match to end in two to
    four ASCII letters; it does not require a '.', so "user@localhost" also
    matches, and top-level domains longer than four letters are cut short.
    Confirm this is intended before tightening the expression.
  -->
  <rules>
    <rule matchType="uima.tcas.DocumentAnnotation"
          matchStrategy="matchAll"
          regEx="([a-zA-Z0-9!#$%*+'/=?^_\x2D`{|}~.\x26]+)@([a-zA-Z0-9._-]+[a-zA-Z]{2,4})"/>
  </rules>
  <createAnnotations>
    <annotation id="emailAnnot" type="org.apache.uima.EmailAddress">
      <begin group="0"/>
      <end group="0"/>
      <!--
        Each value is kept on the same line as its tags: whitespace inside a
        text element is data, and ToLowerCase does not remove it, so putting
        the placeholder on its own indented line would add a leading and
        trailing newline plus indentation to the feature value.
      -->
      <setFeature name="localPart" type="String" normalization="ToLowerCase">$1</setFeature>
      <setFeature name="domainPart" type="String" normalization="ToLowerCase">$2</setFeature>
      <setFeature name="normalizedEmail" type="String" normalization="ToLowerCase">$0</setFeature>
    </annotation>
  </createAnnotations>
</concept>
<concept name="isbnNumberDetection">
  <!--
    Detects ISBN candidates. The rule accepts, in this order:
      1. an optional "978" or "979" prefix,
      2. an optional hyphen,
      3. either nine digits, or eleven characters that are each a digit or a
         hyphen,
      4. an optional hyphen,
      5. a final digit or the letter X.

    Every candidate is handed to the validator class named on the annotation;
    presumably that class rejects candidates with a bad check digit
    (TODO confirm against ISBNNumberValidator).
  -->
  <rules>
    <rule confidence="1.0"
          matchType="uima.tcas.DocumentAnnotation"
          matchStrategy="matchAll"
          regEx="(97(8|9))?-?(\d{9}|(\d|-){11})-?(\d|X)"/>
  </rules>
  <createAnnotations>
    <annotation
        id="isbnNumber"
        type="org.apache.uima.ISBNNumber"
        validate="org.apache.uima.annotator.regex.extension.impl.ISBNNumberValidator">
      <!-- The annotation covers the complete match (group 0). -->
      <begin group="0"/>
      <end group="0"/>
      <!-- Presumably filled from the confidence attribute of the matching rule. -->
      <setFeature type="Confidence" name="confidence"/>
    </annotation>
  </createAnnotations>
</concept>
<concept name="creditCardNumberDetection" processAllRules="true">
  <!--
    Detects credit card number candidates. processAllRules="true" is set, so
    presumably every rule below is evaluated even after an earlier rule has
    matched (verify against the RegexAnnotator documentation). The rules are
    kept in their original order because element order is significant.

    Every candidate is handed to the validator class named on the annotation;
    presumably that class performs a checksum test
    (TODO confirm against CreditCardNumberValidator).
  -->
  <rules>
    <!-- Starts with 34 or 37; grouped 4-6-5 with an optional hyphen or space between groups. -->
    <rule ruleId="AmericanExpress"
          confidence="1.0"
          matchType="uima.tcas.DocumentAnnotation"
          matchStrategy="matchAll"
          regEx="(((34|37)\d{2}[- ]?)(\d{6}[- ]?)\d{5})"/>

    <!-- Starts with 4; grouped 4-4-4-4 with an optional hyphen or space between groups. -->
    <rule ruleId="Visa"
          confidence="1.0"
          matchType="uima.tcas.DocumentAnnotation"
          matchStrategy="matchAll"
          regEx="((4\d{3}[- ]?)(\d{4}[- ]?){2}\d{4})"/>

    <!-- Starts with 51 to 55; grouped 4-4-4-4 with an optional hyphen or space between groups. -->
    <rule ruleId="MasterCard"
          confidence="1.0"
          matchType="uima.tcas.DocumentAnnotation"
          matchStrategy="matchAll"
          regEx="((5[1-5]\d{2}[- ]?)(\d{4}[- ]?){2}\d{4})"/>

    <!--
      Catch-all for numbers whose first digit is 1 to 6. Three alternatives:
        a) 4-4-4-4 with a mandatory hyphen or space between groups,
        b) 14 to 19 contiguous digits,
        c) 4-6-5 with a mandatory hyphen or space between groups.
    -->
    <rule ruleId="unknown"
          confidence="1.0"
          matchType="uima.tcas.DocumentAnnotation"
          matchStrategy="matchAll"
          regEx="(([1-6]\d{3}[- ])(\d{4}[- ]){2}\d{4})|([1-6]\d{13,18})|([1-6]\d{3}[- ]\d{6}[- ]\d{5})"/>
  </rules>
  <createAnnotations>
    <annotation
        id="creditCardNumber"
        type="org.apache.uima.CreditCardNumber"
        validate="org.apache.uima.annotator.regex.extension.impl.CreditCardNumberValidator">
      <!-- The annotation covers the complete match (group 0). -->
      <begin group="0"/>
      <end group="0"/>
      <!-- Presumably filled from the confidence attribute of the matching rule. -->
      <setFeature type="Confidence" name="confidence"/>
      <!-- Presumably filled from the ruleId of the matching rule (e.g. "Visa"). -->
      <setFeature type="RuleId" name="cardType"/>
    </annotation>
  </createAnnotations>
</concept>
<concept name="MoneyAmountDetection" processAllRules="true">
  <!--
    Detects money amounts written either as "currency amount" (first rule) or
    as "amount currency" (second rule), each with an optional trailing
    "million" or "billion".

    Regex legend:
      \p{Sc}    : any Unicode currency symbol
      (?i)      : case-insensitive match
      \s        : whitespace character
      \b        : word boundary
      \m{name}  : names the capturing group that follows it; the value is
                  referenced below as ${name}

    NOTE(review): the first rule allows one or two decimal digits while the
    second allows one to three; confirm that the difference is intentional.
    NOTE(review): the amount group may contain thousands separators
    (e.g. "1,000.50") and is stored in a feature of type Float; confirm that
    the annotator converts such values correctly.
  -->
  <rules>
    <!-- Currency first: symbol or currency word, optional whitespace, then the amount. -->
    <rule regEx="\m{currency}(\p{Sc}\s?|(?i)USD\s?|(?i)Dollars\s?|(?i)Dollar\s?|(?i)CNY\s?|(?i)CAD\s?|(?i)GBP\s?|(?i)Pounds\s?|(?i)Pound\s?|(?i)Euros\s?|(?i)Euro\s?|(?i)Yen\s?|(?i)EUR\s?)\m{amount}(\d+(,\d\d\d)*(\.\d\d?)?)\m{amountText}(\s?(?i)million|\s?(?i)billion)?"
          matchStrategy="matchAll"
          matchType="uima.tcas.DocumentAnnotation"/>
    <!-- Amount first: the amount, then optional whitespace and a symbol or currency word. -->
    <rule regEx="\m{amount}(\d+(,\d\d\d)*(\.\d\d?\d?)?)\m{amountText}(\s?(?i)million|\s?(?i)billion)?\m{currency}(\s?\p{Sc}|\s?(?i)USD\b|\s?(?i)Dollars\b|\s?(?i)Dollar\b|\s?(?i)CNY\b|\s?(?i)CAD\b|\s?(?i)GBP\b|\s?(?i)Pounds\b|\s?(?i)Pound\b|\s?(?i)Euros\b|\s?(?i)Euro\b|\s?(?i)Yen\b|\s?(?i)EUR\b)"
          matchStrategy="matchAll"
          matchType="uima.tcas.DocumentAnnotation"/>
  </rules>
  <createAnnotations>
    <!-- The id follows the convention of the other annotations in this file and is unique within it. -->
    <annotation id="moneyAmount" type="org.apache.uima.MoneyAmount">
      <!-- The annotation covers the complete match (group 0). -->
      <begin group="0"/>
      <end group="0"/>
      <!-- Trim removes the optional whitespace captured together with the currency or the multiplier. -->
      <setFeature name="currency" type="String" normalization="Trim">${currency}</setFeature>
      <setFeature name="amount" type="Float">${amount}</setFeature>
      <setFeature name="amountText" type="String" normalization="Trim">${amountText}</setFeature>
    </annotation>
  </createAnnotations>
</concept>
</conceptSet> |