| <?xml version="1.0" encoding="UTF-8"?> |
| <!-- |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, |
| * software distributed under the License is distributed on an |
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| * KIND, either express or implied. See the License for the |
| * specific language governing permissions and limitations |
| * under the License. |
| --> |
| <!-- Root element. All concept definitions below live in the default namespace declared here. --> |
| <!-- xsi:schemaLocation is a list of "namespace-URI location" pairs, so the namespace precedes concept.xsd. --> |
| <conceptSet xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" |
|             xmlns="http://incubator.apache.org/uima/regex" |
|             xsi:schemaLocation="http://incubator.apache.org/uima/regex concept.xsd"> |
| |
| <concept name="emailAddressDetection"> |
|   <!-- Finds e-mail addresses. Group 1 captures the local part (before the @), group 2 the domain. --> |
|   <!-- The domain must end in a dot followed by at least two letters, so dotless words such as "meet@noon" are not reported. --> |
|   <!-- \x2D and \x26 are regex hex escapes for the hyphen and the ampersand; the latter keeps a raw ampersand out of the XML attribute. --> |
|   <rules> |
|     <rule |
|       regEx="([a-zA-Z0-9!#$%*+'/=?^_\x2D`{|}~.\x26]+)@([a-zA-Z0-9._-]+\.[a-zA-Z]{2,})" |
|       matchStrategy="matchAll" |
|       matchType="uima.tcas.DocumentAnnotation" /> |
|   </rules> |
|   <createAnnotations> |
|     <annotation id="emailAnnot" type="org.apache.uima.EmailAddress"> |
|       <!-- Group 0 is the whole match, so the annotation spans the complete address. --> |
|       <begin group="0" /> |
|       <end group="0" /> |
|       <!-- Feature values are written inline: indentation inside a text element would become part of the value. --> |
|       <setFeature name="localPart" type="String" normalization="ToLowerCase">$1</setFeature> |
|       <setFeature name="domainPart" type="String" normalization="ToLowerCase">$2</setFeature> |
|       <setFeature name="normalizedEmail" type="String" normalization="ToLowerCase">$0</setFeature> |
|     </annotation> |
|   </createAnnotations> |
| </concept> |
| |
| <concept name="isbnNumberDetection"> |
|   <!-- Candidate ISBN strings: an optional 978 or 979 prefix, then either nine digits or an --> |
|   <!-- eleven-character run of digits and hyphens, then a final digit or the letter X. --> |
|   <rules> |
|     <rule |
|       matchType="uima.tcas.DocumentAnnotation" |
|       matchStrategy="matchAll" |
|       confidence="1.0" |
|       regEx="(97(8|9))?-?(\d{9}|(\d|-){11})-?(\d|X)" /> |
|   </rules> |
|   <createAnnotations> |
|     <!-- The validate attribute names an ISBNNumberValidator class; presumably it vets each --> |
|     <!-- candidate before an annotation is created (confirm against the annotator documentation). --> |
|     <annotation |
|       id="isbnNumber" |
|       type="org.apache.uima.ISBNNumber" |
|       validate="org.apache.uima.annotator.regex.extension.impl.ISBNNumberValidator"> |
|       <!-- Group 0 is the whole match, so the annotation covers the full candidate string. --> |
|       <begin group="0" /> |
|       <end group="0" /> |
|       <!-- Feature type Confidence: presumably filled from the confidence of the matching rule. --> |
|       <setFeature name="confidence" type="Confidence" /> |
|     </annotation> |
|   </createAnnotations> |
| </concept> |
| |
| <concept name="creditCardNumberDetection" processAllRules="true"> |
|   <!-- Four rules propose credit card number candidates. With processAllRules="true" every rule --> |
|   <!-- is presumably evaluated rather than stopping at the first match (confirm in the annotator docs). --> |
|   <rules> |
|     <!-- 15 digits grouped 4-6-5, starting with 34 or 37; a hyphen or space between groups is optional. --> |
|     <rule |
|       ruleId="AmericanExpress" |
|       matchType="uima.tcas.DocumentAnnotation" |
|       matchStrategy="matchAll" |
|       confidence="1.0" |
|       regEx="(((34|37)\d{2}[- ]?)(\d{6}[- ]?)\d{5})" /> |
|     <!-- 16 digits grouped 4-4-4-4, starting with 4; separators optional. --> |
|     <rule |
|       ruleId="Visa" |
|       matchType="uima.tcas.DocumentAnnotation" |
|       matchStrategy="matchAll" |
|       confidence="1.0" |
|       regEx="((4\d{3}[- ]?)(\d{4}[- ]?){2}\d{4})" /> |
|     <!-- 16 digits grouped 4-4-4-4, starting with 51 through 55; separators optional. --> |
|     <rule |
|       ruleId="MasterCard" |
|       matchType="uima.tcas.DocumentAnnotation" |
|       matchStrategy="matchAll" |
|       confidence="1.0" |
|       regEx="((5[1-5]\d{2}[- ]?)(\d{4}[- ]?){2}\d{4})" /> |
|     <!-- Fallback for numbers whose first digit is 1 to 6. Three alternatives: 4-4-4-4 with mandatory --> |
|     <!-- separators, 14 to 19 contiguous digits, or 4-6-5 with mandatory separators. --> |
|     <rule |
|       ruleId="unknown" |
|       matchType="uima.tcas.DocumentAnnotation" |
|       matchStrategy="matchAll" |
|       confidence="1.0" |
|       regEx="(([1-6]\d{3}[- ])(\d{4}[- ]){2}\d{4})|([1-6]\d{13,18})|([1-6]\d{3}[- ]\d{6}[- ]\d{5})" /> |
|   </rules> |
|   <createAnnotations> |
|     <!-- The validate attribute names a CreditCardNumberValidator class; presumably it vets each --> |
|     <!-- candidate before an annotation is created (confirm against the annotator documentation). --> |
|     <annotation |
|       id="creditCardNumber" |
|       type="org.apache.uima.CreditCardNumber" |
|       validate="org.apache.uima.annotator.regex.extension.impl.CreditCardNumberValidator"> |
|       <!-- Group 0 is the whole match, so the annotation covers the full candidate number. --> |
|       <begin group="0" /> |
|       <end group="0" /> |
|       <!-- Confidence and RuleId feature types: presumably filled from the confidence and ruleId --> |
|       <!-- attributes of the rule that produced the match. --> |
|       <setFeature name="confidence" type="Confidence" /> |
|       <setFeature name="cardType" type="RuleId" /> |
|     </annotation> |
|   </createAnnotations> |
| </concept> |
| |
| <concept name="MoneyAmountDetection" processAllRules="true"> |
|   <!-- Regex notation used by the two rules below: --> |
|   <!--   \p{Sc}    any currency symbol --> |
|   <!--   (?i)      case-insensitive matching --> |
|   <!--   \s        a whitespace character --> |
|   <!--   \m{name}  labels the capturing group that follows, referenced below as ${name} --> |
|   <rules> |
|     <!-- Currency first, then the amount, then an optional "million" or "billion" (for example a symbol followed by 1,200.50). --> |
|     <rule |
|       matchType="uima.tcas.DocumentAnnotation" |
|       matchStrategy="matchAll" |
|       regEx="\m{currency}(\p{Sc}\s?|(?i)USD\s?|(?i)Dollars\s?|(?i)Dollar\s?|(?i)CNY\s?|(?i)CAD\s?|(?i)GBP\s?|(?i)Pounds\s?|(?i)Pound\s?|(?i)Euros\s?|(?i)Euro\s?|(?i)Yen\s?|(?i)EUR\s?)\m{amount}(\d+(,\d\d\d)*(\.\d\d?)?)\m{amountText}(\s?(?i)million|\s?(?i)billion)?" /> |
|     <!-- Amount first, then the optional magnitude word, then the currency (for example 5 million USD). --> |
|     <!-- NOTE(review): this rule accepts up to three decimal digits while the rule above accepts two; confirm that is intended. --> |
|     <rule |
|       matchType="uima.tcas.DocumentAnnotation" |
|       matchStrategy="matchAll" |
|       regEx="\m{amount}(\d+(,\d\d\d)*(\.\d\d?\d?)?)\m{amountText}(\s?(?i)million|\s?(?i)billion)?\m{currency}(\s?\p{Sc}|\s?(?i)USD\b|\s?(?i)Dollars\b|\s?(?i)Dollar\b|\s?(?i)CNY\b|\s?(?i)CAD\b|\s?(?i)GBP\b|\s?(?i)Pounds\b|\s?(?i)Pound\b|\s?(?i)Euros\b|\s?(?i)Euro\b|\s?(?i)Yen\b|\s?(?i)EUR\b)" /> |
|   </rules> |
|   <createAnnotations> |
|     <annotation type="org.apache.uima.MoneyAmount"> |
|       <!-- Group 0 is the whole match, so the annotation spans currency, amount and magnitude word together. --> |
|       <begin group="0" /> |
|       <end group="0" /> |
|       <!-- Trim removes the optional whitespace that the currency and amountText groups can capture. --> |
|       <setFeature name="currency" type="String" normalization="Trim">${currency}</setFeature> |
|       <!-- NOTE(review): the amount group may contain thousands separators such as 1,000; confirm the Float conversion accepts them. --> |
|       <setFeature name="amount" type="Float">${amount}</setFeature> |
|       <setFeature name="amountText" type="String" normalization="Trim">${amountText}</setFeature> |
|     </annotation> |
|   </createAnnotations> |
| </concept> |
| </conceptSet> |