// blob: e0dbbd6b3a5899d088483f1be040b073797608f0 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.uima.ruta.rule;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.TreeMap;
import java.util.regex.MatchResult;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.uima.cas.CAS;
import org.apache.uima.cas.CASException;
import org.apache.uima.cas.Feature;
import org.apache.uima.cas.Type;
import org.apache.uima.cas.TypeSystem;
import org.apache.uima.cas.text.AnnotationFS;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.cas.FSArray;
import org.apache.uima.ruta.RutaBlock;
import org.apache.uima.ruta.RutaEnvironment;
import org.apache.uima.ruta.RutaStream;
import org.apache.uima.ruta.ScriptApply;
import org.apache.uima.ruta.UIMAConstants;
import org.apache.uima.ruta.expression.IRutaExpression;
import org.apache.uima.ruta.expression.bool.IBooleanExpression;
import org.apache.uima.ruta.expression.number.INumberExpression;
import org.apache.uima.ruta.expression.string.AbstractStringExpression;
import org.apache.uima.ruta.expression.string.IStringExpression;
import org.apache.uima.ruta.expression.type.ITypeExpression;
import org.apache.uima.ruta.utils.UIMAUtils;
import org.apache.uima.ruta.visitor.InferenceCrowd;
public class RegExpRule extends AbstractRule {
// Type expressions mapped to the number expression that yields their capturing-group index;
// a null number expression is evaluated as group 0 when the rule is applied.
private Map<ITypeExpression, INumberExpression> typeMap;
// Expression that is evaluated to the regular expression string each time the rule is applied.
private IStringExpression regexpExpr;
// Per type expression: feature-name expression -> expression providing the feature value.
// Not set by the constructor; assigned through setFeatureAssignments.
private Map<ITypeExpression, Map<IStringExpression, IRutaExpression>> featureAssignments;
/**
 * Creates a regular-expression rule.
 *
 * @param regexp
 *          expression that yields the regular expression to match
 * @param typeMap
 *          type expressions mapped to the expression of their capturing-group index
 * @param id
 *          identifier handed to the super class
 * @param parent
 *          block that contains this rule
 */
public RegExpRule(AbstractStringExpression regexp,
    Map<ITypeExpression, INumberExpression> typeMap, int id, RutaBlock parent) {
  super(parent, id);
  // feature assignments are intentionally left unset here; see setFeatureAssignments
  this.typeMap = typeMap;
  this.regexpExpr = regexp;
}
/**
 * Applies the regular expression to the covered text of the stream's document annotation.
 * <p>
 * The pattern is compiled with {@code MULTILINE} and {@code DOTALL}. For every match, each
 * capturing group that has types assigned is turned into annotations via
 * {@code createAnnotations}. If group 0 has no types assigned, a plain annotation of the CAS
 * annotation type is created for the complete match and only registered in the rule match.
 * Regex offsets are relative to the covered text and are shifted by the begin of the document
 * annotation. A rule match is added to the result only if
 * {@code getMatchedAnnotationsOfRoot()} returns a non-empty list.
 *
 * @param stream
 *          the stream providing the document annotation and the CAS
 * @param crowd
 *          visitor that is notified before and after the rule is applied
 * @return the rule apply collecting all rule matches
 */
@Override
public ScriptApply apply(RutaStream stream, InferenceCrowd crowd) {
  RuleApply result = new RuleApply(this, false);
  crowd.beginVisit(this, result);

  MatchContext context = new MatchContext(getParent());
  String regex = regexpExpr.getStringValue(context, stream);
  AnnotationFS docAnnotation = stream.getDocumentAnnotation();
  String text = docAnnotation.getCoveredText();
  int offset = docAnnotation.getBegin();
  Map<Integer, List<Type>> typesByGroup = getGroup2Types(context, stream);
  Map<Integer, Map<Type, Map<String, Object>>> featuresByGroup = getFeatureAssignmentMap(stream);

  Matcher matcher = Pattern.compile(regex, Pattern.MULTILINE | Pattern.DOTALL).matcher(text);
  int lastGroup = matcher.groupCount();
  while (matcher.find()) {
    RegExpRuleMatch ruleMatch = new RegExpRuleMatch(this);
    MatchResult snapshot = matcher.toMatchResult();
    for (int group = 0; group <= lastGroup; group++) {
      List<Type> typesOfGroup = typesByGroup.get(group);
      if (typesOfGroup != null) {
        createAnnotations(group, offset, snapshot.start(group), snapshot.end(group),
            typesOfGroup, featuresByGroup, snapshot, ruleMatch, stream);
        continue;
      }
      if (group != 0) {
        // groups without assigned types are ignored, except for the complete match
        continue;
      }
      CAS cas = stream.getCas();
      AnnotationFS completeMatch = cas.createAnnotation(cas.getAnnotationType(),
          offset + snapshot.start(group), offset + snapshot.end(group));
      ruleMatch.addMatched(0, completeMatch);
    }
    List<AnnotationFS> rootMatches = ruleMatch.getMatchedAnnotationsOfRoot();
    boolean hasRootMatch = rootMatches != null && !rootMatches.isEmpty();
    if (hasRootMatch) {
      result.add(ruleMatch);
    }
  }

  crowd.endVisit(this, result);
  return result;
}
/**
 * Resolves the configured feature assignments into a lookup structure: capturing-group index
 * -> annotation type -> feature name -> value expression.
 * <p>
 * The capturing group of a type is taken from the type map of this rule; a type without a
 * number expression is assigned to group 0. Type, group and feature-name expressions are
 * evaluated with a fresh match context on the parent block.
 *
 * @param stream
 *          the stream used to evaluate the expressions
 * @return the resolved assignments; empty (never null) if no feature assignments are configured
 */
private Map<Integer, Map<Type, Map<String, Object>>> getFeatureAssignmentMap(RutaStream stream) {
  Map<Integer, Map<Type, Map<String, Object>>> result = new HashMap<Integer, Map<Type, Map<String, Object>>>();
  if (featureAssignments == null) {
    // The constructor does not initialize the feature assignments, so a rule that never
    // received setFeatureAssignments(...) simply has nothing to assign.
    return result;
  }
  MatchContext context = new MatchContext(getParent());
  for (Entry<ITypeExpression, Map<IStringExpression, IRutaExpression>> entry : featureAssignments
      .entrySet()) {
    ITypeExpression typeExpr = entry.getKey();
    Type type = typeExpr.getType(context, stream);
    INumberExpression groupExpr = typeMap.get(typeExpr);
    int group = groupExpr == null ? 0 : groupExpr.getIntegerValue(context, stream);

    Map<Type, Map<String, Object>> assignmentsByType = result.get(group);
    if (assignmentsByType == null) {
      assignmentsByType = new HashMap<Type, Map<String, Object>>();
      result.put(group, assignmentsByType);
    }
    // named differently from the field "typeMap" to avoid shadowing it
    Map<String, Object> assignmentsByFeature = assignmentsByType.get(type);
    if (assignmentsByFeature == null) {
      assignmentsByFeature = new HashMap<String, Object>();
      assignmentsByType.put(type, assignmentsByFeature);
    }

    Map<IStringExpression, IRutaExpression> declared = entry.getValue();
    if (declared == null) {
      continue;
    }
    for (Entry<IStringExpression, IRutaExpression> assignment : declared.entrySet()) {
      String featureName = assignment.getKey().getStringValue(context, stream);
      assignmentsByFeature.put(featureName, assignment.getValue());
    }
  }
  return result;
}
/**
 * Evaluates the type map of this rule and groups the resolved types by capturing-group index.
 * A type whose number expression is null is assigned to group 0.
 *
 * @param context
 *          the match context used to evaluate the expressions
 * @param stream
 *          the stream used to evaluate the expressions
 * @return the resolved types of each capturing group, sorted by group index
 */
private Map<Integer, List<Type>> getGroup2Types(MatchContext context, RutaStream stream) {
  Map<Integer, List<Type>> typesByGroup = new TreeMap<Integer, List<Type>>();
  for (Entry<ITypeExpression, INumberExpression> mapping : typeMap.entrySet()) {
    Type resolvedType = mapping.getKey().getType(context, stream);
    INumberExpression groupExpr = mapping.getValue();
    int groupIndex = 0;
    if (groupExpr != null) {
      groupIndex = groupExpr.getIntegerValue(context, stream);
    }
    List<Type> bucket = typesByGroup.get(groupIndex);
    if (bucket == null) {
      bucket = new ArrayList<Type>();
      typesByGroup.put(groupIndex, bucket);
    }
    bucket.add(resolvedType);
  }
  return typesByGroup;
}
/**
 * Creates one annotation per given type for the span of a capturing group, fills its features,
 * registers it in the rule match and adds it to the stream. Nothing is created for an empty
 * span ({@code begin >= end}), which also covers groups that did not take part in the match.
 *
 * @param group
 *          index of the capturing group
 * @param delta
 *          offset added to the regex offsets
 * @param begin
 *          begin of the group relative to the matched text
 * @param end
 *          end of the group relative to the matched text
 * @param globalTypes
 *          types of the annotations to create
 * @param fa
 *          resolved feature assignments (group -> type -> feature name -> expression)
 * @param matchResult
 *          the current regex match
 * @param match
 *          the rule match collecting the created annotations
 * @param stream
 *          the stream providing the CAS and receiving the annotations
 */
private void createAnnotations(int group, int delta, int begin, int end, List<Type> globalTypes,
    Map<Integer, Map<Type, Map<String, Object>>> fa, MatchResult matchResult,
    RegExpRuleMatch match, RutaStream stream) {
  CAS cas = stream.getCas();
  if (begin >= end) {
    return;
  }
  int absoluteBegin = delta + begin;
  int absoluteEnd = delta + end;
  for (Type eachType : globalTypes) {
    AnnotationFS created = cas.createAnnotation(eachType, absoluteBegin, absoluteEnd);
    fillFeatures(group, created, fa, delta, matchResult, stream);
    match.addMatched(group, created);
    stream.addAnnotation(created, true, true, match);
  }
}
/**
 * Sets the features of a newly created annotation according to the feature assignments that
 * are configured for its capturing group and type.
 * <p>
 * Supported assignments:
 * <ul>
 * <li>number expression: interpreted as a capturing-group index. A string feature receives the
 * text of that group; a feature with an annotation range receives a new annotation of the
 * range type spanning that group (only if the span is not empty). Primitive non-string ranges
 * and FSArray ranges are not supported and left untouched.</li>
 * <li>type expression: a string feature receives the covered text of the first annotation of
 * that type within the new annotation; an FSArray feature receives all of them; any other
 * compatible range receives the first one.</li>
 * <li>string expression on a string feature and boolean expression on a boolean feature: the
 * evaluated value.</li>
 * </ul>
 * Assignments to features that the annotation type does not define are ignored.
 *
 * @param group
 *          index of the capturing group the annotation was created for
 * @param afs
 *          the annotation whose features are set
 * @param fa
 *          resolved feature assignments (group -> type -> feature name -> expression)
 * @param delta
 *          offset added to the regex offsets
 * @param matchResult
 *          the current regex match
 * @param stream
 *          the stream providing the CAS and the annotations within a window
 * @throws IllegalStateException
 *           if an assignment requires the JCas (FSArray range check) but it could not be
 *           obtained from the CAS
 */
private void fillFeatures(int group, AnnotationFS afs,
    Map<Integer, Map<Type, Map<String, Object>>> fa, int delta, MatchResult matchResult,
    RutaStream stream) {
  Map<Type, Map<String, Object>> assignmentsOfGroup = fa.get(group);
  if (assignmentsOfGroup == null) {
    return;
  }
  Type type = afs.getType();
  Map<String, Object> assignments = assignmentsOfGroup.get(type);
  if (assignments == null) {
    return;
  }

  CAS cas = stream.getCas();
  TypeSystem typeSystem = cas.getTypeSystem();
  JCas jcas = null;
  CASException jcasFailure = null;
  try {
    jcas = cas.getJCas();
  } catch (CASException e) {
    // String and boolean assignments work without the JCas, so the failure is only reported
    // (with its cause) once an assignment really needs it.
    jcasFailure = e;
  }
  MatchContext context = new MatchContext(null, null, null, true);

  for (Entry<String, Object> eachEntry : assignments.entrySet()) {
    Feature feature = type.getFeatureByBaseName(eachEntry.getKey());
    if (feature == null) {
      continue;
    }
    Object argExpr = eachEntry.getValue();
    Type range = feature.getRange();
    String rangeName = range.getName();
    boolean stringRange = rangeName.equals(UIMAConstants.TYPE_STRING);

    if (argExpr instanceof INumberExpression) {
      // numbers are reserved for capturing groups
      int cg = ((INumberExpression) argExpr).getIntegerValue(context, stream);
      if (stringRange) {
        afs.setStringValue(feature, matchResult.group(cg));
      } else if (isPrimitiveNonStringRange(rangeName)) {
        // a capturing group cannot be assigned to a numeric or boolean feature: nothing to do
      } else if (typeSystem.subsumes(requireJCas(jcas, jcasFailure).getCasType(FSArray.type),
          range)) {
        // TODO add functionality for fsarrays
      } else {
        int begin = delta + matchResult.start(cg);
        int end = delta + matchResult.end(cg);
        // also skips groups that did not take part in the match (start == end == -1)
        if (begin < end) {
          AnnotationFS a = cas.createAnnotation(range, begin, end);
          afs.setFeatureValue(feature, a);
        }
      }
    } else if (argExpr instanceof ITypeExpression && stringRange) {
      Type windowType = ((ITypeExpression) argExpr).getType(context, stream);
      List<AnnotationFS> annotationsInWindow = stream.getAnnotationsInWindow(afs, windowType);
      if (annotationsInWindow != null && !annotationsInWindow.isEmpty()) {
        afs.setStringValue(feature, annotationsInWindow.get(0).getCoveredText());
      }
    } else if (argExpr instanceof AbstractStringExpression && stringRange) {
      afs.setStringValue(feature,
          ((AbstractStringExpression) argExpr).getStringValue(context, stream));
    } else if (argExpr instanceof IBooleanExpression
        && rangeName.equals(UIMAConstants.TYPE_BOOLEAN)) {
      afs.setBooleanValue(feature,
          ((IBooleanExpression) argExpr).getBooleanValue(context, stream));
    } else if (argExpr instanceof ITypeExpression) {
      Type windowType = ((ITypeExpression) argExpr).getType(context, stream);
      List<AnnotationFS> annotationsInWindow = stream.getAnnotationsInWindow(afs, windowType);
      if (typeSystem.subsumes(requireJCas(jcas, jcasFailure).getCasType(FSArray.type), range)) {
        afs.setFeatureValue(feature, UIMAUtils.toFSArray(jcas, annotationsInWindow));
      } else if (typeSystem.subsumes(range, windowType) && annotationsInWindow != null
          && !annotationsInWindow.isEmpty()) {
        afs.setFeatureValue(feature, annotationsInWindow.get(0));
      }
    }
  }
}

/**
 * Tells whether the given range type name denotes a primitive range other than string
 * (boolean, byte, double, float, integer, long or short).
 */
private static boolean isPrimitiveNonStringRange(String rangeName) {
  return rangeName.equals(UIMAConstants.TYPE_BOOLEAN)
      || rangeName.equals(UIMAConstants.TYPE_BYTE)
      || rangeName.equals(UIMAConstants.TYPE_DOUBLE)
      || rangeName.equals(UIMAConstants.TYPE_FLOAT)
      || rangeName.equals(UIMAConstants.TYPE_INTEGER)
      || rangeName.equals(UIMAConstants.TYPE_LONG)
      || rangeName.equals(UIMAConstants.TYPE_SHORT);
}

/**
 * Returns the given JCas or fails with the original cause if it could not be obtained.
 */
private static JCas requireJCas(JCas jcas, CASException failure) {
  if (jcas == null) {
    throw new IllegalStateException(
        "JCas is not available, but required for assigning feature values", failure);
  }
  return jcas;
}
/**
 * Returns the environment of this rule by delegating to the parent of this rule.
 *
 * @return the result of {@code getParent().getEnvironment()}
 */
@Override
public RutaEnvironment getEnvironment() {
return getParent().getEnvironment();
}
/**
 * Returns the type expressions of this rule mapped to the number expressions that yield their
 * capturing-group index.
 *
 * @return the type map as stored in this rule (no copy)
 */
public Map<ITypeExpression, INumberExpression> getTypeMap() {
  return this.typeMap;
}
/**
 * Replaces the mapping from type expressions to the number expressions that yield their
 * capturing-group index. The given map is stored as is (no copy).
 *
 * @param typeMap
 *          the new type map
 */
public void setTypeMap(Map<ITypeExpression, INumberExpression> typeMap) {
this.typeMap = typeMap;
}
/**
 * Returns the expression that is evaluated to the regular expression of this rule.
 *
 * @return the regular expression as string expression
 */
public IStringExpression getRegExp() {
  return this.regexpExpr;
}
/**
 * Replaces the expression that is evaluated to the regular expression of this rule.
 *
 * @param regexp
 *          the new regular expression as string expression
 */
public void setRegExp(IStringExpression regexp) {
this.regexpExpr = regexp;
}
/**
 * Sets the feature assignments of this rule: for each type expression, the feature-name
 * expressions mapped to the expressions providing the feature values. The given map is stored
 * as is (no copy).
 *
 * @param fa
 *          the new feature assignments
 */
public void setFeatureAssignments(Map<ITypeExpression, Map<IStringExpression, IRutaExpression>> fa) {
this.featureAssignments = fa;
}
/**
 * Returns the feature assignments of this rule: for each type expression, the feature-name
 * expressions mapped to the expressions providing the feature values.
 *
 * @return the feature assignments as stored in this rule (no copy); null if they were never
 *         set
 */
public Map<ITypeExpression, Map<IStringExpression, IRutaExpression>> getFeatureAssignments() {
  return this.featureAssignments;
}
}