blob: e3bf1caf1139f01b415d18a9651f6cc963774747 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.any23.extractor.xpath;
import org.apache.any23.extractor.ExtractionResult;
import org.apache.any23.extractor.html.DomUtils;
import org.eclipse.rdf4j.model.IRI;
import org.w3c.dom.Document;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Objects;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;
/**
 * Default implementation of {@link TemplateXPathExtractionRule}.
 *
 * <p>A rule is made of a name, an optional regular expression used to decide
 * which IRIs the rule applies to, a list of declared {@link Variable}s and a
 * list of {@link QuadTemplate}s. Processing a document evaluates every
 * variable against the document and then hands the resulting name/value map
 * to every template.
 *
 * <p>This class performs no synchronization on its internal lists.
 *
 * @author Michele Mostarda (mostarda@fbk.eu)
 */
public class TemplateXPathExtractionRuleImpl implements TemplateXPathExtractionRule {

    private final String name;

    private final String uriRegex;

    /** Compiled form of {@link #uriRegex}; {@code null} when no regex was given. */
    private final Pattern uriRegexPattern;

    private final List<Variable> variables;

    private final List<QuadTemplate> templates;

    /**
     * Creates a rule with no variables and no templates.
     *
     * @param name the rule name, must not be {@code null}.
     * @param uriRegex regular expression selecting the IRIs this rule accepts,
     *        or {@code null} to accept every IRI.
     * @throws NullPointerException if {@code name} is {@code null}.
     * @throws IllegalArgumentException if {@code uriRegex} is not a valid
     *         regular expression.
     */
    public TemplateXPathExtractionRuleImpl(String name, String uriRegex) {
        this.name = Objects.requireNonNull(name, "The rule name cannot be null.");
        this.uriRegex = uriRegex;
        this.uriRegexPattern = compileUriRegex(uriRegex);
        this.variables = new ArrayList<>();
        this.templates = new ArrayList<>();
    }

    /**
     * Compiles the IRI filter expression.
     *
     * @param uriRegex the expression to compile, may be {@code null}.
     * @return the compiled pattern, or {@code null} if {@code uriRegex} is {@code null}.
     * @throws IllegalArgumentException if the expression has a syntax error.
     */
    private static Pattern compileUriRegex(String uriRegex) {
        if (uriRegex == null) {
            return null;
        }
        try {
            return Pattern.compile(uriRegex);
        } catch (PatternSyntaxException e) {
            // Only syntax errors are translated; anything else is not a "bad regex" problem.
            throw new IllegalArgumentException("Invalid value for uriRegex.", e);
        }
    }

    /**
     * @return the regex pattern filtering the template pages, or {@code null}
     *         if none was specified at construction time.
     */
    public String getUriRegex() {
        return uriRegex;
    }

    /**
     * Declares a new variable for this rule.
     *
     * @param variable the variable to declare.
     * @throws IllegalArgumentException if a variable with the same name is
     *         already declared.
     */
    public void add(Variable variable) {
        checkVariableNameNotDeclared(variable.getName());
        variables.add(variable);
    }

    /**
     * Removes a previously declared variable. Templates already added to this
     * rule are not re-validated.
     *
     * @param variable the variable to remove.
     * @return {@code true} if the variable was present and has been removed.
     */
    public boolean remove(Variable variable) {
        return variables.remove(variable);
    }

    /**
     * Adds a quad template to this rule.
     *
     * @param template the template to add.
     * @throws IllegalArgumentException if the template refers to a variable
     *         that has not been declared on this rule.
     */
    public void add(QuadTemplate template) {
        checkTemplateVariablesDeclared(template);
        templates.add(template);
    }

    /**
     * Removes a previously added template.
     *
     * @param template the template to remove.
     * @return {@code true} if the template was present and has been removed.
     */
    public boolean remove(QuadTemplate template) {
        return templates.remove(template);
    }

    /**
     * @return the name of this rule, never {@code null}.
     */
    public String getName() {
        return name;
    }

    /**
     * Tells whether this rule applies to the given IRI.
     *
     * @param uri the IRI to test.
     * @return {@code true} if no regex was configured, or if the configured
     *         regex matches any part of the IRI string.
     */
    public boolean acceptIRI(IRI uri) {
        if (uriRegexPattern == null) {
            return true;
        }
        // find() rather than matches(): a partial match is enough to accept.
        return uriRegexPattern.matcher(uri.stringValue()).find();
    }

    /**
     * Evaluates every declared variable against the document and passes the
     * resulting values to every template.
     *
     * @param in the document the variables' XPath expressions are evaluated on.
     * @param er the extraction result the templates write to.
     */
    public void process(Document in, ExtractionResult er) {
        final Map<String, String> varValues = new HashMap<>();
        for (Variable variable : variables) {
            // NOTE(review): the whole XPath expression is upper-cased before
            // evaluation, which also affects attribute names and string
            // literals inside it; presumably the DOM exposes upper-case
            // element names - confirm against DomUtils / the HTML parser.
            final String xPath = variable.getxPath().toUpperCase(Locale.ROOT);
            final String value = DomUtils.find(in, xPath);
            varValues.put(variable.getName(), value);
        }
        for (QuadTemplate template : templates) {
            template.printOut(er, varValues);
        }
    }

    /**
     * @param varName the variable name to look for.
     * @return {@code true} if a variable with that name is declared on this rule.
     */
    private boolean variableNameDeclared(String varName) {
        for (Variable variable : variables) {
            if (variable.getName().equals(varName)) {
                return true;
            }
        }
        return false;
    }

    /**
     * @param varName the variable name that must be declared.
     * @throws IllegalArgumentException if no variable with that name is declared.
     */
    private void checkVariableNameDeclared(String varName) {
        if (!variableNameDeclared(varName)) {
            throw new IllegalArgumentException(
                    String.format(Locale.ROOT, "A variable with name '%s' was not declared.", varName)
            );
        }
    }

    /**
     * @param varName the variable name that must not be declared yet.
     * @throws IllegalArgumentException if a variable with that name is already declared.
     */
    private void checkVariableNameNotDeclared(String varName) {
        if (variableNameDeclared(varName)) {
            throw new IllegalArgumentException(
                    String.format(Locale.ROOT, "A variable with name '%s' is already declared.", varName)
            );
        }
    }

    /**
     * Verifies that every variable term of the template (subject, predicate,
     * object and, when present, graph) refers to a declared variable.
     *
     * @param template the template to validate.
     * @throws IllegalArgumentException if a referenced variable is not declared.
     */
    private void checkTemplateVariablesDeclared(QuadTemplate template) {
        if (template.getSubject().isVar()) {
            checkVariableNameDeclared(template.getSubject().getInternalValue());
        }
        if (template.getPredicate().isVar()) {
            checkVariableNameDeclared(template.getPredicate().getInternalValue());
        }
        if (template.getObject().isVar()) {
            checkVariableNameDeclared(template.getObject().getInternalValue());
        }
        // The graph term is optional, unlike subject/predicate/object.
        if (template.getGraph() != null && template.getGraph().isVar()) {
            checkVariableNameDeclared(template.getGraph().getInternalValue());
        }
    }

    /**
     * @return a multi-line description listing the rule name, the IRI pattern,
     *         the declared variables and the templates.
     */
    @Override
    public String toString() {
        final StringBuilder sb = new StringBuilder();
        sb.append('\n');
        sb.append("name: ").append(name).append('\n');
        sb.append("pattern: '").append(uriRegex).append("'").append('\n');
        sb.append("variables {\n");
        for (Variable variable : variables) {
            sb.append(variable.getName()).append(":").append(variable.getxPath()).append('\n');
        }
        sb.append("}\n");
        sb.append("templates {\n");
        for (QuadTemplate template : templates) {
            sb.append(template).append('\n');
        }
        sb.append("}\n");
        return sb.toString();
    }
}