blob: ff0404880b870b2b46ce3e291d17994b2a7c01e7 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.any23.extractor;
import org.apache.any23.extractor.html.MicroformatExtractor;
import org.apache.any23.rdf.Prefixes;
import org.apache.any23.writer.TripleHandler;
import org.apache.any23.writer.TripleHandlerException;
import org.eclipse.rdf4j.model.BNode;
import org.eclipse.rdf4j.model.Resource;
import org.eclipse.rdf4j.model.IRI;
import org.eclipse.rdf4j.model.Value;
import java.io.PrintStream;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Set;
/**
* <p>
* A default implementation of {@link ExtractionResult}; it receives extraction output from one {@link Extractor}
* working on one document, and passes the output on to a {@link TripleHandler}. It deals with details such as creation
* of {@link ExtractionContext} objects and closing any open contexts at the end of extraction.
* </p>
* <p>
* The {@link #close()} method must be invoked after the extractor has finished processing.
* </p>
* <p>
* There is usually no need to provide additional implementations of the {@link ExtractionResult} interface.
* </p>
*
* @see org.apache.any23.writer.TripleHandler
* @see ExtractionContext
*
* @author Richard Cyganiak (richard@cyganiak.de)
* @author Michele Mostarda (michele.mostarda@gmail.com)
*/
public class ExtractionResultImpl implements TagSoupExtractionResult {

    /** Context that every triple and namespace written through this result is reported under. */
    private final ExtractionContext context;

    /** Extractor whose description supplies the prefixes announced on first write. */
    private final Extractor<?> extractor;

    /** Sink receiving all contexts, namespaces and triples. */
    private final TripleHandler tripleHandler;

    /** Sub results opened through {@link #openSubResult(ExtractionContext)}; closed together with this result. */
    private final Collection<ExtractionResultImpl> subResults = new ArrayList<>();

    /** Unique IDs of this result's own context and of every sub result context opened from it. */
    private final Set<String> knownContextIDs = new HashSet<>();

    /** Issue list; the same list instance is shared with every sub result opened from this one. */
    private final List<Issue> issues;

    /** Set once {@link #close()} has been invoked. */
    private boolean isClosed = false;

    /** Set once the extractor prefixes have been sent to the triple handler. */
    private boolean isInitialized = false;

    /** Lazily created; stays {@code null} until the first resource root is added. */
    private List<ResourceRoot> resourceRoots;

    /** Lazily created; stays {@code null} until the first property path is added. */
    private List<PropertyPath> propertyPaths;

    /**
     * Creates a root result with an empty issue list and immediately opens <code>context</code> on the given triple
     * handler.
     *
     * @param context
     *            the context the extraction output belongs to, not <code>null</code>.
     * @param extractor
     *            the extractor producing the output, not <code>null</code>.
     * @param tripleHandler
     *            the handler receiving the output, not <code>null</code>.
     *
     * @throws NullPointerException
     *             if any argument is <code>null</code>.
     * @throws RuntimeException
     *             if the triple handler fails to open the context.
     */
    public ExtractionResultImpl(ExtractionContext context, Extractor<?> extractor, TripleHandler tripleHandler) {
        this(context, extractor, tripleHandler, new ArrayList<>());
    }

    /**
     * Internal constructor that lets a sub result share the issue list of its parent.
     *
     * @param context
     *            the context the extraction output belongs to, not <code>null</code>.
     * @param extractor
     *            the extractor producing the output, not <code>null</code>.
     * @param tripleHandler
     *            the handler receiving the output, not <code>null</code>.
     * @param issues
     *            the list that collects notified issues.
     */
    private ExtractionResultImpl(ExtractionContext context, Extractor<?> extractor, TripleHandler tripleHandler,
            List<Issue> issues) {
        if (context == null) {
            throw new NullPointerException("context cannot be null.");
        }
        if (extractor == null) {
            throw new NullPointerException("extractor cannot be null.");
        }
        if (tripleHandler == null) {
            throw new NullPointerException("triple handler cannot be null.");
        }

        this.extractor = extractor;
        this.tripleHandler = tripleHandler;
        this.context = context;
        this.issues = issues;

        knownContextIDs.add(context.getUniqueID());

        try {
            // openContext() must be called before extraction begins
            // so that BenchmarkTripleHandler can report accurate times.
            // See https://issues.apache.org/jira/browse/ANY23-337
            tripleHandler.openContext(context);
        } catch (TripleHandlerException e) {
            throw new RuntimeException("Error while opening context", e);
        }
    }

    /**
     * @return <code>true</code> if at least one issue has been notified, <code>false</code> otherwise.
     */
    public boolean hasIssues() {
        return !issues.isEmpty();
    }

    /**
     * @return the number of issues notified so far.
     */
    public int getIssuesCount() {
        return issues.size();
    }

    /**
     * Prints the context, the collected issues and, nested inside the same braces, the report of every sub result.
     *
     * @param ps
     *            the stream to print the report to.
     */
    @Override
    public void printReport(PrintStream ps) {
        ps.print(String.format(Locale.ROOT, "Context: %s [errors: %d] {\n", context, getIssuesCount()));
        for (Issue issue : issues) {
            ps.print(issue.toString());
            ps.print("\n");
        }
        // Printing sub results.
        for (ExtractionResultImpl subResult : subResults) {
            subResult.printReport(ps);
        }
        ps.print("}\n");
    }

    /**
     * @return the collected issues: an empty list when there are none, otherwise an unmodifiable view of the internal
     *         list.
     */
    @Override
    public Collection<Issue> getIssues() {
        return issues.isEmpty() ? Collections.<Issue> emptyList() : Collections.unmodifiableList(issues);
    }

    /**
     * Opens a sub result bound to the given context. The sub result uses the same extractor, triple handler and issue
     * list as this result.
     *
     * @param context
     *            the context of the sub result; its unique ID must not have been used on this result before.
     *
     * @return the newly opened sub result.
     *
     * @throws IllegalArgumentException
     *             if the unique ID of <code>context</code> is already known to this result.
     * @throws IllegalStateException
     *             if this result has already been closed.
     */
    @Override
    public ExtractionResult openSubResult(ExtractionContext context) {
        final String contextID = context.getUniqueID();
        if (knownContextIDs.contains(contextID)) {
            throw new IllegalArgumentException("Duplicate contextID: " + contextID);
        }
        checkOpen();
        ExtractionResultImpl result = new ExtractionResultImpl(context, extractor, tripleHandler, this.issues);
        // Register the ID only once the sub result really exists, so that a failed
        // attempt does not make a later retry fail as a duplicate.
        knownContextIDs.add(contextID);
        subResults.add(result);
        return result;
    }

    /**
     * @return the context this result writes into.
     */
    public ExtractionContext getExtractionContext() {
        return context;
    }

    /**
     * Forwards a triple to the triple handler under this result's context. The call is silently ignored when the
     * subject, predicate or object is <code>null</code>.
     *
     * @param s
     *            the subject.
     * @param p
     *            the predicate.
     * @param o
     *            the object.
     * @param g
     *            the graph, may be <code>null</code>.
     *
     * @throws IllegalArgumentException
     *             if subject, predicate or object has a <code>null</code> string value.
     * @throws IllegalStateException
     *             if this result has already been closed.
     * @throws RuntimeException
     *             if the triple handler rejects the triple.
     */
    @Override
    public void writeTriple(Resource s, IRI p, Value o, IRI g) {
        if (s == null || p == null || o == null) {
            return;
        }
        // Check for misconstructed literals or BNodes, Sesame does not catch this.
        if (s.stringValue() == null || p.stringValue() == null || o.stringValue() == null) {
            throw new IllegalArgumentException("The statement arguments must be not null.");
        }
        checkOpen();
        try {
            tripleHandler.receiveTriple(s, p, o, g, context);
        } catch (TripleHandlerException e) {
            throw new RuntimeException(String.format(Locale.ROOT, "Error while receiving triple %s %s %s", s, p, o), e);
        }
    }

    /**
     * @return <code>true</code> once the extractor prefixes have been sent to the triple handler, which happens on the
     *         first successful write or sub result opening.
     */
    boolean wasTouched() {
        return isInitialized;
    }

    /**
     * Forwards a triple without a graph; equivalent to {@link #writeTriple(Resource, IRI, Value, IRI)} with a
     * <code>null</code> graph.
     *
     * @param s
     *            the subject.
     * @param p
     *            the predicate.
     * @param o
     *            the object.
     */
    @Override
    public void writeTriple(Resource s, IRI p, Value o) {
        writeTriple(s, p, o, null);
    }

    /**
     * Forwards a namespace declaration to the triple handler under this result's context.
     *
     * @param prefix
     *            the namespace prefix.
     * @param uri
     *            the namespace URI.
     *
     * @throws IllegalStateException
     *             if this result has already been closed.
     * @throws RuntimeException
     *             if the triple handler rejects the namespace.
     */
    @Override
    public void writeNamespace(String prefix, String uri) {
        checkOpen();
        try {
            tripleHandler.receiveNamespace(prefix, uri, context);
        } catch (TripleHandlerException e) {
            throw new RuntimeException(String.format(Locale.ROOT, "Error while writing namespace %s:%s", prefix, uri),
                    e);
        }
    }

    /**
     * Records an issue. This does not require the result to be open.
     *
     * @param level
     *            the issue level.
     * @param msg
     *            the issue message.
     * @param row
     *            the row the issue refers to.
     * @param col
     *            the column the issue refers to.
     */
    @Override
    public void notifyIssue(IssueLevel level, String msg, long row, long col) {
        issues.add(new Issue(level, msg, row, col));
    }

    /**
     * Closes every sub result and then closes this result's context on the triple handler. Calling this method more
     * than once has no further effect.
     *
     * @throws RuntimeException
     *             if the triple handler fails to close the context.
     */
    @Override
    public void close() {
        if (isClosed) {
            return;
        }
        isClosed = true;
        for (ExtractionResultImpl subResult : subResults) {
            subResult.close();
        }
        try {
            tripleHandler.closeContext(context);
        } catch (TripleHandlerException e) {
            throw new RuntimeException("Error while closing context", e);
        }
    }

    /**
     * Verifies that this result is still open and, on the first successful call, sends the extractor's prefixes to the
     * triple handler.
     *
     * @throws IllegalStateException
     *             if this result has already been closed.
     * @throws RuntimeException
     *             if the triple handler rejects one of the prefixes.
     */
    private void checkOpen() {
        // Fail before touching the handler: a closed context must not receive namespaces.
        if (isClosed) {
            throw new IllegalStateException("Not open: " + context);
        }
        if (isInitialized) {
            return;
        }
        isInitialized = true;
        Prefixes prefixes = extractor.getDescription().getPrefixes();
        for (String prefix : prefixes.allPrefixes()) {
            try {
                tripleHandler.receiveNamespace(prefix, prefixes.getNamespaceIRIFor(prefix), context);
            } catch (TripleHandlerException e) {
                throw new RuntimeException(String.format(Locale.ROOT, "Error while writing namespace %s", prefix),
                        e);
            }
        }
    }

    /**
     * Records a resource root on this result.
     *
     * @param path
     *            the path associated with the root.
     * @param root
     *            the root resource.
     * @param extractor
     *            the microformat extractor class that produced the root.
     */
    @Override
    public void addResourceRoot(String[] path, Resource root, Class<? extends MicroformatExtractor> extractor) {
        if (resourceRoots == null) {
            resourceRoots = new ArrayList<>();
        }
        resourceRoots.add(new ResourceRoot(path, root, extractor));
    }

    /**
     * @return a new list holding the resource roots recorded on this result followed by those recorded on its direct
     *         sub results; never <code>null</code>.
     */
    @Override
    public List<ResourceRoot> getResourceRoots() {
        List<ResourceRoot> allRoots = new ArrayList<>();
        if (resourceRoots != null) {
            allRoots.addAll(resourceRoots);
        }
        // Only the roots stored directly on each sub result are read here.
        for (ExtractionResultImpl subResult : subResults) {
            if (subResult.resourceRoots != null) {
                allRoots.addAll(subResult.resourceRoots);
            }
        }
        return allRoots;
    }

    /**
     * Records a property path on this result.
     *
     * @param extractor
     *            the microformat extractor class that produced the path.
     * @param propertySubject
     *            the subject of the property.
     * @param property
     *            the property.
     * @param object
     *            the blank node object of the property.
     * @param path
     *            the path associated with the property.
     */
    @Override
    public void addPropertyPath(Class<? extends MicroformatExtractor> extractor, Resource propertySubject,
            Resource property, BNode object, String[] path) {
        if (propertyPaths == null) {
            propertyPaths = new ArrayList<>();
        }
        propertyPaths.add(new PropertyPath(path, propertySubject, property, object, extractor));
    }

    /**
     * @return a new list holding the property paths recorded on this result followed by those recorded on its direct
     *         sub results; never <code>null</code>.
     */
    @Override
    public List<PropertyPath> getPropertyPaths() {
        List<PropertyPath> allPaths = new ArrayList<>();
        if (propertyPaths != null) {
            allPaths.addAll(propertyPaths);
        }
        // Only the paths stored directly on each sub result are read here.
        for (ExtractionResultImpl subResult : subResults) {
            if (subResult.propertyPaths != null) {
                allPaths.addAll(subResult.propertyPaths);
            }
        }
        return allPaths;
    }

    /**
     * @return the context followed by an <code>Errors { ... }</code> section listing one issue per line.
     */
    @Override
    public String toString() {
        final StringBuilder sb = new StringBuilder();
        sb.append(context.toString());
        sb.append('\n');
        // Both call sites of the private constructor pass a non-null list, so the section is always printed.
        sb.append("Errors {\n");
        for (Issue issue : issues) {
            sb.append('\t');
            sb.append(issue.toString());
            sb.append('\n');
        }
        sb.append("}\n");
        return sb.toString();
    }
}