| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| package org.apache.any23.extractor; |
| |
| import org.apache.any23.extractor.html.MicroformatExtractor; |
| import org.apache.any23.rdf.Prefixes; |
| import org.apache.any23.writer.TripleHandler; |
| import org.apache.any23.writer.TripleHandlerException; |
| import org.eclipse.rdf4j.model.BNode; |
| import org.eclipse.rdf4j.model.Resource; |
| import org.eclipse.rdf4j.model.IRI; |
| import org.eclipse.rdf4j.model.Value; |
| |
| import java.io.PrintStream; |
| import java.util.ArrayList; |
| import java.util.Collection; |
| import java.util.Collections; |
| import java.util.HashSet; |
| import java.util.List; |
| import java.util.Locale; |
| import java.util.Set; |
| |
| /** |
| * <p> |
| * A default implementation of {@link ExtractionResult}; it receives |
| * extraction output from one {@link Extractor} working on one document, |
| * and passes the output on to a {@link TripleHandler}. It deals with |
| * details such as creation of {@link ExtractionContext} objects |
| * and closing any open contexts at the end of extraction. |
| * </p> |
| * <p> |
| * The {@link #close()} method must be invoked after the extractor has |
| * finished processing. |
| * </p> |
| * <p> |
| * There is usually no need to provide additional implementations |
| * of the ExtractionWriter interface. |
| *</p> |
| * |
| * @see org.apache.any23.writer.TripleHandler |
| * @see ExtractionContext |
| * @author Richard Cyganiak (richard@cyganiak.de) |
| * @author Michele Mostarda (michele.mostarda@gmail.com) |
| */ |
| public class ExtractionResultImpl implements TagSoupExtractionResult { |
| |
| private final ExtractionContext context; |
| |
| private final Extractor<?> extractor; |
| |
| private final TripleHandler tripleHandler; |
| |
| private final Collection<ExtractionResult> subResults = new ArrayList<>(); |
| |
| private final Set<Object> knownContextIDs = new HashSet<>(); |
| |
| private boolean isClosed = false; |
| |
| private boolean isInitialized = false; |
| |
| private List<Issue> issues; |
| |
| private List<ResourceRoot> resourceRoots; |
| |
| private List<PropertyPath> propertyPaths; |
| |
| public ExtractionResultImpl( |
| ExtractionContext context, |
| Extractor<?> extractor, |
| TripleHandler tripleHandler |
| ) { |
| this(context, extractor, tripleHandler, new ArrayList<>()); |
| } |
| |
| private ExtractionResultImpl( |
| ExtractionContext context, |
| Extractor<?> extractor, |
| TripleHandler tripleHandler, |
| List<Issue> issues |
| ) { |
| if(context == null) { |
| throw new NullPointerException("context cannot be null."); |
| } |
| if(extractor == null) { |
| throw new NullPointerException("extractor cannot be null."); |
| } |
| if(tripleHandler == null) { |
| throw new NullPointerException("triple handler cannot be null."); |
| } |
| |
| this.extractor = extractor; |
| this.tripleHandler = tripleHandler; |
| this.context = context; |
| this.issues = issues; |
| |
| knownContextIDs.add( context.getUniqueID() ); |
| |
| try { |
| // openContext() must be called before extraction begins |
| // so that BenchmarkTripleHandler can report accurate times. |
| // See https://issues.apache.org/jira/browse/ANY23-337 |
| tripleHandler.openContext(context); |
| } catch (TripleHandlerException e) { |
| throw new RuntimeException("Error while opening context", e); |
| } |
| } |
| |
| public boolean hasIssues() { |
| return ! issues.isEmpty(); |
| } |
| |
| public int getIssuesCount() { |
| return issues.size(); |
| } |
| |
| @Override |
| public void printReport(PrintStream ps) { |
| ps.print(String.format(Locale.ROOT, "Context: %s [errors: %d] {\n", context, getIssuesCount())); |
| for (Issue issue : issues) { |
| ps.print(issue.toString()); |
| ps.print("\n"); |
| } |
| // Printing sub results. |
| for (ExtractionResult er : subResults) { |
| er.printReport(ps); |
| } |
| ps.print("}\n"); |
| } |
| |
| @Override |
| public Collection<Issue> getIssues() { |
| return issues.isEmpty() ? Collections.<Issue>emptyList() : Collections.unmodifiableList(issues); |
| } |
| |
| @Override |
| public ExtractionResult openSubResult(ExtractionContext context) { |
| final String contextID = context.getUniqueID(); |
| if (knownContextIDs.contains(contextID)) { |
| throw new IllegalArgumentException("Duplicate contextID: " + contextID); |
| } |
| knownContextIDs.add(contextID); |
| |
| checkOpen(); |
| ExtractionResult result = new ExtractionResultImpl(context, extractor, tripleHandler, this.issues); |
| subResults.add(result); |
| return result; |
| } |
| |
| public ExtractionContext getExtractionContext() { |
| return context; |
| } |
| |
| @Override |
| public void writeTriple(Resource s, IRI p, Value o, IRI g) { |
| if (s == null || p == null || o == null) return; |
| // Check for misconstructed literals or BNodes, Sesame does not catch this. |
| if (s.stringValue() == null || p.stringValue() == null || o.stringValue() == null) { |
| throw new IllegalArgumentException("The statement arguments must be not null."); |
| } |
| checkOpen(); |
| try { |
| tripleHandler.receiveTriple(s, p, o, g, context); |
| } catch (TripleHandlerException e) { |
| throw new RuntimeException( |
| String.format(Locale.ROOT, "Error while receiving triple %s %s %s", s, p, o ), |
| e |
| ); |
| } |
| } |
| |
| boolean wasTouched() { |
| return isInitialized; |
| } |
| |
| @Override |
| public void writeTriple(Resource s, IRI p, Value o) { |
| writeTriple(s, p, o, null); |
| } |
| |
| @Override |
| public void writeNamespace(String prefix, String uri) { |
| checkOpen(); |
| try { |
| tripleHandler.receiveNamespace(prefix, uri, context); |
| } catch (TripleHandlerException e) { |
| throw new RuntimeException( |
| String.format(Locale.ROOT, "Error while writing namespace %s:%s", prefix, uri), |
| e |
| ); |
| } |
| } |
| |
| @Override |
| public void notifyIssue(IssueLevel level, String msg, long row, long col) { |
| issues.add(new Issue(level, msg, row, col)); |
| } |
| |
| @Override |
| public void close() { |
| if (isClosed) return; |
| isClosed = true; |
| for (ExtractionResult subResult : subResults) { |
| subResult.close(); |
| } |
| try { |
| tripleHandler.closeContext(context); |
| } catch (TripleHandlerException e) { |
| throw new RuntimeException("Error while opening context", e); |
| } |
| } |
| |
| private void checkOpen() { |
| if (!isInitialized) { |
| isInitialized = true; |
| Prefixes prefixes = extractor.getDescription().getPrefixes(); |
| for (String prefix : prefixes.allPrefixes()) { |
| try { |
| tripleHandler.receiveNamespace(prefix, prefixes.getNamespaceIRIFor(prefix), context); |
| } catch (TripleHandlerException e) { |
| throw new RuntimeException(String.format(Locale.ROOT, "Error while writing namespace %s", prefix), |
| e |
| ); |
| } |
| } |
| } |
| if (isClosed) { |
| throw new IllegalStateException("Not open: " + context); |
| } |
| } |
| |
| @Override |
| public void addResourceRoot(String[] path, Resource root, Class<? extends MicroformatExtractor> extractor) { |
| if(resourceRoots == null) { |
| resourceRoots = new ArrayList<>(); |
| } |
| resourceRoots.add( new ResourceRoot(path, root, extractor) ); |
| } |
| |
| @Override |
| public List<ResourceRoot> getResourceRoots() { |
| List<ResourceRoot> allRoots = new ArrayList<>(); |
| if(resourceRoots != null) { |
| allRoots.addAll( resourceRoots ); |
| } |
| for(ExtractionResult er : subResults) { |
| ExtractionResultImpl eri = (ExtractionResultImpl) er; |
| if( eri.resourceRoots != null ) { |
| allRoots.addAll( eri.resourceRoots ); |
| } |
| } |
| return allRoots; |
| } |
| |
| @Override |
| public void addPropertyPath( |
| Class<? extends MicroformatExtractor> extractor, |
| Resource propertySubject, |
| Resource property, |
| BNode object, |
| String[] path |
| ) { |
| if(propertyPaths == null) { |
| propertyPaths = new ArrayList<>(); |
| } |
| propertyPaths.add( new PropertyPath(path, propertySubject, property, object, extractor) ); |
| } |
| |
| @Override |
| public List<PropertyPath> getPropertyPaths() { |
| List<PropertyPath> allPaths = new ArrayList<>(); |
| if(propertyPaths != null) { |
| allPaths.addAll( propertyPaths ); |
| } |
| for(ExtractionResult er : subResults) { |
| ExtractionResultImpl eri = (ExtractionResultImpl) er; |
| if( eri.propertyPaths != null ) { |
| allPaths.addAll( eri.propertyPaths ); |
| } |
| } |
| return allPaths; |
| } |
| |
| @Override |
| public String toString() { |
| final StringBuilder sb = new StringBuilder(); |
| sb.append(context.toString()); |
| sb.append('\n'); |
| if (issues != null) { |
| sb.append("Errors {\n"); |
| for (Issue issue : issues) { |
| sb.append('\t'); |
| sb.append(issue.toString()); |
| sb.append('\n'); |
| } |
| } |
| sb.append("}\n"); |
| return sb.toString(); |
| } |
| |
| } |