// any23-core/src/main/java/org/deri/any23/extractor/ExtractionResultImpl.java - any23 - Git at Google

 /**
  * Copyright 2008-2010 Digital Enterprise Research Institute (DERI)
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
  * You may obtain a copy of the License at
  *
  *          http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  *
  */

 package org.deri.any23.extractor;

 import org.deri.any23.rdf.Prefixes;
 import org.deri.any23.writer.TripleHandler;
 import org.deri.any23.writer.TripleHandlerException;
 import org.openrdf.model.BNode;
 import org.openrdf.model.Resource;
 import org.openrdf.model.URI;
 import org.openrdf.model.Value;

 import java.io.PrintStream;
 import java.util.ArrayList;
 import java.util.Collection;
 import java.util.Collections;
 import java.util.HashSet;
 import java.util.List;
 import java.util.Set;

 /**
  * <p/>
  * A default implementation of {@link ExtractionResult}; it receives
  * extraction output from one {@link Extractor} working on one document,
  * and passes the output on to a {@link TripleHandler}. It deals with
  * details such as creation of {@link ExtractionContext} objects
  * and closing any open contexts at the end of extraction.
  * <p/>
  * The {@link #close()} method must be invoked after the extractor has
  * finished processing.
  * <p/>
  * There is usually no need to provide additional implementations
  * of the ExtractionWriter interface.
  * <p/>
  *
  * @see org.deri.any23.writer.TripleHandler
  * @see org.deri.any23.extractor.ExtractionContext
  * @author Richard Cyganiak (richard@cyganiak.de)
  * @author Michele Mostarda (michele.mostarda@gmail.com)
  */
 public class ExtractionResultImpl implements TagSoupExtractionResult {

     public static final String ROOT_EXTRACTION_RESULT_ID = "root-extraction-result-id";

     private static final DocumentContext DEFAULT_DOCUMENT_CONTEXT = new DocumentContext(null);

     private final DocumentContext documentContext;

     private final URI documentURI;

     private final Extractor<?> extractor;

     private final TripleHandler tripleHandler;

     private final ExtractionContext context;

     private final Collection<ExtractionResult> subResults = new ArrayList<ExtractionResult>();

     private final Set<Object> knownContextIDs = new HashSet<Object>();

     private boolean isClosed = false;

     private boolean isInitialized = false;

     private List<Error> errors;

     private List<ResourceRoot> resourceRoots;

     private List<PropertyPath> propertyPaths;

     public ExtractionResultImpl(
             DocumentContext documentContext,
             URI documentURI,
             Extractor<?> extractor,
             TripleHandler tripleHandler,
             Object contextID
     ) {
         if(documentContext == null) {
             throw new NullPointerException("document context cannot be null.");
         }
         if(documentURI == null) {
             throw new NullPointerException("document URI cannot be null.");
         }
         if(extractor == null) {
             throw new NullPointerException("extractor cannot be null.");
         }
         if(tripleHandler == null) {
             throw new NullPointerException("triple handler cannot be null.");
         }
         if(contextID == null) {
             throw new NullPointerException("contextID cannot be null.");
         }

         this.documentContext = documentContext;
         this.documentURI     = documentURI;
         this.extractor       = extractor;
         this.tripleHandler   = tripleHandler;
         this.context = new ExtractionContext(
                 extractor.getDescription().getExtractorName(),
                 documentURI,
                 Integer.toHexString(contextID.hashCode())
         );
         knownContextIDs.add(contextID);
     }

     public ExtractionResultImpl(
             DocumentContext documentContext,
             URI documentURI,
             Extractor<?> extractor,
             TripleHandler tripleHandler
     ) {
         this(documentContext, documentURI, extractor, tripleHandler, ROOT_EXTRACTION_RESULT_ID);
     }

     public ExtractionResultImpl(
             URI documentURI,
             Extractor<?> extractor,
             TripleHandler tripleHandler
     ) {
         this(DEFAULT_DOCUMENT_CONTEXT, documentURI, extractor, tripleHandler, ROOT_EXTRACTION_RESULT_ID);
     }

     public boolean hasErrors() {
         return errors != null;
     }

     public int getErrorsCount() {
         return errors == null ? 0 : errors.size();
     }

     public void printErrorsReport(PrintStream ps) {
         ps.print(String.format("Context: %s [errors: %d] {\n", context, getErrorsCount()));
         if (errors != null) {
             for (Error error : errors) {
                 ps.print(error.toString());
                 ps.print("\n");
             }
         }
         // Printing sub results.
         for (ExtractionResult er : subResults) {
             er.printErrorsReport(ps);
         }
         ps.print("}\n");
     }

     public Collection<Error> getErrors() {
         return errors == null ? Collections.<Error>emptyList() : Collections.unmodifiableList(errors);
     }

     public ExtractionResult openSubResult(Object contextID) {
         if (knownContextIDs.contains(contextID)) {
             throw new IllegalArgumentException("Duplicate contextID: " + contextID);
         }
         knownContextIDs.add(contextID);

         checkOpen();
         ExtractionResult result =
                 new ExtractionResultImpl(documentContext, documentURI, extractor, tripleHandler, contextID);
         subResults.add(result);
         return result;
     }

     public DocumentContext getDocumentContext() {
         return documentContext;
     }

     public ExtractionContext getExtractionContext() {
         return context;
     }

     public void writeTriple(Resource s, URI p, Value o, URI g) {
         if (s == null || p == null || o == null) return;
         // Check for mal-constructed literals or BNodes, Sesame does not catch this.
         if (s.stringValue() == null || p.stringValue() == null || o.stringValue() == null) return;
         checkOpen();
         try {
             tripleHandler.receiveTriple(s, p, o, g, context);
         } catch (TripleHandlerException e) {
             throw new RuntimeException(
                     String.format("Error while receiving triple %s %s %s", s, p, o ),
                     e
             );
         }
     }

     public void writeTriple(Resource s, URI p, Value o) {
         writeTriple(s, p, o, null);
     }

     public void writeNamespace(String prefix, String uri) {
         checkOpen();
         try {
             tripleHandler.receiveNamespace(prefix, uri, context);
         } catch (TripleHandlerException e) {
             throw new RuntimeException(
                     String.format("Error while writing namespace %s:%s", prefix, uri),
                     e
             );
         }
     }

     public void notifyError(ErrorLevel level, String msg, int row, int col) {
         if(errors == null) {
             errors = new ArrayList<Error>();
         }
         errors.add( new Error(level, msg, row, col) );
     }

     public void close() {
         if (isClosed) return;
         isClosed = true;
         for (ExtractionResult subResult : subResults) {
             subResult.close();
         }
         if (isInitialized) {
             try {
                 tripleHandler.closeContext(context);
             } catch (TripleHandlerException e) {
                 throw new RuntimeException("Error while opening context", e);
             }
         }
     }

     private void checkOpen() {
         if (!isInitialized) {
             isInitialized = true;
             try {
                 tripleHandler.openContext(context);
             } catch (TripleHandlerException e) {
                 throw new RuntimeException("Error while opening context", e);
             }
             Prefixes prefixes = extractor.getDescription().getPrefixes();
             for (String prefix : prefixes.allPrefixes()) {
                 try {
                     tripleHandler.receiveNamespace(prefix, prefixes.getNamespaceURIFor(prefix), context);
                 } catch (TripleHandlerException e) {
                     throw new RuntimeException(String.format("Error while writing namespace %s", prefix),
                             e
                     );
                 }
             }
         }
         if (isClosed) {
             throw new IllegalStateException("Not open: " + context);
         }
     }

     public void addResourceRoot(String[] path, Resource root, String extractor) {
         if(resourceRoots == null) {
             resourceRoots = new ArrayList<ResourceRoot>();
         }
         resourceRoots.add( new ResourceRoot(path, root, extractor) );
     }

     public List<ResourceRoot> getResourceRoots() {
         List<ResourceRoot> allRoots = new ArrayList<ResourceRoot>();
         if(resourceRoots != null) {
             allRoots.addAll( resourceRoots );
         }
         for(ExtractionResult er : subResults) {
             ExtractionResultImpl eri = (ExtractionResultImpl) er;
             if( eri.resourceRoots != null ) {
                 allRoots.addAll( eri.resourceRoots );
             }
         }
         return allRoots;
     }

     public void addPropertyPath(
             String extractor, Resource propertySubject, Resource property, BNode object, String[] path
     ) {
         if(propertyPaths == null) {
             propertyPaths = new ArrayList<PropertyPath>();
         }
         propertyPaths.add( new PropertyPath(path, propertySubject, property, object, extractor) );
     }

     public List<PropertyPath> getPropertyPaths() {
         List<PropertyPath> allPaths = new ArrayList<PropertyPath>();
         if(propertyPaths != null) {
             allPaths.addAll( propertyPaths );
         }
         for(ExtractionResult er : subResults) {
             ExtractionResultImpl eri = (ExtractionResultImpl) er;
             if( eri.propertyPaths != null ) {
                 allPaths.addAll( eri.propertyPaths );
             }
         }
         return allPaths;
     }

 }
	/**
	* Copyright 2008-2010 Digital Enterprise Research Institute (DERI)
	*
	* Licensed under the Apache License, Version 2.0 (the "License");
	* you may not use this file except in compliance with the License.
	* You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*
	*/

	package org.deri.any23.extractor;

	import org.deri.any23.rdf.Prefixes;
	import org.deri.any23.writer.TripleHandler;
	import org.deri.any23.writer.TripleHandlerException;
	import org.openrdf.model.BNode;
	import org.openrdf.model.Resource;
	import org.openrdf.model.URI;
	import org.openrdf.model.Value;

	import java.io.PrintStream;
	import java.util.ArrayList;
	import java.util.Collection;
	import java.util.Collections;
	import java.util.HashSet;
	import java.util.List;
	import java.util.Set;

	/**
	* <p/>
	* A default implementation of {@link ExtractionResult}; it receives
	* extraction output from one {@link Extractor} working on one document,
	* and passes the output on to a {@link TripleHandler}. It deals with
	* details such as creation of {@link ExtractionContext} objects
	* and closing any open contexts at the end of extraction.
	* <p/>
	* The {@link #close()} method must be invoked after the extractor has
	* finished processing.
	* <p/>
	* There is usually no need to provide additional implementations
	* of the ExtractionWriter interface.
	* <p/>
	*
	* @see org.deri.any23.writer.TripleHandler
	* @see org.deri.any23.extractor.ExtractionContext
	* @author Richard Cyganiak (richard@cyganiak.de)
	* @author Michele Mostarda (michele.mostarda@gmail.com)
	*/
	public class ExtractionResultImpl implements TagSoupExtractionResult {

	public static final String ROOT_EXTRACTION_RESULT_ID = "root-extraction-result-id";

	private static final DocumentContext DEFAULT_DOCUMENT_CONTEXT = new DocumentContext(null);

	private final DocumentContext documentContext;

	private final URI documentURI;

	private final Extractor<?> extractor;

	private final TripleHandler tripleHandler;

	private final ExtractionContext context;

	private final Collection<ExtractionResult> subResults = new ArrayList<ExtractionResult>();

	private final Set<Object> knownContextIDs = new HashSet<Object>();

	private boolean isClosed = false;

	private boolean isInitialized = false;

	private List<Error> errors;

	private List<ResourceRoot> resourceRoots;

	private List<PropertyPath> propertyPaths;

	public ExtractionResultImpl(
	DocumentContext documentContext,
	URI documentURI,
	Extractor<?> extractor,
	TripleHandler tripleHandler,
	Object contextID
	) {
	if(documentContext == null) {
	throw new NullPointerException("document context cannot be null.");
	}
	if(documentURI == null) {
	throw new NullPointerException("document URI cannot be null.");
	}
	if(extractor == null) {
	throw new NullPointerException("extractor cannot be null.");
	}
	if(tripleHandler == null) {
	throw new NullPointerException("triple handler cannot be null.");
	}
	if(contextID == null) {
	throw new NullPointerException("contextID cannot be null.");
	}

	this.documentContext = documentContext;
	this.documentURI = documentURI;
	this.extractor = extractor;
	this.tripleHandler = tripleHandler;
	this.context = new ExtractionContext(
	extractor.getDescription().getExtractorName(),
	documentURI,
	Integer.toHexString(contextID.hashCode())
	);
	knownContextIDs.add(contextID);
	}

	public ExtractionResultImpl(
	DocumentContext documentContext,
	URI documentURI,
	Extractor<?> extractor,
	TripleHandler tripleHandler
	) {
	this(documentContext, documentURI, extractor, tripleHandler, ROOT_EXTRACTION_RESULT_ID);
	}

	public ExtractionResultImpl(
	URI documentURI,
	Extractor<?> extractor,
	TripleHandler tripleHandler
	) {
	this(DEFAULT_DOCUMENT_CONTEXT, documentURI, extractor, tripleHandler, ROOT_EXTRACTION_RESULT_ID);
	}

	public boolean hasErrors() {
	return errors != null;
	}

	public int getErrorsCount() {
	return errors == null ? 0 : errors.size();
	}

	public void printErrorsReport(PrintStream ps) {
	ps.print(String.format("Context: %s [errors: %d] {\n", context, getErrorsCount()));
	if (errors != null) {
	for (Error error : errors) {
	ps.print(error.toString());
	ps.print("\n");
	}
	}
	// Printing sub results.
	for (ExtractionResult er : subResults) {
	er.printErrorsReport(ps);
	}
	ps.print("}\n");
	}

	public Collection<Error> getErrors() {
	return errors == null ? Collections.<Error>emptyList() : Collections.unmodifiableList(errors);
	}

	public ExtractionResult openSubResult(Object contextID) {
	if (knownContextIDs.contains(contextID)) {
	throw new IllegalArgumentException("Duplicate contextID: " + contextID);
	}
	knownContextIDs.add(contextID);

	checkOpen();
	ExtractionResult result =
	new ExtractionResultImpl(documentContext, documentURI, extractor, tripleHandler, contextID);
	subResults.add(result);
	return result;
	}

	public DocumentContext getDocumentContext() {
	return documentContext;
	}

	public ExtractionContext getExtractionContext() {
	return context;
	}

	public void writeTriple(Resource s, URI p, Value o, URI g) {
	if (s == null \|\| p == null \|\| o == null) return;
	// Check for mal-constructed literals or BNodes, Sesame does not catch this.
	if (s.stringValue() == null \|\| p.stringValue() == null \|\| o.stringValue() == null) return;
	checkOpen();
	try {
	tripleHandler.receiveTriple(s, p, o, g, context);
	} catch (TripleHandlerException e) {
	throw new RuntimeException(
	String.format("Error while receiving triple %s %s %s", s, p, o ),
	e
	);
	}
	}

	public void writeTriple(Resource s, URI p, Value o) {
	writeTriple(s, p, o, null);
	}

	public void writeNamespace(String prefix, String uri) {
	checkOpen();
	try {
	tripleHandler.receiveNamespace(prefix, uri, context);
	} catch (TripleHandlerException e) {
	throw new RuntimeException(
	String.format("Error while writing namespace %s:%s", prefix, uri),
	e
	);
	}
	}

	public void notifyError(ErrorLevel level, String msg, int row, int col) {
	if(errors == null) {
	errors = new ArrayList<Error>();
	}
	errors.add( new Error(level, msg, row, col) );
	}

	public void close() {
	if (isClosed) return;
	isClosed = true;
	for (ExtractionResult subResult : subResults) {
	subResult.close();
	}
	if (isInitialized) {
	try {
	tripleHandler.closeContext(context);
	} catch (TripleHandlerException e) {
	throw new RuntimeException("Error while opening context", e);
	}
	}
	}

	private void checkOpen() {
	if (!isInitialized) {
	isInitialized = true;
	try {
	tripleHandler.openContext(context);
	} catch (TripleHandlerException e) {
	throw new RuntimeException("Error while opening context", e);
	}
	Prefixes prefixes = extractor.getDescription().getPrefixes();
	for (String prefix : prefixes.allPrefixes()) {
	try {
	tripleHandler.receiveNamespace(prefix, prefixes.getNamespaceURIFor(prefix), context);
	} catch (TripleHandlerException e) {
	throw new RuntimeException(String.format("Error while writing namespace %s", prefix),
	e
	);
	}
	}
	}
	if (isClosed) {
	throw new IllegalStateException("Not open: " + context);
	}
	}

	public void addResourceRoot(String[] path, Resource root, String extractor) {
	if(resourceRoots == null) {
	resourceRoots = new ArrayList<ResourceRoot>();
	}
	resourceRoots.add( new ResourceRoot(path, root, extractor) );
	}

	public List<ResourceRoot> getResourceRoots() {
	List<ResourceRoot> allRoots = new ArrayList<ResourceRoot>();
	if(resourceRoots != null) {
	allRoots.addAll( resourceRoots );
	}
	for(ExtractionResult er : subResults) {
	ExtractionResultImpl eri = (ExtractionResultImpl) er;
	if( eri.resourceRoots != null ) {
	allRoots.addAll( eri.resourceRoots );
	}
	}
	return allRoots;
	}

	public void addPropertyPath(
	String extractor, Resource propertySubject, Resource property, BNode object, String[] path
	) {
	if(propertyPaths == null) {
	propertyPaths = new ArrayList<PropertyPath>();
	}
	propertyPaths.add( new PropertyPath(path, propertySubject, property, object, extractor) );
	}

	public List<PropertyPath> getPropertyPaths() {
	List<PropertyPath> allPaths = new ArrayList<PropertyPath>();
	if(propertyPaths != null) {
	allPaths.addAll( propertyPaths );
	}
	for(ExtractionResult er : subResults) {
	ExtractionResultImpl eri = (ExtractionResultImpl) er;
	if( eri.propertyPaths != null ) {
	allPaths.addAll( eri.propertyPaths );
	}
	}
	return allPaths;
	}

	}