blob: 974b0043d5ee96c3a2509f11061ad0157ad54195 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.any23.extractor;
import org.apache.any23.extractor.html.MicroformatExtractor;
import org.eclipse.rdf4j.model.BNode;
import org.eclipse.rdf4j.model.Resource;
import java.util.Arrays;
import java.util.List;
import java.util.Locale;
/**
* This interface models a specific {@link ExtractionResult}
* able to collect property roots generated by <i>HTML Microformat</i> extractions.
*
* @author Michele Mostarda (mostarda@fbk.eu)
*/
public interface TagSoupExtractionResult extends ExtractionResult {
/**
* Adds a root property to the extraction result, recording together with it
* the <i>path</i> corresponding to the root of the data which generated the property
* and the extractor responsible for the addition.
*
* @param path the <i>path</i> from the document root to the local root of the data generating the property.
* @param root the property root node.
* @param extractor the class of the extractor responsible for such extraction.
*/
void addResourceRoot(String[] path, Resource root, Class<? extends MicroformatExtractor> extractor);
/**
* Returns all the property roots collected so far.
*
* @return an <b>unmodifiable</b> list of the collected
* {@link ResourceRoot}s.
*/
List<ResourceRoot> getResourceRoots();
/**
* Adds a property path to the list of the extracted data.
*
* @param extractor the class of the extractor responsible for retrieving the property.
* @param propertySubject the subject of the property.
* @param property the property IRI.
* @param object the property object if any, <code>null</code> otherwise.
* @param path the path of the <i>HTML</i> node from which the property literal has been extracted.
*/
void addPropertyPath(
Class<? extends MicroformatExtractor> extractor,
Resource propertySubject,
Resource property,
BNode object,
String[] path
);
/**
* Returns all the property paths collected so far.
*
* @return a valid list of the collected {@link PropertyPath}s.
*/
List<PropertyPath> getPropertyPaths();
/**
 * Defines a property root object: the association between a document <i>path</i>,
 * the root {@link Resource} located at that path and the class of the
 * extractor that produced it.
 *
 * <p>All fields are assigned once at construction time. The path array is copied
 * both when it is received and when it is returned, so external code cannot alter
 * the state of an instance after it has been validated.</p>
 */
class ResourceRoot {

    private final String[] path;
    private final Resource root;
    private final Class<? extends MicroformatExtractor> extractor;

    /**
     * Creates a new property root.
     *
     * @param path the path of the root; must contain at least one element.
     * @param root the root resource.
     * @param extractor the class of the extractor which detected the root.
     * @throws IllegalArgumentException if <code>path</code> is <code>null</code> or empty,
     *         or if <code>root</code> or <code>extractor</code> is <code>null</code>.
     */
    public ResourceRoot(String[] path, Resource root, Class<? extends MicroformatExtractor> extractor) {
        if (path == null || path.length == 0) {
            throw new IllegalArgumentException(
                    String.format(Locale.ROOT, "Invalid xpath: '%s'.", Arrays.toString(path))
            );
        }
        if (root == null) {
            throw new IllegalArgumentException("Invalid root, cannot be null.");
        }
        if (extractor == null) {
            throw new IllegalArgumentException("Invalid extractor, cannot be null");
        }
        // Defensive copy: later changes to the caller's array must not affect this instance.
        this.path = path.clone();
        this.root = root;
        this.extractor = extractor;
    }

    /**
     * @return a copy of the path of this root; modifying it does not affect this instance.
     */
    public String[] getPath() {
        return path.clone();
    }

    /**
     * @return the root resource.
     */
    public Resource getRoot() {
        return root;
    }

    /**
     * @return the class of the extractor which detected this root.
     */
    public Class<? extends MicroformatExtractor> getExtractor() {
        return extractor;
    }

    /**
     * @return a human readable representation listing the class name, path, root and extractor.
     */
    @Override
    public String toString() {
        return String.format(Locale.ROOT,
                "%s-%s-%s %s",
                this.getClass().getCanonicalName(),
                Arrays.toString(path),
                root,
                extractor
        );
    }
}
/**
 * Defines a property path object: a subject / property / optional object triple
 * together with the <i>path</i> it was extracted from and the class of the
 * extractor that produced it.
 *
 * <p>All fields are assigned once at construction time. The path array is copied
 * both when it is received and when it is returned, so external code cannot alter
 * the state of an instance after construction.</p>
 */
class PropertyPath {

    private final Class<? extends MicroformatExtractor> extractor;
    private final String[] path;
    private final Resource subject;
    private final Resource property;
    private final BNode object;

    /**
     * Creates a new property path.
     *
     * @param path the path the property has been extracted from.
     * @param subject the subject of the property.
     * @param property the property itself.
     * @param object the object of the property, may be <code>null</code>.
     * @param extractor the class of the extractor which detected the property.
     * @throws NullPointerException if <code>path</code>, <code>subject</code>,
     *         <code>property</code> or <code>extractor</code> is <code>null</code>.
     */
    public PropertyPath(String[] path, Resource subject, Resource property, BNode object, Class<? extends MicroformatExtractor> extractor) {
        if (path == null) {
            throw new NullPointerException("path cannot be null.");
        }
        if (subject == null) {
            throw new NullPointerException("subject cannot be null.");
        }
        if (property == null) {
            throw new NullPointerException("property cannot be null.");
        }
        if (extractor == null) {
            throw new NullPointerException("extractor cannot be null.");
        }
        // Defensive copy: later changes to the caller's array must not affect this instance.
        this.path = path.clone();
        this.subject = subject;
        this.property = property;
        this.object = object; // the only optional component, null is accepted.
        this.extractor = extractor;
    }

    /**
     * @return a copy of the path of this property; modifying it does not affect this instance.
     */
    public String[] getPath() {
        return path.clone();
    }

    /**
     * @return the subject of the property.
     */
    public Resource getSubject() {
        return subject;
    }

    /**
     * @return the property.
     */
    public Resource getProperty() {
        return property;
    }

    /**
     * @return the object of the property, or <code>null</code> if none was given.
     */
    public BNode getObject() {
        return object;
    }

    /**
     * @return the class of the extractor which detected this property.
     */
    public Class<? extends MicroformatExtractor> getExtractor() {
        return extractor;
    }

    /**
     * @return a human readable representation listing the class name, path, extractor
     *         and the subject, property and object (printed as <code>null</code> when absent).
     */
    @Override
    public String toString() {
        // The arrow is followed by the object so that the triple is printed in full.
        return String.format(Locale.ROOT,
                "%s %s - %s - %s -- %s --> %s",
                this.getClass().getCanonicalName(),
                Arrays.toString(path),
                extractor,
                subject,
                property,
                object
        );
    }
}
}