blob: ad7c1ed1e2c689a01dc485638ce55a148aa9907c [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.any23.servlet;
import org.apache.any23.configuration.DefaultConfiguration;
import org.apache.any23.extractor.ExtractionParameters;
import org.apache.any23.extractor.ExtractorRegistry;
import org.apache.any23.extractor.ExtractorRegistryImpl;
import org.apache.any23.http.HTTPClient;
import org.apache.any23.plugin.Any23PluginManager;
import org.apache.any23.servlet.conneg.Any23Negotiator;
import org.apache.any23.servlet.conneg.MediaRangeSpec;
import org.apache.any23.source.ByteArrayDocumentSource;
import org.apache.any23.source.DocumentSource;
import org.apache.any23.source.HTTPDocumentSource;
import org.apache.any23.source.StringDocumentSource;
import org.eclipse.rdf4j.rio.RDFFormat;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import javax.servlet.ServletException;
import javax.servlet.http.HttpServlet;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;
import java.io.File;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.regex.Pattern;
import static org.apache.any23.extractor.ExtractionParameters.ValidationMode;
/**
* A <i>Servlet</i> that fetches a client-specified <i>IRI</i>,
* RDFizes the content, and returns it in a format chosen by the client.
*
* @author Gabriele Renzi
* @author Richard Cyganiak (richard@cyganiak.de)
*/
public class Servlet extends HttpServlet {
private static final Logger LOG = LoggerFactory.getLogger(Servlet.class);
public static final String DEFAULT_BASE_IRI = "http://any23.org/tmp/";
private static final long serialVersionUID = 8207685628715421336L;
private static final Pattern schemeAndSingleSlashRegex =
Pattern.compile("^[a-zA-Z][a-zA-Z0-9.+-]*:/[^/]");
// RFC 3986: scheme = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )
private static final Pattern schemeRegex =
Pattern.compile("^[a-zA-Z][a-zA-Z0-9.+-]*:");
@Override
protected void doGet(HttpServletRequest req, HttpServletResponse resp) throws IOException, ServletException {
final WebResponder responder = new WebResponder(this, resp);
final String format = getFormatFromRequestOrNegotiation(req);
final boolean report = isReport(req);
final boolean annotate = isAnnotated(req);
final boolean openie = isOpenIE(req);
if (format == null) {
try {
responder.sendError(406, "Client accept header does not include a supported output format", report);
return;
} catch (IOException e) {
LOG.error("Unable to send error for null request format.", e);
}
}
final String uri = getInputIRIFromRequest(req);
if (uri == null) {
try {
responder.sendError(404, "Missing IRI in GET request. Try /format/http://example.com/myfile", report);
return;
} catch (Exception e) {
LOG.error("Unable to send error for null request IRI.", e);
}
}
if (openie) {
Any23PluginManager pManager = Any23PluginManager.getInstance();
//Dynamically adding Jar's to the Classpath via the following logic
//is absolutely dependant on the 'apache-any23-openie' directory being
//present within the webapp /lib directory. This is specified within
//the maven-dependency-plugin.
File webappClasspath = new File(getClass().getClassLoader().getResource("").getPath());
File openIEJarPath = new File(webappClasspath.getParentFile().getPath() + "/lib/apache-any23-openie");
boolean loadedJars = pManager.loadJARDir(openIEJarPath);
if (loadedJars) {
ExtractorRegistry r = ExtractorRegistryImpl.getInstance();
try {
pManager.getExtractors().forEachRemaining(r::register);
} catch (IOException e) {
LOG.error("Error during dynamic classloading of JARs from OpenIE runtime directory {}", openIEJarPath.toString(), e);
}
LOG.info("Successful dynamic classloading of JARs from OpenIE runtime directory {}", openIEJarPath.toString());
}
}
final ExtractionParameters eps = getExtractionParameters(req);
try {
responder.runExtraction(createHTTPDocumentSource(responder, uri, report), eps, format, report, annotate);
} catch (IOException e) {
LOG.error("Unable to run extraction on HTTPDocumentSource.", e);
}
}
@Override
protected void doPost(HttpServletRequest req, HttpServletResponse resp) throws IOException {
final WebResponder responder = new WebResponder(this, resp);
final boolean report = isReport(req);
final boolean annotate = isAnnotated(req);
final boolean openie = isOpenIE(req);
if (req.getContentType() == null) {
responder.sendError(400, "Invalid POST request, no Content-Type for the message body specified", report);
return;
}
final String uri = getInputIRIFromRequest(req);
final String format = getFormatFromRequestOrNegotiation(req);
if (format == null) {
responder.sendError(406, "Client accept header does not include a supported output format", report);
return;
}
if (openie) {
Any23PluginManager pManager = Any23PluginManager.getInstance();
pManager.loadJARDir(new File(getClass().getResource("apache-any23-openie").getPath()));
}
final ExtractionParameters eps = getExtractionParameters(req);
if ("application/x-www-form-urlencoded".equals(getContentTypeHeader(req))) {
if (uri != null) {
log("Attempting conversion to '" + format + "' from IRI <" + uri + ">");
responder.runExtraction(createHTTPDocumentSource(responder, uri, report), eps, format, report, annotate);
return;
}
if (req.getParameter("body") == null) {
responder.sendError(400, "Invalid POST request, parameter 'uri' or 'body' required", report);
return;
}
String type = null;
if (req.getParameter("type") != null && !"".equals(req.getParameter("type"))) {
type = req.getParameter("type");
}
log("Attempting conversion to '" + format + "' from body parameter");
responder.runExtraction(
new StringDocumentSource(req.getParameter("body"), Servlet.DEFAULT_BASE_IRI, type),
eps,
format,
report, annotate
);
return;
}
log("Attempting conversion to '" + format + "' from POST body");
responder.runExtraction(
new ByteArrayDocumentSource(
req.getInputStream(),
Servlet.DEFAULT_BASE_IRI,
getContentTypeHeader(req)
),
eps,
format,
report, annotate
);
}
private String getFormatFromRequestOrNegotiation(HttpServletRequest request) {
String fromRequest = getFormatFromRequest(request);
if (fromRequest != null && !"".equals(fromRequest) && !"best".equals(fromRequest)) {
return fromRequest;
}
MediaRangeSpec result = Any23Negotiator.getNegotiator().getBestMatch(request.getHeader("Accept"));
if (result == null) {
return null;
} else if (RDFFormat.N3.hasMIMEType(result.getMediaType())) {
return "n3";
} else if (RDFFormat.NQUADS.hasMIMEType(result.getMediaType())) {
return "nq";
} else if (RDFFormat.RDFXML.hasMIMEType(result.getMediaType())) {
return "rdf";
} else if (RDFFormat.NTRIPLES.hasMIMEType(result.getMediaType())) {
return "nt";
} else if (RDFFormat.JSONLD.hasMIMEType(result.getMediaType())) {
return "ld+json";
} else {
return "turtle"; // shouldn't happen however default is turtle
}
}
private String getFormatFromRequest(HttpServletRequest request) {
if (request.getPathInfo() == null)
return "best";
String[] args = request.getPathInfo().split("/", 3);
if (args.length < 2 || "".equals(args[1])) {
if (request.getParameter("format") == null) {
return "best";
} else {
return request.getParameter("format");
}
}
return args[1];
}
private String getInputIRIFromRequest(HttpServletRequest request) {
if (request.getPathInfo() == null)
return null;
String[] args = request.getPathInfo().split("/", 3);
if (args.length < 3) {
if (request.getParameter("uri") != null) {
return request.getParameter("uri").trim();
}
if (request.getParameter("url") != null) {
return request.getParameter("url").trim();
}
return null;
}
String uri = args[2];
if (request.getQueryString() != null) {
uri = uri + "?" + request.getQueryString();
}
if (!hasScheme(uri)) {
uri = "http://" + uri;
} else if (hasOnlySingleSlashAfterScheme(uri)) {
// This is to work around an issue where Tomcat 6.0.18 is
// too smart for us. Tomcat normalizes double-slashes in
// the path, and thus turns "http://" into "http:/" if it
// occurs in the path. So we restore the double slash.
uri = uri.replaceFirst(":/", "://");
}
return uri.trim();
}
private boolean hasScheme(String uri) {
return schemeRegex.matcher(uri).find();
}
private boolean hasOnlySingleSlashAfterScheme(String uri) {
return schemeAndSingleSlashRegex.matcher(uri).find();
}
private String getContentTypeHeader(HttpServletRequest req) {
String cType = "Content-Type";
if (req.getHeader(cType) == null)
return null;
if ("".equals(req.getHeader(cType)))
return null;
String contentType = req.getHeader(cType);
// strip off parameters such as ";charset=UTF-8"
int index = contentType.indexOf(';');
if (index == -1)
return contentType;
return contentType.substring(0, index);
}
private DocumentSource createHTTPDocumentSource(WebResponder responder, String uri, boolean report)
throws IOException {
try {
if (!isValidIRI(uri)) {
throw new URISyntaxException(uri, "@@@");
}
return createHTTPDocumentSource(responder.getRunner().getHTTPClient(), uri);
} catch (URISyntaxException ex) {
LOG.error("Invalid IRI detected", ex);
responder.sendError(400, "Invalid input IRI " + uri, report);
return null;
}
}
protected DocumentSource createHTTPDocumentSource(HTTPClient httpClient, String uri)
throws IOException, URISyntaxException {
return new HTTPDocumentSource(httpClient, uri);
}
private boolean isValidIRI(String s) {
try {
URI uri = new URI(s);
if (!"http".equals(uri.getScheme()) && !"https".equals(uri.getScheme())) {
return false;
}
} catch (Exception e) {
return false;
}
return true;
}
private ValidationMode getValidationMode(HttpServletRequest request) {
final String parameter = "validation-mode";
final String validationMode = request.getParameter(parameter);
if (validationMode == null)
return ValidationMode.NONE;
if ("none".equalsIgnoreCase(validationMode))
return ValidationMode.NONE;
if ("validate".equalsIgnoreCase(validationMode))
return ValidationMode.VALIDATE;
if ("validate-fix".equalsIgnoreCase(validationMode))
return ValidationMode.VALIDATE_AND_FIX;
throw new IllegalArgumentException(
String.format("Invalid value '%s' for '%s' parameter.", validationMode, parameter)
);
}
private ExtractionParameters getExtractionParameters(HttpServletRequest request) {
final ValidationMode mode = getValidationMode(request);
return new ExtractionParameters(DefaultConfiguration.singleton(), mode);
}
private boolean isReport(HttpServletRequest request) {
return request.getParameter("report") != null;
}
private boolean isAnnotated(HttpServletRequest request) {
return request.getParameter("annotate") != null;
}
private boolean isOpenIE(HttpServletRequest request) {
return request.getParameter("openie") != null;
}
}