| /** |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.marmotta.ldclient.services.provider; |
| |
| import static org.apache.marmotta.commons.http.MarmottaHttpUtils.parseContentType; |
| |
| import java.io.IOException; |
| import java.io.InputStream; |
| import java.util.ArrayList; |
| import java.util.Arrays; |
| import java.util.Date; |
| import java.util.HashSet; |
| import java.util.LinkedList; |
| import java.util.List; |
| import java.util.Queue; |
| import java.util.Set; |
| |
| import org.apache.http.Header; |
| import org.apache.http.HttpEntity; |
| import org.apache.http.HttpResponse; |
| import org.apache.http.client.ClientProtocolException; |
| import org.apache.http.client.methods.HttpGet; |
| import org.apache.http.client.utils.DateUtils; |
| import org.apache.http.util.EntityUtils; |
| import org.apache.marmotta.commons.collections.CollectionUtils; |
| import org.apache.marmotta.commons.http.ContentType; |
| import org.apache.marmotta.ldclient.api.endpoint.Endpoint; |
| import org.apache.marmotta.ldclient.api.ldclient.LDClientService; |
| import org.apache.marmotta.ldclient.api.provider.DataProvider; |
| import org.apache.marmotta.ldclient.exception.DataRetrievalException; |
| import org.apache.marmotta.ldclient.model.ClientResponse; |
| import org.openrdf.repository.Repository; |
| import org.openrdf.repository.RepositoryConnection; |
| import org.openrdf.repository.RepositoryException; |
| import org.openrdf.repository.sail.SailRepository; |
| import org.openrdf.sail.memory.MemoryStore; |
| import org.slf4j.Logger; |
| import org.slf4j.LoggerFactory; |
| |
| /** |
| * Add file description here! |
| * <p/> |
| * Author: Sebastian Schaffert |
| */ |
| public abstract class AbstractHttpProvider implements DataProvider { |
| |
| public static final int RETRY_AFTER = 60; |
| private static Logger log = LoggerFactory.getLogger(AbstractHttpProvider.class); |
| |
| /** |
| * Build the URL to use to call the webservice in order to retrieve the data for the resource passed as argument. |
| * In many cases, this will just return the URI of the resource (e.g. Linked Data), but there might be data providers |
| * that use different means for accessing the data for a resource, e.g. SPARQL or a Cache. |
| * |
| * |
| * |
| * @param resourceUri |
| * @param endpoint endpoint configuration for the data provider (optional) |
| * @return |
| */ |
| protected abstract List<String> buildRequestUrl(String resourceUri, Endpoint endpoint) throws DataRetrievalException; |
| |
| /** |
| * Parse the HTTP response entity returned by the web service call and return its contents in a Sesame RDF |
| * repository also passed as argument. The content type returned by the web service is passed as argument to help |
| * the implementation decide how to parse the data. The implementation can return a list of additional pages to |
| * retrieve for completing the data of the resource |
| * |
| * |
| * |
| * |
| * @param resourceUri |
| * @param repository an RDF repository for storing an RDF representation of the dataset located at the remote resource. |
| * @param in input stream as returned by the remote webservice |
| * @param contentType content type as returned in the HTTP headers of the remote webservice |
| * @return a possibly empty list of URLs of additional resources to retrieve to complete the content |
| * @throws java.io.IOException in case an error occurs while reading the input stream |
| */ |
| protected abstract List<String> parseResponse(String resourceUri, String requestUrl, Repository repository, InputStream in, String contentType) throws DataRetrievalException; |
| |
| /** |
| * Retrieve the data for a resource using the given http client and endpoint definition. The service is |
| * supposed to manage the connection handling itself. See {@link AbstractHttpProvider} |
| * for a generic implementation of this method. |
| * |
| * |
| * |
| * @param resource the resource to be retrieved |
| * @param endpoint the endpoint definition |
| * @return a completely specified client response, including expiry information and the set of triples |
| */ |
| @Override |
| public ClientResponse retrieveResource(String resource, LDClientService client, Endpoint endpoint) throws DataRetrievalException { |
| |
| try { |
| |
| String contentType; |
| if(endpoint != null && endpoint.getContentTypes().size() > 0) { |
| contentType = CollectionUtils.fold(endpoint.getContentTypes(), new CollectionUtils.StringSerializer<ContentType>() { |
| @Override |
| public String serialize(ContentType contentType) { |
| return contentType.toString("q"); |
| } |
| },","); |
| } else { |
| contentType = CollectionUtils.fold(Arrays.asList(listMimeTypes()), ","); |
| } |
| |
| long defaultExpires = client.getClientConfiguration().getDefaultExpiry(); |
| if(endpoint != null && endpoint.getDefaultExpiry() != null) { |
| defaultExpires = endpoint.getDefaultExpiry(); |
| } |
| |
| final ResponseHandler handler = new ResponseHandler(resource, endpoint); |
| |
| // a queue for queuing the request URLs needed to build the query response |
| Queue<String> requestUrls = new LinkedList<String>(); |
| requestUrls.addAll(buildRequestUrl(resource, endpoint)); |
| |
| Set<String> visited = new HashSet<String>(); |
| |
| String requestUrl = requestUrls.poll(); |
| while(requestUrl != null) { |
| |
| if(!visited.contains(requestUrl)) { |
| HttpGet get = new HttpGet(requestUrl); |
| try { |
| get.setHeader("Accept",contentType); |
| get.setHeader("Accept-Language", "*"); // PoolParty compatibility |
| |
| log.info("retrieving resource data for {} from '{}' endpoint, request URI is <{}>", new Object[] {resource, getName(), get.getURI().toASCIIString()}); |
| |
| handler.requestUrl = requestUrl; |
| List<String> additionalRequestUrls = client.getClient().execute(get, handler); |
| requestUrls.addAll(additionalRequestUrls); |
| |
| visited.add(requestUrl); |
| } finally { |
| get.releaseConnection(); |
| } |
| } |
| |
| requestUrl = requestUrls.poll(); |
| } |
| |
| Date expiresDate = handler.expiresDate; |
| if (expiresDate == null) { |
| expiresDate = new Date(System.currentTimeMillis() + defaultExpires * 1000); |
| } |
| |
| long min_expires = System.currentTimeMillis() + client.getClientConfiguration().getMinimumExpiry() * 1000; |
| if (expiresDate.getTime() < min_expires) { |
| log.info("expiry time returned by request lower than minimum expiration time; using minimum time instead"); |
| expiresDate = new Date(min_expires); |
| } |
| |
| if(log.isInfoEnabled()) { |
| RepositoryConnection con = handler.triples.getConnection(); |
| log.info("retrieved {} triples for resource {}; expiry date: {}",new Object[] {con.size(),resource,expiresDate}); |
| con.close(); |
| } |
| |
| ClientResponse result = new ClientResponse(handler.httpStatus, handler.triples); |
| result.setExpires(expiresDate); |
| return result; |
| } catch (RepositoryException e) { |
| log.error("error while initialising Sesame repository; classpath problem?",e); |
| throw new DataRetrievalException("error while initialising Sesame repository; classpath problem?",e); |
| } catch (ClientProtocolException e) { |
| log.error("HTTP client error while trying to retrieve resource {}: {}", resource, e.getMessage()); |
| throw new DataRetrievalException("I/O error while trying to retrieve resource "+resource,e); |
| } catch (IOException e) { |
| log.error("I/O error while trying to retrieve resource {}: {}", resource, e.getMessage()); |
| throw new DataRetrievalException("I/O error while trying to retrieve resource "+resource,e); |
| } catch(RuntimeException ex) { |
| log.error("Unknown error while trying to retrieve resource {}: {}", resource, ex.getMessage()); |
| throw new DataRetrievalException("Unknown error while trying to retrieve resource "+resource,ex); |
| } |
| |
| } |
| |
| /** |
| * Check whether the content type returned by the server is acceptable to the endpoint and data provider |
| */ |
| protected boolean isValidContentType(String contentType, Endpoint endpoint) { |
| if(endpoint != null && endpoint.getContentTypes().size() > 0) { |
| ContentType parsed = parseContentType(contentType); |
| for(ContentType valid : endpoint.getContentTypes()) { |
| if(valid.matches(parsed) || valid.matchesWildcard(parsed)) { |
| return true; |
| } |
| } |
| return false; |
| } else { |
| // TODO: should probably be removed, since it is not used |
| for(String type : listMimeTypes()) { |
| if(type.split(";")[0].equalsIgnoreCase(contentType)) return true; |
| } |
| return false; |
| } |
| } |
| |
| private class ResponseHandler implements org.apache.http.client.ResponseHandler<List<String>> { |
| |
| private Date expiresDate; |
| |
| private String requestUrl; |
| |
| // the repository where the triples will be stored in case the data providers return them |
| private final Repository triples; |
| |
| private final Endpoint endpoint; |
| |
| private final String resource; |
| |
| private int httpStatus; |
| |
| public ResponseHandler(String resource, Endpoint endpoint) throws RepositoryException { |
| this.resource = resource; |
| this.endpoint = endpoint; |
| |
| triples = new SailRepository(new MemoryStore()); |
| triples.initialize(); |
| } |
| |
| @Override |
| public List<String> handleResponse(HttpResponse response) throws ClientProtocolException, IOException { |
| ArrayList<String> requestUrls = new ArrayList<String>(); |
| |
| if (response.getStatusLine().getStatusCode() >= 200 && response.getStatusLine().getStatusCode() < 400) { |
| final HttpEntity entity = response.getEntity(); |
| if (entity == null) |
| throw new IOException("no content returned by Linked Data resource " + resource); |
| |
| if (!isValidContentType(entity.getContentType().getValue().split(";")[0], endpoint)) { |
| // FIXME: here was get.abort() |
| throw new IOException("invalid content returned by Linked Data resource " + resource + ": " |
| + entity.getContentType().getValue()); |
| } |
| |
| this.httpStatus = response.getStatusLine().getStatusCode(); |
| |
| if (entity != null) { |
| String parseContentType = "application/rdf+xml"; |
| if (endpoint != null && "SPARQL".equals(endpoint.getType())) { |
| parseContentType = "application/sparql-results+xml"; |
| } else if (entity.getContentType() != null) { |
| parseContentType = entity.getContentType().getValue().split(";")[0]; |
| } |
| |
| InputStream in = entity.getContent(); |
| try { |
| |
| List<String> urls = parseResponse(resource, requestUrl, triples, in, parseContentType); |
| requestUrls.addAll(urls); |
| |
| if (expiresDate == null) { |
| Header expires = response.getFirstHeader("Expires"); |
| if (expires != null) { |
| expiresDate = DateUtils.parseDate(expires.getValue()); |
| } |
| } |
| |
| } catch (DataRetrievalException e) { |
| // FIXME: get.abort(); |
| throw new IOException(e); |
| } finally { |
| in.close(); |
| } |
| } |
| EntityUtils.consume(entity); |
| } else if(response.getStatusLine().getStatusCode() == 500 || response.getStatusLine().getStatusCode() == 503 || response.getStatusLine().getStatusCode() == 504) { |
| this.httpStatus = response.getStatusLine().getStatusCode(); |
| |
| Header retry = response.getFirstHeader("Retry-After"); |
| if(retry != null) { |
| try { |
| int duration = Integer.parseInt(retry.getValue()); |
| expiresDate = new Date(System.currentTimeMillis() + duration*1000); |
| } catch(NumberFormatException ex) { |
| log.debug("error parsing Retry-After: header"); |
| } |
| } else { |
| expiresDate = new Date(System.currentTimeMillis() + RETRY_AFTER *1000); |
| } |
| |
| } else { |
| log.error("the HTTP request failed (status: {})", response.getStatusLine()); |
| throw new ClientProtocolException("the HTTP request failed (status: " + response.getStatusLine() + ")"); |
| } |
| |
| return requestUrls; |
| } |
| |
| } |
| |
| } |