| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.stanbol.enhancer.it; |
| |
| import static org.apache.stanbol.enhancer.it.MultipartContentItemTestUtils.buildPathWithParams; |
| import static org.apache.stanbol.enhancer.it.MultipartContentItemTestUtils.getHTMLContent; |
| import static org.apache.stanbol.enhancer.servicesapi.rdf.OntologicalClasses.DBPEDIA_ORGANISATION; |
| import static org.apache.stanbol.enhancer.servicesapi.rdf.OntologicalClasses.DBPEDIA_PERSON; |
| import static org.apache.stanbol.enhancer.servicesapi.rdf.OntologicalClasses.DBPEDIA_PLACE; |
| |
| import java.io.ByteArrayOutputStream; |
| import java.io.IOException; |
| import java.io.OutputStream; |
| import java.nio.charset.Charset; |
| import java.util.LinkedHashMap; |
| import java.util.Map; |
| import java.util.Map.Entry; |
| |
| import org.apache.clerezza.commons.rdf.BlankNode; |
| import org.apache.clerezza.commons.rdf.Graph; |
| import org.apache.clerezza.commons.rdf.RDFTerm; |
| import org.apache.clerezza.commons.rdf.IRI; |
| import org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl; |
| import org.apache.clerezza.commons.rdf.impl.utils.simple.SimpleGraph; |
| import org.apache.clerezza.commons.rdf.impl.utils.TripleImpl; |
| import org.apache.clerezza.rdf.core.serializedform.Serializer; |
| import org.apache.clerezza.rdf.core.serializedform.SupportedFormat; |
| import org.apache.clerezza.rdf.ontologies.RDF; |
| import org.apache.http.HttpEntity; |
| import org.apache.http.entity.ContentType; |
| import org.apache.http.entity.mime.MIME; |
| import org.apache.http.entity.mime.MultipartEntityBuilder; |
| import org.apache.http.entity.mime.content.AbstractContentBody; |
| import org.apache.http.entity.mime.content.ContentBody; |
| import org.apache.http.entity.mime.content.ContentDescriptor; |
| import org.apache.http.entity.mime.content.StringBody; |
| import org.apache.http.message.BasicNameValuePair; |
| import org.apache.stanbol.enhancer.servicesapi.ContentItem; |
| import org.apache.stanbol.enhancer.servicesapi.EnhancementEngine; |
| import org.apache.stanbol.enhancer.servicesapi.helper.EnhancementEngineHelper; |
| import org.apache.stanbol.enhancer.servicesapi.rdf.OntologicalClasses; |
| import org.apache.stanbol.enhancer.servicesapi.rdf.Properties; |
| import org.apache.stanbol.enhancer.servicesapi.rdf.TechnicalClasses; |
| import org.junit.Assert; |
| import org.junit.Test; |
| import org.slf4j.Logger; |
| import org.slf4j.LoggerFactory; |
| |
| /** |
| * This tests RESTful API extensions to the Stanbol Enhancer as described by |
| * STANBOL-481 |
| */ |
| public class MultipartRequestTest extends EnhancerTestBase { |
| private static final Charset UTF8 = Charset.forName("UTF-8"); |
| private static Serializer serializer = Serializer.getInstance(); |
| |
| private static final Logger log = LoggerFactory.getLogger(MultipartRequestTest.class); |
| |
| private final String TEXT_CONTENT = "The Apache Stanbol Enhancer.\n" + |
| "The Stanbol enhancer can detect famous cities such as Paris and " + |
| "people such as Bob Marley."; |
| private final String[] TEXT_CONTENT_LINES = TEXT_CONTENT.split("\n"); |
| private final String HTML_CONTENT = getHTMLContent(TEXT_CONTENT_LINES); |
| private final String[] HTML_CONTENT_LINES = HTML_CONTENT.split("\n"); |
| public MultipartRequestTest() { |
| super(); //use the default endpoint |
| } |
| |
| @Test |
| public void testIllegalRdfFormat() throws IOException { |
| String[] params = new String []{ |
| "outputContent","*/*", |
| "rdfFormat","notAvalidMimeFormat"}; |
| executor.execute( |
| builder.buildPostRequest(buildPathWithParams(getEndpoint(), params)) |
| .withHeader("Accept","multipart/from-data") |
| .withHeader("Content-Type", "text/html; charset=UTF-8") |
| .withContent(HTML_CONTENT) |
| ) |
| .assertStatus(400); //BAD_REQUEST |
| } |
| @Test |
| public void testIllegalOutputContent() throws IOException { |
| String[] params = new String []{ |
| "outputContent","notAvalidMimeFormat"}; |
| executor.execute( |
| builder.buildPostRequest(buildPathWithParams(getEndpoint(), params)) |
| .withHeader("Accept","multipart/from-data") |
| .withHeader("Content-Type", "text/html; charset=UTF-8") |
| .withContent(HTML_CONTENT) |
| ) |
| .assertStatus(400); //BAD_REQUEST |
| } |
| @Test |
| public void testDefaultContentTypes() throws IOException { |
| //'*/*', 'text/plain' and 'application/octet-stream' where considered as |
| // Indicators that the default RDF serialisation format for the metadata |
| // should be used. |
| //This is basically testing backward compatibility |
| |
| String[] jsonLDMetadataTests = new String[]{ |
| "\"@id\" : \"http://dbpedia.org/resource/Paris\",", |
| "\"@id\" : \"http://dbpedia.org/resource/Bob_Marley\",", |
| "\"http://purl.org/dc/terms/creator\" : [ {", |
| "\"@value\" : \"org.apache.stanbol.enhancer.engines.entitytagging.impl.NamedEntityTaggingEngine\"" |
| }; |
| String[] params = new String []{ |
| "outputContent","text/plain" |
| }; |
| executor.execute( |
| builder.buildPostRequest(buildPathWithParams(getEndpoint(), params)) |
| .withHeader("Accept","text/plain") //must be multipart/from-data |
| .withHeader("Content-Type", "text/html; charset=UTF-8") |
| .withContent(HTML_CONTENT) |
| ) |
| .assertStatus(200) //metadata as JSONLD |
| .assertContentContains(jsonLDMetadataTests); |
| |
| params = new String []{ |
| "outputContent","application/octet-stream"}; //omitMetadata=false |
| executor.execute( |
| builder.buildPostRequest(buildPathWithParams(getEndpoint(), params)) |
| .withHeader("Accept","text/plain") //must be multipart/from-data |
| .withHeader("Content-Type", "text/html; charset=UTF-8") |
| .withContent(HTML_CONTENT) |
| ) |
| .assertStatus(200) //metadata as JSONLD |
| .assertContentContains(jsonLDMetadataTests); |
| |
| params = new String []{ |
| "outputContent","application/octet-stream"}; //omitMetadata=false |
| executor.execute( |
| builder.buildPostRequest(buildPathWithParams(getEndpoint(), params)) |
| .withHeader("Accept","text/plain") //must be multipart/from-data |
| .withHeader("Content-Type", "text/html; charset=UTF-8") |
| .withContent(HTML_CONTENT) |
| ) |
| .assertStatus(200) //metadata as JSONLD |
| .assertContentContains(jsonLDMetadataTests); |
| } |
| |
| @Test |
| public void testOutputMetadataAndAllContent() throws IOException { |
| String[] params = new String []{ |
| "outputContent","*/*", |
| "rdfFormat","text/rdf+nt"}; |
| String content = executor.execute( |
| builder.buildPostRequest(buildPathWithParams(getEndpoint(), params)) |
| .withHeader("Accept","multipart/from-data") |
| .withHeader("Content-Type", "text/html; charset=UTF-8") |
| .withContent(HTML_CONTENT) |
| ) |
| .assertStatus(200) |
| .assertContentContains( |
| "--contentItem", |
| "Content-Disposition: form-data; name=\"content\"", |
| "Content-Type: multipart/alternate; boundary=contentParts-", |
| "Content-Type: text/plain; charset=UTF-8", |
| "Content-Type: text/html", |
| "--contentParts") |
| .assertContentContains(TEXT_CONTENT_LINES) |
| .assertContentContains(HTML_CONTENT_LINES) //line by line the HTML content |
| .assertContentRegexp( |
| "Content-Disposition: form-data; name=\"metadata\"; filename=.*", |
| "Content-Disposition: form-data; name=\"urn:tika:text:.*", |
| "Content-Disposition: form-data; name=\"urn:content-item-sha1-.*", |
| "--contentItem-.*--", |
| "--contentParts-.*--", |
| //and the expected enhancements in the metadata |
| "http://purl.org/dc/terms/creator.*LanguageDetectionEnhancementEngine", |
| "http://purl.org/dc/terms/language.*en", |
| "http://fise.iks-project.eu/ontology/entity-label.*Paris", |
| "http://purl.org/dc/terms/creator.*org.apache.stanbol.enhancer.engines.opennlp.*NamedEntityExtractionEnhancementEngine", |
| "http://fise.iks-project.eu/ontology/entity-label.*Bob Marley") |
| .getContent(); |
| log.debug("Content:\n{}\n",content); |
| |
| } |
| @Test |
| public void testOutputAllContentOmitMetadata() throws IOException { |
| String[] params = new String []{ |
| "outputContent","*/*", |
| "omitMetadata","true", |
| "rdfFormat","text/rdf+nt"}; |
| String content = executor.execute( |
| builder.buildPostRequest(buildPathWithParams(getEndpoint(), params)) |
| .withHeader("Accept","multipart/from-data") |
| .withHeader("Content-Type", "text/html; charset=UTF-8") |
| .withContent(HTML_CONTENT) |
| ) |
| .assertStatus(200) |
| .assertContentContains( |
| "--contentItem", |
| "Content-Disposition: form-data; name=\"content\"", |
| "Content-Type: multipart/alternate; boundary=contentParts-", |
| "Content-Type: text/plain; charset=UTF-8", |
| "Content-Type: text/html", |
| "--contentParts") |
| .assertContentContains(TEXT_CONTENT_LINES) |
| .assertContentContains(HTML_CONTENT_LINES) //line by line the HTML content |
| .assertContentRegexp( //MUST contain |
| "--contentItem-.*--", |
| "--contentParts-.*--", |
| "Content-Disposition: form-data; name=\"urn:tika:text:.*", |
| "Content-Disposition: form-data; name=\"urn:content-item-sha1-.*") |
| .assertContentRegexp(false, //MUST NOT contain |
| "Content-Disposition: form-data; name=\"metadata\"; filename=.*", |
| //and the expected enhancements in the metadata |
| "http://purl.org/dc/terms/creator.*LanguageDetectionEnhancementEngine", |
| "http://purl.org/dc/terms/language.*en", |
| "http://fise.iks-project.eu/ontology/entity-label.*Paris", |
| "http://purl.org/dc/terms/creator.*org.apache.stanbol.enhancer.engines.opennlp.*NamedEntityExtractionEnhancementEngine", |
| "http://fise.iks-project.eu/ontology/entity-label.*Bob Marley") |
| .getContent(); |
| log.debug("Content:\n{}\n",content); |
| |
| } |
| |
| @Test |
| public void testOutputPlainTextContent() throws IOException { |
| String[] params = new String []{ |
| "outputContent","text/plain", |
| "rdfFormat","text/rdf+nt"}; |
| String content = executor.execute( |
| builder.buildPostRequest(buildPathWithParams(getEndpoint(), params)) |
| .withHeader("Accept","multipart/from-data") |
| .withHeader("Content-Type", "text/html; charset=UTF-8") |
| .withContent(HTML_CONTENT) |
| ) |
| .assertStatus(200) |
| .assertContentContains( |
| "--contentItem", |
| "Content-Disposition: form-data; name=\"content\"", |
| "Content-Type: multipart/alternate; boundary=contentParts-", |
| "Content-Type: text/plain; charset=UTF-8", |
| "--contentParts") |
| .assertContentContains(TEXT_CONTENT_LINES) |
| .assertContentRegexp( |
| "--contentItem-.*--", |
| "--contentParts-.*--", |
| "Content-Disposition: form-data; name=\"metadata\"; filename=.*", |
| "Content-Disposition: form-data; name=\"urn:tika:text:.*", |
| //and the expected enhancements in the metadata |
| "http://purl.org/dc/terms/creator.*LanguageDetectionEnhancementEngine", |
| "http://purl.org/dc/terms/language.*en", |
| "http://fise.iks-project.eu/ontology/entity-label.*Paris", |
| "http://purl.org/dc/terms/creator.*org.apache.stanbol.enhancer.engines.opennlp.*NamedEntityExtractionEnhancementEngine", |
| "http://fise.iks-project.eu/ontology/entity-label.*Bob Marley") |
| .getContent(); |
| log.debug("Content:\n{}\n",content); |
| |
| } |
| @Test |
| public void testOutputContentOtherThanParsed() throws IOException { |
| //metadata and text content |
| String[] params = new String []{ |
| "outputContent","*/*", |
| "omitParsed","true", |
| "rdfFormat","text/rdf+nt"}; |
| String content = executor.execute( |
| builder.buildPostRequest(buildPathWithParams(getEndpoint(), params)) |
| .withHeader("Accept","multipart/from-data") |
| .withHeader("Content-Type", "text/html; charset=UTF-8") |
| .withContent(HTML_CONTENT) |
| ) |
| .assertStatus(200) |
| .assertContentContains( |
| "--contentItem", |
| "Content-Disposition: form-data; name=\"content\"", |
| "Content-Type: multipart/alternate; boundary=contentParts-", |
| "Content-Type: text/plain; charset=UTF-8", |
| "--contentParts") |
| .assertContentContains(TEXT_CONTENT_LINES) |
| .assertContentRegexp( |
| "--contentItem-.*--", |
| "--contentParts-.*--", |
| "Content-Disposition: form-data; name=\"metadata\"; filename=.*", |
| "Content-Disposition: form-data; name=\"urn:tika:text:.*", |
| //and the expected enhancements in the metadata |
| "http://purl.org/dc/terms/creator.*LanguageDetectionEnhancementEngine", |
| "http://purl.org/dc/terms/language.*en", |
| "http://fise.iks-project.eu/ontology/entity-label.*Paris", |
| "http://purl.org/dc/terms/creator.*org.apache.stanbol.enhancer.engines.opennlp.*NamedEntityExtractionEnhancementEngine", |
| "http://fise.iks-project.eu/ontology/entity-label.*Bob Marley") |
| .getContent(); |
| log.debug("Content:\n{}\n",content); |
| } |
| @Test |
| public void testOutputContentPart() throws IOException { |
| String[] params = new String []{ |
| "outputContentPart","http://stanbol.apache.org/ontology/enhancer/executionmetadata#ChainExecution", |
| "omitMetadata","true", |
| "rdfFormat","application/rdf+xml"}; |
| String content = executor.execute( |
| builder.buildPostRequest(buildPathWithParams(getEndpoint(), params)) |
| .withHeader("Accept","multipart/from-data") |
| .withHeader("Content-Type", "text/plain; charset=UTF-8") |
| .withContent(TEXT_CONTENT) |
| ) |
| .assertStatus(200) |
| .assertContentContains( |
| "--contentItem", |
| "Content-Disposition: form-data; name=\"http://stanbol.apache.org/ontology/enhancer/executionmetadata#ChainExecution\"", |
| "Content-Type: application/rdf+xml; charset=UTF-8", |
| "<rdf:type rdf:resource=\"http://stanbol.apache.org/ontology/enhancer/executionplan#ExecutionPlan\"/>", |
| "<rdf:type rdf:resource=\"http://stanbol.apache.org/ontology/enhancer/executionplan#ExecutionNode\"/>", |
| "<rdf:type rdf:resource=\"http://stanbol.apache.org/ontology/enhancer/executionmetadata#EngineExecution\"/>", |
| "<rdf:type rdf:resource=\"http://stanbol.apache.org/ontology/enhancer/executionmetadata#ChainExecution\"/>") |
| .assertContentRegexp("--contentItem-.*--") |
| .getContent(); |
| log.debug("Content:\n{}\n",content); |
| } |
| /** |
| * This uploads the HTML as well as the plain text version of an content. |
| * This allows it CMS to parse already available alternate content versions |
| * in a single request. Stanbol can than still use the original content |
| * (e.g. to extract metadata) but other engines that require the alternate |
| * version (e.g. plain text version) of an document will directly use the |
| * parsed version .<p> |
| * This UnitTest ensures this by adding a "secret" extension the to plain |
| * text version and than checks if the two entities mentioned in that |
| * part are included in the extracted entities. |
| * @throws IOException |
| */ |
| @Test |
| public void testUploadMultipleContents() throws IOException { |
| //It is a secret, that Berlin is the capital of Germany |
| String extraTextConent = TEXT_CONTENT + |
| "\nIt is a secret, that the city of Berlin is the capital of Germany since 1990."; |
| //The multipartBuilder used to construct the contentItem for the contentItem |
| MultipartEntityBuilder ciBuilder = MultipartEntityBuilder.create(); |
| String boundary = "contentItem-47jjksnbue73fnis"; |
| ciBuilder.setBoundary(boundary); |
| //use a small extension to deal with multipart/alternate |
| Map<String, ContentBody> alternates = new LinkedHashMap<String,ContentBody>(); |
| alternates.put("http://www.example.com/test.html", |
| new StringBody(HTML_CONTENT, ContentType.TEXT_HTML.withCharset(UTF8))); |
| alternates.put("http://www.example.com/test.txt", |
| new StringBody(extraTextConent, ContentType.TEXT_PLAIN.withCharset(UTF8))); |
| ciBuilder.addPart("content", |
| new MultipartContentBody(alternates, "contentParts", |
| ContentType.create("multipart/alternate"))); |
| |
| String receivedContent = executor.execute( |
| builder.buildPostRequest(getEndpoint()) |
| .withHeader("Accept","text/rdf+nt") |
| .withEntity(ciBuilder.build()) |
| ) |
| .assertStatus(200) |
| .assertContentRegexp( |
| //and the expected enhancements in the metadata |
| "http://purl.org/dc/terms/creator.*LanguageDetectionEnhancementEngine", |
| "http://purl.org/dc/terms/language.*en", |
| "http://fise.iks-project.eu/ontology/entity-label.*Paris", |
| "http://purl.org/dc/terms/creator.*org.apache.stanbol.enhancer.engines.opennlp.*NamedEntityExtractionEnhancementEngine", |
| "http://fise.iks-project.eu/ontology/entity-label.*Bob Marley", |
| //check also for expeted entities extracted from the secret Text part! |
| "http://fise.iks-project.eu/ontology/entity-label.*Berlin", |
| "http://fise.iks-project.eu/ontology/entity-label.*Germany") |
| .getContent(); |
| log.debug("Content:\n{}\n",receivedContent); |
| |
| } |
| |
| |
| @Test |
| public void testContentBeforeMetadata() throws IOException{ |
| final IRI contentItemId = new IRI("http://www.example.com/test.html"); |
| String rdfContentType = SupportedFormat.RDF_XML; |
| String rdfContent = getDummyRdfMetadata(contentItemId, rdfContentType); |
| MultipartEntityBuilder ciBuilder = MultipartEntityBuilder.create(); |
| ciBuilder.addTextBody("content",HTML_CONTENT,ContentType.TEXT_HTML.withCharset(UTF8)); |
| ciBuilder.addTextBody("metadata", rdfContent, ContentType.create(rdfContentType,UTF8)); |
| String receivedContent = executor.execute( |
| builder.buildPostRequest(getEndpoint()) |
| .withHeader("Accept","text/rdf+nt") |
| .withEntity(ciBuilder.build()) |
| ) |
| .assertStatus(400) //BAD request |
| .getContent(); |
| //check also the error message |
| Assert.assertTrue(receivedContent.contains( |
| "The Multipart MIME part with the 'metadata' MUST BE before the " + |
| "MIME part containing the 'content'")); |
| } |
| @Test |
| public void testMissingContent() throws IOException{ |
| final IRI contentItemId = new IRI("http://www.example.com/test.html"); |
| String rdfContentType = SupportedFormat.RDF_XML; |
| String rdfContent = getDummyRdfMetadata(contentItemId, rdfContentType); |
| MultipartEntityBuilder ciBuilder = MultipartEntityBuilder.create(); |
| ciBuilder.addTextBody("metadata", rdfContent, ContentType.create(rdfContentType,UTF8)); |
| |
| String receivedContent = executor.execute( |
| builder.buildPostRequest(getEndpoint()) |
| .withHeader("Accept","text/rdf+nt") |
| .withEntity(ciBuilder.build()) |
| ) |
| .assertStatus(400) //BAD request |
| .getContent(); |
| //check also the error message |
| Assert.assertTrue(receivedContent.contains( |
| "The parsed multipart content item does not contain any content.")); |
| } |
| |
| /** |
| * @param contentItemId |
| * @param rdfContentType |
| * @return |
| */ |
| private String getDummyRdfMetadata(final IRI contentItemId, String rdfContentType) { |
| Graph metadata = new SimpleGraph(); |
| metadata.add(new TripleImpl(new BlankNode(), Properties.ENHANCER_EXTRACTED_FROM, contentItemId)); |
| ByteArrayOutputStream out = new ByteArrayOutputStream(); |
| serializer.serialize(out, metadata, rdfContentType); |
| String rdfContent = new String(out.toByteArray(),UTF8); |
| return rdfContent; |
| } |
| |
| /** |
| * Stanbol also supports to upload pre-existing metadata with the content. |
| * This UnitTest uses an example that parsed TextAnnotations for free text |
| * tags provided by users that are than linked to Entities in DBPedia |
| * @throws IOException |
| */ |
| @Test |
| public void testUploadWithMetadata() throws IOException { |
| //create the metadata |
| RDFTerm user = new PlainLiteralImpl("Rupert Westenthaler"); |
| final IRI contentItemId = new IRI("http://www.example.com/test.html"); |
| Graph metadata = new SimpleGraph(); |
| addTagAsTextAnnotation(metadata, contentItemId, |
| "Germany",DBPEDIA_PLACE, user); |
| addTagAsTextAnnotation(metadata, contentItemId, |
| "Europe",DBPEDIA_PLACE, user); |
| addTagAsTextAnnotation(metadata, contentItemId, |
| "NATO",DBPEDIA_ORGANISATION, user); |
| addTagAsTextAnnotation(metadata, contentItemId, |
| "Silvio Berlusconi",DBPEDIA_PERSON, user); |
| |
| String rdfContentType = SupportedFormat.RDF_XML; |
| |
| ByteArrayOutputStream out = new ByteArrayOutputStream(); |
| serializer.serialize(out, metadata, rdfContentType); |
| String rdfContent = new String(out.toByteArray(),UTF8); |
| |
| MultipartEntityBuilder ciBuilder = MultipartEntityBuilder.create(); |
| //add the metadata |
| /* |
| * NOTE: We need here to override the getFilename, because this MUST |
| * BE the URI of the ContentItem. This is important, because the |
| * Metadata do contain triples about that ContentItem and therefore |
| * it MUST BE assured that the URI of the ContentItem created by |
| * the Stanbol Enhancer is the same of as the URI used in the |
| * Metadata! |
| */ |
| ciBuilder.addPart("metadata", |
| new StringBody(rdfContent, ContentType.create(rdfContentType).withCharset(UTF8)){ |
| @Override |
| public String getFilename() { //The filename MUST BE the |
| return contentItemId.getUnicodeString(); //uri of the ContentItem |
| } |
| }); |
| //add the content |
| ciBuilder.addTextBody("content", HTML_CONTENT, ContentType.TEXT_HTML.withCharset(UTF8)); |
| //send the request |
| String receivedContent = executor.execute( |
| builder.buildPostRequest(getEndpoint()) |
| .withHeader("Accept","text/rdf+nt") |
| .withEntity(ciBuilder.build()) |
| ) |
| .assertStatus(200) |
| .assertContentRegexp( |
| //and the expected enhancements based on the parsed content |
| "http://purl.org/dc/terms/creator.*LanguageDetectionEnhancementEngine", |
| "http://purl.org/dc/terms/language.*en", |
| "http://fise.iks-project.eu/ontology/entity-label.*Paris", |
| "http://purl.org/dc/terms/creator.*org.apache.stanbol.enhancer.engines.opennlp.*NamedEntityExtractionEnhancementEngine", |
| "http://fise.iks-project.eu/ontology/entity-label.*Bob Marley", |
| //additional enhancements based on parsed metadata |
| "http://fise.iks-project.eu/ontology/entity-reference.*http://dbpedia.org/resource/Germany.*", |
| "http://fise.iks-project.eu/ontology/entity-reference.*http://dbpedia.org/resource/NATO.*", |
| "http://fise.iks-project.eu/ontology/entity-reference.*http://dbpedia.org/resource/Silvio_Berlusconi.*", |
| "http://fise.iks-project.eu/ontology/entity-reference.*http://dbpedia.org/resource/Europe.*") |
| .getContent(); |
| log.debug("Content:\n{}\n",receivedContent); |
| } |
| /** |
| * Utility that creates an {@link TechnicalClasses#ENHANCER_TEXTANNOTATION TextAnnotation} |
| * for the parsed contentItem, free text tag an user. |
| * @param graph the grpah to add the information |
| * @param contentItem the {@link ContentItem#getUri() uri} of the {@link ContentItem} |
| * @param tag the free text tag for the document |
| * @param tagType the type of the tag. Typically Stanbol supports: <ul> |
| * <li>{@link OntologicalClasses#DBPEDIA_PERSON} |
| * <li>{@link OntologicalClasses#DBPEDIA_ORGANISATION} |
| * <li>{@link OntologicalClasses#DBPEDIA_PLACE} |
| * </ul> |
| * But specific {@link EnhancementEngine}s might also process other types |
| * or even TextAnnotations without an type |
| * @param user the user that created the tag |
| * @return the uri of the created annotation |
| */ |
| private static final IRI addTagAsTextAnnotation(Graph graph, IRI contentItem, |
| String tag, IRI tagType, RDFTerm user){ |
| IRI ta = new IRI("urn:user-annotation:"+EnhancementEngineHelper.randomUUID()); |
| graph.add(new TripleImpl(ta, RDF.type, TechnicalClasses.ENHANCER_TEXTANNOTATION)); |
| graph.add(new TripleImpl(ta, Properties.ENHANCER_EXTRACTED_FROM,contentItem)); |
| if(tagType != null){ |
| graph.add(new TripleImpl(ta, Properties.DC_TYPE, tagType)); |
| } |
| graph.add(new TripleImpl(ta, Properties.ENHANCER_SELECTED_TEXT, new PlainLiteralImpl(tag))); |
| graph.add(new TripleImpl(ta, RDF.type, TechnicalClasses.ENHANCER_ENHANCEMENT)); |
| if(user != null){ |
| graph.add(new TripleImpl(ta, Properties.DC_CREATOR,user)); |
| } |
| return ta; |
| } |
| /** |
| * Supports sending multipart mime as {@link ContentBody}. |
| * @author Rupert Westenthaler |
| * |
| */ |
| private class MultipartContentBody extends AbstractContentBody implements ContentBody,ContentDescriptor { |
| |
| private Map<String,ContentBody> parts; |
| private String boundary; |
| |
| public MultipartContentBody(Map<String,ContentBody> parts, String boundary, ContentType contentType){ |
| super(ContentType.create(contentType.getMimeType(), new BasicNameValuePair("boundary",boundary))); |
| this.parts = parts; |
| this.boundary = boundary; |
| } |
| |
| @Override |
| public String getTransferEncoding() { |
| return MIME.ENC_8BIT; |
| } |
| |
| @Override |
| public long getContentLength() { |
| //not known as we would need to count the content length AND |
| //the length of the different mime headers. |
| return -1; |
| } |
| |
| @Override |
| public String getFilename() { |
| return null; |
| } |
| |
| @Override |
| public void writeTo(OutputStream out) throws IOException { |
| MultipartEntityBuilder builder = MultipartEntityBuilder.create(); |
| builder.setBoundary(boundary); |
| for(Entry<String,ContentBody> part : parts.entrySet()){ |
| builder.addPart(part.getKey(), part.getValue()); |
| } |
| HttpEntity entity = builder.build(); |
| entity.writeTo(out); |
| } |
| |
| } |
| |
| } |