blob: 8f69aedabf86b511feb9bae9d536a78d9de9ed00 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.stanbol.enhancer.engines.celi.classification.impl;
import java.io.BufferedWriter;
import java.io.Closeable;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStreamWriter;
import java.net.HttpURLConnection;
import java.net.URL;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Vector;
import javax.xml.soap.MessageFactory;
import javax.xml.soap.SOAPBody;
import javax.xml.soap.SOAPException;
import javax.xml.soap.SOAPMessage;
import javax.xml.soap.SOAPPart;
import javax.xml.transform.stream.StreamSource;
import org.apache.clerezza.rdf.core.UriRef;
import org.apache.clerezza.rdf.core.impl.util.Base64;
import org.apache.commons.lang.StringEscapeUtils;
import org.apache.stanbol.enhancer.engines.celi.utils.Utils;
import org.apache.stanbol.enhancer.servicesapi.rdf.NamespaceEnum;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Element;
import org.w3c.dom.NodeList;
/**
 * Client for the CELI classification SOAP service.
 * <p>
 * The client POSTs a <code>clas:classify</code> request containing the text
 * and the model (language) to the configured service endpoint and converts
 * the <code>return</code> elements of the SOAP response into {@link Concept}s.
 * All state is assigned in the constructor and never modified afterwards.
 */
public class ClassificationClientHTTP {

    private static final Logger log = LoggerFactory.getLogger(ClassificationClientHTTP.class);

    //NOTE: Defining charset, content-type and SOAP prefix/suffix as
    //      constants makes it easier to configure those things

    /**
     * The UTF-8 {@link Charset}
     */
    private static final Charset UTF8 = Charset.forName("UTF-8");

    /**
     * The content type "text/xml; charset={@link #UTF8}"
     */
    private static final String CONTENT_TYPE = "text/xml; charset=" + UTF8.name();

    /**
     * The XML declaration (version, encoding), the opening SOAP envelope
     * (including the namespace declarations), the empty SOAP header and the
     * opening element of the SOAP body. The request specific elements are
     * written by {@link #writeRequest(HttpURLConnection, String, String)}.
     */
    private static final String SOAP_PREFIX = "<?xml version=\"1.0\" encoding=\"" + UTF8.name() + "\"?>"
            + "<soapenv:Envelope xmlns:soapenv=\"http://schemas.xmlsoap.org/soap/envelope/\" "
            + "xmlns:clas=\"http://linguagrid.org/v20110204/classification\"><soapenv:Header/><soapenv:Body>";

    /**
     * Closes the SOAP body and the SOAP envelope opened by {@link #SOAP_PREFIX}
     */
    private static final String SOAP_SUFFIX = "</soapenv:Body></soapenv:Envelope>";

    /**
     * The maximum number of <code>return</code> elements of a response that
     * are converted to {@link Concept}s.
     */
    //TODO: This should be configurable
    private static final int MAX_RESULTS_TO_RETURN = 3;

    private final URL serviceEP;
    private final String licenseKey;
    private final int conTimeout;
    private final Map<String,String> requestHeaders;

    /**
     * Creates a client for the parsed service endpoint.
     * @param serviceUrl the URL of the classification service
     * @param licenseKey the license key. If not <code>null</code> it is sent
     * Base64 encoded as "Basic" <code>Authorization</code> header with every
     * request.
     * @param conTimeout the connection timeout as passed to
     * {@link Utils#createPostRequest} for every request
     */
    public ClassificationClientHTTP(URL serviceUrl, String licenseKey, int conTimeout) {
        this.serviceEP = serviceUrl;
        this.licenseKey = licenseKey;
        this.conTimeout = conTimeout;
        Map<String,String> headers = new HashMap<String,String>();
        headers.put("Content-Type", CONTENT_TYPE);
        if (licenseKey != null) {
            String encoded = Base64.encode(this.licenseKey.getBytes(UTF8));
            headers.put("Authorization", "Basic " + encoded);
        }
        this.requestHeaders = Collections.unmodifiableMap(headers);
    }

    /**
     * Sends the parsed text to the classification service and converts the
     * response to {@link Concept}s. At most {@link #MAX_RESULTS_TO_RETURN}
     * <code>return</code> elements of the response are processed. Results
     * without a usable label or score are skipped (and logged) instead of
     * failing the whole request.
     * @param text the text to classify. For <code>null</code> or an empty
     * text an empty list is returned without calling the service.
     * @param lang the value sent as <code>clas:model</code>. MUST NOT be
     * <code>null</code>.
     * @return the extracted concepts (an empty list if there are none)
     * @throws IllegalArgumentException if a text but no <code>lang</code>
     * is parsed
     * @throws IOException on any error while sending the request or reading
     * the response
     * @throws SOAPException if the response can not be processed as a SOAP
     * message
     */
    public List<Concept> extractConcepts(String text, String lang) throws IOException, SOAPException {
        if (text == null || text.isEmpty()) {
            //no text -> no classification
            return Collections.emptyList();
        }
        if (lang == null) {
            //fail before the connection is opened
            throw new IllegalArgumentException("The language (classification model) MUST NOT be NULL!");
        }
        //create the POST request
        HttpURLConnection con = Utils.createPostRequest(serviceEP, requestHeaders, conTimeout);
        writeRequest(con, text, lang);
        //Call the service
        long start = System.currentTimeMillis();
        InputStream stream = con.getInputStream();
        try {
            log.debug("Request to {} took {}ms", serviceEP, System.currentTimeMillis() - start);
            MessageFactory msgFactory = MessageFactory.newInstance();
            SOAPMessage message = msgFactory.createMessage();
            SOAPPart soapPart = message.getSOAPPart();
            // Set contents of message
            soapPart.setContent(new StreamSource(stream));
            //NOTE: the concepts are read before the stream is closed, so
            //      that an implementation parsing lazily still works
            return readConcepts(message.getSOAPBody());
        } finally {
            //always release the response stream
            closeQuietly(stream);
        }
    }

    /**
     * "Streams" the SOAP request directly to the output stream of the parsed
     * connection. Both the model and the text are XML escaped. The writer is
     * closed even if writing the request fails.
     */
    private static void writeRequest(HttpURLConnection con, String text, String lang) throws IOException {
        BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(con.getOutputStream(), UTF8));
        try {
            writer.write(SOAP_PREFIX);
            writer.write("<clas:classify>");
            writer.write("<clas:user>wiki</clas:user>");//TODO: should the user be configurable?
            writer.write("<clas:model>");
            StringEscapeUtils.escapeXml(writer, lang); //the model is user input -> needs to be escaped
            writer.write("</clas:model>");
            writer.write("<clas:text>");
            StringEscapeUtils.escapeXml(writer, text); //write the escaped text directly to the request
            writer.write("</clas:text>");
            writer.write("</clas:classify>");
            writer.write(SOAP_SUFFIX);
            //regular close: flushes the request; failures need to be reported
            writer.close();
        } finally {
            //no-op if already closed; releases the stream if a write failed
            closeQuietly(writer);
        }
    }

    /**
     * Converts the <code>return</code> elements of the SOAP body to
     * {@link Concept}s.
     */
    private List<Concept> readConcepts(SOAPBody soapBody) {
        List<Concept> extractedConcepts = new ArrayList<Concept>();
        NodeList results = soapBody.getElementsByTagNameNS("*", "return");
        int limit = Math.min(results.getLength(), MAX_RESULTS_TO_RETURN);
        for (int i = 0; i < limit; i++) {
            Element result = (Element) results.item(i);
            //NOTE: (rwesten) implemented a mapping from the CELI classification
            //      to the Stanbol fise:TopicEnhancements (STANBOL-617) that
            //        * one fise:TopicAnnotation is generated per "model"
            //        * the whole label string is used as fise:entity-label
            //        * the uri of the most specific dbpedia ontology type (see
            //          selectClassificationClass) is used as fise:entity-reference
            //      This has the intuition that for users it is easier to grasp
            //      the meaning of the whole label, while for machines the link
            //      to the most specific dbpedia ontology class is best suited.
            String label = firstText(result, "label");
            String score = firstText(result, "score");
            if (label == null || score == null) {
                log.warn("Skip classification result {} of {} because label or score is missing",
                    i, serviceEP);
                continue;
            }
            //remove the enclosing '[' and ']' (only if present)
            String model = stripBrackets(label.trim()).trim();
            if (model.isEmpty()) {
                log.warn("Skip classification result {} of {} because the label is empty", i, serviceEP);
                continue;
            }
            Double confidence;
            try {
                confidence = Double.valueOf(score);
            } catch (NumberFormatException e) {
                log.warn("Skip classification result '{}' because the score '{}' is not a number",
                    model, score);
                continue;
            }
            UriRef modelConcept = selectClassificationClass(model);
            extractedConcepts.add(new Concept(model, modelConcept, confidence));
        }
        return extractedConcepts;
    }

    /**
     * Getter for the text content of the first descendant element with the
     * parsed local name (any namespace).
     * @return the text content or <code>null</code> if no such element exists
     */
    private static String firstText(Element parent, String localName) {
        NodeList elements = parent.getElementsByTagNameNS("*", localName);
        return elements.getLength() > 0 ? elements.item(0).getTextContent() : null;
    }

    /**
     * Removes a leading '[' and a trailing ']' from the parsed label. Labels
     * without brackets (including the empty string) are returned unchanged.
     */
    private static String stripBrackets(String label) {
        int start = label.startsWith("[") ? 1 : 0;
        //the length check ensures that a single "[" is not stripped twice
        int end = label.length() > start && label.endsWith("]") ? label.length() - 1 : label.length();
        return label.substring(start, end);
    }

    /**
     * Closes the parsed resource. {@link IOException}s are only logged, so
     * that this can be used in finally blocks without hiding the exception
     * that caused the block to be left.
     */
    private static void closeQuietly(Closeable closeable) {
        try {
            closeable.close();
        } catch (IOException e) {
            log.debug("Unable to close a stream used for a classification request", e);
        }
    }

    /**
     * TopicClassifications require only a single fise:entity-reference.
     * However the CELI classification service delivers <p>
     * <code><pre>
     *     &lt;ns2:label&gt;[Organisation HockeyTeam SportsTeam]&lt;/ns2:label&gt;
     * </pre></code>
     * because of that this method needs to select one of the labels.<p>
     * This method currently selects the 2nd token if there are more than one
     * concept suggestions included (otherwise the only one). Tokens are
     * separated by whitespace; enclosing brackets are ignored if present.
     * NOTE that the whole literal is used as fise:entity-label!
     * @param classificationLabels the label string
     * @return the {@link UriRef} built from the dbpedia ontology namespace
     * and the selected token
     */
    private UriRef selectClassificationClass(String classificationLabels) {
        //NOTE: (rwesten) In general it would be better if CELI could provide
        //      de-referenceable URLs for those suggestions.
        //      If that is possible one would no longer need to link to the
        //      most specific dbpedia ontology class for a category e.g.
        //          http://dbpedia.org/ontology/HockeyTeam
        //      for
        //          [Organisation HockeyTeam SportsTeam]
        //      but e.g.
        //          http://linguagrid.org/category/HockeyTeam
        //      meaning the linguagrid could provide categories as skos thesaurus
        //      via its web interface
        //trim before splitting, otherwise a leading whitespace would
        //result in an empty first token
        String[] tokens = stripBrackets(classificationLabels.trim()).trim().split("\\s+");
        return new UriRef(NamespaceEnum.dbpedia_ont.getNamespace() + //the namespace
                (tokens.length > 1 ? tokens[1] : tokens[0])); //the Class for the label
    }
}