blob: cfa8fba7b5ae1a2c14828f975e21eab4c3289d38 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.language.translate.impl;
import static java.nio.charset.StandardCharsets.UTF_8;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.Properties;
import javax.ws.rs.core.MediaType;
import javax.ws.rs.core.Response;
import org.apache.cxf.jaxrs.client.WebClient;
import org.apache.tika.exception.TikaException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.core.JsonParseException;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.node.ObjectNode;
import com.fasterxml.jackson.jaxrs.json.JacksonJsonProvider;
/**
* <p>This translator is designed to work with a TCP-IP available
* Joshua translation server, specifically the
* <a href="https://github.com/joshua-decoder/joshua_translation_engine">
* REST-based Joshua server</a>.</p>
*
* <p>If you were to interact with the server via curl a request
* would look as follows</p>
*
* <pre>
* {@code
* curl http://localhost:5000/joshua/translate/english \
* -i -H "Content-Type: application/json" \
* -X POST -d '{"inputLanguage": "Spanish", "inputText": "vuelo"}' -v
* }
* </pre>
*
* Joshua requires input to be pre-formatted into sentences, one per line,
* so this translation implementation takes care of that.
*/
public class JoshuaNetworkTranslator extends AbstractTranslator {

    private static final Logger LOG = LoggerFactory.getLogger(JoshuaNetworkTranslator.class);

    /** Classpath resource (resolved relative to this class) holding the translator settings. */
    private static final String PROPERTIES_FILE = "translator.joshua.properties";

    /** Property key whose value is the base URL of the Joshua REST server. */
    private static final String JOSHUA_SERVER = "joshua.server.url";

    /** Connect timeout, in milliseconds, used by the availability probe. */
    private static final int CONNECT_TIMEOUT_MS = 2000;

    /** Shared JSON mapper; used here only for node creation and tree parsing. */
    private static final ObjectMapper MAPPER = new ObjectMapper();

    /** Base server URL read from the properties file; {@code null} when not configured. */
    private String networkServer;

    /** Endpoint used by the most recent translation request (server URL + target language). */
    private String networkURI;

    /**
     * Default constructor which looks for the
     * <code>translator.joshua.properties</code> resource and, when it is
     * present, reads the <code>joshua.server.url</code> property from it.
     * If the resource or the property is missing the server URL stays
     * {@code null}, in which case {@link #isAvailable()} returns
     * {@code false} and the translate methods return their input unchanged.
     * A failure while reading the resource is logged, not thrown.
     */
    public JoshuaNetworkTranslator() {
        Properties props = new Properties();
        // try-with-resources tolerates a null resource, so a missing file is simply skipped.
        try (InputStream stream =
                     JoshuaNetworkTranslator.class.getResourceAsStream(PROPERTIES_FILE)) {
            if (stream != null) {
                props.load(stream);
                networkServer = props.getProperty(JOSHUA_SERVER);
            }
        } catch (IOException e) {
            LOG.error("An error occured whilst reading translator.joshua.properties file", e);
        }
    }

    /**
     * Translates {@code text} by POSTing it as JSON to the Joshua REST server.
     *
     * <p>The request endpoint is the configured server URL followed by the
     * target language. If no server URL is configured, or the endpoint does
     * not pass the {@link #isAvailable()} probe, the input text is returned
     * unchanged. If no source language is supplied, one is guessed via
     * {@link org.apache.tika.language.translate.AbstractTranslator#detectLanguage(String)}.</p>
     *
     * <p>Before sending, the input is reshaped into one sentence per line by
     * replacing '.' characters with '\n'.</p>
     *
     * @param text the text to translate
     * @param sourceLanguage language of {@code text}; detected when {@code null}
     * @param targetLanguage language to translate into; appended to the server URL
     * @return the first <code>outputText</code> value found in the server's
     *         JSON response, or {@code text} itself when the server is not
     *         configured or not available
     * @throws TikaException if the response is not well-formed JSON, has no
     *         readable body, or carries no <code>outputText</code>
     * @throws IOException if communicating with the server fails
     * @see org.apache.tika.language.translate.Translator#translate
     * (java.lang.String, java.lang.String, java.lang.String)
     */
    @Override
    public String translate(String text, String sourceLanguage,
                            String targetLanguage) throws TikaException, IOException {
        // Without a configured server there is nothing to build a URI from.
        if (networkServer == null) {
            return text;
        }
        String requestUri = networkServer.endsWith("/")
                ? networkServer + targetLanguage
                : networkServer + "/" + targetLanguage;
        // isAvailable() probes the most recently requested endpoint.
        networkURI = requestUri;
        if (!this.isAvailable()) {
            return text;
        }
        // Make an attempt to guess the language if one is not provided.
        if (sourceLanguage == null) {
            sourceLanguage = detectLanguage(text).getLanguage();
        }

        List<Object> providers = new ArrayList<>();
        providers.add(new JacksonJsonProvider());
        WebClient client = WebClient.create(requestUri, providers);

        ObjectNode request = MAPPER.createObjectNode();
        request.put("inputLanguage", sourceLanguage);
        request.put("inputText", toSentencePerLine(text));

        Response response = client.accept(MediaType.APPLICATION_JSON)
                .type(MediaType.APPLICATION_JSON)
                .post(request);
        String body = readBody(response);
        return extractTranslation(body, sourceLanguage, targetLanguage);
    }

    /**
     * Translates {@code text} without a caller-supplied source language.
     * Delegates to {@link JoshuaNetworkTranslator#translate(String, String, String)}
     * with a {@code null} source language, which makes that method guess the
     * language via
     * {@link org.apache.tika.language.translate.AbstractTranslator#detectLanguage(String)}
     * once the server has been found available.
     *
     * @param text the text to translate
     * @param targetLanguage language to translate into
     * @return the translated text, or {@code text} itself when the server is
     *         not configured or not available
     * @throws TikaException if the server response cannot be interpreted
     * @throws IOException if communicating with the server fails
     * @see org.apache.tika.language.translate.Translator#translate(java.lang.String, java.lang.String)
     */
    @Override
    public String translate(String text, String targetLanguage)
            throws TikaException, IOException {
        // The three-argument overload performs the availability check itself.
        return translate(text, null, targetLanguage);
    }

    /**
     * Probes the translation endpoint with an HTTP request and reports
     * whether it answered with status 200. The endpoint of the most recent
     * translation request is used; before any translation has been requested
     * the configured server URL itself is probed.
     * NOTE(review): confirm the bare server URL answers a GET with 200.
     *
     * @return {@code true} only if a server URL is configured and the probe
     *         returned HTTP 200; failures are logged and reported as {@code false}
     * @see org.apache.tika.language.translate.Translator#isAvailable()
     */
    @Override
    public boolean isAvailable() {
        if (this.networkServer == null) {
            return false;
        }
        String target = networkURI != null ? networkURI : networkServer;
        URL url;
        try {
            url = new URL(target);
        } catch (MalformedURLException mue) {
            // Throwable goes last with no placeholder so SLF4J logs the stack trace.
            LOG.error("Error reading {} property from {}.", JOSHUA_SERVER, PROPERTIES_FILE, mue);
            return false;
        }
        HttpURLConnection connection = null;
        try {
            connection = (HttpURLConnection) url.openConnection();
            connection.setRequestProperty("Connection", "close");
            connection.setConnectTimeout(CONNECT_TIMEOUT_MS);
            connection.connect();
            return tryResponseCode(connection);
        } catch (IOException ioe) {
            LOG.error("Error whilst checking availability of {}.", JOSHUA_SERVER, ioe);
            return false;
        } finally {
            // Release the probe connection whether or not it succeeded.
            if (connection != null) {
                connection.disconnect();
            }
        }
    }

    /**
     * Reports whether the connection's response code is HTTP 200; an
     * {@link IOException} while reading the code is logged and treated as
     * "not available".
     */
    private boolean tryResponseCode(HttpURLConnection connection) {
        try {
            return connection.getResponseCode() == HttpURLConnection.HTTP_OK;
        } catch (IOException ioe) {
            LOG.error("Error retrieving response code from Joshua Network Translator.", ioe);
            return false;
        }
    }

    /**
     * Replaces '.' characters with '\n' so each sentence sits on its own line.
     * The search starts at index 1, so a period that is the very first
     * character of the text is left untouched.
     */
    private static String toSentencePerLine(String text) {
        StringBuilder sb = new StringBuilder(text);
        int i = 0;
        while ((i = sb.indexOf(".", i + 1)) != -1) {
            sb.setCharAt(i, '\n');
        }
        return sb.toString();
    }

    /**
     * Reads the whole response entity as UTF-8 text. Lines are concatenated
     * without their line terminators.
     */
    private static String readBody(Response response) throws TikaException, IOException {
        Object entity = response.getEntity();
        if (!(entity instanceof InputStream)) {
            throw new TikaException("Joshua REST Server returned no readable response body");
        }
        StringBuilder body = new StringBuilder();
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader((InputStream) entity, UTF_8))) {
            String line;
            while ((line = reader.readLine()) != null) {
                body.append(line);
            }
        }
        return body.toString();
    }

    /**
     * Pulls the first <code>outputText</code> value out of the JSON response.
     * When none is present, the server's <code>message</code> value (if any)
     * becomes the text of the thrown {@link TikaException}.
     */
    private static String extractTranslation(String body, String sourceLanguage,
                                             String targetLanguage)
            throws TikaException, IOException {
        String context = "Error requesting translation from '" +
                sourceLanguage + "' to '" + targetLanguage + "', ";
        try {
            JsonNode jsonResp = MAPPER.readTree(body);
            if (jsonResp != null) {
                // findValuesAsText yields an empty list, not null, when the field is absent.
                List<String> outputs = jsonResp.findValuesAsText("outputText");
                if (outputs != null && !outputs.isEmpty()) {
                    return outputs.get(0);
                }
                JsonNode message = jsonResp.findValue("message");
                if (message != null) {
                    throw new TikaException(describe(message));
                }
            }
            throw new TikaException(context + "JSON response from Joshua REST Server "
                    + "contains neither 'outputText' nor 'message': " + body);
        } catch (JsonParseException e) {
            throw new TikaException(context + "JSON response "
                    + "from Joshua REST Server is not well formatted: " + body, e);
        }
    }

    /**
     * Renders a <code>message</code> node as text: the first element of a
     * non-empty array, the text of a scalar, or the raw JSON otherwise.
     */
    private static String describe(JsonNode message) {
        if (message.isArray() && message.size() > 0) {
            return message.get(0).asText();
        }
        return message.isValueNode() ? message.asText() : message.toString();
    }
}