// blob: 1967e687015744f9b2241939395299aff2d2615f [file] [log] [blame]
package org.apache.tika.parser.ner.opennlp
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
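/*
* This script downloads the Apache OpenNLP models (NER, POS, sentence and tokenizer)
* and the AgePredictor models into the test resources, so that the tests which need
* them (e.g. for the NamedEntityParser) can run.
*/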
import org.apache.maven.settings.Proxy as MvnProxy
import java.net.Proxy as JDKProxy
import groovy.transform.Field
//BEGIN: Global context ; ${settings} is injected by the plugin
// Only proxies flagged as active in the Maven settings are considered.
List<MvnProxy> mvnProxies = settings.getProxies()?.findAll{it.isActive()}
// Script-level field (not a local) so that downloadFile() can read it; null means "connect directly".
@Field JDKProxy proxy = null
if (mvnProxies) {
    MvnProxy mvnProxy = mvnProxies.get(0)
    println "Using the first Proxy setting : ${mvnProxy.username}@ ${mvnProxy.host} : ${mvnProxy.port} "
    proxy = new JDKProxy(JDKProxy.Type.HTTP, new InetSocketAddress(mvnProxy.host, mvnProxy.port))
    // Credentials are only registered when the proxy entry actually carries a user name.
    if (mvnProxy.username) {
        final String proxyUser = mvnProxy.username
        // PasswordAuthentication clones the array and fails on null, so fall back to an empty password.
        final char[] proxyPassword = (mvnProxy.password ?: "").toCharArray()
        Authenticator.setDefault(new Authenticator(){
            @Override
            protected PasswordAuthentication getPasswordAuthentication(){
                // Answer proxy challenges only; never hand the proxy credentials to an origin server.
                if (getRequestorType() != Authenticator.RequestorType.PROXY) {
                    return null
                }
                return new PasswordAuthentication(proxyUser, proxyPassword)
            }
        })
    }
    println "Proxy is configured"
}
//END : Global Context
/**
 * Copies an input stream to an output stream, printing the progress at most once per second.
 * The data is moved through a fixed 4 KB buffer, so the content is never held in memory as a whole.
 * Both streams are closed before this method returns, whether the copy succeeds or fails.
 * @param inStr source stream
 * @param outStr target stream
 * @param totalLength the total length of the content in bytes (used to calculate progress);
 *        when it is zero or negative (length unknown) only the byte count is reported
 * @return the number of bytes copied
 * @throws IOException if reading, writing or closing a stream fails
 */
def copyWithProgress(InputStream inStr, OutputStream outStr, long totalLength){
    final int PROGRESS_DELAY = 1000 // minimum number of milliseconds between two progress lines
    byte[] buffer = new byte[1024 * 4]
    long count = 0
    int len
    long tt = System.currentTimeMillis()
    try {
        // InputStream.read(byte[]) signals end of stream with -1
        while ((len = inStr.read(buffer)) != -1) {
            outStr.write(buffer, 0, len)
            count += len
            if (System.currentTimeMillis() - tt > PROGRESS_DELAY) {
                if (totalLength > 0) {
                    println "${count * 100.0/totalLength}% : $count bytes of $totalLength"
                } else {
                    // No usable length: a percentage would be negative or divide by zero.
                    println "$count bytes copied (total length unknown)"
                }
                tt = System.currentTimeMillis()
            }
        }
        println "Copy complete. "
    } finally {
        // Nested so that the target is closed even when closing the source fails.
        try {
            inStr.close()
        } finally {
            outStr.close()
        }
    }
    return count
}
/**
 * Downloads a file, going through the configured proxy when there is one.
 * The content is first written to a sibling "<name>.part" file and only moved to its final
 * name once the transfer has completed, so an interrupted download never leaves a truncated
 * file behind under the final name.
 * I/O failures are reported on the console and otherwise ignored (best effort).
 * @param urlStr url of file
 * @param file path to store file
 * @return nothing meaningful
 * @throws MalformedURLException if urlStr is not a valid URL
 */
def downloadFile(String urlStr, File file) {
    println "GET : $urlStr -> $file (Using proxy? ${proxy != null})"
    URL url = new URL(urlStr)
    // Absolute form so that a bare file name still has a parent directory.
    File target = file.getAbsoluteFile()
    File partFile = new File(target.getParentFile(), target.getName() + ".part")
    InputStream inStream = null
    OutputStream outStream = null
    try {
        URLConnection urlConn = proxy ? url.openConnection(proxy) : url.openConnection()
        if (urlConn instanceof HttpURLConnection) {
            // A redirect that is not followed (or any other non-200 answer) still has a body;
            // it must not be stored as if it were the requested file.
            int status = urlConn.getResponseCode()
            if (status != HttpURLConnection.HTTP_OK) {
                throw new IOException("Unexpected HTTP status $status for $urlStr")
            }
        }
        long contentLength = urlConn.getContentLengthLong()
        target.getParentFile()?.mkdirs()
        inStream = urlConn.getInputStream()
        outStream = new FileOutputStream(partFile)
        copyWithProgress(inStream, outStream, contentLength)
        // Close explicitly before the move so that a failing flush is still treated as a failed download.
        outStream.close()
        inStream.close()
        java.nio.file.Files.move(partFile.toPath(), target.toPath(),
                java.nio.file.StandardCopyOption.REPLACE_EXISTING)
        println "Download Complete.."
    } catch (IOException e) {
        println "Couldn't download $file at the moment. Will skip tests that require that model/file for now."
        e.printStackTrace()
    } finally {
        // Closing twice is harmless; this covers the paths where the copy did not finish.
        for (stream in [outStream, inStream]) {
            try {
                stream?.close()
            } catch (IOException ignored) {
                // nothing more can be done for a stream that fails to close
            }
        }
        // Present only when the download failed part-way; a no-op after a successful move.
        partFile.delete()
    }
}
// Remote locations of the models and the test-resource directories they are stored in
def urlPrefix = "http://opennlp.sourceforge.net/models-1.5"
def prefixPath = "src/test/resources/org/apache/tika/parser/ner/opennlp/"
def ageUrlPrefix = "https://raw.githubusercontent.com/USCDataScience/AgePredictor/master/model"
def agePrefixPath = "src/test/resources/org/apache/tika/parser/recognition/"

// detecting proper path for test resources:
// when both directories exist the script runs from the parent maven project,
// but the resources should go to the sub-modules
boolean runningFromParent = ["tika-parsers", "tika-app"].every { new File(it).exists() }
if (runningFromParent) {
    prefixPath = "tika-parsers/tika-parsers-ml/tika-parser-nlp-module/" + prefixPath
    agePrefixPath = "tika-parsers/tika-parsers-ml/tika-age-recogniser/" + agePrefixPath
}

// OpenNLP models: local file name -> remote file name
def openNlpModels = [
        "ner-person.bin"      : "en-ner-person.bin",
        "ner-location.bin"    : "en-ner-location.bin",
        "ner-organization.bin": "en-ner-organization.bin",
        "en-pos-maxent.bin"   : "en-pos-maxent.bin",
        "en-sent.bin"         : "en-sent.bin",
        "en-token.bin"        : "en-token.bin",
        "ner-date.bin"        : "en-ner-date.bin"]
// AgePredictor models keep the same name locally and remotely
def ageModels = ["classify-bigram.bin", "regression-global.bin"]

// filePath : url (insertion order is preserved, so downloads happen in the listed order)
def modelFiles = [:]
openNlpModels.each { localName, remoteName ->
    modelFiles[prefixPath + localName] = urlPrefix + "/" + remoteName
}
ageModels.each { modelName ->
    modelFiles[agePrefixPath + modelName] = ageUrlPrefix + "/" + modelName
}

// Fetch only the models that are not on disk yet
modelFiles.each { filePath, remoteUrl ->
    File modelFile = new File(filePath)
    if (!modelFile.exists()) {
        downloadFile(remoteUrl, modelFile)
    }
}