OPENNLP-480 Added initial support for tokenizer and sentence detector, updated name finder and pos tagger.
diff --git a/tagging-server/src/main/java/org/apache/opennlp/tagging_server/ServiceUtil.java b/tagging-server/src/main/java/org/apache/opennlp/tagging_server/ServiceUtil.java
new file mode 100644
index 0000000..16832f4
--- /dev/null
+++ b/tagging-server/src/main/java/org/apache/opennlp/tagging_server/ServiceUtil.java
@@ -0,0 +1,106 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.opennlp.tagging_server;
+
+import org.osgi.framework.Bundle;
+import org.osgi.framework.BundleContext;
+import org.osgi.framework.FrameworkUtil;
+import org.osgi.framework.ServiceReference;
+
+/**
+ * Static helpers for looking up, obtaining and releasing OSGi services.
+ */
+public class ServiceUtil {
+
+  private ServiceUtil() {
+  }
+
+  /**
+   * Looks up a service reference by the name of the given class, using the
+   * bundle context of the bundle which loaded this class.
+   *
+   * @param serviceClazz the class whose name the service is looked up by
+   * @return the service reference, or null if no such service is registered
+   * @throws IllegalStateException if this class was not loaded by a bundle
+   *     or its bundle has no valid bundle context
+   */
+  public static ServiceReference getServiceReference(Class<?> serviceClazz) {
+    Bundle bundle = FrameworkUtil.getBundle(ServiceUtil.class);
+
+    // FrameworkUtil.getBundle returns null when the class was not loaded by a bundle
+    if (bundle == null) {
+      throw new IllegalStateException("ServiceUtil was not loaded by an OSGi bundle!");
+    }
+
+    BundleContext context = bundle.getBundleContext();
+
+    // A bundle without a valid context (e.g. not started) returns null here
+    if (context == null) {
+      throw new IllegalStateException("Bundle has no valid bundle context!");
+    }
+
+    return context.getServiceReference(serviceClazz.getName());
+  }
+
+  /**
+   * Obtains the service object for the given reference.
+   *
+   * @param modelService the service reference, may be null
+   * @param modelClazz the type the service object is cast to
+   * @return the service object, never null
+   * @throws RuntimeException if the reference is null or the service
+   *     is no longer available
+   */
+  public static <T> T getService(ServiceReference modelService, Class<T> modelClazz) {
+
+    if (modelService == null) {
+      throw new RuntimeException("Model does not exist!");
+    }
+
+    // getBundle returns null once the service has been unregistered
+    Bundle providerBundle = modelService.getBundle();
+    BundleContext context = providerBundle != null ? providerBundle.getBundleContext() : null;
+    Object service = context != null ? context.getService(modelService) : null;
+
+    if (service == null) {
+      throw new RuntimeException("Model does not exist!");
+    }
+
+    // Class.cast replaces the unchecked cast and rejects a wrong type right here
+    return modelClazz.cast(service);
+  }
+
+  /**
+   * Releases a service obtained via getService. Does nothing if the
+   * reference is null or the service has been unregistered in the meantime,
+   * so that a finally block calling it does not fail with a NullPointerException.
+   */
+  public static void releaseService(ServiceReference service) {
+    if (service == null) {
+      return;
+    }
+
+    // getBundle returns null once the service has been unregistered
+    Bundle providerBundle = service.getBundle();
+    BundleContext context = providerBundle != null ? providerBundle.getBundleContext() : null;
+
+    if (context != null) {
+      context.ungetService(service);
+    }
+  }
+}
diff --git a/tagging-server/src/main/java/org/apache/opennlp/tagging_server/namefind/DefaultRawTextNameFinderFactory.java b/tagging-server/src/main/java/org/apache/opennlp/tagging_server/namefind/DefaultRawTextNameFinderFactory.java
new file mode 100644
index 0000000..ec47e0c
--- /dev/null
+++ b/tagging-server/src/main/java/org/apache/opennlp/tagging_server/namefind/DefaultRawTextNameFinderFactory.java
@@ -0,0 +1,81 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.opennlp.tagging_server.namefind;
+
+import opennlp.tools.namefind.NameFinderME;
+import opennlp.tools.namefind.TokenNameFinder;
+import opennlp.tools.namefind.TokenNameFinderModel;
+import opennlp.tools.sentdetect.SentenceDetector;
+import opennlp.tools.sentdetect.SentenceDetectorME;
+import opennlp.tools.sentdetect.SentenceModel;
+import opennlp.tools.tokenize.Tokenizer;
+import opennlp.tools.tokenize.TokenizerME;
+import opennlp.tools.tokenize.TokenizerModel;
+
+/**
+ * Default {@link RawTextNameFinderFactory} which creates the sentence detector,
+ * tokenizer and name finder from the models passed to the constructor.
+ */
+public class DefaultRawTextNameFinderFactory implements RawTextNameFinderFactory {
+
+  private final SentenceModel sentModel;
+  private final TokenizerModel tokenModel;
+  private final TokenNameFinderModel nameModel;
+
+  /**
+   * Initializes the factory with the models its components are created from.
+   *
+   * @param sentModel the model for the sentence detector, must not be null
+   * @param tokenModel the model for the tokenizer, must not be null
+   * @param nameModel the model for the name finder, must not be null
+   * @throws IllegalArgumentException if one of the models is null
+   */
+  public DefaultRawTextNameFinderFactory(SentenceModel sentModel,
+      TokenizerModel tokenModel, TokenNameFinderModel nameModel) {
+    // Reject missing models here instead of failing later on the first request
+    this.sentModel = notNull(sentModel, "sentModel");
+    this.tokenModel = notNull(tokenModel, "tokenModel");
+    this.nameModel = notNull(nameModel, "nameModel");
+  }
+
+  /** Returns the given model, or throws if it is null. */
+  private static <T> T notNull(T model, String name) {
+    if (model == null) {
+      throw new IllegalArgumentException(name + " must not be null!");
+    }
+    return model;
+  }
+
+  /** Creates a new {@link SentenceDetectorME} backed by the sentence model. */
+  @Override
+  public SentenceDetector createSentenceDetector() {
+    return new SentenceDetectorME(sentModel);
+  }
+
+  /** Creates a new {@link TokenizerME} backed by the tokenizer model. */
+  @Override
+  public Tokenizer createTokenizer() {
+    return new TokenizerME(tokenModel);
+  }
+
+  /** Creates a new {@link NameFinderME} backed by the name finder model. */
+  @Override
+  public TokenNameFinder createNameFinder() {
+    return new NameFinderME(nameModel);
+  }
+}
diff --git a/tagging-server/src/main/java/org/apache/opennlp/tagging_server/namefind/NameFinderResource.java b/tagging-server/src/main/java/org/apache/opennlp/tagging_server/namefind/NameFinderResource.java
index 4c23a23..1873b08 100644
--- a/tagging-server/src/main/java/org/apache/opennlp/tagging_server/namefind/NameFinderResource.java
+++ b/tagging-server/src/main/java/org/apache/opennlp/tagging_server/namefind/NameFinderResource.java
@@ -18,23 +18,22 @@
package org.apache.opennlp.tagging_server.namefind;
import java.util.ArrayList;
-import java.util.Arrays;
import java.util.List;
import javax.ws.rs.Consumes;
-import javax.ws.rs.OPTIONS;
import javax.ws.rs.POST;
import javax.ws.rs.Path;
import javax.ws.rs.Produces;
import javax.ws.rs.core.MediaType;
-import javax.ws.rs.core.Response;
import opennlp.tools.namefind.NameFinderME;
+import opennlp.tools.namefind.TokenNameFinder;
import opennlp.tools.namefind.TokenNameFinderModel;
-import opennlp.tools.tokenize.SimpleTokenizer;
+import opennlp.tools.sentdetect.SentenceDetector;
+import opennlp.tools.tokenize.Tokenizer;
import opennlp.tools.util.Span;
-import org.apache.opennlp.tagging_server.ModelUtil;
+import org.apache.opennlp.tagging_server.ServiceUtil;
import org.osgi.framework.ServiceReference;
@@ -42,10 +41,10 @@
public class NameFinderResource {
public static class NameFinderDocument {
- private String document[][];
+ private List<Span[]> document;
private List<Span[]> names;
- NameFinderDocument(String document[][], List<Span[]> names) {
+ NameFinderDocument(List<Span[]> document, List<Span[]> names) {
this.document = document;
this.names = names;
}
@@ -54,77 +53,88 @@
return names;
}
- public String[][] getDocument() {
+ public List<Span[]> getDocument() {
return document;
}
}
+  /** Runs the name finder on each tokenized sentence; yields one Span[] per sentence, in order. */
+  private List<Span[]> find(TokenNameFinder nameFinder, String[][] document) {
+
+    List<Span[]> spansPerSentence = new ArrayList<Span[]>();
+    for (int i = 0; i < document.length; i++) {
+      String[] sentenceTokens = document[i];
+      spansPerSentence.add(nameFinder.find(sentenceTokens));
+    }
+    return spansPerSentence;
+  }
+
+  /**
+   * Finds names in an already sentence split and tokenized document.
+   * Returns one Span[] per input sentence, in input order.
+   */
@POST
@Consumes(MediaType.APPLICATION_JSON)
@Produces(MediaType.APPLICATION_JSON)
@Path("_find")
- public List<Span> find(String[][] document) {
+  public List<Span[]> find(String[][] document) {
- ServiceReference modelService = ModelUtil.getService(TokenNameFinderModel.class);
+    ServiceReference modelService = ServiceUtil.getServiceReference(TokenNameFinderModel.class);
try {
NameFinderME nameFinder = new NameFinderME(
- ModelUtil.getModel(modelService, TokenNameFinderModel.class));
+          ServiceUtil.getService(modelService, TokenNameFinderModel.class));
- List<Span> names = new ArrayList<Span>();
- for (String sentence[] : document) {
- names.addAll(Arrays.asList(nameFinder.find(sentence)));
- }
- return names;
+      return find(nameFinder, document);
}
finally {
- ModelUtil.releaseService(modelService);
+      ServiceUtil.releaseService(modelService);
}
}
-
- // Just a hack to get arround cross domain issues in my test environment!
- // Need to investigate how this should be done!
- @OPTIONS
- @Path("_findRawText")
- public Response findRawTextOptions() {
- System.out.println("Called options ...");
- return Response.ok()
- .header("Access-Control-Allow-Origin", "*")
- .header("Access-Control-Allow-Headers", "Content-Type")
- .header("Access-Control-Allow-Methods", "POST, GET, OPTIONS")
- .build();
- }
+  // Splits raw text into sentences and tokens, then runs the name finder on the tokens.
+  // TODO: User should pass a key for the models (e.g. default_eng)
@POST
@Consumes(MediaType.APPLICATION_JSON)
@Produces(MediaType.APPLICATION_JSON)
@Path("_findRawText")
- public NameFinderDocument findRawText(String document) { // input could be a single string ... return contains everything!
-
-
- System.out.println("Request: " + document);
- String[][] tokenizedSentences = new String[][]{SimpleTokenizer.INSTANCE.tokenize(document)};
-
- // TODO: Fix this. User should be able to define this in blueprint!
-
- ServiceReference modelService = ModelUtil.getService(TokenNameFinderModel.class);
-
- try {
- NameFinderME nameFinder = new NameFinderME(
- ModelUtil.getModel(modelService, TokenNameFinderModel.class));
+  public NameFinderDocument findRawText(String document) {
- List<Span[]> names = new ArrayList<Span[]>();
+    ServiceReference preprocessFactoryService = ServiceUtil.getServiceReference(RawTextNameFinderFactory.class);
+
+    try {
+      // TODO: Pass a key here!
+      RawTextNameFinderFactory factory =
+          ServiceUtil.getService(preprocessFactoryService, RawTextNameFinderFactory.class);
- for (String sentence[] : tokenizedSentences) {
- names.add(nameFinder.find(sentence));
+      SentenceDetector sentDetect = factory.createSentenceDetector();
+      Tokenizer tokenizer = factory.createTokenizer();
+
+      Span sentenceSpans[] = sentDetect.sentPosDetect(document);
+      List<Span[]> tokenizedSentencesSpan = new ArrayList<Span[]>();
+      String[][] tokenizedSentences = new String[sentenceSpans.length][];
+
+      for (int i = 0; i < sentenceSpans.length; i++) {
+        // Token spans are relative to the sentence text, so the tokens must be cut
+        // out of the sentence text and not out of the whole document.
+        CharSequence sentenceText = sentenceSpans[i].getCoveredText(document);
+        Span tokenSpans[] = tokenizer.tokenizePos(sentenceText.toString());
+        // NOTE(review): returned token spans stay sentence relative, sentence offsets are not returned
+        tokenizedSentencesSpan.add(tokenSpans);
+        String tokens[] = new String[tokenSpans.length];
+        for (int ti = 0; ti < tokenSpans.length; ti++) {
+          tokens[ti] = tokenSpans[ti].getCoveredText(sentenceText).toString();
+        }
+        tokenizedSentences[i] = tokens;
}
- return new NameFinderDocument(tokenizedSentences, names);
+      TokenNameFinder nameFinder = factory.createNameFinder();
+      return new NameFinderDocument(tokenizedSentencesSpan, find(nameFinder, tokenizedSentences));
}
finally {
- ModelUtil.releaseService(modelService);
+      ServiceUtil.releaseService(preprocessFactoryService);
}
}
}
diff --git a/tagging-server/src/main/java/org/apache/opennlp/tagging_server/namefind/RawTextNameFinderFactory.java b/tagging-server/src/main/java/org/apache/opennlp/tagging_server/namefind/RawTextNameFinderFactory.java
new file mode 100644
index 0000000..a942a82
--- /dev/null
+++ b/tagging-server/src/main/java/org/apache/opennlp/tagging_server/namefind/RawTextNameFinderFactory.java
@@ -0,0 +1,44 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.opennlp.tagging_server.namefind;
+
+import opennlp.tools.namefind.TokenNameFinder;
+import opennlp.tools.sentdetect.SentenceDetector;
+import opennlp.tools.tokenize.Tokenizer;
+
+/**
+ * The raw text name finder factory supports the creation of all
+ * components which are needed to process raw text with the name finder.
+ */
+public interface RawTextNameFinderFactory {
+
+  /**
+   * Creates the sentence detector which splits the raw text into sentences.
+   */
+  SentenceDetector createSentenceDetector();
+
+  /**
+   * Creates the tokenizer which splits a sentence into tokens.
+   */
+  Tokenizer createTokenizer();
+
+  /**
+   * Creates the name finder which detects names in the tokenized sentences.
+   */
+  TokenNameFinder createNameFinder();
+}
diff --git a/tagging-server/src/main/java/org/apache/opennlp/tagging_server/postag/POSTaggerResource.java b/tagging-server/src/main/java/org/apache/opennlp/tagging_server/postag/POSTaggerResource.java
index ed0b895..c0a3ad7 100644
--- a/tagging-server/src/main/java/org/apache/opennlp/tagging_server/postag/POSTaggerResource.java
+++ b/tagging-server/src/main/java/org/apache/opennlp/tagging_server/postag/POSTaggerResource.java
@@ -27,7 +27,7 @@
import opennlp.tools.postag.POSTagger;
import opennlp.tools.postag.POSTaggerME;
-import org.apache.opennlp.tagging_server.ModelUtil;
+import org.apache.opennlp.tagging_server.ServiceUtil;
import org.osgi.framework.ServiceReference;
@Path("/postagger")
@@ -41,10 +41,10 @@
// @QueryParam("model") String modelName
public String[][] tag(String document[][]) {
- ServiceReference modelService = ModelUtil.getService(POSModel.class);
+ ServiceReference modelService = ServiceUtil.getServiceReference(POSModel.class);
try {
- POSTagger tagger = new POSTaggerME(ModelUtil.getModel(modelService, POSModel.class));
+ POSTagger tagger = new POSTaggerME(ServiceUtil.getService(modelService, POSModel.class));
String[][] tags = new String[document.length][];
@@ -55,7 +55,7 @@
return tags;
}
finally {
- ModelUtil.releaseService(modelService);
+ ServiceUtil.releaseService(modelService);
}
}
}
diff --git a/tagging-server/src/main/java/org/apache/opennlp/tagging_server/sentdetect/SentenceDetectorResource.java b/tagging-server/src/main/java/org/apache/opennlp/tagging_server/sentdetect/SentenceDetectorResource.java
new file mode 100644
index 0000000..75cdbee
--- /dev/null
+++ b/tagging-server/src/main/java/org/apache/opennlp/tagging_server/sentdetect/SentenceDetectorResource.java
@@ -0,0 +1,58 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.opennlp.tagging_server.sentdetect;
+
+import java.util.Arrays;
+import java.util.List;
+
+import javax.ws.rs.Consumes;
+import javax.ws.rs.POST;
+import javax.ws.rs.Path;
+import javax.ws.rs.Produces;
+import javax.ws.rs.core.MediaType;
+
+import opennlp.tools.sentdetect.SentenceDetector;
+import opennlp.tools.sentdetect.SentenceDetectorME;
+import opennlp.tools.sentdetect.SentenceModel;
+import opennlp.tools.util.Span;
+
+import org.apache.opennlp.tagging_server.ServiceUtil;
+import org.osgi.framework.ServiceReference;
+
+@Path("/sentdetect")
+public class SentenceDetectorResource {
+
+  /** Detects the sentences of the posted text and returns their spans. */
+  @POST
+  @Consumes(MediaType.APPLICATION_JSON)
+  @Produces(MediaType.APPLICATION_JSON)
+  @Path("_sentPosDetect")
+  public List<Span> sentPosDetect(String document) {
+    ServiceReference sentenceModelRef = ServiceUtil.getServiceReference(SentenceModel.class);
+
+    try {
+      SentenceModel sentenceModel = ServiceUtil.getService(sentenceModelRef, SentenceModel.class);
+      SentenceDetector detector = new SentenceDetectorME(sentenceModel);
+      Span[] sentenceSpans = detector.sentPosDetect(document);
+      return Arrays.asList(sentenceSpans);
+    }
+    finally {
+      ServiceUtil.releaseService(sentenceModelRef);
+    }
+  }
+}
diff --git a/tagging-server/src/main/java/org/apache/opennlp/tagging_server/tokenize/TokenizerResource.java b/tagging-server/src/main/java/org/apache/opennlp/tagging_server/tokenize/TokenizerResource.java
new file mode 100644
index 0000000..3a62a5f
--- /dev/null
+++ b/tagging-server/src/main/java/org/apache/opennlp/tagging_server/tokenize/TokenizerResource.java
@@ -0,0 +1,86 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.opennlp.tagging_server.tokenize;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import javax.ws.rs.Consumes;
+import javax.ws.rs.POST;
+import javax.ws.rs.Path;
+import javax.ws.rs.Produces;
+import javax.ws.rs.core.MediaType;
+
+import opennlp.tools.tokenize.TokenizerME;
+import opennlp.tools.tokenize.TokenizerModel;
+import opennlp.tools.util.Span;
+
+import org.apache.opennlp.tagging_server.ServiceUtil;
+import org.osgi.framework.ServiceReference;
+
+@Path("/tokenize")
+public class TokenizerResource {
+
+  /** Tokenizes each posted sentence and returns the token strings, one array per sentence. */
+  @POST
+  @Consumes(MediaType.APPLICATION_JSON)
+  @Produces(MediaType.APPLICATION_JSON)
+  @Path("_tokenize")
+  public List<String[]> tokenize(String[] document) {
+    ServiceReference tokenizerModelRef = ServiceUtil.getServiceReference(TokenizerModel.class);
+
+    try {
+      TokenizerModel tokenizerModel = ServiceUtil.getService(tokenizerModelRef, TokenizerModel.class);
+      TokenizerME tokenizerMe = new TokenizerME(tokenizerModel);
+      List<String[]> tokensPerSentence = new ArrayList<String[]>();
+      for (int i = 0; i < document.length; i++) {
+        String[] sentenceTokens = tokenizerMe.tokenize(document[i]);
+        tokensPerSentence.add(sentenceTokens);
+      }
+
+      return tokensPerSentence;
+    }
+    finally {
+      ServiceUtil.releaseService(tokenizerModelRef);
+    }
+  }
+
+  /** Tokenizes each posted sentence and returns the token spans, one array per sentence. */
+  @POST
+  @Consumes(MediaType.APPLICATION_JSON)
+  @Produces(MediaType.APPLICATION_JSON)
+  @Path("_tokenizePos")
+  public List<Span[]> tokenizePos(String[] document) {
+    ServiceReference tokenizerModelRef = ServiceUtil.getServiceReference(TokenizerModel.class);
+
+    try {
+      TokenizerModel tokenizerModel = ServiceUtil.getService(tokenizerModelRef, TokenizerModel.class);
+      TokenizerME tokenizerMe = new TokenizerME(tokenizerModel);
+      List<Span[]> spansPerSentence = new ArrayList<Span[]>();
+      for (int i = 0; i < document.length; i++) {
+        Span[] sentenceSpans = tokenizerMe.tokenizePos(document[i]);
+        spansPerSentence.add(sentenceSpans);
+      }
+
+      return spansPerSentence;
+    }
+    finally {
+      ServiceUtil.releaseService(tokenizerModelRef);
+    }
+  }
+}