OPENNLP-480 Added initial support for tokenizer and sentence detector, updated name finder and pos tagger.

commit: a96dac5a4c1c5536ab468b4f93c991aabd40e115 [log] [tgz]
author: Jörn Kottmann <joern@apache.org> Tue Jul 17 14:30:50 2012 +0000
committer: Jörn Kottmann <joern@apache.org> Tue Jul 17 14:30:50 2012 +0000
tree: 58e19f6526712eabe4c9e0af2db4b4c9736cbc52
parent: a1c86357b0472b3bda887f8a56c8ceafa41050cb [diff]
diff --git a/tagging-server/src/main/java/org/apache/opennlp/tagging_server/ServiceUtil.java b/tagging-server/src/main/java/org/apache/opennlp/tagging_server/ServiceUtil.java
new file mode 100644
index 0000000..16832f4
--- /dev/null
+++ b/tagging-server/src/main/java/org/apache/opennlp/tagging_server/ServiceUtil.java

@@ -0,0 +1,103 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.opennlp.tagging_server;
+
+import org.osgi.framework.Bundle;
+import org.osgi.framework.BundleContext;
+import org.osgi.framework.FrameworkUtil;
+import org.osgi.framework.ServiceReference;
+
+/**
+ * Static helpers for looking up, obtaining and releasing OSGi services.
+ */
+public class ServiceUtil {
+
+  private ServiceUtil() {
+    // static utility class, never instantiated
+  }
+
+  /**
+   * Looks up the service registered under the name of the given class, using
+   * the bundle context of the bundle which loaded this class.
+   *
+   * @param serviceClazz the class whose name the service is registered under
+   * @return the service reference, or {@code null} if the framework knows no such service
+   * @throws IllegalStateException if this class was not loaded by a bundle or
+   *     that bundle has no valid bundle context
+   */
+  public static ServiceReference getServiceReference(Class<?> serviceClazz) {
+    Bundle bundle = FrameworkUtil.getBundle(ServiceUtil.class);
+    if (bundle == null) {
+      throw new IllegalStateException("ServiceUtil was not loaded by an OSGi bundle!");
+    }
+
+    BundleContext context = bundle.getBundleContext();
+    if (context == null) {
+      throw new IllegalStateException("The bundle has no valid bundle context!");
+    }
+
+    return context.getServiceReference(serviceClazz.getName());
+  }
+
+  /**
+   * Obtains the service object the given reference points to.
+   *
+   * @param modelService the reference of the service
+   * @param modelClazz the type the service object is cast to
+   * @return the service object, never {@code null}
+   * @throws IllegalStateException if the reference is {@code null} or the
+   *     service is no longer available
+   * @throws ClassCastException if the service object is not of the requested type
+   */
+  public static <T> T getService(ServiceReference modelService, Class<T> modelClazz) {
+    if (modelService == null) {
+      throw new IllegalStateException("Model does not exist!");
+    }
+
+    // Each step yields null once the service or its bundle is gone; report that
+    // here instead of failing with a NullPointerException in the caller.
+    Bundle bundle = modelService.getBundle();
+    BundleContext context = bundle != null ? bundle.getBundleContext() : null;
+    Object service = context != null ? context.getService(modelService) : null;
+    if (service == null) {
+      throw new IllegalStateException("Model service is no longer available!");
+    }
+
+    // Checked cast: a wrong type fails here and not later at the call site.
+    return modelClazz.cast(service);
+  }
+
+  /**
+   * Releases a service obtained with {@link #getService(ServiceReference, Class)}.
+   * Does nothing if the reference is {@code null} or the service is already gone,
+   * so cleanup in a {@code finally} block cannot fail with a NullPointerException.
+   *
+   * @param service the reference of the service to release, may be {@code null}
+   */
+  public static void releaseService(ServiceReference service) {
+    if (service == null) {
+      return;
+    }
+
+    Bundle bundle = service.getBundle();
+    BundleContext context = bundle != null ? bundle.getBundleContext() : null;
+    if (context != null) {
+      context.ungetService(service);
+    }
+  }
+}

diff --git a/tagging-server/src/main/java/org/apache/opennlp/tagging_server/namefind/DefaultRawTextNameFinderFactory.java b/tagging-server/src/main/java/org/apache/opennlp/tagging_server/namefind/DefaultRawTextNameFinderFactory.java
new file mode 100644
index 0000000..ec47e0c
--- /dev/null
+++ b/tagging-server/src/main/java/org/apache/opennlp/tagging_server/namefind/DefaultRawTextNameFinderFactory.java

@@ -0,0 +1,71 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.opennlp.tagging_server.namefind;
+
+import opennlp.tools.namefind.NameFinderME;
+import opennlp.tools.namefind.TokenNameFinder;
+import opennlp.tools.namefind.TokenNameFinderModel;
+import opennlp.tools.sentdetect.SentenceDetector;
+import opennlp.tools.sentdetect.SentenceDetectorME;
+import opennlp.tools.sentdetect.SentenceModel;
+import opennlp.tools.tokenize.Tokenizer;
+import opennlp.tools.tokenize.TokenizerME;
+import opennlp.tools.tokenize.TokenizerModel;
+
+/**
+ * {@link RawTextNameFinderFactory} which creates every component from one of
+ * the three models handed to the constructor.
+ */
+public class DefaultRawTextNameFinderFactory implements RawTextNameFinderFactory {
+
+  private final SentenceModel sentenceModel;
+  private final TokenizerModel tokenizerModel;
+  private final TokenNameFinderModel nameFinderModel;
+
+  /**
+   * Stores the models; components are only created by the create methods.
+   *
+   * @param sentModel the model passed to each new {@link SentenceDetectorME}
+   * @param tokenModel the model passed to each new {@link TokenizerME}
+   * @param nameModel the model passed to each new {@link NameFinderME}
+   */
+  public DefaultRawTextNameFinderFactory(SentenceModel sentModel,
+      TokenizerModel tokenModel, TokenNameFinderModel nameModel) {
+    sentenceModel = sentModel;
+    tokenizerModel = tokenModel;
+    nameFinderModel = nameModel;
+  }
+
+  /** Creates a new {@link SentenceDetectorME} on every call. */
+  @Override
+  public SentenceDetector createSentenceDetector() {
+    return new SentenceDetectorME(sentenceModel);
+  }
+
+  /** Creates a new {@link TokenizerME} on every call. */
+  @Override
+  public Tokenizer createTokenizer() {
+    return new TokenizerME(tokenizerModel);
+  }
+
+  /** Creates a new {@link NameFinderME} on every call. */
+  @Override
+  public TokenNameFinder createNameFinder() {
+    return new NameFinderME(nameFinderModel);
+  }
+}

diff --git a/tagging-server/src/main/java/org/apache/opennlp/tagging_server/namefind/NameFinderResource.java b/tagging-server/src/main/java/org/apache/opennlp/tagging_server/namefind/NameFinderResource.java
index 4c23a23..1873b08 100644
--- a/tagging-server/src/main/java/org/apache/opennlp/tagging_server/namefind/NameFinderResource.java
+++ b/tagging-server/src/main/java/org/apache/opennlp/tagging_server/namefind/NameFinderResource.java

@@ -18,23 +18,22 @@
 package org.apache.opennlp.tagging_server.namefind;
 
 import java.util.ArrayList;
-import java.util.Arrays;
 import java.util.List;
 
 import javax.ws.rs.Consumes;
-import javax.ws.rs.OPTIONS;
 import javax.ws.rs.POST;
 import javax.ws.rs.Path;
 import javax.ws.rs.Produces;
 import javax.ws.rs.core.MediaType;
-import javax.ws.rs.core.Response;
 
 import opennlp.tools.namefind.NameFinderME;
+import opennlp.tools.namefind.TokenNameFinder;
 import opennlp.tools.namefind.TokenNameFinderModel;
-import opennlp.tools.tokenize.SimpleTokenizer;
+import opennlp.tools.sentdetect.SentenceDetector;
+import opennlp.tools.tokenize.Tokenizer;
 import opennlp.tools.util.Span;
 
-import org.apache.opennlp.tagging_server.ModelUtil;
+import org.apache.opennlp.tagging_server.ServiceUtil;
 import org.osgi.framework.ServiceReference;
 
 
@@ -42,10 +41,10 @@
 public class NameFinderResource {
 
   public static class NameFinderDocument {
-    private String document[][];
+    private List<Span[]> document;
     private List<Span[]> names;
     
-    NameFinderDocument(String document[][], List<Span[]> names) {
+    NameFinderDocument(List<Span[]> document, List<Span[]> names) {
       this.document = document;
       this.names = names;
     }
@@ -54,77 +53,88 @@
       return names;
     }
     
-    public String[][] getDocument() {
+    public List<Span[]> getDocument() {
       return document;
     }
   }
   
+  private List<Span[]> find(TokenNameFinder nameFinder, String[][] document) {
+
+    List<Span[]> names = new ArrayList<Span[]>();
+
+    for (String sentence[] : document) {
+      names.add(nameFinder.find(sentence));
+    }
+
+    return names;
+  }
+  
   @POST
   @Consumes(MediaType.APPLICATION_JSON)
   @Produces(MediaType.APPLICATION_JSON)
   @Path("_find")
-  public List<Span> find(String[][] document) {
+  public List<Span[]> find(String[][] document) {
     
-    ServiceReference modelService = ModelUtil.getService(TokenNameFinderModel.class);
+    ServiceReference modelService = ServiceUtil.getServiceReference(TokenNameFinderModel.class);
     
     try {
       NameFinderME nameFinder = new NameFinderME(
-          ModelUtil.getModel(modelService, TokenNameFinderModel.class));
+          ServiceUtil.getService(modelService, TokenNameFinderModel.class));
       
-      List<Span> names = new ArrayList<Span>();
+      List<Span[]> names = new ArrayList<Span[]>();
       
       for (String sentence[] : document) {
-        names.addAll(Arrays.asList(nameFinder.find(sentence)));
+        names.add(nameFinder.find(sentence));
       }
       
       return names;
     }
     finally {
-      ModelUtil.releaseService(modelService);
+      ServiceUtil.releaseService(modelService);
     }
   }
-  
-  // Just a hack to get arround cross domain issues in my test environment!
-  // Need to investigate how this should be done!
-  @OPTIONS
-  @Path("_findRawText")
-  public Response findRawTextOptions() {
-    System.out.println("Called options ...");
-    return Response.ok()
-        .header("Access-Control-Allow-Origin", "*")
-        .header("Access-Control-Allow-Headers", "Content-Type")
-        .header("Access-Control-Allow-Methods", "POST, GET, OPTIONS")
-        .build();
-  }
 
+  // TODO:
+  // User should pass a key for the models (e.g. default_eng)
   @POST
   @Consumes(MediaType.APPLICATION_JSON)
   @Produces(MediaType.APPLICATION_JSON)
   @Path("_findRawText")
-  public NameFinderDocument findRawText(String document) { // input could be a single string ... return contains everything!
-    
-    
-    System.out.println("Request: " + document);
-    String[][] tokenizedSentences = new String[][]{SimpleTokenizer.INSTANCE.tokenize(document)};
-    
-    // TODO: Fix this. User should be able to define this in blueprint!
-    
-    ServiceReference modelService = ModelUtil.getService(TokenNameFinderModel.class);
-      
-    try {
-      NameFinderME nameFinder = new NameFinderME(
-              ModelUtil.getModel(modelService, TokenNameFinderModel.class));
+  public NameFinderDocument findRawText(String document) {
 
-      List<Span[]> names = new ArrayList<Span[]>();
+    ServiceReference preprocessFactoryService = ServiceUtil.getServiceReference(RawTextNameFinderFactory.class);
+    
+    try {
+      // TODO: Pass a key here!
+      RawTextNameFinderFactory factory =
+              ServiceUtil.getService(preprocessFactoryService, RawTextNameFinderFactory.class);
       
-      for (String sentence[] : tokenizedSentences) {
-        names.add(nameFinder.find(sentence));
+      SentenceDetector sentDetect = factory.createSentenceDetector();
+      Tokenizer tokenizer = factory.createTokenizer();
+      
+      Span sentenceSpans[] = sentDetect.sentPosDetect(document);
+      
+      List<Span[]> tokenizedSentencesSpan = new ArrayList<Span[]>();
+      String[][] tokenizedSentences = new String[sentenceSpans.length][];
+      
+      for (int i = 0; i < sentenceSpans.length; i++) {
+        // tokenizePos() offsets are relative to the sentence text, so cut the tokens from it.
+        String sentenceText = sentenceSpans[i].getCoveredText(document).toString();
+        Span tokenSpans[] = tokenizer.tokenizePos(sentenceText);
+        tokenizedSentencesSpan.add(tokenSpans);
+        String tokens[] = new String[tokenSpans.length];
+        for (int ti = 0; ti < tokenSpans.length; ti++) {
+          tokens[ti] = tokenSpans[ti].getCoveredText(sentenceText).toString();
+        }
+        tokenizedSentences[i] = tokens;
       }
       
-      return new NameFinderDocument(tokenizedSentences, names);
+      TokenNameFinder nameFinder = factory.createNameFinder();
+      
+      return new NameFinderDocument(tokenizedSentencesSpan, find(nameFinder, tokenizedSentences));
     }
     finally {
-      ModelUtil.releaseService(modelService);
+      ServiceUtil.releaseService(preprocessFactoryService);
     }
   }
 }

diff --git a/tagging-server/src/main/java/org/apache/opennlp/tagging_server/namefind/RawTextNameFinderFactory.java b/tagging-server/src/main/java/org/apache/opennlp/tagging_server/namefind/RawTextNameFinderFactory.java
new file mode 100644
index 0000000..a942a82
--- /dev/null
+++ b/tagging-server/src/main/java/org/apache/opennlp/tagging_server/namefind/RawTextNameFinderFactory.java

@@ -0,0 +1,44 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.opennlp.tagging_server.namefind;
+
+import opennlp.tools.namefind.TokenNameFinder;
+import opennlp.tools.sentdetect.SentenceDetector;
+import opennlp.tools.tokenize.Tokenizer;
+
+/**
+ * The raw text name finder factory supports the creation of all
+ * components which are needed to process raw text with the name finder.
+ */
+public interface RawTextNameFinderFactory {
+
+  /**
+   * @return the sentence detector which splits the raw text into sentences
+   */
+  SentenceDetector createSentenceDetector();
+
+  /**
+   * @return the tokenizer which splits a sentence into tokens
+   */
+  Tokenizer createTokenizer();
+
+  /**
+   * @return the name finder which is run on the tokenized sentences
+   */
+  TokenNameFinder createNameFinder();
+}
\ No newline at end of file

diff --git a/tagging-server/src/main/java/org/apache/opennlp/tagging_server/postag/POSTaggerResource.java b/tagging-server/src/main/java/org/apache/opennlp/tagging_server/postag/POSTaggerResource.java
index ed0b895..c0a3ad7 100644
--- a/tagging-server/src/main/java/org/apache/opennlp/tagging_server/postag/POSTaggerResource.java
+++ b/tagging-server/src/main/java/org/apache/opennlp/tagging_server/postag/POSTaggerResource.java

@@ -27,7 +27,7 @@
 import opennlp.tools.postag.POSTagger;
 import opennlp.tools.postag.POSTaggerME;
 
-import org.apache.opennlp.tagging_server.ModelUtil;
+import org.apache.opennlp.tagging_server.ServiceUtil;
 import org.osgi.framework.ServiceReference;
 
 @Path("/postagger")
@@ -41,10 +41,10 @@
   // @QueryParam("model") String modelName
   public String[][] tag(String document[][]) {
     
-    ServiceReference modelService = ModelUtil.getService(POSModel.class);
+    ServiceReference modelService = ServiceUtil.getServiceReference(POSModel.class);
     
     try {
-      POSTagger tagger = new POSTaggerME(ModelUtil.getModel(modelService, POSModel.class));
+      POSTagger tagger = new POSTaggerME(ServiceUtil.getService(modelService, POSModel.class));
       
       String[][] tags = new String[document.length][];
       
@@ -55,7 +55,7 @@
       return tags;
     }
     finally {
-      ModelUtil.releaseService(modelService);
+      ServiceUtil.releaseService(modelService);
     }
   }
 }

diff --git a/tagging-server/src/main/java/org/apache/opennlp/tagging_server/sentdetect/SentenceDetectorResource.java b/tagging-server/src/main/java/org/apache/opennlp/tagging_server/sentdetect/SentenceDetectorResource.java
new file mode 100644
index 0000000..75cdbee
--- /dev/null
+++ b/tagging-server/src/main/java/org/apache/opennlp/tagging_server/sentdetect/SentenceDetectorResource.java

@@ -0,0 +1,70 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.opennlp.tagging_server.sentdetect;
+
+import java.util.Arrays;
+import java.util.List;
+
+import javax.ws.rs.Consumes;
+import javax.ws.rs.POST;
+import javax.ws.rs.Path;
+import javax.ws.rs.Produces;
+import javax.ws.rs.core.MediaType;
+
+import opennlp.tools.sentdetect.SentenceDetector;
+import opennlp.tools.sentdetect.SentenceDetectorME;
+import opennlp.tools.sentdetect.SentenceModel;
+import opennlp.tools.util.Span;
+
+import org.apache.opennlp.tagging_server.ServiceUtil;
+import org.osgi.framework.ServiceReference;
+
+/**
+ * REST resource which runs the sentence detector, mounted at {@code /sentdetect}.
+ */
+@Path("/sentdetect")
+public class SentenceDetectorResource {
+
+  /**
+   * Detects the sentences of the posted document with the registered
+   * {@link SentenceModel}.
+   *
+   * @param document the text to split into sentences
+   * @return the sentence spans as returned by the sentence detector
+   */
+  @POST
+  @Path("_sentPosDetect")
+  @Consumes(MediaType.APPLICATION_JSON)
+  @Produces(MediaType.APPLICATION_JSON)
+  public List<Span> sentPosDetect(String document) {
+    ServiceReference sentenceModelRef = ServiceUtil.getServiceReference(SentenceModel.class);
+
+    try {
+      SentenceModel sentenceModel =
+          ServiceUtil.getService(sentenceModelRef, SentenceModel.class);
+      SentenceDetector detector = new SentenceDetectorME(sentenceModel);
+      Span[] sentences = detector.sentPosDetect(document);
+
+      return Arrays.asList(sentences);
+    }
+    finally {
+      // Hand the model service back whether or not detection succeeded.
+      ServiceUtil.releaseService(sentenceModelRef);
+    }
+  }
+}

diff --git a/tagging-server/src/main/java/org/apache/opennlp/tagging_server/tokenize/TokenizerResource.java b/tagging-server/src/main/java/org/apache/opennlp/tagging_server/tokenize/TokenizerResource.java
new file mode 100644
index 0000000..3a62a5f
--- /dev/null
+++ b/tagging-server/src/main/java/org/apache/opennlp/tagging_server/tokenize/TokenizerResource.java

@@ -0,0 +1,104 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.opennlp.tagging_server.tokenize;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import javax.ws.rs.Consumes;
+import javax.ws.rs.POST;
+import javax.ws.rs.Path;
+import javax.ws.rs.Produces;
+import javax.ws.rs.core.MediaType;
+
+import opennlp.tools.tokenize.TokenizerME;
+import opennlp.tools.tokenize.TokenizerModel;
+import opennlp.tools.util.Span;
+
+import org.apache.opennlp.tagging_server.ServiceUtil;
+import org.osgi.framework.ServiceReference;
+
+/**
+ * REST resource which runs the tokenizer, mounted at {@code /tokenize}.
+ */
+@Path("/tokenize")
+public class TokenizerResource {
+
+  /**
+   * Tokenizes every sentence of the posted document.
+   *
+   * @param document the sentences to tokenize, one string per sentence
+   * @return the tokens of each sentence, in the order of the input
+   */
+  @POST
+  @Path("_tokenize")
+  @Consumes(MediaType.APPLICATION_JSON)
+  @Produces(MediaType.APPLICATION_JSON)
+  public List<String[]> tokenize(String[] document) {
+    ServiceReference tokenizerModelRef = ServiceUtil.getServiceReference(TokenizerModel.class);
+
+    try {
+      TokenizerModel tokenizerModel =
+          ServiceUtil.getService(tokenizerModelRef, TokenizerModel.class);
+      TokenizerME tokenizer = new TokenizerME(tokenizerModel);
+
+      List<String[]> tokensPerSentence = new ArrayList<String[]>();
+      for (int i = 0; i < document.length; i++) {
+        tokensPerSentence.add(tokenizer.tokenize(document[i]));
+      }
+
+      return tokensPerSentence;
+    }
+    finally {
+      // Hand the model service back whether or not tokenization succeeded.
+      ServiceUtil.releaseService(tokenizerModelRef);
+    }
+  }
+
+  /**
+   * Tokenizes every sentence of the posted document and reports the token
+   * positions instead of the token strings.
+   *
+   * @param document the sentences to tokenize, one string per sentence
+   * @return the token spans of each sentence, in the order of the input
+   */
+  @POST
+  @Path("_tokenizePos")
+  @Consumes(MediaType.APPLICATION_JSON)
+  @Produces(MediaType.APPLICATION_JSON)
+  public List<Span[]> tokenizePos(String[] document) {
+    ServiceReference tokenizerModelRef = ServiceUtil.getServiceReference(TokenizerModel.class);
+
+    try {
+      TokenizerModel tokenizerModel =
+          ServiceUtil.getService(tokenizerModelRef, TokenizerModel.class);
+      TokenizerME tokenizer = new TokenizerME(tokenizerModel);
+
+      List<Span[]> spansPerSentence = new ArrayList<Span[]>();
+      for (int i = 0; i < document.length; i++) {
+        spansPerSentence.add(tokenizer.tokenizePos(document[i]));
+      }
+
+      return spansPerSentence;
+    }
+    finally {
+      // Hand the model service back whether or not tokenization succeeded.
+      ServiceUtil.releaseService(tokenizerModelRef);
+    }
+  }
+}
commit	a96dac5a4c1c5536ab468b4f93c991aabd40e115	[log] [tgz]
author	Jörn Kottmann <joern@apache.org>	Tue Jul 17 14:30:50 2012 +0000
committer	Jörn Kottmann <joern@apache.org>	Tue Jul 17 14:30:50 2012 +0000
tree	58e19f6526712eabe4c9e0af2db4b4c9736cbc52
parent	a1c86357b0472b3bda887f8a56c8ceafa41050cb [diff]