OPENNLP-791 WordNet based clusters patch, uses ME for now will have to modify for other classifiers. Thanks to Anthony Beylerian for providing a patch!

commit: d81166bb292063ab9e4c12348732ca263fb98718 [log] [tgz]
author: Jörn Kottmann <joern@apache.org> Wed Aug 26 15:56:53 2015 +0000
committer: Jörn Kottmann <joern@apache.org> Wed Aug 26 15:56:53 2015 +0000
tree: b3a750bc7daddb45c4d21673544ea6ba7debeace
parent: 558081800e61b8aff75dd78121de9f7999b351ee [diff]
diff --git a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDParameters.java b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDParameters.java
index f0bb765..136d5f2 100644
--- a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDParameters.java
+++ b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDParameters.java

@@ -62,7 +62,7 @@
   }
 
   public WSDParameters() {
-    this.isCoarseSense = true;
+    this.isCoarseSense = false;
   }
 
   /**

diff --git a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDisambiguator.java b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDisambiguator.java
index 9ef35d0..06451e5 100644
--- a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDisambiguator.java
+++ b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDisambiguator.java

@@ -75,8 +75,11 @@
    * @param ambiguousTokenIndex
    * @return result as an array of WordNet IDs
    */
-  public abstract String[] disambiguate(String[] tokenizedContext,
-      String[] tokenTags, String[] lemmas, int ambiguousTokenIndex);
+  public String[] disambiguate(String[] tokenizedContext,
+      String[] tokenTags, String[] lemmas, int ambiguousTokenIndex){
+	  return disambiguate(new WSDSample(tokenizedContext, tokenTags, lemmas,
+		        ambiguousTokenIndex));
+  }
 
   /**
    * The disambiguation method for all the words in a Span

diff --git a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/oscc/DefaultOSCCContextGenerator.java b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/oscc/DefaultOSCCContextGenerator.java
new file mode 100644
index 0000000..9584487
--- /dev/null
+++ b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/oscc/DefaultOSCCContextGenerator.java

@@ -0,0 +1,109 @@
+/*

+ * Licensed to the Apache Software Foundation (ASF) under one

+ * or more contributor license agreements.  See the NOTICE file

+ * distributed with this work for additional information

+ * regarding copyright ownership.  The ASF licenses this file

+ * to you under the Apache License, Version 2.0 (the

+ * "License"); you may not use this file except in compliance

+ * with the License.  You may obtain a copy of the License at

+ * 

+ *   http://www.apache.org/licenses/LICENSE-2.0

+ * 

+ * Unless required by applicable law or agreed to in writing,

+ * software distributed under the License is distributed on an

+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY

+ * KIND, either express or implied.  See the License for the

+ * specific language governing permissions and limitations

+ * under the License.

+ */

+

+package opennlp.tools.disambiguator.oscc;

+

+import java.util.ArrayList;

+import java.util.Arrays;

+import java.util.HashSet;

+

+import net.sf.extjwnl.data.Synset;

+import opennlp.tools.disambiguator.WSDHelper;

+import opennlp.tools.disambiguator.WSDSample;

+import opennlp.tools.disambiguator.WordPOS;

+

+/**

+ * The default context generator for OSCC.

+ */

+public class DefaultOSCCContextGenerator implements OSCCContextGenerator {

+

+  public DefaultOSCCContextGenerator() {

+  }

+

+  public String[] extractSurroundingContextClusters(int index, String[] toks,

+      String[] tags, String[] lemmas, int windowSize) {

+

+    ArrayList<String> contextClusters = new ArrayList<String>();

+

+    for (int i = 0; i < toks.length; i++) {

+      if (lemmas != null) {

+

+        if (!WSDHelper.stopWords.contains(toks[i].toLowerCase())

+            && (index != i)) {

+

+          String lemma = lemmas[i].toLowerCase().replaceAll("[^a-z_]", "")

+              .trim();

+          

+          WordPOS word = new WordPOS(lemma, tags[i]);

+

+          // TODO check fix for "_" and null pointers

+          if (lemma.length() > 1 && !lemma.contains("_")) {

+            try{

+            ArrayList<Synset> synsets = word.getSynsets();

+            if (synsets!=null && synsets.size() > 0 ){

+              contextClusters.add(synsets.get(0).getOffset() + "");

+            }

+            }catch(NullPointerException ex)

+            {

+              //TODO tagger mistake add proper exception

+            }

+          }

+

+        }

+      }

+    }

+

+    return contextClusters.toArray(new String[contextClusters.size()]);

+

+  }

+

+  /**

+   * Get Context of a word To disambiguate

+   * 

+   * @return The OSCC context of the word to disambiguate

+   */

+  @Override

+  public String[] getContext(int index, String[] toks, String[] tags,

+      String[] lemmas, int windowSize) {

+

+    HashSet<String> surroundingContextClusters = new HashSet<>();

+    surroundingContextClusters.addAll(Arrays

+        .asList(extractSurroundingContextClusters(index, toks, tags, lemmas,

+            windowSize)));

+

+    String[] serializedFeatures = new String[surroundingContextClusters.size()];

+

+    int i = 0;

+

+    for (String feature : surroundingContextClusters) {

+      serializedFeatures[i] = "F" + i + "=" + feature;

+      i++;

+    }

+

+    return serializedFeatures;

+

+  }

+

+  public String[] getContext(WSDSample sample, int windowSize) {

+

+    return getContext(sample.getTargetPosition(), sample.getSentence(),

+        sample.getTags(), sample.getLemmas(), windowSize);

+  }

+

+}


diff --git a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/oscc/OSCCContextGenerator.java b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/oscc/OSCCContextGenerator.java
new file mode 100644
index 0000000..9c0055f
--- /dev/null
+++ b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/oscc/OSCCContextGenerator.java

@@ -0,0 +1,33 @@
+/*

+ * Licensed to the Apache Software Foundation (ASF) under one

+ * or more contributor license agreements.  See the NOTICE file

+ * distributed with this work for additional information

+ * regarding copyright ownership.  The ASF licenses this file

+ * to you under the Apache License, Version 2.0 (the

+ * "License"); you may not use this file except in compliance

+ * with the License.  You may obtain a copy of the License at

+ * 

+ *   http://www.apache.org/licenses/LICENSE-2.0

+ * 

+ * Unless required by applicable law or agreed to in writing,

+ * software distributed under the License is distributed on an

+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY

+ * KIND, either express or implied.  See the License for the

+ * specific language governing permissions and limitations

+ * under the License.

+ */

+

+package opennlp.tools.disambiguator.oscc;

+

+import opennlp.tools.disambiguator.WSDSample;

+

+/**

+ * Interface for {@link OSCCME} context generators.

+ */

+public interface OSCCContextGenerator {

+

+  String[] getContext(int index, String[] toks, String[] tags, String[] lemmas,

+    int windowSize);

+

+  String[] getContext(WSDSample sample, int windowSize);

+}


diff --git a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/oscc/OSCCFactory.java b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/oscc/OSCCFactory.java
new file mode 100644
index 0000000..e9cdecb
--- /dev/null
+++ b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/oscc/OSCCFactory.java

@@ -0,0 +1,62 @@
+/*

+ * Licensed to the Apache Software Foundation (ASF) under one or more

+ * contributor license agreements.  See the NOTICE file distributed with

+ * this work for additional information regarding copyright ownership.

+ * The ASF licenses this file to You under the Apache License, Version 2.0

+ * (the "License"); you may not use this file except in compliance with

+ * the License. You may obtain a copy of the License at

+ *

+ *     http://www.apache.org/licenses/LICENSE-2.0

+ *

+ * Unless required by applicable law or agreed to in writing, software

+ * distributed under the License is distributed on an "AS IS" BASIS,

+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

+ * See the License for the specific language governing permissions and

+ * limitations under the License.

+ */

+

+package opennlp.tools.disambiguator.oscc;

+

+import opennlp.tools.util.BaseToolFactory;

+import opennlp.tools.util.InvalidFormatException;

+import opennlp.tools.util.ext.ExtensionLoader;

+

+public class OSCCFactory extends BaseToolFactory {

+

+  /**

+   * Creates a {@link OSCCFactory} that provides the default implementation of

+   * the resources.

+   * */

+  public OSCCFactory() {

+

+  }

+

+  public static OSCCFactory create(String subclassName)

+      throws InvalidFormatException {

+    if (subclassName == null) {

+      // will create the default factory

+      return new OSCCFactory();

+    }

+    try {

+      OSCCFactory theFactory = ExtensionLoader.instantiateExtension(

+          OSCCFactory.class, subclassName);

+      return theFactory;

+    } catch (Exception e) {

+      String msg = "Could not instantiate the " + subclassName

+          + ". The initialization throw an exception.";

+      System.err.println(msg);

+      e.printStackTrace();

+      throw new InvalidFormatException(msg, e);

+    }

+  }

+

+  @Override

+  public void validateArtifactMap() throws InvalidFormatException {

+    // no additional artifacts

+  }

+

+  public OSCCContextGenerator getContextGenerator() {

+    return new DefaultOSCCContextGenerator();

+  }

+

+}


diff --git a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/oscc/OSCCME.java b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/oscc/OSCCME.java
new file mode 100644
index 0000000..1bb3410
--- /dev/null
+++ b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/oscc/OSCCME.java

@@ -0,0 +1,208 @@
+/*

+ * Licensed to the Apache Software Foundation (ASF) under one or more

+ * contributor license agreements.  See the NOTICE file distributed with

+ * this work for additional information regarding copyright ownership.

+ * The ASF licenses this file to You under the Apache License, Version 2.0

+ * (the "License"); you may not use this file except in compliance with

+ * the License. You may obtain a copy of the License at

+ *

+ *     http://www.apache.org/licenses/LICENSE-2.0

+ *

+ * Unless required by applicable law or agreed to in writing, software

+ * distributed under the License is distributed on an "AS IS" BASIS,

+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

+ * See the License for the specific language governing permissions and

+ * limitations under the License.

+ */

+

+package opennlp.tools.disambiguator.oscc;

+

+import java.io.File;

+import java.io.IOException;

+import java.util.ArrayList;

+import java.util.HashMap;

+

+import junit.framework.Assert;

+import opennlp.tools.disambiguator.WSDHelper;

+import opennlp.tools.disambiguator.WSDSample;

+import opennlp.tools.disambiguator.WSDisambiguator;

+import opennlp.tools.disambiguator.mfs.MFS;

+import opennlp.tools.ml.EventTrainer;

+import opennlp.tools.ml.TrainerFactory;

+import opennlp.tools.ml.model.MaxentModel;

+import opennlp.tools.ml.model.Event;

+import opennlp.tools.util.InvalidFormatException;

+import opennlp.tools.util.ObjectStream;

+import opennlp.tools.util.ObjectStreamUtils;

+import opennlp.tools.util.TrainingParameters;

+

+public class OSCCME extends WSDisambiguator {

+

+  protected OSCCModel osccModel;

+

+  protected static OSCCContextGenerator cg = new DefaultOSCCContextGenerator();

+

+  public OSCCME(OSCCParameters params) {

+    this.params = params;

+  }

+

+  public OSCCME(OSCCModel model, OSCCParameters params) {

+    this.osccModel = model; // store the supplied model (the field used to be assigned to itself)

+    this.params = params;

+    // The model and the parameters must agree on the window size.

+    Assert.assertEquals(model.getWindowSize(), params.getWindowSize());

+  }

+

+  public void setModel(OSCCModel model) {

+    this.osccModel = model;

+  }

+

+  public void setParameters(OSCCParameters parameters) {

+    this.params = parameters;

+  }

+

+  public static OSCCModel train(String lang, ObjectStream<WSDSample> samples,

+      TrainingParameters mlParams, OSCCParameters osccParams,

+      OSCCFactory imsfactory) throws IOException {

+    // Builds one event per sample (first sense ID as outcome) and trains a single model from them.

+    HashMap<String, String> manifestInfoEntries = new HashMap<String, String>();

+

+    MaxentModel osccModel = null;

+

+    ArrayList<Event> events = new ArrayList<Event>();

+    ObjectStream<Event> es = null;

+

+    WSDSample sample = samples.read();

+    String wordTag = "";

+    if (sample != null) {

+      wordTag = sample.getTargetWordTag();

+      do {

+

+        String sense = sample.getSenseIDs().get(0);

+

+        String[] context = cg.getContext(sample, osccParams.windowSize);

+        Event ev = new Event(sense + "", context);

+

+        events.add(ev);

+

+      } while ((sample = samples.read()) != null);

+      // Wrap the collected events once, after reading; es stays null when no sample was read.

+      es = ObjectStreamUtils.createObjectStream(events);

+    }

+

+    EventTrainer trainer = TrainerFactory.getEventTrainer(

+        mlParams.getSettings(), manifestInfoEntries);

+    osccModel = trainer.train(es);

+

+    return new OSCCModel(lang, wordTag, osccParams.windowSize, osccModel, manifestInfoEntries, imsfactory);

+  }

+

+

+  @Override

+  public String[] disambiguate(WSDSample sample) {

+    if (WSDHelper.isRelevantPOSTag(sample.getTargetTag())) {

+      String wordTag = sample.getTargetWordTag();

+      // The model path is the training data directory followed by the word tag.

+      String trainingFile = ((OSCCParameters) this.getParams())

+          .getTrainingDataDirectory() + sample.getTargetWordTag();

+

+      if (osccModel == null

+          || !osccModel.getWordTag().equals(sample.getTargetWordTag())) {

+        // No model loaded for this word tag: use the suffix written by OSCCModel.writeModel.

+        File file = new File(trainingFile + ".oscc.model");

+        if (file.exists() && !file.isDirectory()) {

+          try {

+            setModel(new OSCCModel(file));

+

+          } catch (InvalidFormatException e) {

+            e.printStackTrace(); // unreadable model: report it, then fall back to MFS

+            return new MFS().disambiguate(wordTag);

+          } catch (IOException e) {

+            e.printStackTrace(); // loading failed: report it, then fall back to MFS

+            return new MFS().disambiguate(wordTag);

+          }

+

+          String outcome = "";

+

+          String[] context = cg.getContext(sample,

+              ((OSCCParameters) this.params).windowSize);

+

+          double[] outcomeProbs = osccModel.getOSCCMaxentModel().eval(context);

+          outcome = osccModel.getOSCCMaxentModel().getBestOutcome(outcomeProbs);

+

+          if (outcome != null && !outcome.equals("")) {

+

+            outcome = this.getParams().getSenseSource().name() + " "

+                + wordTag.split("\\.")[0] + "%" + outcome;

+

+            String[] s = { outcome };

+

+            return s;

+          } else {

+            MFS mfs = new MFS();

+            return mfs.disambiguate(wordTag);

+          }

+

+        } else {

+

+          MFS mfs = new MFS();

+          return mfs.disambiguate(wordTag);

+        }

+      } else {

+        String outcome = "";

+

+        String[] context = cg.getContext(sample,

+            ((OSCCParameters) this.params).windowSize);

+

+        double[] outcomeProbs = osccModel.getOSCCMaxentModel().eval(context);

+        outcome = osccModel.getOSCCMaxentModel().getBestOutcome(outcomeProbs);

+

+        if (outcome != null && !outcome.equals("")) {

+

+          outcome = this.getParams().getSenseSource().name() + " "

+              + wordTag.split("\\.")[0] + "%" + outcome;

+

+          String[] s = { outcome };

+

+          return s;

+        } else {

+

+          MFS mfs = new MFS();

+          return mfs.disambiguate(wordTag);

+        }

+      }

+    } else {

+      // Non-relevant POS tags are answered from WSDHelper when it has a definition for the tag.

+      if (WSDHelper.getNonRelevWordsDef(sample.getTargetTag()) != null) {

+        String s = OSCCParameters.SenseSource.WSDHELPER.name() + " "

+            + sample.getTargetTag();

+        String[] sense = { s };

+        return sense;

+      } else {

+        return null;

+      }

+

+    }

+

+  }

+

+  /**

+   * The OSCC disambiguation method for a single word

+   * 

+   * @param tokenizedContext

+   *          : the text containing the word to disambiguate

+   * @param tokenTags

+   *          : the tags corresponding to the context

+   * @param lemmas

+   *          : the lemmas of ALL the words in the context

+   * @param index

+   *          : the index of the word to disambiguate

+   * @return an array of the senses of the word to disambiguate

+   */

+  public String[] disambiguate(String[] tokenizedContext, String[] tokenTags,

+      String[] lemmas, int index) {

+    return disambiguate(new WSDSample(tokenizedContext, tokenTags, lemmas,

+        index));

+  }

+

+}


diff --git a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/oscc/OSCCModel.java b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/oscc/OSCCModel.java
new file mode 100644
index 0000000..f3b28ab
--- /dev/null
+++ b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/oscc/OSCCModel.java

@@ -0,0 +1,155 @@
+/*

+ * Licensed to the Apache Software Foundation (ASF) under one or more

+ * contributor license agreements.  See the NOTICE file distributed with

+ * this work for additional information regarding copyright ownership.

+ * The ASF licenses this file to You under the Apache License, Version 2.0

+ * (the "License"); you may not use this file except in compliance with

+ * the License. You may obtain a copy of the License at

+ *

+ *     http://www.apache.org/licenses/LICENSE-2.0

+ *

+ * Unless required by applicable law or agreed to in writing, software

+ * distributed under the License is distributed on an "AS IS" BASIS,

+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

+ * See the License for the specific language governing permissions and

+ * limitations under the License.

+ */

+

+package opennlp.tools.disambiguator.oscc;

+

+import java.io.File;

+import java.io.IOException;

+import java.io.InputStream;

+import java.util.ArrayList;

+import java.util.Map;

+import java.util.Properties;

+import java.net.URL;

+

+import org.apache.commons.lang3.StringUtils;

+

+import opennlp.tools.cmdline.CmdLineUtil;

+import opennlp.tools.ml.model.AbstractModel;

+import opennlp.tools.ml.model.MaxentModel;

+import opennlp.tools.util.BaseToolFactory;

+import opennlp.tools.util.InvalidFormatException;

+import opennlp.tools.util.model.BaseModel;

+

+public class OSCCModel extends BaseModel {

+

+  private static final String COMPONENT_NAME = "OSCCME";

+  private static final String OSCC_MODEL_ENTRY_NAME = "OSCC.model";

+

+  private static final String WORDTAG = "wordtag";

+  private static final String WINSIZE = "winsize";

+  private static final String CONTEXTCLUSTERS = "contextclusters";

+

+  //private ArrayList<String> contextClusters = new ArrayList<String>();

+  private String wordTag;

+  private int windowSize;

+

+  /*public ArrayList<String> getContextClusters() {

+    return contextClusters;

+  }*/

+

+  public int getWindowSize() {

+    return windowSize;

+  }

+

+  public void setWindowSize(int windowSize) {

+    this.windowSize = windowSize;

+  }

+

+ /* public void setContextClusters(ArrayList<String> contextClusters) {

+    this.contextClusters = contextClusters;

+  }*/

+

+  public String getWordTag() {

+    return wordTag;

+  }

+

+  public void setWordTag(String wordTag) {

+    this.wordTag = wordTag;

+  }

+

+   public OSCCModel(String languageCode, String wordTag, int windowSize,

+   MaxentModel osccModel,

+      Map<String, String> manifestInfoEntries, OSCCFactory factory) {

+    super(COMPONENT_NAME, languageCode, manifestInfoEntries, factory);

+

+    artifactMap.put(OSCC_MODEL_ENTRY_NAME, osccModel);

+    this.setManifestProperty(WORDTAG, wordTag);

+    this.setManifestProperty(WINSIZE, windowSize + "");

+    

+//    this.setManifestProperty(CONTEXTCLUSTERS,

+//        StringUtils.join(contextClusters, ","));

+

+    //this.contextClusters = contextClusters;

+    checkArtifactMap();

+  }

+

+  public OSCCModel(String languageCode, String wordTag, int windowSize,

+      int ngram, MaxentModel osccModel, 

+      OSCCFactory factory) {

+    this(languageCode, wordTag, windowSize, osccModel,

+        null, factory);

+  }

+

+  public OSCCModel(InputStream in) throws IOException, InvalidFormatException {

+    super(COMPONENT_NAME, in);

+    updateAttributes();

+  }

+

+  public OSCCModel(File modelFile) throws IOException, InvalidFormatException {

+    super(COMPONENT_NAME, modelFile);

+    updateAttributes();

+  }

+

+  public OSCCModel(URL modelURL) throws IOException, InvalidFormatException {

+    super(COMPONENT_NAME, modelURL);

+    updateAttributes();

+  }

+

+  // path must include the word.tag, e.g. write.v

+  public boolean writeModel(String path) {

+    File outFile = new File(path + ".oscc.model");

+    CmdLineUtil.writeModel("oscc model", outFile, this);

+    return true;

+  }

+

+  @Override

+  protected void validateArtifactMap() throws InvalidFormatException {

+    super.validateArtifactMap();

+

+    if (!(artifactMap.get(OSCC_MODEL_ENTRY_NAME) instanceof AbstractModel)) {

+      throw new InvalidFormatException("OSCC model is incomplete!");

+    }

+  }

+

+  public MaxentModel getOSCCMaxentModel() {

+    if (artifactMap.get(OSCC_MODEL_ENTRY_NAME) instanceof MaxentModel) {

+      return (MaxentModel) artifactMap.get(OSCC_MODEL_ENTRY_NAME);

+    } else {

+      return null;

+    }

+  }

+

+  public void updateAttributes() {

+    Properties manifest = (Properties) artifactMap.get(MANIFEST_ENTRY);

+    //String contextClusters = (String) manifest.get(CONTEXTCLUSTERS);

+

+   /* this.contextClusters = new ArrayList(

+        Arrays.asList(contextClusters.split(",")));*/

+    this.wordTag = (String) manifest.get(WORDTAG);

+    this.windowSize = Integer.parseInt((String) manifest.get(WINSIZE));

+  }

+

+  @Override

+  protected Class<? extends BaseToolFactory> getDefaultFactory() {

+    return OSCCFactory.class;

+  }

+

+  public OSCCFactory getFactory() {

+    return (OSCCFactory) this.toolFactory;

+  }

+

+}
\ No newline at end of file

diff --git a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/oscc/OSCCParameters.java b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/oscc/OSCCParameters.java
new file mode 100644
index 0000000..42a7742
--- /dev/null
+++ b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/oscc/OSCCParameters.java

@@ -0,0 +1,117 @@
+/*

+ * Licensed to the Apache Software Foundation (ASF) under one

+ * or more contributor license agreements.  See the NOTICE file

+ * distributed with this work for additional information

+ * regarding copyright ownership.  The ASF licenses this file

+ * to you under the Apache License, Version 2.0 (the

+ * "License"); you may not use this file except in compliance

+ * with the License.  You may obtain a copy of the License at

+ * 

+ *   http://www.apache.org/licenses/LICENSE-2.0

+ * 

+ * Unless required by applicable law or agreed to in writing,

+ * software distributed under the License is distributed on an

+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY

+ * KIND, either express or implied.  See the License for the

+ * specific language governing permissions and limitations

+ * under the License.

+ */

+

+package opennlp.tools.disambiguator.oscc;

+

+import java.io.File;

+

+import opennlp.tools.disambiguator.WSDParameters;

+

+/**

+ * This class contains the parameters for the OSCC approach as well as the

+ * directories containing the files used

+ */

+public class OSCCParameters extends WSDParameters {

+

+  protected String languageCode;

+  protected int windowSize;

+  protected String trainingDataDirectory;

+

+  protected static final int DFLT_WIN_SIZE = 3;

+  protected static final String DFLT_LANG_CODE = "En";

+  protected static final SenseSource DFLT_SOURCE = SenseSource.WORDNET;

+

+  /**

+   * This constructor takes three parameters. The default language used is

+   * <i>English</i>

+   * 

+   * @param windowSize

+   *          the size of the window used for the extraction of the features

+   *          qualified of Surrounding Context Clusters

+   * 

+   * @param senseSource

+   *          the source of the training data

+   */

+  public OSCCParameters(int windowSize, SenseSource senseSource,

+      String trainingDataDirectory) {

+    this.languageCode = DFLT_LANG_CODE;

+    this.windowSize = windowSize;

+    this.senseSource = senseSource;

+    this.trainingDataDirectory = trainingDataDirectory;

+    this.isCoarseSense = false;

+

+    File folder = new File(trainingDataDirectory);

+    if (!folder.exists())

+      folder.mkdirs();

+  }

+

+  public OSCCParameters(String trainingDataDirectory) {

+    this(DFLT_WIN_SIZE, DFLT_SOURCE, trainingDataDirectory);

+

+    File folder = new File(trainingDataDirectory);

+    if (!folder.exists())

+      folder.mkdirs();

+  }

+

+  public OSCCParameters() {

+    // TODO change the "" into null ??

+    this(DFLT_WIN_SIZE, DFLT_SOURCE, "");

+  }

+

+  public OSCCParameters(int windowSize) {

+    // TODO change the "" into null ??

+    this(windowSize, DFLT_SOURCE, "");

+  }

+

+  public String getLanguageCode() {

+    return languageCode;

+  }

+

+  public void setLanguageCode(String languageCode) {

+    this.languageCode = languageCode;

+  }

+

+  public int getWindowSize() {

+    return windowSize;

+  }

+

+  public void setWindowSize(int windowSize) {

+    this.windowSize = windowSize;

+  }

+

+  public OSCCContextGenerator createContextGenerator() {

+

+    return new DefaultOSCCContextGenerator();

+  }

+

+  public String getTrainingDataDirectory() {

+    return trainingDataDirectory;

+  }

+

+  public void setTrainingDataDirectory(String trainingDataDirectory) {

+    this.trainingDataDirectory = trainingDataDirectory;

+  }

+

+  @Override

+  public boolean isValid() {

+    // TODO make validity check

+    return true;

+  }

+

+}


diff --git a/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/OSCCEvaluatorTest.java b/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/OSCCEvaluatorTest.java
new file mode 100644
index 0000000..c9723fa
--- /dev/null
+++ b/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/OSCCEvaluatorTest.java

@@ -0,0 +1,75 @@
+/*

+ * Licensed to the Apache Software Foundation (ASF) under one

+ * or more contributor license agreements.  See the NOTICE file

+ * distributed with this work for additional information

+ * regarding copyright ownership.  The ASF licenses this file

+ * to you under the Apache License, Version 2.0 (the

+ * "License"); you may not use this file except in compliance

+ * with the License.  You may obtain a copy of the License at

+ * 

+ *   http://www.apache.org/licenses/LICENSE-2.0

+ * 

+ * Unless required by applicable law or agreed to in writing,

+ * software distributed under the License is distributed on an

+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY

+ * KIND, either express or implied.  See the License for the

+ * specific language governing permissions and limitations

+ * under the License.

+ */

+

+package opennlp.tools.disambiguator;

+

+import java.util.ArrayList;

+

+import opennlp.tools.disambiguator.datareader.SensevalReader;

+import opennlp.tools.disambiguator.oscc.OSCCME;

+import opennlp.tools.disambiguator.oscc.OSCCParameters;

+

+import org.junit.Test;

+

+public class OSCCEvaluatorTest {

+

+  static SensevalReader seReader = new SensevalReader();

+

+  // Run through main(); not annotated with @Test because JUnit 4 rejects static or parameterised test methods.

+  public static void main(String[] args) {

+    // Evaluates OSCC on every non-verb Senseval word and prints the evaluator output per word.

+

+    WSDHelper.print("Evaluation Started");

+

+    // TODO write unit test

+    String modelsDir = "src\\test\\resources\\models\\";

+    WSDHelper.loadTokenizer(modelsDir + "en-token.bin");

+    WSDHelper.loadLemmatizer(modelsDir + "en-lemmatizer.dict");

+    WSDHelper.loadTagger(modelsDir + "en-pos-maxent.bin");

+

+    OSCCParameters OSCCParams = new OSCCParameters("");

+    OSCCME oscc = new OSCCME(OSCCParams);

+

+    ArrayList<String> words = seReader.getSensevalWords();

+

+    for (String word : words) {

+      WSDEvaluator evaluator = new WSDEvaluator(oscc);

+

+      // don't take verbs because they are not from WordNet

+      if (!word.split("\\.")[1].equals("v")) {

+

+        ArrayList<WSDSample> instances = seReader.getSensevalData(word);

+        if (instances != null) {

+          WSDHelper.print("------------------" + word + "------------------");

+          for (WSDSample instance : instances) {

+            if (instance.getSenseIDs() != null

+                && !instance.getSenseIDs().get(0).equals("null")) {

+              evaluator.evaluateSample(instance);

+            }

+          }

+          WSDHelper.print(evaluator.toString());

+        } else {

+          WSDHelper.print("null instances");

+        }

+      }

+

+    }

+

+  }

+}


diff --git a/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/OSCCTester.java b/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/OSCCTester.java
new file mode 100644
index 0000000..ec6377d
--- /dev/null
+++ b/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/OSCCTester.java

@@ -0,0 +1,116 @@
+/*

+ * Licensed to the Apache Software Foundation (ASF) under one

+ * or more contributor license agreements.  See the NOTICE file

+ * distributed with this work for additional information

+ * regarding copyright ownership.  The ASF licenses this file

+ * to you under the Apache License, Version 2.0 (the

+ * "License"); you may not use this file except in compliance

+ * with the License.  You may obtain a copy of the License at

+ * 

+ *   http://www.apache.org/licenses/LICENSE-2.0

+ * 

+ * Unless required by applicable law or agreed to in writing,

+ * software distributed under the License is distributed on an

+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY

+ * KIND, either express or implied.  See the License for the

+ * specific language governing permissions and limitations

+ * under the License.

+ */

+

+package opennlp.tools.disambiguator;

+

+import java.io.File;

+import java.io.IOException;

+import java.util.ArrayList;

+import java.util.List;

+

+import opennlp.tools.disambiguator.datareader.SemcorReaderExtended;

+import opennlp.tools.disambiguator.oscc.OSCCFactory;

+import opennlp.tools.disambiguator.oscc.OSCCME;

+import opennlp.tools.disambiguator.oscc.OSCCModel;

+import opennlp.tools.disambiguator.oscc.OSCCParameters;

+import opennlp.tools.util.ObjectStream;

+import opennlp.tools.util.Span;

+import opennlp.tools.util.TrainingParameters;

+

+public class OSCCTester {

+

+  public static void main(String[] args) {

+

+    SemcorReaderExtended sr = new SemcorReaderExtended();

+

+    String modelsDir = "src\\test\\resources\\models\\";

+    WSDHelper.loadTokenizer(modelsDir + "en-token.bin");

+    WSDHelper.loadLemmatizer(modelsDir + "en-lemmatizer.dict");

+    WSDHelper.loadTagger(modelsDir + "en-pos-maxent.bin");

+

+    String test = "write.v";

+    TrainingParameters trainingParams = new TrainingParameters();

+    OSCCParameters OSCCParams = new OSCCParameters("");

+    OSCCFactory OSCCFactory = new OSCCFactory();

+

+    ObjectStream<WSDSample> sampleStream = sr.getSemcorDataStream(test);

+

+    OSCCModel model = null;

+    OSCCModel readModel = null;

+    try {

+      model = OSCCME.train("en", sampleStream, trainingParams, OSCCParams,

+          OSCCFactory);

+      model.writeModel(test);

+      File outFile = new File(test + ".OSCC.model");

+      readModel = new OSCCModel(outFile);

+

+    } catch (IOException e1) {

+      // TODO Auto-generated catch block

+      e1.printStackTrace();

+    }

+    OSCCME OSCC = new OSCCME(readModel, OSCCParams);

+

+    /**

+     * This is how to make the context for one-word-disambiguation using OSCC

+     */

+    String test1 = "We need to discuss important topic, please write to me soon.";

+    String[] sentence1 = WSDHelper.getTokenizer().tokenize(test1);

+    String[] tags1 = WSDHelper.getTagger().tag(sentence1);

+    List<String> tempLemmas1 = new ArrayList<String>();

+    for (int i = 0; i < sentence1.length; i++) {

+      String lemma = WSDHelper.getLemmatizer()

+          .lemmatize(sentence1[i], tags1[i]);

+      tempLemmas1.add(lemma);

+    }

+    String[] lemmas1 = tempLemmas1.toArray(new String[tempLemmas1.size()]);

+

+    // output

+    String[] senses1 = OSCC.disambiguate(sentence1, tags1, lemmas1, 8);

+    System.out.print(lemmas1[8] + " :\t");

+    WSDHelper.print(senses1);

+    WSDHelper.print("*****************************");

+

+    /**

+     * This is how to make the context for disambiguation of span of words

+     */

+    String test2 = "The component was highly radioactive to the point that"

+        + " it has been activated the second it touched water";

+    String[] sentence2 = WSDHelper.getTokenizer().tokenize(test2);

+    String[] tags2 = WSDHelper.getTagger().tag(sentence2);

+    List<String> tempLemmas2 = new ArrayList<String>();

+    for (int i = 0; i < sentence2.length; i++) {

+      String lemma = WSDHelper.getLemmatizer()

+          .lemmatize(sentence2[i], tags2[i]);

+      tempLemmas2.add(lemma);

+    }

+    String[] lemmas2 = tempLemmas2.toArray(new String[tempLemmas2.size()]);

+    Span span = new Span(3, 7);

+

+    // output

+    List<String[]> senses2 = OSCC.disambiguate(sentence2, tags2, lemmas2, span);

+    for (int i = span.getStart(); i < span.getEnd() + 1; i++) {

+      String[] senses = senses2.get(i - span.getStart());

+      System.out.print(lemmas2[i] + " :\t");

+      WSDHelper.print(senses);

+      WSDHelper.print("----------");

+    }

+

+    WSDHelper.print("*****************************");

+  }

+}
\ No newline at end of file

diff --git a/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/Tester.java b/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/Tester.java
index 866fc4c..3adcd7d 100644
--- a/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/Tester.java
+++ b/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/Tester.java

@@ -1,39 +1,36 @@
 package opennlp.tools.disambiguator;
 
-import java.util.ArrayList;
-import java.util.List;
 
-import opennlp.tools.disambiguator.ims.IMS;
 
 public class Tester {
 
   public static void main(String[] args) {
-
-    String modelsDir = "src\\test\\resources\\models\\";
-    WSDHelper.loadTokenizer(modelsDir + "en-token.bin");
-    WSDHelper.loadLemmatizer(modelsDir + "en-lemmatizer.dict");
-    WSDHelper.loadTagger(modelsDir + "en-pos-maxent.bin");
-
-    IMS ims = new IMS();
-
-    String test3 = "The summer is almost over and I haven't been to the beach even once";
-    String[] sentence3 = WSDHelper.getTokenizer().tokenize(test3);
-    String[] tags3 = WSDHelper.getTagger().tag(sentence3);
-    List<String> tempLemmas3 = new ArrayList<String>();
-    for (int i = 0; i < sentence3.length; i++) {
-      String lemma = WSDHelper.getLemmatizer()
-          .lemmatize(sentence3[i], tags3[i]);
-      tempLemmas3.add(lemma);
-    }
-    String[] lemmas3 = tempLemmas3.toArray(new String[tempLemmas3.size()]);
-
-    // output
-    List<String[]> senses3 = ims.disambiguate(sentence3, tags3, lemmas3);
-    for (int i = 0; i < sentence3.length; i++) {
-      System.out.print(sentence3[i] + " : ");
-      WSDHelper.printResults(ims, senses3.get(i));
-      WSDHelper.print("----------");
-    }
+//
+//    String modelsDir = "src\\test\\resources\\models\\";
+//    WSDHelper.loadTokenizer(modelsDir + "en-token.bin");
+//    WSDHelper.loadLemmatizer(modelsDir + "en-lemmatizer.dict");
+//    WSDHelper.loadTagger(modelsDir + "en-pos-maxent.bin");
+//
+//    IMSME ims = new IMSME();
+//
+//    String test3 = "The summer is almost over and I haven't been to the beach even once";
+//    String[] sentence3 = WSDHelper.getTokenizer().tokenize(test3);
+//    String[] tags3 = WSDHelper.getTagger().tag(sentence3);
+//    List<String> tempLemmas3 = new ArrayList<String>();
+//    for (int i = 0; i < sentence3.length; i++) {
+//      String lemma = WSDHelper.getLemmatizer()
+//          .lemmatize(sentence3[i], tags3[i]);
+//      tempLemmas3.add(lemma);
+//    }
+//    String[] lemmas3 = tempLemmas3.toArray(new String[tempLemmas3.size()]);
+//
+//    // output
+//    List<String[]> senses3 = ims.disambiguate(sentence3, tags3, lemmas3);
+//    for (int i = 0; i < sentence3.length; i++) {
+//      System.out.print(sentence3[i] + " : ");
+//      WSDHelper.printResults(ims, senses3.get(i));
+//      WSDHelper.print("----------");
+//    }
 
   }
 }
\ No newline at end of file
commit	d81166bb292063ab9e4c12348732ca263fb98718	[log] [tgz]
author	Jörn Kottmann <joern@apache.org>	Wed Aug 26 15:56:53 2015 +0000
committer	Jörn Kottmann <joern@apache.org>	Wed Aug 26 15:56:53 2015 +0000
tree	b3a750bc7daddb45c4d21673544ea6ba7debeace
parent	558081800e61b8aff75dd78121de9f7999b351ee [diff]