OPENNLP-791 WordNet-based clusters patch. Uses ME for now; it will have to be modified for other classifiers. Thanks to Anthony Beylerian for providing the patch!
diff --git a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDParameters.java b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDParameters.java
index f0bb765..136d5f2 100644
--- a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDParameters.java
+++ b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDParameters.java
@@ -62,7 +62,7 @@
}
public WSDParameters() {
- this.isCoarseSense = true;
+ this.isCoarseSense = false;
}
/**
diff --git a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDisambiguator.java b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDisambiguator.java
index 9ef35d0..06451e5 100644
--- a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDisambiguator.java
+++ b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDisambiguator.java
@@ -75,8 +75,11 @@
* @param ambiguousTokenIndex
* @return result as an array of WordNet IDs
*/
- public abstract String[] disambiguate(String[] tokenizedContext,
- String[] tokenTags, String[] lemmas, int ambiguousTokenIndex);
+ public String[] disambiguate(String[] tokenizedContext,
+ String[] tokenTags, String[] lemmas, int ambiguousTokenIndex){
+ return disambiguate(new WSDSample(tokenizedContext, tokenTags, lemmas,
+ ambiguousTokenIndex));
+ }
/**
* The disambiguation method for all the words in a Span
diff --git a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/oscc/DefaultOSCCContextGenerator.java b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/oscc/DefaultOSCCContextGenerator.java
new file mode 100644
index 0000000..9584487
--- /dev/null
+++ b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/oscc/DefaultOSCCContextGenerator.java
@@ -0,0 +1,109 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package opennlp.tools.disambiguator.oscc;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashSet;
+
+import net.sf.extjwnl.data.Synset;
+import opennlp.tools.disambiguator.WSDHelper;
+import opennlp.tools.disambiguator.WSDSample;
+import opennlp.tools.disambiguator.WordPOS;
+
+/**
+ * The default Context Generator of OSCC
+ */
+public class DefaultOSCCContextGenerator implements OSCCContextGenerator {
+
+ public DefaultOSCCContextGenerator() {
+ }
+
+  public String[] extractSurroundingContextClusters(int index, String[] toks,
+      String[] tags, String[] lemmas, int windowSize) {
+    // Collects, for every usable context token, the offset of the first synset
+    // found for its lemma/tag pair. NOTE(review): windowSize is accepted but
+    // never read here, so the whole sentence is scanned - confirm intent.
+    ArrayList<String> clusterIds = new ArrayList<String>();
+    final int tokenCount = toks.length;
+
+    // Without lemmas nothing can be looked up; the result is simply empty.
+    if (lemmas == null) {
+      return clusterIds.toArray(new String[clusterIds.size()]);
+    }
+
+    for (int pos = 0; pos < tokenCount; pos++) {
+      // Skip stop words and the target token itself.
+      if (WSDHelper.stopWords.contains(toks[pos].toLowerCase()) || pos == index) {
+        continue;
+      }
+      String cleaned = lemmas[pos].toLowerCase().replaceAll("[^a-z_]", "").trim();
+      WordPOS candidate = new WordPOS(cleaned, tags[pos]);
+
+      // TODO check fix for "_" and null pointers
+      if (cleaned.length() <= 1 || cleaned.contains("_")) {
+        continue;
+      }
+      try {
+        ArrayList<Synset> senses = candidate.getSynsets();
+        if (senses != null && !senses.isEmpty()) {
+          clusterIds.add(String.valueOf(senses.get(0).getOffset()));
+        }
+      } catch (NullPointerException ex) {
+        // TODO tagger mistake add proper exception
+      }
+    }
+    return clusterIds.toArray(new String[clusterIds.size()]);
+  }
+
+ /**
+ * Get Context of a word To disambiguate
+ *
+ * @return The OSCC context of the word to disambiguate
+ */
+  @Override
+  public String[] getContext(int index, String[] toks, String[] tags,
+      String[] lemmas, int windowSize) {
+    // De-duplicate the cluster ids first; the set's iteration order then
+    // decides the running number that goes into each feature name.
+    String[] rawClusters = extractSurroundingContextClusters(index, toks, tags,
+        lemmas, windowSize);
+    HashSet<String> uniqueClusters = new HashSet<>();
+    uniqueClusters.addAll(Arrays.asList(rawClusters));
+
+    String[] features = new String[uniqueClusters.size()];
+    int next = 0;
+
+    for (String cluster : uniqueClusters) {
+      // Feature layout: F<running number>=<cluster id>
+      features[next] = "F" + next + "=" + cluster;
+      next++;
+    }
+
+    return features;
+  }
+
+ public String[] getContext(WSDSample sample, int windowSize) {
+
+ return getContext(sample.getTargetPosition(), sample.getSentence(),
+ sample.getTags(), sample.getLemmas(), windowSize);
+ }
+
+}
diff --git a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/oscc/OSCCContextGenerator.java b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/oscc/OSCCContextGenerator.java
new file mode 100644
index 0000000..9c0055f
--- /dev/null
+++ b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/oscc/OSCCContextGenerator.java
@@ -0,0 +1,33 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package opennlp.tools.disambiguator.oscc;
+
+import opennlp.tools.disambiguator.WSDSample;
+
+/**
+ * Interface for {@link OSCCME} context generators.
+ */
+public interface OSCCContextGenerator {
+ // Context features for the target token at position index of the given token/tag/lemma arrays.
+ String[] getContext(int index, String[] toks, String[] tags, String[] lemmas,
+ int windowSize);
+ // Context features for the target word described by the sample.
+ String[] getContext(WSDSample sample, int windowSize);
+}
diff --git a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/oscc/OSCCFactory.java b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/oscc/OSCCFactory.java
new file mode 100644
index 0000000..e9cdecb
--- /dev/null
+++ b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/oscc/OSCCFactory.java
@@ -0,0 +1,62 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.disambiguator.oscc;
+
+import opennlp.tools.util.BaseToolFactory;
+import opennlp.tools.util.InvalidFormatException;
+import opennlp.tools.util.ext.ExtensionLoader;
+
+public class OSCCFactory extends BaseToolFactory {
+
+ /**
+ * Creates a {@link OSCCFactory} that provides the default implementation of
+ * the resources.
+ * */
+ public OSCCFactory() {
+
+ }
+
+  public static OSCCFactory create(String subclassName)
+      throws InvalidFormatException {
+    // A null name selects the default factory; other names go to the loader.
+    if (subclassName != null) {
+      try {
+        return ExtensionLoader.instantiateExtension(OSCCFactory.class,
+            subclassName);
+      } catch (Exception cause) {
+        // Report on stderr, then rethrow wrapped with the cause attached.
+        String message = "Could not instantiate the " + subclassName
+            + ". The initialization throw an exception.";
+        System.err.println(message);
+        cause.printStackTrace();
+        throw new InvalidFormatException(message, cause);
+      }
+    }
+    return new OSCCFactory();
+  }
+
+ @Override
+ public void validateArtifactMap() throws InvalidFormatException {
+ // no additional artifacts
+ }
+
+ public OSCCContextGenerator getContextGenerator() {
+ return new DefaultOSCCContextGenerator();
+ }
+
+}
diff --git a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/oscc/OSCCME.java b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/oscc/OSCCME.java
new file mode 100644
index 0000000..1bb3410
--- /dev/null
+++ b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/oscc/OSCCME.java
@@ -0,0 +1,208 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.disambiguator.oscc;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
+
+import junit.framework.Assert;
+import opennlp.tools.disambiguator.WSDHelper;
+import opennlp.tools.disambiguator.WSDSample;
+import opennlp.tools.disambiguator.WSDisambiguator;
+import opennlp.tools.disambiguator.mfs.MFS;
+import opennlp.tools.ml.EventTrainer;
+import opennlp.tools.ml.TrainerFactory;
+import opennlp.tools.ml.model.MaxentModel;
+import opennlp.tools.ml.model.Event;
+import opennlp.tools.util.InvalidFormatException;
+import opennlp.tools.util.ObjectStream;
+import opennlp.tools.util.ObjectStreamUtils;
+import opennlp.tools.util.TrainingParameters;
+
+public class OSCCME extends WSDisambiguator {
+
+ protected OSCCModel osccModel;
+
+ protected static OSCCContextGenerator cg = new DefaultOSCCContextGenerator();
+
+ public OSCCME(OSCCParameters params) {
+ this.params = params;
+ }
+
+  public OSCCME(OSCCModel model, OSCCParameters params) {
+    this.osccModel = model; // keep the given model; was a no-op self-assignment
+    this.params = params;
+    // The model and the parameters must agree on the window size.
+    Assert.assertEquals(model.getWindowSize(), params.getWindowSize());
+  }
+
+ public void setModel(OSCCModel model) {
+ this.osccModel = model;
+ }
+
+ public void setParameters(OSCCParameters parameters) {
+ this.params = parameters;
+ }
+
+  public static OSCCModel train(String lang, ObjectStream<WSDSample> samples,
+      TrainingParameters mlParams, OSCCParameters osccParams,
+      OSCCFactory imsfactory) throws IOException {
+    // Turns every sample into one training event (outcome = first sense id,
+    // features = generated context), trains a model on all events and wraps
+    // it, together with the word tag of the first sample and the window
+    // size, into an OSCCModel.
+
+    HashMap<String, String> manifestInfoEntries = new HashMap<String, String>();
+    ArrayList<Event> events = new ArrayList<Event>();
+
+    WSDSample sample = samples.read();
+    if (sample == null) {
+      // An empty stream used to hand a null event stream to the trainer;
+      // fail with an explicit message instead.
+      throw new IOException("No training samples were provided.");
+    }
+    String wordTag = sample.getTargetWordTag();
+
+    do {
+      String sense = sample.getSenseIDs().get(0);
+      String[] context = cg.getContext(sample, osccParams.windowSize);
+      events.add(new Event(sense + "", context));
+    } while ((sample = samples.read()) != null);
+
+    // Build the event stream once, after all events have been collected.
+    ObjectStream<Event> es = ObjectStreamUtils.createObjectStream(events);
+
+    EventTrainer trainer = TrainerFactory.getEventTrainer(
+        mlParams.getSettings(), manifestInfoEntries);
+    MaxentModel osccModel = trainer.train(es);
+
+    return new OSCCModel(lang, wordTag, osccParams.windowSize, osccModel,
+        manifestInfoEntries, imsfactory);
+  }
+
+
+  /**
+   * Disambiguates the target word of a sample.
+   * <ol>
+   * <li>Targets whose POS tag is not relevant are answered from WSDHelper, or
+   * with null when WSDHelper has no definition for the tag.</li>
+   * <li>Otherwise the model for the target word.tag is used. When no model is
+   * set, or the current one belongs to another word.tag, it is loaded from
+   * &lt;trainingDataDirectory&gt;&lt;word.tag&gt;.oscc.model.</li>
+   * <li>When no model can be loaded, or the model yields no outcome,
+   * the MFS disambiguator answers instead.</li>
+   * </ol>
+   *
+   * @param sample
+   *          : the sample holding the word to disambiguate
+   * @return an array with the chosen sense, or null (see the first item)
+   */
+  @Override
+  public String[] disambiguate(WSDSample sample) {
+    // 1. Targets with a non-relevant POS tag never reach the model.
+    if (!WSDHelper.isRelevantPOSTag(sample.getTargetTag())) {
+      if (WSDHelper.getNonRelevWordsDef(sample.getTargetTag()) != null) {
+        String s = OSCCParameters.SenseSource.WSDHELPER.name() + " "
+            + sample.getTargetTag();
+        return new String[] { s };
+      }
+      return null;
+    }
+
+    // 2. Make sure the model at hand belongs to this word.tag.
+    String wordTag = sample.getTargetWordTag();
+
+    if (osccModel == null || !osccModel.getWordTag().equals(wordTag)) {
+      String trainingFile = ((OSCCParameters) this.getParams())
+          .getTrainingDataDirectory() + wordTag;
+
+      // Same suffix that OSCCModel.writeModel(String) appends. The previous
+      // ".ims.model" suffix never matched a file written by that method.
+      File file = new File(trainingFile + ".oscc.model");
+      if (!file.exists() || file.isDirectory()) {
+        MFS mfs = new MFS();
+        return mfs.disambiguate(wordTag);
+      }
+
+      try {
+        setModel(new OSCCModel(file));
+      } catch (IOException e) {
+        // InvalidFormatException is an IOException, so both load failures end
+        // up here. Fall back to MFS instead of evaluating a missing model or
+        // one that belongs to a different word.
+        e.printStackTrace();
+        MFS mfs = new MFS();
+        return mfs.disambiguate(wordTag);
+      }
+    }
+
+    return classify(sample, wordTag);
+  }
+
+  /**
+   * Evaluates the current model on the context of the sample.
+   *
+   * @param sample
+   *          : the sample holding the word to disambiguate
+   * @param wordTag
+   *          : the word.tag key of the target word
+   * @return a one-element array of the form
+   *         "&lt;sense source&gt; &lt;word&gt;%&lt;outcome&gt;", or the MFS
+   *         result when the model returns no outcome
+   */
+  private String[] classify(WSDSample sample, String wordTag) {
+    String[] context = cg.getContext(sample,
+        ((OSCCParameters) this.params).windowSize);
+
+    MaxentModel maxent = osccModel.getOSCCMaxentModel();
+    double[] outcomeProbs = maxent.eval(context);
+    String outcome = maxent.getBestOutcome(outcomeProbs);
+
+    if (outcome == null || outcome.equals("")) {
+      MFS mfs = new MFS();
+      return mfs.disambiguate(wordTag);
+    }
+
+    // Only the part of word.tag before the first dot goes into the result.
+    String sense = this.getParams().getSenseSource().name() + " "
+        + wordTag.split("\\.")[0] + "%" + outcome;
+    return new String[] { sense };
+  }
+
+ /**
+ * The OSCC disambiguation method for a single word
+ *
+ * @param tokenizedContext
+ * : the text containing the word to disambiguate
+ * @param tokenTags
+ * : the tags corresponding to the context
+ * @param lemmas
+ * : the lemmas of ALL the words in the context
+ * @param index
+ * : the index of the word to disambiguate
+ * @return an array of the senses of the word to disambiguate
+ */
+ public String[] disambiguate(String[] tokenizedContext, String[] tokenTags,
+ String[] lemmas, int index) {
+ return disambiguate(new WSDSample(tokenizedContext, tokenTags, lemmas,
+ index));
+ }
+
+}
diff --git a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/oscc/OSCCModel.java b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/oscc/OSCCModel.java
new file mode 100644
index 0000000..f3b28ab
--- /dev/null
+++ b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/oscc/OSCCModel.java
@@ -0,0 +1,155 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.disambiguator.oscc;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.Map;
+import java.util.Properties;
+import java.net.URL;
+
+import org.apache.commons.lang3.StringUtils;
+
+import opennlp.tools.cmdline.CmdLineUtil;
+import opennlp.tools.ml.model.AbstractModel;
+import opennlp.tools.ml.model.MaxentModel;
+import opennlp.tools.util.BaseToolFactory;
+import opennlp.tools.util.InvalidFormatException;
+import opennlp.tools.util.model.BaseModel;
+
+public class OSCCModel extends BaseModel {
+
+ private static final String COMPONENT_NAME = "OSCCME";
+ private static final String OSCC_MODEL_ENTRY_NAME = "OSCC.model";
+
+ private static final String WORDTAG = "wordtag";
+ private static final String WINSIZE = "winsize";
+ private static final String CONTEXTCLUSTERS = "contextclusters";
+
+ //private ArrayList<String> contextClusters = new ArrayList<String>();
+ private String wordTag;
+ private int windowSize;
+
+ /*public ArrayList<String> getContextClusters() {
+ return contextClusters;
+ }*/
+
+ public int getWindowSize() {
+ return windowSize;
+ }
+
+ public void setWindowSize(int windowSize) {
+ this.windowSize = windowSize;
+ }
+
+ /* public void setContextClusters(ArrayList<String> contextClusters) {
+ this.contextClusters = contextClusters;
+ }*/
+
+ public String getWordTag() {
+ return wordTag;
+ }
+
+ public void setWordTag(String wordTag) {
+ this.wordTag = wordTag;
+ }
+
+ public OSCCModel(String languageCode, String wordTag, int windowSize,
+ MaxentModel osccModel,
+ Map<String, String> manifestInfoEntries, OSCCFactory factory) {
+ super(COMPONENT_NAME, languageCode, manifestInfoEntries, factory);
+
+ artifactMap.put(OSCC_MODEL_ENTRY_NAME, osccModel);
+ this.setManifestProperty(WORDTAG, wordTag);
+ this.setManifestProperty(WINSIZE, windowSize + "");
+
+// this.setManifestProperty(CONTEXTCLUSTERS,
+// StringUtils.join(contextClusters, ","));
+
+ //this.contextClusters = contextClusters;
+ checkArtifactMap();
+ }
+
+ public OSCCModel(String languageCode, String wordTag, int windowSize,
+ int ngram, MaxentModel osccModel,
+ OSCCFactory factory) {
+ this(languageCode, wordTag, windowSize, osccModel,
+ null, factory);
+ }
+
+ public OSCCModel(InputStream in) throws IOException, InvalidFormatException {
+ super(COMPONENT_NAME, in);
+ updateAttributes();
+ }
+
+ public OSCCModel(File modelFile) throws IOException, InvalidFormatException {
+ super(COMPONENT_NAME, modelFile);
+ updateAttributes();
+ }
+
+ public OSCCModel(URL modelURL) throws IOException, InvalidFormatException {
+ super(COMPONENT_NAME, modelURL);
+ updateAttributes();
+ }
+
+ // path must include the word.tag i.e. : write.v
+ public boolean writeModel(String path) {
+ File outFile = new File(path + ".oscc.model");
+ CmdLineUtil.writeModel("oscc model", outFile, this);
+ return true;
+ }
+
+ @Override
+ protected void validateArtifactMap() throws InvalidFormatException {
+ super.validateArtifactMap();
+
+ if (!(artifactMap.get(OSCC_MODEL_ENTRY_NAME) instanceof AbstractModel)) {
+ throw new InvalidFormatException("OSCC model is incomplete!");
+ }
+ }
+
+  public MaxentModel getOSCCMaxentModel() {
+    // Look the artifact up once; anything that is not a MaxentModel
+    // (including a missing entry) is reported as null.
+    Object artifact = artifactMap.get(OSCC_MODEL_ENTRY_NAME);
+    boolean usable = artifact instanceof MaxentModel;
+    return usable ? (MaxentModel) artifact : null;
+  }
+
+ public void updateAttributes() {
+ Properties manifest = (Properties) artifactMap.get(MANIFEST_ENTRY);
+ // Restoring the context clusters is currently disabled:
+ // String contextClusters = (String) manifest.get(CONTEXTCLUSTERS);
+ // this.contextClusters = new ArrayList(Arrays.asList(contextClusters.split(",")));
+ // Copy the word tag and the window size out of the manifest properties.
+ this.wordTag = (String) manifest.get(WORDTAG);
+ this.windowSize = Integer.parseInt((String) manifest.get(WINSIZE));
+ }
+
+ @Override
+ protected Class<? extends BaseToolFactory> getDefaultFactory() {
+ return OSCCFactory.class;
+ }
+
+ public OSCCFactory getFactory() {
+ return (OSCCFactory) this.toolFactory;
+ }
+
+}
\ No newline at end of file
diff --git a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/oscc/OSCCParameters.java b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/oscc/OSCCParameters.java
new file mode 100644
index 0000000..42a7742
--- /dev/null
+++ b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/oscc/OSCCParameters.java
@@ -0,0 +1,117 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package opennlp.tools.disambiguator.oscc;
+
+import java.io.File;
+
+import opennlp.tools.disambiguator.WSDParameters;
+
+/**
+ * This class contains the parameters for the OSCC approach as well as the
+ * directories containing the files used
+ */
+public class OSCCParameters extends WSDParameters {
+
+ protected String languageCode;
+ protected int windowSize;
+ protected String trainingDataDirectory;
+
+ protected static final int DFLT_WIN_SIZE = 3;
+ protected static final String DFLT_LANG_CODE = "En";
+ protected static final SenseSource DFLT_SOURCE = SenseSource.WORDNET;
+
+ /**
+ * This constructor takes three parameters. The default language used is
+ * <i>English</i>
+ *
+ * @param windowSize
+ * the size of the window used to extract the Surrounding Context Clusters
+ * @param senseSource
+ * the source of the training data
+ * @param trainingDataDirectory
+ * the directory of the training data; it is created when it does not exist
+ */
+ public OSCCParameters(int windowSize, SenseSource senseSource,
+ String trainingDataDirectory) {
+ this.languageCode = DFLT_LANG_CODE;
+ this.windowSize = windowSize;
+ this.senseSource = senseSource;
+ this.trainingDataDirectory = trainingDataDirectory;
+ this.isCoarseSense = false;
+
+ File folder = new File(trainingDataDirectory);
+ if (!folder.exists())
+ folder.mkdirs();
+ }
+
+  public OSCCParameters(String trainingDataDirectory) {
+    // Uses the default window size and sense source with the given
+    // directory. The delegated constructor already creates the directory
+    // when it is missing, so the former second check-and-create step was
+    // redundant and is not repeated here.
+    this(DFLT_WIN_SIZE, DFLT_SOURCE, trainingDataDirectory);
+  }
+
+ public OSCCParameters() {
+ // TODO change the "" into null ??
+ this(DFLT_WIN_SIZE, DFLT_SOURCE, "");
+ }
+
+ public OSCCParameters(int windowSize) {
+ // TODO change the "" into null ??
+ this(windowSize, DFLT_SOURCE, "");
+ }
+
+ public String getLanguageCode() {
+ return languageCode;
+ }
+
+ public void setLanguageCode(String languageCode) {
+ this.languageCode = languageCode;
+ }
+
+ public int getWindowSize() {
+ return windowSize;
+ }
+
+ public void setWindowSize(int windowSize) {
+ this.windowSize = windowSize;
+ }
+
+ public OSCCContextGenerator createContextGenerator() {
+
+ return new DefaultOSCCContextGenerator();
+ }
+
+ public String getTrainingDataDirectory() {
+ return trainingDataDirectory;
+ }
+
+ public void setTrainingDataDirectory(String trainingDataDirectory) {
+ this.trainingDataDirectory = trainingDataDirectory;
+ }
+
+ @Override
+ public boolean isValid() {
+ // TODO make validity check
+ return true;
+ }
+
+}
diff --git a/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/OSCCEvaluatorTest.java b/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/OSCCEvaluatorTest.java
new file mode 100644
index 0000000..c9723fa
--- /dev/null
+++ b/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/OSCCEvaluatorTest.java
@@ -0,0 +1,75 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package opennlp.tools.disambiguator;
+
+import java.util.ArrayList;
+
+import opennlp.tools.disambiguator.datareader.SensevalReader;
+import opennlp.tools.disambiguator.oscc.OSCCME;
+import opennlp.tools.disambiguator.oscc.OSCCParameters;
+
+import org.junit.Test;
+
+public class OSCCEvaluatorTest {
+
+ static SensevalReader seReader = new SensevalReader();
+
+ @Test
+ public static void main(String[] args) {
+ // NOTE(review): JUnit 4 expects @Test methods to be public, non-static and
+ // parameterless - confirm whether this should run as a test or as a main program.
+ WSDHelper.print("Evaluation Started");
+ // Evaluates OSCCME on every Senseval word except verbs and prints one report per word.
+ // TODO write unit test
+ String modelsDir = "src\\test\\resources\\models\\";
+ WSDHelper.loadTokenizer(modelsDir + "en-token.bin");
+ WSDHelper.loadLemmatizer(modelsDir + "en-lemmatizer.dict");
+ WSDHelper.loadTagger(modelsDir + "en-pos-maxent.bin");
+
+ OSCCParameters OSCCParams = new OSCCParameters("");
+ OSCCME oscc = new OSCCME(OSCCParams);
+
+ ArrayList<String> words = seReader.getSensevalWords();
+
+ for (String word : words) {
+ WSDEvaluator evaluator = new WSDEvaluator(oscc);
+
+ // don't take verbs because they are not from WordNet
+ if (!word.split("\\.")[1].equals("v")) {
+
+ ArrayList<WSDSample> instances = seReader.getSensevalData(word);
+ if (instances != null) {
+ WSDHelper.print("------------------" + word + "------------------");
+ for (WSDSample instance : instances) {
+ if (instance.getSenseIDs() != null
+ && !instance.getSenseIDs().get(0).equals("null")) {
+ evaluator.evaluateSample(instance);
+ }
+ }
+ WSDHelper.print(evaluator.toString());
+ } else {
+ WSDHelper.print("null instances");
+ }
+ }
+
+ }
+
+ }
+}
diff --git a/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/OSCCTester.java b/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/OSCCTester.java
new file mode 100644
index 0000000..ec6377d
--- /dev/null
+++ b/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/OSCCTester.java
@@ -0,0 +1,116 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package opennlp.tools.disambiguator;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import opennlp.tools.disambiguator.datareader.SemcorReaderExtended;
+import opennlp.tools.disambiguator.oscc.OSCCFactory;
+import opennlp.tools.disambiguator.oscc.OSCCME;
+import opennlp.tools.disambiguator.oscc.OSCCModel;
+import opennlp.tools.disambiguator.oscc.OSCCParameters;
+import opennlp.tools.util.ObjectStream;
+import opennlp.tools.util.Span;
+import opennlp.tools.util.TrainingParameters;
+
+public class OSCCTester {
+
+  public static void main(String[] args) {
+    // Trains an OSCC model for "write.v", writes it, reads it back and uses it.
+    SemcorReaderExtended sr = new SemcorReaderExtended();
+
+    String modelsDir = "src\\test\\resources\\models\\";
+    WSDHelper.loadTokenizer(modelsDir + "en-token.bin");
+    WSDHelper.loadLemmatizer(modelsDir + "en-lemmatizer.dict");
+    WSDHelper.loadTagger(modelsDir + "en-pos-maxent.bin");
+
+    String test = "write.v";
+    TrainingParameters trainingParams = new TrainingParameters();
+    OSCCParameters osccParams = new OSCCParameters("");
+    OSCCFactory osccFactory = new OSCCFactory();
+
+    ObjectStream<WSDSample> sampleStream = sr.getSemcorDataStream(test);
+
+    OSCCModel model = null;
+    OSCCModel readModel = null;
+    try {
+      model = OSCCME.train("en", sampleStream, trainingParams, osccParams,
+          osccFactory);
+      model.writeModel(test);
+      // writeModel appends ".oscc.model"; use the same case to find the file.
+      File outFile = new File(test + ".oscc.model");
+      readModel = new OSCCModel(outFile);
+    } catch (IOException e1) {
+      e1.printStackTrace();
+      return; // nothing below can run without a model
+    }
+    OSCCME oscc = new OSCCME(readModel, osccParams);
+
+    /**
+     * This is how to make the context for one-word-disambiguation using OSCC
+     */
+    String test1 = "We need to discuss important topic, please write to me soon.";
+    String[] sentence1 = WSDHelper.getTokenizer().tokenize(test1);
+    String[] tags1 = WSDHelper.getTagger().tag(sentence1);
+    List<String> tempLemmas1 = new ArrayList<String>();
+    for (int i = 0; i < sentence1.length; i++) {
+      String lemma = WSDHelper.getLemmatizer()
+          .lemmatize(sentence1[i], tags1[i]);
+      tempLemmas1.add(lemma);
+    }
+    String[] lemmas1 = tempLemmas1.toArray(new String[tempLemmas1.size()]);
+
+    // output
+    String[] senses1 = oscc.disambiguate(sentence1, tags1, lemmas1, 8);
+    System.out.print(lemmas1[8] + " :\t");
+    WSDHelper.print(senses1);
+    WSDHelper.print("*****************************");
+
+    /**
+     * This is how to make the context for disambiguation of span of words
+     */
+    String test2 = "The component was highly radioactive to the point that"
+        + " it has been activated the second it touched water";
+    String[] sentence2 = WSDHelper.getTokenizer().tokenize(test2);
+    String[] tags2 = WSDHelper.getTagger().tag(sentence2);
+    List<String> tempLemmas2 = new ArrayList<String>();
+    for (int i = 0; i < sentence2.length; i++) {
+      String lemma = WSDHelper.getLemmatizer()
+          .lemmatize(sentence2[i], tags2[i]);
+      tempLemmas2.add(lemma);
+    }
+    String[] lemmas2 = tempLemmas2.toArray(new String[tempLemmas2.size()]);
+    Span span = new Span(3, 7);
+
+    // output
+    List<String[]> senses2 = oscc.disambiguate(sentence2, tags2, lemmas2, span);
+    for (int i = span.getStart(); i < span.getEnd() + 1; i++) {
+      String[] senses = senses2.get(i - span.getStart());
+      System.out.print(lemmas2[i] + " :\t");
+      WSDHelper.print(senses);
+      WSDHelper.print("----------");
+    }
+
+    WSDHelper.print("*****************************");
+  }
+}
\ No newline at end of file
diff --git a/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/Tester.java b/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/Tester.java
index 866fc4c..3adcd7d 100644
--- a/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/Tester.java
+++ b/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/Tester.java
@@ -1,39 +1,36 @@
package opennlp.tools.disambiguator;
-import java.util.ArrayList;
-import java.util.List;
-import opennlp.tools.disambiguator.ims.IMS;
public class Tester {
public static void main(String[] args) {
-
- String modelsDir = "src\\test\\resources\\models\\";
- WSDHelper.loadTokenizer(modelsDir + "en-token.bin");
- WSDHelper.loadLemmatizer(modelsDir + "en-lemmatizer.dict");
- WSDHelper.loadTagger(modelsDir + "en-pos-maxent.bin");
-
- IMS ims = new IMS();
-
- String test3 = "The summer is almost over and I haven't been to the beach even once";
- String[] sentence3 = WSDHelper.getTokenizer().tokenize(test3);
- String[] tags3 = WSDHelper.getTagger().tag(sentence3);
- List<String> tempLemmas3 = new ArrayList<String>();
- for (int i = 0; i < sentence3.length; i++) {
- String lemma = WSDHelper.getLemmatizer()
- .lemmatize(sentence3[i], tags3[i]);
- tempLemmas3.add(lemma);
- }
- String[] lemmas3 = tempLemmas3.toArray(new String[tempLemmas3.size()]);
-
- // output
- List<String[]> senses3 = ims.disambiguate(sentence3, tags3, lemmas3);
- for (int i = 0; i < sentence3.length; i++) {
- System.out.print(sentence3[i] + " : ");
- WSDHelper.printResults(ims, senses3.get(i));
- WSDHelper.print("----------");
- }
+//
+// String modelsDir = "src\\test\\resources\\models\\";
+// WSDHelper.loadTokenizer(modelsDir + "en-token.bin");
+// WSDHelper.loadLemmatizer(modelsDir + "en-lemmatizer.dict");
+// WSDHelper.loadTagger(modelsDir + "en-pos-maxent.bin");
+//
+// IMSME ims = new IMSME();
+//
+// String test3 = "The summer is almost over and I haven't been to the beach even once";
+// String[] sentence3 = WSDHelper.getTokenizer().tokenize(test3);
+// String[] tags3 = WSDHelper.getTagger().tag(sentence3);
+// List<String> tempLemmas3 = new ArrayList<String>();
+// for (int i = 0; i < sentence3.length; i++) {
+// String lemma = WSDHelper.getLemmatizer()
+// .lemmatize(sentence3[i], tags3[i]);
+// tempLemmas3.add(lemma);
+// }
+// String[] lemmas3 = tempLemmas3.toArray(new String[tempLemmas3.size()]);
+//
+// // output
+// List<String[]> senses3 = ims.disambiguate(sentence3, tags3, lemmas3);
+// for (int i = 0; i < sentence3.length; i++) {
+// System.out.print(sentence3[i] + " : ");
+// WSDHelper.printResults(ims, senses3.get(i));
+// WSDHelper.print("----------");
+// }
}
}
\ No newline at end of file