OPENNLP-843 - removed the unnecessary files
diff --git a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/DefaultIMSContextGenerator.java b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/DefaultIMSContextGenerator.java
deleted file mode 100644
index 71b928e..0000000
--- a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/DefaultIMSContextGenerator.java
+++ /dev/null
@@ -1,165 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package opennlp.tools.disambiguator.ims;
-
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.HashSet;
-
-import opennlp.tools.disambiguator.WSDHelper;
-import opennlp.tools.disambiguator.WSDSample;
-import opennlp.tools.disambiguator.ims.WTDIMS;
-
-/**
- * The default Context Generator of IMS
- */
-// TODO remove this class later
-public class DefaultIMSContextGenerator implements IMSContextGenerator {
-
- public DefaultIMSContextGenerator() {
- }
-
- private String[] extractPosOfSurroundingWords(int index, String[] tags,
- int windowSize) {
-
- String[] windowTags = new String[2 * windowSize + 1];
-
- int j = 0;
-
- for (int i = index - windowSize; i < index + windowSize; i++) {
- if (i < 0 || i >= tags.length) {
- windowTags[j] = "null";
- } else {
- windowTags[j] = tags[i].toLowerCase();
- }
- j++;
- }
-
- return windowTags;
- }
-
- public String[] extractSurroundingWords(int index, String[] toks,
- String[] lemmas, int windowSize) {
-
- // TODO consider the windowSize
- ArrayList<String> contextWords = new ArrayList<String>();
-
- for (int i = 0; i < toks.length; i++) {
- if (lemmas != null) {
- if (!WSDHelper.stopWords.contains(toks[i].toLowerCase()) && (index
- != i)) {
-
- String lemma = lemmas[i].toLowerCase().replaceAll("[^a-z_]", "")
- .trim();
-
- if (lemma.length() > 1) {
- contextWords.add(lemma);
- }
-
- }
- }
- }
-
- return contextWords.toArray(new String[contextWords.size()]);
- }
-
- private String[] extractLocalCollocations(int index, String[] sentence,
- int ngram) {
- /**
- * Here the author used only 11 features of this type. the range was set to
- * 3 (bigrams extracted in a way that they are at max separated by 1 word).
- */
-
- ArrayList<String> localCollocations = new ArrayList<String>();
-
- for (int i = index - ngram; i <= index + ngram; i++) {
-
- if (!(i < 0 || i > sentence.length - 2)) {
- if ((i != index) && (i + 1 != index) && (i + 1 < index + ngram)) {
- String lc = sentence[i] + " " + sentence[i + 1];
- localCollocations.add(lc);
- }
- if ((i != index) && (i + 2 != index) && (i + 2 < index + ngram)) {
- String lc = sentence[i] + " " + sentence[i + 2];
- localCollocations.add(lc);
- }
- }
-
- }
- String[] res = new String[localCollocations.size()];
- res = localCollocations.toArray(new String[localCollocations.size()]);
-
- return res;
- }
-
- /**
- * Get Context of a word To disambiguate
- *
- * @return The IMS context of the word to disambiguate
- */
- @Override public String[] getContext(int index, String[] toks, String[] tags,
- String[] lemmas, int ngram, int windowSize, ArrayList<String> model) {
-
- String[] posOfSurroundingWords = extractPosOfSurroundingWords(index, toks,
- windowSize);
-
- HashSet<String> surroundingWords = new HashSet<>();
- surroundingWords.addAll(
- Arrays.asList(extractSurroundingWords(index, toks, lemmas, windowSize)));
-
- String[] localCollocations = extractLocalCollocations(index, toks, ngram);
-
- String[] serializedFeatures = new String[posOfSurroundingWords.length
- + localCollocations.length + model.size()];
-
- int i = 0;
-
- for (String feature : posOfSurroundingWords) {
- serializedFeatures[i] = "F" + i + "=" + feature;
- i++;
- }
-
- for (String feature : localCollocations) {
- serializedFeatures[i] = "F" + i + "=" + feature;
- i++;
- }
- for (String word : model) {
-
- if (surroundingWords.contains(word.toString())) {
- serializedFeatures[i] = "F" + i + "=1";
- } else {
- serializedFeatures[i] = "F" + i + "=0";
- }
- i++;
-
- }
-
- return serializedFeatures;
-
- }
-
- public String[] getContext(WSDSample sample, int ngram, int windowSize,
- ArrayList<String> model) {
-
- return getContext(sample.getTargetPosition(), sample.getSentence(),
- sample.getTags(), sample.getLemmas(), ngram, windowSize, model);
- }
-
-}
diff --git a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/DefaultIMSSequenceValidator.java b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/DefaultIMSSequenceValidator.java
deleted file mode 100644
index 535c30f..0000000
--- a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/DefaultIMSSequenceValidator.java
+++ /dev/null
@@ -1,54 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package opennlp.tools.disambiguator.ims;
-
-import opennlp.tools.util.SequenceValidator;
-
-// TODO remove this class later
-public class DefaultIMSSequenceValidator implements SequenceValidator<String> {
-
- private boolean validOutcome(String outcome, String prevOutcome) {
- if (outcome.startsWith("I-")) {
- if (prevOutcome == null) {
- return (false);
- } else {
- if (prevOutcome.equals("O")) {
- return (false);
- }
- if (!prevOutcome.substring(2).equals(outcome.substring(2))) {
- return (false);
- }
- }
- }
- return true;
- }
-
- protected boolean validOutcome(String outcome, String[] sequence) {
- String prevOutcome = null;
- if (sequence.length > 0) {
- prevOutcome = sequence[sequence.length - 1];
- }
- return validOutcome(outcome, prevOutcome);
- }
-
- public boolean validSequence(int i, String[] sequence, String[] s,
- String outcome) {
- return validOutcome(outcome, s);
- }
-
-}
\ No newline at end of file
diff --git a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/IMSContextGenerator.java b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/IMSContextGenerator.java
deleted file mode 100644
index 37405ef..0000000
--- a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/IMSContextGenerator.java
+++ /dev/null
@@ -1,37 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package opennlp.tools.disambiguator.ims;
-
-import java.util.ArrayList;
-
-import opennlp.tools.disambiguator.WSDSample;
-
-/**
- * Interface for {@link IMSME} context generators.
- */
-// TODO remove this class later
-public interface IMSContextGenerator {
-
- String[] getContext(int index, String[] toks, String[] tags, String[] lemmas,
- int ngram, int windowSize, ArrayList<String> model);
-
- String[] getContext(WSDSample sample, int ngram, int windowSize,
- ArrayList<String> model);
-}
diff --git a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/IMSFactory.java b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/IMSFactory.java
deleted file mode 100644
index a7bd2f4..0000000
--- a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/IMSFactory.java
+++ /dev/null
@@ -1,66 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package opennlp.tools.disambiguator.ims;
-
-import opennlp.tools.util.BaseToolFactory;
-import opennlp.tools.util.InvalidFormatException;
-import opennlp.tools.util.SequenceValidator;
-import opennlp.tools.util.ext.ExtensionLoader;
-
-// TODO remove this class later
-public class IMSFactory extends BaseToolFactory {
-
- /**
- * Creates a {@link IMSFactory} that provides the default implementation of
- * the resources.
- */
- public IMSFactory() {
-
- }
-
- public static IMSFactory create(String subclassName)
- throws InvalidFormatException {
- if (subclassName == null) {
- // will create the default factory
- return new IMSFactory();
- }
- try {
- IMSFactory theFactory = ExtensionLoader
- .instantiateExtension(IMSFactory.class, subclassName);
- return theFactory;
- } catch (Exception e) {
- String msg = "Could not instantiate the " + subclassName
- + ". The initialization throw an exception.";
- System.err.println(msg);
- e.printStackTrace();
- throw new InvalidFormatException(msg, e);
- }
- }
-
- @Override public void validateArtifactMap() throws InvalidFormatException {
- // no additional artifacts
- }
-
- public IMSContextGenerator getContextGenerator() {
- return new DefaultIMSContextGenerator();
- }
-
- public SequenceValidator<String> getSequenceValidator() {
- return new DefaultIMSSequenceValidator();
- }
-}
diff --git a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/IMSME.java b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/IMSME.java
deleted file mode 100644
index 1755b33..0000000
--- a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/IMSME.java
+++ /dev/null
@@ -1,228 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package opennlp.tools.disambiguator.ims;
-
-import java.io.File;
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.HashMap;
-
-import opennlp.tools.disambiguator.WSDHelper;
-import opennlp.tools.disambiguator.WSDSample;
-import opennlp.tools.disambiguator.WSDisambiguator;
-import opennlp.tools.disambiguator.MFS;
-import opennlp.tools.ml.EventTrainer;
-import opennlp.tools.ml.TrainerFactory;
-import opennlp.tools.ml.model.MaxentModel;
-import opennlp.tools.ml.model.Event;
-import opennlp.tools.util.InvalidFormatException;
-import opennlp.tools.util.ObjectStream;
-import opennlp.tools.util.ObjectStreamUtils;
-import opennlp.tools.util.TrainingParameters;
-
-public class IMSME extends WSDisambiguator {
-
- protected IMSModel imsModel;
-
- protected static IMSContextGenerator cg = new DefaultIMSContextGenerator();
-
- public IMSME(IMSParameters params) {
- this.params = params;
- }
-
- public IMSME(IMSModel model, IMSParameters params) {
- this.imsModel = model;
- this.params = params;
- }
-
- public IMSModel getModel() {
- return imsModel;
- }
-
- public void setModel(IMSModel model) {
- this.imsModel = model;
- }
-
- public void setParameters(IMSParameters parameters) {
- this.params = parameters;
- }
-
- public static IMSModel train(String lang, ObjectStream<WSDSample> samples,
- TrainingParameters mlParams, IMSParameters imsParams,
- IMSFactory imsfactory) throws IOException {
-
- ArrayList<String> surroundingWordModel = buildSurroundingWords(samples,
- imsParams.getWindowSize());
-
- HashMap<String, String> manifestInfoEntries = new HashMap<String, String>();
-
- MaxentModel imsModel = null;
-
- ArrayList<Event> events = new ArrayList<Event>();
- ObjectStream<Event> es = null;
-
- WSDSample sample = samples.read();
- String wordTag = "";
- if (sample != null) {
- wordTag = sample.getTargetWordTag();
- do {
-
- String sense = sample.getSenseIDs()[0];
-
- String[] context = cg.getContext(sample, imsParams.ngram,
- imsParams.windowSize, surroundingWordModel);
- Event ev = new Event(sense + "", context);
-
- events.add(ev);
-
- } while ((sample = samples.read()) != null);
- }
-
- es = ObjectStreamUtils.createObjectStream(events);
-
- EventTrainer trainer = TrainerFactory
- .getEventTrainer(mlParams.getSettings(), manifestInfoEntries);
- imsModel = trainer.train(es);
-
- return new IMSModel(lang, wordTag, imsParams.windowSize, imsParams.ngram,
- imsModel, surroundingWordModel, manifestInfoEntries, imsfactory);
- }
-
- public static ArrayList<String> buildSurroundingWords(
- ObjectStream<WSDSample> samples, int windowSize) throws IOException {
- DefaultIMSContextGenerator imsCG = new DefaultIMSContextGenerator();
- ArrayList<String> surroundingWordsModel = new ArrayList<String>();
- WSDSample sample;
- while ((sample = samples.read()) != null) {
- String[] words = imsCG.extractSurroundingWords(sample.getTargetPosition(),
- sample.getSentence(), sample.getLemmas(), windowSize);
-
- if (words.length > 0) {
- for (String word : words) {
- surroundingWordsModel.add(word);
- }
- }
- }
- samples.reset();
- return surroundingWordsModel;
- }
-
- @Override
- public String disambiguate(WSDSample sample) {
- if (WSDHelper.isRelevantPOSTag(sample.getTargetTag())) {
- String wordTag = sample.getTargetWordTag();
-
- if (imsModel == null
- || !imsModel.getWordTag().equals(sample.getTargetWordTag())) {
-
- String trainingFile = ((IMSParameters) this.getParams())
- .getTrainingDataDirectory() + sample.getTargetWordTag();
-
- File file = new File(trainingFile + ".ims.model");
- if (file.exists() && !file.isDirectory()) {
- try {
- setModel(new IMSModel(file));
-
- } catch (InvalidFormatException e) {
- // TODO Auto-generated catch block
- e.printStackTrace();
- } catch (IOException e) {
- // TODO Auto-generated catch block
- e.printStackTrace();
- }
-
- String outcome = "";
-
- String[] context = cg.getContext(sample,
- ((IMSParameters) this.params).ngram,
- ((IMSParameters) this.params).windowSize,
- imsModel.getSurroundingWords());
-
- double[] outcomeProbs = imsModel.getIMSMaxentModel().eval(context);
- outcome = imsModel.getIMSMaxentModel().getBestOutcome(outcomeProbs);
-
- if (outcome != null && !outcome.equals("")) {
-
- return this.getParams().getSenseSource().name() + " "
- + wordTag.split("\\.")[0] + "%" + outcome;
-
- } else {
- MFS mfs = new MFS();
- return mfs.disambiguate(wordTag);
- }
-
- } else {
- MFS mfs = new MFS();
- return mfs.disambiguate(wordTag);
- }
- } else {
-
- String outcome = "";
-
- String[] context = cg.getContext(sample,
- ((IMSParameters) this.params).ngram,
- ((IMSParameters) this.params).windowSize,
- imsModel.getSurroundingWords());
-
- double[] outcomeProbs = imsModel.getIMSMaxentModel().eval(context);
- outcome = imsModel.getIMSMaxentModel().getBestOutcome(outcomeProbs);
-
- if (outcome != null && !outcome.equals("")) {
-
- return this.getParams().getSenseSource().name() + " "
- + wordTag.split("\\.")[0] + "%" + outcome;
-
- } else {
-
- MFS mfs = new MFS();
- return mfs.disambiguate(wordTag);
- }
- }
- } else {
-
- if (WSDHelper.getNonRelevWordsDef(sample.getTargetTag()) != null) {
- return IMSParameters.SenseSource.WSDHELPER.name() + " "
- + sample.getTargetTag();
- } else {
- return null;
- }
-
- }
-
- }
-
- /**
- * The IMS disambiguation method for a single word
- *
- * @param tokenizedContext
- * : the text containing the word to disambiguate
- * @param tokenTags
- * : the tags corresponding to the context
- * @param lemmas
- * : the lemmas of ALL the words in the context
- * @param index
- * : the index of the word to disambiguate
- * @return an array of the senses of the word to disambiguate
- */
- public String disambiguate(String[] tokenizedContext, String[] tokenTags,
- String[] lemmas, int index) {
- return disambiguate(
- new WSDSample(tokenizedContext, tokenTags, lemmas, index));
- }
-
-}
diff --git a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/IMSModel.java b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/IMSModel.java
deleted file mode 100644
index 9bdfd45..0000000
--- a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/IMSModel.java
+++ /dev/null
@@ -1,178 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package opennlp.tools.disambiguator.ims;
-
-import java.io.File;
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.Map;
-import java.util.Properties;
-import java.net.URL;
-
-import org.apache.commons.lang3.StringUtils;
-
-import opennlp.tools.cmdline.CmdLineUtil;
-import opennlp.tools.ml.model.AbstractModel;
-import opennlp.tools.ml.model.MaxentModel;
-import opennlp.tools.ml.model.SequenceClassificationModel;
-import opennlp.tools.util.BaseToolFactory;
-import opennlp.tools.util.InvalidFormatException;
-import opennlp.tools.util.model.BaseModel;
-
-// TODO remove this class later
-public class IMSModel extends BaseModel {
-
- private static final String COMPONENT_NAME = "IMSME";
- private static final String IMS_MODEL_ENTRY_NAME = "IMS.model";
-
- private static final String WORDTAG = "wordtag";
- private static final String WINSIZE = "winsize";
- private static final String NGRAM = "ngram";
- private static final String SURROUNDINGS = "surroundings";
-
- private ArrayList<String> surroundingWords = new ArrayList<String>();
- private String wordTag;
-
- private int windowSize;
- private int ngram;
-
- public ArrayList<String> getSurroundingWords() {
- return surroundingWords;
- }
-
- public int getWindowSize() {
- return windowSize;
- }
-
- public void setWindowSize(int windowSize) {
- this.windowSize = windowSize;
- }
-
- public int getNgram() {
- return ngram;
- }
-
- public void setNgram(int ngram) {
- this.ngram = ngram;
- }
-
- public void setSurroundingWords(ArrayList<String> surroundingWords) {
- this.surroundingWords = surroundingWords;
- }
-
- public String getWordTag() {
- return wordTag;
- }
-
- public void setWordTag(String wordTag) {
- this.wordTag = wordTag;
- }
-
- public IMSModel(String languageCode, String wordTag, int windowSize,
- int ngram, MaxentModel imsModel, ArrayList<String> surroundingWords,
- Map<String, String> manifestInfoEntries, IMSFactory factory) {
- super(COMPONENT_NAME, languageCode, manifestInfoEntries, factory);
-
- artifactMap.put(IMS_MODEL_ENTRY_NAME, imsModel);
- this.setManifestProperty(WORDTAG, wordTag);
- this.setManifestProperty(WINSIZE, windowSize + "");
- this.setManifestProperty(NGRAM, ngram + "");
- this.setManifestProperty(SURROUNDINGS,
- StringUtils.join(surroundingWords, ","));
-
- this.surroundingWords = surroundingWords;
- checkArtifactMap();
- }
-
- public IMSModel(String languageCode, String wordTag, int windowSize,
- int ngram, MaxentModel imsModel, ArrayList<String> surroundingWords,
- IMSFactory factory) {
- this(languageCode, wordTag, windowSize, ngram, imsModel, surroundingWords,
- null, factory);
- }
-
- public IMSModel(InputStream in) throws IOException, InvalidFormatException {
- super(COMPONENT_NAME, in);
- updateAttributes();
- }
-
- public IMSModel(File modelFile) throws IOException, InvalidFormatException {
- super(COMPONENT_NAME, modelFile);
- updateAttributes();
- /*
- * String modelPath = modelFile.getPath(); String surrPath =
- * modelPath.substring(0, modelPath.length() - 6) + ".surr";
- *
- * ObjectInputStream ois = new ObjectInputStream(new BufferedInputStream(
- * new FileInputStream(surrPath))); try {
- * this.setSurroundingWords((ArrayList<String>) ois.readObject()); } catch
- * (ClassNotFoundException e) { // TODO Auto-generated catch block
- * e.printStackTrace(); } finally { ois.close(); }
- */
- }
-
- public IMSModel(URL modelURL) throws IOException, InvalidFormatException {
- super(COMPONENT_NAME, modelURL);
- updateAttributes();
- }
-
- // path must include the word.tag i.e. : write.v
- public boolean writeModel(String path) {
- File outFile = new File(path + ".ims.model");
- CmdLineUtil.writeModel("ims model", outFile, this);
- return true;
- }
-
- @Override protected void validateArtifactMap() throws InvalidFormatException {
- super.validateArtifactMap();
-
- if (!(artifactMap.get(IMS_MODEL_ENTRY_NAME) instanceof AbstractModel)) {
- throw new InvalidFormatException("IMS model is incomplete!");
- }
- }
-
- public MaxentModel getIMSMaxentModel() {
- if (artifactMap.get(IMS_MODEL_ENTRY_NAME) instanceof MaxentModel) {
- return (MaxentModel) artifactMap.get(IMS_MODEL_ENTRY_NAME);
- } else {
- return null;
- }
- }
-
- public void updateAttributes() {
- Properties manifest = (Properties) artifactMap.get(MANIFEST_ENTRY);
- String surroundings = (String) manifest.get(SURROUNDINGS);
-
- this.surroundingWords = new ArrayList(
- Arrays.asList(surroundings.split(",")));
- this.wordTag = (String) manifest.get(WORDTAG);
- this.windowSize = Integer.parseInt((String) manifest.get(WINSIZE));
- this.ngram = Integer.parseInt((String) manifest.get(NGRAM));
- }
-
- @Override protected Class<? extends BaseToolFactory> getDefaultFactory() {
- return IMSFactory.class;
- }
-
- public IMSFactory getFactory() {
- return (IMSFactory) this.toolFactory;
- }
-
-}
\ No newline at end of file
diff --git a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/IMSParameters.java b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/IMSParameters.java
deleted file mode 100644
index 6680335..0000000
--- a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/IMSParameters.java
+++ /dev/null
@@ -1,120 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package opennlp.tools.disambiguator.ims;
-
-import java.io.File;
-import java.security.InvalidParameterException;
-
-import opennlp.tools.disambiguator.WSDParameters;
-
-/**
- * This class contains the parameters for the IMS approach as well as the
- * directories containing the files used
- */
-// TODO remove this class later
-public class IMSParameters extends WSDParameters {
-
- protected String languageCode;
- protected int windowSize;
- protected int ngram;
-
- protected String trainingDataDirectory;
-
- protected static final int DFLT_WIN_SIZE = 3;
- protected static final int DFLT_NGRAM = 2;
- protected static final String DFLT_LANG_CODE = "En";
- protected static final SenseSource DFLT_SOURCE = SenseSource.WORDNET;
-
- /**
- * This constructor takes only two parameters. The default language used is
- * <i>English</i>
- *
- * @param windowSize the size of the window used for the extraction of the features
- * qualified of Surrounding Words
- * @param ngram the number words used for the extraction of features qualified of
- * Local Collocations
- * @param senseSource the source of the training data
- */
- public IMSParameters(int windowSize, int ngram, SenseSource senseSource,
- String trainingDataDirectory) {
-
- this.languageCode = DFLT_LANG_CODE;
- this.windowSize = windowSize;
- this.ngram = ngram;
- this.senseSource = senseSource;
- this.trainingDataDirectory = trainingDataDirectory;
-
- File folder = new File(trainingDataDirectory);
- if (!folder.exists())
- folder.mkdirs();
- }
-
- public IMSParameters(String trainingDataDirectory) {
- this(DFLT_WIN_SIZE, DFLT_NGRAM, DFLT_SOURCE, trainingDataDirectory);
- }
-
- public String getLanguageCode() {
- return languageCode;
- }
-
- public void setLanguageCode(String languageCode) {
- this.languageCode = languageCode;
- }
-
- public int getWindowSize() {
- return windowSize;
- }
-
- public void setWindowSize(int windowSize) {
- this.windowSize = windowSize;
- }
-
- public int getNgram() {
- return ngram;
- }
-
- public void setNgram(int ngram) {
- this.ngram = ngram;
- }
-
- void init() {
- }
-
- /**
- * Creates the context generator of IMS
- */
- public IMSContextGenerator createContextGenerator() {
- return new DefaultIMSContextGenerator();
- }
-
- public String getTrainingDataDirectory() {
- return trainingDataDirectory;
- }
-
- public void setTrainingDataDirectory(String trainingDataDirectory) {
- this.trainingDataDirectory = trainingDataDirectory;
- }
-
- @Override public boolean isValid() {
- // TODO recheck this pattern switch to maps
- return true;
- }
-
-}
diff --git a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/WTDIMS.java b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/WTDIMS.java
deleted file mode 100644
index 32bb5da..0000000
--- a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/ims/WTDIMS.java
+++ /dev/null
@@ -1,200 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package opennlp.tools.disambiguator.ims;
-
-import java.util.ArrayList;
-import java.util.List;
-
-import net.sf.extjwnl.data.POS;
-import opennlp.tools.disambiguator.WSDHelper;
-import opennlp.tools.disambiguator.WSDSample;
-
-public class WTDIMS {
-
- // Attributes related to the context
- protected String[] sentence;
- protected String[] posTags;
- protected String[] lemmas;
- protected int wordIndex;
- protected int sense;
- protected String[] senseIDs;
-
- // Attributes related to IMS features
- protected String[] posOfSurroundingWords;
- protected String[] surroundingWords;
- protected String[] localCollocations;
- protected String[] features;
-
- public WTDIMS(String[] sentence, String[] posTags, String[] lemmas,
- int wordIndex) {
- this.sentence = sentence;
- this.posTags = posTags;
- this.wordIndex = wordIndex;
- this.lemmas = lemmas;
- }
-
- public WTDIMS(String[] sentence, String[] posTags, String[] lemmas,
- int wordIndex, String[] senseIDs) {
- this.sentence = sentence;
- this.posTags = posTags;
- this.wordIndex = wordIndex;
- this.lemmas = lemmas;
- this.senseIDs = senseIDs;
-
- }
-
- public WTDIMS(String[] sentence, String[] posTags, String[] lemmas,
- String word, String[] senseIDs) {
- super();
-
- this.sentence = sentence;
- this.posTags = posTags;
- this.lemmas = lemmas;
-
- for (int i = 0; i < sentence.length; i++) {
- if (word.equals(sentence[i])) {
- this.wordIndex = i;
- break;
- }
- }
-
- this.senseIDs = senseIDs;
-
- }
-
- public WTDIMS(WSDSample sample) {
- this.sentence = sample.getSentence();
- this.posTags = sample.getTags();
- this.lemmas = sample.getLemmas();
- this.wordIndex = sample.getTargetPosition();
- this.senseIDs = sample.getSenseIDs();
-
- }
-
- public String[] getSentence() {
- return sentence;
- }
-
- public void setSentence(String[] sentence) {
- this.sentence = sentence;
- }
-
- public String[] getPosTags() {
- return posTags;
- }
-
- public void setPosTags(String[] posTags) {
- this.posTags = posTags;
- }
-
- public int getWordIndex() {
- return wordIndex;
- }
-
- public void setWordIndex(int wordIndex) {
- this.wordIndex = wordIndex;
- }
-
- public String[] getLemmas() {
- return lemmas;
- }
-
- public void setLemmas(String[] lemmas) {
- this.lemmas = lemmas;
- }
-
- public int getSense() {
- return sense;
- }
-
- public void setSense(int sense) {
- this.sense = sense;
- }
-
- public String[] getSenseIDs() {
- return senseIDs;
- }
-
- public void setSenseIDs(String[] senseIDs) {
- this.senseIDs = senseIDs;
- }
-
- public String getWord() {
- return this.getSentence()[this.getWordIndex()];
- }
-
- public String getWordTag() {
-
- String wordBaseForm = this.getLemmas()[this.getWordIndex()];
-
- String ref = "";
-
- if ((WSDHelper.getPOS(this.getPosTags()[this.getWordIndex()]) != null)) {
- if (WSDHelper.getPOS(this.getPosTags()[this.getWordIndex()])
- .equals(POS.VERB)) {
- ref = wordBaseForm + ".v";
- } else if (WSDHelper.getPOS(this.getPosTags()[this.getWordIndex()])
- .equals(POS.NOUN)) {
- ref = wordBaseForm + ".n";
- } else if (WSDHelper.getPOS(this.getPosTags()[this.getWordIndex()])
- .equals(POS.ADJECTIVE)) {
- ref = wordBaseForm + ".a";
- } else if (WSDHelper.getPOS(this.getPosTags()[this.getWordIndex()])
- .equals(POS.ADVERB)) {
- ref = wordBaseForm + ".r";
- }
- }
-
- return ref;
- }
-
- public String[] getPosOfSurroundingWords() {
- return posOfSurroundingWords;
- }
-
- public void setPosOfSurroundingWords(String[] posOfSurroundingWords) {
- this.posOfSurroundingWords = posOfSurroundingWords;
- }
-
- public String[] getSurroundingWords() {
- return surroundingWords;
- }
-
- public void setSurroundingWords(String[] surroundingWords) {
- this.surroundingWords = surroundingWords;
- }
-
- public String[] getLocalCollocations() {
- return localCollocations;
- }
-
- public void setLocalCollocations(String[] localCollocations) {
- this.localCollocations = localCollocations;
- }
-
- public String[] getFeatures() {
- return this.features;
- }
-
- public void setFeatures(String[] features) {
- this.features = features;
- }
-
-}
diff --git a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/oscc/DefaultOSCCContextGenerator.java b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/oscc/DefaultOSCCContextGenerator.java
deleted file mode 100644
index f7247c0..0000000
--- a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/oscc/DefaultOSCCContextGenerator.java
+++ /dev/null
@@ -1,112 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package opennlp.tools.disambiguator.oscc;
-
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.HashSet;
-
-import net.sf.extjwnl.data.Synset;
-import opennlp.tools.disambiguator.WSDHelper;
-import opennlp.tools.disambiguator.WSDSample;
-import opennlp.tools.disambiguator.WordPOS;
-
-/**
- * The default Context Generator of IMS
- */
-// TODO remove this class later
-public class DefaultOSCCContextGenerator implements OSCCContextGenerator {
-
- public DefaultOSCCContextGenerator() {
- }
-
- public String[] extractSurroundingContextClusters(int index, String[] toks,
- String[] tags, String[] lemmas, int windowSize) {
-
- ArrayList<String> contextClusters = new ArrayList<String>();
-
- for (int i = 0; i < toks.length; i++) {
- if (lemmas != null) {
-
- if (!WSDHelper.stopWords.contains(toks[i].toLowerCase()) && (index
- != i)) {
-
- String lemma = lemmas[i].toLowerCase().replaceAll("[^a-z_]", "")
- .trim();
-
- WordPOS word = new WordPOS(lemma, tags[i]);
-
- if (lemma.length() > 1) {
- try {
- ArrayList<Synset> synsets = word.getSynsets();
- if (synsets != null && synsets.size() > 0) {
- for (Synset syn : synsets) {
- contextClusters.add(syn.getOffset() + "");
- }
- }
- } catch (NullPointerException ex) {
- // TODO tagger mistake add proper exception
- }
- }
-
- }
- }
- }
-
- return contextClusters.toArray(new String[contextClusters.size()]);
-
- }
-
- /**
- * Get Context of a word To disambiguate
- *
- * @return The OSCC context of the word to disambiguate
- */
- @Override public String[] getContext(int index, String[] toks, String[] tags,
- String[] lemmas, int windowSize, ArrayList<String> model) {
-
- HashSet<String> surroundingContextClusters = new HashSet<>();
- surroundingContextClusters.addAll(Arrays.asList(
- extractSurroundingContextClusters(index, toks, tags, lemmas,
- windowSize)));
-
- String[] serializedFeatures = new String[model.size()];
-
- int i = 0;
- for (String word : model) {
- if (surroundingContextClusters.contains(word.toString())) {
- serializedFeatures[i] = "F" + i + "=1";
- } else {
- serializedFeatures[i] = "F" + i + "=0";
- }
- i++;
- }
-
- return serializedFeatures;
- }
-
- public String[] getContext(WSDSample sample, int windowSize,
- ArrayList<String> model) {
-
- return getContext(sample.getTargetPosition(), sample.getSentence(),
- sample.getTags(), sample.getLemmas(), windowSize, model);
- }
-
-}
diff --git a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/oscc/OSCCContextGenerator.java b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/oscc/OSCCContextGenerator.java
deleted file mode 100644
index fad17d5..0000000
--- a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/oscc/OSCCContextGenerator.java
+++ /dev/null
@@ -1,37 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package opennlp.tools.disambiguator.oscc;
-
-import java.util.ArrayList;
-
-import opennlp.tools.disambiguator.WSDSample;
-
-/**
- * Interface for {@link OSCCME} context generators.
- */
-// TODO remove this class later
-public interface OSCCContextGenerator {
-
- String[] getContext(int index, String[] toks, String[] tags, String[] lemmas,
- int windowSize, ArrayList<String> model);
-
- String[] getContext(WSDSample sample, int windowSize,
- ArrayList<String> model);
-}
diff --git a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/oscc/OSCCFactory.java b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/oscc/OSCCFactory.java
deleted file mode 100644
index 0f6ce53..0000000
--- a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/oscc/OSCCFactory.java
+++ /dev/null
@@ -1,62 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package opennlp.tools.disambiguator.oscc;
-
-import opennlp.tools.util.BaseToolFactory;
-import opennlp.tools.util.InvalidFormatException;
-import opennlp.tools.util.ext.ExtensionLoader;
-
-// TODO remove this class later
-public class OSCCFactory extends BaseToolFactory {
-
- /**
- * Creates a {@link OSCCFactory} that provides the default implementation of
- * the resources.
- */
- public OSCCFactory() {
-
- }
-
- public static OSCCFactory create(String subclassName)
- throws InvalidFormatException {
- if (subclassName == null) {
- // will create the default factory
- return new OSCCFactory();
- }
- try {
- OSCCFactory theFactory = ExtensionLoader
- .instantiateExtension(OSCCFactory.class, subclassName);
- return theFactory;
- } catch (Exception e) {
- String msg = "Could not instantiate the " + subclassName
- + ". The initialization throw an exception.";
- System.err.println(msg);
- e.printStackTrace();
- throw new InvalidFormatException(msg, e);
- }
- }
-
- @Override public void validateArtifactMap() throws InvalidFormatException {
- // no additional artifacts
- }
-
- public OSCCContextGenerator getContextGenerator() {
- return new DefaultOSCCContextGenerator();
- }
-
-}
diff --git a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/oscc/OSCCME.java b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/oscc/OSCCME.java
deleted file mode 100644
index f06f140..0000000
--- a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/oscc/OSCCME.java
+++ /dev/null
@@ -1,245 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package opennlp.tools.disambiguator.oscc;
-
-import java.io.File;
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.HashMap;
-
-import opennlp.tools.disambiguator.WSDHelper;
-import opennlp.tools.disambiguator.WSDSample;
-import opennlp.tools.disambiguator.WSDisambiguator;
-import opennlp.tools.disambiguator.MFS;
-import opennlp.tools.ml.EventTrainer;
-import opennlp.tools.ml.TrainerFactory;
-import opennlp.tools.ml.model.MaxentModel;
-import opennlp.tools.ml.model.Event;
-import opennlp.tools.util.InvalidFormatException;
-import opennlp.tools.util.ObjectStream;
-import opennlp.tools.util.ObjectStreamUtils;
-import opennlp.tools.util.TrainingParameters;
-
-/**
- * Maximum Entropy version of the <b>one sence per cluster</b> approach in
- *
- * http://nlp.cs.rpi.edu/paper/wsd.pdf
- *
- * The approach is a hybrid approach using unsupervised context clustering to
- * enhance disambiguation using a typical classifier.
- *
- * The context clusters are considered a group of words representing an enriched
- * context of a target word.
- *
- * The clusters can be formed by clustering techniques like K-means, or a
- * simpler version can use WordNet to get clusters simply from SynSets.
- *
- * Please see {@link DefaultOSCCContextGenerator}
- *
- * The approach finds the context clusters surrounding the target and uses a
- * classifier to judge on the best case.
- *
- * Here an ME classifier is used.
- *
- */
-public class OSCCME extends WSDisambiguator {
-
- protected OSCCModel osccModel;
-
- protected static OSCCContextGenerator cg = new DefaultOSCCContextGenerator();
-
- public OSCCME(OSCCParameters params) {
- this.params = params;
- }
-
- public OSCCME(OSCCModel model, OSCCParameters params) {
- this.osccModel = model;
- this.params = params;
- }
-
- public OSCCModel getModel() {
- return osccModel;
- }
-
- public void setModel(OSCCModel model) {
- this.osccModel = model;
- }
-
- public void setParameters(OSCCParameters parameters) {
- this.params = parameters;
- }
-
- public static OSCCModel train(String lang, ObjectStream<WSDSample> samples,
- TrainingParameters mlParams, OSCCParameters osccParams,
- OSCCFactory osccFactory) throws IOException {
-
- ArrayList<String> surroundingClusterModel = buildSurroundingClusters(
- samples, osccParams.getWindowSize());
-
- HashMap<String, String> manifestInfoEntries = new HashMap<String, String>();
-
- MaxentModel osccModel = null;
-
- ArrayList<Event> events = new ArrayList<Event>();
- ObjectStream<Event> es = null;
-
- WSDSample sample = samples.read();
- String wordTag = "";
- if (sample != null) {
- wordTag = sample.getTargetWordTag();
- do {
- String sense = sample.getSenseIDs()[0];
- String[] context = cg.getContext(sample, osccParams.windowSize,
- surroundingClusterModel);
- Event ev = new Event(sense + "", context);
- events.add(ev);
- } while ((sample = samples.read()) != null);
- }
-
- es = ObjectStreamUtils.createObjectStream(events);
- EventTrainer trainer = TrainerFactory
- .getEventTrainer(mlParams.getSettings(), manifestInfoEntries);
-
- osccModel = trainer.train(es);
-
- return new OSCCModel(lang, wordTag, osccParams.windowSize, osccModel,
- surroundingClusterModel, manifestInfoEntries, osccFactory);
- }
-
- public static ArrayList<String> buildSurroundingClusters(
- ObjectStream<WSDSample> samples, int windowSize) throws IOException {
- // TODO modify to clusters
- DefaultOSCCContextGenerator osccCG = new DefaultOSCCContextGenerator();
- ArrayList<String> surroundingWordsModel = new ArrayList<String>();
- WSDSample sample;
- while ((sample = samples.read()) != null) {
- String[] words = osccCG.extractSurroundingContextClusters(
- sample.getTargetPosition(), sample.getSentence(), sample.getTags(),
- sample.getLemmas(), windowSize);
-
- if (words.length > 0) {
- for (String word : words) {
- surroundingWordsModel.add(word);
- }
- }
- }
- samples.reset();
- return surroundingWordsModel;
- }
-
- @Override
- public String disambiguate(WSDSample sample) {
- if (WSDHelper.isRelevantPOSTag(sample.getTargetTag())) {
- String wordTag = sample.getTargetWordTag();
-
- if (osccModel == null
- || !osccModel.getWordTag().equals(sample.getTargetWordTag())) {
-
- String trainingFile = ((OSCCParameters) this.getParams())
- .getTrainingDataDirectory() + sample.getTargetWordTag();
-
- File file = new File(trainingFile + ".oscc.model");
- if (file.exists() && !file.isDirectory()) {
- try {
- setModel(new OSCCModel(file));
-
- } catch (InvalidFormatException e) {
- // TODO Auto-generated catch block
- e.printStackTrace();
- } catch (IOException e) {
- // TODO Auto-generated catch block
- e.printStackTrace();
- }
-
- String outcome = "";
-
- String[] context = cg.getContext(sample,
- ((OSCCParameters) this.params).windowSize,
- osccModel.getContextClusters());
-
- double[] outcomeProbs = osccModel.getOSCCMaxentModel().eval(context);
- outcome = osccModel.getOSCCMaxentModel().getBestOutcome(outcomeProbs);
-
- if (outcome != null && !outcome.equals("")) {
-
- return this.getParams().getSenseSource().name() + " "
- + wordTag.split("\\.")[0] + "%" + outcome;
-
- } else {
- MFS mfs = new MFS();
- return mfs.disambiguate(wordTag);
- }
-
- } else {
-
- MFS mfs = new MFS();
- return mfs.disambiguate(wordTag);
- }
- } else {
- String outcome = "";
-
- String[] context = cg.getContext(sample,
- ((OSCCParameters) this.params).windowSize,
- osccModel.getContextClusters());
-
- double[] outcomeProbs = osccModel.getOSCCMaxentModel().eval(context);
- outcome = osccModel.getOSCCMaxentModel().getBestOutcome(outcomeProbs);
-
- if (outcome != null && !outcome.equals("")) {
-
- return this.getParams().getSenseSource().name() + " "
- + wordTag.split("\\.")[0] + "%" + outcome;
- } else {
-
- MFS mfs = new MFS();
- return mfs.disambiguate(wordTag);
- }
- }
- } else {
-
- if (WSDHelper.getNonRelevWordsDef(sample.getTargetTag()) != null) {
- return OSCCParameters.SenseSource.WSDHELPER.name() + " "
- + sample.getTargetTag();
- } else {
- return null;
- }
-
- }
-
- }
-
- /**
- * The IMS disambiguation method for a single word
- *
- * @param tokenizedContext
- * : the text containing the word to disambiguate
- * @param tokenTags
- * : the tags corresponding to the context
- * @param lemmas
- * : the lemmas of ALL the words in the context
- * @param index
- * : the index of the word to disambiguate
- * @return an array of the senses of the word to disambiguate
- */
- public String disambiguate(String[] tokenizedContext, String[] tokenTags,
- String[] lemmas, int index) {
- return disambiguate(
- new WSDSample(tokenizedContext, tokenTags, lemmas, index));
- }
-
-}
diff --git a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/oscc/OSCCModel.java b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/oscc/OSCCModel.java
deleted file mode 100644
index 65495c2..0000000
--- a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/oscc/OSCCModel.java
+++ /dev/null
@@ -1,155 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package opennlp.tools.disambiguator.oscc;
-
-import java.io.File;
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.Map;
-import java.util.Properties;
-import java.net.URL;
-
-import org.apache.commons.lang3.StringUtils;
-
-import opennlp.tools.cmdline.CmdLineUtil;
-import opennlp.tools.ml.model.AbstractModel;
-import opennlp.tools.ml.model.MaxentModel;
-import opennlp.tools.util.BaseToolFactory;
-import opennlp.tools.util.InvalidFormatException;
-import opennlp.tools.util.model.BaseModel;
-
-// TODO remove this class later
-public class OSCCModel extends BaseModel {
-
- private static final String COMPONENT_NAME = "OSCCME";
- private static final String OSCC_MODEL_ENTRY_NAME = "OSCC.model";
-
- private static final String WORDTAG = "wordtag";
- private static final String WINSIZE = "winsize";
- private static final String CONTEXTCLUSTERS = "contextclusters";
-
- private ArrayList<String> contextClusters = new ArrayList<String>();
- private String wordTag;
- private int windowSize;
-
- public ArrayList<String> getContextClusters() {
- return contextClusters;
- }
-
- public int getWindowSize() {
- return windowSize;
- }
-
- public void setWindowSize(int windowSize) {
- this.windowSize = windowSize;
- }
-
- public void setContextClusters(ArrayList<String> contextClusters) {
- this.contextClusters = contextClusters;
- }
-
- public String getWordTag() {
- return wordTag;
- }
-
- public void setWordTag(String wordTag) {
- this.wordTag = wordTag;
- }
-
- public OSCCModel(String languageCode, String wordTag, int windowSize,
- MaxentModel osccModel, ArrayList<String> contextClusters,
- Map<String, String> manifestInfoEntries, OSCCFactory factory) {
- super(COMPONENT_NAME, languageCode, manifestInfoEntries, factory);
-
- artifactMap.put(OSCC_MODEL_ENTRY_NAME, osccModel);
- this.setManifestProperty(WORDTAG, wordTag);
- this.setManifestProperty(WINSIZE, windowSize + "");
-
- this.setManifestProperty(CONTEXTCLUSTERS,
- StringUtils.join(contextClusters, ","));
-
- this.contextClusters = contextClusters;
- checkArtifactMap();
- }
-
- public OSCCModel(String languageCode, String wordTag, int windowSize,
- int ngram, MaxentModel osccModel, ArrayList<String> contextClusters,
- OSCCFactory factory) {
- this(languageCode, wordTag, windowSize, osccModel, contextClusters, null,
- factory);
- }
-
- public OSCCModel(InputStream in) throws IOException, InvalidFormatException {
- super(COMPONENT_NAME, in);
- updateAttributes();
- }
-
- public OSCCModel(File modelFile) throws IOException, InvalidFormatException {
- super(COMPONENT_NAME, modelFile);
- updateAttributes();
- }
-
- public OSCCModel(URL modelURL) throws IOException, InvalidFormatException {
- super(COMPONENT_NAME, modelURL);
- updateAttributes();
- }
-
- // path must include the word.tag i.e. : write.v
- public boolean writeModel(String path) {
- File outFile = new File(path + ".oscc.model");
- CmdLineUtil.writeModel("oscc model", outFile, this);
- return true;
- }
-
- @Override protected void validateArtifactMap() throws InvalidFormatException {
- super.validateArtifactMap();
-
- if (!(artifactMap.get(OSCC_MODEL_ENTRY_NAME) instanceof AbstractModel)) {
- throw new InvalidFormatException("OSCC model is incomplete!");
- }
- }
-
- public MaxentModel getOSCCMaxentModel() {
- if (artifactMap.get(OSCC_MODEL_ENTRY_NAME) instanceof MaxentModel) {
- return (MaxentModel) artifactMap.get(OSCC_MODEL_ENTRY_NAME);
- } else {
- return null;
- }
- }
-
- public void updateAttributes() {
- Properties manifest = (Properties) artifactMap.get(MANIFEST_ENTRY);
- String contextClusters = (String) manifest.get(CONTEXTCLUSTERS);
-
- this.contextClusters = new ArrayList(
- Arrays.asList(contextClusters.split(",")));
- this.wordTag = (String) manifest.get(WORDTAG);
- this.windowSize = Integer.parseInt((String) manifest.get(WINSIZE));
- }
-
- @Override protected Class<? extends BaseToolFactory> getDefaultFactory() {
- return OSCCFactory.class;
- }
-
- public OSCCFactory getFactory() {
- return (OSCCFactory) this.toolFactory;
- }
-
-}
\ No newline at end of file
diff --git a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/oscc/OSCCParameters.java b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/oscc/OSCCParameters.java
deleted file mode 100644
index 3f0eb2c..0000000
--- a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/oscc/OSCCParameters.java
+++ /dev/null
@@ -1,111 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package opennlp.tools.disambiguator.oscc;
-
-import java.io.File;
-
-import opennlp.tools.disambiguator.WSDParameters;
-
-/**
- * This class contains the parameters for the OSCC approach as well as the
- * directories containing the files used
- */
-// TODO remove this class later
-public class OSCCParameters extends WSDParameters {
-
- protected String languageCode;
- protected int windowSize;
- protected String trainingDataDirectory;
-
- protected static final int DFLT_WIN_SIZE = 3;
- protected static final String DFLT_LANG_CODE = "En";
- protected static final SenseSource DFLT_SOURCE = SenseSource.WORDNET;
-
- /**
- * This constructor takes only two parameters. The default language used is
- * <i>English</i>
- *
- * @param windowSize the size of the window used for the extraction of the features
- * qualified of Surrounding Context Clusters
- * @param senseSource the source of the training data
- */
- public OSCCParameters(int windowSize, SenseSource senseSource,
- String trainingDataDirectory) {
- this.languageCode = DFLT_LANG_CODE;
- this.windowSize = windowSize;
- this.senseSource = senseSource;
- this.trainingDataDirectory = trainingDataDirectory;
-
- File folder = new File(trainingDataDirectory);
- if (!folder.exists())
- folder.mkdirs();
- }
-
- public OSCCParameters(String trainingDataDirectory) {
- this(DFLT_WIN_SIZE, DFLT_SOURCE, trainingDataDirectory);
-
- File folder = new File(trainingDataDirectory);
- if (!folder.exists())
- folder.mkdirs();
- }
-
- public OSCCParameters() {
- this(DFLT_WIN_SIZE, DFLT_SOURCE, null);
- }
-
- public OSCCParameters(int windowSize) {
- this(windowSize, DFLT_SOURCE, null);
- }
-
- public String getLanguageCode() {
- return languageCode;
- }
-
- public void setLanguageCode(String languageCode) {
- this.languageCode = languageCode;
- }
-
- public int getWindowSize() {
- return windowSize;
- }
-
- public void setWindowSize(int windowSize) {
- this.windowSize = windowSize;
- }
-
- public OSCCContextGenerator createContextGenerator() {
-
- return new DefaultOSCCContextGenerator();
- }
-
- public String getTrainingDataDirectory() {
- return trainingDataDirectory;
- }
-
- public void setTrainingDataDirectory(String trainingDataDirectory) {
- this.trainingDataDirectory = trainingDataDirectory;
- }
-
- @Override public boolean isValid() {
- // TODO make validity check
- return true;
- }
-
-}
diff --git a/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/IMSEvaluatorTest.java b/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/IMSEvaluatorTest.java
deleted file mode 100644
index c5e63cf..0000000
--- a/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/IMSEvaluatorTest.java
+++ /dev/null
@@ -1,75 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package opennlp.tools.disambiguator;
-
-import java.util.ArrayList;
-
-import opennlp.tools.disambiguator.datareader.SensevalReader;
-import opennlp.tools.disambiguator.ims.IMSME;
-import opennlp.tools.disambiguator.ims.IMSParameters;
-
-import org.junit.Test;
-
-public class IMSEvaluatorTest {
-
- static SensevalReader seReader = new SensevalReader();
-
- @Test
- public static void main(String[] args) {
-
-
- WSDHelper.print("Evaluation Started");
-
- // TODO write unit test
- String modelsDir = "src\\test\\resources\\models\\";
- WSDHelper.loadTokenizer(modelsDir + "en-token.bin");
- WSDHelper.loadLemmatizer(modelsDir + "en-lemmatizer.dict");
- WSDHelper.loadTagger(modelsDir + "en-pos-maxent.bin");
-
- IMSParameters imsParams = new IMSParameters("");
- IMSME ims = new IMSME(imsParams);
-
- ArrayList<String> words = seReader.getSensevalWords();
-
- for (String word : words) {
- WSDEvaluator evaluator = new WSDEvaluator(ims);
-
- // don't take verbs because they are not from WordNet
- if (!word.split("\\.")[1].equals("v")) {
-
- ArrayList<WSDSample> instances = seReader.getSensevalData(word);
- if (instances != null) {
- WSDHelper.print("------------------" + word + "------------------");
- for (WSDSample instance : instances) {
- if (instance.getSenseIDs() != null
- && !instance.getSenseIDs()[0].equals("null")) {
- evaluator.evaluateSample(instance);
- }
- }
- WSDHelper.print(evaluator.toString());
- } else {
- WSDHelper.print("null instances");
- }
- }
-
- }
-
- }
-}
diff --git a/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/IMSMETester.java b/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/IMSMETester.java
deleted file mode 100644
index 881de6a..0000000
--- a/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/IMSMETester.java
+++ /dev/null
@@ -1,192 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package opennlp.tools.disambiguator;
-
-import static org.junit.Assert.*;
-
-import java.io.File;
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.List;
-
-import org.junit.BeforeClass;
-import org.junit.Test;
-
-import opennlp.tools.disambiguator.datareader.SemcorReaderExtended;
-import opennlp.tools.disambiguator.ims.IMSFactory;
-import opennlp.tools.disambiguator.ims.IMSME;
-import opennlp.tools.disambiguator.ims.IMSModel;
-import opennlp.tools.disambiguator.ims.IMSParameters;
-import opennlp.tools.util.ObjectStream;
-import opennlp.tools.util.Span;
-import opennlp.tools.util.TrainingParameters;
-
-/**
- * This is the test class for {@link IMSME}.
- *
- * The scope of this test is to make sure that the IMS disambiguator code can be
- * executed. This test can not detect mistakes which lead to incorrect feature
- * generation or other mistakes which decrease the disambiguation performance of
- * the disambiguator.
- *
- * In this test the {@link IMSME} is trained with Semcor and then the computed
- * model is used to predict sentences from the training sentences.
- */
-public class IMSMETester {
- // TODO write more tests
- // TODO modify when we fix the parameter model
-
- static String modelsDir = "src\\test\\resources\\models\\";
- static String trainingDataDirectory = "src\\test\\resources\\supervised\\models\\";
-
- static IMSParameters IMSParams;
- static IMSME ims;
- static IMSFactory IMSFactory;
- static IMSModel model;
-
- static String test = "please.v";
- static File outFile;
-
- static String test1 = "We need to discuss an important topic, please write to me soon.";
- static String test2 = "The component was highly radioactive to the point that"
- + " it has been activated the second it touched water";
- static String test3 = "The summer is almost over and I did not go to the beach even once";
-
- static String[] sentence1;
- static String[] sentence2;
- static String[] sentence3;
-
- static String[] tags1;
- static String[] tags2;
- static String[] tags3;
-
- static String[] lemmas1;
- static String[] lemmas2;
- static String[] lemmas3;
-
- /*
- * Setup the testing variables
- */
- @BeforeClass
- public static void setUpAndTraining() {
- WSDHelper.loadTokenizer(modelsDir + "en-token.bin");
- WSDHelper.loadLemmatizer(modelsDir + "en-lemmatizer.dict");
- WSDHelper.loadTagger(modelsDir + "en-pos-maxent.bin");
-
- sentence1 = WSDHelper.getTokenizer().tokenize(test1);
- sentence2 = WSDHelper.getTokenizer().tokenize(test2);
- sentence3 = WSDHelper.getTokenizer().tokenize(test3);
-
- tags1 = WSDHelper.getTagger().tag(sentence1);
- tags2 = WSDHelper.getTagger().tag(sentence2);
- tags3 = WSDHelper.getTagger().tag(sentence3);
-
- List<String> tempLemmas1 = new ArrayList<String>();
- for (int i = 0; i < sentence1.length; i++) {
- tempLemmas1
- .add(WSDHelper.getLemmatizer().lemmatize(sentence1[i], tags1[i]));
- }
- lemmas1 = tempLemmas1.toArray(new String[tempLemmas1.size()]);
-
- List<String> tempLemmas2 = new ArrayList<String>();
- for (int i = 0; i < sentence2.length; i++) {
- tempLemmas2
- .add(WSDHelper.getLemmatizer().lemmatize(sentence2[i], tags2[i]));
- }
- lemmas2 = tempLemmas2.toArray(new String[tempLemmas2.size()]);
-
- List<String> tempLemmas3 = new ArrayList<String>();
- for (int i = 0; i < sentence3.length; i++) {
- tempLemmas3
- .add(WSDHelper.getLemmatizer().lemmatize(sentence3[i], tags3[i]));
- }
- lemmas3 = tempLemmas3.toArray(new String[tempLemmas3.size()]);
-
- IMSParams = new IMSParameters("");
- IMSParams.setTrainingDataDirectory(trainingDataDirectory);
- IMSFactory = new IMSFactory();
- TrainingParameters trainingParams = new TrainingParameters();
- SemcorReaderExtended sr = new SemcorReaderExtended();
- ObjectStream<WSDSample> sampleStream = sr.getSemcorDataStream(test);
-
- IMSModel writeModel = null;
- /*
- * Tests training the disambiguator We test both writing and reading a model
- * file trained by semcor
- */
-
- try {
- writeModel = IMSME.train("en", sampleStream, trainingParams, IMSParams,
- IMSFactory);
- assertNotNull("Checking the model to be written", writeModel);
- writeModel.writeModel(IMSParams.getTrainingDataDirectory() + test);
- outFile = new File(
- IMSParams.getTrainingDataDirectory() + test + ".ims.model");
- model = new IMSModel(outFile);
- assertNotNull("Checking the read model", model);
- ims = new IMSME(model, IMSParams);
- assertNotNull("Checking the disambiguator", ims);
- } catch (IOException e1) {
- e1.printStackTrace();
- fail("Exception in training");
- }
- }
-
- /*
- * Tests disambiguating only one word : The ambiguous word "please"
- */
- @Test
- public void testOneWordDisambiguation() {
- String sense = ims.disambiguate(sentence1, tags1, lemmas1, 8);
- assertEquals("Check 'please' sense ID", "WORDNET please%2:37:00::", sense);
- }
-
- /*
- * Tests disambiguating a word Span In this case we test a mix of monosemous
- * and polysemous words as well as words that do not need disambiguation such
- * as determiners
- */
- @Test
- public void testWordSpanDisambiguation() {
- Span span = new Span(3, 7);
- List<String> senses = ims.disambiguate(sentence2, tags2, lemmas2, span);
-
- assertEquals("Check number of returned words", 5, senses.size());
- assertEquals("Check 'highly' sense ID", "WORDNET highly%4:02:01::",
- senses.get(0));
- assertEquals("Check 'radioactive' sense ID",
- "WORDNET radioactive%3:00:00::", senses.get(1));
- assertEquals("Check preposition", "WSDHELPER to", senses.get(2));
- assertEquals("Check determiner", "WSDHELPER determiner", senses.get(3));
- }
-
- /*
- * Tests disambiguating all the words
- */
- @Test
- public void testAllWordsDisambiguation() {
- List<String> senses = ims.disambiguate(sentence3, tags3, lemmas3);
-
- assertEquals("Check number of returned words", 15, senses.size());
- assertEquals("Check preposition", "WSDHELPER personal pronoun",
- senses.get(6));
- }
-
-}
diff --git a/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/OSCCEvaluatorTest.java b/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/OSCCEvaluatorTest.java
deleted file mode 100644
index f46a58b..0000000
--- a/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/OSCCEvaluatorTest.java
+++ /dev/null
@@ -1,92 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package opennlp.tools.disambiguator;
-
-import java.io.IOException;
-import java.util.ArrayList;
-
-import opennlp.tools.disambiguator.datareader.SensevalReader;
-import opennlp.tools.disambiguator.oscc.OSCCFactory;
-import opennlp.tools.disambiguator.oscc.OSCCME;
-import opennlp.tools.disambiguator.oscc.OSCCModel;
-import opennlp.tools.disambiguator.oscc.OSCCParameters;
-import opennlp.tools.util.TrainingParameters;
-
-import org.junit.Test;
-
-public class OSCCEvaluatorTest {
-
- static SensevalReader seReader = new SensevalReader();
-
- @Test
- public static void main(String[] args) {
-
-
- WSDHelper.print("Evaluation Started");
-
- // TODO write unit test
- String modelsDir = "src\\test\\resources\\models\\";
- String trainingDataDirectory = "src\\test\\resources\\supervised\\models\\";
- WSDHelper.loadTokenizer(modelsDir + "en-token.bin");
- WSDHelper.loadLemmatizer(modelsDir + "en-lemmatizer.dict");
- WSDHelper.loadTagger(modelsDir + "en-pos-maxent.bin");
-
- OSCCParameters OSCCParams = new OSCCParameters("");
- OSCCParams.setTrainingDataDirectory(trainingDataDirectory);
- OSCCME oscc = new OSCCME(OSCCParams);
- OSCCModel model = null;
- ArrayList<String> words = seReader.getSensevalWords();
-
- for (String word : words) {
- // don't take verbs because they are not from WordNet
- if (!word.split("\\.")[1].equals("v")) {
- try {
- model = OSCCME.train("en", seReader.getSensevalDataStream(word), new TrainingParameters(), OSCCParams,
- new OSCCFactory());
- model.writeModel(OSCCParams.getTrainingDataDirectory() + word);
- oscc = new OSCCME(model, OSCCParams);
-
- } catch (IOException e) {
- e.printStackTrace();
- WSDHelper.print("skipped sample");
- }
-
- WSDEvaluator evaluator = new WSDEvaluator(oscc);
- ArrayList<WSDSample> instances = seReader.getSensevalData(word);
- if (instances != null) {
- WSDHelper.print("------------------" + word + "------------------");
- for (WSDSample instance : instances) {
- if (instance.getSenseIDs() != null
- && !instance.getSenseIDs()[0].equals("null")) {
- evaluator.evaluateSample(instance);
- }else{
- WSDHelper.print("skipped sample");
- }
- }
- WSDHelper.print(evaluator.toString());
- } else {
- WSDHelper.print("null instances");
- }
- }
-
- }
-
- }
-}
diff --git a/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/OSCCMETester.java b/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/OSCCMETester.java
deleted file mode 100644
index 63fb07d..0000000
--- a/opennlp-wsd/src/test/java/opennlp/tools/disambiguator/OSCCMETester.java
+++ /dev/null
@@ -1,195 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package opennlp.tools.disambiguator;
-
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertNotNull;
-import static org.junit.Assert.fail;
-
-import java.io.File;
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.List;
-
-import org.junit.BeforeClass;
-import org.junit.Test;
-
-import opennlp.tools.disambiguator.datareader.SemcorReaderExtended;
-import opennlp.tools.disambiguator.oscc.OSCCFactory;
-import opennlp.tools.disambiguator.oscc.OSCCME;
-import opennlp.tools.disambiguator.oscc.OSCCModel;
-import opennlp.tools.disambiguator.oscc.OSCCParameters;
-import opennlp.tools.util.ObjectStream;
-import opennlp.tools.util.Span;
-import opennlp.tools.util.TrainingParameters;
-
-/**
- * This is the test class for {@link OSCCME}.
- *
- * The scope of this test is to make sure that the OSCC disambiguator code can
- * be executed. This test can not detect mistakes which lead to incorrect
- * feature generation or other mistakes which decrease the disambiguation
- * performance of the disambiguator.
- *
- * In this test the {@link OSCCME} is trained with Semcor and then the computed
- * model is used to predict sentences from the training sentences.
- */
-public class OSCCMETester {
- // TODO write more tests
- // TODO modify when we fix the parameter model
-
- static String modelsDir = "src\\test\\resources\\models\\";
- static String trainingDataDirectory = "src\\test\\resources\\supervised\\models\\";
-
- static OSCCParameters OSCCParams;
- static OSCCME oscc;
- static OSCCFactory osccFactory;
- static OSCCModel model;
-
- static String test = "please.v";
- static File outFile;
-
- static String test1 = "We need to discuss an important topic, please write to me soon.";
- static String test2 = "The component was highly radioactive to the point that"
- + " it has been activated the second it touched water";
- static String test3 = "The summer is almost over and I did not go to the beach even once";
-
- static String[] sentence1;
- static String[] sentence2;
- static String[] sentence3;
-
- static String[] tags1;
- static String[] tags2;
- static String[] tags3;
-
- static String[] lemmas1;
- static String[] lemmas2;
- static String[] lemmas3;
-
- /*
- * Setup the testing variables
- */
- @BeforeClass
- public static void setUpAndTraining() {
- WSDHelper.loadTokenizer(modelsDir + "en-token.bin");
- WSDHelper.loadLemmatizer(modelsDir + "en-lemmatizer.dict");
- WSDHelper.loadTagger(modelsDir + "en-pos-maxent.bin");
-
- sentence1 = WSDHelper.getTokenizer().tokenize(test1);
- sentence2 = WSDHelper.getTokenizer().tokenize(test2);
- sentence3 = WSDHelper.getTokenizer().tokenize(test3);
-
- tags1 = WSDHelper.getTagger().tag(sentence1);
- tags2 = WSDHelper.getTagger().tag(sentence2);
- tags3 = WSDHelper.getTagger().tag(sentence3);
-
- List<String> tempLemmas1 = new ArrayList<String>();
- for (int i = 0; i < sentence1.length; i++) {
- tempLemmas1
- .add(WSDHelper.getLemmatizer().lemmatize(sentence1[i], tags1[i]));
- }
- lemmas1 = tempLemmas1.toArray(new String[tempLemmas1.size()]);
-
- List<String> tempLemmas2 = new ArrayList<String>();
- for (int i = 0; i < sentence2.length; i++) {
- tempLemmas2
- .add(WSDHelper.getLemmatizer().lemmatize(sentence2[i], tags2[i]));
- }
- lemmas2 = tempLemmas2.toArray(new String[tempLemmas2.size()]);
-
- List<String> tempLemmas3 = new ArrayList<String>();
- for (int i = 0; i < sentence3.length; i++) {
- tempLemmas3
- .add(WSDHelper.getLemmatizer().lemmatize(sentence3[i], tags3[i]));
- }
- lemmas3 = tempLemmas3.toArray(new String[tempLemmas3.size()]);
-
- OSCCParams = new OSCCParameters("");
- OSCCParams.setTrainingDataDirectory(trainingDataDirectory);
- osccFactory = new OSCCFactory();
- TrainingParameters trainingParams = new TrainingParameters();
- SemcorReaderExtended sr = new SemcorReaderExtended();
- ObjectStream<WSDSample> sampleStream = sr.getSemcorDataStream(test);
-
- OSCCModel writeModel = null;
- /*
- * Tests training the disambiguator We test both writing and reading a model
- * file trained by semcor
- */
-
- try {
- writeModel = OSCCME.train("en", sampleStream, trainingParams, OSCCParams,
- osccFactory);
- assertNotNull("Checking the model to be written", writeModel);
- writeModel.writeModel(OSCCParams.getTrainingDataDirectory() + test);
- outFile = new File(
- OSCCParams.getTrainingDataDirectory() + test + ".oscc.model");
- model = new OSCCModel(outFile);
- assertNotNull("Checking the read model", model);
- oscc = new OSCCME(model, OSCCParams);
- assertNotNull("Checking the disambiguator", oscc);
- } catch (IOException e1) {
- e1.printStackTrace();
- fail("Exception in training");
- }
- }
-
- /*
- * Tests disambiguating only one word : The ambiguous word "please"
- */
- @Test
- public void testOneWordDisambiguation() {
- String sense = oscc.disambiguate(sentence1, tags1, lemmas1, 8);
- assertEquals("Check 'please' sense ID", "WORDNET please%2:37:00::", sense);
- }
-
- /*
- * Tests disambiguating a word Span In this case we test a mix of monosemous
- * and polysemous words as well as words that do not need disambiguation such
- * as determiners
- */
- @Test
- public void testWordSpanDisambiguation() {
- Span span = new Span(3, 7);
- List<String> senses = oscc.disambiguate(sentence2, tags2, lemmas2, span);
-
-
- assertEquals("Check number of returned words", 5, senses.size());
- assertEquals("Check 'highly' sense ID", "WORDNET highly%4:02:01::",
- senses.get(0));
- assertEquals("Check 'radioactive' sense ID",
- "WORDNET radioactive%3:00:00::", senses.get(1));
- assertEquals("Check preposition", "WSDHELPER to", senses.get(2));
- assertEquals("Check determiner", "WSDHELPER determiner", senses.get(3));
- }
-
- /*
- * Tests disambiguating all the words
- */
- @Test
- public void testAllWordsDisambiguation() {
- List<String> senses = oscc.disambiguate(sentence3, tags3, lemmas3);
-
- assertEquals("Check number of returned words", 15, senses.size());
- assertEquals("Check preposition", "WSDHELPER personal pronoun",
- senses.get(6));
- }
-
-}
\ No newline at end of file