OPENNLP-791 Reads the mentioned clustering files, could also switch to objectstream. Thanks to Anthony Beylerian for providing a patch.
diff --git a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/contextclustering/ContextClusterer.java b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/contextclustering/ContextClusterer.java
new file mode 100644
index 0000000..afc5084
--- /dev/null
+++ b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/contextclustering/ContextClusterer.java
@@ -0,0 +1,70 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package opennlp.tools.disambiguator.contextclustering;
+
+import java.security.InvalidParameterException;
+
+import opennlp.tools.disambiguator.WSDParameters;
+import opennlp.tools.disambiguator.WSDisambiguator;
+import opennlp.tools.util.Span;
+
+/**
+ * Implementation of the <b>Context Clustering</b> approach, which uses n-gram
+ * based clusters. Based on <a href="http://nlp.cs.rpi.edu/paper/wsd.pdf">wsd.pdf</a>.
+ */
+public class ContextClusterer implements WSDisambiguator {
+
+  /** Active configuration; {@code null} until {@link #setParams} is called. */
+  protected ContextClustererParameters params;
+
+  /** Returns the parameters last stored by {@link #setParams}. */
+  @Override
+  public WSDParameters getParams() {
+    return params;
+  }
+
+  /**
+   * Stores {@code params}; {@code null} selects a fresh default instance.
+   * @throws InvalidParameterException if {@code params} is not a
+   *           {@link ContextClustererParameters} or fails {@code isValid()}
+   */
+  @Override
+  public void setParams(WSDParameters params) throws InvalidParameterException {
+    if (params == null) {
+      this.params = new ContextClustererParameters();
+    } else if (params instanceof ContextClustererParameters && params.isValid()) {
+      this.params = (ContextClustererParameters) params;
+    } else {
+      throw new InvalidParameterException("wrong params");
+    }
+  }
+
+  /** Not implemented yet: always returns {@code null}. */
+  @Override
+  public String[] disambiguate(String[] tokenizedContext, int ambiguousTokenIndex) {
+    return null; // TODO implement
+  }
+
+  /** Not implemented yet: always returns {@code null}. */
+  @Override
+  public String[][] disambiguate(String[] tokenizedContext, Span[] ambiguousTokenIndexSpans) {
+    return null; // TODO implement
+  }
+}
diff --git a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/contextclustering/ContextClustererParameters.java b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/contextclustering/ContextClustererParameters.java
new file mode 100644
index 0000000..bb69fd7
--- /dev/null
+++ b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/contextclustering/ContextClustererParameters.java
@@ -0,0 +1,41 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package opennlp.tools.disambiguator.contextclustering;
+
+import opennlp.tools.disambiguator.WSDParameters;
+
+public class ContextClustererParameters extends WSDParameters {
+  protected int ngram; // presumably the n-gram length (TODO confirm)
+
+  /** Returns the current {@code ngram} value. */
+  public int getNgram() {
+    return this.ngram;
+  }
+
+  /** Stores {@code ngram} unchecked; {@link #isValid()} does the range test. */
+  public void setNgram(int ngram) {
+    this.ngram = ngram;
+  }
+
+  @Override
+  public boolean isValid() { // valid only when ngram is strictly positive
+    return 0 < this.ngram;
+  }
+}
diff --git a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/datareader/ClusterMembership.java b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/datareader/ClusterMembership.java
new file mode 100644
index 0000000..2b3fbf7
--- /dev/null
+++ b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/datareader/ClusterMembership.java
@@ -0,0 +1,37 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package opennlp.tools.disambiguator.datareader;
+
+public class ClusterMembership {
+  public int clusterID;
+  public double centroidSimilarity;
+  public String phrase; // not assigned by either constructor
+  public String[] phraseWords; // not assigned by either constructor
+
+  public ClusterMembership() { // defaults: cluster id 0, similarity 0.0
+    this(0, 0.0);
+  }
+
+  /** Creates a membership with the given cluster id and similarity. */
+  public ClusterMembership(int clusterID, double centroidSimilarity) {
+    this.clusterID = clusterID;
+    this.centroidSimilarity = centroidSimilarity;
+  }
+}
diff --git a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/datareader/ClustersReader.java b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/datareader/ClustersReader.java
new file mode 100644
index 0000000..e8b384e
--- /dev/null
+++ b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/datareader/ClustersReader.java
@@ -0,0 +1,84 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package opennlp.tools.disambiguator.datareader;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileReader;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
+
+/**
+ * Reads phrase-cluster files: each line is a phrase followed by tab-separated
+ * (cluster id, centroid similarity) pairs. Results go into one static map.
+ */
+public class ClustersReader {
+  /** Default cluster directory, spelled with the platform's separator. */
+  public static String path = "src" + File.separator + "test" + File.separator
+      + "resources" + File.separator + "phraseclusters" + File.separator;
+  // Keyed by phrase; a phrase read again replaces its earlier memberships.
+  private static HashMap<String, ArrayList<ClusterMembership>> map = new HashMap<String, ArrayList<ClusterMembership>>();
+
+  /**
+   * Parses one cluster file into the shared map. I/O failures are printed to
+   * stderr and otherwise ignored; malformed numbers raise NumberFormatException.
+   * @param url file system path of the file (not a URL, despite the name)
+   */
+  public void readFile(String url) {
+    File file = new File(url);
+    try (BufferedReader clusterList = new BufferedReader(new FileReader(file))) {
+      String line;
+      while ((line = clusterList.readLine()) != null) {
+        String[] parts = line.split("\\t");
+        String phraseKey = parts[0];
+        String[] phraseWords = phraseKey.split("\\s");
+        ArrayList<ClusterMembership> memberships = new ArrayList<ClusterMembership>();
+        // Bound on i + 1: a dangling id with no similarity is skipped, not an AIOOBE.
+        for (int i = 1; i + 1 < parts.length; i += 2) {
+          ClusterMembership membership = new ClusterMembership(
+              Integer.parseInt(parts[i]), Double.parseDouble(parts[i + 1]));
+          membership.phrase = phraseKey;
+          membership.phraseWords = phraseWords;
+          memberships.add(membership);
+        }
+        map.put(phraseKey, memberships);
+      }
+    } catch (IOException e) {
+      e.printStackTrace();
+    }
+  }
+
+  /**
+   * Reads every entry of {@link #path} via {@link #readFile(String)}.
+   * @param word currently unused
+   * @return {@code false} if {@link #path} cannot be listed as a directory
+   */
+  public boolean getNgramClusters(String word) {
+    File[] files = new File(path).listFiles();
+    if (files == null) { // not a directory, or the listing failed
+      return false;
+    }
+    for (File file : files) {
+      readFile(file.getAbsolutePath());
+    }
+    return true;
+  }
+}