/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package opennlp.tools.similarity.apps.taxo_builder;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

/**
 * This class makes it possible to use legacy Prolog files as the basis for
 * the taxonomy learner. It cleans the Prolog clauses and turns them into
 * Strings that can be used by the taxonomy extension process.
 *
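 * <p>
 * A minimal usage sketch (it mirrors {@link #main(String[])}; the sample file
 * path is only illustrative):
 * <pre>
 *   AriAdapter ad = new AriAdapter();
 *   ad.getChainsFromARIfile("src/test/resources/taxonomies/irs_dom.ari");
 *   // ad.lemma_AssocWords now maps each lemma to its associated word chains
 * </pre>
 *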
*/
public class AriAdapter {
  // Maps a lemma (the head functor of a clause, e.g. "income_tax") to the
  // lists of words associated with it. A typical input clause looks like:
  // income_taks(state,company(cafeteria,_)):-do(71100).
  final Map<String, List<List<String>>> lemma_AssocWords = new HashMap<>();
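  // For the clause above, the cleaning pipeline in getChainsFromARIfile()
  // yields the tokens [income_tax, state, company, cafeteria, 71100]; the
  // numeric code is filtered out, "income_tax" becomes the map key, and the
  // remaining words form one chain: {income_tax=[[state, company, cafeteria]]}.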
  public void getChainsFromARIfile(String fileName) {
    // try-with-resources ensures the reader is closed even on failure
    try (BufferedReader br = new BufferedReader(new InputStreamReader(
        new FileInputStream(fileName)))) {
      String line;
      while ((line = br.readLine()) != null) {
        // skip short lines, Prolog comments ('%') and directives (':')
        if (line.length() < 10 || line.startsWith("%") || line.startsWith(":"))
          continue;
        // normalize Prolog punctuation and operators to '&' separators,
        // then collapse runs of '&' and turn them into spaces
        String chain0 = line.replace("_,", "&").replace("_)", "&")
            .replace(":-do(", "&").replace(":-var", "&").replace("taks", "tax")
            .replace(":- do(", "&").replace("X=", "&").replace(":-", "&")
            .replace("[X|_]", "&").replace("nonvar", "&").replace("var", "&")
            .replace('(', '&').replace(')', '&').replace(',', '&')
            .replace('.', '&').replace("&&&", "&").replace("&&", "&")
            .replace("&", " ");
        String[] chains = chain0.split(" ");
        // keep words longer than two characters that contain none of the
        // digits 0-5 (this drops numeric codes such as the 71100 above)
        List<String> chainList = new ArrayList<>();
        for (String word : chains) {
          if (word != null && word.length() > 2 && word.indexOf('0') < 0
              && word.indexOf('1') < 0 && word.indexOf('2') < 0
              && word.indexOf('3') < 0 && word.indexOf('4') < 0
              && word.indexOf('5') < 0)
            chainList.add(word);
        }
        if (chainList.isEmpty() || chainList.get(0).length() < 3)
          continue;
        // the first surviving word is the lemma; the remaining words form
        // one chain of associated words for that lemma
        String entry = chainList.get(0);
        chainList.remove(entry);
        List<List<String>> res = lemma_AssocWords.get(entry);
        if (res == null) {
          res = new ArrayList<>();
          lemma_AssocWords.put(entry, res);
        }
        res.add(chainList);
      }
    } catch (IOException e) {
      e.printStackTrace();
    }
  }

public static void main(String[] args) {
AriAdapter ad = new AriAdapter();
ad.getChainsFromARIfile("src/test/resources/taxonomies/irs_dom.ari");
System.out.println(ad.lemma_AssocWords);
}
}