framework/common/src/org/ofbiz/common/KeywordSearchUtil.java - ofbiz - Git at Google

 /*******************************************************************************
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
  * regarding copyright ownership.  The ASF licenses this file
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
  *
  * http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing,
  * software distributed under the License is distributed on an
  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  * KIND, either express or implied.  See the License for the
  * specific language governing permissions and limitations
  * under the License.
  *******************************************************************************/
 package org.ofbiz.common;

 import java.util.HashSet;
 import java.util.List;
 import java.util.Map;
 import java.util.Set;
 import java.util.StringTokenizer;
 import java.util.TreeSet;

 import javolution.util.FastMap;

 import org.ofbiz.base.util.Debug;
 import org.ofbiz.base.util.UtilMisc;
 import org.ofbiz.base.util.UtilProperties;
 import org.ofbiz.base.util.UtilValidate;
 import org.ofbiz.entity.Delegator;
 import org.ofbiz.entity.GenericEntityException;
 import org.ofbiz.entity.GenericValue;

 /**
  * A few utility methods related to Keyword Search.
  */
 public class KeywordSearchUtil {

     public static final String module = KeywordSearchUtil.class.getName();

     /** Relationship enum IDs whose alternate keywords are added by {@link #expandKeywordForSearch}. */
     public static Set<String> thesaurusRelsToInclude = new HashSet<String>();
     /** Relationship enum IDs that additionally make {@link #expandKeywordForSearch} return true. */
     public static Set<String> thesaurusRelsForReplace = new HashSet<String>();

     /** Characters that must not act as separators when the text being tokenized is a search expression. */
     private static final String SEARCH_WILDCARDS = "%_*?";

     static {
         thesaurusRelsToInclude.add("KWTR_UF");
         thesaurusRelsToInclude.add("KWTR_USE");
         thesaurusRelsToInclude.add("KWTR_CS");
         thesaurusRelsToInclude.add("KWTR_NT");
         thesaurusRelsToInclude.add("KWTR_BT");
         thesaurusRelsToInclude.add("KWTR_RT");

         thesaurusRelsForReplace.add("KWTR_USE");
         thesaurusRelsForReplace.add("KWTR_CS");
     }

     /**
      * Returns the characters used to split text into keyword tokens.
      * Looks up "index.keyword.separators" in the "keywordsearch" properties resource and
      * passes a built-in separator list to that lookup as the default value.
      *
      * @return the separator characters
      */
     public static String getSeparators() {
         return UtilProperties.getPropertyValue("keywordsearch", "index.keyword.separators", ";: ,.!?\t\"\'\r\n\\/()[]{}*%<>-+_");
     }

     /**
      * Returns the "stop.word.bag.or" value of the "keywordsearch" properties resource.
      *
      * @return the stop word bag consulted for OR searches
      */
     public static String getStopWordBagOr() {
         return UtilProperties.getPropertyValue("keywordsearch", "stop.word.bag.or");
     }

     /**
      * Returns the "stop.word.bag.and" value of the "keywordsearch" properties resource.
      *
      * @return the stop word bag consulted for AND searches
      */
     public static String getStopWordBagAnd() {
         return UtilProperties.getPropertyValue("keywordsearch", "stop.word.bag.and");
     }

     /**
      * Tells whether stems should be stripped from tokens.
      *
      * @return true only when the "remove.stems" property of "keywordsearch" is exactly "true"
      */
     public static boolean getRemoveStems() {
         return "true".equals(UtilProperties.getPropertyValue("keywordsearch", "remove.stems"));
     }

     /**
      * Builds the set of stems from the "stem.bag" property of "keywordsearch".
      * The property value is split on ':' and ' '.
      *
      * @return a sorted set of stems; empty when UtilValidate reports the property value as empty
      */
     public static Set<String> getStemSet() {
         String stemBag = UtilProperties.getPropertyValue("keywordsearch", "stem.bag");
         Set<String> stemSet = new TreeSet<String>();
         if (UtilValidate.isNotEmpty(stemBag)) {
             StringTokenizer tokenizer = new StringTokenizer(stemBag, ": ");
             while (tokenizer.hasMoreTokens()) {
                 stemSet.add(tokenizer.nextToken());
             }
         }
         return stemSet;
     }

     /**
      * Tokenizes {@code str} and accumulates the cleaned-up tokens into {@code keywords},
      * taking separators, stop word bags and stem settings from the getters of this class.
      *
      * @param str the text to process
      * @param keywords output map from token to occurrence count; updated in place
      * @param forSearch true when the text is a search expression rather than text to index
      * @param anyPrefix when searching, prepend '%' to each token
      * @param anySuffix when searching, append '%' to each token
      * @param isAnd when searching, selects the AND stop word bag instead of the OR one
      */
     public static void processForKeywords(String str, Map<String, Long> keywords, boolean forSearch, boolean anyPrefix, boolean anySuffix, boolean isAnd) {
         processForKeywords(str, keywords, getSeparators(), getStopWordBagAnd(), getStopWordBagOr(), getRemoveStems(), getStemSet(), forSearch, anyPrefix, anySuffix, isAnd);
     }

     /**
      * Indexing variant of {@link #processForKeywords(String, Map, String, String, String, boolean, Set, boolean, boolean, boolean, boolean)}:
      * delegates with {@code forSearch}, {@code anyPrefix}, {@code anySuffix} and {@code isAnd} all false.
      */
     public static void processKeywordsForIndex(String str, Map<String, Long> keywords, String separators, String stopWordBagAnd, String stopWordBagOr, boolean removeStems, Set<String> stemSet) {
         processForKeywords(str, keywords, separators, stopWordBagAnd, stopWordBagOr, removeStems, stemSet, false, false, false, false);
     }

     /**
      * Tokenizes {@code str} with {@link #makeKeywordSet} and then filters and counts the
      * tokens with {@link #fixupKeywordSet}, using the explicitly supplied settings.
      */
     public static void processForKeywords(String str, Map<String, Long> keywords, String separators, String stopWordBagAnd, String stopWordBagOr, boolean removeStems, Set<String> stemSet, boolean forSearch, boolean anyPrefix, boolean anySuffix, boolean isAnd) {
         Set<String> keywordSet = makeKeywordSet(str, separators, forSearch);
         fixupKeywordSet(keywordSet, keywords, stopWordBagAnd, stopWordBagOr, removeStems, stemSet, forSearch, anyPrefix, anySuffix, isAnd);
     }

     /**
      * Filters the tokens of {@code keywordSet} and adds the survivors to {@code keywords}.
      * For each token, in this order: drop it if it is a stop word, strip stems, drop it if it
      * became empty or is a single letter, decorate it with '%' for searches, then increment
      * its count in {@code keywords}.
      *
      * @param keywordSet tokens to process; null is a no-op
      * @param keywords output map from token to occurrence count; updated in place
      * @param stopWordBagAnd stop words in the form ":word1:word2:"; null is treated as empty
      * @param stopWordBagOr stop words in the form ":word1:word2:"; null is treated as empty
      * @param removeStems whether to strip the suffixes listed in {@code stemSet}
      * @param stemSet suffixes to strip; null is treated as empty
      * @param forSearch true for search expressions; false for indexing
      * @param anyPrefix when searching, prepend '%' to each token
      * @param anySuffix when searching, append '%' to each token
      * @param isAnd when searching, consult the AND bag instead of the OR bag
      */
     public static void fixupKeywordSet(Set<String> keywordSet, Map<String, Long> keywords, String stopWordBagAnd, String stopWordBagOr, boolean removeStems, Set<String> stemSet, boolean forSearch, boolean anyPrefix, boolean anySuffix, boolean isAnd) {
         if (keywordSet == null) {
             return;
         }

         // a missing stop word bag simply contains no stop words
         String andBag = (stopWordBagAnd == null) ? "" : stopWordBagAnd;
         String orBag = (stopWordBagOr == null) ? "" : stopWordBagOr;

         for (String token: keywordSet) {

             // when cleaning up the tokens the ordering is important: check stop words, remove stems, then get rid of 1 character tokens (1 digit okay)

             // check stop words; bags hold words wrapped in colons so a whole-word match is a substring match
             String colonToken = ":" + token + ":";
             boolean inAndBag = andBag.indexOf(colonToken) >= 0;
             boolean inOrBag = orBag.indexOf(colonToken) >= 0;
             if (forSearch) {
                 // a search only consults the bag matching its operator
                 if (isAnd ? inAndBag : inOrBag) {
                     continue;
                 }
             } else if (inOrBag && inAndBag) {
                 // when indexing, only words that are stop words for both operators are skipped
                 continue;
             }

             // remove stems; each matching stem is stripped in set iteration order
             if (removeStems && stemSet != null) {
                 for (String stem: stemSet) {
                     if (token.endsWith(stem)) {
                         token = token.substring(0, token.length() - stem.length());
                     }
                 }
             }

             // get rid of all length 0 tokens now
             if (token.length() == 0) {
                 continue;
             }

             // get rid of all length 1 character only tokens, pretty much useless
             if (token.length() == 1 && Character.isLetter(token.charAt(0))) {
                 continue;
             }

             if (forSearch) {
                 StringBuilder strSb = new StringBuilder();
                 if (anyPrefix) strSb.append('%');
                 strSb.append(token);
                 if (anySuffix) strSb.append('%');
                 // replace all %% with %
                 int dblPercIdx = -1;
                 while ((dblPercIdx = strSb.indexOf("%%")) >= 0) {
                     strSb.replace(dblPercIdx, dblPercIdx + 2, "%");
                 }
                 token = strSb.toString();
             }

             // group by word, add up weight
             Long curWeight = keywords.get(token);
             if (curWeight == null) {
                 keywords.put(token, Long.valueOf(1));
             } else {
                 keywords.put(token, Long.valueOf(curWeight.longValue() + 1));
             }
         }
     }

     /**
      * Splits {@code str} into a sorted set of lower-cased tokens.
      * For searches the characters {@code % _ * ?} are not treated as separators, and in the
      * resulting tokens '*' is replaced by '%' and '?' by '_'.
      *
      * @param str the text to tokenize; null or empty yields an empty set
      * @param separators delimiter characters; null means {@link #getSeparators()}
      * @param forSearch true when {@code str} is a search expression
      * @return a new, modifiable sorted set of tokens, never null
      */
     public static Set<String> makeKeywordSet(String str, String separators, boolean forSearch) {
         if (separators == null) separators = getSeparators();

         Set<String> keywords = new TreeSet<String>();
         if (str == null || str.length() == 0) {
             return keywords;
         }

         // strip off weird characters
         str = str.replaceAll("\\\302\\\240|\\\240", " ");

         if (forSearch) {
             // remove %_*? from separators if is for a search
             separators = removeSearchWildcards(separators);
         }

         StringTokenizer tokener = new StringTokenizer(str, separators, false);
         while (tokener.hasMoreTokens()) {
             // make sure it is lower case before doing anything else
             String token = tokener.nextToken().toLowerCase();

             if (forSearch) {
                 // these characters will only be present if it is for a search, ie not for indexing
                 token = token.replace('*', '%');
                 token = token.replace('?', '_');
             }

             keywords.add(token);
         }
         return keywords;
     }

     /** Returns {@code separators} with every occurrence of the search wildcard characters removed. */
     private static String removeSearchWildcards(String separators) {
         StringBuilder kept = new StringBuilder(separators.length());
         for (int i = 0; i < separators.length(); i++) {
             char c = separators.charAt(i);
             if (SEARCH_WILDCARDS.indexOf(c) < 0) {
                 kept.append(c);
             }
         }
         return kept.toString();
     }

     /**
      * Runs {@link #fixupKeywordSet} in search mode over {@code keywordSet}, using the stop word
      * bags and stem set from the getters of this class.
      *
      * @return the key set of the map that collected the surviving tokens
      */
     public static Set<String> fixKeywordsForSearch(Set<String> keywordSet, boolean anyPrefix, boolean anySuffix, boolean removeStems, boolean isAnd) {
         Map<String, Long> keywords = FastMap.newInstance();
         fixupKeywordSet(keywordSet, keywords, getStopWordBagAnd(), getStopWordBagOr(), removeStems, getStemSet(), true, anyPrefix, anySuffix, isAnd);
         return keywords.keySet();
     }

     /**
      * Looks up "KeywordThesaurus" records for {@code enteredKeyword} and adds the tokens of each
      * record's "alternateKeyword" to {@code addToSet} when the record's "relationshipEnumId" is
      * in {@link #thesaurusRelsToInclude}. A lookup failure is logged and leaves the set unchanged.
      *
      * @param enteredKeyword the keyword to expand
      * @param addToSet set that receives the alternate keywords; updated in place
      * @param delegator used for the cached entity lookup
      * @return true when at least one included record has a relationship in {@link #thesaurusRelsForReplace}
      */
     public static boolean expandKeywordForSearch(String enteredKeyword, Set<String> addToSet, Delegator delegator) {
         boolean replaceEnteredKeyword = false;

         try {
             List<GenericValue> thesaurusList = delegator.findByAndCache("KeywordThesaurus", UtilMisc.toMap("enteredKeyword", enteredKeyword));
             for (GenericValue keywordThesaurus: thesaurusList) {
                 String relationshipEnumId = (String) keywordThesaurus.get("relationshipEnumId");
                 if (thesaurusRelsToInclude.contains(relationshipEnumId)) {
                     addToSet.addAll(makeKeywordSet(keywordThesaurus.getString("alternateKeyword"), null, true));
                     if (thesaurusRelsForReplace.contains(relationshipEnumId)) {
                         replaceEnteredKeyword = true;
                     }
                 }
             }
         } catch (GenericEntityException e) {
             Debug.logError(e, "Error expanding entered keyword", module);
         }

         Debug.logInfo("Expanded keyword [" + enteredKeyword + "], got set: " + addToSet, module);
         return replaceEnteredKeyword;
     }
 }
	/*******************************************************************************
	* Licensed to the Apache Software Foundation (ASF) under one
	* or more contributor license agreements. See the NOTICE file
	* distributed with this work for additional information
	* regarding copyright ownership. The ASF licenses this file
	* to you under the Apache License, Version 2.0 (the
	* "License"); you may not use this file except in compliance
	* with the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing,
	* software distributed under the License is distributed on an
	* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	* KIND, either express or implied. See the License for the
	* specific language governing permissions and limitations
	* under the License.
	*******************************************************************************/
	package org.ofbiz.common;

	import java.util.HashSet;
	import java.util.List;
	import java.util.Map;
	import java.util.Set;
	import java.util.StringTokenizer;
	import java.util.TreeSet;

	import javolution.util.FastMap;

	import org.ofbiz.base.util.Debug;
	import org.ofbiz.base.util.UtilMisc;
	import org.ofbiz.base.util.UtilProperties;
	import org.ofbiz.base.util.UtilValidate;
	import org.ofbiz.entity.Delegator;
	import org.ofbiz.entity.GenericEntityException;
	import org.ofbiz.entity.GenericValue;

	/**
	* A few utility methods related to Keyword Search.
	*/
	public class KeywordSearchUtil {

	public static final String module = KeywordSearchUtil.class.getName();

	public static Set<String> thesaurusRelsToInclude = new HashSet<String>();
	public static Set<String> thesaurusRelsForReplace = new HashSet<String>();

	static {
	thesaurusRelsToInclude.add("KWTR_UF");
	thesaurusRelsToInclude.add("KWTR_USE");
	thesaurusRelsToInclude.add("KWTR_CS");
	thesaurusRelsToInclude.add("KWTR_NT");
	thesaurusRelsToInclude.add("KWTR_BT");
	thesaurusRelsToInclude.add("KWTR_RT");

	thesaurusRelsForReplace.add("KWTR_USE");
	thesaurusRelsForReplace.add("KWTR_CS");
	}

	public static String getSeparators() {
	// String separators = ";: ,.!?\t\"\'\r\n\\/()[]{}*%<>-+_";
	String seps = UtilProperties.getPropertyValue("keywordsearch", "index.keyword.separators", ";: ,.!?\t\"\'\r\n\\/()[]{}*%<>-+_");
	return seps;
	}

	public static String getStopWordBagOr() {
	return UtilProperties.getPropertyValue("keywordsearch", "stop.word.bag.or");
	}
	public static String getStopWordBagAnd() {
	return UtilProperties.getPropertyValue("keywordsearch", "stop.word.bag.and");
	}

	public static boolean getRemoveStems() {
	String removeStemsStr = UtilProperties.getPropertyValue("keywordsearch", "remove.stems");
	return "true".equals(removeStemsStr);
	}
	public static Set<String> getStemSet() {
	String stemBag = UtilProperties.getPropertyValue("keywordsearch", "stem.bag");
	Set<String> stemSet = new TreeSet<String>();
	if (UtilValidate.isNotEmpty(stemBag)) {
	String curToken;
	StringTokenizer tokenizer = new StringTokenizer(stemBag, ": ");
	while (tokenizer.hasMoreTokens()) {
	curToken = tokenizer.nextToken();
	stemSet.add(curToken);
	}
	}
	return stemSet;
	}

	public static void processForKeywords(String str, Map<String, Long> keywords, boolean forSearch, boolean anyPrefix, boolean anySuffix, boolean isAnd) {
	String separators = getSeparators();
	String stopWordBagOr = getStopWordBagOr();
	String stopWordBagAnd = getStopWordBagAnd();

	boolean removeStems = getRemoveStems();
	Set<String> stemSet = getStemSet();

	processForKeywords(str, keywords, separators, stopWordBagAnd, stopWordBagOr, removeStems, stemSet, forSearch, anyPrefix, anySuffix, isAnd);
	}

	public static void processKeywordsForIndex(String str, Map<String, Long> keywords, String separators, String stopWordBagAnd, String stopWordBagOr, boolean removeStems, Set<String> stemSet) {
	processForKeywords(str, keywords, separators, stopWordBagAnd, stopWordBagOr, removeStems, stemSet, false, false, false, false);
	}

	public static void processForKeywords(String str, Map<String, Long> keywords, String separators, String stopWordBagAnd, String stopWordBagOr, boolean removeStems, Set<String> stemSet, boolean forSearch, boolean anyPrefix, boolean anySuffix, boolean isAnd) {
	Set<String> keywordSet = makeKeywordSet(str, separators, forSearch);
	fixupKeywordSet(keywordSet, keywords, stopWordBagAnd, stopWordBagOr, removeStems, stemSet, forSearch, anyPrefix, anySuffix, isAnd);
	}

	public static void fixupKeywordSet(Set<String> keywordSet, Map<String, Long> keywords, String stopWordBagAnd, String stopWordBagOr, boolean removeStems, Set<String> stemSet, boolean forSearch, boolean anyPrefix, boolean anySuffix, boolean isAnd) {
	if (keywordSet == null) {
	return;
	}

	for (String token: keywordSet) {

	// when cleaning up the tokens the ordering is inportant: check stop words, remove stems, then get rid of 1 character tokens (1 digit okay)

	// check stop words
	String colonToken = ":" + token + ":";
	if (forSearch) {
	if ((isAnd && stopWordBagAnd.indexOf(colonToken) >= 0) \|\| (!isAnd && stopWordBagOr.indexOf(colonToken) >= 0)) {
	continue;
	}
	} else {
	if (stopWordBagOr.indexOf(colonToken) >= 0 && stopWordBagAnd.indexOf(colonToken) >= 0) {
	continue;
	}
	}

	// remove stems
	if (removeStems) {
	for (String stem: stemSet) {
	if (token.endsWith(stem)) {
	token = token.substring(0, token.length() - stem.length());
	}
	}
	}

	// get rid of all length 0 tokens now
	if (token.length() == 0) {
	continue;
	}

	// get rid of all length 1 character only tokens, pretty much useless
	if (token.length() == 1 && Character.isLetter(token.charAt(0))) {
	continue;
	}

	if (forSearch) {
	StringBuilder strSb = new StringBuilder();
	if (anyPrefix) strSb.append('%');
	strSb.append(token);
	if (anySuffix) strSb.append('%');
	// replace all %% with %
	int dblPercIdx = -1;
	while ((dblPercIdx = strSb.indexOf("%%")) >= 0) {
	//Debug.logInfo("before strSb: " + strSb, module);
	strSb.replace(dblPercIdx, dblPercIdx+2, "%");
	//Debug.logInfo("after strSb: " + strSb, module);
	}
	token = strSb.toString();
	}

	// group by word, add up weight
	Long curWeight = keywords.get(token);
	if (curWeight == null) {
	keywords.put(token, Long.valueOf(1));
	} else {
	keywords.put(token, Long.valueOf(curWeight.longValue() + 1));
	}
	}
	}

	public static Set<String> makeKeywordSet(String str, String separators, boolean forSearch) {
	if (separators == null) separators = getSeparators();

	Set<String> keywords = new TreeSet<String>();
	if (str.length() > 0) {
	// strip off weird characters
	str = str.replaceAll("\\\302\\\240\|\\\240", " ");

	if (forSearch) {
	// remove %_*? from separators if is for a search
	StringBuilder sb = new StringBuilder(separators);
	if (sb.indexOf("%") >= 0) sb.deleteCharAt(sb.indexOf("%"));
	if (sb.indexOf("_") >= 0) sb.deleteCharAt(sb.indexOf("_"));
	if (sb.indexOf("") >= 0) sb.deleteCharAt(sb.indexOf(""));
	if (sb.indexOf("?") >= 0) sb.deleteCharAt(sb.indexOf("?"));
	separators = sb.toString();
	}

	StringTokenizer tokener = new StringTokenizer(str, separators, false);
	while (tokener.hasMoreTokens()) {
	// make sure it is lower case before doing anything else
	String token = tokener.nextToken().toLowerCase();

	if (forSearch) {
	// these characters will only be present if it is for a search, ie not for indexing
	token = token.replace('*', '%');
	token = token.replace('?', '_');
	}

	keywords.add(token);
	}
	}
	return keywords;
	}

	public static Set<String> fixKeywordsForSearch(Set<String> keywordSet, boolean anyPrefix, boolean anySuffix, boolean removeStems, boolean isAnd) {
	Map<String, Long> keywords = FastMap.newInstance();
	fixupKeywordSet(keywordSet, keywords, getStopWordBagAnd(), getStopWordBagOr(), removeStems, getStemSet(), true, anyPrefix, anySuffix, isAnd);
	return keywords.keySet();
	}

	public static boolean expandKeywordForSearch(String enteredKeyword, Set<String> addToSet, Delegator delegator) {
	boolean replaceEnteredKeyword = false;

	try {
	List<GenericValue> thesaurusList = delegator.findByAndCache("KeywordThesaurus", UtilMisc.toMap("enteredKeyword", enteredKeyword));
	for (GenericValue keywordThesaurus: thesaurusList) {
	String relationshipEnumId = (String) keywordThesaurus.get("relationshipEnumId");
	if (thesaurusRelsToInclude.contains(relationshipEnumId)) {
	addToSet.addAll(makeKeywordSet(keywordThesaurus.getString("alternateKeyword"), null, true));
	if (thesaurusRelsForReplace.contains(relationshipEnumId)) {
	replaceEnteredKeyword = true;
	}
	}
	}
	} catch (GenericEntityException e) {
	Debug.logError(e, "Error expanding entered keyword", module);
	}

	Debug.logInfo("Expanded keyword [" + enteredKeyword + "], got set: " + addToSet, module);
	return replaceEnteredKeyword;
	}
	}