src/java/org/apache/commons/codec/language/bm/Lang.java - commons-codec - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *      http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 package org.apache.commons.codec.language.bm;

 import java.io.InputStream;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collections;
 import java.util.EnumMap;
 import java.util.HashSet;
 import java.util.List;
 import java.util.Locale;
 import java.util.Map;
 import java.util.Scanner;
 import java.util.Set;
 import java.util.regex.Pattern;

 /**
  * <p>
  * Language guessing utility.
  * </p>
  * <p>
  * This class encapsulates rules used to guess the possible languages that a word originates from. This is done by reference to a whole
  * series of rules distributed in resource files.
  * </p>
  * <p>
  * Instances of this class are typically managed through the static factory method instance(). Unless you are developing your own language
  * guessing rules, you will not need to interact with this class directly.
  * </p>
  * <p>
  * This class is intended to be immutable and thread-safe.
  * </p>
  * <h2>Lang resources</h2>
  * <p>
  * Language guessing rules are typically loaded from resource files. These are UTF-8 encoded text files. They are systematically named
  * following the pattern: <blockquote>org/apache/commons/codec/language/bm/lang.txt</blockquote> The format of these resources is the
  * following:
  * </p>
  * <ul>
  * <li><b>Rules:</b> whitespace separated strings. There should be 3 columns to each row, and these will be interpreted as:
  * <ol>
  * <li>pattern: a regular expression.</li>
  * <li>languages: a '+'-separated list of languages.</li>
  * <li>acceptOnMatch: 'true' or 'false' indicating if a match rules in or rules out the language.</li>
  * </ol>
  * </li>
  * <li><b>End-of-line comments:</b> Any occurrence of '//' will cause all text following on that line to be discarded as a comment.</li>
  * <li><b>Multi-line comments:</b> Any line starting with '/*' will start multi-line commenting mode. This will skip all content until a
  * line ending in '*' and '/' is found.</li>
  * <li><b>Blank lines:</b> All blank lines will be skipped.</li>
  * </ul>
  * <p>
  * Port of lang.php
  * </p>
  *
  * @author Apache Software Foundation
  * @since 2.0
  */
 public class Lang {

     /**
      * A single language-guessing rule: a regular expression, the set of languages it refers to, and a flag saying whether a match
      * rules those languages in or out.
      */
     private static final class LangRule {
         private final boolean acceptOnMatch;
         private final Set<String> languages;
         private final Pattern pattern;

         private LangRule(Pattern pattern, Set<String> languages, boolean acceptOnMatch) {
             this.pattern = pattern;
             this.languages = languages;
             this.acceptOnMatch = acceptOnMatch;
         }

         /**
          * Tests whether this rule's pattern occurs anywhere in the given text.
          *
          * @param txt
          *            the text to test
          * @return true if the pattern is found in txt
          */
         public boolean matches(String txt) {
             return this.pattern.matcher(txt).find();
         }
     }

     /** One Lang per NameType, populated once by the static initializer below. */
     private static final Map<NameType, Lang> Langs = new EnumMap<NameType, Lang>(NameType.class);

     private static final String LANGUAGE_RULES_RN = "org/apache/commons/codec/language/bm/lang.txt";

     static {
         for (NameType s : NameType.values()) {
             Langs.put(s, loadFromResource(LANGUAGE_RULES_RN, Languages.getInstance(s)));
         }
     }

     /**
      * Gets a Lang instance for one of the supported NameTypes.
      *
      * @param nameType
      *            the NameType to look up
      * @return a Lang encapsulating the language guessing rules for that name type
      */
     public static Lang instance(NameType nameType) {
         return Langs.get(nameType);
     }

     /**
      * <p>
      * Loads language rules from a resource.
      * </p>
      * <p>
      * In normal use, you will obtain instances of Lang through the {@link #instance(NameType)} method. You will only need to call this
      * yourself if you are developing custom language mapping rules.
      * </p>
      * <p>
      * Lines that do not split into exactly three whitespace-separated columns are reported on System.err and skipped.
      * </p>
      *
      * @param languageRulesResourceName
      *            the fully-qualified resource name to load
      * @param languages
      *            the languages that these rules will support
      * @return a Lang encapsulating the loaded language-guessing rules.
      * @throws IllegalStateException
      *             if the named resource cannot be found on the class path
      */
     public static Lang loadFromResource(String languageRulesResourceName, Languages languages) {
         List<LangRule> rules = new ArrayList<LangRule>();
         InputStream lRulesIS = Lang.class.getClassLoader().getResourceAsStream(languageRulesResourceName);

         if (lRulesIS == null) {
             // report the resource that was actually requested, not the default one
             throw new IllegalStateException("Unable to resolve required resource:" + languageRulesResourceName);
         }

         Scanner scanner = new Scanner(lRulesIS, ResourceConstants.ENCODING);
         try {
             boolean inExtendedComment = false;
             while (scanner.hasNextLine()) {
                 String rawLine = scanner.nextLine();
                 String line = rawLine;

                 if (inExtendedComment) {
                     // everything inside a multi-line comment is discarded, including the line that ends it
                     if (line.endsWith(ResourceConstants.EXT_CMT_END)) {
                         inExtendedComment = false;
                     }
                     continue;
                 }

                 if (line.startsWith(ResourceConstants.EXT_CMT_START)) {
                     // the opening line itself is not checked for the end marker
                     inExtendedComment = true;
                     continue;
                 }

                 // discard end-of-line comments
                 int cmtI = line.indexOf(ResourceConstants.CMT);
                 if (cmtI >= 0) {
                     line = line.substring(0, cmtI);
                 }

                 // trim leading-trailing whitespace
                 line = line.trim();

                 if (line.length() == 0) {
                     continue; // empty lines can be safely skipped
                 }

                 // split it up into: pattern, '+'-separated languages, acceptOnMatch
                 String[] parts = line.split("\\s+");

                 if (parts.length != 3) {
                     // fixme: we really need to log this somewhere
                     System.err.println("Warning: malformed line '" + rawLine + "'");
                     continue;
                 }

                 Pattern pattern = Pattern.compile(parts[0]);
                 String[] langs = parts[1].split("\\+");
                 boolean accept = parts[2].equals("true");

                 rules.add(new LangRule(pattern, new HashSet<String>(Arrays.asList(langs)), accept));
             }
         } finally {
             // closing the scanner also closes the underlying resource stream
             scanner.close();
         }

         return new Lang(rules, languages);
     }

     private final Languages languages;
     private final List<LangRule> rules;

     private Lang(List<LangRule> rules, Languages languages) {
         this.rules = Collections.unmodifiableList(rules);
         this.languages = languages;
     }

     /**
      * Guesses the language of a word.
      *
      * @param text
      *            the word
      * @return the language that the word originates from or {@link Languages#ANY} if there was no unique match
      */
     public String guessLanguage(String text) {
         Languages.LanguageSet ls = guessLanguages(text);
         return ls.isSingleton() ? ls.getAny() : Languages.ANY;
     }

     /**
      * Guesses the languages of a word.
      * <p>
      * The input is lower-cased and every rule is applied in order, starting from the full set of supported languages: a matching
      * accept rule keeps only that rule's languages, a matching reject rule removes them.
      * </p>
      *
      * @param input
      *            the word
      * @return the set of language names that are potential matches for the input word, or {@link Languages#ANY_LANGUAGE} if the rules
      *         eliminated every language
      */
     public Languages.LanguageSet guessLanguages(String input) {
         String text = input.toLowerCase(Locale.ENGLISH);

         Set<String> langs = new HashSet<String>(this.languages.getLanguages());
         for (LangRule rule : this.rules) {
             if (rule.matches(text)) {
                 if (rule.acceptOnMatch) {
                     langs.retainAll(rule.languages);
                 } else {
                     langs.removeAll(rule.languages);
                 }
             }
         }

         Languages.LanguageSet ls = Languages.LanguageSet.from(langs);
         return ls.equals(Languages.NO_LANGUAGES) ? Languages.ANY_LANGUAGE : ls;
     }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	package org.apache.commons.codec.language.bm;

	import java.io.InputStream;
	import java.util.ArrayList;
	import java.util.Arrays;
	import java.util.Collections;
	import java.util.EnumMap;
	import java.util.HashSet;
	import java.util.List;
	import java.util.Locale;
	import java.util.Map;
	import java.util.Scanner;
	import java.util.Set;
	import java.util.regex.Pattern;

	/**
	* <p>
	* Language guessing utility.
	* </p>
	* <p>
	* This class encapsulates rules used to guess the possible languages that a word originates from. This is done by reference to a whole
	* series of rules distributed in resource files.
	* </p>
	* <p>
	* Instances of this class are typically managed through the static factory method instance(). Unless you are developing your own language
	* guessing rules, you will not need to interact with this class directly.
	* </p>
	* <p>
	* This class is intended to be immutable and thread-safe.
	* </p>
	* <h2>Lang resources</h2>
	* <p>
	* Language guessing rules are typically loaded from resource files. These are UTF-8 encoded text files. They are systematically named
	* following the pattern: <blockquote>org/apache/commons/codec/language/bm/lang.txt</blockquote> The format of these resources is the
	* following:
	* </p>
	* <ul>
	* <li><b>Rules:</b> whitespace separated strings. There should be 3 columns to each row, and these will be interpreted as:
	* <ol>
	* <li>pattern: a regular expression.</li>
	* <li>languages: a '+'-separated list of languages.</li>
	* <li>acceptOnMatch: 'true' or 'false' indicating if a match rules in or rules out the language.</li>
	* </ol>
	* </li>
	* <li><b>End-of-line comments:</b> Any occurrence of '//' will cause all text following on that line to be discarded as a comment.</li>
	* <li><b>Multi-line comments:</b> Any line starting with '/*' will start multi-line commenting mode. This will skip all content until a
	* line ending in '*' and '/' is found.</li>
	* <li><b>Blank lines:</b> All blank lines will be skipped.</li>
	* </ul>
	* <p>
	* Port of lang.php
	* </p>
	*
	* @author Apache Software Foundation
	* @since 2.0
	*/
	public class Lang {

	    /**
	     * A single language-guessing rule: a regular expression, the set of languages it refers to, and a flag saying whether a match
	     * rules those languages in or out.
	     */
	    private static final class LangRule {
	        private final boolean acceptOnMatch;
	        private final Set<String> languages;
	        private final Pattern pattern;

	        private LangRule(Pattern pattern, Set<String> languages, boolean acceptOnMatch) {
	            this.pattern = pattern;
	            this.languages = languages;
	            this.acceptOnMatch = acceptOnMatch;
	        }

	        /**
	         * Tests whether this rule's pattern occurs anywhere in the given text.
	         *
	         * @param txt
	         *            the text to test
	         * @return true if the pattern is found in txt
	         */
	        public boolean matches(String txt) {
	            return this.pattern.matcher(txt).find();
	        }
	    }

	    /** One Lang per NameType, populated once by the static initializer below. */
	    private static final Map<NameType, Lang> Langs = new EnumMap<NameType, Lang>(NameType.class);

	    private static final String LANGUAGE_RULES_RN = "org/apache/commons/codec/language/bm/lang.txt";

	    static {
	        for (NameType s : NameType.values()) {
	            Langs.put(s, loadFromResource(LANGUAGE_RULES_RN, Languages.getInstance(s)));
	        }
	    }

	    /**
	     * Gets a Lang instance for one of the supported NameTypes.
	     *
	     * @param nameType
	     *            the NameType to look up
	     * @return a Lang encapsulating the language guessing rules for that name type
	     */
	    public static Lang instance(NameType nameType) {
	        return Langs.get(nameType);
	    }

	    /**
	     * <p>
	     * Loads language rules from a resource.
	     * </p>
	     * <p>
	     * In normal use, you will obtain instances of Lang through the {@link #instance(NameType)} method. You will only need to call this
	     * yourself if you are developing custom language mapping rules.
	     * </p>
	     * <p>
	     * Lines that do not split into exactly three whitespace-separated columns are reported on System.err and skipped.
	     * </p>
	     *
	     * @param languageRulesResourceName
	     *            the fully-qualified resource name to load
	     * @param languages
	     *            the languages that these rules will support
	     * @return a Lang encapsulating the loaded language-guessing rules.
	     * @throws IllegalStateException
	     *             if the named resource cannot be found on the class path
	     */
	    public static Lang loadFromResource(String languageRulesResourceName, Languages languages) {
	        List<LangRule> rules = new ArrayList<LangRule>();
	        InputStream lRulesIS = Lang.class.getClassLoader().getResourceAsStream(languageRulesResourceName);

	        if (lRulesIS == null) {
	            // report the resource that was actually requested, not the default one
	            throw new IllegalStateException("Unable to resolve required resource:" + languageRulesResourceName);
	        }

	        Scanner scanner = new Scanner(lRulesIS, ResourceConstants.ENCODING);
	        try {
	            boolean inExtendedComment = false;
	            while (scanner.hasNextLine()) {
	                String rawLine = scanner.nextLine();
	                String line = rawLine;

	                if (inExtendedComment) {
	                    // everything inside a multi-line comment is discarded, including the line that ends it
	                    if (line.endsWith(ResourceConstants.EXT_CMT_END)) {
	                        inExtendedComment = false;
	                    }
	                    continue;
	                }

	                if (line.startsWith(ResourceConstants.EXT_CMT_START)) {
	                    // the opening line itself is not checked for the end marker
	                    inExtendedComment = true;
	                    continue;
	                }

	                // discard end-of-line comments
	                int cmtI = line.indexOf(ResourceConstants.CMT);
	                if (cmtI >= 0) {
	                    line = line.substring(0, cmtI);
	                }

	                // trim leading-trailing whitespace
	                line = line.trim();

	                if (line.length() == 0) {
	                    continue; // empty lines can be safely skipped
	                }

	                // split it up into: pattern, '+'-separated languages, acceptOnMatch
	                String[] parts = line.split("\\s+");

	                if (parts.length != 3) {
	                    // fixme: we really need to log this somewhere
	                    System.err.println("Warning: malformed line '" + rawLine + "'");
	                    continue;
	                }

	                Pattern pattern = Pattern.compile(parts[0]);
	                String[] langs = parts[1].split("\\+");
	                boolean accept = parts[2].equals("true");

	                rules.add(new LangRule(pattern, new HashSet<String>(Arrays.asList(langs)), accept));
	            }
	        } finally {
	            // closing the scanner also closes the underlying resource stream
	            scanner.close();
	        }

	        return new Lang(rules, languages);
	    }

	    private final Languages languages;
	    private final List<LangRule> rules;

	    private Lang(List<LangRule> rules, Languages languages) {
	        this.rules = Collections.unmodifiableList(rules);
	        this.languages = languages;
	    }

	    /**
	     * Guesses the language of a word.
	     *
	     * @param text
	     *            the word
	     * @return the language that the word originates from or {@link Languages#ANY} if there was no unique match
	     */
	    public String guessLanguage(String text) {
	        Languages.LanguageSet ls = guessLanguages(text);
	        return ls.isSingleton() ? ls.getAny() : Languages.ANY;
	    }

	    /**
	     * Guesses the languages of a word.
	     * <p>
	     * The input is lower-cased and every rule is applied in order, starting from the full set of supported languages: a matching
	     * accept rule keeps only that rule's languages, a matching reject rule removes them.
	     * </p>
	     *
	     * @param input
	     *            the word
	     * @return the set of language names that are potential matches for the input word, or {@link Languages#ANY_LANGUAGE} if the rules
	     *         eliminated every language
	     */
	    public Languages.LanguageSet guessLanguages(String input) {
	        String text = input.toLowerCase(Locale.ENGLISH);

	        Set<String> langs = new HashSet<String>(this.languages.getLanguages());
	        for (LangRule rule : this.rules) {
	            if (rule.matches(text)) {
	                if (rule.acceptOnMatch) {
	                    langs.retainAll(rule.languages);
	                } else {
	                    langs.removeAll(rule.languages);
	                }
	            }
	        }

	        Languages.LanguageSet ls = Languages.LanguageSet.from(langs);
	        return ls.equals(Languages.NO_LANGUAGES) ? Languages.ANY_LANGUAGE : ls;
	    }
	}