| /* |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, |
| * software distributed under the License is distributed on an |
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| * KIND, either express or implied. See the License for the |
| * specific language governing permissions and limitations |
| * under the License. |
| */ |
| package org.apache.joshua.util; |
| |
| import java.io.PrintStream; |
| import java.io.UnsupportedEncodingException; |
| import java.util.regex.Pattern; |
| |
| import org.apache.joshua.corpus.Vocabulary; |
| import org.slf4j.Logger; |
| import org.slf4j.LoggerFactory; |
| |
| /** |
| * Utility class for format issues. |
| * |
| * @author Juri Ganitkevitch |
| * @author Lane Schwartz |
| */ |
| public class FormatUtils { |
| |
| private static final Logger LOG = LoggerFactory.getLogger(FormatUtils.class); |
| |
| private static final String INDEX_SEPARATOR = ","; |
| |
| /** |
| * Determines whether the string is a nonterminal by checking that the first character is [ |
| * and the last character is ]. |
| * |
| * @param token input string |
| * @return true if it's a nonterminal symbol, false otherwise |
| */ |
| public static boolean isNonterminal(String token) { |
| return (token.length() >=3 && token.charAt(0) == '[') && (token.charAt(token.length() - 1) == ']'); |
| } |
| |
| /** |
| * Determines whether the ID represents a nonterminal. This is a trivial check, since nonterminal |
| * IDs are simply negative ones. |
| * |
| * @param id the vocabulary ID |
| * @return true if a nonterminal ID, false otherwise |
| */ |
| public static boolean isNonterminal(int id) { |
| return id < 0; |
| } |
| |
| /** |
| * Nonterminals are stored in the vocabulary in square brackets. This removes them when you |
| * just want the raw nonterminal word. |
| * Supports indexed and non-indexed nonTerminals: |
| * [GOAL] -> GOAL |
| * [X,1] -> [X] |
| * |
| * @param nt the nonterminal, e.g., "[GOAL]" |
| * @return the cleaned nonterminal, e.g., "GOAL" |
| */ |
| public static String cleanNonTerminal(String nt) { |
| if (isNonterminal(nt)) { |
| if (isIndexedNonTerminal(nt)) { |
| // strip ",.*]" |
| return nt.substring(1, nt.indexOf(INDEX_SEPARATOR)); |
| } |
| // strip "]" |
| return nt.substring(1, nt.length() - 1); |
| } |
| return nt; |
| } |
| |
| private static boolean isIndexedNonTerminal(String nt) { |
| return nt.contains(INDEX_SEPARATOR); |
| } |
| |
| /** |
| * Removes the index from a nonTerminal: [X,1] -> [X]. |
| * @param nt an input non-terminal string |
| * @return the stripped non terminal string |
| */ |
| public static String stripNonTerminalIndex(String nt) { |
| return ensureNonTerminalBrackets(cleanNonTerminal(nt)); |
| } |
| |
| /** |
| * Nonterminals on source and target sides are represented as [X,1], where 1 is an integer |
| * that links the two sides. This function extracts the index, e.g., |
| * |
| * getNonterminalIndex("[X,7]") produces 7 |
| * |
| * @param nt the nonterminal string |
| * @return the index |
| */ |
| public static int getNonterminalIndex(String nt) { |
| return Integer.parseInt(nt.substring(nt.indexOf(INDEX_SEPARATOR) + 1, nt.length() - 1)); |
| } |
| |
| /** |
| * Ensures that a string looks like what the system considers a nonterminal to be. |
| * |
| * @param nt the nonterminal string |
| * @return the nonterminal string surrounded in square brackets (if not already) |
| */ |
| public static String ensureNonTerminalBrackets(String nt) { |
| if (isNonterminal(nt)) |
| return nt; |
| else |
| return "[" + nt + "]"; |
| } |
| |
| public static String escapeSpecialSymbols(String s) { |
| return s.replace("\\[", "-lsb-") |
| .replace("\\]", "-rsb-") |
| .replace("\\|", "-pipe-"); |
| } |
| |
| public static String unescapeSpecialSymbols(String s) { |
| return s.replace("-lsb-", "[") |
| .replace("-rsb-", "]") |
| .replace("-pipe-", "|"); |
| } |
| |
| /** |
| * wrap sentence with sentence start/stop markers |
| * as defined by Vocabulary; separated by a single whitespace. |
| * @param s an input sentence |
| * @return the wrapped sentence |
| */ |
| public static String addSentenceMarkers(String s) { |
| return Vocabulary.START_SYM + " " + s + " " + Vocabulary.STOP_SYM; |
| } |
| |
| /** |
| * strip sentence markers (and whitespaces) from string |
| * @param s the sentence to strip of markers (and whitespaces) |
| * @return the stripped string |
| */ |
| public static String removeSentenceMarkers(String s) { |
| return s.replaceAll("<s> ", "").replace(" </s>", ""); |
| } |
| |
| /** |
| * Returns true if the String parameter represents a valid number. |
| * <p> |
| * The body of this method is taken from the Javadoc documentation for the Java Double class. |
| * |
| * @param string an input string |
| * @see java.lang.Double |
| * @return <code>true</code> if the string represents a valid number, <code>false</code> otherwise |
| */ |
| public static boolean isNumber(String string) { |
| final String Digits = "(\\p{Digit}+)"; |
| final String HexDigits = "(\\p{XDigit}+)"; |
| // an exponent is 'e' or 'E' followed by an optionally |
| // signed decimal integer. |
| final String Exp = "[eE][+-]?" + Digits; |
| final String fpRegex = ("[\\x00-\\x20]*" + // Optional leading "whitespace" |
| "[+-]?(" + // Optional sign character |
| "NaN|" + // "NaN" string |
| "Infinity|" + // "Infinity" string |
| |
| // A decimal floating-point string representing a finite positive |
| // number without a leading sign has at most five basic pieces: |
| // Digits . Digits ExponentPart FloatTypeSuffix |
| // |
| // Since this method allows integer-only strings as input |
| // in addition to strings of floating-point literals, the |
| // two sub-patterns below are simplifications of the grammar |
| // productions from the Java Language Specification, 2nd |
| // edition, section 3.10.2. |
| |
| // Digits ._opt Digits_opt ExponentPart_opt FloatTypeSuffix_opt |
| "(((" + Digits + "(\\.)?(" + Digits + "?)(" + Exp + ")?)|" + |
| |
| // . Digits ExponentPart_opt FloatTypeSuffix_opt |
| "(\\.(" + Digits + ")(" + Exp + ")?)|" + |
| |
| // Hexadecimal strings |
| "((" + |
| // 0[xX] HexDigits ._opt BinaryExponent FloatTypeSuffix_opt |
| "(0[xX]" + HexDigits + "(\\.)?)|" + |
| |
| // 0[xX] HexDigits_opt . HexDigits BinaryExponent FloatTypeSuffix_opt |
| "(0[xX]" + HexDigits + "?(\\.)" + HexDigits + ")" + |
| |
| ")[pP][+-]?" + Digits + "))" + "[fFdD]?))" + "[\\x00-\\x20]*");// Optional |
| // trailing |
| // "whitespace" |
| |
| return Pattern.matches(fpRegex, string); |
| } |
| |
| /** |
| * Set System.out and System.err to use the UTF8 character encoding. |
| * |
| * @return <code>true</code> if both System.out and System.err were successfully set to use UTF8, |
| * <code>false</code> otherwise. |
| */ |
| public static boolean useUTF8() { |
| |
| try { |
| System.setOut(new PrintStream(System.out, true, "UTF8")); |
| System.setErr(new PrintStream(System.err, true, "UTF8")); |
| return true; |
| } catch (UnsupportedEncodingException e1) { |
| LOG.warn("UTF8 is not a valid encoding; using system default encoding for System.out and System.err."); |
| return false; |
| } catch (SecurityException e2) { |
| LOG.warn("Security manager is configured to disallow changes to System.out or System.err; using system default encoding."); |
| return false; |
| } |
| } |
| |
| /** |
| * Determines if a string contains ALL CAPS |
| * |
| * @param token an input token |
| * @return true if the string is all in uppercase, false otherwise |
| */ |
| public static boolean ISALLUPPERCASE(String token) { |
| for (int i = 0; i < token.length(); i++) |
| if (! Character.isUpperCase(token.charAt(i))) |
| return false; |
| return true; |
| } |
| |
| public static String capitalize(String word) { |
| if (word == null || word.length() == 0) |
| return word; |
| return word.substring(0, 1).toUpperCase() + word.substring(1); |
| } |
| } |