/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
| package org.apache.commons.text; |
| |
| import java.io.UnsupportedEncodingException; |
| import java.util.Arrays; |
| import java.util.Collection; |
| import java.util.Collections; |
| import java.util.HashMap; |
| import java.util.Iterator; |
| import java.util.LinkedHashMap; |
| import java.util.LinkedHashSet; |
| import java.util.Map; |
| import java.util.Map.Entry; |
| import java.util.Objects; |
| import java.util.Set; |
| |
/**
 * <p>
 * Convert from one alphabet to another, with the possibility of leaving certain
 * characters unencoded.
 * </p>
 *
 * <p>
 * The target and 'do not encode' alphabets must be in the Unicode BMP, but the
 * source alphabet does not need to be.
 * </p>
 *
 * <p>
 * The encodings will all be of a fixed length, except for the 'do not encode'
 * chars, which will be of length 1.
 * </p>
 *
 * <h3>Sample usage</h3>
 *
 * <pre>
 * Character[] originals;   // a, b, c, d
 * Character[] encoding;    // 0, 1, d
 * Character[] doNotEncode; // d
 *
 * AlphabetConverter ac = AlphabetConverter.createConverterFromChars(originals,
 * encoding, doNotEncode);
 *
 * ac.encode("a");    // 00
 * ac.encode("b");    // 01
 * ac.encode("c");    // 0d
 * ac.encode("d");    // d
 * ac.encode("abcd"); // 00010dd
 * </pre>
 *
 * <p>
 * #ThreadSafe# AlphabetConverter class methods are thread-safe as they do not
 * change internal state.
 * </p>
 *
 * @since 1.0
 *
 */
public final class AlphabetConverter {

    /**
     * Arrow constant, used for converting the object into a string.
     */
    private static final String ARROW = " -> ";

    /**
     * Maps each code point of the original alphabet to its encoded string.
     */
    private final Map<Integer, String> originalToEncoded;

    /**
     * Maps each encoded string back to the original letter it stands for
     * (held as a string so that supplementary code points fit).
     */
    private final Map<String, String> encodedToOriginal;

    /**
     * Length, in chars, of the encoding of a letter that is not in the
     * 'do not encode' list.
     */
    private final int encodedLetterLength;

    /**
     * Hidden constructor for alphabet converter. Used by static helper methods.
     *
     * @param originalToEncoded map from original code point to encoded string
     * @param encodedToOriginal map from encoded string to original letter
     * @param encodedLetterLength length of the encoded letter
     */
    private AlphabetConverter(final Map<Integer, String> originalToEncoded,
                              final Map<String, String> encodedToOriginal,
                              final int encodedLetterLength) {

        this.originalToEncoded = originalToEncoded;
        this.encodedToOriginal = encodedToOriginal;
        this.encodedLetterLength = encodedLetterLength;
    }

    /**
     * Encode a given string.
     *
     * @param original the string to be encoded
     * @return the encoded string, {@code null} if the given string is null
     * @throws UnsupportedEncodingException if chars that are not supported are
     *                                      encountered
     */
    public String encode(final String original)
            throws UnsupportedEncodingException {
        if (original == null) {
            return null;
        }

        final StringBuilder sb = new StringBuilder();

        // walk the input by code point so supplementary characters of the
        // original alphabet are looked up as a single letter
        for (int i = 0; i < original.length();) {
            final int codepoint = original.codePointAt(i);

            final String nextLetter = originalToEncoded.get(codepoint);

            if (nextLetter == null) {
                throw new UnsupportedEncodingException(
                        "Couldn't find encoding for '"
                                + codePointToString(codepoint)
                                + "' in "
                                + original
                );
            }

            sb.append(nextLetter);

            i += Character.charCount(codepoint);
        }

        return sb.toString();
    }

    /**
     * Decode a given string.
     *
     * @param encoded a string that has been encoded using this
     *                AlphabetConverter
     * @return the decoded string, {@code null} if the given string is null
     * @throws UnsupportedEncodingException if unexpected characters that
     *                                      cannot be handled are encountered
     */
    public String decode(final String encoded)
            throws UnsupportedEncodingException {
        if (encoded == null) {
            return null;
        }

        final StringBuilder result = new StringBuilder();

        for (int j = 0; j < encoded.length();) {
            final int codePoint = encoded.codePointAt(j);
            final String s = codePointToString(codePoint);

            if (s.equals(originalToEncoded.get(codePoint))) {
                // a letter that maps to itself ('do not encode'): copy it
                result.append(s);
                // advance by the real width of the code point; this is 1 for
                // BMP letters, and keeps the cursor off a low surrogate if a
                // self-mapped supplementary letter was configured
                j += Character.charCount(codePoint);
            } else {
                if (j + encodedLetterLength > encoded.length()) {
                    throw new UnsupportedEncodingException("Unexpected end "
                            + "of string while decoding " + encoded);
                }
                final String nextGroup = encoded.substring(j,
                        j + encodedLetterLength);
                final String next = encodedToOriginal.get(nextGroup);
                if (next == null) {
                    throw new UnsupportedEncodingException(
                            "Unexpected string without decoding ("
                                    + nextGroup + ") in " + encoded);
                }
                result.append(next);
                j += encodedLetterLength;
            }
        }

        return result.toString();
    }

    /**
     * Get the length of characters in the encoded alphabet that are necessary
     * for each character in the original alphabet.
     *
     * @return the length of the encoded char
     */
    public int getEncodedCharLength() {
        return encodedLetterLength;
    }

    /**
     * Get the mapping from integer code point of source language to encoded
     * string. Use to reconstruct converter from serialized map.
     *
     * @return an unmodifiable view of the original-to-encoded map
     */
    public Map<Integer, String> getOriginalToEncoded() {
        return Collections.unmodifiableMap(originalToEncoded);
    }

    /**
     * Recursive method used when creating encoder/decoder. Builds every
     * fixed-length combination of encoding letters, one position per recursion
     * level, and assigns each finished combination to the next original
     * letter that needs encoding.
     *
     * @param level number of positions still to be filled; at 0 the
     *              current encoding is complete and gets assigned
     * @param currentEncoding the encoding prefix built so far
     * @param encoding letters of the encoding alphabet
     * @param originals iterator over the original letters still unassigned
     * @param doNotEncodeMap map whose keys are the letters that must not be
     *                       encoded
     */
    @SuppressWarnings("PMD")
    private void addSingleEncoding(final int level,
                                   final String currentEncoding,
                                   final Collection<Integer> encoding,
                                   final Iterator<Integer> originals,
                                   final Map<Integer, String> doNotEncodeMap) {

        if (level > 0) {
            for (final int encodingLetter : encoding) {
                if (originals.hasNext()) {

                    // this skips the doNotEncode chars if they are in the
                    // leftmost place
                    if (level != encodedLetterLength
                            || !doNotEncodeMap.containsKey(encodingLetter)) {
                        addSingleEncoding(level - 1,
                                currentEncoding
                                        + codePointToString(encodingLetter),
                                encoding,
                                originals,
                                doNotEncodeMap
                        );
                    }
                } else {
                    return; // done encoding all the original alphabet
                }
            }
        } else {
            Integer next = originals.next();

            // 'do not encode' letters map to themselves and do not consume
            // the encoding that was just built
            while (doNotEncodeMap.containsKey(next)) {
                final String originalLetterAsString = codePointToString(next);

                originalToEncoded.put(next, originalLetterAsString);
                encodedToOriginal.put(originalLetterAsString,
                        originalLetterAsString);

                if (!originals.hasNext()) {
                    return;
                }

                next = originals.next();
            }

            final String originalLetterAsString = codePointToString(next);

            originalToEncoded.put(next, currentEncoding);
            encodedToOriginal.put(currentEncoding, originalLetterAsString);
        }
    }

    /**
     * Lists every mapping as {@code original -> encoded}, one per line.
     *
     * @return the textual form of the original-to-encoded map
     */
    @Override
    public String toString() {
        final StringBuilder sb = new StringBuilder();

        for (final Entry<Integer, String> entry
                : originalToEncoded.entrySet()) {
            sb.append(codePointToString(entry.getKey()))
                    .append(ARROW)
                    .append(entry.getValue()).append(System.lineSeparator());
        }

        return sb.toString();
    }

    /**
     * Two converters are equal when both of their maps and their encoded
     * letter length are equal.
     *
     * @param obj the object to compare with
     * @return {@code true} if {@code obj} is an equal AlphabetConverter
     */
    @Override
    public boolean equals(final Object obj) {
        if (obj == null) {
            return false;
        }
        if (obj == this) {
            return true;
        }
        if (!(obj instanceof AlphabetConverter)) {
            return false;
        }
        final AlphabetConverter other = (AlphabetConverter) obj;
        return originalToEncoded.equals(other.originalToEncoded)
                && encodedToOriginal.equals(other.encodedToOriginal)
                && encodedLetterLength == other.encodedLetterLength;
    }

    /**
     * Hash code built from the same three fields that {@link #equals(Object)}
     * compares.
     *
     * @return the hash code
     */
    @Override
    public int hashCode() {
        return Objects.hash(originalToEncoded,
                encodedToOriginal,
                encodedLetterLength);
    }

    // -- static methods

    /**
     * Create a new converter from a map.
     *
     * <p>The given map is copied, so later changes to it do not affect the
     * returned converter.</p>
     *
     * @param originalToEncoded a map returned from getOriginalToEncoded()
     * @return the reconstructed AlphabetConverter
     * @see AlphabetConverter#getOriginalToEncoded()
     */
    public static AlphabetConverter createConverterFromMap(
            final Map<Integer, String> originalToEncoded) {
        // defensive copy: an unmodifiable *view* of the caller's map would
        // still change whenever the caller mutates the map, leaving the
        // reverse map and the encoded length out of sync
        final Map<Integer, String> copy =
                new LinkedHashMap<>(originalToEncoded);
        final Map<Integer, String> unmodifiableOriginalToEncoded =
                Collections.unmodifiableMap(copy);
        final Map<String, String> encodedToOriginal = new LinkedHashMap<>();

        // the encoded letter length is the longest encoding in the map
        int encodedLetterLength = 1;

        for (final Entry<Integer, String> e
                : unmodifiableOriginalToEncoded.entrySet()) {
            final String originalAsString = codePointToString(e.getKey());
            encodedToOriginal.put(e.getValue(), originalAsString);

            if (e.getValue().length() > encodedLetterLength) {
                encodedLetterLength = e.getValue().length();
            }
        }

        return new AlphabetConverter(unmodifiableOriginalToEncoded,
                encodedToOriginal,
                encodedLetterLength);
    }

    /**
     * Create an alphabet converter, for converting from the original alphabet,
     * to the encoded alphabet, while leaving the characters in
     * <em>doNotEncode</em> as they are (if possible).
     *
     * <p>Duplicate letters in either original or encoding will be ignored.</p>
     *
     * @param original an array of chars representing the original alphabet
     * @param encoding an array of chars representing the alphabet to be used
     *                 for encoding
     * @param doNotEncode an array of chars to be encoded using the original
     *                    alphabet - every char here must appear in
     *                    both the previous params
     * @return the AlphabetConverter
     * @throws IllegalArgumentException if an AlphabetConverter cannot be
     *                                  constructed
     */
    public static AlphabetConverter createConverterFromChars(
            final Character[] original,
            final Character[] encoding,
            final Character[] doNotEncode) {
        return AlphabetConverter.createConverter(
                convertCharsToIntegers(original),
                convertCharsToIntegers(encoding),
                convertCharsToIntegers(doNotEncode));
    }

    /**
     * Convert characters to integers.
     *
     * @param chars array of characters, may be {@code null}
     * @return an equivalent array of integers; empty if {@code chars} is
     *         {@code null} or empty
     */
    private static Integer[] convertCharsToIntegers(final Character[] chars) {
        if (chars == null || chars.length == 0) {
            return new Integer[0];
        }
        final Integer[] integers = new Integer[chars.length];
        for (int i = 0; i < chars.length; i++) {
            integers[i] = (int) chars[i];
        }
        return integers;
    }

    /**
     * Create an alphabet converter, for converting from the original alphabet,
     * to the encoded alphabet, while leaving
     * the characters in <em>doNotEncode</em> as they are (if possible).
     *
     * <p>Duplicate letters in either original or encoding will be ignored.</p>
     *
     * @param original an array of ints representing the original alphabet in
     *                 codepoints
     * @param encoding an array of ints representing the alphabet to be used for
     *                 encoding, in codepoints
     * @param doNotEncode an array of ints representing the chars to be encoded
     *                    using the original alphabet - every char
     *                    here must appear in both the previous params
     * @return the AlphabetConverter
     * @throws IllegalArgumentException if an AlphabetConverter cannot be
     *                                  constructed
     */
    public static AlphabetConverter createConverter(
            final Integer[] original,
            final Integer[] encoding,
            final Integer[] doNotEncode) {
        // linked sets drop duplicates while keeping the caller's order
        final Set<Integer> originalCopy = new LinkedHashSet<>(Arrays.asList(original));
        final Set<Integer> encodingCopy = new LinkedHashSet<>(Arrays.asList(encoding));
        final Set<Integer> doNotEncodeCopy = new LinkedHashSet<>(Arrays.asList(doNotEncode));

        final Map<Integer, String> originalToEncoded = new LinkedHashMap<>();
        final Map<String, String> encodedToOriginal = new LinkedHashMap<>();
        final Map<Integer, String> doNotEncodeMap = new HashMap<>();

        final int encodedLetterLength;

        for (final int i : doNotEncodeCopy) {
            if (!originalCopy.contains(i)) {
                throw new IllegalArgumentException(
                        "Can not use 'do not encode' list because original "
                                + "alphabet does not contain '"
                                + codePointToString(i) + "'");
            }

            if (!encodingCopy.contains(i)) {
                throw new IllegalArgumentException(
                        "Can not use 'do not encode' list because encoding alphabet does not contain '"
                                + codePointToString(i) + "'");
            }

            doNotEncodeMap.put(i, codePointToString(i));
        }

        if (encodingCopy.size() >= originalCopy.size()) {
            // enough encoding letters for a one-to-one mapping
            encodedLetterLength = 1;

            final Iterator<Integer> it = encodingCopy.iterator();

            for (final int originalLetter : originalCopy) {
                final String originalLetterAsString =
                        codePointToString(originalLetter);

                if (doNotEncodeMap.containsKey(originalLetter)) {
                    originalToEncoded.put(originalLetter,
                            originalLetterAsString);
                    encodedToOriginal.put(originalLetterAsString,
                            originalLetterAsString);
                } else {
                    Integer next = it.next();

                    // 'do not encode' letters are reserved for themselves
                    while (doNotEncodeCopy.contains(next)) {
                        next = it.next();
                    }

                    final String encodedLetter = codePointToString(next);

                    originalToEncoded.put(originalLetter, encodedLetter);
                    encodedToOriginal.put(encodedLetter,
                            originalLetterAsString);
                }
            }

            return new AlphabetConverter(originalToEncoded,
                    encodedToOriginal,
                    encodedLetterLength);

        } else if (encodingCopy.size() - doNotEncodeCopy.size() < 2) {
            throw new IllegalArgumentException(
                    "Must have at least two encoding characters (excluding "
                            + "those in the 'do not encode' list), but has "
                            + (encodingCopy.size() - doNotEncodeCopy.size()));
        } else {
            // we start with one which is our minimum, and because we do the
            // first division outside the loop
            int lettersSoFar = 1;

            // the first division takes into account that the doNotEncode
            // letters can't be in the leftmost place
            int lettersLeft = (originalCopy.size() - doNotEncodeCopy.size())
                    / (encodingCopy.size() - doNotEncodeCopy.size());

            while (lettersLeft / encodingCopy.size() >= 1) {
                lettersLeft /= encodingCopy.size();
                lettersSoFar++;
            }

            encodedLetterLength = lettersSoFar + 1;

            final AlphabetConverter ac =
                    new AlphabetConverter(originalToEncoded,
                            encodedToOriginal,
                            encodedLetterLength);

            // fills the two maps held by the new converter
            ac.addSingleEncoding(encodedLetterLength,
                    "",
                    encodingCopy,
                    originalCopy.iterator(),
                    doNotEncodeMap);

            return ac;
        }
    }

    /**
     * Create new String that contains just the given code point.
     *
     * @param i code point
     * @return a new string with the new code point
     * @see "http://www.oracle.com/us/technologies/java/supplementary-142654.html"
     */
    private static String codePointToString(final int i) {
        if (Character.charCount(i) == 1) {
            return String.valueOf((char) i);
        }
        return new String(Character.toChars(i));
    }
}