Creating commons-codec-1.6-RC1 tag
git-svn-id: https://svn.apache.org/repos/asf/commons/proper/codec/tags/commons-codec-1.6-RC1@1201740 13f79535-47bb-0310-9956-ffa450edef68
diff --git a/RELEASE-NOTES.txt b/RELEASE-NOTES.txt
index 2606353..459c1c7 100644
--- a/RELEASE-NOTES.txt
+++ b/RELEASE-NOTES.txt
@@ -1,6 +1,6 @@
$Id$
-The Commons Codec team is pleased to announce the commons-codec-1.5 release!
+The Commons Codec team is pleased to announce the commons-codec-1.6 release!
The codec package contains simple encoder and decoders for
various formats such as Base64 and Hexadecimal. In addition to these
@@ -9,6 +9,25 @@
Changes in this version include:
+Fixed Bugs:
+o Use standard Maven directory layout. Issue: CODEC-129. Thanks to ggregory.
+o Documentation spelling fixes. Issue: CODEC-128. Thanks to ville.skytta@iki.fi.
+o Fix various character encoding issues in comments and test cases. Issue: CODEC-127.
+o ColognePhonetic Javadoc should use HTML entities for special characters. Issue: CODEC-123.
+
+Changes:
+o Implement a Beider-Morse phonetic matching codec. Issue: CODEC-125. Thanks to Matthew Pocock.
+o Migrate to Java 5. Issue: CODEC-119.
+o Migrate to JUnit 4. Issue: CODEC-120.
+
+Have fun!
+-Commons Codec team
+
+
+===============================================================================
+
+Commons Codec Package Version 1.5 Release Notes
+
New features:
o Add test(s) to check that encodeBase64() does not chunk output. Issue: CODEC-93. Thanks to sebb.
o ArrayIndexOutOfBoundsException when doing multiple reads() on encoding Base64InputStream. Issue: CODEC-105. Thanks to zak.
diff --git a/default.properties b/default.properties
index d745013..7a223db 100644
--- a/default.properties
+++ b/default.properties
@@ -21,8 +21,8 @@
repository=${user.home}/.m2/repository
# The pathname of the "junit.jar" JAR file
-junit.home=${repository}/junit/junit/4.9
-junit.jar = ${junit.home}/junit-4.9.jar
+junit.home=${repository}/junit/junit/4.10
+junit.jar = ${junit.home}/junit-4.10.jar
# The name of this component
component.name = commons-codec
diff --git a/pom.xml b/pom.xml
index 2604903..43541ee 100644
--- a/pom.xml
+++ b/pom.xml
@@ -25,7 +25,7 @@
<modelVersion>4.0.0</modelVersion>
<groupId>commons-codec</groupId>
<artifactId>commons-codec</artifactId>
- <version>1.6-SNAPSHOT</version>
+ <version>1.6</version>
<name>Commons Codec</name>
<inceptionYear>2002</inceptionYear>
<description>
@@ -191,7 +191,7 @@
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
- <version>4.9</version>
+ <version>4.10</version>
<scope>test</scope>
</dependency>
</dependencies>
@@ -210,10 +210,28 @@
<commons.encoding>UTF-8</commons.encoding>
</properties>
<build>
+ <pluginManagement>
+ <plugins>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-site-plugin</artifactId>
+ <version>3.0</version>
+ <dependencies>
+ <dependency>
+ <!-- add support for ssh/scp -->
+ <groupId>org.apache.maven.wagon</groupId>
+ <artifactId>wagon-ssh</artifactId>
+ <version>1.0</version>
+ </dependency>
+ </dependencies>
+ </plugin>
+ </plugins>
+ </pluginManagement>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-surefire-plugin</artifactId>
+ <version>2.10</version>
<configuration>
<includes>
<include>**/*Test.java</include>
@@ -227,6 +245,7 @@
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-jar-plugin</artifactId>
+ <version>2.3.2</version>
<executions>
<execution>
<goals>
@@ -238,6 +257,7 @@
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-assembly-plugin</artifactId>
+ <version>2.2.1</version>
<configuration>
<descriptors>
<descriptor>src/main/assembly/bin.xml</descriptor>
@@ -253,8 +273,8 @@
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-changes-plugin</artifactId>
+ <version>2.6</version>
<configuration>
-<!-- <xmlPath>${basedir}/src/changes/changes.xml</xmlPath> -->
<issueLinkTemplate>%URL%/%ISSUE%</issueLinkTemplate>
<!-- TODO: <onlyCurrentVersion>true</onlyCurrentVersion> -->
</configuration>
@@ -270,7 +290,7 @@
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-checkstyle-plugin</artifactId>
- <version>2.6</version>
+ <version>2.8</version>
<configuration>
<configLocation>${basedir}/checkstyle.xml</configLocation>
<enableRulesSummary>false</enableRulesSummary>
@@ -295,6 +315,23 @@
<artifactId>findbugs-maven-plugin</artifactId>
<version>2.3.2</version>
</plugin>
+ <plugin>
+ <groupId>org.codehaus.mojo</groupId>
+ <artifactId>taglist-maven-plugin</artifactId>
+ <version>2.4</version>
+ <configuration>
+ <tags>
+ <tag>TODO</tag>
+ <tag>NOPMD</tag>
+ <tag>NOTE</tag>
+ </tags>
+ </configuration>
+ </plugin>
+ <plugin>
+ <groupId>org.codehaus.mojo</groupId>
+ <artifactId>javancss-maven-plugin</artifactId>
+ <version>2.0</version>
+ </plugin>
</plugins>
</reporting>
</project>
diff --git a/src/changes/changes.xml b/src/changes/changes.xml
index e62d64e..6bf4845 100644
--- a/src/changes/changes.xml
+++ b/src/changes/changes.xml
@@ -25,7 +25,7 @@
<!-- <release version="2.0" date="TBA" description="Feature and fix release."> <action dev="ggregory" type="fix" issue="CODEC-126"> Make
org.apache.commons.codec.net.URLCodec charset field final. </action> </release>
-->
- <release version="1.6" date="TBA" description="Feature and fix release.">
+ <release version="1.6" date="20 November 2011" description="Feature and fix release.">
<action dev="ggregory" type="fix" issue="CODEC-129" due-to="ggregory">
Use standard Maven directory layout.
</action>
@@ -35,7 +35,7 @@
<action dev="ggregory, sebb" type="fix" issue="CODEC-127">
Fix various character encoding issues in comments and test cases.
</action>
- <action dev="ggregory, matthewpocock" type="update" issue="CODEC-125">
+ <action dev="ggregory, matthewpocock" type="update" issue="CODEC-125" due-to="Matthew Pocock">
Implement a Beider-Morse phonetic matching codec.
</action>
<action dev="ggregory" type="update" issue="CODEC-119">
diff --git a/src/main/java/org/apache/commons/codec/StringEncoderComparator.java b/src/main/java/org/apache/commons/codec/StringEncoderComparator.java
index dd656c4..ac48b36 100644
--- a/src/main/java/org/apache/commons/codec/StringEncoderComparator.java
+++ b/src/main/java/org/apache/commons/codec/StringEncoderComparator.java
@@ -35,6 +35,16 @@
private final StringEncoder stringEncoder;
/**
+ * Constructs a new instance.
+ *
+ * @deprecated Creating an instance without a {@link StringEncoder} leads to a {@link NullPointerException}. Will be
+ * removed in 2.0.
+ */
+ public StringEncoderComparator() {
+ this.stringEncoder = null; // Trying to use this will cause things to break
+ }
+
+ /**
* Constructs a new instance with the given algorithm.
*
* @param stringEncoder
diff --git a/src/main/java/org/apache/commons/codec/binary/Base64.java b/src/main/java/org/apache/commons/codec/binary/Base64.java
index 0b31c7d..ac1824b 100644
--- a/src/main/java/org/apache/commons/codec/binary/Base64.java
+++ b/src/main/java/org/apache/commons/codec/binary/Base64.java
@@ -468,6 +468,20 @@
}
/**
+ * Tests a given byte array to see if it contains only valid characters within the Base64 alphabet. Currently the
+ * method treats whitespace as valid.
+ *
+ * @param arrayOctet
+ * byte array to test
+ * @return <code>true</code> if all bytes are valid characters in the Base64 alphabet or if the byte array is empty;
+ * <code>false</code>, otherwise
+ * @deprecated 1.5 Use {@link #isBase64(byte[])}, will be removed in 2.0.
+ */
+ public static boolean isArrayByteBase64(byte[] arrayOctet) {
+ return isBase64(arrayOctet);
+ }
+
+ /**
* Returns whether or not the <code>octet</code> is in the base 64 alphabet.
*
* @param octet
diff --git a/src/main/java/org/apache/commons/codec/language/Caverphone.java b/src/main/java/org/apache/commons/codec/language/Caverphone.java
new file mode 100644
index 0000000..29b0694
--- /dev/null
+++ b/src/main/java/org/apache/commons/codec/language/Caverphone.java
@@ -0,0 +1,103 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.commons.codec.language;
+
+import org.apache.commons.codec.EncoderException;
+import org.apache.commons.codec.StringEncoder;
+
+/**
+ * Encodes a string into a Caverphone 2.0 value. Delegates to a {@link Caverphone2} instance.
+ *
+ * This is an algorithm created by the Caversham Project at the University of Otago. It implements the Caverphone 2.0
+ * algorithm:
+ *
+ * @author Apache Software Foundation
+ * @version $Id: Caverphone.java 1079535 2011-03-08 20:54:37Z ggregory $
+ * @see <a href="http://en.wikipedia.org/wiki/Caverphone">Wikipedia - Caverphone</a>
+ * @see <a href="http://caversham.otago.ac.nz/files/working/ctp150804.pdf">Caverphone 2.0 specification</a>
+ * @since 1.4
+ * @deprecated 1.5 Replaced by {@link Caverphone2}, will be removed in 2.0.
+ */
+public class Caverphone implements StringEncoder {
+
+ /**
+ * Delegate to a {@link Caverphone2} instance to avoid code duplication.
+ */
+ final private Caverphone2 encoder = new Caverphone2();
+
+ /**
+ * Creates an instance of the Caverphone encoder
+ */
+ public Caverphone() {
+ super();
+ }
+
+ /**
+ * Encodes the given String into a Caverphone value.
+ *
+ * @param source
+ * String the source string
+ * @return A caverphone code for the given String
+ */
+ public String caverphone(String source) {
+ return this.encoder.encode(source);
+ }
+
+ /**
+ * Encodes an Object using the caverphone algorithm. This method is provided in order to satisfy the requirements of
+ * the Encoder interface, and will throw an EncoderException if the supplied object is not of type java.lang.String.
+ *
+ * @param pObject
+ * Object to encode
+ * @return An object (of type java.lang.String) containing the caverphone code which corresponds to the String
+ * supplied.
+ * @throws EncoderException
+ * if the parameter supplied is not of type java.lang.String
+ */
+ public Object encode(Object pObject) throws EncoderException {
+ if (!(pObject instanceof String)) {
+ throw new EncoderException("Parameter supplied to Caverphone encode is not of type java.lang.String");
+ }
+ return this.caverphone((String) pObject);
+ }
+
+ /**
+ * Encodes a String using the Caverphone algorithm.
+ *
+ * @param pString
+ * String object to encode
+ * @return The caverphone code corresponding to the String supplied
+ */
+ public String encode(String pString) {
+ return this.caverphone(pString);
+ }
+
+ /**
+ * Tests if the caverphones of two strings are identical.
+ *
+ * @param str1
+ * First of two strings to compare
+ * @param str2
+ * Second of two strings to compare
+ * @return <code>true</code> if the caverphones of these strings are identical, <code>false</code> otherwise.
+ */
+ public boolean isCaverphoneEqual(String str1, String str2) {
+ return this.caverphone(str1).equals(this.caverphone(str2));
+ }
+
+}
diff --git a/src/main/java/org/apache/commons/codec/language/Soundex.java b/src/main/java/org/apache/commons/codec/language/Soundex.java
index 41cc962..eada7a5 100644
--- a/src/main/java/org/apache/commons/codec/language/Soundex.java
+++ b/src/main/java/org/apache/commons/codec/language/Soundex.java
@@ -56,29 +56,12 @@
*/
public static final Soundex US_ENGLISH = new Soundex();
-
/**
- * Encodes the Strings and returns the number of characters in the two encoded Strings that are the same. This
- * return value ranges from 0 through 4: 0 indicates little or no similarity, and 4 indicates strong similarity or
- * identical values.
+ * The maximum length of a Soundex code - Soundex codes are only four characters by definition.
*
- * @param s1
- * A String that will be encoded and compared.
- * @param s2
- * A String that will be encoded and compared.
- * @return The number of characters in the two encoded Strings that are the same from 0 to 4.
- *
- * @see SoundexUtils#difference(StringEncoder,String,String)
- * @see <a href="http://msdn.microsoft.com/library/default.asp?url=/library/en-us/tsqlref/ts_de-dz_8co5.asp"> MS
- * T-SQL DIFFERENCE </a>
- *
- * @throws EncoderException
- * if an error occurs encoding one of the strings
- * @since 1.3
+ * @deprecated This feature is not needed since the encoding size must be constant. Will be removed in 2.0.
*/
- public int difference(String s1, String s2) throws EncoderException {
- return SoundexUtils.difference(this, s1, s2);
- }
+ private int maxLength = 4;
/**
* Every letter of the alphabet is "mapped" to a numerical value. This char array holds the values to which each
@@ -124,6 +107,29 @@
}
/**
+ * Encodes the Strings and returns the number of characters in the two encoded Strings that are the same. This
+ * return value ranges from 0 through 4: 0 indicates little or no similarity, and 4 indicates strong similarity or
+ * identical values.
+ *
+ * @param s1
+ * A String that will be encoded and compared.
+ * @param s2
+ * A String that will be encoded and compared.
+ * @return The number of characters in the two encoded Strings that are the same from 0 to 4.
+ *
+ * @see SoundexUtils#difference(StringEncoder,String,String)
+ * @see <a href="http://msdn.microsoft.com/library/default.asp?url=/library/en-us/tsqlref/ts_de-dz_8co5.asp"> MS
+ * T-SQL DIFFERENCE </a>
+ *
+ * @throws EncoderException
+ * if an error occurs encoding one of the strings
+ * @since 1.3
+ */
+ public int difference(String s1, String s2) throws EncoderException {
+ return SoundexUtils.difference(this, s1, s2);
+ }
+
+ /**
* Encodes an Object using the soundex algorithm. This method is provided in order to satisfy the requirements of
* the Encoder interface, and will throw an EncoderException if the supplied object is not of type java.lang.String.
*
@@ -187,6 +193,16 @@
}
/**
+ * Returns the maxLength. Standard Soundex codes are four characters long.
+ *
+ * @deprecated This feature is not needed since the encoding size must be constant. Will be removed in 2.0.
+ * @return int
+ */
+ public int getMaxLength() {
+ return this.maxLength;
+ }
+
+ /**
* Returns the soundex mapping.
*
* @return soundexMapping.
@@ -213,6 +229,17 @@
}
/**
+ * Sets the maxLength.
+ *
+ * @deprecated This feature is not needed since the encoding size must be constant. Will be removed in 2.0.
+ * @param maxLength
+ * The maxLength to set
+ */
+ public void setMaxLength(int maxLength) {
+ this.maxLength = maxLength;
+ }
+
+ /**
* Retrieves the Soundex code for a given String object.
*
* @param str
diff --git a/src/main/java/org/apache/commons/codec/language/bm/BeiderMorseEncoder.java b/src/main/java/org/apache/commons/codec/language/bm/BeiderMorseEncoder.java
index b71e4aa..e306cd3 100644
--- a/src/main/java/org/apache/commons/codec/language/bm/BeiderMorseEncoder.java
+++ b/src/main/java/org/apache/commons/codec/language/bm/BeiderMorseEncoder.java
@@ -31,11 +31,56 @@
* This encoder is intentionally mutable to allow dynamic configuration through bean properties. As such, it is mutable, and may not be
* thread-safe. If you require a guaranteed thread-safe encoding then use {@link PhoneticEngine} directly.
* </p>
+ *
+ * <h2>Encoding overview</h2>
+ *
+ * <p>
+ * Beider-Morse phonetic encoding is a multi-step process. Firstly, a table of rules is consulted to guess what
+ * language the word comes from. For example, if it ends in "<code>ault</code>" then it infers that the word is French. Next,
+ * the word is translated into a phonetic representation using a language-specific phonetics table. Some runs of letters
+ * can be pronounced in multiple ways, and a single run of letters may be potentially broken up into phonemes at
+ * different places, so this stage results in a set of possible language-specific phonetic representations. Lastly,
+ * this language-specific phonetic representation is processed by a table of rules that re-writes it phonetically taking
+ * into account systematic pronunciation differences between languages, to move it towards a pan-indo-european phonetic
+ * representation. Again, sometimes there are multiple ways this could be done and sometimes things that can be
+ * pronounced in several ways in the source language have only one way to represent them in this average phonetic
+ * language, so the result is again a set of phonetic spellings.
+ * </p>
+ *
+ * <p>
+ * Some names are treated as having multiple parts. This can be due to two things. Firstly, they may be hyphenated. In
+ * this case, each individual hyphenated word is encoded, and then these are combined end-to-end for the final encoding.
+ * Secondly, some names have standard prefixes, for example, "<code>Mac/Mc</code>" in Scottish (English) names. As sometimes it is
+ * ambiguous whether the prefix is intended or is an accident of the spelling, the word is encoded once with the prefix
+ * and once without it. The resulting encoding contains one and then the other result.
+ * </p>
+ *
+ *
+ * <h2>Encoding format</h2>
+ *
+ * Individual phonetic spellings of an input word are represented in upper- and lower-case roman characters. Where there
+ * are multiple possible phonetic representations, these are joined with a pipe (<code>|</code>) character. If multiple hyphenated
+ * words were found, or if the word may contain a name prefix, each encoded word is placed in parentheses and these blocks
+ * are then joined with hyphens. For example, "<code>d'ortley</code>" has a possible prefix. The form without prefix encodes to
+ * "<code>ortlaj|ortlej</code>", while the form with prefix encodes to "<code>dortlaj|dortlej</code>". Thus, the full, combined encoding is
+ * "<code>(ortlaj|ortlej)-(dortlaj|dortlej)</code>".
+ *
+ * <p>
+ * The encoded forms are often quite a bit longer than the input strings. This is because a single input may have many
+ * potential phonetic interpretations. For example, "<code>Renault</code>" encodes to
+ * "<code>rYnDlt|rYnalt|rYnult|rinDlt|rinalt|rinult</code>". The <code>APPROX</code> rules will tend to produce larger
+ * encodings as they consider a wider range of possible, approximate phonetic interpretations of the original word.
+ * Down-stream applications may wish to further process the encoding for indexing or lookup purposes, for example, by
+ * splitting on pipe (<code>|</code>) and indexing under each of these alternatives.
+ * </p>
*
* @author Apache Software Foundation
* @since 1.6
*/
public class BeiderMorseEncoder implements StringEncoder {
+ // Implementation note: This class is a spring-friendly facade to PhoneticEngine. It allows read/write configuration
+ // of an immutable PhoneticEngine instance that will be delegated to for the actual encoding.
+
// a cached object
private PhoneticEngine engine = new PhoneticEngine(NameType.GENERIC, RuleType.APPROX, true);
diff --git a/src/main/java/org/apache/commons/codec/language/bm/Lang.java b/src/main/java/org/apache/commons/codec/language/bm/Lang.java
index 271231d..f147abe 100644
--- a/src/main/java/org/apache/commons/codec/language/bm/Lang.java
+++ b/src/main/java/org/apache/commons/codec/language/bm/Lang.java
@@ -71,6 +71,13 @@
* @since 1.6
*/
public class Lang {
+ // Implementation note: This class is divided into two sections. The first part is a static factory interface that
+ // exposes the LANGUAGE_RULES_RN resource as a Lang instance. The second part is the Lang instance methods that
+ // encapsulate a particular language-guessing rule table and the language guessing itself.
+ //
+ // It may make sense in the future to expose the private constructor to allow power users to build custom language-
+ // guessing rules, perhaps by marking it protected and allowing sub-classing. However, the vast majority of users
+ // should be strongly encouraged to use the static factory <code>instance</code> method to get their Lang instances.
private static final class LangRule {
private final boolean acceptOnMatch;
diff --git a/src/main/java/org/apache/commons/codec/language/bm/Languages.java b/src/main/java/org/apache/commons/codec/language/bm/Languages.java
index 199f56d..fef264e 100644
--- a/src/main/java/org/apache/commons/codec/language/bm/Languages.java
+++ b/src/main/java/org/apache/commons/codec/language/bm/Languages.java
@@ -53,6 +53,9 @@
* @since 1.6
*/
public class Languages {
+ // Implementation note: This class is divided into two sections. The first part is a static factory interface that
+ // exposes org/apache/commons/codec/language/bm/%s_languages.txt for %s in NameType.* as a list of supported
+ // languages, and a second part that provides instance methods for accessing this set of supported languages.
/**
* A set of languages.
diff --git a/src/main/java/org/apache/commons/codec/language/bm/NameType.java b/src/main/java/org/apache/commons/codec/language/bm/NameType.java
index 712e794..17fe54d 100644
--- a/src/main/java/org/apache/commons/codec/language/bm/NameType.java
+++ b/src/main/java/org/apache/commons/codec/language/bm/NameType.java
@@ -18,7 +18,9 @@
package org.apache.commons.codec.language.bm;
/**
- * Supported types of names. Unless you are matching particular family names, use {@link #GENERIC}.
+ * Supported types of names. Unless you are matching particular family names, use {@link #GENERIC}. The
+ * <code>GENERIC</code> NameType should work reasonably well for non-name words. The other encodings are specifically
+ * tuned to family names, and may not work well at all for general text.
*
* @author Apache Software Foundation
* @since 1.6
diff --git a/src/main/java/org/apache/commons/codec/language/bm/PhoneticEngine.java b/src/main/java/org/apache/commons/codec/language/bm/PhoneticEngine.java
index 4b36ffe..edf1b1f 100644
--- a/src/main/java/org/apache/commons/codec/language/bm/PhoneticEngine.java
+++ b/src/main/java/org/apache/commons/codec/language/bm/PhoneticEngine.java
@@ -51,8 +51,23 @@
*/
public class PhoneticEngine {
+ /**
+ * Utility for manipulating a set of phonemes as they are being built up. Not intended for use outside this package,
+ * and probably not outside the {@link PhoneticEngine} class.
+ *
+ * @author Apache Software Foundation
+ * @since 1.6
+ */
static final class PhonemeBuilder {
+ /**
+ * An empty builder where all phonemes must come from some set of languages. This will contain a single
+ * phoneme of zero characters. This can then be appended to. This should be the only way to create a new
+ * phoneme from scratch.
+ *
+ * @param languages the set of languages
+ * @return a new, empty phoneme builder
+ */
public static PhonemeBuilder empty(Languages.LanguageSet languages) {
return new PhonemeBuilder(Collections.singleton(new Rule.Phoneme("", languages)));
}
@@ -63,6 +78,12 @@
this.phonemes = phonemes;
}
+ /**
+ * Creates a new phoneme builder containing all phonemes in this one extended by <code>str</code>.
+ *
+ * @param str the characters to append to the phonemes
+ * @return a new phoneme builder lengthened by <code>str</code>
+ */
public PhonemeBuilder append(CharSequence str) {
Set<Rule.Phoneme> newPhonemes = new HashSet<Rule.Phoneme>();
@@ -73,6 +94,16 @@
return new PhonemeBuilder(newPhonemes);
}
+ /**
+ * Creates a new phoneme builder containing the application of the expression to all phonemes in this builder.
+ *
+ * This will lengthen phonemes that have compatible language sets to the expression, and drop those that are
+ * incompatible.
+ *
+ * @param phonemeExpr the expression to apply
+ * @return a new phoneme builder containing the results of <code>phonemeExpr</code> applied to each phoneme
+ * in turn
+ */
public PhonemeBuilder apply(Rule.PhonemeExpr phonemeExpr) {
Set<Rule.Phoneme> newPhonemes = new HashSet<Rule.Phoneme>();
@@ -88,10 +119,22 @@
return new PhonemeBuilder(newPhonemes);
}
+ /**
+ * Gets underlying phoneme set. Please don't mutate.
+ *
+ * @return the phoneme set
+ */
public Set<Rule.Phoneme> getPhonemes() {
return this.phonemes;
}
+ /**
+ * Stringifies the phoneme set. This produces a single string of the strings of each phoneme, joined with a pipe.
+ * This is explicitly provided in place of toString as it is a potentially expensive operation, which should be
+ * avoided when debugging.
+ *
+ * @return the stringified phoneme set
+ */
public String makeString() {
StringBuilder sb = new StringBuilder();
@@ -108,6 +151,17 @@
}
}
+ /**
+ * A function closure capturing the application of a list of rules to an input sequence at a particular offset.
+ * After invocation, the values <code>i</code> and <code>found</code> are updated. <code>i</code> points to the
+ * index of the next char in <code>input</code> that must be processed next (the input up to that index having been
+ * processed already), and <code>found</code> indicates if a matching rule was found or not. In the case where a
+ * matching rule was found, <code>phonemeBuilder</code> is replaced with a new builder containing the phonemes
+ * updated by the matching rule.
+ *
+ * @author Apache Software Foundation
+ * @since 1.6
+ */
private static final class RulesApplication {
private final List<Rule> finalRules;
private final CharSequence input;
@@ -134,6 +188,13 @@
return this.phonemeBuilder;
}
+ /**
+ * Invokes the rules. Loops over the rules list, stopping at the first one that has a matching context
+ * and pattern. Then applies this rule to the phoneme builder to produce updated phonemes. If there was no
+ * match, <code>i</code> is advanced one and the character is silently dropped from the phonetic spelling.
+ *
+ * @return <code>this</code>
+ */
public RulesApplication invoke() {
this.found = false;
int patternLength = 0;
@@ -176,6 +237,12 @@
"de la", "della", "des", "di", "do", "dos", "du", "van", "von"))));
}
+ /**
+ * This is a performance hack to avoid overhead associated with very frequent CharSequence.subSequence calls.
+ *
+ * @param cached the character sequence to cache
+ * @return a <code>CharSequence</code> that internally memoises subSequence values
+ */
private static CharSequence cacheSubSequence(final CharSequence cached) {
// return cached;
final CharSequence[][] cache = new CharSequence[cached.length()][cached.length()];
@@ -203,6 +270,12 @@
};
}
+ /**
+ * Joins some strings with an internal separator.
+ * @param strings Strings to join
+ * @param sep String to separate them with
+ * @return a single String consisting of each element of <code>strings</code> interleaved by <code>sep</code>
+ */
private static String join(Iterable<String> strings, String sep) {
StringBuilder sb = new StringBuilder();
Iterator<String> si = strings.iterator();
@@ -244,6 +317,14 @@
this.lang = Lang.instance(nameType);
}
+ /**
+ * Applies the final rules to convert from a language-specific phonetic representation to a language-independent
+ * representation.
+ *
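+ * @param phonemeBuilder the phoneme builder to apply the final rules to
+ * @param finalRules the final rules to apply; must not be <code>null</code>
+ * @return the phoneme builder resulting from applying <code>finalRules</code>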
+ */
private PhonemeBuilder applyFinalRules(PhonemeBuilder phonemeBuilder, List<Rule> finalRules) {
if (finalRules == null) {
throw new NullPointerException("finalRules can not be null");
@@ -304,8 +385,11 @@
*/
public String encode(String input, final Languages.LanguageSet languageSet) {
final List<Rule> rules = Rule.getInstance(this.nameType, RuleType.RULES, languageSet);
+ // rules common across many (all) languages
final List<Rule> finalRules1 = Rule.getInstance(this.nameType, this.ruleType, "common");
+ // rules that apply to a specific language that may be ambiguous or wrong if applied to other languages
final List<Rule> finalRules2 = Rule.getInstance(this.nameType, this.ruleType, languageSet);
+
// System.err.println("Languages: " + languageSet);
// System.err.println("Rules: " + rules);
@@ -333,6 +417,7 @@
final List<String> words = Arrays.asList(input.split("\\s+"));
final List<String> words2 = new ArrayList<String>();
+ // special-case handling of word prefixes based upon the name type
switch (this.nameType) {
case SEPHARDIC:
for (String aWord : words) {
@@ -380,13 +465,10 @@
// System.err.println(input + " " + i + ": " + phonemeBuilder.makeString());
}
- // System.err.println("Applying general rules");
+ // Apply the general rules
phonemeBuilder = applyFinalRules(phonemeBuilder, finalRules1);
- // System.err.println("Now got: " + phonemeBuilder.makeString());
- // System.err.println("Applying language-specific rules");
+ // Apply the language-specific rules
phonemeBuilder = applyFinalRules(phonemeBuilder, finalRules2);
- // System.err.println("Now got: " + phonemeBuilder.makeString());
- // System.err.println("Done");
return phonemeBuilder.makeString();
}
diff --git a/src/main/java/org/apache/commons/codec/language/bm/Rule.java b/src/main/java/org/apache/commons/codec/language/bm/Rule.java
index 9205ec4..6a4af6b 100644
--- a/src/main/java/org/apache/commons/codec/language/bm/Rule.java
+++ b/src/main/java/org/apache/commons/codec/language/bm/Rule.java
@@ -583,7 +583,9 @@
}
/**
- * Decides if the pattern and context match the input starting at a position.
+ * Decides if the pattern and context match the input starting at a position. It is a match if the
+ * <code>lContext</code> matches <code>input</code> up to <code>i</code>, <code>pattern</code> matches at i and
+ * <code>rContext</code> matches from the end of the match of <code>pattern</code> to the end of <code>input</code>.
*
* @param input
* the input String
@@ -604,6 +606,9 @@
return false;
}
+ // fixme: this is a readability/speed trade-off - these 3 expressions should be inlined for speed to avoid
+ // evaluating latter ones if earlier ones have already failed, but that would make the code a lot harder to
+ // read
boolean patternMatches = input.subSequence(i, ipl).equals(this.pattern);
boolean rContextMatches = this.rContext.isMatch(input.subSequence(ipl, input.length()));
boolean lContextMatches = this.lContext.isMatch(input.subSequence(0, i));
diff --git a/src/main/java/org/apache/commons/codec/language/bm/RuleType.java b/src/main/java/org/apache/commons/codec/language/bm/RuleType.java
index 378dd8d..a038bc5 100644
--- a/src/main/java/org/apache/commons/codec/language/bm/RuleType.java
+++ b/src/main/java/org/apache/commons/codec/language/bm/RuleType.java
@@ -25,7 +25,12 @@
*/
public enum RuleType {
- APPROX("approx"), EXACT("exact"), RULES("rules");
+ /** Approximate rules, which will lead to the largest number of phonetic interpretations. */
+ APPROX("approx"),
+ /** Exact rules, which will lead to a minimum number of phonetic interpretations. */
+ EXACT("exact"),
+ /** For internal use only. Please use {@link #APPROX} or {@link #EXACT}. */
+ RULES("rules");
private final String name;
diff --git a/src/main/java/org/apache/commons/codec/net/URLCodec.java b/src/main/java/org/apache/commons/codec/net/URLCodec.java
index ba95a03..68605e1 100644
--- a/src/main/java/org/apache/commons/codec/net/URLCodec.java
+++ b/src/main/java/org/apache/commons/codec/net/URLCodec.java
@@ -59,8 +59,10 @@
/**
* The default charset used for string decoding and encoding.
+ *
+ * TODO: This field will be final in 2.0.
*/
- protected final String charset;
+ protected String charset;
/**
* Release 1.5 made this field final.
@@ -346,4 +348,15 @@
return this.charset;
}
+ /**
+ * The <code>String</code> encoding used for decoding and encoding.
+ *
+ * @return Returns the encoding.
+ *
+ * @deprecated Use {@link #getDefaultCharset()}, will be removed in 2.0.
+ */
+ public String getEncoding() {
+ return this.charset;
+ }
+
}
diff --git a/src/site/xdoc/download_codec.xml b/src/site/xdoc/download_codec.xml
index a6905be..8e0011c 100644
--- a/src/site/xdoc/download_codec.xml
+++ b/src/site/xdoc/download_codec.xml
@@ -95,32 +95,32 @@
</p>
</subsection>
</section>
- <section name="Commons Codec 1.5 ">
+ <section name="Commons Codec 1.6 ">
<subsection name="Binaries">
<table>
<tr>
- <td><a href="[preferred]/commons/codec/binaries/commons-codec-1.5-bin.tar.gz">commons-codec-1.5-bin.tar.gz</a></td>
- <td><a href="http://www.apache.org/dist/commons/codec/binaries/commons-codec-1.5-bin.tar.gz.md5">md5</a></td>
- <td><a href="http://www.apache.org/dist/commons/codec/binaries/commons-codec-1.5-bin.tar.gz.asc">pgp</a></td>
+ <td><a href="[preferred]/commons/codec/binaries/commons-codec-1.6-bin.tar.gz">commons-codec-1.6-bin.tar.gz</a></td>
+ <td><a href="http://www.apache.org/dist/commons/codec/binaries/commons-codec-1.6-bin.tar.gz.md5">md5</a></td>
+ <td><a href="http://www.apache.org/dist/commons/codec/binaries/commons-codec-1.6-bin.tar.gz.asc">pgp</a></td>
</tr>
<tr>
- <td><a href="[preferred]/commons/codec/binaries/commons-codec-1.5-bin.zip">commons-codec-1.5-bin.zip</a></td>
- <td><a href="http://www.apache.org/dist/commons/codec/binaries/commons-codec-1.5-bin.zip.md5">md5</a></td>
- <td><a href="http://www.apache.org/dist/commons/codec/binaries/commons-codec-1.5-bin.zip.asc">pgp</a></td>
+ <td><a href="[preferred]/commons/codec/binaries/commons-codec-1.6-bin.zip">commons-codec-1.6-bin.zip</a></td>
+ <td><a href="http://www.apache.org/dist/commons/codec/binaries/commons-codec-1.6-bin.zip.md5">md5</a></td>
+ <td><a href="http://www.apache.org/dist/commons/codec/binaries/commons-codec-1.6-bin.zip.asc">pgp</a></td>
</tr>
</table>
</subsection>
<subsection name="Source">
<table>
<tr>
- <td><a href="[preferred]/commons/codec/source/commons-codec-1.5-src.tar.gz">commons-codec-1.5-src.tar.gz</a></td>
- <td><a href="http://www.apache.org/dist/commons/codec/source/commons-codec-1.5-src.tar.gz.md5">md5</a></td>
- <td><a href="http://www.apache.org/dist/commons/codec/source/commons-codec-1.5-src.tar.gz.asc">pgp</a></td>
+ <td><a href="[preferred]/commons/codec/source/commons-codec-1.6-src.tar.gz">commons-codec-1.6-src.tar.gz</a></td>
+ <td><a href="http://www.apache.org/dist/commons/codec/source/commons-codec-1.6-src.tar.gz.md5">md5</a></td>
+ <td><a href="http://www.apache.org/dist/commons/codec/source/commons-codec-1.6-src.tar.gz.asc">pgp</a></td>
</tr>
<tr>
- <td><a href="[preferred]/commons/codec/source/commons-codec-1.5-src.zip">commons-codec-1.5-src.zip</a></td>
- <td><a href="http://www.apache.org/dist/commons/codec/source/commons-codec-1.5-src.zip.md5">md5</a></td>
- <td><a href="http://www.apache.org/dist/commons/codec/source/commons-codec-1.5-src.zip.asc">pgp</a></td>
+ <td><a href="[preferred]/commons/codec/source/commons-codec-1.6-src.zip">commons-codec-1.6-src.zip</a></td>
+ <td><a href="http://www.apache.org/dist/commons/codec/source/commons-codec-1.6-src.zip.md5">md5</a></td>
+ <td><a href="http://www.apache.org/dist/commons/codec/source/commons-codec-1.6-src.zip.asc">pgp</a></td>
</tr>
</table>
</subsection>
diff --git a/src/test/java/org/apache/commons/codec/language/bm/PhoneticEngineTest.java b/src/test/java/org/apache/commons/codec/language/bm/PhoneticEngineTest.java
index 41dbeb7..b52ed1c 100644
--- a/src/test/java/org/apache/commons/codec/language/bm/PhoneticEngineTest.java
+++ b/src/test/java/org/apache/commons/codec/language/bm/PhoneticEngineTest.java
@@ -76,8 +76,8 @@
String phoneticActual = engine.encode(this.name);
- System.err.println("expecting: " + this.phoneticExpected);
- System.err.println("actual: " + phoneticActual);
+ //System.err.println("expecting: " + this.phoneticExpected);
+ //System.err.println("actual: " + phoneticActual);
assertEquals("phoneme incorrect", this.phoneticExpected, phoneticActual);
}
}