Creating commons-codec-1.6-RC1 tag git-svn-id: https://svn.apache.org/repos/asf/commons/proper/codec/tags/commons-codec-1.6-RC1@1201740 13f79535-47bb-0310-9956-ffa450edef68

commit: 551472689cf9570dfe88cecc4e52498db17d59b6 [log] [tgz]
author: Gary D. Gregory <ggregory@apache.org> Mon Nov 14 15:04:32 2011 +0000
committer: Gary D. Gregory <ggregory@apache.org> Mon Nov 14 15:04:32 2011 +0000
tree: 7290fc718a1c5643bf3d33197217357cf9562361
parent: 9d520b567e1bf92a18ab6aa2aa8a7da83a6c39bd [diff]
parent: b453bb690b789e391a5b385578402c31a5faf4d8 [diff]
diff --git a/RELEASE-NOTES.txt b/RELEASE-NOTES.txt
index 2606353..459c1c7 100644
--- a/RELEASE-NOTES.txt
+++ b/RELEASE-NOTES.txt

@@ -1,6 +1,6 @@
 $Id$
 
-The Commons Codec team is pleased to announce the commons-codec-1.5 release!
+The Commons Codec team is pleased to announce the commons-codec-1.6 release!
 
 The codec package contains simple encoder and decoders for
 various formats such as Base64 and Hexadecimal. In addition to these
@@ -9,6 +9,25 @@
 
 Changes in this version include:
 
+Fixed Bugs:
+o Use standard Maven directory layout.  Issue: CODEC-129. Thanks to ggregory. 
+o Documentation spelling fixes.  Issue: CODEC-128. Thanks to ville.skytta@iki.fi. 
+o Fix various character encoding issues in comments and test cases.  Issue: CODEC-127. 
+o ColognePhonetic Javadoc should use HTML entities for special characters.  Issue: CODEC-123. 
+
+Changes:
+o Implement a Beider-Morse phonetic matching codec.  Issue: CODEC-125. Thanks to Matthew Pocock. 
+o Migrate to Java 5.  Issue: CODEC-119. 
+o Migrate to JUnit 4.  Issue: CODEC-120. 
+
+Have fun!
+-Commons Codec team
+
+
+===============================================================================
+
+Commons Codec Package Version 1.5 Release Notes
+
 New features:
 o Add test(s) to check that encodeBase64() does not chunk output.  Issue: CODEC-93. Thanks to sebb. 
 o ArrayIndexOutOfBoundsException when doing multiple reads() on encoding Base64InputStream.  Issue: CODEC-105. Thanks to zak. 

diff --git a/default.properties b/default.properties
index d745013..7a223db 100644
--- a/default.properties
+++ b/default.properties

@@ -21,8 +21,8 @@
 repository=${user.home}/.m2/repository
 
 # The pathname of the "junit.jar" JAR file
-junit.home=${repository}/junit/junit/4.9
-junit.jar = ${junit.home}/junit-4.9.jar
+junit.home=${repository}/junit/junit/4.10
+junit.jar = ${junit.home}/junit-4.10.jar
 
 # The name of this component
 component.name = commons-codec

diff --git a/pom.xml b/pom.xml
index 2604903..43541ee 100644
--- a/pom.xml
+++ b/pom.xml

@@ -25,7 +25,7 @@
   <modelVersion>4.0.0</modelVersion>
   <groupId>commons-codec</groupId>
   <artifactId>commons-codec</artifactId>
-  <version>1.6-SNAPSHOT</version>
+  <version>1.6</version>
   <name>Commons Codec</name>
   <inceptionYear>2002</inceptionYear>
   <description>
@@ -191,7 +191,7 @@
     <dependency>
       <groupId>junit</groupId>
       <artifactId>junit</artifactId>
-      <version>4.9</version>
+      <version>4.10</version>
       <scope>test</scope>
     </dependency>
   </dependencies>
@@ -210,10 +210,28 @@
     <commons.encoding>UTF-8</commons.encoding>
   </properties>
   <build>
+    <pluginManagement>
+      <plugins>
+        <plugin>
+          <groupId>org.apache.maven.plugins</groupId>
+          <artifactId>maven-site-plugin</artifactId>
+          <version>3.0</version>
+          <dependencies>
+            <dependency>
+              <!-- add support for ssh/scp -->
+              <groupId>org.apache.maven.wagon</groupId>
+              <artifactId>wagon-ssh</artifactId>
+              <version>1.0</version>
+            </dependency>
+          </dependencies>
+        </plugin>
+      </plugins>
+    </pluginManagement>
     <plugins>
       <plugin>
         <groupId>org.apache.maven.plugins</groupId>
         <artifactId>maven-surefire-plugin</artifactId>
+        <version>2.10</version>
         <configuration>
           <includes>
             <include>**/*Test.java</include>
@@ -227,6 +245,7 @@
       <plugin>
         <groupId>org.apache.maven.plugins</groupId>
         <artifactId>maven-jar-plugin</artifactId>
+        <version>2.3.2</version>
         <executions>
           <execution>
             <goals>
@@ -238,6 +257,7 @@
       <plugin>
         <groupId>org.apache.maven.plugins</groupId>
         <artifactId>maven-assembly-plugin</artifactId>
+        <version>2.2.1</version>
         <configuration>
           <descriptors>
             <descriptor>src/main/assembly/bin.xml</descriptor>
@@ -253,8 +273,8 @@
       <plugin>
         <groupId>org.apache.maven.plugins</groupId>
         <artifactId>maven-changes-plugin</artifactId>
+        <version>2.6</version>
         <configuration>
-<!--           <xmlPath>${basedir}/src/changes/changes.xml</xmlPath> -->
           <issueLinkTemplate>%URL%/%ISSUE%</issueLinkTemplate>
           <!-- TODO: <onlyCurrentVersion>true</onlyCurrentVersion> -->
         </configuration>
@@ -270,7 +290,7 @@
       <plugin>
         <groupId>org.apache.maven.plugins</groupId>
         <artifactId>maven-checkstyle-plugin</artifactId>
-        <version>2.6</version>
+        <version>2.8</version>
         <configuration>
           <configLocation>${basedir}/checkstyle.xml</configLocation>
           <enableRulesSummary>false</enableRulesSummary>
@@ -295,6 +315,23 @@
         <artifactId>findbugs-maven-plugin</artifactId>
         <version>2.3.2</version>
       </plugin>
+      <plugin>
+        <groupId>org.codehaus.mojo</groupId>
+        <artifactId>taglist-maven-plugin</artifactId>
+        <version>2.4</version>
+        <configuration>
+          <tags>
+            <tag>TODO</tag>
+            <tag>NOPMD</tag>
+            <tag>NOTE</tag>
+          </tags>
+        </configuration>
+      </plugin>
+      <plugin>
+        <groupId>org.codehaus.mojo</groupId>
+        <artifactId>javancss-maven-plugin</artifactId>
+        <version>2.0</version>
+      </plugin>
     </plugins>
   </reporting>
 </project>

diff --git a/src/changes/changes.xml b/src/changes/changes.xml
index e62d64e..6bf4845 100644
--- a/src/changes/changes.xml
+++ b/src/changes/changes.xml

@@ -25,7 +25,7 @@
     <!-- <release version="2.0" date="TBA" description="Feature and fix release."> <action dev="ggregory" type="fix" issue="CODEC-126"> Make 
       org.apache.commons.codec.net.URLCodec charset field final. </action>   </release>
     -->
-    <release version="1.6" date="TBA" description="Feature and fix release.">
+    <release version="1.6" date="20 November 2011" description="Feature and fix release.">
       <action dev="ggregory" type="fix" issue="CODEC-129" due-to="ggregory">
         Use standard Maven directory layout.
       </action>
@@ -35,7 +35,7 @@
       <action dev="ggregory, sebb" type="fix" issue="CODEC-127">
         Fix various character encoding issues in comments and test cases.
       </action>
-      <action dev="ggregory, matthewpocock" type="update" issue="CODEC-125">
+      <action dev="ggregory, matthewpocock" type="update" issue="CODEC-125" due-to="Matthew Pocock">
         Implement a Beider-Morse phonetic matching codec.
       </action>
       <action dev="ggregory" type="update" issue="CODEC-119">

diff --git a/src/main/java/org/apache/commons/codec/StringEncoderComparator.java b/src/main/java/org/apache/commons/codec/StringEncoderComparator.java
index dd656c4..ac48b36 100644
--- a/src/main/java/org/apache/commons/codec/StringEncoderComparator.java
+++ b/src/main/java/org/apache/commons/codec/StringEncoderComparator.java

@@ -35,6 +35,16 @@
     private final StringEncoder stringEncoder;
 
     /**
+     * Constructs a new instance.
+     * 
+     * @deprecated Creating an instance without a {@link StringEncoder} leads to a {@link NullPointerException}. Will be
+     *             removed in 2.0.
+     */
+    public StringEncoderComparator() {
+        this.stringEncoder = null; // Trying to use this will cause things to break
+    }
+
+    /**
      * Constructs a new instance with the given algorithm.
      * 
      * @param stringEncoder

diff --git a/src/main/java/org/apache/commons/codec/binary/Base64.java b/src/main/java/org/apache/commons/codec/binary/Base64.java
index 0b31c7d..ac1824b 100644
--- a/src/main/java/org/apache/commons/codec/binary/Base64.java
+++ b/src/main/java/org/apache/commons/codec/binary/Base64.java

@@ -468,6 +468,20 @@
     }
 
     /**
+     * Tests a given byte array to see if it contains only valid characters within the Base64 alphabet. Currently the
+     * method treats whitespace as valid.
+     * 
+     * @param arrayOctet
+     *            byte array to test
+     * @return <code>true</code> if all bytes are valid characters in the Base64 alphabet or if the byte array is empty;
+     *         <code>false</code>, otherwise
+     * @deprecated 1.5 Use {@link #isBase64(byte[])}, will be removed in 2.0.
+     */
+    public static boolean isArrayByteBase64(byte[] arrayOctet) {
+        return isBase64(arrayOctet);
+    }
+
+    /**
      * Returns whether or not the <code>octet</code> is in the base 64 alphabet.
      * 
      * @param octet

diff --git a/src/main/java/org/apache/commons/codec/language/Caverphone.java b/src/main/java/org/apache/commons/codec/language/Caverphone.java
new file mode 100644
index 0000000..29b0694
--- /dev/null
+++ b/src/main/java/org/apache/commons/codec/language/Caverphone.java

@@ -0,0 +1,103 @@
+/*

+ * Licensed to the Apache Software Foundation (ASF) under one or more

+ * contributor license agreements.  See the NOTICE file distributed with

+ * this work for additional information regarding copyright ownership.

+ * The ASF licenses this file to You under the Apache License, Version 2.0

+ * (the "License"); you may not use this file except in compliance with

+ * the License.  You may obtain a copy of the License at

+ * 

+ *      http://www.apache.org/licenses/LICENSE-2.0

+ * 

+ * Unless required by applicable law or agreed to in writing, software

+ * distributed under the License is distributed on an "AS IS" BASIS,

+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

+ * See the License for the specific language governing permissions and

+ * limitations under the License.

+ */

+

+package org.apache.commons.codec.language;

+

+import org.apache.commons.codec.EncoderException;

+import org.apache.commons.codec.StringEncoder;

+

+/**

+ * Encodes a string into a Caverphone 2.0 value. Delegate to a {@link Caverphone2} instance.

+ * 

+ * This is an algorithm created by the Caversham Project at the University of Otago. It implements the Caverphone 2.0

+ * algorithm:

+ * 

+ * @author Apache Software Foundation

+ * @version $Id: Caverphone.java 1079535 2011-03-08 20:54:37Z ggregory $

+ * @see <a href="http://en.wikipedia.org/wiki/Caverphone">Wikipedia - Caverphone</a>

+ * @see <a href="http://caversham.otago.ac.nz/files/working/ctp150804.pdf">Caverphone 2.0 specification</a>

+ * @since 1.4

+ * @deprecated 1.5 Replaced by {@link Caverphone2}, will be removed in 2.0.

+ */

+public class Caverphone implements StringEncoder {

+

+    /**

+     * Delegate to a {@link Caverphone2} instance to avoid code duplication.

+     */

+    final private Caverphone2 encoder = new Caverphone2();

+

+    /**

+     * Creates an instance of the Caverphone encoder

+     */

+    public Caverphone() {

+        super();

+    }

+

+    /**

+     * Encodes the given String into a Caverphone value.

+     * 

+     * @param source

+     *            String the source string

+     * @return A caverphone code for the given String

+     */

+    public String caverphone(String source) {

+        return this.encoder.encode(source);

+    }

+

+    /**

+     * Encodes an Object using the caverphone algorithm. This method is provided in order to satisfy the requirements of

+     * the Encoder interface, and will throw an EncoderException if the supplied object is not of type java.lang.String.

+     * 

+     * @param pObject

+     *            Object to encode

+     * @return An object (or type java.lang.String) containing the caverphone code which corresponds to the String

+     *         supplied.

+     * @throws EncoderException

+     *             if the parameter supplied is not of type java.lang.String

+     */

+    public Object encode(Object pObject) throws EncoderException {

+        if (!(pObject instanceof String)) {

+            throw new EncoderException("Parameter supplied to Caverphone encode is not of type java.lang.String");

+        }

+        return this.caverphone((String) pObject);

+    }

+

+    /**

+     * Encodes a String using the Caverphone algorithm.

+     * 

+     * @param pString

+     *            String object to encode

+     * @return The caverphone code corresponding to the String supplied

+     */

+    public String encode(String pString) {

+        return this.caverphone(pString);

+    }

+

+    /**

+     * Tests if the caverphones of two strings are identical.

+     * 

+     * @param str1

+     *            First of two strings to compare

+     * @param str2

+     *            Second of two strings to compare

+     * @return <code>true</code> if the caverphones of these strings are identical, <code>false</code> otherwise.

+     */

+    public boolean isCaverphoneEqual(String str1, String str2) {

+        return this.caverphone(str1).equals(this.caverphone(str2));

+    }

+

+}


diff --git a/src/main/java/org/apache/commons/codec/language/Soundex.java b/src/main/java/org/apache/commons/codec/language/Soundex.java
index 41cc962..eada7a5 100644
--- a/src/main/java/org/apache/commons/codec/language/Soundex.java
+++ b/src/main/java/org/apache/commons/codec/language/Soundex.java

@@ -56,29 +56,12 @@
      */
     public static final Soundex US_ENGLISH = new Soundex();
 
-
     /**
-     * Encodes the Strings and returns the number of characters in the two encoded Strings that are the same. This
-     * return value ranges from 0 through 4: 0 indicates little or no similarity, and 4 indicates strong similarity or
-     * identical values.
+     * The maximum length of a Soundex code - Soundex codes are only four characters by definition.
      * 
-     * @param s1
-     *                  A String that will be encoded and compared.
-     * @param s2
-     *                  A String that will be encoded and compared.
-     * @return The number of characters in the two encoded Strings that are the same from 0 to 4.
-     * 
-     * @see SoundexUtils#difference(StringEncoder,String,String)
-     * @see <a href="http://msdn.microsoft.com/library/default.asp?url=/library/en-us/tsqlref/ts_de-dz_8co5.asp"> MS
-     *          T-SQL DIFFERENCE </a>
-     * 
-     * @throws EncoderException
-     *                  if an error occurs encoding one of the strings
-     * @since 1.3
+     * @deprecated This feature is not needed since the encoding size must be constant. Will be removed in 2.0.
      */
-    public int difference(String s1, String s2) throws EncoderException {
-        return SoundexUtils.difference(this, s1, s2);
-    }
+    private int maxLength = 4;
 
     /**
      * Every letter of the alphabet is "mapped" to a numerical value. This char array holds the values to which each
@@ -124,6 +107,29 @@
     }
 
     /**
+     * Encodes the Strings and returns the number of characters in the two encoded Strings that are the same. This
+     * return value ranges from 0 through 4: 0 indicates little or no similarity, and 4 indicates strong similarity or
+     * identical values.
+     * 
+     * @param s1
+     *                  A String that will be encoded and compared.
+     * @param s2
+     *                  A String that will be encoded and compared.
+     * @return The number of characters in the two encoded Strings that are the same from 0 to 4.
+     * 
+     * @see SoundexUtils#difference(StringEncoder,String,String)
+     * @see <a href="http://msdn.microsoft.com/library/default.asp?url=/library/en-us/tsqlref/ts_de-dz_8co5.asp"> MS
+     *          T-SQL DIFFERENCE </a>
+     * 
+     * @throws EncoderException
+     *                  if an error occurs encoding one of the strings
+     * @since 1.3
+     */
+    public int difference(String s1, String s2) throws EncoderException {
+        return SoundexUtils.difference(this, s1, s2);
+    }
+
+    /**
      * Encodes an Object using the soundex algorithm. This method is provided in order to satisfy the requirements of
      * the Encoder interface, and will throw an EncoderException if the supplied object is not of type java.lang.String.
      * 
@@ -187,6 +193,16 @@
     }
 
     /**
+     * Returns the maxLength. Standard Soundex codes are four characters long.
+     * 
+     * @deprecated This feature is not needed since the encoding size must be constant. Will be removed in 2.0.
+     * @return int
+     */
+    public int getMaxLength() {
+        return this.maxLength;
+    }
+
+    /**
      * Returns the soundex mapping.
      * 
      * @return soundexMapping.
@@ -213,6 +229,17 @@
     }
 
     /**
+     * Sets the maxLength.
+     * 
+     * @deprecated This feature is not needed since the encoding size must be constant. Will be removed in 2.0.
+     * @param maxLength
+     *                  The maxLength to set
+     */
+    public void setMaxLength(int maxLength) {
+        this.maxLength = maxLength;
+    }
+    
+    /**
      * Retrieves the Soundex code for a given String object.
      * 
      * @param str

diff --git a/src/main/java/org/apache/commons/codec/language/bm/BeiderMorseEncoder.java b/src/main/java/org/apache/commons/codec/language/bm/BeiderMorseEncoder.java
index b71e4aa..e306cd3 100644
--- a/src/main/java/org/apache/commons/codec/language/bm/BeiderMorseEncoder.java
+++ b/src/main/java/org/apache/commons/codec/language/bm/BeiderMorseEncoder.java

@@ -31,11 +31,56 @@
  * This encoder is intentionally mutable to allow dynamic configuration through bean properties. As such, it is mutable, and may not be
  * thread-safe. If you require a guaranteed thread-safe encoding then use {@link PhoneticEngine} directly.
  * </p>
+ *
+ * <h2>Encoding overview</h2>
+ *
+ * <p>
+ * Beider-Morse phonetic encoding is a multi-step process. Firstly, a table of rules is consulted to guess what
+ * language the word comes from. For example, if it ends in "<code>ault</code>" then it infers that the word is French. Next,
+ * the word is translated into a phonetic representation using a language-specific phonetics table. Some runs of letters
+ * can be pronounced in multiple ways, and a single run of letters may be potentially broken up into phonemes at
+ * different places, so this stage results in a set of possible language-specific phonetic representations. Lastly,
+ * this language-specific phonetic representation is processed by a table of rules that re-writes it phonetically taking
+ * into account systematic pronunciation differences between languages, to move it towards a pan-indo-european phonetic
+ * representation. Again, sometimes there are multiple ways this could be done and sometimes things that can be
+ * pronounced in several ways in the source language have only one way to represent them in this average phonetic
+ * language, so the result is again a set of phonetic spellings.
+ * </p>
+ *
+ * <p>
+ * Some names are treated as having multiple parts. This can be due to two things. Firstly, they may be hyphenated. In
+ * this case, each individual hyphenated word is encoded, and then these are combined end-to-end for the final encoding.
+ * Secondly, some names have standard prefixes, for example, "<code>Mac/Mc</code>" in Scottish (English) names. As sometimes it is
+ * ambiguous whether the prefix is intended or is an accident of the spelling, the word is encoded once with the prefix
+ * and once without it. The resulting encoding contains one and then the other result.
+ * </p>
+ *
+ *
+ * <h2>Encoding format</h2>
+ *
+ * Individual phonetic spellings of an input word are represented in upper- and lower-case roman characters. Where there
+ * are multiple possible phonetic representations, these are joined with a pipe (<code>|</code>) character. If multiple hyphenated
+ * words were found, or if the word may contain a name prefix, each encoded word is placed in parentheses and these blocks
+ * are then joined with hyphens. For example, "<code>d'ortley</code>" has a possible prefix. The form without prefix encodes to
+ * "<code>ortlaj|ortlej</code>", while the form with prefix encodes to "<code>dortlaj|dortlej</code>". Thus, the full, combined encoding is
+ * "<code>(ortlaj|ortlej)-(dortlaj|dortlej)</code>".
+ *
+ * <p>
+ * The encoded forms are often quite a bit longer than the input strings. This is because a single input may have many
+ * potential phonetic interpretations. For example, "<code>Renault</code>" encodes to
+ * "<code>rYnDlt|rYnalt|rYnult|rinDlt|rinalt|rinult</code>". The <code>APPROX</code> rules will tend to produce larger
+ * encodings as they consider a wider range of possible, approximate phonetic interpretations of the original word.
+ * Down-stream applications may wish to further process the encoding for indexing or lookup purposes, for example, by
+ * splitting on pipe (<code>|</code>) and indexing under each of these alternatives.
+ * </p>
  * 
  * @author Apache Software Foundation
  * @since 1.6
  */
 public class BeiderMorseEncoder implements StringEncoder {
+    // Implementation note: This class is a spring-friendly facade to PhoneticEngine. It allows read/write configuration
+    // of an immutable PhoneticEngine instance that will be delegated to for the actual encoding.
+
     // a cached object
     private PhoneticEngine engine = new PhoneticEngine(NameType.GENERIC, RuleType.APPROX, true);
 

diff --git a/src/main/java/org/apache/commons/codec/language/bm/Lang.java b/src/main/java/org/apache/commons/codec/language/bm/Lang.java
index 271231d..f147abe 100644
--- a/src/main/java/org/apache/commons/codec/language/bm/Lang.java
+++ b/src/main/java/org/apache/commons/codec/language/bm/Lang.java

@@ -71,6 +71,13 @@
  * @since 1.6
  */
 public class Lang {
+    // Implementation note: This class is divided into two sections. The first part is a static factory interface that
+    // exposes the LANGUAGE_RULES_RN resource as a Lang instance. The second part is the Lang instance methods that
+    // encapsulate a particular language-guessing rule table and the language guessing itself.
+    //
+    // It may make sense in the future to expose the private constructor to allow power users to build custom language-
+    // guessing rules, perhaps by marking it protected and allowing sub-classing. However, the vast majority of users
+    // should be strongly encouraged to use the static factory <code>instance</code> method to get their Lang instances.
 
     private static final class LangRule {
         private final boolean acceptOnMatch;

diff --git a/src/main/java/org/apache/commons/codec/language/bm/Languages.java b/src/main/java/org/apache/commons/codec/language/bm/Languages.java
index 199f56d..fef264e 100644
--- a/src/main/java/org/apache/commons/codec/language/bm/Languages.java
+++ b/src/main/java/org/apache/commons/codec/language/bm/Languages.java

@@ -53,6 +53,9 @@
  * @since 1.6
  */
 public class Languages {
+    // Implementation note: This class is divided into two sections. The first part is a static factory interface that
+    // exposes org/apache/commons/codec/language/bm/%s_languages.txt for %s in NameType.* as a list of supported
+    // languages, and a second part that provides instance methods for accessing this set of supported languages.
 
     /**
      * A set of languages.

diff --git a/src/main/java/org/apache/commons/codec/language/bm/NameType.java b/src/main/java/org/apache/commons/codec/language/bm/NameType.java
index 712e794..17fe54d 100644
--- a/src/main/java/org/apache/commons/codec/language/bm/NameType.java
+++ b/src/main/java/org/apache/commons/codec/language/bm/NameType.java

@@ -18,7 +18,9 @@
 package org.apache.commons.codec.language.bm;
 
 /**
- * Supported types of names. Unless you are matching particular family names, use {@link #GENERIC}.
+ * Supported types of names. Unless you are matching particular family names, use {@link #GENERIC}. The
+ * <code>GENERIC</code> NameType should work reasonably well for non-name words. The other encodings are specifically
+ * tuned to family names, and may not work well at all for general text.
  * 
  * @author Apache Software Foundation
  * @since 1.6

diff --git a/src/main/java/org/apache/commons/codec/language/bm/PhoneticEngine.java b/src/main/java/org/apache/commons/codec/language/bm/PhoneticEngine.java
index 4b36ffe..edf1b1f 100644
--- a/src/main/java/org/apache/commons/codec/language/bm/PhoneticEngine.java
+++ b/src/main/java/org/apache/commons/codec/language/bm/PhoneticEngine.java

@@ -51,8 +51,23 @@
  */
 public class PhoneticEngine {
 
+    /**
+     * Utility for manipulating a set of phonemes as they are being built up. Not intended for use outside this package,
+     * and probably not outside the {@link PhoneticEngine} class.
+     *
+     * @author Apache Software Foundation
+     * @since 1.6
+     */
     static final class PhonemeBuilder {
 
+        /**
+         * An empty builder where all phonemes must come from some set of languages. This will contain a single
+         * phoneme of zero characters. This can then be appended to. This should be the only way to create a new
+         * phoneme from scratch.
+         *
+         * @param languages the set of languages
+         * @return  a new, empty phoneme builder
+         */
         public static PhonemeBuilder empty(Languages.LanguageSet languages) {
             return new PhonemeBuilder(Collections.singleton(new Rule.Phoneme("", languages)));
         }
@@ -63,6 +78,12 @@
             this.phonemes = phonemes;
         }
 
+        /**
+         * Creates a new phoneme builder containing all phonemes in this one extended by <code>str</code>.
+         *
+         * @param str   the characters to append to the phonemes
+         * @return  a new phoneme builder lengthened by <code>str</code>
+         */
         public PhonemeBuilder append(CharSequence str) {
             Set<Rule.Phoneme> newPhonemes = new HashSet<Rule.Phoneme>();
 
@@ -73,6 +94,16 @@
             return new PhonemeBuilder(newPhonemes);
         }
 
+        /**
+         * Creates a new phoneme builder containing the application of the expression to all phonemes in this builder.
+         *
+         * This will lengthen phonemes that have compatible language sets to the expression, and drop those that are
+         * incompatible.
+         *
+         * @param phonemeExpr   the expression to apply
+         * @return  a new phoneme builder containing the results of <code>phonemeExpr</code> applied to each phoneme
+         *      in turn
+         */
         public PhonemeBuilder apply(Rule.PhonemeExpr phonemeExpr) {
             Set<Rule.Phoneme> newPhonemes = new HashSet<Rule.Phoneme>();
 
@@ -88,10 +119,22 @@
             return new PhonemeBuilder(newPhonemes);
         }
 
+        /**
+         * Gets underlying phoneme set. Please don't mutate.
+         *
+         * @return  the phoneme set
+         */
         public Set<Rule.Phoneme> getPhonemes() {
             return this.phonemes;
         }
 
+        /**
+         * Stringifies the phoneme set. This produces a single string of the strings of each phoneme, joined with a pipe.
+         * This is explicitly provided in place of toString as it is a potentially expensive operation, which should be
+         * avoided when debugging.
+         *
+         * @return  the stringified phoneme set
+         */
         public String makeString() {
 
             StringBuilder sb = new StringBuilder();
@@ -108,6 +151,17 @@
         }
     }
 
+    /**
+     * A function closure capturing the application of a list of rules to an input sequence at a particular offset.
+     * After invocation, the values <code>i</code> and <code>found</code> are updated. <code>i</code> points to the
+     * index of the next char in <code>input</code> that must be processed next (the input up to that index having been
+     * processed already), and <code>found</code> indicates if a matching rule was found or not. In the case where a
+     * matching rule was found, <code>phonemeBuilder</code> is replaced with a new builder containing the phonemes
+     * updated by the matching rule.
+     *
+     * @author Apache Software Foundation
+     * @since 1.6
+     */
     private static final class RulesApplication {
         private final List<Rule> finalRules;
         private final CharSequence input;
@@ -134,6 +188,13 @@
             return this.phonemeBuilder;
         }
 
+        /**
+         * Invokes the rules. Loops over the rules list, stopping at the first one that has a matching context
+         * and pattern. Then applies this rule to the phoneme builder to produce updated phonemes. If there was no
+         * match, <code>i</code> is advanced one and the character is silently dropped from the phonetic spelling.
+         *
+         * @return <code>this</code>
+         */
         public RulesApplication invoke() {
             this.found = false;
             int patternLength = 0;
@@ -176,6 +237,12 @@
                 "de la", "della", "des", "di", "do", "dos", "du", "van", "von"))));
     }
 
+    /**
+     * This is a performance hack to avoid overhead associated with very frequent CharSequence.subSequence calls.
+     *
+     * @param cached the character sequence to cache
+     * @return a <code>CharSequence</code> that internally memoises subSequence values
+     */
     private static CharSequence cacheSubSequence(final CharSequence cached) {
         // return cached;
         final CharSequence[][] cache = new CharSequence[cached.length()][cached.length()];
@@ -203,6 +270,12 @@
         };
     }
 
+    /**
+     * Joins some strings with an internal separator.
+     * @param strings   Strings to join
+     * @param sep       String to separate them with
+     * @return          a single String consisting of each element of <code>strings</code> interleaved by <code>sep</code>
+     */
     private static String join(Iterable<String> strings, String sep) {
         StringBuilder sb = new StringBuilder();
         Iterator<String> si = strings.iterator();
@@ -244,6 +317,14 @@
         this.lang = Lang.instance(nameType);
     }
 
+    /**
+     * Applies the final rules to convert from a language-specific phonetic representation to a language-independent
+     * representation.
+     *
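+     * @param phonemeBuilder the builder holding the phonemes to rewrite
+     * @param finalRules the final rules to apply; must not be null
+     * @return a phoneme builder holding the converted phonemes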
+     */
     private PhonemeBuilder applyFinalRules(PhonemeBuilder phonemeBuilder, List<Rule> finalRules) {
         if (finalRules == null) {
             throw new NullPointerException("finalRules can not be null");
@@ -304,8 +385,11 @@
      */
     public String encode(String input, final Languages.LanguageSet languageSet) {
         final List<Rule> rules = Rule.getInstance(this.nameType, RuleType.RULES, languageSet);
+        // rules common across many (all) languages
         final List<Rule> finalRules1 = Rule.getInstance(this.nameType, this.ruleType, "common");
+        // rules that apply to a specific language that may be ambiguous or wrong if applied to other languages
         final List<Rule> finalRules2 = Rule.getInstance(this.nameType, this.ruleType, languageSet);
+
         // System.err.println("Languages: " + languageSet);
         // System.err.println("Rules: " + rules);
 
@@ -333,6 +417,7 @@
         final List<String> words = Arrays.asList(input.split("\\s+"));
         final List<String> words2 = new ArrayList<String>();
 
+        // special-case handling of word prefixes based upon the name type
         switch (this.nameType) {
         case SEPHARDIC:
             for (String aWord : words) {
@@ -380,13 +465,10 @@
             // System.err.println(input + " " + i + ": " + phonemeBuilder.makeString());
         }
 
-        // System.err.println("Applying general rules");
+        // Apply the general rules
         phonemeBuilder = applyFinalRules(phonemeBuilder, finalRules1);
-        // System.err.println("Now got: " + phonemeBuilder.makeString());
-        // System.err.println("Applying language-specific rules");
+        // Apply the language-specific rules
         phonemeBuilder = applyFinalRules(phonemeBuilder, finalRules2);
-        // System.err.println("Now got: " + phonemeBuilder.makeString());
-        // System.err.println("Done");
 
         return phonemeBuilder.makeString();
     }

diff --git a/src/main/java/org/apache/commons/codec/language/bm/Rule.java b/src/main/java/org/apache/commons/codec/language/bm/Rule.java
index 9205ec4..6a4af6b 100644
--- a/src/main/java/org/apache/commons/codec/language/bm/Rule.java
+++ b/src/main/java/org/apache/commons/codec/language/bm/Rule.java

@@ -583,7 +583,9 @@
     }
 
     /**
-     * Decides if the pattern and context match the input starting at a position.
+     * Decides if the pattern and context match the input starting at a position. It is a match if the
+     * <code>lContext</code> matches <code>input</code> up to <code>i</code>, <code>pattern</code> matches at i and
+     * <code>rContext</code> matches from the end of the match of <code>pattern</code> to the end of <code>input</code>.
      * 
      * @param input
      *            the input String
@@ -604,6 +606,9 @@
             return false;
         }
 
+        // FIXME: this is a readability/speed trade-off - these 3 expressions should be inlined for speed to avoid
+        // evaluating later ones if earlier ones have already failed, but that would make the code a lot harder to
+        // read
         boolean patternMatches = input.subSequence(i, ipl).equals(this.pattern);
         boolean rContextMatches = this.rContext.isMatch(input.subSequence(ipl, input.length()));
         boolean lContextMatches = this.lContext.isMatch(input.subSequence(0, i));

diff --git a/src/main/java/org/apache/commons/codec/language/bm/RuleType.java b/src/main/java/org/apache/commons/codec/language/bm/RuleType.java
index 378dd8d..a038bc5 100644
--- a/src/main/java/org/apache/commons/codec/language/bm/RuleType.java
+++ b/src/main/java/org/apache/commons/codec/language/bm/RuleType.java

@@ -25,7 +25,12 @@
  */
 public enum RuleType {
 
-    APPROX("approx"), EXACT("exact"), RULES("rules");
+    /** Approximate rules, which will lead to the largest number of phonetic interpretations. */
+    APPROX("approx"),
+    /** Exact rules, which will lead to a minimum number of phonetic interpretations. */
+    EXACT("exact"),
+    /** For internal use only. Please use {@link #APPROX} or {@link #EXACT}. */
+    RULES("rules");
 
     private final String name;
 

diff --git a/src/main/java/org/apache/commons/codec/net/URLCodec.java b/src/main/java/org/apache/commons/codec/net/URLCodec.java
index ba95a03..68605e1 100644
--- a/src/main/java/org/apache/commons/codec/net/URLCodec.java
+++ b/src/main/java/org/apache/commons/codec/net/URLCodec.java

@@ -59,8 +59,10 @@
     
     /**
      * The default charset used for string decoding and encoding.
+     * 
+     * TODO: This field will be final in 2.0.
      */
-    protected final String charset;
+    protected String charset;
     
     /**
      * Release 1.5 made this field final.
@@ -346,4 +348,15 @@
         return this.charset;
     }
 
+    /**
+     * The <code>String</code> encoding used for decoding and encoding.
+     * 
+     * @return Returns the encoding.
+     * 
+     * @deprecated Use {@link #getDefaultCharset()}, will be removed in 2.0.
+     */
+    public String getEncoding() {
+        return this.charset;
+    }
+
 }

diff --git a/src/site/xdoc/download_codec.xml b/src/site/xdoc/download_codec.xml
index a6905be..8e0011c 100644
--- a/src/site/xdoc/download_codec.xml
+++ b/src/site/xdoc/download_codec.xml

@@ -95,32 +95,32 @@
       </p>
     </subsection>
     </section>
-    <section name="Commons Codec 1.5 ">
+    <section name="Commons Codec 1.6 ">
       <subsection name="Binaries">
         <table>
           <tr>
-              <td><a href="[preferred]/commons/codec/binaries/commons-codec-1.5-bin.tar.gz">commons-codec-1.5-bin.tar.gz</a></td>
-              <td><a href="http://www.apache.org/dist/commons/codec/binaries/commons-codec-1.5-bin.tar.gz.md5">md5</a></td>
-              <td><a href="http://www.apache.org/dist/commons/codec/binaries/commons-codec-1.5-bin.tar.gz.asc">pgp</a></td>
+              <td><a href="[preferred]/commons/codec/binaries/commons-codec-1.6-bin.tar.gz">commons-codec-1.6-bin.tar.gz</a></td>
+              <td><a href="http://www.apache.org/dist/commons/codec/binaries/commons-codec-1.6-bin.tar.gz.md5">md5</a></td>
+              <td><a href="http://www.apache.org/dist/commons/codec/binaries/commons-codec-1.6-bin.tar.gz.asc">pgp</a></td>
           </tr>
           <tr>
-              <td><a href="[preferred]/commons/codec/binaries/commons-codec-1.5-bin.zip">commons-codec-1.5-bin.zip</a></td>
-              <td><a href="http://www.apache.org/dist/commons/codec/binaries/commons-codec-1.5-bin.zip.md5">md5</a></td>
-              <td><a href="http://www.apache.org/dist/commons/codec/binaries/commons-codec-1.5-bin.zip.asc">pgp</a></td>
+              <td><a href="[preferred]/commons/codec/binaries/commons-codec-1.6-bin.zip">commons-codec-1.6-bin.zip</a></td>
+              <td><a href="http://www.apache.org/dist/commons/codec/binaries/commons-codec-1.6-bin.zip.md5">md5</a></td>
+              <td><a href="http://www.apache.org/dist/commons/codec/binaries/commons-codec-1.6-bin.zip.asc">pgp</a></td>
           </tr>
         </table>
       </subsection>
       <subsection name="Source">
         <table>
           <tr>
-              <td><a href="[preferred]/commons/codec/source/commons-codec-1.5-src.tar.gz">commons-codec-1.5-src.tar.gz</a></td>
-              <td><a href="http://www.apache.org/dist/commons/codec/source/commons-codec-1.5-src.tar.gz.md5">md5</a></td>
-              <td><a href="http://www.apache.org/dist/commons/codec/source/commons-codec-1.5-src.tar.gz.asc">pgp</a></td>
+              <td><a href="[preferred]/commons/codec/source/commons-codec-1.6-src.tar.gz">commons-codec-1.6-src.tar.gz</a></td>
+              <td><a href="http://www.apache.org/dist/commons/codec/source/commons-codec-1.6-src.tar.gz.md5">md5</a></td>
+              <td><a href="http://www.apache.org/dist/commons/codec/source/commons-codec-1.6-src.tar.gz.asc">pgp</a></td>
           </tr>
           <tr>
-              <td><a href="[preferred]/commons/codec/source/commons-codec-1.5-src.zip">commons-codec-1.5-src.zip</a></td>
-              <td><a href="http://www.apache.org/dist/commons/codec/source/commons-codec-1.5-src.zip.md5">md5</a></td>
-              <td><a href="http://www.apache.org/dist/commons/codec/source/commons-codec-1.5-src.zip.asc">pgp</a></td>
+              <td><a href="[preferred]/commons/codec/source/commons-codec-1.6-src.zip">commons-codec-1.6-src.zip</a></td>
+              <td><a href="http://www.apache.org/dist/commons/codec/source/commons-codec-1.6-src.zip.md5">md5</a></td>
+              <td><a href="http://www.apache.org/dist/commons/codec/source/commons-codec-1.6-src.zip.asc">pgp</a></td>
           </tr>
         </table>
       </subsection>

diff --git a/src/test/java/org/apache/commons/codec/language/bm/PhoneticEngineTest.java b/src/test/java/org/apache/commons/codec/language/bm/PhoneticEngineTest.java
index 41dbeb7..b52ed1c 100644
--- a/src/test/java/org/apache/commons/codec/language/bm/PhoneticEngineTest.java
+++ b/src/test/java/org/apache/commons/codec/language/bm/PhoneticEngineTest.java

@@ -76,8 +76,8 @@
 
         String phoneticActual = engine.encode(this.name);
 
-        System.err.println("expecting: " + this.phoneticExpected);
-        System.err.println("actual:    " + phoneticActual);
+        //System.err.println("expecting: " + this.phoneticExpected);
+        //System.err.println("actual:    " + phoneticActual);
         assertEquals("phoneme incorrect", this.phoneticExpected, phoneticActual);
     }
 }
commit	551472689cf9570dfe88cecc4e52498db17d59b6	[log] [tgz]
author	Gary D. Gregory <ggregory@apache.org>	Mon Nov 14 15:04:32 2011 +0000
committer	Gary D. Gregory <ggregory@apache.org>	Mon Nov 14 15:04:32 2011 +0000
tree	7290fc718a1c5643bf3d33197217357cf9562361
parent	9d520b567e1bf92a18ab6aa2aa8a7da83a6c39bd [diff]
parent	b453bb690b789e391a5b385578402c31a5faf4d8 [diff]