src/main/java/org/apache/commons/text/names/HumanNameParser.java - commons-text - Git at Google

 /*
 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *      http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.commons.text.names;

 import java.util.Arrays;
 import java.util.List;
 import java.util.Objects;

 import org.apache.commons.lang3.StringUtils;

 /**
  * A parser capable of parsing name parts out of a single string.
  *
  * <h3>Parsing examples</h3>
  *
  * <p>The code works by basically applying several Regexes in a certain order
  * and removing (chopping) tokens off the original string. The parser creates
  * a {@link Name} object representing the parse result. Note that passing null
  * to the {@link #parse(String)} method will result in an exception.</p>
  *
  * <table summary="Examples">
  *  <tr>
  *   <th>input</th>
  *   <th>Leading initial</th>
  *   <th>First name</th>
  *   <th>Nick name</th>
  *   <th>Middle name</th>
  *   <th>Last Name</th>
  *   <th>Suffix</th>
  *  </tr>
  *  <tr>
  *   <td>J. Walter Weatherman</td>
  *   <td>J.</td>
  *   <td>Walter</td>
  *   <td></td>
  *   <td></td>
  *   <td>Weatherman</td>
  *   <td></td>
  *  </tr>
  *  <tr>
  *   <td>de la Cruz, Ana M.</td>
  *   <td></td>
  *   <td>Ana</td>
  *   <td></td>
  *   <td>M.</td>
  *   <td>de la Cruz</td>
  *   <td></td>
  *  </tr>
  *  <tr>
  *   <td>James C. ('Jimmy') O'Dell, Jr.</td>
  *   <td></td>
  *   <td>James</td>
  *   <td>Jimmy</td>
  *   <td>C.</td>
  *   <td>O'Dell</td>
  *   <td>Jr.</td>
  *  </tr>
  * </table>
  *
  * <h3>Sample usage</h3>
  *
  * <p>HumanNameParser instances are immutable and can be reused for parsing multiple names:</p>
  *
  * <pre>
  * HumanNameParser parser = new HumanNameParser();
  * Name parsedName = parser.parse("Sérgio Vieira de Mello")
  * String firstName = parsedName.getFirstName();
  * String nickname = parsedName.getNickName();
  * // ...
  *
  * Name nextName = parser.parse("James C. ('Jimmy') O'Dell, Jr.")
  * String firstName = nextName.getFirstName();
  * String nickname = nextName.getNickName();
  * </pre>
  *
  * <h3>Further notes</h3>
  *
  * <p>The original code was written in <a href="http://jasonpriem.com/human-name-parse">PHP</a>
  * and ported to <a href="http://tupilabs.github.io/HumanNameParser.java/">Java</a>. This
  * implementation is based on the Java implementation, with additions
  * suggested in <a href="https://issues.apache.org/jira/browse/TEXT-15">TEXT-15</a>
  * and <a href="https://issues.apache.org/jira/browse/TEXT-16">TEXT-16</a>.</p>
  *
  * <p>This class is immutable.</p>
  */
 public final class HumanNameParser {

     private final List<String> suffixes;
     private final List<String> prefixes;

     /**
      * Creates a new parser.
      */
     public HumanNameParser() {
         // TODO make this configurable
         this.suffixes = Arrays.asList(
                 "esq", "esquire", "jr",
                 "sr", "2", "ii", "iii", "iv");
         this.prefixes = Arrays.asList(
                     "bar", "ben", "bin", "da", "dal",
                     "de la", "de", "del", "der", "di", "ibn", "la", "le",
                     "san", "st", "ste", "van", "van der", "van den", "vel",
                     "von" );
     }

     /**
      * Parses a name from the given string.
      *
      * @param name the name to parse. Must not be null.
      * @throws NameParseException if the parser fails to retrieve the name parts.
      * @throws NullPointerException if name is null.
      * @return The name object
      */
     public Name parse(String name) {
         Objects.requireNonNull(name, "Parameter 'name' must not be null.");

         NameString nameString = new NameString(name);
         // TODO compile regexes only once when the parser is created
         String suffixes = StringUtils.join(this.suffixes, "\\.*|") + "\\.*";
         String prefixes = StringUtils.join(this.prefixes, " |") + " ";

         // The regex use is a bit tricky.  *Everything* matched by the regex will be replaced,
         // but you can select a particular parenthesized submatch to be returned.
         // Also, note that each regex requres that the preceding ones have been run, and matches chopped out.
         // names that starts or end w/ an apostrophe break this
         String nicknamesRegex = "(?i) ('|\\\"|\\(\\\"*'*)(.+?)('|\\\"|\\\"*'*\\)) ";
         String suffixRegex = "(?i),* *((" + suffixes + ")$)";
         String lastRegex = "(?i)(?!^)\\b([^ ]+ y |" + prefixes + ")*[^ ]+$";
         // note the lookahead, which isn't returned or replaced
         String leadingInitRegex = "(?i)(^(.\\.*)(?= \\p{L}{2}))";
         String firstRegex = "(?i)^([^ ]+)";

         // get nickname, if there is one
         String nickname = nameString.chopWithRegex(nicknamesRegex, 2);

         // get suffix, if there is one
         String suffix = nameString.chopWithRegex(suffixRegex, 1);

         // flip the before-comma and after-comma parts of the name
         nameString.flip(",");

         // get the last name
         String last = nameString.chopWithRegex(lastRegex, 0);

         // get the first initial, if there is one
         String leadingInit = nameString.chopWithRegex(leadingInitRegex, 1);

         // get the first name
         String first = nameString.chopWithRegex(firstRegex, 0);
         if (StringUtils.isBlank(first)) {
             throw new NameParseException("Couldn't find a first name in '{" + nameString.getWrappedString() + "}'");
         }

         // if anything's left, that's the middle name
         String middle = nameString.getWrappedString();

         return new Name(leadingInit, first, nickname, middle, last, suffix);
     }

 }
	/*
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package org.apache.commons.text.names;

	import java.util.Arrays;
	import java.util.List;
	import java.util.Objects;

	import org.apache.commons.lang3.StringUtils;

	/**
	* A parser capable of parsing name parts out of a single string.
	*
	* <h3>Parsing examples</h3>
	*
	* <p>The code works by basically applying several Regexes in a certain order
	* and removing (chopping) tokens off the original string. The parser creates
	* a {@link Name} object representing the parse result. Note that passing null
	* to the {@link #parse(String)} method will result in an exception.</p>
	*
	* <table summary="Examples">
	* <tr>
	* <th>input</th>
	* <th>Leading initial</th>
	* <th>First name</th>
	* <th>Nick name</th>
	* <th>Middle name</th>
	* <th>Last Name</th>
	* <th>Suffix</th>
	* </tr>
	* <tr>
	* <td>J. Walter Weatherman</td>
	* <td>J.</td>
	* <td>Walter</td>
	* <td></td>
	* <td></td>
	* <td>Weatherman</td>
	* <td></td>
	* </tr>
	* <tr>
	* <td>de la Cruz, Ana M.</td>
	* <td></td>
	* <td>Ana</td>
	* <td></td>
	* <td>M.</td>
	* <td>de la Cruz</td>
	* <td></td>
	* </tr>
	* <tr>
	* <td>James C. ('Jimmy') O'Dell, Jr.</td>
	* <td></td>
	* <td>James</td>
	* <td>Jimmy</td>
	* <td>C.</td>
	* <td>O'Dell</td>
	* <td>Jr.</td>
	* </tr>
	* </table>
	*
	* <h3>Sample usage</h3>
	*
	* <p>HumanNameParser instances are immutable and can be reused for parsing multiple names:</p>
	*
	* <pre>
	* HumanNameParser parser = new HumanNameParser();
	* Name parsedName = parser.parse("Sérgio Vieira de Mello")
	* String firstName = parsedName.getFirstName();
	* String nickname = parsedName.getNickName();
	* // ...
	*
	* Name nextName = parser.parse("James C. ('Jimmy') O'Dell, Jr.")
	* String firstName = nextName.getFirstName();
	* String nickname = nextName.getNickName();
	* </pre>
	*
	* <h3>Further notes</h3>
	*
	* <p>The original code was written in <a href="http://jasonpriem.com/human-name-parse">PHP</a>
	* and ported to <a href="http://tupilabs.github.io/HumanNameParser.java/">Java</a>. This
	* implementation is based on the Java implementation, with additions
	* suggested in <a href="https://issues.apache.org/jira/browse/TEXT-15">TEXT-15</a>
	* and <a href="https://issues.apache.org/jira/browse/TEXT-16">TEXT-16</a>.</p>
	*
	* <p>This class is immutable.</p>
	*/
	public final class HumanNameParser {

	private final List<String> suffixes;
	private final List<String> prefixes;

	/**
	* Creates a new parser.
	*/
	public HumanNameParser() {
	// TODO make this configurable
	this.suffixes = Arrays.asList(
	"esq", "esquire", "jr",
	"sr", "2", "ii", "iii", "iv");
	this.prefixes = Arrays.asList(
	"bar", "ben", "bin", "da", "dal",
	"de la", "de", "del", "der", "di", "ibn", "la", "le",
	"san", "st", "ste", "van", "van der", "van den", "vel",
	"von" );
	}

	/**
	* Parses a name from the given string.
	*
	* @param name the name to parse. Must not be null.
	* @throws NameParseException if the parser fails to retrieve the name parts.
	* @throws NullPointerException if name is null.
	* @return The name object
	*/
	public Name parse(String name) {
	Objects.requireNonNull(name, "Parameter 'name' must not be null.");

	NameString nameString = new NameString(name);
	// TODO compile regexes only once when the parser is created
	String suffixes = StringUtils.join(this.suffixes, "\\.\|") + "\\.";
	String prefixes = StringUtils.join(this.prefixes, " \|") + " ";

	// The regex use is a bit tricky. Everything matched by the regex will be replaced,
	// but you can select a particular parenthesized submatch to be returned.
	// Also, note that each regex requres that the preceding ones have been run, and matches chopped out.
	// names that starts or end w/ an apostrophe break this
	String nicknamesRegex = "(?i) ('\|\\\"\|\\(\\\"')(.+?)('\|\\\"\|\\\"'\\)) ";
	String suffixRegex = "(?i),* *((" + suffixes + ")$)";
	String lastRegex = "(?i)(?!^)\\b([^ ]+ y \|" + prefixes + ")*[^ ]+$";
	// note the lookahead, which isn't returned or replaced
	String leadingInitRegex = "(?i)(^(.\\.*)(?= \\p{L}{2}))";
	String firstRegex = "(?i)^([^ ]+)";

	// get nickname, if there is one
	String nickname = nameString.chopWithRegex(nicknamesRegex, 2);

	// get suffix, if there is one
	String suffix = nameString.chopWithRegex(suffixRegex, 1);

	// flip the before-comma and after-comma parts of the name
	nameString.flip(",");

	// get the last name
	String last = nameString.chopWithRegex(lastRegex, 0);

	// get the first initial, if there is one
	String leadingInit = nameString.chopWithRegex(leadingInitRegex, 1);

	// get the first name
	String first = nameString.chopWithRegex(firstRegex, 0);
	if (StringUtils.isBlank(first)) {
	throw new NameParseException("Couldn't find a first name in '{" + nameString.getWrappedString() + "}'");
	}

	// if anything's left, that's the middle name
	String middle = nameString.getWrappedString();

	return new Name(leadingInit, first, nickname, middle, last, suffix);
	}

	}