ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/util/tokenize/TokenizedTerm.java - ctakes - Git at Google

 package org.apache.ctakes.dictionary.cased.util.tokenize;

 import org.apache.ctakes.dictionary.lookup2.util.CuiCodeUtil;

 import javax.annotation.concurrent.Immutable;
 import java.util.*;
 import java.util.stream.Collectors;

 /**
  * A dictionary term split into lookup tokens and paired with its numeric cui code.
  * <p>
  * The term text is split on whitespace and each word is then broken into tokens: letters and digits
  * accumulate into one token, a hyphen stays attached when it follows a known prefix or precedes a known
  * suffix, a word-final {@code 's} and a word-final single-digit decimal (e.g. {@code .2}) become their own
  * tokens, and every other symbol is emitted as a single-character token.
  * <p>
  * Equality and the hash code are both derived from the token sequence and the cui code, so two terms whose
  * texts differ only in whitespace are equal and share a hash code.
  *
  * @author SPF , chip-nlp
  * @version %I%
  * @since 8/17/2020
  */
 @Immutable
 final public class TokenizedTerm {

    // TODO : follow https://clear.colorado.edu/compsem/documents/treebank_guidelines.pdf or later/
    // More hyphenates in "2.2.1 Hyphenated Words"
    // 2.2.2 Abbreviations Containing Slashes  (e.g. b/c, d/c, w/o)
    // Attempted date detection.  May be difficult, but I've done it before.


    // TODO : add contractions:
    // The following contractions and related items are split into separate tokens.
    //    's
    //    've
    //    're
    //    'll
    //    'd
    //    n't
    //    can not

    /** Lower-case prefixes (including the trailing hyphen) that stay attached to the word that follows. */
    static private final Collection<String> PREFIXES = new HashSet<>( Arrays.asList(
          "e-", "a-", "u-", "x-",
          "agro-", "ante-", "anti-", "arch-",
          "be-", "bi-", "bio-",
          "co-", "counter-", "cross-", "cyber-",
          "de-",
          "eco-", "ex-", "extra-",
          "inter-", "intra-",
          "macro-", "mega-", "micro-", "mid-", "mini-", "multi-",
          "neo-", "non-",
          "over-",
          "pan-", "para-", "peri-", "post-", "pre-", "pro-", "pseudo-",
          "quasi-",
          "re-",
          "semi-", "sub-", "super-",
          "tri-",
          "ultra-", "un-", "uni-",
          "vice-",
          // From email from Colin Warner <colinw@ldc.upenn.edu> on 7/25/2010
          "electro-",
          // NOTE(review): "gasto-" is possibly meant to be "gastro-" - confirm against the text tokenizer before changing.
          "gasto-",
          "homo-", "hetero-", "ortho-", "phospho-" ) );

    /**
     * Lower-case suffixes (including the leading hyphen) that stay attached to the word that precedes.
     * NOTE : the candidate suffix compared against this set never contains a hyphen
     * (see {@link #getNextCharTerm(String)}), so "-o-torium" cannot currently match.
     */
    static private final Collection<String> SUFFIXES = new HashSet<>( Arrays.asList(
          "-esque", "-ette", "-fest", "-fold", "-gate", "-itis",
          "-less", "-most", "-o-torium", "-rama", "-wise" ) );


    // TODO - this requires all lower or all-upper case.  That is not the correct way to deal with things.
    // There may be "Upper-Case" and "Upper-case" terms - more than "UPPER-CASE" anyway.

    // Locale.ROOT keeps the upper-casing independent of the JVM default locale (e.g. Turkish dotted I).
    static private final Collection<String> UPPER_PREFIXES = PREFIXES.stream()
                                                                     .map( prefix -> prefix.toUpperCase( Locale.ROOT ) )
                                                                     .collect( Collectors.toSet() );

    static private final Collection<String> UPPER_SUFFIXES = SUFFIXES.stream()
                                                                     .map( suffix -> suffix.toUpperCase( Locale.ROOT ) )
                                                                     .collect( Collectors.toSet() );

    /** Compiled once; splits term text into whitespace-delimited words. */
    static private final java.util.regex.Pattern WHITESPACE = java.util.regex.Pattern.compile( "\\s+" );


    final private String[] _tokens;
    final private boolean _allUpperCase;
    final private boolean _allLowerCase;
    final private Long _cui;
    final private int _hashcode;

    /**
     * @param cui  text form of the cui; converted to its numeric code with {@link CuiCodeUtil}.
     * @param text full text of the term; must not be null.
     */
    public TokenizedTerm( final String cui, final String text ) {
       _cui = CuiCodeUtil.getInstance().getCuiCode( cui );
       _tokens = getTermTokens( text );
       boolean anyCaps = false;
       boolean anyLower = false;
       for ( char c : text.toCharArray() ) {
          if ( Character.isUpperCase( c ) ) {
             anyCaps = true;
          } else if ( Character.isLowerCase( c ) ) {
             anyLower = true;
          }
          if ( anyCaps && anyLower ) {
             // Mixed case: neither flag can become true, no need to scan further.
             break;
          }
       }
       _allUpperCase = anyCaps && !anyLower;
       _allLowerCase = anyLower && !anyCaps;
       // Hash exactly the state that equals(..) compares so that equal terms always share a hash code.
       _hashcode = 31 * Arrays.hashCode( _tokens ) + Objects.hashCode( _cui );
    }

    /**
     * @return numeric cui code for this term.
     * @throws NullPointerException if the cui code obtained at construction was null (unboxing).
     */
    public long getCui() {
       return _cui;
    }

    /**
     * @return a copy of the tokens of this term, in text order.  A copy is returned so that callers
     * cannot alter the state of this immutable term.
     */
    public String[] getTokens() {
       return _tokens.clone();
    }

    /**
     * @return true if the term text has at least one upper-case character and no lower-case character.
     */
    public boolean isAllUpperCase() {
       return _allUpperCase;
    }

    /**
     * @return true if the term text has at least one lower-case character and no upper-case character.
     */
    public boolean isAllLowerCase() {
       return _allLowerCase;
    }


    /**
     * @param text full text of a term.
     * @return tokens of every whitespace-delimited word in the text, in order.  Empty for empty text.
     */
    static private String[] getTermTokens( final String text ) {
       if ( text.isEmpty() ) {
          return new String[ 0 ];
       }
       return Arrays.stream( WHITESPACE.split( text ) )
                    .map( TokenizedTerm::getTokens )
                    .flatMap( Collection::stream )
                    .toArray( String[]::new );
    }

    // TODO should this be exactly the same as getTokens in TextTokenizer (dictionary gui code)  ? probably ...

    /**
     * @param word a single whitespace-free word.
     * @return tokens of the word, in order.
     */
    static private List<String> getTokens( final String word ) {
       final List<String> tokens = new ArrayList<>();
       final StringBuilder sb = new StringBuilder();
       final int count = word.length();
       for ( int i = 0; i < count; i++ ) {
          final char c = word.charAt( i );
          if ( Character.isLetterOrDigit( c ) ) {
             sb.append( c );
             continue;
          }
          if ( c == '-' && (isPrefix( sb.toString() ) || isSuffix( word, i + 1 )) ) {
             // what precedes is a prefix or what follows is a suffix so append the dash to the current word and move on
             sb.append( c );
             continue;
          }
          if ( (c == '\'' && isOwnerApostrophe( word, i + 1 ))
               || (c == '.' && isNumberDecimal( word, i + 1 )) ) {
             // what follows is an 's or .# so add the preceding and move on.
             // The symbol starts the next token, giving "'s" or ".2".
             if ( sb.length() != 0 ) {
                tokens.add( createToken( sb ) );
                sb.setLength( 0 );
             }
             sb.append( c );
             continue;
          }
          // Wasn't a special symbol for consideration, so add the previous and symbol separately
          if ( sb.length() != 0 ) {
             tokens.add( createToken( sb ) );
             sb.setLength( 0 );
          }
          tokens.add( String.valueOf( c ) );
       }
       if ( sb.length() != 0 ) {
          tokens.add( createToken( sb ) );
       }
       return tokens;
    }

    /**
     * @param sb characters accumulated for the current token.
     * @return the token text.
     */
    static private String createToken( final StringBuilder sb ) {
       return sb.toString();
    }

    /**
     * @param word characters that precede a hyphen, without the hyphen.
     * @return true if word plus a hyphen is a known prefix in all-lower or all-upper case.
     */
    static private boolean isPrefix( final String word ) {
       final String candidate = word + "-";
       return PREFIXES.contains( candidate ) || UPPER_PREFIXES.contains( candidate );
    }

    /**
     * @param word       full word being tokenized.
     * @param startIndex index of the first character after a hyphen.
     * @return true if the run of letters and digits starting at startIndex is a known suffix
     * in all-lower or all-upper case.
     */
    static private boolean isSuffix( final String word, final int startIndex ) {
       if ( word.length() <= startIndex ) {
          return false;
       }
       final String nextCharTerm = getNextCharTerm( word.substring( startIndex ) );
       if ( nextCharTerm.isEmpty() ) {
          return false;
       }
       final String candidate = "-" + nextCharTerm;
       return SUFFIXES.contains( candidate ) || UPPER_SUFFIXES.contains( candidate );
    }

    /**
     * @param word       full word being tokenized.
     * @param startIndex index of the first character after an apostrophe.
     * @return true if exactly one character follows the apostrophe and it is a lower-case 's'.
     */
    static private boolean isOwnerApostrophe( final CharSequence word, final int startIndex ) {
       return word.length() == startIndex + 1 && word.charAt( startIndex ) == 's';
    }

    /**
     * @param word       full word being tokenized.
     * @param startIndex index of the first character after a period.
     * @return true if exactly one character follows the period and it is a digit.
     */
    static private boolean isNumberDecimal( final CharSequence word, final int startIndex ) {
       // Bizarre scenario in which ctakes tokenizes ".2" as a fraction, but not ".22"
       return word.length() == startIndex + 1 && Character.isDigit( word.charAt( startIndex ) );
    }

    /**
     * @param word some text.
     * @return the leading run of letters and digits in word; the whole word if it has no other character.
     */
    static private String getNextCharTerm( final String word ) {
       final int count = word.length();
       for ( int i = 0; i < count; i++ ) {
          final char c = word.charAt( i );
          if ( !Character.isLetterOrDigit( c ) ) {
             return word.substring( 0, i );
          }
       }
       return word;
    }

    /**
     * @param value object to compare.
     * @return true if value is a TokenizedTerm with the same tokens and the same cui code.
     */
    @Override
    public boolean equals( final Object value ) {
       if ( this == value ) {
          return true;
       }
       if ( !(value instanceof TokenizedTerm) ) {
          return false;
       }
       final TokenizedTerm other = (TokenizedTerm)value;
       return Arrays.equals( _tokens, other._tokens ) && Objects.equals( _cui, other._cui );
    }

    /**
     * @return hash of the tokens and cui code, computed once at construction.
     */
    @Override
    public int hashCode() {
       return _hashcode;
    }


 }
	package org.apache.ctakes.dictionary.cased.util.tokenize;

	import org.apache.ctakes.dictionary.lookup2.util.CuiCodeUtil;

	import javax.annotation.concurrent.Immutable;
	import java.util.*;
	import java.util.stream.Collectors;

	/**
	* @author SPF , chip-nlp
	* @version %I%
	* @since 8/17/2020
	*/
	@Immutable
	final public class TokenizedTerm {

	// TODO : follow https://clear.colorado.edu/compsem/documents/treebank_guidelines.pdf or later/
	// More hyphenates in "2.2.1 Hyphenated Words"
	// 2.2.2 Abbreviations Containing Slashes (e.g. b/c, d/c, w/o)
	// Attempted date detection. May be difficult, but I've done it before.



	// TODO : add contractions:
	// The following contractions and related items are split into separate tokens.
	// // 's
	// // 've
	// // 're
	// // 'll
	// // 'd
	// // n't
	// // can not

	static private final Collection<String> PREFIXES = new HashSet<>( Arrays.asList(
	"e-",
	"a-",
	"u-",
	"x-",
	"agro-",
	"ante-",
	"anti-",
	"arch-",
	"be-",
	"bi-",
	"bio-",
	"co-",
	"counter-",
	"cross-",
	"cyber-",
	"de-",
	"eco-",
	"ex-",
	"extra-",
	"inter-",
	"intra-",
	"macro-",
	"mega-",
	"micro-",
	"mid-",
	"mini-",
	"multi-",
	"neo-",
	"non-",
	"over-",
	"pan-",
	"para-",
	"peri-",
	"post-",
	"pre-",
	"pro-",
	"pseudo-",
	"quasi-",
	"re-",
	"semi-",
	"sub-",
	"super-",
	"tri-",
	"ultra-",
	"un-",
	"uni-",
	"vice-",
	// From email from Colin Warner <colinw@ldc.upenn.edu> on 7/25/2010
	"electro-",
	"gasto-",
	"homo-",
	"hetero-",
	"ortho-",
	"phospho-" ) );

	static private final Collection<String> SUFFIXES = new HashSet<>( Arrays.asList(
	"-esque",
	"-ette",
	"-fest",
	"-fold",
	"-gate",
	"-itis",
	"-less",
	"-most",
	"-o-torium",
	"-rama",
	"-wise" ) );


	// TODO - this requires all lower or all-upper case. That is not the correct way to deal with things.
	// There may be "Upper-Case" and "Upper-case" terms - more than "UPPER-CASE" anyway.

	static private final Collection<String> UPPER_PREFIXES = PREFIXES.stream()
	.map( String::toUpperCase )
	.collect( Collectors.toSet() );

	static private final Collection<String> UPPER_SUFFIXES = SUFFIXES.stream()
	.map( String::toUpperCase )
	.collect( Collectors.toSet() );


	final private String[] _tokens;
	final private boolean _allUpperCase;
	final private boolean _allLowerCase;
	final private Long _cui;
	final private int _hashcode;

	public TokenizedTerm( final String cui, final String text ) {
	_cui = CuiCodeUtil.getInstance().getCuiCode( cui );
	_tokens = getTermTokens( text );
	boolean anyCaps = false;
	boolean anyLower = false;
	for ( char c : text.toCharArray() ) {
	if ( Character.isUpperCase( c ) ) {
	anyCaps = true;
	} else if ( Character.isLowerCase( c ) ) {
	anyLower = true;
	}
	if ( anyCaps && anyLower ) {
	break;
	}
	}
	_allUpperCase = anyCaps && !anyLower;
	_allLowerCase = anyLower && !anyCaps;
	_hashcode = (cui + "_" + text).hashCode();
	}

	public long getCui() {
	return _cui;
	}

	public String[] getTokens() {
	return _tokens;
	}

	public boolean isAllUpperCase() {
	return _allUpperCase;
	}

	public boolean isAllLowerCase() {
	return _allLowerCase;
	}


	static private String[] getTermTokens( final String text ) {
	if ( text.isEmpty() ) {
	return new String[ 0 ];
	}
	return Arrays.stream( text.split( "\\s+" ) )
	.map( TokenizedTerm::getTokens )
	.flatMap( Collection::stream )
	.toArray( String[]::new );
	}

	// TODO should this be exactly the same as getTokens in TextTokenizer (dictionary gui code) ? probably ...
	static private List<String> getTokens( final String word ) {
	final List<String> tokens = new ArrayList<>();
	final StringBuilder sb = new StringBuilder();
	final int count = word.length();
	for ( int i = 0; i < count; i++ ) {
	final char c = word.charAt( i );
	if ( Character.isLetterOrDigit( c ) ) {
	sb.append( c );
	continue;
	}
	if ( c == '-' && (isPrefix( sb.toString() ) \|\| isSuffix( word, i + 1 )) ) {
	// what precedes is a prefix or what follows is a suffix so append the dash to the current word and move on
	sb.append( c );
	continue;
	}
	if ( (c == '\'' && isOwnerApostrophe( word, i + 1 ))
	\|\| (c == '.' && isNumberDecimal( word, i + 1 )) ) {
	// what follows is an 's or .# so add the preceding and move on
	if ( sb.length() != 0 ) {
	tokens.add( createToken( sb ) );
	sb.setLength( 0 );
	}
	sb.append( c );
	continue;
	}
	// Wasn't a special symbol for consideration, so add the previous and symbol separately
	if ( sb.length() != 0 ) {
	tokens.add( createToken( sb ) );
	sb.setLength( 0 );
	}
	tokens.add( "" + c );
	}
	if ( sb.length() != 0 ) {
	tokens.add( createToken( sb ) );
	}
	return tokens;
	}

	static private String createToken( final StringBuilder sb ) {
	return sb.toString();
	}

	static private boolean isPrefix( final String word ) {
	return PREFIXES.contains( word + "-" ) \|\| UPPER_PREFIXES.contains( word + "-" );
	}

	static private boolean isSuffix( final String word, final int startIndex ) {
	if ( word.length() <= startIndex ) {
	return false;
	}
	final String nextCharTerm = getNextCharTerm( word.substring( startIndex ) );
	if ( nextCharTerm.isEmpty() ) {
	return false;
	}
	return SUFFIXES.contains( "-" + nextCharTerm ) \|\| UPPER_SUFFIXES.contains( "-" + nextCharTerm );
	}

	static private boolean isOwnerApostrophe( final CharSequence word, final int startIndex ) {
	return word.length() == startIndex + 1 && word.charAt( startIndex ) == 's';
	}

	static private boolean isNumberDecimal( final CharSequence word, final int startIndex ) {
	// Bizarre scenario in which ctakes tokenizes ".2" as a fraction, but not ".22"
	return word.length() == startIndex + 1 && Character.isDigit( word.charAt( startIndex ) );
	}

	static private String getNextCharTerm( final String word ) {
	final int count = word.length();
	for ( int i = 0; i < count; i++ ) {
	final char c = word.charAt( i );
	if ( !Character.isLetterOrDigit( c ) ) {
	return word.substring( 0, i );
	}
	}
	return word;
	}

	public boolean equals( final Object value ) {
	return value instanceof TokenizedTerm
	&& Arrays.equals( _tokens, ((TokenizedTerm)value)._tokens )
	&& _cui.equals( ((TokenizedTerm)value)._cui );
	}

	public int hashCode() {
	return _hashcode;
	}


	}