blob: ab19405e20fcc7343befabfaf31508a4de28e34a [file] [log] [blame]
package org.apache.ctakes.dictionary.cased.util.tokenize;
import org.apache.ctakes.dictionary.lookup2.util.CuiCodeUtil;
import javax.annotation.concurrent.Immutable;
import java.util.*;
import java.util.stream.Collectors;
/**
 * A dictionary term broken into tokens, together with its numeric cui code and flags indicating
 * whether the original text was entirely upper case or entirely lower case.
 * <p>
 * Text is first split on whitespace, then each word is split on non-alphanumeric characters.
 * Exceptions: a dash is kept inside a token when it follows a known prefix or precedes a known suffix,
 * a trailing {@code 's} becomes its own token, and a trailing {@code .#} (a period and a single digit)
 * becomes its own token.
 * <p>
 * Two terms are equal when their cui codes and token sequences are equal; the hash code is derived from
 * the same two values.
 *
 * @author SPF , chip-nlp
 * @version %I%
 * @since 8/17/2020
 */
@Immutable
final public class TokenizedTerm {

   // TODO : follow https://clear.colorado.edu/compsem/documents/treebank_guidelines.pdf or later
   //  More hyphenates in "2.2.1 Hyphenated Words"
   //  2.2.2 Abbreviations Containing Slashes (e.g. b/c, d/c, w/o)
   //  Attempted date detection.  May be difficult, but I've done it before.
   // TODO : add contractions:
   //  The following contractions and related items are split into separate tokens.
   //  //  's
   //  //  've
   //  //  're
   //  //  'll
   //  //  'd
   //  //  n't
   //  //  can not

   /**
    * Lower case prefixes, each ending with a dash, after which a dash does not split a token.
    */
   static private final Collection<String> PREFIXES = new HashSet<>( Arrays.asList(
         "e-",
         "a-",
         "u-",
         "x-",
         "agro-",
         "ante-",
         "anti-",
         "arch-",
         "be-",
         "bi-",
         "bio-",
         "co-",
         "counter-",
         "cross-",
         "cyber-",
         "de-",
         "eco-",
         "ex-",
         "extra-",
         "inter-",
         "intra-",
         "macro-",
         "mega-",
         "micro-",
         "mid-",
         "mini-",
         "multi-",
         "neo-",
         "non-",
         "over-",
         "pan-",
         "para-",
         "peri-",
         "post-",
         "pre-",
         "pro-",
         "pseudo-",
         "quasi-",
         "re-",
         "semi-",
         "sub-",
         "super-",
         "tri-",
         "ultra-",
         "un-",
         "uni-",
         "vice-",
         // From email from Colin Warner <colinw@ldc.upenn.edu> on 7/25/2010
         "electro-",
         // NOTE(review): "gasto-" may be a typo for "gastro-".
         // Left unchanged because altering it changes tokenization - confirm against the source list.
         "gasto-",
         "homo-",
         "hetero-",
         "ortho-",
         "phospho-" ) );

   /**
    * Lower case suffixes, each starting with a dash, before which a dash does not split a token.
    */
   // NOTE(review): "-o-torium" cannot currently match.  isSuffix(..) only looks at the run of letters/digits
   // that directly follows the dash, so it tests "-o" and then "-torium", neither of which is listed.
   static private final Collection<String> SUFFIXES = new HashSet<>( Arrays.asList(
         "-esque",
         "-ette",
         "-fest",
         "-fold",
         "-gate",
         "-itis",
         "-less",
         "-most",
         "-o-torium",
         "-rama",
         "-wise" ) );

   // TODO - this requires all lower or all-upper case.  That is not the correct way to deal with things.
   //  There may be "Upper-Case" and "Upper-case" terms - more than "UPPER-CASE" anyway.
   // Locale.ROOT keeps the upper-casing independent of the JVM default locale (e.g. Turkish dotted I).
   static private final Collection<String> UPPER_PREFIXES = PREFIXES.stream()
                                                                    .map( p -> p.toUpperCase( Locale.ROOT ) )
                                                                    .collect( Collectors.toSet() );
   static private final Collection<String> UPPER_SUFFIXES = SUFFIXES.stream()
                                                                    .map( s -> s.toUpperCase( Locale.ROOT ) )
                                                                    .collect( Collectors.toSet() );

   final private String[] _tokens;
   final private boolean _allUpperCase;
   final private boolean _allLowerCase;
   final private Long _cui;
   final private int _hashcode;

   /**
    * @param cui  cui as text; converted to a code with {@link CuiCodeUtil#getCuiCode}.
    * @param text full text of the term; split into tokens and inspected for letter case.
    */
   public TokenizedTerm( final String cui, final String text ) {
      _cui = CuiCodeUtil.getInstance().getCuiCode( cui );
      _tokens = getTermTokens( text );
      boolean anyCaps = false;
      boolean anyLower = false;
      for ( char c : text.toCharArray() ) {
         if ( Character.isUpperCase( c ) ) {
            anyCaps = true;
         } else if ( Character.isLowerCase( c ) ) {
            anyLower = true;
         }
         if ( anyCaps && anyLower ) {
            // Mixed case, neither flag can be true.  No need to look further.
            break;
         }
      }
      // Text without any cased character (digits, symbols, empty) is neither all-upper nor all-lower.
      _allUpperCase = anyCaps && !anyLower;
      _allLowerCase = anyLower && !anyCaps;
      // Hash exactly the values compared by equals(..) so that equal terms always share a hash code.
      _hashcode = 31 * Objects.hashCode( _cui ) + Arrays.hashCode( _tokens );
   }

   /**
    * @return the cui code obtained from {@link CuiCodeUtil} at construction.
    */
   public long getCui() {
      return _cui;
   }

   /**
    * @return a copy of the tokens for this term, in text order.  A copy is returned so that callers cannot
    * alter the state of this term.
    */
   public String[] getTokens() {
      return _tokens.clone();
   }

   /**
    * @return true if the term text has at least one upper case character and no lower case character.
    */
   public boolean isAllUpperCase() {
      return _allUpperCase;
   }

   /**
    * @return true if the term text has at least one lower case character and no upper case character.
    */
   public boolean isAllLowerCase() {
      return _allLowerCase;
   }

   /**
    * Splits the text on runs of whitespace and tokenizes each resulting word.
    *
    * @param text full text of a term.
    * @return all tokens of all words in text order; an empty array for empty text.
    */
   static private String[] getTermTokens( final String text ) {
      if ( text.isEmpty() ) {
         return new String[ 0 ];
      }
      return Arrays.stream( text.split( "\\s+" ) )
                   .map( TokenizedTerm::getTokens )
                   .flatMap( Collection::stream )
                   .toArray( String[]::new );
   }

   // TODO should this be exactly the same as getTokens in TextTokenizer (dictionary gui code) ?  probably ...

   /**
    * Splits a single whitespace-free word into tokens.
    * Letters and digits accumulate into the current token.  Any other character becomes a token by itself,
    * except for a dash next to a known prefix or suffix, an apostrophe starting a trailing {@code 's},
    * and a period starting a trailing single digit.
    *
    * @param word text without whitespace.
    * @return tokens of the word in order; empty for an empty word.
    */
   static private List<String> getTokens( final String word ) {
      final List<String> tokens = new ArrayList<>();
      final StringBuilder sb = new StringBuilder();
      final int count = word.length();
      for ( int i = 0; i < count; i++ ) {
         final char c = word.charAt( i );
         if ( Character.isLetterOrDigit( c ) ) {
            sb.append( c );
            continue;
         }
         if ( c == '-' && (isPrefix( sb.toString() ) || isSuffix( word, i + 1 )) ) {
            // what precedes is a prefix or what follows is a suffix so append the dash to the current word and move on
            sb.append( c );
            continue;
         }
         if ( (c == '\'' && isOwnerApostrophe( word, i + 1 ))
              || (c == '.' && isNumberDecimal( word, i + 1 )) ) {
            // what follows is an 's or .# so add the preceding and start a new token with the symbol
            if ( sb.length() != 0 ) {
               tokens.add( createToken( sb ) );
               sb.setLength( 0 );
            }
            sb.append( c );
            continue;
         }
         // Wasn't a special symbol for consideration, so add the previous and symbol separately
         if ( sb.length() != 0 ) {
            tokens.add( createToken( sb ) );
            sb.setLength( 0 );
         }
         tokens.add( String.valueOf( c ) );
      }
      if ( sb.length() != 0 ) {
         tokens.add( createToken( sb ) );
      }
      return tokens;
   }

   /**
    * @param sb characters accumulated for the current token.
    * @return the accumulated characters as a token.
    */
   static private String createToken( final StringBuilder sb ) {
      return sb.toString();
   }

   /**
    * @param word text accumulated before a dash, without the dash.
    * @return true if the word plus a dash is a known all-lower-case or all-upper-case prefix.
    */
   static private boolean isPrefix( final String word ) {
      final String dashed = word + "-";
      return PREFIXES.contains( dashed ) || UPPER_PREFIXES.contains( dashed );
   }

   /**
    * @param word       full word being tokenized.
    * @param startIndex index of the first character after a dash.
    * @return true if the run of letters and digits beginning at startIndex, preceded by a dash,
    * is a known all-lower-case or all-upper-case suffix.
    */
   static private boolean isSuffix( final String word, final int startIndex ) {
      if ( word.length() <= startIndex ) {
         // The dash is the last character, nothing follows it.
         return false;
      }
      final String nextCharTerm = getNextCharTerm( word.substring( startIndex ) );
      if ( nextCharTerm.isEmpty() ) {
         return false;
      }
      final String dashed = "-" + nextCharTerm;
      return SUFFIXES.contains( dashed ) || UPPER_SUFFIXES.contains( dashed );
   }

   /**
    * @param word       full word being tokenized.
    * @param startIndex index of the first character after an apostrophe.
    * @return true if exactly one character follows the apostrophe and it is a lower case 's'.
    */
   static private boolean isOwnerApostrophe( final CharSequence word, final int startIndex ) {
      return word.length() == startIndex + 1 && word.charAt( startIndex ) == 's';
   }

   /**
    * @param word       full word being tokenized.
    * @param startIndex index of the first character after a period.
    * @return true if exactly one character follows the period and it is a digit.
    */
   static private boolean isNumberDecimal( final CharSequence word, final int startIndex ) {
      // Bizarre scenario in which ctakes tokenizes ".2" as a fraction, but not ".22"
      return word.length() == startIndex + 1 && Character.isDigit( word.charAt( startIndex ) );
   }

   /**
    * @param word some text.
    * @return the leading run of letters and digits in the text; the whole text if it contains nothing else.
    */
   static private String getNextCharTerm( final String word ) {
      final int count = word.length();
      for ( int i = 0; i < count; i++ ) {
         final char c = word.charAt( i );
         if ( !Character.isLetterOrDigit( c ) ) {
            return word.substring( 0, i );
         }
      }
      return word;
   }

   /**
    * @param value some object.
    * @return true if the object is a TokenizedTerm with equal tokens and an equal cui code.
    */
   @Override
   public boolean equals( final Object value ) {
      if ( this == value ) {
         return true;
      }
      if ( !(value instanceof TokenizedTerm) ) {
         return false;
      }
      final TokenizedTerm other = (TokenizedTerm)value;
      return Arrays.equals( _tokens, other._tokens ) && Objects.equals( _cui, other._cui );
   }

   /**
    * @return hash code computed at construction from the cui code and the tokens.
    */
   @Override
   public int hashCode() {
      return _hashcode;
   }

}