blob: dcb6bd67919ca4ad2eccc312d525a3055be2f4a5 [file] [log] [blame]
#ifndef UIMA_UNICODESTRINGREF_HPP
#define UIMA_UNICODESTRINGREF_HPP
/** \file unistrref.hpp .
-----------------------------------------------------------------------------
string interface of uima::UnicodeStringRef
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
-----------------------------------------------------------------------------
\brief Shallow string object consisting of a pair of string pointer and a length
-----------------------------------------------------------------------------
*/
#include "uima/pragmas.hpp" //must be included first to disable warnings
#include <vector>
#include <string>
#include <iostream>
#include "uima/types.h"
#include "uima/assertmsg.h"
#include "uima/ccsid.hpp"
#include "unicode/unistr.h"
#include "unicode/ustring.h"
#include "unicode/uchar.h"
#include "uima/strtools.hpp"
/* ----------------------------------------------------------------------- */
/* Interface dependencies */
/* ----------------------------------------------------------------------- */
/* ----------------------------------------------------------------------- */
/* Types / Classes */
/* ----------------------------------------------------------------------- */
namespace uima {
/**
* The class <TT>UnicodeStringRef</TT> provides support for non zero-terminated
* strings that are presented as pointers to Unicode character arrays
* with an associated length.
* As this type of string is supposed to be used only as string reference into
* read-only buffers, the string pointer is constant.
* The member functions are named to implement the icu::UnicodeString interface,
* but only the const member functions are provided.
* This class is a quick, light-weight, shallow string
* (internally it consists only of a pointer and a length)
* which can be copied by value without performance penalty.
* It allows references into other string buffers to be treated like real
* string objects.
* Since it does not own its string memory, care must be taken to make sure
* the lifetime of a UnicodeStringRef object does not exceed the lifetime
* of the Unicode character buffer it references.
*/
class UIMA_LINK_IMPORTSPEC UnicodeStringRef {
public:
/**
* Default Constructor.
* NOTE(review): presumably creates an empty reference (no buffer, length 0);
* the definition is not part of this declaration - confirm.
*/
UnicodeStringRef( void );
/**
* Constructor from icu::UnicodeString.
* This constructor is not declared explicit, so an icu::UnicodeString
* converts implicitly to a UnicodeStringRef.
* NOTE(review): presumably references the character buffer of
* <TT>crUniString</TT> without copying it, so the new object must not
* outlive (or be used after a modification of) <TT>crUniString</TT> - confirm.
* @param crUniString the string to be referenced
*/
UnicodeStringRef( const icu::UnicodeString & crUniString );
/**
* Constructor from zero terminated string.
* Declared explicit, so a plain UChar pointer is never converted
* implicitly to a UnicodeStringRef.
* NOTE(review): the length is presumably determined by searching for the
* terminating zero - confirm.
* @param cpacString pointer to a zero terminated Unicode character array
*/
explicit UnicodeStringRef( UChar const * cpacString );
/**
* Constructor from string and length.
* @param cpacString pointer to the first character of the string
* (does not have to be zero terminated)
* @param uiLength length of the string
* (presumably counted in UChar code units, not in bytes - confirm)
*/
UnicodeStringRef( UChar const * cpacString, int32_t uiLength );
/**
* Constructor from two pointers (begin/end).
* Note: end points to the first char <em>behind</em> the string.
* @param paucStringBegin pointer to the first character of the string
* @param paucStringEnd pointer to the first character <em>behind</em> the string
* @deprecated Replace with UnicodeStringRef(paucStringBegin,paucStringEnd-paucStringBegin).
*/
UnicodeStringRef( UChar const * paucStringBegin, UChar const * paucStringEnd );
///Accessor for the number of bytes occupied by this string.
///NOTE(review): presumably the number of characters times sizeof(UChar),
///i.e. a byte count and not a character count - confirm.
int32_t getSizeInBytes( void ) const;
///CONST Accessor for the string content (NOT ZERO DELIMITED!).
///Because the returned array is not zero delimited, it has to be used
///together with the length of this string; the characters can not be
///modified through the returned pointer.
UChar const * getBuffer( void ) const;
///Assignment operator.
///NOTE(review): as the class is described as a shallow reference, this
///presumably copies only the pointer and the length and not the
///characters - confirm.
UnicodeStringRef & operator=( UnicodeStringRef const & crclRHS );
///Equality operator.
///Returns an int rather than a bool
///(presumably non-zero if both strings are equal, 0 otherwise - confirm).
int operator==( const UnicodeStringRef & crclRHS ) const;
///Inequality operator.
///Returns an int rather than a bool
///(presumably non-zero if the strings differ, 0 otherwise - confirm).
int operator!=( const UnicodeStringRef & crclRHS ) const;
///Less-than operator.
///NOTE(review): the ordering is presumably the bitwise order used by
///compare() - confirm.
bool operator< ( UnicodeStringRef const & text ) const;
///Less-than-or-equal operator.
///NOTE(review): the ordering is presumably the bitwise order used by
///compare() - confirm.
bool operator<=( UnicodeStringRef const & text ) const;
///Greater-than operator.
///NOTE(review): the ordering is presumably the bitwise order used by
///compare() - confirm.
bool operator> ( UnicodeStringRef const & text ) const;
///Greater-than-or-equal operator.
///NOTE(review): the ordering is presumably the bitwise order used by
///compare() - confirm.
bool operator>=( UnicodeStringRef const & text ) const;
/**
* Compare the characters bitwise in this UnicodeStringRef to
* the characters in <TT>text</TT>.
* @param text The UnicodeStringRef to compare to this one.
* @return The result of bitwise character comparison: 0 if <TT>text</TT>
* contains the same characters as this, -1 if the characters in
* <TT>text</TT> are bitwise less than the characters in this, +1 if the
* characters in <TT>text</TT> are bitwise greater than the characters
* in this.
* NOTE(review): icu::UnicodeString::compare() is documented with the
* opposite sign convention (-1 if the characters in <EM>this</EM> are
* bitwise less than the characters in <TT>text</TT>); confirm against the
* implementation which direction applies here.
* @stable
*/
inline int8_t compare(const UnicodeStringRef& text) const;
/**
* Compare the characters bitwise in this UnicodeStringRef to
* the characters in <TT>text</TT>.
* @param text The UnicodeString to compare to this one.
* @return The result of bitwise character comparison: 0 if <TT>text</TT>
* contains the same characters as this, -1 if the characters in
* <TT>text</TT> are bitwise less than the characters in this, +1 if the
* characters in <TT>text</TT> are bitwise greater than the characters
* in this.
* NOTE(review): icu::UnicodeString::compare() is documented with the
* opposite sign convention (-1 if the characters in <EM>this</EM> are
* bitwise less than the characters in <TT>text</TT>); confirm against the
* implementation which direction applies here.
* @stable
*/
inline int8_t compare(const icu::UnicodeString& text) const;
/**
* Compare the characters bitwise in the range
* [<TT>start</TT>, <TT>start + length</TT>) with the characters
* in <TT>srcText</TT>
* @param start the offset at which the compare operation begins
* @param length the number of characters of text to compare.
* @param srcText the text to be compared
* @return The result of bitwise character comparison: 0 if <TT>srcText</TT>
* contains the same characters as this, -1 if the characters in
* <TT>srcText</TT> are bitwise less than the characters in this, +1 if the
* characters in <TT>srcText</TT> are bitwise greater than the characters
* in this.
* @stable
*/
inline int8_t compare(int32_t start,
int32_t length,
const UnicodeStringRef& srcText) const;
/**
* Compare the characters bitwise in the range
* [<TT>start</TT>, <TT>start + length</TT>) with the characters
* in <TT>srcText</TT> in the range
* [<TT>srcStart</TT>, <TT>srcStart + srcLength</TT>).
* @param start the offset at which the compare operation begins
* @param length the number of characters in this to compare.
* @param srcText the text to be compared
* @param srcStart the offset into <TT>srcText</TT> to start comparison
* @param srcLength the number of characters in <TT>srcText</TT> to compare
* @return The result of bitwise character comparison: 0 if <TT>srcText</TT>
* contains the same characters as this, -1 if the characters in
* <TT>srcText</TT> are bitwise less than the characters in this, +1 if the
* characters in <TT>srcText</TT> are bitwise greater than the characters
* in this.
* @stable
*/
inline int8_t compare(int32_t start,
int32_t length,
const UnicodeStringRef& srcText,
int32_t srcStart,
int32_t srcLength) const;
/**
* Compare the characters bitwise in this UnicodeStringRef with the first
* <TT>srcLength</TT> characters in <TT>srcChars</TT>.
* @param srcChars The characters to compare to this UnicodeStringRef.
* @param srcLength the number of characters in <TT>srcChars</TT> to compare
* @return The result of bitwise character comparison: 0 if <TT>srcChars</TT>
* contains the same characters as this, -1 if the characters in
* <TT>srcChars</TT> are bitwise less than the characters in this, +1 if the
* characters in <TT>srcChars</TT> are bitwise greater than the characters
* in this.
* @stable
*/
inline int8_t compare(UChar const *srcChars,
int32_t srcLength) const;
/**
* Compare the characters bitwise in the range
* [<TT>start</TT>, <TT>start + length</TT>) with the first
* <TT>length</TT> characters in <TT>srcChars</TT>
* @param start the offset at which the compare operation begins
* @param length the number of characters to compare.
* @param srcChars the characters to be compared
* @return The result of bitwise character comparison: 0 if <TT>srcChars</TT>
* contains the same characters as this, -1 if the characters in
* <TT>srcChars</TT> are bitwise less than the characters in this, +1 if the
* characters in <TT>srcChars</TT> are bitwise greater than the characters
* in this.
* @stable
*/
inline int8_t compare(int32_t start,
int32_t length,
UChar const *srcChars) const;
/**
* Compare the characters bitwise in the range
* [<TT>start</TT>, <TT>start + length</TT>) with the characters
* in <TT>srcChars</TT> in the range
* [<TT>srcStart</TT>, <TT>srcStart + srcLength</TT>).
* @param start the offset at which the compare operation begins
* @param length the number of characters in this to compare
* @param srcChars the characters to be compared
* @param srcStart the offset into <TT>srcChars</TT> to start comparison
* @param srcLength the number of characters in <TT>srcChars</TT> to compare
* @return The result of bitwise character comparison: 0 if <TT>srcChars</TT>
* contains the same characters as this, -1 if the characters in
* <TT>srcChars</TT> are bitwise less than the characters in this, +1 if the
* characters in <TT>srcChars</TT> are bitwise greater than the characters
* in this.
* @stable
*/
inline int8_t compare(int32_t start,
int32_t length,
UChar const *srcChars,
int32_t srcStart,
int32_t srcLength) const;
/**
* Compare the characters bitwise in the range
* [<TT>start</TT>, <TT>limit</TT>) with the characters
* in <TT>srcText</TT> in the range
* [<TT>srcStart</TT>, <TT>srcLimit</TT>).
* @param start the offset at which the compare operation begins
* @param limit the offset immediately following the compare operation
* @param srcText the text to be compared
* @param srcStart the offset into <TT>srcText</TT> to start comparison
* @param srcLimit the offset into <TT>srcText</TT> to limit comparison
* @return The result of bitwise character comparison: 0 if <TT>srcText</TT>
* contains the same characters as this, -1 if the characters in
* <TT>srcText</TT> are bitwise less than the characters in this, +1 if the
* characters in <TT>srcText</TT> are bitwise greater than the characters
* in this.
* @stable
*/
inline int8_t compareBetween(int32_t start,
int32_t limit,
const UnicodeStringRef& srcText,
int32_t srcStart,
int32_t srcLimit) const;
/**
* Compare two Unicode strings in code point order.
* This is different in UTF-16 from how compare(), operator==, startsWith() etc. work
* if supplementary characters are present:
*
* In UTF-16, supplementary characters (with code points U+10000 and above) are
* stored with pairs of surrogate code units. These have values from 0xd800 to 0xdfff,
* which means that they compare as less than some other BMP characters like U+feff.
* This function compares Unicode strings in code point order.
* If either of the UTF-16 strings is malformed (i.e., it contains unpaired surrogates), then the result is not defined.
*
* @param text Another string to compare this one to.
* @return a negative/zero/positive integer corresponding to whether
* this string is less than/equal to/greater than the second one
* in code point order
*/
inline int8_t compareCodePointOrder(const UnicodeStringRef& text) const;
/**
* Compare two Unicode strings in code point order.
* This is different in UTF-16 from how compare(), operator==, startsWith() etc. work
* if supplementary characters are present:
*
* In UTF-16, supplementary characters (with code points U+10000 and above) are
* stored with pairs of surrogate code units. These have values from 0xd800 to 0xdfff,
* which means that they compare as less than some other BMP characters like U+feff.
* This function compares Unicode strings in code point order.
* If either of the UTF-16 strings is malformed (i.e., it contains unpaired surrogates), then the result is not defined.
*
* @param start The start offset in this string at which the compare operation begins.
* @param length The number of code units from this string to compare.
* @param srcText Another string to compare this one to.
* @return a negative/zero/positive integer corresponding to whether
* this string is less than/equal to/greater than the second one
* in code point order
*/
inline int8_t compareCodePointOrder(int32_t start,
int32_t length,
const UnicodeStringRef& srcText) const;
/**
* Compare two Unicode strings in code point order.
* This is different in UTF-16 from how compare(), operator==, startsWith() etc. work
* if supplementary characters are present:
*
* In UTF-16, supplementary characters (with code points U+10000 and above) are
* stored with pairs of surrogate code units. These have values from 0xd800 to 0xdfff,
* which means that they compare as less than some other BMP characters like U+feff.
* This function compares Unicode strings in code point order.
* If either of the UTF-16 strings is malformed (i.e., it contains unpaired surrogates), then the result is not defined.
*
* @param start The start offset in this string at which the compare operation begins.
* @param length The number of code units from this string to compare.
* @param srcText Another string to compare this one to.
* @param srcStart The start offset in that string at which the compare operation begins.
* @param srcLength The number of code units from that string to compare.
* @return a negative/zero/positive integer corresponding to whether
* this string is less than/equal to/greater than the second one
* in code point order
*/
inline int8_t compareCodePointOrder(int32_t start,
int32_t length,
const UnicodeStringRef& srcText,
int32_t srcStart,
int32_t srcLength) const;
/**
* Compare two Unicode strings in code point order.
* This is different in UTF-16 from how compare(), operator==, startsWith() etc. work
* if supplementary characters are present:
*
* In UTF-16, supplementary characters (with code points U+10000 and above) are
* stored with pairs of surrogate code units. These have values from 0xd800 to 0xdfff,
* which means that they compare as less than some other BMP characters like U+feff.
* This function compares Unicode strings in code point order.
* If either of the UTF-16 strings is malformed (i.e., it contains unpaired surrogates), then the result is not defined.
*
* @param srcChars A pointer to another string to compare this one to.
* @param srcLength The number of code units from that string to compare.
* @return a negative/zero/positive integer corresponding to whether
* this string is less than/equal to/greater than the second one
* in code point order
*/
inline int8_t compareCodePointOrder(UChar const *srcChars,
int32_t srcLength) const;
/**
* Compare two Unicode strings in code point order.
* This is different in UTF-16 from how compare(), operator==, startsWith() etc. work
* if supplementary characters are present:
*
* In UTF-16, supplementary characters (with code points U+10000 and above) are
* stored with pairs of surrogate code units. These have values from 0xd800 to 0xdfff,
* which means that they compare as less than some other BMP characters like U+feff.
* This function compares Unicode strings in code point order.
* If either of the UTF-16 strings is malformed (i.e., it contains unpaired surrogates), then the result is not defined.
*
* @param start The start offset in this string at which the compare operation begins.
* @param length The number of code units from this string to compare.
* @param srcChars A pointer to another string to compare this one to.
* @return a negative/zero/positive integer corresponding to whether
* this string is less than/equal to/greater than the second one
* in code point order
*/
inline int8_t compareCodePointOrder(int32_t start,
int32_t length,
UChar const *srcChars) const;
/**
* Compare two Unicode strings in code point order.
* This is different in UTF-16 from how compare(), operator==, startsWith() etc. work
* if supplementary characters are present:
*
* In UTF-16, supplementary characters (with code points U+10000 and above) are
* stored with pairs of surrogate code units. These have values from 0xd800 to 0xdfff,
* which means that they compare as less than some other BMP characters like U+feff.
* This function compares Unicode strings in code point order.
* If either of the UTF-16 strings is malformed (i.e., it contains unpaired surrogates), then the result is not defined.
*
* @param start The start offset in this string at which the compare operation begins.
* @param length The number of code units from this string to compare.
* @param srcChars A pointer to another string to compare this one to.
* @param srcStart The start offset in that string at which the compare operation begins.
* @param srcLength The number of code units from that string to compare.
* @return a negative/zero/positive integer corresponding to whether
* this string is less than/equal to/greater than the second one
* in code point order
*/
inline int8_t compareCodePointOrder(int32_t start,
int32_t length,
UChar const *srcChars,
int32_t srcStart,
int32_t srcLength) const;
/**
* Compare two Unicode strings in code point order.
* This is different in UTF-16 from how compare(), operator==, startsWith() etc. work
* if supplementary characters are present:
*
* In UTF-16, supplementary characters (with code points U+10000 and above) are
* stored with pairs of surrogate code units. These have values from 0xd800 to 0xdfff,
* which means that they compare as less than some other BMP characters like U+feff.
* This function compares Unicode strings in code point order.
* If either of the UTF-16 strings is malformed (i.e., it contains unpaired surrogates), then the result is not defined.
*
* @param start The start offset in this string at which the compare operation begins.
* @param limit The offset after the last code unit from this string to compare.
* @param srcText Another string to compare this one to.
* @param srcStart The start offset in that string at which the compare operation begins.
* @param srcLimit The offset after the last code unit from that string to compare.
* @return a negative/zero/positive integer corresponding to whether
* this string is less than/equal to/greater than the second one
* in code point order
*/
inline int8_t compareCodePointOrderBetween(int32_t start,
int32_t limit,
const UnicodeStringRef& srcText,
int32_t srcStart,
int32_t srcLimit) const;
/**
* Compare two strings case-insensitively using full case folding.
* This is equivalent to this->foldCase(options).compare(text.foldCase(options)).
*
* @param text Another string to compare this one to.
* @param options Either U_FOLD_CASE_DEFAULT or U_FOLD_CASE_EXCLUDE_SPECIAL_I
* @return A negative, zero, or positive integer indicating the comparison result.
*/
inline int8_t caseCompare(const UnicodeStringRef& text, uint32_t options) const;
/**
* Compare two strings case-insensitively using full case folding.
* This is equivalent to this->foldCase(options).compare(srcText.foldCase(options)).
*
* @param start The start offset in this string at which the compare operation begins.
* @param length The number of code units from this string to compare.
* @param srcText Another string to compare this one to.
* @param options Either U_FOLD_CASE_DEFAULT or U_FOLD_CASE_EXCLUDE_SPECIAL_I
* @return A negative, zero, or positive integer indicating the comparison result.
*/
inline int8_t caseCompare(int32_t start,
int32_t length,
const UnicodeStringRef& srcText,
uint32_t options) const;
/**
* Compare two strings case-insensitively using full case folding.
* This is equivalent to this->foldCase(options).compare(srcText.foldCase(options)).
*
* @param start The start offset in this string at which the compare operation begins.
* @param length The number of code units from this string to compare.
* @param srcText Another string to compare this one to.
* @param srcStart The start offset in that string at which the compare operation begins.
* @param srcLength The number of code units from that string to compare.
* @param options Either U_FOLD_CASE_DEFAULT or U_FOLD_CASE_EXCLUDE_SPECIAL_I
* @return A negative, zero, or positive integer indicating the comparison result.
*/
inline int8_t caseCompare(int32_t start,
int32_t length,
const UnicodeStringRef& srcText,
int32_t srcStart,
int32_t srcLength,
uint32_t options) const;
/**
* Compare two strings case-insensitively using full case folding.
* This is equivalent to this->foldCase(options).compare(srcChars.foldCase(options)).
*
* @param srcChars A pointer to another string to compare this one to.
* @param srcLength The number of code units from that string to compare.
* @param options Either U_FOLD_CASE_DEFAULT or U_FOLD_CASE_EXCLUDE_SPECIAL_I
* @return A negative, zero, or positive integer indicating the comparison result.
*/
inline int8_t caseCompare(UChar const *srcChars,
int32_t srcLength,
uint32_t options) const;
/**
* Compare two strings case-insensitively using full case folding.
* This is equivalent to this->foldCase(options).compare(srcChars.foldCase(options)).
*
* @param start The start offset in this string at which the compare operation begins.
* @param length The number of code units from this string to compare.
* @param srcChars A pointer to another string to compare this one to.
* @param options Either U_FOLD_CASE_DEFAULT or U_FOLD_CASE_EXCLUDE_SPECIAL_I
* @return A negative, zero, or positive integer indicating the comparison result.
*/
inline int8_t caseCompare(int32_t start,
int32_t length,
UChar const *srcChars,
uint32_t options) const;
/**
* Compare two strings case-insensitively using full case folding.
* This is equivalent to this->foldCase(options).compare(srcChars.foldCase(options)).
*
* @param start The start offset in this string at which the compare operation begins.
* @param length The number of code units from this string to compare.
* @param srcChars A pointer to another string to compare this one to.
* @param srcStart The start offset in that string at which the compare operation begins.
* @param srcLength The number of code units from that string to compare.
* @param options Either U_FOLD_CASE_DEFAULT or U_FOLD_CASE_EXCLUDE_SPECIAL_I
* @return A negative, zero, or positive integer indicating the comparison result.
*/
inline int8_t caseCompare(int32_t start,
int32_t length,
UChar const *srcChars,
int32_t srcStart,
int32_t srcLength,
uint32_t options) const;
/**
* Compare two strings case-insensitively using full case folding.
* This is equivalent to this->foldCase(options).compareBetween(srcText.foldCase(options)).
*
* @param start The start offset in this string at which the compare operation begins.
* @param limit The offset after the last code unit from this string to compare.
* @param srcText Another string to compare this one to.
* @param srcStart The start offset in that string at which the compare operation begins.
* @param srcLimit The offset after the last code unit from that string to compare.
* @param options Either U_FOLD_CASE_DEFAULT or U_FOLD_CASE_EXCLUDE_SPECIAL_I
* @return A negative, zero, or positive integer indicating the comparison result.
*/
inline int8_t caseCompareBetween(int32_t start,
int32_t limit,
const UnicodeStringRef& srcText,
int32_t srcStart,
int32_t srcLimit,
uint32_t options) const;
/**
* Determine if this starts with the characters in <TT>text</TT>
* @param text The text to match.
* @return TRUE if this starts with the characters in <TT>text</TT>,
* FALSE otherwise
* @stable
*/
inline bool startsWith(const UnicodeStringRef& text) const;
/**
* Determine if this starts with the characters in <TT>srcText</TT>
* in the range [<TT>srcStart</TT>, <TT>srcStart + srcLength</TT>).
* @param srcText The text to match.
* @param srcStart the offset into <TT>srcText</TT> to start matching
* @param srcLength the number of characters in <TT>srcText</TT> to match
* @return TRUE if this starts with the characters in the given range
* of <TT>srcText</TT>, FALSE otherwise
* @stable
*/
inline bool startsWith(const UnicodeStringRef& srcText,
int32_t srcStart,
int32_t srcLength) const;
/**
* Determine if this starts with the characters in <TT>srcChars</TT>
* @param srcChars The characters to match.
* @param srcLength the number of characters in <TT>srcChars</TT>
* @return TRUE if this starts with the characters in <TT>srcChars</TT>,
* FALSE otherwise
* @stable
*/
inline bool startsWith(UChar const *srcChars,
int32_t srcLength) const;
/**
* Determine if this starts with the characters in <TT>srcChars</TT>
* in the range [<TT>srcStart</TT>, <TT>srcStart + srcLength</TT>).
* @param srcChars The characters to match.
* @param srcStart the offset into <TT>srcChars</TT> to start matching
* @param srcLength the number of characters in <TT>srcChars</TT> to match
* @return TRUE if this starts with the characters in the given range
* of <TT>srcChars</TT>, FALSE otherwise
* @stable
*/
inline bool startsWith(UChar const *srcChars,
int32_t srcStart,
int32_t srcLength) const;
/**
* Determine if this ends with the characters in <TT>text</TT>
* @param text The text to match.
* @return TRUE if this ends with the characters in <TT>text</TT>,
* FALSE otherwise
* @stable
*/
inline bool endsWith(const UnicodeStringRef& text) const;
/**
* Determine if this ends with the characters in <TT>srcText</TT>
* in the range [<TT>srcStart</TT>, <TT>srcStart + srcLength</TT>).
* @param srcText The text to match.
* @param srcStart the offset into <TT>srcText</TT> to start matching
* @param srcLength the number of characters in <TT>srcText</TT> to match
* @return TRUE if this ends with the characters in the given range
* of <TT>srcText</TT>, FALSE otherwise
* @stable
*/
inline bool endsWith(const UnicodeStringRef& srcText,
int32_t srcStart,
int32_t srcLength) const;
/**
* Determine if this ends with the characters in <TT>srcChars</TT>
* @param srcChars The characters to match.
* @param srcLength the number of characters in <TT>srcChars</TT>
* @return TRUE if this ends with the characters in <TT>srcChars</TT>,
* FALSE otherwise
* @stable
*/
inline bool endsWith(UChar const *srcChars,
int32_t srcLength) const;
/**
* Determine if this ends with the characters in <TT>srcChars</TT>
* in the range [<TT>srcStart</TT>, <TT>srcStart + srcLength</TT>).
* @param srcChars The characters to match.
* @param srcStart the offset into <TT>srcChars</TT> to start matching
* @param srcLength the number of characters in <TT>srcChars</TT> to match
* @return TRUE if this ends with the characters in the given range
* of <TT>srcChars</TT>, FALSE otherwise
* @stable
*/
inline bool endsWith(UChar const *srcChars,
int32_t srcStart,
int32_t srcLength) const;
/* Searching - bitwise only */
/**
* Locate in this the first occurrence of the characters in <TT>text</TT>,
* using bitwise comparison.
* @param text The text to search for.
* @return The offset into this of the start of <TT>text</TT>,
* or -1 if not found.
* @stable
*/
inline int32_t indexOf(const UnicodeStringRef& text) const;
/**
* Locate in this the first occurrence of the characters in <TT>text</TT>
* starting at offset <TT>start</TT>, using bitwise comparison.
* @param text The text to search for.
* @param start The offset at which searching will start.
* @return The offset into this of the start of <TT>text</TT>,
* or -1 if not found.
* @stable
*/
inline int32_t indexOf(const UnicodeStringRef& text,
int32_t start) const;
/**
* Locate in this the first occurrence in the range
* [<TT>start</TT>, <TT>start + length</TT>) of the characters
* in <TT>text</TT>, using bitwise comparison.
* @param text The text to search for.
* @param start The offset at which searching will start.
* @param length The number of characters to search
* @return The offset into this of the start of <TT>text</TT>,
* or -1 if not found.
* @stable
*/
inline int32_t indexOf(const UnicodeStringRef& text,
int32_t start,
int32_t length) const;
/**
* Locate in this the first occurrence in the range
* [<TT>start</TT>, <TT>start + length</TT>) of the characters
* in <TT>srcText</TT> in the range
* [<TT>srcStart</TT>, <TT>srcStart + srcLength</TT>),
* using bitwise comparison.
* @param srcText The text to search for.
* @param srcStart the offset into <TT>srcText</TT> at which
* to start matching
* @param srcLength the number of characters in <TT>srcText</TT> to match
* @param start the offset into this at which to start matching
* @param length the number of characters in this to search
* @return The offset into this of the start of the given range
* of <TT>srcText</TT>, or -1 if not found.
* @stable
*/
inline int32_t indexOf(const UnicodeStringRef& srcText,
int32_t srcStart,
int32_t srcLength,
int32_t start,
int32_t length) const;
/**
* Locate in this the first occurrence of the characters in
* <TT>srcChars</TT>
* starting at offset <TT>start</TT>, using bitwise comparison.
* @param srcChars The text to search for.
* @param srcLength the number of characters in <TT>srcChars</TT> to match
* @param start the offset into this at which to start matching
* @return The offset into this of the start of <TT>srcChars</TT>,
* or -1 if not found.
* @stable
*/
inline int32_t indexOf(UChar const *srcChars,
int32_t srcLength,
int32_t start) const;
/**
* Locate in this the first occurrence in the range
* [<TT>start</TT>, <TT>start + length</TT>) of the characters
* in <TT>srcChars</TT>, using bitwise comparison.
* @param srcChars The text to search for.
* @param srcLength the number of characters in <TT>srcChars</TT>
* @param start The offset at which searching will start.
* @param length The number of characters to search
* @return The offset into this of the start of <TT>srcChars</TT>,
* or -1 if not found.
* @stable
*/
inline int32_t indexOf(UChar const *srcChars,
int32_t srcLength,
int32_t start,
int32_t length) const;
/**
* Locate in this the first occurrence in the range
* [<TT>start</TT>, <TT>start + length</TT>) of the characters
* in <TT>srcChars</TT> in the range
* [<TT>srcStart</TT>, <TT>srcStart + srcLength</TT>),
* using bitwise comparison.
* @param srcChars The text to search for.
* @param srcStart the offset into <TT>srcChars</TT> at which
* to start matching
* @param srcLength the number of characters in <TT>srcChars</TT> to match
* @param start the offset into this at which to start matching
* @param length the number of characters in this to search
* @return The offset into this of the start of the given range
* of <TT>srcChars</TT>, or -1 if not found.
* @stable
*/
int32_t indexOf(UChar const *srcChars,
int32_t srcStart,
int32_t srcLength,
int32_t start,
int32_t length) const;
/**
* Locate in this the first occurrence of the code unit <TT>c</TT>,
* using bitwise comparison.
* @param c The code unit to search for.
* @return The offset into this of <TT>c</TT>, or -1 if not found.
* @stable
*/
inline int32_t indexOf(UChar c) const;
/**
* Locate in this the first occurrence of the code point <TT>c</TT>,
* using bitwise comparison.
* @param c The code point to search for.
* @return The offset into this of <TT>c</TT>, or -1 if not found.
* @stable
*/
inline int32_t indexOf(UChar32 c) const;
/**
* Locate in this the first occurrence of the code unit <TT>c</TT>
* starting at offset <TT>start</TT>, using bitwise comparison.
* @param c The code unit to search for.
* @param start The offset at which searching will start.
* @return The offset into this of <TT>c</TT>, or -1 if not found.
* @stable
*/
inline int32_t indexOf(UChar c,
int32_t start) const;
/**
* Locate in this the first occurrence of the code point <TT>c</TT>
* starting at offset <TT>start</TT>, using bitwise comparison.
* @param c The code point to search for.
* @param start The offset at which searching will start.
* @return The offset into this of <TT>c</TT>, or -1 if not found.
* @stable
*/
inline int32_t indexOf(UChar32 c,
int32_t start) const;
/**
* Locate in this the first occurrence of the code unit <TT>c</TT>
* in the range [<TT>start</TT>, <TT>start + length</TT>),
* using bitwise comparison.
* @param c The code unit to search for.
* @param start the offset into this at which to start matching
* @param length the number of characters in this to search
* @return The offset into this of <TT>c</TT>, or -1 if not found.
* @stable
*/
inline int32_t indexOf(UChar c,
int32_t start,
int32_t length) const;
/**
* Locate in this the first occurrence of the code point <TT>c</TT>
* in the range [<TT>start</TT>, <TT>start + length</TT>),
* using bitwise comparison.
* @param c The code point to search for.
* @param start the offset into this at which to start matching
* @param length the number of characters in this to search
* @return The offset into this of <TT>c</TT>, or -1 if not found.
* @stable
*/
inline int32_t indexOf(UChar32 c,
int32_t start,
int32_t length) const;
/**
* Locate in this the last occurrence of the characters in <TT>text</TT>,
* using bitwise comparison.
* @param text The text to search for.
* @return The offset into this of the start of <TT>text</TT>,
* or -1 if not found.
* @stable
*/
inline int32_t lastIndexOf(const UnicodeStringRef& text) const;
/**
* Locate in this the last occurrence of the characters in <TT>text</TT>
* starting at offset <TT>start</TT>, using bitwise comparison.
* @param text The text to search for.
* @param start The offset at which searching will start.
* @return The offset into this of the start of <TT>text</TT>,
* or -1 if not found.
* @stable
*/
inline int32_t lastIndexOf(const UnicodeStringRef& text,
int32_t start) const;
/**
* Locate in this the last occurrence in the range
* [<TT>start</TT>, <TT>start + length</TT>) of the characters
* in <TT>text</TT>, using bitwise comparison.
* @param text The text to search for.
* @param start The offset at which searching will start.
* @param length The number of characters to search
* @return The offset into this of the start of <TT>text</TT>,
* or -1 if not found.
* @stable
*/
inline int32_t lastIndexOf(const UnicodeStringRef& text,
int32_t start,
int32_t length) const;
/**
* Locate in this the last occurrence in the range
* [<TT>start</TT>, <TT>start + length</TT>) of the characters
* in <TT>srcText</TT> in the range
* [<TT>srcStart</TT>, <TT>srcStart + srcLength</TT>),
* using bitwise comparison.
* @param srcText The text to search for.
* @param srcStart the offset into <TT>srcText</TT> at which
* to start matching
* @param srcLength the number of characters in <TT>srcText</TT> to match
* @param start the offset into this at which to start matching
* @param length the number of characters in this to search
* @return The offset into this of the start of the matched range of
* <TT>srcText</TT>, or -1 if not found.
* @stable
*/
inline int32_t lastIndexOf(const UnicodeStringRef& srcText,
int32_t srcStart,
int32_t srcLength,
int32_t start,
int32_t length) const;
/**
* Locate in this the last occurrence of the characters in <TT>srcChars</TT>
* starting at offset <TT>start</TT>, using bitwise comparison.
* @param srcChars The text to search for.
* @param srcLength the number of characters in <TT>srcChars</TT> to match
* @param start the offset into this at which to start matching
* @return The offset into this of the start of <TT>srcChars</TT>,
* or -1 if not found.
* @stable
*/
inline int32_t lastIndexOf(UChar const *srcChars,
int32_t srcLength,
int32_t start) const;
/**
* Locate in this the last occurrence in the range
* [<TT>start</TT>, <TT>start + length</TT>) of the characters
* in <TT>srcChars</TT>, using bitwise comparison.
* @param srcChars The text to search for.
* @param srcLength the number of characters in <TT>srcChars</TT>
* @param start The offset at which searching will start.
* @param length The number of characters to search
* @return The offset into this of the start of <TT>srcChars</TT>,
* or -1 if not found.
* @stable
*/
inline int32_t lastIndexOf(UChar const *srcChars,
int32_t srcLength,
int32_t start,
int32_t length) const;
/**
* Locate in this the last occurrence in the range
* [<TT>start</TT>, <TT>start + length</TT>) of the characters
* in <TT>srcChars</TT> in the range
* [<TT>srcStart</TT>, <TT>srcStart + srcLength</TT>),
* using bitwise comparison.
* @param srcChars The text to search for.
* @param srcStart the offset into <TT>srcChars</TT> at which
* to start matching
* @param srcLength the number of characters in <TT>srcChars</TT> to match
* @param start the offset into this at which to start matching
* @param length the number of characters in this to search
* @return The offset into this of the start of the matched range of
* <TT>srcChars</TT>, or -1 if not found.
* @stable
*/
int32_t lastIndexOf(UChar const *srcChars,
int32_t srcStart,
int32_t srcLength,
int32_t start,
int32_t length) const;
/**
* Locate in this the last occurrence of the code unit <TT>c</TT>,
* using bitwise comparison.
* @param c The code unit to search for.
* @return The offset into this of <TT>c</TT>, or -1 if not found.
* @stable
*/
inline int32_t lastIndexOf(UChar c) const;
/**
* Locate in this the last occurrence of the code point <TT>c</TT>,
* using bitwise comparison.
* @param c The code point to search for.
* @return The offset into this of <TT>c</TT>, or -1 if not found.
* @stable
*/
inline int32_t lastIndexOf(UChar32 c) const;
/**
* Locate in this the last occurrence of the code unit <TT>c</TT>
* starting at offset <TT>start</TT>, using bitwise comparison.
* @param c The code unit to search for.
* @param start The offset at which searching will start.
* @return The offset into this of <TT>c</TT>, or -1 if not found.
* @stable
*/
inline int32_t lastIndexOf(UChar c,
int32_t start) const;
/**
* Locate in this the last occurrence of the code point <TT>c</TT>
* starting at offset <TT>start</TT>, using bitwise comparison.
* @param c The code point to search for.
* @param start The offset at which searching will start.
* @return The offset into this of <TT>c</TT>, or -1 if not found.
* @stable
*/
inline int32_t lastIndexOf(UChar32 c,
int32_t start) const;
/**
* Locate in this the last occurrence of the code unit <TT>c</TT>
* in the range [<TT>start</TT>, <TT>start + length</TT>),
* using bitwise comparison.
* @param c The code unit to search for.
* @param start the offset into this at which to start matching
* @param length the number of characters in this to search
* @return The offset into this of <TT>c</TT>, or -1 if not found.
* @stable
*/
inline int32_t lastIndexOf(UChar c,
int32_t start,
int32_t length) const;
/**
* Locate in this the last occurrence of the code point <TT>c</TT>
* in the range [<TT>start</TT>, <TT>start + length</TT>),
* using bitwise comparison.
* @param c The code point to search for.
* @param start the offset into this at which to start matching
* @param length the number of characters in this to search
* @return The offset into this of <TT>c</TT>, or -1 if not found.
* @stable
*/
inline int32_t lastIndexOf(UChar32 c,
int32_t start,
int32_t length) const;
/* Character access */
/**
* Return the code unit at offset <tt>offset</tt>.
* @param offset a valid offset into the text
* @returns the code unit at offset <tt>offset</tt>
* @stable
*/
inline UChar charAt(int32_t offset) const;
/**
* Return the code unit at offset <tt>offset</tt>.
* @param offset a valid offset into the text
* @returns the code unit at offset <tt>offset</tt>
* @stable
*/
inline UChar operator [] (int32_t offset) const;
/**
* Return the code point that contains the code unit
* at offset <tt>offset</tt>.
* @param offset a valid offset into the text
* that indicates the text offset of any of the code units
* that will be assembled into a code point (21-bit value) and returned
* @returns the code point of text at <tt>offset</tt>
* @stable
*/
inline UChar32 char32At(int32_t offset) const;
/**
* Adjust a random-access offset so that
* it points to the beginning of a Unicode character.
* The offset that is passed in points to
* any code unit of a code point,
* while the returned offset will point to the first code unit
* of the same code point.
* In UTF-16, if the input offset points to a second surrogate
* of a surrogate pair, then the returned offset will point
* to the first surrogate.
* @param offset a valid offset into one code point of the text
* @return offset of the first code unit of the same code point
*/
inline int32_t getChar32Start(int32_t offset) const;
/**
* Adjust a random-access offset so that
* it points behind a Unicode character.
* The offset that is passed in points behind
* any code unit of a code point,
* while the returned offset will point behind the last code unit
* of the same code point.
* In UTF-16, if the input offset points behind the first surrogate
* (i.e., to the second surrogate)
* of a surrogate pair, then the returned offset will point
* behind the second surrogate (i.e., to the first code unit after the pair).
* @param offset a valid offset after any code unit of a code point of the text
* @return offset of the first code unit after the same code point
*/
inline int32_t getChar32Limit(int32_t offset) const;
/**
* Move the code unit index along the string by delta code points.
* Interpret the input index as a code unit-based offset into the string,
* move the index forward or backward by delta code points, and
* return the resulting index.
* The input index should point to the first code unit of a code point,
* if there is more than one.
*
* Both input and output indexes are code unit-based as for all
* string indexes/offsets in ICU (and other libraries, like MBCS char*).
* If delta<0 then the index is moved backward (toward the start of the string).
* If delta>0 then the index is moved forward (toward the end of the string).
*
* This behaves like CharacterIterator::move32(delta, kCurrent).
*
* Examples:
* <code>
* // s has code points 'a' U+10000 'b' U+10ffff U+2029
* UnicodeStringRef s=UNICODE_STRING("a\\U00010000b\\U0010ffff\\u2029", 31).unescape();
*
* // initial index: position of U+10000
* int32_t index=1;
*
* // the following examples will all result in index==4, position of U+10ffff
*
* // skip 2 code points from some position in the string
* index=s.moveIndex32(index, 2); // skips U+10000 and 'b'
*
* // go to the 3rd code point from the start of s (0-based)
* index=s.moveIndex32(0, 3); // skips 'a', U+10000, and 'b'
*
* // go to the next-to-last code point of s
*
* index=s.moveIndex32(s.length(), -2); // backward-skips U+2029 and U+10ffff
* </code>
*
* @param index input code unit index
* @param delta (signed) code point count to move the index forward or backward
* in the string
* @return the resulting code unit index
*/
int32_t moveIndex32(int32_t index, int32_t delta) const;
/* Substring extraction without conversion */
/**
* Copy the characters in the range
* [<tt>start</tt>, <tt>start + length</tt>) into the array <tt>dst</tt>,
* beginning at <tt>dstStart</tt>.
* If the string aliases to <code>dst</code> itself as an external buffer,
* then extract() will not copy the contents.
*
* @param start offset of first character which will be copied into the array
* @param length the number of characters to extract
* @param dst array in which to copy characters. The length of <tt>dst</tt>
* must be at least (<tt>dstStart + length</tt>).
* @param dstStart the offset in <TT>dst</TT> where the first character
* will be extracted
* @stable
*/
inline void extract(int32_t start,
int32_t length,
UChar *dst,
int32_t dstStart = 0) const;
/**
* Copy the characters in the range [<tt>start</tt>, <tt>limit</tt>)
* into the array <tt>dst</tt>, beginning at <tt>dstStart</tt>.
* @param start offset of first character which will be copied into the array
* @param limit offset immediately following the last character to be copied
* @param dst array in which to copy characters. The length of <tt>dst</tt>
* must be at least (<tt>dstStart + (limit - start)</tt>).
* @param dstStart the offset in <TT>dst</TT> where the first character
* will be extracted
* @stable
*/
inline void extractBetween(int32_t start,
int32_t limit,
UChar *dst,
int32_t dstStart = 0) const;
/**
* Copy the contents of the string into dst.
* This is a convenience function that
* checks if there is enough space in dst,
* extracts the entire string if possible,
* and NUL-terminates dst if possible.
*
* If the string fits into dst but cannot be NUL-terminated
* (length()==dstCapacity) then the error code is set to U_STRING_NOT_TERMINATED_WARNING.
* If the string itself does not fit into dst
* (length()>dstCapacity) then the error code is set to U_BUFFER_OVERFLOW_ERROR.
*
* If the string aliases to <code>dst</code> itself as an external buffer,
* then extract() will not copy the contents.
*
* @param dst Destination string buffer.
* @param dstCapacity Number of UChars available at dst.
* @param errorCode ICU error code.
* @return length()
*/
int32_t
extract(UChar *dst, int32_t dstCapacity,
UErrorCode &errorCode) const;
/**
* Copy the characters in the range
* [<tt>start</tt>, <tt>start + length</tt>) into the UnicodeString
* <tt>dst</tt>.
* @param start offset of first character which will be copied
* @param length the number of characters to extract
* @param dst UnicodeString into which to copy characters.
* Nothing is returned (the method is declared <TT>void</TT>); the result is placed in <TT>dst</TT>.
* @stable
*/
inline void extract(int32_t start,
int32_t length,
icu::UnicodeString& dst) const;
/**
* Copy the characters in the range [<tt>start</tt>, <tt>limit</tt>)
* into the UnicodeString <tt>dst</tt>.
* @param start offset of first character which will be copied
* @param limit offset immediately following the last character to be copied
* @param dst UnicodeString into which to copy characters.
* Nothing is returned (the method is declared <TT>void</TT>); the result is placed in <TT>dst</TT>.
* @stable
*/
inline void extractBetween(int32_t start,
int32_t limit,
icu::UnicodeString& dst) const;
/* Substring extraction with conversion */
/**
* Copy the characters in the range
* [<tt>start</TT>, <tt>start + length</TT>) into an array of characters
* in a specified codepage.
* The output string is NUL-terminated.
*
* @param start offset of first character which will be copied
* @param startLength the number of characters to extract
* @param target the target buffer for extraction
* @param codepage the desired codepage for the characters. 0 has
* the special meaning of the default codepage
* If <code>codepage</code> is an empty string (<code>""</code>),
* then a simple conversion is performed on the codepage-invariant
* subset ("invariant characters") of the platform encoding. See utypes.h.
* If <TT>target</TT> is NULL, then the number of bytes required for
* <TT>target</TT> is returned.
* NOTE: It is assumed that the target is big enough to fit all of the characters.
* @return the output string length, not including the terminating NUL
* @stable
*/
inline int32_t extract(int32_t start,
int32_t startLength,
char *target,
const char *codepage = 0) const;
/**
* Copy the characters in the range
* [<tt>start</TT>, <tt>start + length</TT>) into an array of characters
* in a specified codepage.
* This function does not write any more than <code>targetLength</code>
* characters but returns the length of the entire output string
* so that one can allocate a larger buffer and call the function again
* if necessary.
* The output string is NUL-terminated if possible.
*
* @param start offset of first character which will be copied
* @param startLength the number of characters to extract
* @param target the target buffer for extraction
* @param targetLength the length of the target buffer
* @param codepage the desired codepage for the characters. 0 has
* the special meaning of the default codepage
* If <code>codepage</code> is an empty string (<code>""</code>),
* then a simple conversion is performed on the codepage-invariant
* subset ("invariant characters") of the platform encoding. See utypes.h.
* If <TT>target</TT> is NULL, then the number of bytes required for
* <TT>target</TT> is returned.
* @return the output string length, not including the terminating NUL
* @stable
*/
int32_t extract(int32_t start,
int32_t startLength,
char *target,
uint32_t targetLength,
const char *codepage = 0) const;
/**
* Convert the UnicodeStringRef into a codepage string using an existing UConverter.
* The output string is NUL-terminated if possible.
*
* This function avoids the overhead of opening and closing a converter if
* multiple strings are extracted.
*
* @param target destination string buffer, can be NULL if targetCapacity==0
* @param targetCapacity the number of chars available at target
* @param cnv the converter object to be used (ucnv_resetFromUnicode() will be called),
* or NULL for the default converter
* @param errorCode normal ICU error code
* @return the length of the output string, not counting the terminating NUL;
* if the length is greater than targetCapacity, then the string will not fit
* and a buffer of the indicated length would need to be passed in
* @stable
*/
int32_t extract(char *target, int32_t targetCapacity,
UConverter *cnv,
UErrorCode &errorCode) const;
/**
* Copy the characters in the range
* [<tt>start</TT>, <tt>start + length</TT>) into a std::string object
* in a specified codepage.
* The output string is NUL-terminated.
*
* @param start offset of first character which will be copied
* @param startLength the number of characters to extract
* @param target the target string for extraction
* @param codepage the desired codepage for the characters. 0 has
* the special meaning of the default codepage.
* If <code>codepage</code> is an empty string (<code>""</code>),
* then a simple conversion is performed on the codepage-invariant
* subset ("invariant characters") of the platform encoding. See utypes.h.
* @return the output string length, not including the terminating NUL
* @stable
*/
int32_t extract(int32_t start,
int32_t startLength,
std::string & target,
const char *codepage = 0) const;
/**
* Copy all the characters in the string into an std::string object
* in a specified codepage. Equivalent to
* extract(0, length(), target, codepage)
*
* @param target the target string for extraction
* @param codepage the desired codepage for the characters.
* @return the output string length, not including the terminating NUL
* @stable
*/
inline int32_t extract(std::string & target,
const char *codepage = 0) const;
/**
* Copy all the characters in the string into an std::string object
* in UTF-8. Slightly more efficient than asUTF8() as avoids
* one copy.
*
* @param target the target string for extraction
* @return the output string length, not including the terminating NUL
*/
int32_t extractUTF8(std::string & target) const;
/**
* Convert to a UTF8 string
* @return a std::string
*/
inline std::string asUTF8(void) const;
/**
* Release contents of string container allocated by extract methods
* Useful when caller and callee use different heaps,
* e.g. when debug code uses a release library.
* Is static so can be called on the <TT>UnicodeStringRef</TT> class directly.
*/
static void release(std::string & target);
/* Length operations */
/**
* Return the length of the UnicodeStringRef object.
* The length is the number of characters in the text.
* @returns the length of the UnicodeStringRef object
* @stable
*/
inline int32_t length(void) const;
/**
* Count Unicode code points in the length UChar code units of the string.
* A code point may occupy either one or two UChar code units.
* Counting code points involves reading all code units.
*
* This function is basically the inverse of moveIndex32().
*
* @param start the index of the first code unit to check
* @param length the number of UChar code units to check
* @return the number of code points in the specified code units
*/
int32_t
countChar32(int32_t start=0, int32_t length=0x7fffffff) const;
/**
* Determine if this string is empty.
* @return TRUE if this string contains 0 characters, FALSE otherwise.
*/
inline bool isEmpty(void) const;
/**
* Set the text in the UnicodeStringRef object to the characters in
* <TT>srcText</TT>.
* <TT>srcText</TT> is not modified.
* @param srcText the source for the new characters
* @return a reference to this
* @stable
*/
inline UnicodeStringRef& setTo(const UnicodeStringRef& srcText);
/**
* Set the text in the UnicodeStringRef object to the characters in
* <TT>srcText</TT>.
* <TT>srcText</TT> is not modified.
* @param srcText the source for the new characters
* @return a reference to this
* @stable
*/
inline UnicodeStringRef& setTo(const icu::UnicodeString& srcText);
/**
* Set the characters in the UnicodeStringRef object to the characters
* in <TT>srcChars</TT>. <TT>srcChars</TT> is not modified.
* @param srcChars the source for the new characters
* @param srcLength the number of Unicode characters in srcChars.
* @return a reference to this
* @stable
*/
inline UnicodeStringRef& setTo(const UChar *srcChars,
int32_t srcLength);
/**
* Print a single-byte version to outStream.
* The encoding is UTF-8 if outStream is directed to disk;
* if outStream is cout or cerr, the encoding is a Console-CCSID
* that will allow most characters to be readable in a shell/command window.
*/
void toSingleByteStream(std::ostream & outStream) const;
private:
/* --- functions -------------------------------------------------------- */
inline int8_t
doCompare( int32_t start,
int32_t length,
const UnicodeStringRef& srcText,
int32_t srcStart,
int32_t srcLength) const;
int8_t
doCompare( int32_t start,
int32_t length,
const UChar *srcChars,
int32_t srcStart,
int32_t srcLength) const;
inline int8_t
doCompareCodePointOrder(int32_t start,
int32_t length,
const UnicodeStringRef& srcText,
int32_t srcStart,
int32_t srcLength) const;
int8_t
doCompareCodePointOrder(int32_t start,
int32_t length,
const UChar *srcChars,
int32_t srcStart,
int32_t srcLength) const;
inline int8_t
doCaseCompare(int32_t start,
int32_t length,
const UnicodeStringRef& srcText,
int32_t srcStart,
int32_t srcLength,
uint32_t options) const;
int8_t
doCaseCompare(int32_t start,
int32_t length,
const UChar *srcChars,
int32_t srcStart,
int32_t srcLength,
uint32_t options) const;
int32_t doIndexOf(UChar c,
int32_t start,
int32_t length) const;
int32_t doLastIndexOf(UChar c,
int32_t start,
int32_t length) const;
inline void doExtract(int32_t start,
int32_t length,
UChar *dst,
int32_t dstStart) const;
inline void doExtract(int32_t start,
int32_t length,
icu::UnicodeString& dst) const;
inline void
pinIndices(int32_t& start,
int32_t& length) const;
// constants
enum {
kInvalidUChar=0xffff // invalid UChar index
};
/* --- variables -------------------------------------------------------- */
UChar const * iv_pUChars;
int32_t iv_uiLength;
}
; // class UnicodeStringRef
/// Output stream support for UnicodeStringRef (Note: inside namespace).
/// Declaration only; the definition is not part of this section of the header.
/// @param outStream  the stream to write to
/// @param crUStrRef  the string reference to write
/// @return a std::ostream reference (presumably outStream, to allow
///         chaining -- TODO confirm against the definition)
UIMA_LINK_IMPORTSPEC std::ostream &
operator << (
std::ostream & outStream,
const uima::UnicodeStringRef & crUStrRef
);
} // namespace uima
/* ----------------------------------------------------------------------- */
/* Implementation UnicodeStringRef */
/* ----------------------------------------------------------------------- */
namespace uima {
/** Default constructor: an empty reference (NULL buffer pointer, length 0). */
inline
UnicodeStringRef::UnicodeStringRef( void )
    : iv_pUChars(NULL),
      iv_uiLength(0)
{
}
/**
 * Construct a reference onto the contents of an ICU UnicodeString.
 * Only the pointer returned by getBuffer() and the value of length()
 * are stored; no characters are copied here.
 */
inline
UnicodeStringRef::UnicodeStringRef(const icu::UnicodeString & crUniString)
    : iv_pUChars(crUniString.getBuffer()),
      iv_uiLength(crUniString.length())
{
}
/**
 * Construct from a zero-terminated UChar string.
 * The length is determined with u_strlen(); a NULL pointer yields length 0.
 */
inline
UnicodeStringRef::UnicodeStringRef(UChar const * cpacString)
    : iv_pUChars(cpacString),
      iv_uiLength(cpacString==NULL ? 0 : u_strlen(cpacString))
{
  // invariant: either a buffer exists, or the pointer is NULL and the length 0
  assert( (EXISTS(iv_pUChars) )
          || ((iv_pUChars == NULL ) && (iv_uiLength == 0)) );
}
/**
 * Construct from a buffer pointer and an explicit length.
 * Both values are stored as given; nothing is copied.
 */
inline
UnicodeStringRef::UnicodeStringRef(UChar const * cpacString, int32_t uiLength)
    : iv_pUChars(cpacString),
      iv_uiLength(uiLength)
{
  // invariant: either a buffer exists, or the pointer is NULL and the length 0
  assert( (EXISTS(iv_pUChars) )
          || ((iv_pUChars == NULL ) && (iv_uiLength == 0)) );
}
/**
 * Construct from a [begin, end) pointer pair.
 * The length is the pointer difference end - begin.
 */
inline
UnicodeStringRef::UnicodeStringRef(UChar const * paucStringBegin,
                                   UChar const * paucStringEnd)
    : iv_pUChars(paucStringBegin),
      iv_uiLength(static_cast<int32_t>(paucStringEnd - paucStringBegin))
{
  // both ends must be valid and correctly ordered
  assert(EXISTS(paucStringBegin));
  assert(EXISTS(paucStringEnd));
  assert(paucStringEnd >= paucStringBegin);
  // invariant: either a buffer exists, or the pointer is NULL and the length 0
  assert( (EXISTS(iv_pUChars) )
          || ((iv_pUChars == NULL) && (iv_uiLength == 0)) );
}
inline int32_t
UnicodeStringRef::length( void ) const {
return iv_uiLength;
}
/** @return the stored length multiplied by sizeof(UChar). */
inline int32_t
UnicodeStringRef::getSizeInBytes( void ) const
{
  // the product is computed in size_t and then narrowed to the return type
  return static_cast<int32_t>(iv_uiLength * sizeof(UChar));
}
/**
 * Return the code unit at index uiIndex.
 * The index is not checked in release builds; debug builds assert that it
 * lies in [0, length()) and that a buffer exists.
 * @param uiIndex a valid offset into the text
 * @return the code unit stored at uiIndex
 */
inline UChar
UnicodeStringRef::operator[]( int32_t uiIndex ) const {
  // the index is signed, so the lower bound must be checked as well
  assert(uiIndex >= 0);
  assert(uiIndex < iv_uiLength);
  assert(EXISTS(iv_pUChars));
  return iv_pUChars[uiIndex]; //lint !e613: Possible use of null pointer 'UnicodeStringRef<wchar_t>::iv_pUChars' in left argument to operator '['
}
/**
 * Equality test: non-zero when both references have the same length and
 * identical code units, zero otherwise.
 *
 * The comparison uses u_memcmp(), which compares exactly iv_uiLength code
 * units. u_strncmp() must not be used here because it stops at the first
 * NUL code unit, so two strings that differ only after an embedded U+0000
 * would wrongly compare equal.
 * @param crclRHS the string to compare with
 * @return non-zero if equal, 0 otherwise
 */
inline int
UnicodeStringRef::operator==( const UnicodeStringRef & crclRHS ) const {
  if (iv_uiLength != crclRHS.iv_uiLength) {
    return false;
  }
  // empty strings and aliases of the same buffer need no character comparison
  // (this also avoids handing NULL buffers to the ICU routine)
  if (iv_uiLength == 0 || iv_pUChars == crclRHS.iv_pUChars) {
    return true;
  }
  return u_memcmp(iv_pUChars, crclRHS.iv_pUChars, iv_uiLength) == 0;
}
/** Inequality test: the logical negation of operator==. */
inline int
UnicodeStringRef::operator!=( const UnicodeStringRef & crclRHS ) const
{
  return !operator==(crclRHS);
}
/**
 * Assignment: take over the buffer pointer and length of crclRHS.
 * Only the two members are copied, not the characters.
 * @return a reference to this
 */
inline UnicodeStringRef &
UnicodeStringRef::operator=( UnicodeStringRef const & crclRHS )
{
  iv_uiLength = crclRHS.iv_uiLength;
  iv_pUChars  = crclRHS.iv_pUChars;
  return *this;
}
//========================================
// Read-only alias methods
//========================================
/**
 * Clamp a (start, length) pair in place so that it describes a range lying
 * completely inside this string.
 *
 * start is forced into [0, iv_uiLength] and length into
 * [0, iv_uiLength - start]. Negative values are clamped too, so that callers
 * never index in front of the buffer and the subtraction below cannot
 * overflow for a very negative start.
 * @param start  in/out: offset of the first code unit of the range
 * @param length in/out: number of code units in the range
 */
inline void
UnicodeStringRef::pinIndices(int32_t& start,
                             int32_t& length) const {
  // pin start
  if (start < 0) {
    start = 0;
  } else if (start > iv_uiLength) {
    start = iv_uiLength;
  }
  // pin length against what is left after start
  const int32_t remaining = iv_uiLength - start;
  if (length < 0) {
    length = 0;
  } else if (length > remaining) {
    length = remaining;
  }
}
/** True when doCompare() of the two complete strings yields 1. */
inline bool
UnicodeStringRef::operator> (const UnicodeStringRef& text) const
{
  const int8_t order = doCompare(0, iv_uiLength, text, 0, text.iv_uiLength);
  return (order == 1);
}
/** True when doCompare() of the two complete strings yields -1. */
inline bool
UnicodeStringRef::operator< (const UnicodeStringRef& text) const
{
  const int8_t order = doCompare(0, iv_uiLength, text, 0, text.iv_uiLength);
  return (order == -1);
}
/** True unless doCompare() of the two complete strings yields -1. */
inline bool
UnicodeStringRef::operator>= (const UnicodeStringRef& text) const
{
  const int8_t order = doCompare(0, iv_uiLength, text, 0, text.iv_uiLength);
  const bool isLess = (order == -1);
  return !isLess;
}
/** True unless doCompare() of the two complete strings yields 1. */
inline bool
UnicodeStringRef::operator<= (const UnicodeStringRef& text) const
{
  const int8_t order = doCompare(0, iv_uiLength, text, 0, text.iv_uiLength);
  const bool isGreater = (order == 1);
  return !isGreater;
}
/** Compare this complete string with the complete string text via doCompare(). */
inline int8_t
UnicodeStringRef::compare(const UnicodeStringRef& text) const
{
  const int32_t thisLength = iv_uiLength;
  const int32_t textLength = text.iv_uiLength;
  return doCompare(0, thisLength, text, 0, textLength);
}
/**
 * Compare the range (start, length) of this string with all of srcText
 * via doCompare().
 */
inline int8_t
UnicodeStringRef::compare(int32_t start,
                          int32_t length,
                          const UnicodeStringRef& srcText) const
{
  const int32_t wholeSrcLength = srcText.iv_uiLength;
  return doCompare(start, length, srcText, 0, wholeSrcLength);
}
/**
 * Compare this complete string with the first srcLength code units of
 * srcChars via doCompare().
 */
inline int8_t
UnicodeStringRef::compare(const UChar *srcChars,
                          int32_t srcLength) const
{
  const int32_t thisLength = iv_uiLength;
  return doCompare(0, thisLength, srcChars, 0, srcLength);
}
/**
 * Compare this complete string with the contents of an ICU UnicodeString,
 * using its getBuffer() pointer and length(), via doCompare().
 */
inline int8_t
UnicodeStringRef::compare(icu::UnicodeString const &src ) const
{
  const int32_t thisLength = iv_uiLength;
  return doCompare(0, thisLength, src.getBuffer(), 0, src.length());
}
/**
 * Compare the range (start, length) of this string with srcChars via
 * doCompare(). The same length value is used as the length of the
 * srcChars range, which starts at offset 0.
 */
inline int8_t
UnicodeStringRef::compare(int32_t start,
                          int32_t length,
                          const UChar *srcChars) const
{
  const int32_t srcLength = length;   // both ranges share one length
  return doCompare(start, length, srcChars, 0, srcLength);
}
/**
 * Compare the range (start, length) of this string with the range
 * (srcStart, srcLength) of srcChars. Pure forwarder to doCompare().
 */
inline int8_t
UnicodeStringRef::compare(int32_t start,
                          int32_t length,
                          const UChar *srcChars,
                          int32_t srcStart,
                          int32_t srcLength) const
{
  const int8_t order = doCompare(start, length, srcChars, srcStart, srcLength);
  return order;
}
/**
 * Compare the range (start, length) of this string with the range
 * (srcStart, srcLength) of srcText. Pure forwarder to doCompare().
 */
inline int8_t
UnicodeStringRef::compare(int32_t start,
                          int32_t length,
                          const UnicodeStringRef& srcText,
                          int32_t srcStart,
                          int32_t srcLength) const
{
  const int8_t order = doCompare(start, length, srcText, srcStart, srcLength);
  return order;
}
/**
 * Compare [start, limit) of this string with [srcStart, srcLimit) of srcText.
 * The limits are converted to lengths and handed to doCompare().
 */
inline int8_t
UnicodeStringRef::compareBetween(int32_t start,
                                 int32_t limit,
                                 const UnicodeStringRef& srcText,
                                 int32_t srcStart,
                                 int32_t srcLimit) const
{
  const int32_t thisSpan = limit - start;
  const int32_t srcSpan  = srcLimit - srcStart;
  return doCompare(start, thisSpan, srcText, srcStart, srcSpan);
}
inline int8_t
UnicodeStringRef::doCompare(int32_t start,
int32_t length,
const UnicodeStringRef& srcText,
int32_t srcStart,
int32_t srcLength) const {
const UChar *srcChars = srcText.getBuffer();
return doCompare(start, length, srcChars, srcStart, srcLength);
}
/**
 * Compare this complete string with the complete string text via
 * doCompareCodePointOrder().
 */
inline int8_t
UnicodeStringRef::compareCodePointOrder(const UnicodeStringRef& text) const
{
  const int32_t thisLength = iv_uiLength;
  const int32_t textLength = text.iv_uiLength;
  return doCompareCodePointOrder(0, thisLength, text, 0, textLength);
}
/**
 * Compare the range (start, length) of this string with all of srcText
 * via doCompareCodePointOrder().
 */
inline int8_t
UnicodeStringRef::compareCodePointOrder(int32_t start,
                                        int32_t length,
                                        const UnicodeStringRef& srcText) const
{
  const int32_t wholeSrcLength = srcText.iv_uiLength;
  return doCompareCodePointOrder(start, length, srcText, 0, wholeSrcLength);
}
/**
 * Compare this complete string with the first srcLength code units of
 * srcChars via doCompareCodePointOrder().
 */
inline int8_t
UnicodeStringRef::compareCodePointOrder(const UChar *srcChars,
                                        int32_t srcLength) const
{
  const int32_t thisLength = iv_uiLength;
  return doCompareCodePointOrder(0, thisLength, srcChars, 0, srcLength);
}
/**
 * Compare the range (start, length) of this string with the range
 * (srcStart, srcLength) of srcText. Pure forwarder to
 * doCompareCodePointOrder().
 */
inline int8_t
UnicodeStringRef::compareCodePointOrder(int32_t start,
                                        int32_t length,
                                        const UnicodeStringRef& srcText,
                                        int32_t srcStart,
                                        int32_t srcLength) const
{
  const int8_t order =
    doCompareCodePointOrder(start, length, srcText, srcStart, srcLength);
  return order;
}
/**
 * Compare the range (start, length) of this string with srcChars via
 * doCompareCodePointOrder(). The same length value is used as the length
 * of the srcChars range, which starts at offset 0.
 */
inline int8_t
UnicodeStringRef::compareCodePointOrder(int32_t start,
                                        int32_t length,
                                        const UChar *srcChars) const
{
  const int32_t srcLength = length;   // both ranges share one length
  return doCompareCodePointOrder(start, length, srcChars, 0, srcLength);
}
/**
 * Compare the range (start, length) of this string with the range
 * (srcStart, srcLength) of srcChars. Pure forwarder to
 * doCompareCodePointOrder().
 */
inline int8_t
UnicodeStringRef::compareCodePointOrder(int32_t start,
                                        int32_t length,
                                        const UChar *srcChars,
                                        int32_t srcStart,
                                        int32_t srcLength) const
{
  const int8_t order =
    doCompareCodePointOrder(start, length, srcChars, srcStart, srcLength);
  return order;
}
/**
 * Compare [start, limit) of this string with [srcStart, srcLimit) of srcText.
 * The limits are converted to lengths and handed to
 * doCompareCodePointOrder().
 */
inline int8_t
UnicodeStringRef::compareCodePointOrderBetween(int32_t start,
                                               int32_t limit,
                                               const UnicodeStringRef& srcText,
                                               int32_t srcStart,
                                               int32_t srcLimit) const
{
  const int32_t thisSpan = limit - start;
  const int32_t srcSpan  = srcLimit - srcStart;
  return doCompareCodePointOrder(start, thisSpan, srcText, srcStart, srcSpan);
}
inline int8_t
UnicodeStringRef::doCompareCodePointOrder(int32_t start,
int32_t length,
const UnicodeStringRef& srcText,
int32_t srcStart,
int32_t srcLength) const {
const UChar *srcChars = srcText.getBuffer();
return doCompareCodePointOrder(start, length, srcChars, srcStart, srcLength);
}
/**
 * Compare this complete string with the complete string text via
 * doCaseCompare(), passing options through unchanged.
 */
inline int8_t
UnicodeStringRef::caseCompare(const UnicodeStringRef &text, uint32_t options) const
{
  const int32_t thisLength = iv_uiLength;
  const int32_t textLength = text.iv_uiLength;
  return doCaseCompare(0, thisLength, text, 0, textLength, options);
}
/**
 * Compare the range (start, length) of this string with all of srcText
 * via doCaseCompare(), passing options through unchanged.
 */
inline int8_t
UnicodeStringRef::caseCompare(int32_t start,
                              int32_t length,
                              const UnicodeStringRef &srcText,
                              uint32_t options) const
{
  const int32_t wholeSrcLength = srcText.iv_uiLength;
  return doCaseCompare(start, length, srcText, 0, wholeSrcLength, options);
}
/**
 * Compare this complete string with the first srcLength code units of
 * srcChars via doCaseCompare(), passing options through unchanged.
 */
inline int8_t
UnicodeStringRef::caseCompare(const UChar *srcChars,
                              int32_t srcLength,
                              uint32_t options) const
{
  const int32_t thisLength = iv_uiLength;
  return doCaseCompare(0, thisLength, srcChars, 0, srcLength, options);
}
/**
 * Compare the range (start, length) of this string with the range
 * (srcStart, srcLength) of srcText. Pure forwarder to doCaseCompare().
 */
inline int8_t
UnicodeStringRef::caseCompare(int32_t start,
                              int32_t length,
                              const UnicodeStringRef &srcText,
                              int32_t srcStart,
                              int32_t srcLength,
                              uint32_t options) const
{
  const int8_t order =
    doCaseCompare(start, length, srcText, srcStart, srcLength, options);
  return order;
}
/**
 * Compare the range (start, length) of this string with srcChars via
 * doCaseCompare(). The same length value is used as the length of the
 * srcChars range, which starts at offset 0.
 */
inline int8_t
UnicodeStringRef::caseCompare(int32_t start,
                              int32_t length,
                              const UChar *srcChars,
                              uint32_t options) const
{
  const int32_t srcLength = length;   // both ranges share one length
  return doCaseCompare(start, length, srcChars, 0, srcLength, options);
}
/**
 * Compare the range (start, length) of this string with the range
 * (srcStart, srcLength) of srcChars. Pure forwarder to doCaseCompare().
 */
inline int8_t
UnicodeStringRef::caseCompare(int32_t start,
                              int32_t length,
                              const UChar *srcChars,
                              int32_t srcStart,
                              int32_t srcLength,
                              uint32_t options) const
{
  const int8_t order =
    doCaseCompare(start, length, srcChars, srcStart, srcLength, options);
  return order;
}
/**
 * Compare [start, limit) of this string with [srcStart, srcLimit) of srcText.
 * The limits are converted to lengths and handed to doCaseCompare()
 * together with options.
 */
inline int8_t
UnicodeStringRef::caseCompareBetween(int32_t start,
                                     int32_t limit,
                                     const UnicodeStringRef &srcText,
                                     int32_t srcStart,
                                     int32_t srcLimit,
                                     uint32_t options) const
{
  const int32_t thisSpan = limit - start;
  const int32_t srcSpan  = srcLimit - srcStart;
  return doCaseCompare(start, thisSpan, srcText, srcStart, srcSpan, options);
}
inline int8_t
UnicodeStringRef::doCaseCompare(int32_t start,
int32_t length,
const UnicodeStringRef &srcText,
int32_t srcStart,
int32_t srcLength,
uint32_t options) const {
const UChar *srcChars = srcText.getBuffer();
return doCaseCompare(start, length, srcChars, srcStart, srcLength, options);
}
inline int32_t
UnicodeStringRef::indexOf(const UnicodeStringRef& text) const {
return indexOf(text, 0, text.iv_uiLength, 0, iv_uiLength);
}
inline int32_t
UnicodeStringRef::indexOf(const UnicodeStringRef& text,
int32_t start) const {
return indexOf(text, 0, text.iv_uiLength, start, iv_uiLength - start);
}
inline int32_t
UnicodeStringRef::indexOf(const UnicodeStringRef& text,
int32_t start,
int32_t length) const {
return indexOf(text, 0, text.iv_uiLength, start, length);
}
inline int32_t
UnicodeStringRef::indexOf(const UnicodeStringRef& srcText,
int32_t srcStart,
int32_t srcLength,
int32_t start,
int32_t length) const {
return indexOf(srcText.getBuffer(), srcStart, srcLength, start, length);
}
inline int32_t
UnicodeStringRef::indexOf(const UChar *srcChars,
int32_t srcLength,
int32_t start) const {
return indexOf(srcChars, 0, srcLength, start, iv_uiLength - start);
}
/** Searches the range (start, length) of this string for the first srcLength UChars of srcChars. */
inline int32_t
UnicodeStringRef::indexOf(const UChar *srcChars,
                          int32_t srcLength,
                          int32_t start,
                          int32_t length) const {
  const int32_t srcStart = 0;  // the pattern always begins at srcChars[0]
  return indexOf(srcChars, srcStart, srcLength, start, length);
}
/** Searches the whole string for the code unit c via doIndexOf(). */
inline int32_t
UnicodeStringRef::indexOf(UChar c) const {
  const int32_t wholeLength = iv_uiLength;
  return doIndexOf(c, 0, wholeLength);
}
/**
 * Searches the whole string for the code point c.
 * A code point that needs more than one UChar is first written into a
 * small local buffer and then searched for as a string; otherwise the
 * single-UChar search doIndexOf() is used.
 */
inline int32_t
UnicodeStringRef::indexOf(UChar32 c) const {
  if (UTF_NEED_MULTIPLE_UCHAR(c)) {
    UChar units[UTF_MAX_CHAR_LENGTH];
    int32_t unitCount = 0;
    UTF_APPEND_CHAR_UNSAFE(units, unitCount, c);
    return indexOf(units, unitCount, 0);
  }
  return doIndexOf(static_cast<UChar>(c), 0, iv_uiLength);
}
/** Searches from start to the end of the string for the code unit c via doIndexOf(). */
inline int32_t
UnicodeStringRef::indexOf(UChar c,
                          int32_t start) const {
  const int32_t remaining = iv_uiLength - start;
  return doIndexOf(c, start, remaining);
}
/**
 * Searches from start to the end of the string for the code point c.
 * A code point that needs more than one UChar is searched for as a short
 * string; otherwise doIndexOf() is used on the single UChar.
 */
inline int32_t
UnicodeStringRef::indexOf(UChar32 c,
                          int32_t start) const {
  if (UTF_NEED_MULTIPLE_UCHAR(c)) {
    UChar units[UTF_MAX_CHAR_LENGTH];
    int32_t unitCount = 0;
    UTF_APPEND_CHAR_UNSAFE(units, unitCount, c);
    return indexOf(units, unitCount, start);
  }
  return doIndexOf(static_cast<UChar>(c), start, iv_uiLength - start);
}
/** Searches the range (start, length) for the code unit c; pure pass-through to doIndexOf(). */
inline int32_t
UnicodeStringRef::indexOf(UChar c,
                          int32_t start,
                          int32_t length) const {
  const int32_t position = doIndexOf(c, start, length);
  return position;
}
/**
 * Searches the range (start, length) for the code point c.
 * A code point that needs more than one UChar is searched for as a short
 * string; otherwise doIndexOf() is used on the single UChar.
 */
inline int32_t
UnicodeStringRef::indexOf(UChar32 c,
                          int32_t start,
                          int32_t length) const {
  if (UTF_NEED_MULTIPLE_UCHAR(c)) {
    UChar units[UTF_MAX_CHAR_LENGTH];
    int32_t unitCount = 0;
    UTF_APPEND_CHAR_UNSAFE(units, unitCount, c);
    return indexOf(units, unitCount, start, length);
  }
  return doIndexOf(static_cast<UChar>(c), start, length);
}
inline int32_t
UnicodeStringRef::lastIndexOf(const UnicodeStringRef& text) const {
return lastIndexOf(text, 0, text.iv_uiLength, 0, iv_uiLength);
}
inline int32_t
UnicodeStringRef::lastIndexOf(const UnicodeStringRef& text,
int32_t start) const {
return lastIndexOf(text, 0, text.iv_uiLength, start, iv_uiLength - start);
}
inline int32_t
UnicodeStringRef::lastIndexOf(const UnicodeStringRef& text,
int32_t start,
int32_t length) const {
return lastIndexOf(text, 0, text.iv_uiLength, start, length);
}
inline int32_t
UnicodeStringRef::lastIndexOf(const UnicodeStringRef& srcText,
int32_t srcStart,
int32_t srcLength,
int32_t start,
int32_t length) const {
return lastIndexOf(srcText.getBuffer(), srcStart, srcLength, start, length);
}
inline int32_t
UnicodeStringRef::lastIndexOf(const UChar *srcChars,
int32_t srcLength,
int32_t start) const {
return lastIndexOf(srcChars, 0, srcLength, start, iv_uiLength - start);
}
/** Backward search for the first srcLength UChars of srcChars within the range (start, length). */
inline int32_t
UnicodeStringRef::lastIndexOf(const UChar *srcChars,
                              int32_t srcLength,
                              int32_t start,
                              int32_t length) const {
  const int32_t srcStart = 0;  // the pattern always begins at srcChars[0]
  return lastIndexOf(srcChars, srcStart, srcLength, start, length);
}
/** Backward search of the whole string for the code unit c via doLastIndexOf(). */
inline int32_t
UnicodeStringRef::lastIndexOf(UChar c) const {
  const int32_t wholeLength = iv_uiLength;
  return doLastIndexOf(c, 0, wholeLength);
}
/**
 * Backward search of the whole string for the code point c.
 * A code point that needs more than one UChar is searched for as a short
 * string; otherwise doLastIndexOf() is used on the single UChar.
 */
inline int32_t
UnicodeStringRef::lastIndexOf(UChar32 c) const {
  if (UTF_NEED_MULTIPLE_UCHAR(c)) {
    UChar units[UTF_MAX_CHAR_LENGTH];
    int32_t unitCount = 0;
    UTF_APPEND_CHAR_UNSAFE(units, unitCount, c);
    return lastIndexOf(units, unitCount, 0);
  }
  return doLastIndexOf(static_cast<UChar>(c), 0, iv_uiLength);
}
/** Backward search for the code unit c from start to the end of the string via doLastIndexOf(). */
inline int32_t
UnicodeStringRef::lastIndexOf(UChar c,
                              int32_t start) const {
  const int32_t remaining = iv_uiLength - start;
  return doLastIndexOf(c, start, remaining);
}
/**
 * Backward search for the code point c from start to the end of the string.
 * A code point that needs more than one UChar is searched for as a short
 * string; otherwise doLastIndexOf() is used on the single UChar.
 */
inline int32_t
UnicodeStringRef::lastIndexOf(UChar32 c,
                              int32_t start) const {
  if (UTF_NEED_MULTIPLE_UCHAR(c)) {
    UChar units[UTF_MAX_CHAR_LENGTH];
    int32_t unitCount = 0;
    UTF_APPEND_CHAR_UNSAFE(units, unitCount, c);
    return lastIndexOf(units, unitCount, start);
  }
  return doLastIndexOf(static_cast<UChar>(c), start, iv_uiLength - start);
}
/** Backward search for the code unit c in the range (start, length); pure pass-through to doLastIndexOf(). */
inline int32_t
UnicodeStringRef::lastIndexOf(UChar c,
                              int32_t start,
                              int32_t length) const {
  const int32_t position = doLastIndexOf(c, start, length);
  return position;
}
/**
 * Backward search for the code point c in the range (start, length).
 * A code point that needs more than one UChar is searched for as a short
 * string; otherwise doLastIndexOf() is used on the single UChar.
 */
inline int32_t
UnicodeStringRef::lastIndexOf(UChar32 c,
                              int32_t start,
                              int32_t length) const {
  if (UTF_NEED_MULTIPLE_UCHAR(c)) {
    UChar units[UTF_MAX_CHAR_LENGTH];
    int32_t unitCount = 0;
    UTF_APPEND_CHAR_UNSAFE(units, unitCount, c);
    return lastIndexOf(units, unitCount, start, length);
  }
  return doLastIndexOf(static_cast<UChar>(c), start, length);
}
/** True if the leading text.length() UChars of this string compare equal to text. */
inline bool
UnicodeStringRef::startsWith(const UnicodeStringRef& text) const {
  const int32_t prefixLength = text.iv_uiLength;
  return compare(0, prefixLength, text, 0, prefixLength) == 0;
}
/** True if the leading srcLength UChars of this string compare equal to the range (srcStart, srcLength) of srcText. */
inline bool
UnicodeStringRef::startsWith(const UnicodeStringRef& srcText,
                             int32_t srcStart,
                             int32_t srcLength) const {
  const int32_t prefixStart = 0;
  return 0 == doCompare(prefixStart, srcLength, srcText, srcStart, srcLength);
}
/** True if the leading srcLength UChars of this string compare equal to the first srcLength UChars of srcChars. */
inline bool
UnicodeStringRef::startsWith(const UChar *srcChars,
                             int32_t srcLength) const {
  const int32_t prefixStart = 0;
  return 0 == doCompare(prefixStart, srcLength, srcChars, 0, srcLength);
}
/** True if the leading srcLength UChars of this string compare equal to the range (srcStart, srcLength) of srcChars. */
inline bool
UnicodeStringRef::startsWith(const UChar *srcChars,
                             int32_t srcStart,
                             int32_t srcLength) const {
  const int32_t prefixStart = 0;
  return 0 == doCompare(prefixStart, srcLength, srcChars, srcStart, srcLength);
}
/** True if the trailing text.length() UChars of this string compare equal to text. */
inline bool
UnicodeStringRef::endsWith(const UnicodeStringRef& text) const {
  const int32_t suffixLength = text.iv_uiLength;
  const int32_t suffixStart = iv_uiLength - suffixLength;
  return doCompare(suffixStart, suffixLength, text, 0, suffixLength) == 0;
}
/** True if the trailing srcLength UChars of this string compare equal to the range (srcStart, srcLength) of srcText. */
inline bool
UnicodeStringRef::endsWith(const UnicodeStringRef& srcText,
                           int32_t srcStart,
                           int32_t srcLength) const {
  const int32_t suffixStart = iv_uiLength - srcLength;
  return 0 == doCompare(suffixStart, srcLength, srcText, srcStart, srcLength);
}
/** True if the trailing srcLength UChars of this string compare equal to the first srcLength UChars of srcChars. */
inline bool
UnicodeStringRef::endsWith(const UChar *srcChars,
                           int32_t srcLength) const {
  const int32_t suffixStart = iv_uiLength - srcLength;
  return 0 == doCompare(suffixStart, srcLength, srcChars, 0, srcLength);
}
/** True if the trailing srcLength UChars of this string compare equal to the range (srcStart, srcLength) of srcChars. */
inline bool
UnicodeStringRef::endsWith(const UChar *srcChars,
                           int32_t srcStart,
                           int32_t srcLength) const {
  const int32_t suffixStart = iv_uiLength - srcLength;
  return 0 == doCompare(suffixStart, srcLength, srcChars, srcStart, srcLength);
}
// ============================
// extract implementations (some in .cpp)
// ============================
inline void
UnicodeStringRef::extract(int32_t start,
int32_t length,
UChar *dst,
int32_t dstStart) const {
pinIndices(start, length);
memcpy(dst+dstStart, getBuffer()+start, length*sizeof(UChar));
}
inline void
UnicodeStringRef::extract(int32_t start,
int32_t length,
icu::UnicodeString& target) const {
target.replace(0, target.length(), getBuffer(), start, length);
}
// Replaces all of target by substring of src
// Could use setTo(getBuffer()+start,length) but that is implemented as a replace
/**
 * Like extract(start, length, dst, dstStart), but the source range is
 * given as [start, limit).
 */
inline void
UnicodeStringRef::extractBetween(int32_t start,
                                 int32_t limit,
                                 UChar *dst,
                                 int32_t dstStart) const {
  const int32_t length = limit - start;
  extract(start, length, dst, dstStart);
}
/**
 * Like extract(start, length, dst) for an icu::UnicodeString destination,
 * but the source range is given as [start, limit).
 */
inline void
UnicodeStringRef::extractBetween(int32_t start,
                                 int32_t limit,
                                 icu::UnicodeString& dst) const {
  const int32_t length = limit - start;
  extract(start, length, dst);
}
/**
 * Converts the range (start, length) into target using the given codepage,
 * without an explicit capacity argument.
 *
 * User-beware: the target buffer is assumed to be large enough. The
 * capacity handed on is 0 when no buffer is provided (pre-flighting) and
 * the largest 32-bit value otherwise.
 */
inline int32_t
UnicodeStringRef::extract(int32_t start,
                          int32_t length,
                          char *target,
                          const char *codepage) const {
  return extract(start, length, target, target == 0 ? 0 : 0xffffffff, codepage);
}
inline int32_t
UnicodeStringRef::extract(std::string & target,
const char *codepage) const {
return extract(0, iv_uiLength, target, codepage);
}
/** Returns the result of extractUTF8() as a new std::string. */
inline std::string
UnicodeStringRef::asUTF8(void) const {
  std::string utf8;
  extractUTF8(utf8);
  return utf8;
}
/**
 * Returns the UChar at offset, or kInvalidUChar if offset is outside
 * [0, length). Asserts that a buffer is set.
 */
inline UChar
UnicodeStringRef::charAt(int32_t offset) const {
  assert(EXISTS(iv_pUChars));
  // The unsigned comparison rejects negative offsets and offsets >= length in one test.
  if (static_cast<uint32_t>(offset) >= static_cast<uint32_t>(iv_uiLength)) {
    return kInvalidUChar;
  }
  return iv_pUChars[offset];
}
/**
 * Returns the code point read by UTF_GET_CHAR at offset, or kInvalidUChar
 * if offset is outside [0, length).
 */
inline UChar32
UnicodeStringRef::char32At(int32_t offset) const {
  // The unsigned comparison rejects negative offsets and offsets >= length in one test.
  if (static_cast<uint32_t>(offset) >= static_cast<uint32_t>(iv_uiLength)) {
    return kInvalidUChar;
  }
  UChar32 codePoint;
  UTF_GET_CHAR(iv_pUChars, 0, offset, iv_uiLength, codePoint);
  return codePoint;
}
/**
 * Returns offset after adjustment by UTF_SET_CHAR_START, or 0 if offset
 * is outside [0, length).
 */
inline int32_t
UnicodeStringRef::getChar32Start(int32_t offset) const {
  if (static_cast<uint32_t>(offset) >= static_cast<uint32_t>(iv_uiLength)) {
    return 0;
  }
  // The macro modifies offset in place.
  UTF_SET_CHAR_START(iv_pUChars, 0, offset);
  return offset;
}
/**
 * Returns offset after adjustment by UTF_SET_CHAR_LIMIT, or the string
 * length if offset is outside [0, length).
 */
inline int32_t
UnicodeStringRef::getChar32Limit(int32_t offset) const {
  if (static_cast<uint32_t>(offset) >= static_cast<uint32_t>(iv_uiLength)) {
    return iv_uiLength;
  }
  // The macro modifies offset in place.
  UTF_SET_CHAR_LIMIT(iv_pUChars, 0, offset, iv_uiLength);
  return offset;
}
/** True if the referenced string has length 0. */
inline bool
UnicodeStringRef::isEmpty() const {
  return 0 == iv_uiLength;
}
/** Returns the stored pointer to the referenced UChars (no copy is made). */
inline UChar const *
UnicodeStringRef::getBuffer() const {
  return this->iv_pUChars;
}
inline int8_t
UnicodeStringRef::doCaseCompare(int32_t start,
int32_t length,
const UChar *srcChars,
int32_t srcStart,
int32_t srcLength,
uint32_t options) const {
icu::UnicodeString s(iv_pUChars+start, (int32_t)length);
return s.caseCompare(srcChars + srcStart, (int32_t)srcLength, options);
}
/** Makes this object refer to the same buffer and length as srcText (shallow; no characters are copied). */
inline UnicodeStringRef& UnicodeStringRef::setTo(const UnicodeStringRef& srcText) {
  iv_uiLength = srcText.iv_uiLength;
  iv_pUChars = srcText.iv_pUChars;
  return *this;
}
/**
 * Makes this object refer to the buffer and length of an icu::UnicodeString.
 * Only the pointer returned by srcText.getBuffer() is stored; no characters
 * are copied.
 */
inline UnicodeStringRef& UnicodeStringRef::setTo(const icu::UnicodeString& srcText) {
  iv_uiLength = srcText.length();
  iv_pUChars = srcText.getBuffer();
  return *this;
}
/** Makes this object refer to srcLength UChars starting at srcChars (shallow; no characters are copied). */
inline UnicodeStringRef& UnicodeStringRef::setTo(const UChar *srcChars, int32_t srcLength) {
  iv_uiLength = srcLength;
  iv_pUChars = srcChars;
  return *this;
}
/**
 * Stream output operator for UnicodeStringRef.
 * Only declared here; the definition is not part of this header section.
 * NOTE(review): the output encoding is decided by the out-of-line
 * implementation - confirm there before relying on it.
 * @param rclOStream   the stream to write to
 * @param crclLString  the string reference to write
 */
UIMA_LINK_IMPORTSPEC std::ostream &
operator << (
std::ostream & rclOStream,
const UnicodeStringRef & crclLString
);
/* ----------------------------------------------------------------------- */
/** @name vector to/from delimited string conversion routines */
/* ----------------------------------------------------------------------- */
/*@{*/
/**
  Removes whitespace from both ends of a string.
  Whitespace is detected with ICU's <TT>u_isspace()</TT>. The result is a
  shallow reference into the buffer of <TT>s</TT>; no characters are copied.
  @param s The string reference to trim
  @return A reference to the trimmed part of <TT>s</TT>
          (length 0 if <TT>s</TT> consists of whitespace only)
*/
inline UnicodeStringRef
strtrim(
  const UnicodeStringRef & s
) {
  if (s.length() == 0) {
    return s;
  }
  UChar const * beg = s.getBuffer();
  // end is kept one past the last retained character, so it never has to
  // move below beg, even when the whole string is whitespace.
  UChar const * end = beg + s.length();
  while (end > beg && u_isspace(end[-1])) {
    --end;
  }
  while (beg < end && u_isspace(*beg)) {
    ++beg;
  }
  return UnicodeStringRef(beg, end - beg);
}
/**
Splits a delimited string into pieces and stores the results in a vector
of string references. Delimiters are passed as a zero-terminated string.
@param rveclstrOutput (Output) The vector where the results are stored
@param pcInput The delimited string to split.
@param uiInputLength The number of UChars in pcInput
@param cpszDelimiters The delimiters. The UChars of this string are interpreted as a set of delimiters.
@param bTrimString Flag: If true, all pieces will be trimmed before storing in <TT>rveclstrOutput</TT>
@param bInsertEmptyStrings Flag: If false, pieces that have length 0 will not be stored in <TT>rveclstrOutput</TT>
@return The number of strings added to <TT>rveclstrOutput</TT>
*/
UIMA_LINK_IMPORTSPEC int32_t
delimitedUnicodeStringRef2Vector(
std::vector< uima::UnicodeStringRef > & rveclstrOutput,
const UChar * pcInput,
int32_t uiInputLength,
const UChar * cpszDelimiters,
bool bTrimString,
bool bInsertEmptyStrings
);
inline int32_t
delimitedUnicodeStringRef2Vector(
std::vector< UnicodeStringRef > & veclstrOutput,
const UChar * pcInput,
const UChar * cpszDelimiters,
bool bTrimString,
bool bInsertEmptyStrings
) {
return delimitedUnicodeStringRef2Vector(veclstrOutput, pcInput, u_strlen(pcInput), cpszDelimiters, bTrimString, bInsertEmptyStrings);
}
//@}
} // namespace uima
#endif /* UIMA_UNICODESTRINGREF_HPP */
/* <EOF> */