blob: 6ca59b2ca1843ba9321d6282765d089cb4a056a0 [file] [log] [blame]
/*
* regc_locale.c --
*
* This file contains locale-specific regexp routines.
* This file is #included by regcomp.c.
*
* Copyright (c) 1998 by Scriptics Corporation.
*
* This software is copyrighted by the Regents of the University of
* California, Sun Microsystems, Inc., Scriptics Corporation, ActiveState
* Corporation and other parties. The following terms apply to all files
* associated with the software unless explicitly disclaimed in
* individual files.
*
* The authors hereby grant permission to use, copy, modify, distribute,
* and license this software and its documentation for any purpose, provided
* that existing copyright notices are retained in all copies and that this
* notice is included verbatim in any distributions. No written agreement,
* license, or royalty fee is required for any of the authorized uses.
* Modifications to this software may be copyrighted by their authors
* and need not follow the licensing terms described here, provided that
* the new terms are clearly indicated on the first page of each file where
* they apply.
*
* IN NO EVENT SHALL THE AUTHORS OR DISTRIBUTORS BE LIABLE TO ANY PARTY
* FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES
* ARISING OUT OF THE USE OF THIS SOFTWARE, ITS DOCUMENTATION, OR ANY
* DERIVATIVES THEREOF, EVEN IF THE AUTHORS HAVE BEEN ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* THE AUTHORS AND DISTRIBUTORS SPECIFICALLY DISCLAIM ANY WARRANTIES,
* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE, AND NON-INFRINGEMENT. THIS SOFTWARE
* IS PROVIDED ON AN "AS IS" BASIS, AND THE AUTHORS AND DISTRIBUTORS HAVE
* NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR
* MODIFICATIONS.
*
* GOVERNMENT USE: If you are acquiring this software on behalf of the
* U.S. government, the Government shall have only "Restricted Rights"
* in the software and related documentation as defined in the Federal
* Acquisition Regulations (FARs) in Clause 52.227.19 (c) (2). If you
* are acquiring the software on behalf of the Department of Defense, the
* software shall be classified as "Commercial Computer Software" and the
* Government shall have only "Restricted Rights" as defined in Clause
* 252.227-7013 (c) (1) of DFARs. Notwithstanding the foregoing, the
* authors grant the U.S. Government and others acting in its behalf
* permission to use and distribute the software in accordance with the
* terms specified in this license.
*
* $PostgreSQL: pgsql/src/backend/regex/regc_locale.c,v 1.9 2008/02/14 17:33:37 tgl Exp $
*/
/* ASCII character-name table */
static const struct cname
{
const char *name;
const char code;
} cnames[] =
{
{
"NUL", '\0'
},
{
"SOH", '\001'
},
{
"STX", '\002'
},
{
"ETX", '\003'
},
{
"EOT", '\004'
},
{
"ENQ", '\005'
},
{
"ACK", '\006'
},
{
"BEL", '\007'
},
{
"alert", '\007'
},
{
"BS", '\010'
},
{
"backspace", '\b'
},
{
"HT", '\011'
},
{
"tab", '\t'
},
{
"LF", '\012'
},
{
"newline", '\n'
},
{
"VT", '\013'
},
{
"vertical-tab", '\v'
},
{
"FF", '\014'
},
{
"form-feed", '\f'
},
{
"CR", '\015'
},
{
"carriage-return", '\r'
},
{
"SO", '\016'
},
{
"SI", '\017'
},
{
"DLE", '\020'
},
{
"DC1", '\021'
},
{
"DC2", '\022'
},
{
"DC3", '\023'
},
{
"DC4", '\024'
},
{
"NAK", '\025'
},
{
"SYN", '\026'
},
{
"ETB", '\027'
},
{
"CAN", '\030'
},
{
"EM", '\031'
},
{
"SUB", '\032'
},
{
"ESC", '\033'
},
{
"IS4", '\034'
},
{
"FS", '\034'
},
{
"IS3", '\035'
},
{
"GS", '\035'
},
{
"IS2", '\036'
},
{
"RS", '\036'
},
{
"IS1", '\037'
},
{
"US", '\037'
},
{
"space", ' '
},
{
"exclamation-mark", '!'
},
{
"quotation-mark", '"'
},
{
"number-sign", '#'
},
{
"dollar-sign", '$'
},
{
"percent-sign", '%'
},
{
"ampersand", '&'
},
{
"apostrophe", '\''
},
{
"left-parenthesis", '('
},
{
"right-parenthesis", ')'
},
{
"asterisk", '*'
},
{
"plus-sign", '+'
},
{
"comma", ','
},
{
"hyphen", '-'
},
{
"hyphen-minus", '-'
},
{
"period", '.'
},
{
"full-stop", '.'
},
{
"slash", '/'
},
{
"solidus", '/'
},
{
"zero", '0'
},
{
"one", '1'
},
{
"two", '2'
},
{
"three", '3'
},
{
"four", '4'
},
{
"five", '5'
},
{
"six", '6'
},
{
"seven", '7'
},
{
"eight", '8'
},
{
"nine", '9'
},
{
"colon", ':'
},
{
"semicolon", ';'
},
{
"less-than-sign", '<'
},
{
"equals-sign", '='
},
{
"greater-than-sign", '>'
},
{
"question-mark", '?'
},
{
"commercial-at", '@'
},
{
"left-square-bracket", '['
},
{
"backslash", '\\'
},
{
"reverse-solidus", '\\'
},
{
"right-square-bracket", ']'
},
{
"circumflex", '^'
},
{
"circumflex-accent", '^'
},
{
"underscore", '_'
},
{
"low-line", '_'
},
{
"grave-accent", '`'
},
{
"left-brace", '{'
},
{
"left-curly-bracket", '{'
},
{
"vertical-line", '|'
},
{
"right-brace", '}'
},
{
"right-curly-bracket", '}'
},
{
"tilde", '~'
},
{
"DEL", '\177'
},
{
NULL, 0
}
};
/*
* some ctype functions with non-ascii-char guard
*/
static int
pg_wc_isdigit(pg_wchar c)
{
return (c <= UCHAR_MAX && isdigit((unsigned char) c));
}
static int
pg_wc_isalpha(pg_wchar c)
{
return (c <= UCHAR_MAX && isalpha((unsigned char) c));
}
static int
pg_wc_isalnum(pg_wchar c)
{
return (c <= UCHAR_MAX && isalnum((unsigned char) c));
}
static int
pg_wc_isupper(pg_wchar c)
{
return (c <= UCHAR_MAX && isupper((unsigned char) c));
}
static int
pg_wc_islower(pg_wchar c)
{
return (c <= UCHAR_MAX && islower((unsigned char) c));
}
static int
pg_wc_isgraph(pg_wchar c)
{
return (c <= UCHAR_MAX && isgraph((unsigned char) c));
}
static int
pg_wc_isprint(pg_wchar c)
{
return (c <= UCHAR_MAX && isprint((unsigned char) c));
}
static int
pg_wc_ispunct(pg_wchar c)
{
return (c <= UCHAR_MAX && ispunct((unsigned char) c));
}
static int
pg_wc_isspace(pg_wchar c)
{
return (c <= UCHAR_MAX && isspace((unsigned char) c));
}
static pg_wchar
pg_wc_toupper(pg_wchar c)
{
if (c <= UCHAR_MAX)
return toupper((unsigned char) c);
return c;
}
static pg_wchar
pg_wc_tolower(pg_wchar c)
{
if (c <= UCHAR_MAX)
return tolower((unsigned char) c);
return c;
}
/*
* element - map collating-element name to celt
*/
static celt
element(struct vars * v, /* context */
const chr *startp, /* points to start of name */
const chr *endp) /* points just past end of name */
{
const struct cname *cn;
size_t len;
/* generic: one-chr names stand for themselves */
assert(startp < endp);
len = endp - startp;
if (len == 1)
return *startp;
NOTE(REG_ULOCALE);
/* search table */
for (cn = cnames; cn->name != NULL; cn++)
{
if (strlen(cn->name) == len &&
pg_char_and_wchar_strncmp(cn->name, startp, len) == 0)
{
break; /* NOTE BREAK OUT */
}
}
if (cn->name != NULL)
return CHR(cn->code);
/* couldn't find it */
ERR(REG_ECOLLATE);
return 0;
}
/*
* range - supply cvec for a range, including legality check
*/
static struct cvec *
range(struct vars * v, /* context */
celt a, /* range start */
celt b, /* range end, might equal a */
int cases) /* case-independent? */
{
int nchrs;
struct cvec *cv;
celt c,
cc;
if (a != b && !before(a, b))
{
ERR(REG_ERANGE);
return NULL;
}
if (!cases)
{ /* easy version */
cv = getcvec(v, 0, 1);
NOERRN();
addrange(cv, a, b);
return cv;
}
/*
* When case-independent, it's hard to decide when cvec ranges are usable,
* so for now at least, we won't try. We use a range for the originally
* specified chrs and then add on any case-equivalents that are outside
* that range as individual chrs.
*
* To ensure sane behavior if someone specifies a very large range, limit
* the allocation size to 100000 chrs (arbitrary) and check for overrun
* inside the loop below.
*/
nchrs = b - a + 1;
if (nchrs <= 0 || nchrs > 100000)
nchrs = 100000;
cv = getcvec(v, nchrs, 1);
NOERRN();
addrange(cv, a, b);
for (c = a; c <= b; c++)
{
cc = pg_wc_tolower((chr) c);
if (cc !=c && (before(cc, a) || before(b, cc)))
{
if (cv->nchrs >= cv->chrspace)
{
ERR(REG_ETOOBIG);
return NULL;
}
addchr(cv, cc);
}
cc = pg_wc_toupper((chr) c);
if (cc != c && (before(cc, a) || before(b, cc)))
{
if (cv->nchrs >= cv->chrspace)
{
ERR(REG_ETOOBIG);
return NULL;
}
addchr(cv, cc);
}
}
return cv;
}
/*
* before - is celt x before celt y, for purposes of range legality?
*/
static int /* predicate */
before(celt x, celt y)
{
if (x < y)
return 1;
return 0;
}
/*
* eclass - supply cvec for an equivalence class
* Must include case counterparts on request.
*/
static struct cvec *
eclass(struct vars * v, /* context */
celt c, /* Collating element representing the
* equivalence class. */
int cases) /* all cases? */
{
struct cvec *cv;
/* crude fake equivalence class for testing */
if ((v->cflags & REG_FAKE) && c == 'x')
{
cv = getcvec(v, 4, 0);
addchr(cv, (chr) 'x');
addchr(cv, (chr) 'y');
if (cases)
{
addchr(cv, (chr) 'X');
addchr(cv, (chr) 'Y');
}
return cv;
}
/* otherwise, none */
if (cases)
return allcases(v, c);
cv = getcvec(v, 1, 0);
assert(cv != NULL);
addchr(cv, (chr) c);
return cv;
}
/*
* cclass - supply cvec for a character class
*
* Must include case counterparts on request.
*/
static struct cvec *
cclass(struct vars * v, /* context */
const chr *startp, /* where the name starts */
const chr *endp, /* just past the end of the name */
int cases) /* case-independent? */
{
size_t len;
struct cvec *cv = NULL;
const char **namePtr;
int i,
index;
/*
* The following arrays define the valid character class names.
*/
static const char *classNames[] = {
"alnum", "alpha", "ascii", "blank", "cntrl", "digit", "graph",
"lower", "print", "punct", "space", "upper", "xdigit", NULL
};
enum classes
{
CC_ALNUM, CC_ALPHA, CC_ASCII, CC_BLANK, CC_CNTRL, CC_DIGIT, CC_GRAPH,
CC_LOWER, CC_PRINT, CC_PUNCT, CC_SPACE, CC_UPPER, CC_XDIGIT
};
/*
* Map the name to the corresponding enumerated value.
*/
len = endp - startp;
index = -1;
for (namePtr = classNames, i = 0; *namePtr != NULL; namePtr++, i++)
{
if (strlen(*namePtr) == len &&
pg_char_and_wchar_strncmp(*namePtr, startp, len) == 0)
{
index = i;
break;
}
}
if (index == -1)
{
ERR(REG_ECTYPE);
return NULL;
}
/*
* Remap lower and upper to alpha if the match is case insensitive.
*/
if (cases &&
((enum classes) index == CC_LOWER ||
(enum classes) index == CC_UPPER))
index = (int) CC_ALPHA;
/*
* Now compute the character class contents.
*
* For the moment, assume that only char codes < 256 can be in these
* classes.
*/
switch ((enum classes) index)
{
case CC_PRINT:
cv = getcvec(v, UCHAR_MAX, 0);
if (cv)
{
for (i = 0; i <= UCHAR_MAX; i++)
{
if (pg_wc_isprint((chr) i))
addchr(cv, (chr) i);
}
}
break;
case CC_ALNUM:
cv = getcvec(v, UCHAR_MAX, 0);
if (cv)
{
for (i = 0; i <= UCHAR_MAX; i++)
{
if (pg_wc_isalnum((chr) i))
addchr(cv, (chr) i);
}
}
break;
case CC_ALPHA:
cv = getcvec(v, UCHAR_MAX, 0);
if (cv)
{
for (i = 0; i <= UCHAR_MAX; i++)
{
if (pg_wc_isalpha((chr) i))
addchr(cv, (chr) i);
}
}
break;
case CC_ASCII:
cv = getcvec(v, 0, 1);
if (cv)
addrange(cv, 0, 0x7f);
break;
case CC_BLANK:
cv = getcvec(v, 2, 0);
addchr(cv, '\t');
addchr(cv, ' ');
break;
case CC_CNTRL:
cv = getcvec(v, 0, 2);
addrange(cv, 0x0, 0x1f);
addrange(cv, 0x7f, 0x9f);
break;
case CC_DIGIT:
cv = getcvec(v, 0, 1);
if (cv)
addrange(cv, (chr) '0', (chr) '9');
break;
case CC_PUNCT:
cv = getcvec(v, UCHAR_MAX, 0);
if (cv)
{
for (i = 0; i <= UCHAR_MAX; i++)
{
if (pg_wc_ispunct((chr) i))
addchr(cv, (chr) i);
}
}
break;
case CC_XDIGIT:
cv = getcvec(v, 0, 3);
if (cv)
{
addrange(cv, '0', '9');
addrange(cv, 'a', 'f');
addrange(cv, 'A', 'F');
}
break;
case CC_SPACE:
cv = getcvec(v, UCHAR_MAX, 0);
if (cv)
{
for (i = 0; i <= UCHAR_MAX; i++)
{
if (pg_wc_isspace((chr) i))
addchr(cv, (chr) i);
}
}
break;
case CC_LOWER:
cv = getcvec(v, UCHAR_MAX, 0);
if (cv)
{
for (i = 0; i <= UCHAR_MAX; i++)
{
if (pg_wc_islower((chr) i))
addchr(cv, (chr) i);
}
}
break;
case CC_UPPER:
cv = getcvec(v, UCHAR_MAX, 0);
if (cv)
{
for (i = 0; i <= UCHAR_MAX; i++)
{
if (pg_wc_isupper((chr) i))
addchr(cv, (chr) i);
}
}
break;
case CC_GRAPH:
cv = getcvec(v, UCHAR_MAX, 0);
if (cv)
{
for (i = 0; i <= UCHAR_MAX; i++)
{
if (pg_wc_isgraph((chr) i))
addchr(cv, (chr) i);
}
}
break;
}
if (cv == NULL)
ERR(REG_ESPACE);
return cv;
}
/*
* allcases - supply cvec for all case counterparts of a chr (including itself)
*
* This is a shortcut, preferably an efficient one, for simple characters;
* messy cases are done via range().
*/
static struct cvec *
allcases(struct vars * v, /* context */
chr pc) /* character to get case equivs of */
{
struct cvec *cv;
chr c = (chr) pc;
chr lc,
uc;
lc = pg_wc_tolower((chr) c);
uc = pg_wc_toupper((chr) c);
cv = getcvec(v, 2, 0);
addchr(cv, lc);
if (lc != uc)
addchr(cv, uc);
return cv;
}
/*
* cmp - chr-substring compare
*
* Backrefs need this. It should preferably be efficient.
* Note that it does not need to report anything except equal/unequal.
* Note also that the length is exact, and the comparison should not
* stop at embedded NULs!
*/
static int /* 0 for equal, nonzero for unequal */
cmp(const chr *x, const chr *y, /* strings to compare */
size_t len) /* exact length of comparison */
{
return memcmp(VS(x), VS(y), len * sizeof(chr));
}
/*
* casecmp - case-independent chr-substring compare
*
* REG_ICASE backrefs need this. It should preferably be efficient.
* Note that it does not need to report anything except equal/unequal.
* Note also that the length is exact, and the comparison should not
* stop at embedded NULs!
*/
static int /* 0 for equal, nonzero for unequal */
casecmp(const chr *x, const chr *y, /* strings to compare */
size_t len) /* exact length of comparison */
{
for (; len > 0; len--, x++, y++)
{
if ((*x != *y) && (pg_wc_tolower(*x) != pg_wc_tolower(*y)))
return 1;
}
return 0;
}