| /*------------------------------------------------------------------------- |
| * oracle_compat.c |
| * Oracle compatible functions. |
| * |
| * Copyright (c) 1996-2008, PostgreSQL Global Development Group |
| * |
| * Author: Edmund Mergl <E.Mergl@bawue.de> |
| * Multibyte enhancement: Tatsuo Ishii <ishii@postgresql.org> |
| * |
| * |
| * IDENTIFICATION |
| * $PostgreSQL: pgsql/src/backend/utils/adt/oracle_compat.c,v 1.67.2.1 2007/02/08 20:33:54 momjian Exp $ |
| * |
| *------------------------------------------------------------------------- |
| */ |
| #include "postgres.h" |
| |
| #include <ctype.h> |
| #include <limits.h> |
| /* |
| * towlower() and friends should be in <wctype.h>, but some pre-C99 systems |
| * declare them in <wchar.h>. |
| */ |
| #ifdef HAVE_WCHAR_H |
| #include <wchar.h> |
| #endif |
| #ifdef HAVE_WCTYPE_H |
| #include <wctype.h> |
| #endif |
| |
| #include "utils/builtins.h" |
| #include "utils/pg_locale.h" |
| #include "mb/pg_wchar.h" |
| |
| |
| /* |
| * If the system provides the needed functions for wide-character manipulation |
| * (which are all standardized by C99), then we implement upper/lower/initcap |
| * using wide-character functions. Otherwise we use the traditional <ctype.h> |
| * functions, which of course will not work as desired in multibyte character |
| * sets. Note that in either case we are effectively assuming that the |
| * database character encoding matches the encoding implied by LC_CTYPE. |
| * |
| * We assume if we have these two functions, we have their friends too, and |
| * can use the wide-character method. |
| */ |
| #if defined(HAVE_WCSTOMBS) && defined(HAVE_TOWLOWER) |
| #define USE_WIDE_UPPER_LOWER |
| char *wstring_lower (char *str); |
| char *wstring_upper(char *str); |
| #endif |
| |
| static text *dotrim(const char *string, int stringlen, |
| const char *set, int setlen, |
| bool doltrim, bool dortrim); |
| |
| |
| #ifdef USE_WIDE_UPPER_LOWER |
| |
| /* |
| * Convert a TEXT value into a palloc'd wchar string. |
| */ |
| static wchar_t * |
| texttowcs(const text *txt) |
| { |
| int nbytes = VARSIZE(txt) - VARHDRSZ; |
| char *workstr; |
| wchar_t *result; |
| size_t ncodes; |
| |
| /* Overflow paranoia */ |
| if (nbytes < 0 || |
| nbytes > (int) (INT_MAX / sizeof(wchar_t)) - 1) |
| ereport(ERROR, |
| (errcode(ERRCODE_OUT_OF_MEMORY), |
| errmsg("out of memory"))); |
| |
| /* Need a null-terminated version of the input */ |
| workstr = (char *) palloc(nbytes + 1); |
| memcpy(workstr, VARDATA(txt), nbytes); |
| workstr[nbytes] = '\0'; |
| |
| /* Output workspace cannot have more codes than input bytes */ |
| result = (wchar_t *) palloc((nbytes + 1) * sizeof(wchar_t)); |
| |
| /* Do the conversion */ |
| ncodes = mbstowcs(result, workstr, nbytes + 1); |
| |
| if (ncodes == (size_t) -1) |
| { |
| /* |
| * Invalid multibyte character encountered. We try to give a useful |
| * error message by letting pg_verifymbstr check the string. But it's |
| * possible that the string is OK to us, and not OK to mbstowcs --- |
| * this suggests that the LC_CTYPE locale is different from the |
| * database encoding. Give a generic error message if verifymbstr |
| * can't find anything wrong. |
| */ |
| pg_verifymbstr(workstr, nbytes, false); |
| ereport(ERROR, |
| (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE), |
| errmsg("invalid multibyte character for locale"), |
| errhint("The server's LC_CTYPE locale is probably incompatible with the database encoding."))); |
| } |
| |
| Assert(ncodes <= (size_t) nbytes); |
| |
| return result; |
| } |
| |
| |
| /* |
| * Convert a wchar string into a palloc'd TEXT value. The wchar string |
| * must be zero-terminated, but we also require the caller to pass the string |
| * length, since it will know it anyway in current uses. |
| */ |
| static text * |
| wcstotext(const wchar_t *str, int ncodes) |
| { |
| text *result; |
| size_t nbytes; |
| |
| /* Overflow paranoia */ |
| if (ncodes < 0 || |
| ncodes > (int) ((INT_MAX - VARHDRSZ) / MB_CUR_MAX) - 1) |
| ereport(ERROR, |
| (errcode(ERRCODE_OUT_OF_MEMORY), |
| errmsg("out of memory"))); |
| |
| /* Make workspace certainly large enough for result */ |
| result = (text *) palloc((ncodes + 1) * MB_CUR_MAX + VARHDRSZ); |
| |
| /* Do the conversion */ |
| nbytes = wcstombs((char *) VARDATA(result), str, |
| (ncodes + 1) * MB_CUR_MAX); |
| |
| if (nbytes == (size_t) -1) |
| { |
| /* Invalid multibyte character encountered ... shouldn't happen */ |
| ereport(ERROR, |
| (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE), |
| errmsg("invalid multibyte character for locale"))); |
| } |
| |
| Assert(nbytes <= (size_t) (ncodes * MB_CUR_MAX)); |
| |
| SET_VARSIZE(result, nbytes + VARHDRSZ); |
| |
| return result; |
| } |
| #endif /* USE_WIDE_UPPER_LOWER */ |
| |
| |
| /* |
| * On Windows, the "Unicode" locales assume UTF16 not UTF8 encoding. |
| * To make use of the upper/lower functionality, we need to map UTF8 to |
| * UTF16, which for some reason mbstowcs and wcstombs won't do for us. |
| * This conversion layer takes care of it. |
| */ |
| |
| #ifdef WIN32 |
| |
| /* texttowcs for the case of UTF8 to UTF16 */ |
| static wchar_t * |
| win32_utf8_texttowcs(const text *txt) |
| { |
| int nbytes = VARSIZE(txt) - VARHDRSZ; |
| wchar_t *result; |
| int r; |
| |
| /* Overflow paranoia */ |
| if (nbytes < 0 || |
| nbytes > (int) (INT_MAX / sizeof(wchar_t)) - 1) |
| ereport(ERROR, |
| (errcode(ERRCODE_OUT_OF_MEMORY), |
| errmsg("out of memory"))); |
| |
| /* Output workspace cannot have more codes than input bytes */ |
| result = (wchar_t *) palloc((nbytes + 1) * sizeof(wchar_t)); |
| |
| /* stupid Microsloth API does not work for zero-length input */ |
| if (nbytes == 0) |
| r = 0; |
| else |
| { |
| /* Do the conversion */ |
| r = MultiByteToWideChar(CP_UTF8, 0, VARDATA(txt), nbytes, |
| result, nbytes); |
| |
| if (!r) /* assume it's NO_UNICODE_TRANSLATION */ |
| { |
| /* see notes above about error reporting */ |
| pg_verifymbstr(VARDATA(txt), nbytes, false); |
| ereport(ERROR, |
| (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE), |
| errmsg("invalid multibyte character for locale"), |
| errhint("The server's LC_CTYPE locale is probably incompatible with the database encoding."))); |
| } |
| } |
| |
| Assert(r <= nbytes); |
| result[r] = 0; |
| |
| return result; |
| } |
| |
| /* wcstotext for the case of UTF16 to UTF8 */ |
| static text * |
| win32_utf8_wcstotext(const wchar_t *str) |
| { |
| text *result; |
| int nbytes; |
| int r; |
| |
| nbytes = WideCharToMultiByte(CP_UTF8, 0, str, -1, NULL, 0, NULL, NULL); |
| if (nbytes == 0) /* shouldn't happen */ |
| ereport(ERROR, |
| (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE), |
| errmsg("UTF-16 to UTF-8 translation failed: %lu", |
| GetLastError()))); |
| |
| result = palloc(nbytes + VARHDRSZ); |
| |
| r = WideCharToMultiByte(CP_UTF8, 0, str, -1, VARDATA(result), nbytes, |
| NULL, NULL); |
| if (r == 0) /* shouldn't happen */ |
| ereport(ERROR, |
| (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE), |
| errmsg("UTF-16 to UTF-8 translation failed: %lu", |
| GetLastError()))); |
| |
| SET_VARSIZE(result, nbytes + VARHDRSZ - 1); /* -1 to ignore null */ |
| |
| return result; |
| } |
| |
| /* interface layer to check which encoding is in use */ |
| |
| static wchar_t * |
| win32_texttowcs(const text *txt) |
| { |
| if (GetDatabaseEncoding() == PG_UTF8) |
| return win32_utf8_texttowcs(txt); |
| else |
| return texttowcs(txt); |
| } |
| |
| static text * |
| win32_wcstotext(const wchar_t *str, int ncodes) |
| { |
| if (GetDatabaseEncoding() == PG_UTF8) |
| return win32_utf8_wcstotext(str); |
| else |
| return wcstotext(str, ncodes); |
| } |
| |
| /* use macros to cause routines below to call interface layer */ |
| |
| #define texttowcs win32_texttowcs |
| #define wcstotext win32_wcstotext |
| #endif /* WIN32 */ |
| |
| #ifdef USE_WIDE_UPPER_LOWER |
| /* |
| * string_upper and string_lower are used for correct multibyte upper/lower |
| * transformations localized strings. Returns pointers to transformated |
| * string. |
| */ |
| char * |
| wstring_upper(char *str) |
| { |
| wchar_t *workspace; |
| text *in_text; |
| text *out_text; |
| char *result; |
| int nbytes = strlen(str); |
| int i; |
| |
| in_text = palloc(nbytes + VARHDRSZ); |
| memcpy(VARDATA(in_text), str, nbytes); |
| SET_VARSIZE(in_text, nbytes + VARHDRSZ); |
| |
| workspace = texttowcs(in_text); |
| |
| for (i = 0; workspace[i] != 0; i++) |
| workspace[i] = towupper(workspace[i]); |
| |
| out_text = wcstotext(workspace, i); |
| |
| nbytes = VARSIZE(out_text) - VARHDRSZ; |
| result = palloc(nbytes + 1); |
| memcpy(result, VARDATA(out_text), nbytes); |
| |
| result[nbytes] = '\0'; |
| |
| pfree(workspace); |
| pfree(in_text); |
| pfree(out_text); |
| |
| return result; |
| } |
| |
| char * |
| wstring_lower(char *str) |
| { |
| wchar_t *workspace; |
| text *in_text; |
| text *out_text; |
| char *result; |
| int nbytes = strlen(str); |
| int i; |
| |
| in_text = palloc(nbytes + VARHDRSZ); |
| memcpy(VARDATA(in_text), str, nbytes); |
| SET_VARSIZE(in_text, nbytes + VARHDRSZ); |
| |
| workspace = texttowcs(in_text); |
| |
| for (i = 0; workspace[i] != 0; i++) |
| workspace[i] = towlower(workspace[i]); |
| |
| out_text = wcstotext(workspace, i); |
| |
| nbytes = VARSIZE(out_text) - VARHDRSZ; |
| result = palloc(nbytes + 1); |
| memcpy(result, VARDATA(out_text), nbytes); |
| |
| result[nbytes] = '\0'; |
| |
| pfree(workspace); |
| pfree(in_text); |
| pfree(out_text); |
| |
| return result; |
| } |
| #endif /* USE_WIDE_UPPER_LOWER */ |
| |
| /******************************************************************** |
| * |
| * lower |
| * |
| * Syntax: |
| * |
| * text lower(text string) |
| * |
| * Purpose: |
| * |
| * Returns string, with all letters forced to lowercase. |
| * |
| ********************************************************************/ |
| |
| Datum |
| lower(PG_FUNCTION_ARGS) |
| { |
| #ifdef USE_WIDE_UPPER_LOWER |
| |
| /* |
| * Use wide char code only when max encoding length > 1 and ctype != C. |
| * Some operating systems fail with multi-byte encodings and a C locale. |
| * Also, for a C locale there is no need to process as multibyte. |
| */ |
| if (pg_database_encoding_max_length() > 1 && !lc_ctype_is_c()) |
| { |
| text *string = PG_GETARG_TEXT_P(0); |
| text *result; |
| wchar_t *workspace; |
| int i; |
| |
| workspace = texttowcs(string); |
| |
| for (i = 0; workspace[i] != 0; i++) |
| workspace[i] = towlower(workspace[i]); |
| |
| result = wcstotext(workspace, i); |
| |
| pfree(workspace); |
| |
| PG_RETURN_TEXT_P(result); |
| } |
| else |
| #endif /* USE_WIDE_UPPER_LOWER */ |
| { |
| text *string = PG_GETARG_TEXT_P_COPY(0); |
| char *ptr; |
| int m; |
| |
| /* |
| * Since we copied the string, we can scribble directly on the value |
| */ |
| ptr = VARDATA(string); |
| m = VARSIZE(string) - VARHDRSZ; |
| |
| while (m-- > 0) |
| { |
| *ptr = tolower((unsigned char) *ptr); |
| ptr++; |
| } |
| |
| PG_RETURN_TEXT_P(string); |
| } |
| } |
| |
| |
| /******************************************************************** |
| * |
| * upper |
| * |
| * Syntax: |
| * |
| * text upper(text string) |
| * |
| * Purpose: |
| * |
| * Returns string, with all letters forced to uppercase. |
| * |
| ********************************************************************/ |
| |
| Datum |
| upper(PG_FUNCTION_ARGS) |
| { |
| #ifdef USE_WIDE_UPPER_LOWER |
| |
| /* |
| * Use wide char code only when max encoding length > 1 and ctype != C. |
| * Some operating systems fail with multi-byte encodings and a C locale. |
| * Also, for a C locale there is no need to process as multibyte. |
| */ |
| if (pg_database_encoding_max_length() > 1 && !lc_ctype_is_c()) |
| { |
| text *string = PG_GETARG_TEXT_P(0); |
| text *result; |
| wchar_t *workspace; |
| int i; |
| |
| workspace = texttowcs(string); |
| |
| for (i = 0; workspace[i] != 0; i++) |
| workspace[i] = towupper(workspace[i]); |
| |
| result = wcstotext(workspace, i); |
| |
| pfree(workspace); |
| |
| PG_RETURN_TEXT_P(result); |
| } |
| else |
| #endif /* USE_WIDE_UPPER_LOWER */ |
| { |
| text *string = PG_GETARG_TEXT_P_COPY(0); |
| char *ptr; |
| int m; |
| |
| /* |
| * Since we copied the string, we can scribble directly on the value |
| */ |
| ptr = VARDATA(string); |
| m = VARSIZE(string) - VARHDRSZ; |
| |
| while (m-- > 0) |
| { |
| *ptr = toupper((unsigned char) *ptr); |
| ptr++; |
| } |
| |
| PG_RETURN_TEXT_P(string); |
| } |
| } |
| |
| |
| /******************************************************************** |
| * |
| * initcap |
| * |
| * Syntax: |
| * |
| * text initcap(text string) |
| * |
| * Purpose: |
| * |
| * Returns string, with first letter of each word in uppercase, all |
| * other letters in lowercase. A word is defined as a sequence of |
| * alphanumeric characters, delimited by non-alphanumeric |
| * characters. |
| * |
| ********************************************************************/ |
| |
| Datum |
| initcap(PG_FUNCTION_ARGS) |
| { |
| #ifdef USE_WIDE_UPPER_LOWER |
| |
| /* |
| * Use wide char code only when max encoding length > 1 and ctype != C. |
| * Some operating systems fail with multi-byte encodings and a C locale. |
| * Also, for a C locale there is no need to process as multibyte. |
| */ |
| if (pg_database_encoding_max_length() > 1 && !lc_ctype_is_c()) |
| { |
| text *string = PG_GETARG_TEXT_P(0); |
| text *result; |
| wchar_t *workspace; |
| int wasalnum = 0; |
| int i; |
| |
| workspace = texttowcs(string); |
| |
| for (i = 0; workspace[i] != 0; i++) |
| { |
| if (wasalnum) |
| workspace[i] = towlower(workspace[i]); |
| else |
| workspace[i] = towupper(workspace[i]); |
| wasalnum = iswalnum(workspace[i]); |
| } |
| |
| result = wcstotext(workspace, i); |
| |
| pfree(workspace); |
| |
| PG_RETURN_TEXT_P(result); |
| } |
| else |
| #endif /* USE_WIDE_UPPER_LOWER */ |
| { |
| text *string = PG_GETARG_TEXT_P_COPY(0); |
| int wasalnum = 0; |
| char *ptr; |
| int m; |
| |
| /* |
| * Since we copied the string, we can scribble directly on the value |
| */ |
| ptr = VARDATA(string); |
| m = VARSIZE(string) - VARHDRSZ; |
| |
| while (m-- > 0) |
| { |
| if (wasalnum) |
| *ptr = tolower((unsigned char) *ptr); |
| else |
| *ptr = toupper((unsigned char) *ptr); |
| wasalnum = isalnum((unsigned char) *ptr); |
| ptr++; |
| } |
| |
| PG_RETURN_TEXT_P(string); |
| } |
| } |
| |
| |
| /******************************************************************** |
| * |
| * lpad |
| * |
| * Syntax: |
| * |
| * text lpad(text string1, int4 len, text string2) |
| * |
| * Purpose: |
| * |
| * Returns string1, left-padded to length len with the sequence of |
| * characters in string2. If len is less than the length of string1, |
| * instead truncate (on the right) to len. |
| * |
| ********************************************************************/ |
| |
| Datum |
| lpad(PG_FUNCTION_ARGS) |
| { |
| text *string1 = PG_GETARG_TEXT_P(0); |
| int32 len = PG_GETARG_INT32(1); |
| text *string2 = PG_GETARG_TEXT_P(2); |
| text *ret; |
| char *ptr1, |
| *ptr2, |
| *ptr2end, |
| *ptr_ret; |
| int m, |
| s1len, |
| s2len; |
| |
| int bytelen; |
| |
| /* Negative len is silently taken as zero */ |
| if (len < 0) |
| len = 0; |
| |
| s1len = VARSIZE(string1) - VARHDRSZ; |
| if (s1len < 0) |
| s1len = 0; /* shouldn't happen */ |
| |
| s2len = VARSIZE(string2) - VARHDRSZ; |
| if (s2len < 0) |
| s2len = 0; /* shouldn't happen */ |
| |
| s1len = pg_mbstrlen_with_len(VARDATA(string1), s1len); |
| |
| if (s1len > len) |
| s1len = len; /* truncate string1 to len chars */ |
| |
| if (s2len <= 0) |
| len = s1len; /* nothing to pad with, so don't pad */ |
| |
| bytelen = pg_database_encoding_max_length() * len; |
| |
| /* check for integer overflow */ |
| if (len != 0 && bytelen / pg_database_encoding_max_length() != len) |
| ereport(ERROR, |
| (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), |
| errmsg("requested length too large"))); |
| |
| ret = (text *) palloc(VARHDRSZ + bytelen); |
| |
| m = len - s1len; |
| |
| ptr2 = VARDATA(string2); |
| ptr2end = ptr2 + s2len; |
| ptr_ret = VARDATA(ret); |
| |
| while (m--) |
| { |
| int mlen = pg_mblen(ptr2); |
| |
| memcpy(ptr_ret, ptr2, mlen); |
| ptr_ret += mlen; |
| ptr2 += mlen; |
| if (ptr2 == ptr2end) /* wrap around at end of s2 */ |
| ptr2 = VARDATA(string2); |
| } |
| |
| ptr1 = VARDATA(string1); |
| |
| while (s1len--) |
| { |
| int mlen = pg_mblen(ptr1); |
| |
| memcpy(ptr_ret, ptr1, mlen); |
| ptr_ret += mlen; |
| ptr1 += mlen; |
| } |
| |
| SET_VARSIZE(ret, ptr_ret - (char *) ret); |
| |
| PG_RETURN_TEXT_P(ret); |
| } |
| |
| |
| /******************************************************************** |
| * |
| * rpad |
| * |
| * Syntax: |
| * |
| * text rpad(text string1, int4 len, text string2) |
| * |
| * Purpose: |
| * |
| * Returns string1, right-padded to length len with the sequence of |
| * characters in string2. If len is less than the length of string1, |
| * instead truncate (on the right) to len. |
| * |
| ********************************************************************/ |
| |
| Datum |
| rpad(PG_FUNCTION_ARGS) |
| { |
| text *string1 = PG_GETARG_TEXT_P(0); |
| int32 len = PG_GETARG_INT32(1); |
| text *string2 = PG_GETARG_TEXT_P(2); |
| text *ret; |
| char *ptr1, |
| *ptr2, |
| *ptr2end, |
| *ptr_ret; |
| int m, |
| s1len, |
| s2len; |
| |
| int bytelen; |
| |
| /* Negative len is silently taken as zero */ |
| if (len < 0) |
| len = 0; |
| |
| s1len = VARSIZE(string1) - VARHDRSZ; |
| if (s1len < 0) |
| s1len = 0; /* shouldn't happen */ |
| |
| s2len = VARSIZE(string2) - VARHDRSZ; |
| if (s2len < 0) |
| s2len = 0; /* shouldn't happen */ |
| |
| s1len = pg_mbstrlen_with_len(VARDATA(string1), s1len); |
| |
| if (s1len > len) |
| s1len = len; /* truncate string1 to len chars */ |
| |
| if (s2len <= 0) |
| len = s1len; /* nothing to pad with, so don't pad */ |
| |
| bytelen = pg_database_encoding_max_length() * len; |
| |
| /* Check for integer overflow */ |
| if (len != 0 && bytelen / pg_database_encoding_max_length() != len) |
| ereport(ERROR, |
| (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), |
| errmsg("requested length too large"))); |
| |
| ret = (text *) palloc(VARHDRSZ + bytelen); |
| m = len - s1len; |
| |
| ptr1 = VARDATA(string1); |
| ptr_ret = VARDATA(ret); |
| |
| while (s1len--) |
| { |
| int mlen = pg_mblen(ptr1); |
| |
| memcpy(ptr_ret, ptr1, mlen); |
| ptr_ret += mlen; |
| ptr1 += mlen; |
| } |
| |
| ptr2 = VARDATA(string2); |
| ptr2end = ptr2 + s2len; |
| |
| while (m--) |
| { |
| int mlen = pg_mblen(ptr2); |
| |
| memcpy(ptr_ret, ptr2, mlen); |
| ptr_ret += mlen; |
| ptr2 += mlen; |
| if (ptr2 == ptr2end) /* wrap around at end of s2 */ |
| ptr2 = VARDATA(string2); |
| } |
| |
| SET_VARSIZE(ret, ptr_ret - (char *) ret); |
| |
| PG_RETURN_TEXT_P(ret); |
| } |
| |
| |
| /******************************************************************** |
| * |
| * btrim |
| * |
| * Syntax: |
| * |
| * text btrim(text string, text set) |
| * |
| * Purpose: |
| * |
| * Returns string with characters removed from the front and back |
| * up to the first character not in set. |
| * |
| ********************************************************************/ |
| |
| Datum |
| btrim(PG_FUNCTION_ARGS) |
| { |
| text *string = PG_GETARG_TEXT_P(0); |
| text *set = PG_GETARG_TEXT_P(1); |
| text *ret; |
| |
| ret = dotrim(VARDATA(string), VARSIZE(string) - VARHDRSZ, |
| VARDATA(set), VARSIZE(set) - VARHDRSZ, |
| true, true); |
| |
| PG_RETURN_TEXT_P(ret); |
| } |
| |
| /******************************************************************** |
| * |
| * btrim1 --- btrim with set fixed as ' ' |
| * |
| ********************************************************************/ |
| |
| Datum |
| btrim1(PG_FUNCTION_ARGS) |
| { |
| text *string = PG_GETARG_TEXT_P(0); |
| text *ret; |
| |
| ret = dotrim(VARDATA(string), VARSIZE(string) - VARHDRSZ, |
| " ", 1, |
| true, true); |
| |
| PG_RETURN_TEXT_P(ret); |
| } |
| |
| /* |
| * Common implementation for btrim, ltrim, rtrim |
| */ |
| static text * |
| dotrim(const char *string, int stringlen, |
| const char *set, int setlen, |
| bool doltrim, bool dortrim) |
| { |
| text *result; |
| int i; |
| |
| /* Nothing to do if either string or set is empty */ |
| if (stringlen > 0 && setlen > 0) |
| { |
| if (pg_database_encoding_max_length() > 1) |
| { |
| /* |
| * In the multibyte-encoding case, build arrays of pointers to |
| * character starts, so that we can avoid inefficient checks in |
| * the inner loops. |
| */ |
| const char **stringchars; |
| const char **setchars; |
| int *stringmblen; |
| int *setmblen; |
| int stringnchars; |
| int setnchars; |
| int resultndx; |
| int resultnchars; |
| const char *p; |
| int len; |
| int mblen; |
| const char *str_pos; |
| int str_len; |
| |
| stringchars = (const char **) palloc(stringlen * sizeof(char *)); |
| stringmblen = (int *) palloc(stringlen * sizeof(int)); |
| stringnchars = 0; |
| p = string; |
| len = stringlen; |
| while (len > 0) |
| { |
| stringchars[stringnchars] = p; |
| stringmblen[stringnchars] = mblen = pg_mblen(p); |
| stringnchars++; |
| p += mblen; |
| len -= mblen; |
| } |
| |
| setchars = (const char **) palloc(setlen * sizeof(char *)); |
| setmblen = (int *) palloc(setlen * sizeof(int)); |
| setnchars = 0; |
| p = set; |
| len = setlen; |
| while (len > 0) |
| { |
| setchars[setnchars] = p; |
| setmblen[setnchars] = mblen = pg_mblen(p); |
| setnchars++; |
| p += mblen; |
| len -= mblen; |
| } |
| |
| resultndx = 0; /* index in stringchars[] */ |
| resultnchars = stringnchars; |
| |
| if (doltrim) |
| { |
| while (resultnchars > 0) |
| { |
| str_pos = stringchars[resultndx]; |
| str_len = stringmblen[resultndx]; |
| for (i = 0; i < setnchars; i++) |
| { |
| if (str_len == setmblen[i] && |
| memcmp(str_pos, setchars[i], str_len) == 0) |
| break; |
| } |
| if (i >= setnchars) |
| break; /* no match here */ |
| string += str_len; |
| stringlen -= str_len; |
| resultndx++; |
| resultnchars--; |
| } |
| } |
| |
| if (dortrim) |
| { |
| while (resultnchars > 0) |
| { |
| str_pos = stringchars[resultndx + resultnchars - 1]; |
| str_len = stringmblen[resultndx + resultnchars - 1]; |
| for (i = 0; i < setnchars; i++) |
| { |
| if (str_len == setmblen[i] && |
| memcmp(str_pos, setchars[i], str_len) == 0) |
| break; |
| } |
| if (i >= setnchars) |
| break; /* no match here */ |
| stringlen -= str_len; |
| resultnchars--; |
| } |
| } |
| |
| pfree((void *)stringchars); |
| pfree(stringmblen); |
| pfree((void *)setchars); |
| pfree(setmblen); |
| } |
| else |
| { |
| /* |
| * In the single-byte-encoding case, we don't need such overhead. |
| */ |
| if (doltrim) |
| { |
| while (stringlen > 0) |
| { |
| char str_ch = *string; |
| |
| for (i = 0; i < setlen; i++) |
| { |
| if (str_ch == set[i]) |
| break; |
| } |
| if (i >= setlen) |
| break; /* no match here */ |
| string++; |
| stringlen--; |
| } |
| } |
| |
| if (dortrim) |
| { |
| while (stringlen > 0) |
| { |
| char str_ch = string[stringlen - 1]; |
| |
| for (i = 0; i < setlen; i++) |
| { |
| if (str_ch == set[i]) |
| break; |
| } |
| if (i >= setlen) |
| break; /* no match here */ |
| stringlen--; |
| } |
| } |
| } |
| } |
| |
| /* Return selected portion of string */ |
| result = (text *) palloc(VARHDRSZ + stringlen); |
| SET_VARSIZE(result, VARHDRSZ + stringlen); |
| memcpy(VARDATA(result), string, stringlen); |
| |
| return result; |
| } |
| |
| /******************************************************************** |
| * |
| * byteatrim |
| * |
| * Syntax: |
| * |
| * bytea byteatrim(byta string, bytea set) |
| * |
| * Purpose: |
| * |
| * Returns string with characters removed from the front and back |
| * up to the first character not in set. |
| * |
| * Cloned from btrim and modified as required. |
| ********************************************************************/ |
| |
| Datum |
| byteatrim(PG_FUNCTION_ARGS) |
| { |
| bytea *string = PG_GETARG_BYTEA_P(0); |
| bytea *set = PG_GETARG_BYTEA_P(1); |
| bytea *ret; |
| char *ptr, |
| *end, |
| *ptr2, |
| *end2; |
| int m; |
| |
| if ((m = VARSIZE(string) - VARHDRSZ) <= 0 || |
| (VARSIZE(set) - VARHDRSZ) <= 0) |
| PG_RETURN_BYTEA_P(string); |
| |
| ptr = VARDATA(string); |
| end = VARDATA(string) + VARSIZE(string) - VARHDRSZ - 1; |
| end2 = VARDATA(set) + VARSIZE(set) - VARHDRSZ - 1; |
| |
| while (m > 0) |
| { |
| ptr2 = VARDATA(set); |
| while (ptr2 <= end2) |
| { |
| if (*ptr == *ptr2) |
| break; |
| ++ptr2; |
| } |
| if (ptr2 > end2) |
| break; |
| ptr++; |
| m--; |
| } |
| |
| while (m > 0) |
| { |
| ptr2 = VARDATA(set); |
| while (ptr2 <= end2) |
| { |
| if (*end == *ptr2) |
| break; |
| ++ptr2; |
| } |
| if (ptr2 > end2) |
| break; |
| end--; |
| m--; |
| } |
| |
| ret = (bytea *) palloc(VARHDRSZ + m); |
| SET_VARSIZE(ret, VARHDRSZ + m); |
| memcpy(VARDATA(ret), ptr, m); |
| |
| PG_RETURN_BYTEA_P(ret); |
| } |
| |
| /******************************************************************** |
| * |
| * ltrim |
| * |
| * Syntax: |
| * |
| * text ltrim(text string, text set) |
| * |
| * Purpose: |
| * |
| * Returns string with initial characters removed up to the first |
| * character not in set. |
| * |
| ********************************************************************/ |
| |
| Datum |
| ltrim(PG_FUNCTION_ARGS) |
| { |
| text *string = PG_GETARG_TEXT_P(0); |
| text *set = PG_GETARG_TEXT_P(1); |
| text *ret; |
| |
| ret = dotrim(VARDATA(string), VARSIZE(string) - VARHDRSZ, |
| VARDATA(set), VARSIZE(set) - VARHDRSZ, |
| true, false); |
| |
| PG_RETURN_TEXT_P(ret); |
| } |
| |
| /******************************************************************** |
| * |
| * ltrim1 --- ltrim with set fixed as ' ' |
| * |
| ********************************************************************/ |
| |
| Datum |
| ltrim1(PG_FUNCTION_ARGS) |
| { |
| text *string = PG_GETARG_TEXT_P(0); |
| text *ret; |
| |
| ret = dotrim(VARDATA(string), VARSIZE(string) - VARHDRSZ, |
| " ", 1, |
| true, false); |
| |
| PG_RETURN_TEXT_P(ret); |
| } |
| |
| /******************************************************************** |
| * |
| * rtrim |
| * |
| * Syntax: |
| * |
| * text rtrim(text string, text set) |
| * |
| * Purpose: |
| * |
| * Returns string with final characters removed after the last |
| * character not in set. |
| * |
| ********************************************************************/ |
| |
| Datum |
| rtrim(PG_FUNCTION_ARGS) |
| { |
| text *string = PG_GETARG_TEXT_P(0); |
| text *set = PG_GETARG_TEXT_P(1); |
| text *ret; |
| |
| ret = dotrim(VARDATA(string), VARSIZE(string) - VARHDRSZ, |
| VARDATA(set), VARSIZE(set) - VARHDRSZ, |
| false, true); |
| |
| PG_RETURN_TEXT_P(ret); |
| } |
| |
| /******************************************************************** |
| * |
| * rtrim1 --- rtrim with set fixed as ' ' |
| * |
| ********************************************************************/ |
| |
| Datum |
| rtrim1(PG_FUNCTION_ARGS) |
| { |
| text *string = PG_GETARG_TEXT_P(0); |
| text *ret; |
| |
| ret = dotrim(VARDATA(string), VARSIZE(string) - VARHDRSZ, |
| " ", 1, |
| false, true); |
| |
| PG_RETURN_TEXT_P(ret); |
| } |
| |
| |
| /******************************************************************** |
| * |
| * translate |
| * |
| * Syntax: |
| * |
| * text translate(text string, text from, text to) |
| * |
| * Purpose: |
| * |
| * Returns string after replacing all occurrences of characters in from |
| * with the corresponding character in to. If from is longer than to, |
| * occurrences of the extra characters in from are deleted. |
| * Improved by Edwin Ramirez <ramirez@doc.mssm.edu>. |
| * |
| ********************************************************************/ |
| |
| Datum |
| translate(PG_FUNCTION_ARGS) |
| { |
| text *string = PG_GETARG_TEXT_P(0); |
| text *from = PG_GETARG_TEXT_P(1); |
| text *to = PG_GETARG_TEXT_P(2); |
| text *result; |
| char *from_ptr, |
| *to_ptr; |
| char *source, |
| *target; |
| int m, |
| fromlen, |
| tolen, |
| retlen, |
| i; |
| int worst_len; |
| int len; |
| int source_len; |
| int from_index; |
| |
| m = VARSIZE(string) - VARHDRSZ; |
| if (m <= 0) |
| PG_RETURN_TEXT_P(string); |
| source = VARDATA(string); |
| |
| fromlen = VARSIZE(from) - VARHDRSZ; |
| from_ptr = VARDATA(from); |
| tolen = VARSIZE(to) - VARHDRSZ; |
| to_ptr = VARDATA(to); |
| |
| /* |
| * The worst-case expansion is to substitute a max-length character for |
| * a single-byte character at each position of the string. |
| */ |
| worst_len = pg_database_encoding_max_length() * m; |
| |
| /* check for integer overflow */ |
| if (worst_len / pg_database_encoding_max_length() != m) |
| ereport(ERROR, |
| (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), |
| errmsg("requested length too large"))); |
| |
| result = (text *) palloc(worst_len + VARHDRSZ); |
| target = VARDATA(result); |
| retlen = 0; |
| |
| while (m > 0) |
| { |
| source_len = pg_mblen(source); |
| from_index = 0; |
| |
| for (i = 0; i < fromlen; i += len) |
| { |
| len = pg_mblen(&from_ptr[i]); |
| if (len == source_len && |
| memcmp(source, &from_ptr[i], len) == 0) |
| break; |
| |
| from_index++; |
| } |
| if (i < fromlen) |
| { |
| /* substitute */ |
| char *p = to_ptr; |
| |
| for (i = 0; i < from_index; i++) |
| { |
| p += pg_mblen(p); |
| if (p >= (to_ptr + tolen)) |
| break; |
| } |
| if (p < (to_ptr + tolen)) |
| { |
| len = pg_mblen(p); |
| memcpy(target, p, len); |
| target += len; |
| retlen += len; |
| } |
| |
| } |
| else |
| { |
| /* no match, so copy */ |
| memcpy(target, source, source_len); |
| target += source_len; |
| retlen += source_len; |
| } |
| |
| source += source_len; |
| m -= source_len; |
| } |
| |
| SET_VARSIZE(result, retlen + VARHDRSZ); |
| |
| /* |
| * The function result is probably much bigger than needed, if we're |
| * using a multibyte encoding, but it's not worth reallocating it; |
| * the result probably won't live long anyway. |
| */ |
| |
| PG_RETURN_TEXT_P(result); |
| } |
| |
| /******************************************************************** |
| * |
| * ascii |
| * |
| * Syntax: |
| * |
| * int ascii(text string) |
| * |
| * Purpose: |
| * |
| * Returns the decimal representation of the first character from |
| * string. |
| * If the string is empty we return 0. |
| * If the database encoding is UTF8, we return the Unicode codepoint. |
| * If the database encoding is any other multi-byte encoding, we |
| * return the value of the first byte if it is an ASCII character |
| * (range 1 .. 127), or raise an error. |
| * For all other encodings we return the value of the first byte, |
| * (range 1..255). |
| * |
| ********************************************************************/ |
| |
| Datum |
| ascii(PG_FUNCTION_ARGS) |
| { |
| text *string = PG_GETARG_TEXT_PP(0); |
| int encoding = GetDatabaseEncoding(); |
| unsigned char *data; |
| |
| if (VARSIZE_ANY_EXHDR(string) <= 0) |
| PG_RETURN_INT32(0); |
| |
| data = (unsigned char *) VARDATA_ANY(string); |
| |
| if (encoding == PG_UTF8 && *data > 127) |
| { |
| /* return the code point for Unicode */ |
| |
| int result = 0, |
| tbytes = 0, |
| i; |
| |
| if (*data >= 0xF0) |
| { |
| result = *data & 0x07; |
| tbytes = 3; |
| } |
| else if (*data >= 0xE0) |
| { |
| result = *data & 0x0F; |
| tbytes = 2; |
| } |
| else |
| { |
| Assert(*data > 0xC0); |
| result = *data & 0x1f; |
| tbytes = 1; |
| } |
| |
| Assert(tbytes > 0); |
| |
| for (i = 1; i <= tbytes; i++) |
| { |
| Assert((data[i] & 0xC0) == 0x80); |
| result = (result << 6) + (data[i] & 0x3f); |
| } |
| |
| PG_RETURN_INT32(result); |
| } |
| else |
| { |
| if (pg_encoding_max_length(encoding) > 1 && *data > 127) |
| ereport(ERROR, |
| (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), |
| errmsg("requested character too large"))); |
| |
| |
| PG_RETURN_INT32((int32) *data); |
| } |
| } |
| |
| /******************************************************************** |
| * |
| * chr |
| * |
| * Syntax: |
| * |
| * text chr(int val) |
| * |
| * Purpose: |
| * |
| * Returns the character having the binary equivalent to val. |
| * |
| * For UTF8 we treat the argumwent as a Unicode code point. |
| * For other multi-byte encodings we raise an error for arguments |
| * outside the strict ASCII range (1..127). |
| * |
| * It's important that we don't ever return a value that is not valid |
| * in the database encoding, so that this doesn't become a way for |
| * invalid data to enter the database. |
| * |
| ********************************************************************/ |
| |
| Datum |
| chr(PG_FUNCTION_ARGS) |
| { |
| uint32 cvalue = PG_GETARG_UINT32(0); |
| text *result; |
| int encoding = GetDatabaseEncoding(); |
| |
| if (encoding == PG_UTF8 && cvalue > 127) |
| { |
| /* for Unicode we treat the argument as a code point */ |
| int bytes; |
| char *wch; |
| |
| /* We only allow valid Unicode code points */ |
| if (cvalue > 0x001fffff) |
| ereport(ERROR, |
| (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), |
| errmsg("requested character too large for encoding: %d", |
| cvalue))); |
| |
| if (cvalue > 0xffff) |
| bytes = 4; |
| else if (cvalue > 0x07ff) |
| bytes = 3; |
| else |
| bytes = 2; |
| |
| result = (text *) palloc(VARHDRSZ + bytes); |
| SET_VARSIZE(result, VARHDRSZ + bytes); |
| wch = VARDATA(result); |
| |
| if (bytes == 2) |
| { |
| wch[0] = 0xC0 | ((cvalue >> 6) & 0x1F); |
| wch[1] = 0x80 | (cvalue & 0x3F);; |
| } |
| else if (bytes == 3) |
| { |
| wch[0] = 0xE0 | ((cvalue >> 12) & 0x0F); |
| wch[1] = 0x80 | ((cvalue >> 6) & 0x3F); |
| wch[2] = 0x80 | (cvalue & 0x3F); |
| } |
| else |
| { |
| wch[0] = 0xF0 | ((cvalue >> 18) & 0x07); |
| wch[1] = 0x80 | ((cvalue >> 12) & 0x3F); |
| wch[2] = 0x80 | ((cvalue >> 6) & 0x3F); |
| wch[3] = 0x80 | (cvalue & 0x3F); |
| } |
| |
| } |
| |
| else |
| { |
| bool is_mb; |
| |
| /* |
| * Error out on arguments that make no sense or that we can't validly |
| * represent in the encoding. |
| */ |
| |
| if (cvalue == 0) |
| ereport(ERROR, |
| (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), |
| errmsg("null character not permitted"))); |
| |
| is_mb = pg_encoding_max_length(encoding) > 1; |
| |
| if ((is_mb && (cvalue > 127)) || (!is_mb && (cvalue > 255))) |
| ereport(ERROR, |
| (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), |
| errmsg("requested character too large for encoding: %d", |
| cvalue))); |
| |
| |
| result = (text *) palloc(VARHDRSZ + 1); |
| SET_VARSIZE(result, VARHDRSZ + 1); |
| *VARDATA(result) = (char) cvalue; |
| } |
| |
| PG_RETURN_TEXT_P(result); |
| } |
| |
| /******************************************************************** |
| * |
| * repeat |
| * |
| * Syntax: |
| * |
| * text repeat(text string, int val) |
| * |
| * Purpose: |
| * |
| * Repeat string by val. |
| * |
| ********************************************************************/ |
| |
| Datum |
| repeat(PG_FUNCTION_ARGS) |
| { |
| text *string = PG_GETARG_TEXT_P(0); |
| int32 count = PG_GETARG_INT32(1); |
| text *result; |
| int slen, |
| tlen; |
| int i; |
| char *cp; |
| |
| if (count < 0) |
| count = 0; |
| |
| slen = (VARSIZE(string) - VARHDRSZ); |
| tlen = (VARHDRSZ + (count * slen)); |
| |
| /* Check for integer overflow */ |
| if (slen != 0 && count != 0) |
| { |
| int check = count * slen; |
| int check2 = check + VARHDRSZ; |
| |
| if ((check / slen) != count || check2 <= check) |
| ereport(ERROR, |
| (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), |
| errmsg("requested length too large"))); |
| } |
| |
| result = (text *) palloc(tlen); |
| |
| SET_VARSIZE(result, tlen); |
| cp = VARDATA(result); |
| for (i = 0; i < count; i++) |
| { |
| memcpy(cp, VARDATA(string), slen); |
| cp += slen; |
| } |
| |
| PG_RETURN_TEXT_P(result); |
| } |