blob: 5a8fad737b516fe4ea7c91fae530f627bc11c454 [file] [log] [blame]
/**********************************************************************
// @@@ START COPYRIGHT @@@
//
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
//
// @@@ END COPYRIGHT @@@
**********************************************************************/
/* -*-C++-*-
**************************************************************************
*
* File: NAString.cpp
* Description: Utility string functions (basic/exportable).
* (See NAString2.cpp for more funx.)
* Created: 06/07/94
* Language: C++
*
*
**************************************************************************
*/
#include <ctype.h>
#include "BaseTypes.h"
// #include "ComAnsiNamePart.h"
#include "ComASSERT.h"
#include "ComMPLoc.h"
#include "ComOperators.h"
#include "ComSmallDefs.h"
#include "str.h"
#include "ComRtUtils.h"
#include "sqlcli.h"
#include "charinfo.h"
#include "csconvert.h"
#include "nawstring.h"
#define SQLPARSERGLOBALSCMN__INITIALIZE
#define SQLPARSERGLOBALS_FLAGS
#include "SqlParserGlobals.h"
// Include key word header for IsSqlReservedWord().
#include "ComResWords.h"
// The timing loop in ToAnsiIdentifier below was run
// with this flag (using hashtable to check for reserved keywords)
// and without it (using strstr on one long char array to check).
// The winner is WITHOUT this flag; the strstr approach has marginally
// better average performance than the hashtable, plus it takes only
// half the time on a keyword hit, plus there is no initialization cost.
// Plus it has a smaller data size and smaller code size (smaller executable
// object/less disk space) and less algorithmic complexity.
//
// quantify'ing arkcmp's compilation of tpc-c queries shows the strstr-based
// IsSqlReservedWord taking 4.8% of total arkcmp elapsed time. A binary
// search implementation of IsSqlReservedWord shrinks this down to 0.01%
// of total arkcmp elapsed time.
#include "NAString.h"
#include "ComDistribution.h"
// Space, dquote, percent, etc, as found in Ansi 5.1,
// plus Tdm-extension of backslash.
//
// The character NON_SQL_TEXT_CHAR ('@', from our .h file) must *not*
// appear in this array: this internal special char is used to guarantee
// a unique parseable name in internally-generated text
// (unique in that it cannot conflict with any externally legal identifier).
//
static const char specialSQL_TEXT[] = " \"%&'()*+,-./:;<=>?[]_|\\";
#include "ReservedInternalNames.cpp"
// -----------------------------------------------------------------------
// The NAString_isoMappingCS memory cache for use by routines
// ToInternalIdentifier() and ToAnsiIdentifier[2|3]() in modules
// w:/common/NAString[2].cpp. These routines currently cannot
// access SqlParser_ISO_MAPPING directly due to the complex
// build hierarchy.
// -----------------------------------------------------------------------
static THREAD_P SQLCHARSET_CODE NAString_isoMappingCS = SQLCHARSETCODE_UNKNOWN;
// -----------------------------------------------------------------------
Lng32 NAString_getIsoMapCS()
{
if (NAString_isoMappingCS != SQLCHARSETCODE_UNKNOWN)
return (Lng32)NAString_isoMappingCS;
NAString_isoMappingCS = (SQLCHARSET_CODE)ComRtGetIsoMappingEnum();
return (Lng32)NAString_isoMappingCS;
}
// -----------------------------------------------------------------------
void NAString_setIsoMapCS(Lng32 isoMappingCS)
{
// ComASSERT(isoMappingCS == (Lng32)SQLCHARSETCODE_ISO88591 ||
// isoMappingCS == (Lng32)SQLCHARSETCODE_SJIS ||
// isoMappingCS == (Lng32)SQLCHARSETCODE_UTF8);
NAString_isoMappingCS = (SQLCHARSET_CODE)isoMappingCS;
}
// -----------------------------------------------------------------------
NABoolean isUpperIsoMapCS(unsigned char c)
{
{
return isUpper8859_1((NAWchar)c);
}
return FALSE; // dead code
}
// -----------------------------------------------------------------------
NABoolean isAlphaIsoMapCS(unsigned char c)
{
{
return isAlpha8859_1((NAWchar)c);
}
return FALSE; // dead code
}
// -----------------------------------------------------------------------
NABoolean isAlNumIsoMapCS(unsigned char c)
{
{
return isAlNum8859_1((NAWchar)c);
}
return FALSE; // dead code
}
// -----------------------------------------------------------------------
void NAStringUpshiftIsoMapCS(NAString &ns)
{
{
ns.toUpper8859_1();
}
}
// -----------------------------------------------------------------------
// convertNAString()
// Note that this allocates memory.
// -----------------------------------------------------------------------
char *convertNAString(const NAString& ns, CollHeap *heap, NABoolean wideNull)
{
size_t len = ns.length();
size_t nullSpaceLen = 0;
char* buf;
if (wideNull == TRUE)
nullSpaceLen = sizeof(NAWchar);
else
nullSpaceLen = 1;
if (heap)
buf = new (heap) char[len + nullSpaceLen];
else {
buf = new char[len + nullSpaceLen];
#ifndef NDEBUG
cerr << "Possible memory leak: convertNAString called with NULL heap\n";
#endif
}
str_cpy_all(buf, ns.data(), len);
if (wideNull == TRUE)
((NAWchar *)buf)[len / sizeof(NAWchar)] = L'\0';
else
buf[len] = '\0';
return buf;
}
// -----------------------------------------------------------------------
// Returns TRUE if the string consists entirely of whitespace
// (at least one space or tab, and nothing else),
// FALSE if string is empty (null) or contains a non-white character.
// -----------------------------------
NABoolean IsNAStringSpace(const NAString& ns)
{
if (ns.isNull())
return FALSE;
return IsNAStringSpaceOrEmpty(ns);
}
// -----------------------------------
// Returns TRUE if the string consists entirely of whitespace
// (zero or more spaces or tabs, and nothing else), including none (empty str).
// -----------------------------------------------------------------------
NABoolean IsNAStringSpaceOrEmpty(const NAString& ns)
{
StringPos len = ns.length();
for (StringPos i = 0; i < len; i++)
if (!isSpace8859_1((unsigned char)ns[i]))
return FALSE;
return TRUE;
}
// -----------------------------------------------------------------------
// Returns TRUE if the string contains only 7-bit ASCII characters or
// if the string is empty.
// -----------------------------------------------------------------------
NABoolean NAStringHasOnly7BitAsciiChars(const NAString& ns)
{
StringPos len = ns.length();
for (StringPos i = 0; i < len; i++)
if ( ((unsigned char)ns[i]) > 127 )
return FALSE;
return TRUE;
}
// -----------------------------------------------------------------------
// Returns TRUE if the string contains only 7-bit ASCII characters
// between '0' and '9' OR if the string is empty.
// -----------------------------------------------------------------------
NABoolean NAStringHasOnlyDecimalDigitAsciiChars(const NAString& ns)
{
StringPos len = ns.length();
for (StringPos i = 0; i < len; i++)
if ( ((unsigned char)ns[i]) < '0' OR
((unsigned char)ns[i]) > '9' )
return FALSE;
return TRUE;
}
// -----------------------------------------------------------------------
// upshift a string (no funny locale stuff, just do it)
// -----------------------------------------------------------------------
void NAStringUpshiftASCII(NAString& ns)
{
ns.toUpper();
}
// -----------------------------------------------------------------------
// decode a number from a prefix of an NAString
// -----------------------------------------------------------------------
Lng32 NAStringToLong(const NAString &ns)
{
Lng32 result;
sscanf(ns.data(),"%d",&result);
return result;
}
double NAStringToReal(const NAString &ns)
{
float result;
sscanf(ns.data(),"%g",&result);
return result;
}
NAString LongToNAString(Lng32 l)
{
char resultstr[100];
sprintf(resultstr,"%d",l);
return NAString(resultstr);
}
NAString UnsignedToNAString(UInt32 u)
{
char resultstr[100];
sprintf(resultstr,"%u",u);
return NAString(resultstr);
}
NAString Int64ToNAString(Int64 l)
{
char resultstr[100];
convertInt64ToAscii(l, resultstr);
return NAString(resultstr);
}
NAString RealToNAString(double d)
{
char resultstr[200];
sprintf(resultstr,"%G",d);
return NAString(resultstr);
}
NAString &replaceAll(NAString &source, const NAString &searchFor,
const NAString &replaceWith)
{
size_t indexOfReplace = NA_NPOS;
indexOfReplace = source.index(searchFor);
if (indexOfReplace != NA_NPOS)
{
// Replace all occurences of searchFor with replaceWith. When no
// more occurences are found or end of string is reached, index()
// will return NA_NPOS.
while (indexOfReplace != NA_NPOS)
{
source.replace(indexOfReplace, searchFor.length(),
replaceWith);
// Find index of next occurence to replace.
indexOfReplace =
source.index(searchFor, indexOfReplace + replaceWith.length());
}
}
return source;
}
// ---------------------------------------------------------------------
// Hash function for NAString types in NAKeyLookup
// ---------------------------------------------------------------------
ULng32 hashKey(const NAString& str)
{
return str.hash();
}
// ---------------------------------------------------------------------
// Look up names that start with a '$' or an '=' sign
//
// Right now, DEFINEs are simulated by environment variables
// (that might be useful for an OSS process on NSK as well)
// ---------------------------------------------------------------------
NAString LookupDefineName(const NAString &ns, NABoolean iterate)
{
const Int32 itermax = 100; // detect self-referencing env vars
Int32 iterlimit = iterate ? itermax : 1;
Int32 iterations = 0;
NAString delimIdent;
const char *defineName = NULL;
const char *mappedName = ns.data();
// If the name is like $"abc", convert it to $abc and then do the lookup.
if ((mappedName[0] == '$' OR mappedName[0] == '=') AND
mappedName[1] == '"')
{
delimIdent = &mappedName[1];
if (!ToInternalIdentifier(delimIdent, FALSE))
{
delimIdent.prepend(mappedName[0]);
mappedName = delimIdent.data();
}
}
while (mappedName AND
(mappedName[0] == '$' OR mappedName[0] == '=') AND
iterations++ < iterlimit)
{
defineName = mappedName;
mappedName = getenv(&mappedName[1]);
}
// could raise an exception if iterations >= itermax
if (mappedName)
return NAString(mappedName);
else
// couldn't map name, return unresolved name
return NAString(defineName);
}
// ---------------------------------------------------------------------
// Convert a NAString member of a QualifiedName, CorrName, or ColRefName
// from the canonical internal format required by Binder
// into the external delimited-identifier ANSI format.
// That is,
// A2C returns as A2C
// a2c "a2c"
// 12C "12C"
// A+C "A+C"
// A"C" "A""C"
//
// The required internal format is achieved by Parser (et alia) having
// previously called the companion function below, ToInternalIdentifier.
// ---------------------------------------------------------------------
// look up sqlText in the ReservedWords table; return TRUE iff id is
// an ANSI, PotentialANSI, or Tandem reserved word.
NABoolean IsSqlReservedWord(const char *sqlText)
{
return ComResWords::isSqlReservedWord(sqlText,0);
}
NABoolean IsCIdentifier(const char *id)
{
// trim whitespace first, if necessary
// Note that we allow identifiers starting with an underscore
for (size_t i=0; id[i] != 0; i++)
{
char c = id[i];
if (!(c >= 'A' && c <= 'Z' ||
c >= 'a' && c <= 'z' ||
c == '_' ||
c >= '0' && c <= '9' && i > 0))
return FALSE;
}
return TRUE;
}
NABoolean /*NAString::*/setMPLoc()
{
if (!SqlParser_Initialized() )
return TRUE;
else
return FALSE;
}
NAString ToAnsiIdentifier(const NAString &ns, NABoolean assertShort)
{
size_t nsLen = ns.length();
// Zero-length INTERNAL identifiers are fabricated by Parser and Binder;
// they're okay (it's only zero-length EXTERNAL ones that're illegal).
if (nsLen == 0)
return NAString();
// Assert various checks were previously done when converting the original
// external identifier (in some previous call to ToInternalIdentifier).
const Int32 SMAX=2048;
NAWString internalFormatNameInUCS2;
ComAnsiNameToUCS2 ( ns // in - const ComString & internalFormatName
, internalFormatNameInUCS2 // out - NAWString &
);
if ((Int32) internalFormatNameInUCS2.length() >
(Int32) (assertShort ? ComMAX_1_PART_INTERNAL_UCS2_NAME_LEN_IN_NAWCHARS : SMAX))
{
ComASSERT(0);
return NAString();
}
char buf[SMAX];
size_t len;
ToAnsiIdentifier3(ns.data(), ns.length(), buf, SMAX, &len, NAString_getIsoMapCS());
if (len == 0)
return NAString();
else
{
const NAString &nas = NAString(buf, len);
return nas;
}
}
// ---------------------------------------------------------------------
// Helper function for ToInternalIdentifier:
// put the integer count of characters scanned into the first character
// of the return string. (We know the count will fit into a char, and we
// know the string does have a first char (is nonempty), so this is safe.)
// SqlParser uses this info for pretty syntax error messaging.
// ---------------------------------------------------------------------
static Lng32 illegalCharInIdentifier(NAString &ansiIdent,
size_t i, size_t countOfRemoved)
{
ansiIdent[(size_t)0] = i + countOfRemoved;
return -3127;
}
// ---------------------------------------------------------------------
// The inverse of ToAnsiIdentifier -- but note that this function is the one
// called first, and does some essential checking that the above relies on.
//
// The purpose of this function is to convert NAStrings containing
// Ansi-format regular or delimited identifiers to our internal format
// required by Binder (RETDesc) lookups, which is the same format as
// in the catalog metadata tables.
//
// Leading blanks are removed, and then,
// if the string begins with a double quote, then this function does:
// - there are supposed to be double quotes surrounding the string
// and they are removed
// - any embedded double quotes (i.e., two consecutive dquotes)
// are turned into just one dquote
// - silently change tabs to spaces, just as a courtesy
// (officially by ANSI, tabs are illegal even in delimited identifiers
// because they are not a character in the SQL_TEXT default character set
// specification)
// - allow all characters to pass in delimited identifiers
// except the @ prefix as it being used internally by the compiler
// to generate unique table names for internal use. Please look at the
// contents of file w:/sqlshare/ReservedInternalNames.cpp for other
// prefix strings with embedded @ (e.g., OLD@) that are reserved for
// internal use. @ is allowed in delimited identifiers otherwise.
// - We now accept ^ in delimited identifiers if it is not a prefix.
// We used to disallow ^ unless acceptCircumflex is true.
//
// If the string does not begin with a double quote, then
// - remove trailing blanks (spaces AND tabs)
// - verify there are no illegal characters for a REGULAR identifier
// - uppercase the contents unless flagged not to
// - ensure that no regular identifier matches an Ansi reserved word
//
// Return value is a SqlCode value (of a message with no parameters) if error,
// and zero (0) if no error.
// It is caller's job to insert any error condition into a ComDiags.
//
// Efficiency: this function saves on space at the cost of some time.
// The calls to RWCString.remove() probably take linear time as a function
// of string length on each call. A faster version of this function
// would establish a transformed string in a separate buffer and then
// copy it back into the original.
//
// $$$ Kludge NLS (National Language Support) 7-APR-2007 $$$ We used
// to not accept the 7-bit ASCII characters @, /, ^, and \, in ANSI SQL
// delimited identifiers specified by customers, but we do allow
// many other 8-bit byte values between the two double quotes;
// our Japanese customers take advantage of this lack of restriction
// and put their Japanese multibyte characters in delimited identifiers.
// The MXCMP program treats the indentifier as if it contains a string
// of ISO 8859-1 characters.
// Note that currently, the target columns in the metadata
// tables containing internally-formated identifier has the
// CHAR(128) CHARACTER SET ISO88591 data type.
// This kludge workaround works for most case, but there are about
// 5-10% of the Japanese Shift-JIS characters rejected by MXCMP because
// the lower byte of their two-byte multibyte characters contains binary
// value equivalent to the representation of 7-bit ASCII characters @, ^,
// or \.
//
// These restrictions have been lifted. \, @, and ^ now can appear
// within delimited identifiers. $, @, and ^ reserved for internal
// use when they are a prefix. A few other prefixes with @ embedded
// are reserved for internal use also.
//
// We now allow the forward slash ( i.e., / ) character to appear
// within a delimited identifier. Character / is guaranteed to be a
// standalone one-byte character in Shift-JIS and EUC-JP and any
// other character sets that is the supersets of the 7-bit ASCII
// character set (e.g., UTF-8).
// ---------------------------------------------------------------------
Lng32 ToInternalIdentifier( NAString &ansiIdent
, Int32 upCase
, NABoolean acceptCircumflex // VO: Fix genesis solution 10-040204-2957
, UInt16 pv_flags // call-by-value parameter (pv_)flags
)
{
size_t i; // unsigned: beware "i--" when (i==0)!
// Remove leading blanks (spaces AND tabs).
// SqlLexer/Parser do not pass in leading blanks, but we cannot trust
// the SchemaDB caller, nor the Catman-constructed-on-the-fly names
// of ComAnsiNamePart.
//
// Lines [RW] fix a RogueWave memory leak in RWCString::operator[].
//
const char *sptr = ansiIdent.data(); // [RW] added this line
size_t len = ansiIdent.length();
for (i = 0; i < len; i++)
if (isSpace8859_1((unsigned char)*sptr)) // [RW]
sptr++;
else
break;
if (i) {
ansiIdent.remove(0,i);
len = ansiIdent.length();
}
if (len == 0)
return -3004; // An ident must contain at least one character
size_t countOfRemoved = i;
i = 0;
// Handle double quotes or backquotes as delimited identifiers.
// Backquotes are used for hive objects.
// An error will be returned later if they are used for traf objects.
NABoolean isDquote = (ansiIdent[i] == '"');
if ((ansiIdent[i] != '"') &&
(ansiIdent[i] != '`')) { // REGULAR identifier
// ANSI 5.2 SR 13 + 14 and 8.2 SR 3a say that trailing spaces are
// insignificant in equality-testing of identifiers, so remove them
// (and tabs as well, as a courtesy).
//
// This loop transforms 'ABC ' into 'ABC' (NOT the same as delimited loop!)
// We also know it won't be empty, thanks to the check above.
//
for (i = len; i > 0; ) {
i--;
if (!isSpace8859_1((unsigned char)ansiIdent[i]))
break;
}
if (++i < len) {
ansiIdent.remove(i);
len = ansiIdent.length();
}
// ComASSERT(ComGetNameInterfaceCharSet() == SQLCHARSETCODE_UTF8);
NABoolean has7BitAsciiCharsOnly = NAStringHasOnly7BitAsciiChars(ansiIdent);
NABoolean isLatin1 = FALSE;
const Int32 SMAX = 2048;
char latin1Buf[SMAX+1];
char *pFirstUntranslatedChar = NULL;
if (NOT IsNAStringSpaceOrEmpty(ansiIdent) AND NOT has7BitAsciiCharsOnly)
{
// Check to see if ansiIdent contains English and Western European characters only (including
// those in the upper half of the ISO88591 character set). Note that ansiIdent contains
// UTF-8 encoding values.
UInt32 outLen = 0;
UInt32 translatedCharCount = 0;
Int32 retCode = UTF8ToLocale ( cnv_version1
, (const char*) ansiIdent.data()
, (const Int32) ansiIdent.length()
, (const char*) latin1Buf
, (const Int32) SMAX+1
, (cnv_charset) cnv_ISO88591
, (char* &) pFirstUntranslatedChar
, (UInt32 *) &outLen // unsigned int *output_data_len_p
, (const Int32) TRUE // const int addNullAtEnd_flag
, (const Int32) FALSE // const int allow_invalids
, (UInt32 *) &translatedCharCount // unsigned int * translated_char_cnt_p
, (const char*) NULL // const char *substitution_char_p
);
if (retCode == 0) // success - i.e., ansiIdent contains characters that can be support by ISO 8859-1
{
isLatin1 = TRUE;
len = translatedCharCount;
}
}
if (NOT has7BitAsciiCharsOnly AND NOT isLatin1)
{
// Regular identifiers can contains characters supported by ISO 8859-1 standard only.
size_t pos = 0;
if (pFirstUntranslatedChar != NULL AND (pFirstUntranslatedChar - ansiIdent.data()) > 0)
pos = pFirstUntranslatedChar - ansiIdent.data();
return illegalCharInIdentifier(ansiIdent, pos, countOfRemoved);
}
// First character of a regular identifier must be alpha
// (or '\' or '$' if NSK name).
// (User-input names under ANSI or SHORTANSI do not allow NSK format.)
//
i = 0;
unsigned char c = (unsigned char)ansiIdent[i];
if (isLatin1)
c = (unsigned char)latin1Buf[i];
if (isAlphaIsoMapCS(c) ||
(c == '$' && ((pv_flags & NASTRING_REGULAR_IDENT_WITH_DOLLAR_PREFIX) != 0))) {
// Subsequent characters must be alphanumeric or the underscore.
//
while (++i < len) {
if (isLatin1)
c = (unsigned char)latin1Buf[i];
else
c = (unsigned char)ansiIdent[i];
if (NOT isAlNumIsoMapCS(c) && c != '_') {
return illegalCharInIdentifier(ansiIdent, i, countOfRemoved);
}
}
if (upCase) {
if (isLatin1)
{
NAString ns = latin1Buf;
ns.toUpper8859_1();
memcpy(latin1Buf, ns.data(), ns.length()+1);
}
else
NAStringUpshiftIsoMapCS(ansiIdent);
}
// Reserved words cannot be regular identifiers
//
if (IsSqlReservedWord(ansiIdent))
{
return -3128;
}
} else if ((c == '\\') ||
(c == '$' && ((pv_flags & NASTRING_ALLOW_NSK_GUARDIAN_NAME_FORMAT) != 0)
&& ((pv_flags & NASTRING_DELIM_IDENT_WITH_DOLLAR_PREFIX) == 0))) {
// ComASSERT(NAStringHasOnly7BitAsciiChars(ansiIdent) AND NOT isLatin1);
// For now, allow Guardian style names in ANSI mode as well. This was the
// old behavior since this method was not being called for Guardian names.
// The MX Reference manual is also a bit ambiguous in this regard.
// Need to get a resolution on this issue soon. RMW 11-20-2000.
//
// } else if ((c == '\\' || c == '$') &&
// (!SqlParser_Initialized() || SqlParser_NAMETYPE == DF_NSK)) {
//
// User can enter \ or $ at beginning of ident *only* if NAMETYPE NSK;
// if SHORTANSI, their input is Ansi only (no \ or $),
// and it is only the *output*, after SHORTANSI name resolution,
// that may contain \ and $.
//
//## We should really call the left-to-right ComMPLoc ctor here,
//## allowing only a valid MP format
//## (one of ComMPLoc::SYS, VOL, or FILE).
//
// Allow the patterns:
// \[a-zA-Z][a-zA-Z0-9]*.$[a-zA-Z][a-zA-Z0-9]*
// $[a-zA-Z][a-zA-Z0-9]*
//
i++;
if (c == '\\') {
// Must start with an ascii char.
//
if((i >= len) || (NOT isAlphaIsoMapCS((unsigned char)ansiIdent[i++]))) {
return illegalCharInIdentifier(ansiIdent, i - 1, countOfRemoved);
}
while ((i < len) && isAlNumIsoMapCS((unsigned char)ansiIdent[i])) {
i++;
}
// Expecting a ".$"
//
if((i >= len) || ((unsigned char)ansiIdent[i++] != '.')) {
return illegalCharInIdentifier(ansiIdent, i - 1, countOfRemoved);
}
if((i >= len) || ((unsigned char)ansiIdent[i++] != '$')) {
return illegalCharInIdentifier(ansiIdent, i - 1, countOfRemoved);
}
}
// After the '$'
//
while (i < len) {
if (NOT isAlNumIsoMapCS((unsigned char)ansiIdent[i++])) {
return illegalCharInIdentifier(ansiIdent, i - 1, countOfRemoved);
}
}
// Note that for NSK style names, it is not necessary to call
// IsSqlReservedWord() since there are no reserved identifiers
// starting with \ or $.
//
if (upCase) {
NAStringUpshiftIsoMapCS(ansiIdent);
}
} else {
// Invalid first character.
//
return illegalCharInIdentifier(ansiIdent, i, countOfRemoved);
}
if (isLatin1)
{
char utf8Buf[SMAX+1];
char * p1stUnstranslatedChar = NULL;
UInt32 utf8StrLenInBytes = 0;
UInt32 charCount = 0;
Int32 returnCode = LocaleToUTF8(cnv_version1
, (const char*) latin1Buf
, (const Int32) len
, (const char*) utf8Buf
, (const Int32) SMAX+1
, cnv_ISO88591
, p1stUnstranslatedChar // char * & first_untranslated_char
, &utf8StrLenInBytes // unsigned int * output_data_len_p
, (const Int32)TRUE // const int addNullAtEnd_flag
, &charCount // unsigned int * translated_char_cnt_p
);
// Note that utf8StrLenInBytes includes the NULL terminator added at the end
// (addNullAtEnd_flag was set to TRUE in the above call).
// ComASSERT(returnCode == 0);
ansiIdent = utf8Buf;
}
} // end REGULAR identifier
else {
UInt32 state = (isDquote ? 1 : 3);
ansiIdent.remove(0,1); // remove initial dquote
countOfRemoved++;
const char *sptr = ansiIdent.data();
len = ansiIdent.length();
if (len <= 1) // A delimited ident must contain at least one character
return -3004; // plus an ending double-quote.
i = 0;
unsigned char c = (unsigned char)ansiIdent[i];
// Kludge NLS Notes: When \ character is the first character in a
// Japanese multibyte identifier, it is really
// the standalone one-byte character and is not
// the lower byte of a multibyte character. In
// Shift-JIS character set, the \ backslash is
// actually displayed as and representing the
// Japanese Yen money unit symbol.
//
// The $ character is guaranteed to be the
// one-byte standalone character in Shift-JIS
// and all other character sets that are
// supersets of the 7-bit ASCII character set.
// "\SYS.$VOL" -- special handling because '\' and '$' are not Ansi-special
if ((c == '\\') ||
(c == '$' && ((pv_flags & NASTRING_ALLOW_NSK_GUARDIAN_NAME_FORMAT) != 0)
&& ((pv_flags & NASTRING_DELIM_IDENT_WITH_DOLLAR_PREFIX) == 0))) {
// For now, allow Guardian stlye names in ANSI mode as well. This was the
// old behavior since this method was not being called for Guardian names.
// The MX Reference manual is also a bit ambiguous in this regard.
// Need to get a resolution on this issue soon. RMW 11-20-2000.
//
// if ((c == '\\' || c == '$') &&
// (!SqlParser_Initialized() || SqlParser_NAMETYPE == DF_NSK)) {
//
// User can enter \ or $ at beginning of ident *only* if NAMETYPE NSK;
// if SHORTANSI, their input is Ansi only (no \ or $),
// and it is only the *output*, after SHORTANSI name resolution,
// that may contain \ and $.
//
//## We should really call the left-to-right ComMPLoc ctor here,
//## allowing only a valid MP format
//## (one of ComMPLoc::SYS, VOL, or FILE).
//
// Allow the patterns:
// \[A-Z][A-Z0-9]*.$[A-Z][A-Z0-9]*
// $[A-Z][A-Z0-9]*
//
i++;
if (c == '\\') {
// Must start with an ascii char.
//
if((i >= len) || (NOT isUpperIsoMapCS((unsigned char)ansiIdent[i++]))) {
return illegalCharInIdentifier(ansiIdent, i - 1, countOfRemoved);
}
while ((i < len) &&
(isUpperIsoMapCS((unsigned char)ansiIdent[i]) ||
isDigit8859_1((unsigned char)ansiIdent[i]))) {
i++;
}
// Expecting a ".$"
//
if((i >= len) || ((unsigned char)ansiIdent[i++] != '.')) {
return illegalCharInIdentifier(ansiIdent, i - 1, countOfRemoved);
}
if((i >= len) || ((unsigned char)ansiIdent[i++] != '$')) {
return illegalCharInIdentifier(ansiIdent, i - 1, countOfRemoved);
}
}
// After the '$'
//
while ((i < len) &&
(isUpperIsoMapCS((unsigned char)ansiIdent[i]) ||
isDigit8859_1((unsigned char)ansiIdent[i]))) {
i++;
}
// Expecting a '"' character in the last position.
//
if (((unsigned char)ansiIdent[i] != '"') || (i != len - 1)) {
return illegalCharInIdentifier(ansiIdent, i, countOfRemoved);
}
ansiIdent.remove(i, 1);
countOfRemoved++;
} else
{
//## [RW memleak -- should replace ansiIdent[i] with sptr references,
//## refreshing sptr after every remove() or other length modification..]
while (i < ansiIdent.length()) {
unsigned char c = (unsigned char)ansiIdent[i];
if (i == 0) { // first character
if ( ( c == '^' AND NOT acceptCircumflex )
// ---- Allow $ to appear in delimited identifers to support routine action names.
// OR ( c == '$' AND ((pv_flags & NASTRING_DELIM_IDENT_WITH_DOLLAR_PREFIX) == 0) )
)
return illegalCharInIdentifier(ansiIdent, i, countOfRemoved);
if ( NOT Get_SqlParser_Flags(ALLOW_FUNNY_IDENTIFIER) AND
isDelimitedIdentifierReservedForInternalUse(ansiIdent.data(),
ansiIdent.length()) ) {
for ( ; i <= ansiIdent.length(); i++ ) { // look for the first '@'
if ( ansiIdent[i] == NON_SQL_TEXT_CHAR )
break;
} // for
return illegalCharInIdentifier(ansiIdent, i, countOfRemoved);
} // if funny identifier not allowed and specified name is funny
} // if is first character in name
// Notes: The following logic will not mess up our
// Japanese customer's ANSI SQL names. All 32
// control characters, i.e., single-byte values
// ranges from 0x00 through 0x1F inclusively, are
// standalone characters in Shift-JIS character set
// and any character sets that are supersets of the
// 7-bit ASCII character set. The Tab character is
// a control character. Just keep the current
// behavior unless our Japanese customers complain
// about this "tab to space" conversion.
//
if (isSpace8859_1(c) && c != ' ') {
ansiIdent[i] = ' '; // tab becomes space
c = ' '; // tab is now space
}
if (NOT isAlNumIsoMapCS(c)) {
//
// Notes: '/' is guaranteed to always be a
// single-byte standalone character
// in any multibyte character sets
// so it is okay to disallow it; our
// Japanese customer will not complain.
//
// JC: Fix genesis solution 10-040304-3817
// Don't allow '/' in a delimited identifier
//
// ### SAP POC ### 11/21/2008 ### BEGIN
// We now accept the forward slash in delimited names as
// required by SAP POC. Comment out the following 3 lines of code.
//
// ### if (c == '/') {
// ### return illegalCharInIdentifier(ansiIdent, i, countOfRemoved);
// ### }
//
// ### SAP POC ### 11/21/2008 ### END
//
// Notes: The restriction of @ and \ in
// delimited names has been loosen.
//
} // if (NOT isAlNumIsoMapCS(c))
switch (state) {
case 1:
if (c == '"') {
ansiIdent.remove(i,1);
countOfRemoved++;
state = 2;
} else
i++;
break;
case 2:
if (c == '"')
state = 1;
else if (c != ' ') // tab became space
return illegalCharInIdentifier(ansiIdent, i, countOfRemoved);
i++;
break;
case 3:
if (c == '`') {
ansiIdent.remove(i,1);
countOfRemoved++;
state = 4;
} else
i++;
break;
case 4:
if (c == '`')
state = 3;
else if (c != ' ') // tab became space
return illegalCharInIdentifier(ansiIdent, i, countOfRemoved);
i++;
break;
default:
ComASSERT(FALSE);
} // switch
} // while
if ((isDquote && (state != 2)) ||
(NOT isDquote && (state != 4)))
return illegalCharInIdentifier(ansiIdent, i, countOfRemoved);
// ANSI 5.2 SR 13 + 14 and 8.2 SR 3a say that trailing spaces
// are insignificant in equality-testing of identifiers, so
// remove them. NB: length() and resize(i) have i one greater
// than operator[i] positions.
//
// This loop transforms '" ABC " ' into ' ABC'.
// We must check that '" " ' is rejected as an empty string. //"
//
NABoolean empty = TRUE;
for (i = ansiIdent.length(); i > 0; ) {
--i;
if (ansiIdent[i] != ' ') { // tab became space
empty = FALSE;
break;
}
}
if (empty)
return -3004; // A delimited ident must contain at least one character
ansiIdent.resize(++i);
}
} // end DELIMITED identifier
if (!Get_SqlParser_Flags(ALLOW_FUNNY_IDENTIFIER))
{
if (ansiIdent.length() > ComMAX_1_PART_INTERNAL_UTF8_NAME_LEN_IN_BYTES)
return -3118; // Identifier too long.
// allocate plenty of room to avoid buffer overrun
NAWchar internalNameInUCS2[ComMAX_1_PART_INTERNAL_UTF8_NAME_LEN_IN_BYTES + 1 + 16];
internalNameInUCS2[0] = NAWCHR('\0');
Int32 iErrorCode =
ComAnsiNameToUCS2 ( (const char *) ansiIdent.data() // in - const char *
, (NAWchar *) internalNameInUCS2 // out - NAWchar * outBuf
, (Int32) (ComMAX_1_PART_INTERNAL_UTF8_NAME_LEN_IN_BYTES + 1 + 8) // in - outBufSizeInNAWchars
, FALSE // do not fill the remainder of the output buffer with spaces
);
if (iErrorCode != 0 || NAWstrlen(internalNameInUCS2) == 0)
return -13001; // An internal error occurred. The SQL statement could not be translated.
if (NAWstrlen(internalNameInUCS2) > ComMAX_1_PART_INTERNAL_UCS2_NAME_LEN_IN_NAWCHARS)
return -3118; // Identifier too long.
}
return 0; // no error
} // ToInternalIdentifier
// ---------------------------------------------------------------------
// Converted the external-format (quoted) string literal used by the
// user to to the internal-format string used by the user. This routine
// assumes that the syntax of the input external-format string literal
// is already valid so it does not perform any checking
// ---------------------------------------------------------------------
#if 0 /* Needed for possible future enhancement -- see caller in CatRoutinePassThroughParamList.cpp */
void ToInternalString(NAString &internalStr, const NAString &quotedStr)
{
const char *extStr = quotedStr.data();
ComASSERT(strlen(extStr) >= 2 AND
extStr[0] EQU '\'' AND
extStr[strlen(extStr) - 1] EQU '\'');
internalStr = "";
if (strlen(extStr) EQU 2) return;
for (StringPos i = 1, j = 0; i < strlen(extStr) - 1; i++, j++)
{
internalStr[j] = extStr[i];
if (internalStr[j] EQU '\'')
{
i++;
ComASSERT(extStr[i] EQU '\'');
}
}
}
#endif
// ---------------------------------------------------------------------
// Converted the internal-format string literal used by the parser to
// the external-format (quoted) string used by the user. This routine
// assumes that the syntax of the input internal-format string is
// already valid so it does not perform any checking. The default behavior
// is to turn each single-quote (') into a double single-quote ('') and enclose
// the entire string in single quotes ('....'). Pass in FALSE as the third
// parameter to duplicate existing single quotes without enclosing the entire
// string in single quotes.
// ---------------------------------------------------------------------
void ToQuotedString( NAString &quotedStr
, const NAString &internalStr
, NABoolean encloseInQuotes )
{
if (encloseInQuotes) quotedStr = '\'';
for (StringPos i = 0; i < internalStr.length(); i++)
{
quotedStr += internalStr[i];
if (internalStr[i] EQU '\'') quotedStr += '\'';
}
if (encloseInQuotes) quotedStr += '\'';
}
// ---------------------------------------------------------------------
// bsearchStrcmp() is used by bsearch() within tokIsFuncOrParenKeyword()
// to compare two strings.
static Int32 bsearchStrcmp(const void *s1, const void *s2)
{
return (strcmp((char*)s1, *((char**)s2)));
}
// Used by PrettifySqlText() -- depends on its having upcased unquoted tokens.
static NABoolean tokIsFuncOrParenKeyword(const NAString &sqlText,
size_t pos, size_t prevpos)
{
NAString tok(" "); // space in front
tok += &sqlText.data()[pos];
ComASSERT(tok[tok.length()-1] == ' '); // and space after
if (tok == " AND " || tok == " OR ")
return FALSE;
// Derived table correlation names are not keywords, but we want to treat
// them like a paren-keyword (no space between word and lparen):
// SELECT COUNT(*) FROM (SELECT ...) corr(colRename,...)
// SELECT COUNT(*) FROM (SELECT ...) AS corr(colRename,...)
// But we want a space in this context:
// CREATE VIEW vw(col) AS (SELECT ...);
if (pos >= 2 && sqlText[pos - 1] == ' ')
{
if (sqlText[pos - 2] == ')' && tok != " FROM " && tok != " AS ")
return TRUE;
if (pos >= 5)
if (sqlText[pos - 5] == ')' &&
sqlText[pos - 4] == ' ' &&
sqlText[pos - 3] == 'A' &&
sqlText[pos - 2] == 'S')
return TRUE;
}
// Ansi reserved-word function names, tandem-extensions, and other keywords.
// These must be in alphabetical order. Order is checked for DEBUG builds.
// There must also be a trailing space for each keyword.
// Stored procedure names (e.g. EXPLAIN) are deliberately not in this list
// nor are such tokens as CHECK, PRIMARY KEY, REFERENCES, VALUES, ...
// Some expression names (e.g. CASE, COALESCE, NULLIF) are.
//
static const char *keywords[] =
{
"ABS ", // Tandem-extension
"ACOS ", // Tandem-extension
"ASC ", // Collation name
"ASCII ", // Tandem-extension
"ASIN ", // Tandem-extension
"ATAN ", // Tandem-extension
"ATAN2 ", // Tandem-extension
"AVG ", // ANSI
"BIT ", // Datatype with scales/precisions/length
"BIT_LENGTH ", // ANSI
"CASE ", // ANSI
"CAST ", // ANSI
"CEILING ", // Tandem-extension
"CHAR ", // Datatype with scales/precisions/length
"CHARACTER ", // Datatype with scales/precisions/length
"CHARACTER_LENGTH ", // ANSI
"CHAR_LENGTH ", // ANSI
"COALESCE ", // ANSI
"CODE_VALUE ", // Tandem-extension
"CONCAT ", // Tandem-extension
"CONVERT ", // ANSI
"CONVERTFROMHEX ", // Tandem-extension
"CONVERTTIMESTAMP ", // Tandem-extension
"CONVERTTOHEX ", // Tandem-extension
"COS ", // Tandem-extension
"COSH ", // Tandem-extension
"COUNT ", // ANSI
"CRC32 ", // Trafodion extension
"CURDATE ", // Tandem-extension
"CURRENT ", // ANSI
"CURRENT_DATE ", // ANSI
"CURRENT_TIME ", // ANSI
"CURRENT_TIMESTAMP ", // ANSI
"CURRENT_USER ", // ANSI
"CURTIME ", // Tandem-extension
"DATEFORMAT ", // Tandem-extension
"DAY ", // Datatype with scales/precisions/length
"DAYNAME ", // Tandem-extension
"DAYOFMONTH ", // Tandem-extension
"DAYOFWEEK ", // Tandem-extension
"DAYOFYEAR ", // Tandem-extension
"DEC ", // Datatype with scales/precisions/length
"DECIMAL ", // Datatype with scales/precisions/length
"DEGREES ", // Tandem-extension
"DESC ", // Collation name
"ENCODE_KEY ", // Tandem-extension
"EXP ", // Tandem-extension
"EXTEND ", // ANSI
"EXTERNAL ", // Collation name
"EXTRACT ", // ANSI
"FIRSTDAYOFYEAR ", // Tandem-extension
"FLOAT ", // Datatype with scales/precisions/length
"FLOOR ", // Tandem-extension
"GROUP_CONCAT", // MySQL-extension
"HASHPARTFUNC ", // Tandem-extension
"HOUR ", // Datatype with scales/precisions/length
"JSON_OBJECT_FIELD_TEXT" //json_object_field_text
"JULIANTIMESTAMP ", // Tandem-extension
"LCASE ", // Tandem-extension
"LOCATE ", // Tandem-extension
"LOG ", // Tandem-extension
"LOG10 ", // Tandem-extension
"LOWER ", // ANSI
"LPAD ", // Tandem-extension
"LTRIM ", // Tandem-extension
"MAX ", // ANSI
"MD5 ", // Trafodion extension
"MIN ", // ANSI
"MINUTE ", // Datatype with scales/precisions/length
"MOD ", // Tandem-extension
"MONTH ", // Datatype with scales/precisions/length
"MONTHNAME ", // Tandem-extension
"NCHAR ", // Datatype with scales/precisions/length
"NOW ", // Tandem-extension
"NULLIF ", // ANSI
"NUMERIC ", // Datatype with scales/precisions/length
"OCTET_LENGTH ", // ANSI
"OS_USERID ", // Tandem-extension
"PI ", // Tandem-extension
"PIC 9 ", // Cobol datatype directly supported by SQLMX DDL
"PICTURE 9 ", // Cobol datatype directly supported by SQLMX DDL
"POSITION ", // ANSI
"POWER ", // Tandem-extension
"QUARTER ", // Tandem-extension
"RADIANS ", // Tandem-extension
"RAND ", // Tandem-extension
"REPEAT ", // Tandem-extension
"ROUND ", // Tandem-extension
"ROUNDROBINPARTFUNC ", // Tandem-extension
"RPAD ", // Tandem-extension
"RTRIM ", // Tandem-extension
"SECOND ", // Datatype with scales/precisions/length
"SESSION_USER ", // ANSI
"SHA ", // Trafodion extension
"SHA1 ", // Trafodion extension
"SHA2 ", // Trafodion extension
"SIGN ", // Tandem-extension
"SIN ", // Tandem-extension
"SINH ", // Tandem-extension
"SQRT ", // Tandem-extension
"STDDEV ", // Tandem-extension
"SUBSTRING ", // ANSI
"SUM ", // ANSI
"SYS_GUID ", // Oracle-extension
"TAN ", // Tandem-extension
"TANH ", // Tandem-extension
"TIME ", // Datatype with scales/precisions/length
"TIMESTAMP ", // Datatype with scales/precisions/length
"TRANSLATE ", // ANSI
"TRIM ", // ANSI
"TRUNCATE ", // Tandem-extension
"UCASE ", // Tandem-extension
"UPPER ", // ANSI
"UPSHIFT ", // ANSI
"USER ", // ANSI
"VARCHAR ", // Datatype with scales/precisions/length
"VARIANCE ", // Tandem-extension
"VARNCHAR ", // Datatype with scales/precisions/length
"VARYING ", // Datatype with scales/precisions/length
"WEEK ", // Tandem-extension
"YEAR ", // Datatype with scales/precisions/length
};
#ifdef _DEBUG
// Only check the order of the above keywords in debug mode.
static NABoolean checked_order = FALSE;
if (!checked_order)
{
for (Int32 i = 1; i < (sizeof(keywords) / sizeof(keywords[0])); i++)
{
if (::strcmp(keywords[i], keywords[i - 1]) <= 0)
{
char err_buf[128];
sprintf(err_buf, "keywords %s and %s are out of order",
keywords[i], keywords[i-1]);
ABORT(err_buf);
}
}
checked_order = TRUE;
}
#endif
// Return true if this is a keyword
if (bsearch(tok.data() + 1, keywords, (sizeof(keywords) / sizeof(keywords[0])),
sizeof(char*), bsearchStrcmp))
return TRUE;
// PICTURE or PIC (Cobol datatype directly supported by SQLMX DDL).
if (prevpos) prevpos--;
NAString prevtok(&sqlText.data()[prevpos]);
NABoolean pic = FALSE;
if (prevtok.length() > 9)
{
prevtok.remove(9);
pic = prevtok == " PICTURE ";
}
if (!pic && prevtok.length() > 5)
{
prevtok.remove(5);
pic = prevtok == " PIC ";
}
if (pic)
{
if (tok == " B " || tok == " X " ||
tok == " S9 " || tok == " S 9 " || tok == " SV9 " || tok == " V9 ")
return TRUE;
}
if (tok == " V9 ") // PICTURE S V9(nnn)
{
if (prevtok.length() > 3)
{
prevtok.remove(3);
if (prevtok == " S ")
{
if (prevpos >= 8)
{
prevtok = &sqlText.data()[prevpos-8];
prevtok.remove(9);
if (prevtok == " PICTURE ") return TRUE;
}
if (prevpos >= 4)
{
prevtok = &sqlText.data()[prevpos-4];
prevtok.remove(5);
if (prevtok == " PIC ") return TRUE;
}
}
}
}
return FALSE;
}
// This is cloned from sqlcomp/parser.C's stringScanWillTerminateInParser,
// though it serves a different purpose.
// It compresses multiple blanks into a single space, and optionally uppercases
// (it does not, of course, do these things within quoted text, for either
// ' or " quoting). //"
//
Lng32 PrettifySqlText(NAString &sqlText, const char *nationalCharSetName)
{
#define prevResultChar (result.length() ? result[result.length() - 1] : '\0')
#define prevResultCharIs(c) (prevResultChar == c)
// Either this is NOT passed in (null pointer), OR it has the Ansi character
// for charset introducer, the underscore.
ComASSERT(!nationalCharSetName || *nationalCharSetName == '_');
NAString result;
enum TokType { SPACE, ALPHA, DELIM, DIGIT, LPAREN, PUNC, UNARYOP }
toktype = SPACE, prevtoktype = SPACE;
char prev = ' '; // will remove leading blanks
char quote_seen = '\0';
size_t alphapos = 0, prevalphapos = 0;
for (const char *s = sqlText.data(); *s; s++)
{
char curr = *s;
if (quote_seen)
if (*s == quote_seen)
quote_seen = '\0';
else
{ /*consume quoted character*/ }
else if (*s == '\'' || *s == '"')
{
quote_seen = *s;
if (toktype != DELIM) // initial, not embedded, quote
{
// Put a space in front of initial quote,
// unless it is a national, bit, or hex string literal (Ansi 5.3)
if (prev != ' ')
if (*s == '"')
result.append(" ");
else if (prev != '_' && strchr(specialSQL_TEXT, prev))
result.append(" ");
else if (prev == 'N' && nationalCharSetName)
if (result.length() >= 2)
if (result[result.length()-2] == ' ' ||
result[result.length()-2] == '(') {
// Here we have ' N' or '(N' preceding an initial squote.
// Replace the N with the actual cs name
// (which the caller must provide with the correct
// underscore introducer, e.g. "_KANJI").
result.remove(result.length() - 1);
result.append(nationalCharSetName);
}
toktype = DELIM; // delim-ident or string literal...
}
}
else if (isSpace8859_1((unsigned char)*s))
curr = ' '; // convert unquoted tab/newline to space
if (quote_seen || *s == '\'' || *s == '"') // in quotes or on ending quote
{
result.append(s, 1);
}
else if (curr == ' ')
{
if (prev == ' ')
{ /*throw away the subsequent spaces; remain in whatever toktype*/ }
else
{
result.append(" "); // append ourself, a space
prevtoktype = toktype;
toktype = SPACE;
}
}
else // unquoted and not a space
{
TokType efftoktype = toktype != SPACE ? toktype : prevtoktype;
// Put a space before the first letter of an identifier/hostvar/param,
// before the first digit of a number (but not a digit in an ident),
// before the first lparen of a series of lparens.
//
NABoolean isInLatin1ExtendedHalf = FALSE;
NAWchar tmpBuf[10];
if ((UInt32)curr >= 0x80) // is not a 7-bit ASCII character
{
char * p1stUnstranslatedChar = NULL;
UInt32 iOutLenInBytesIncludingNull = 0;
UInt32 iTranslatedCharCount = 0;
Int32 cnvErrStatus = LocaleToUTF16
( cnv_version1 // in - const enum cnv_version version
, s // in - const char *in_bufr
, strlen(s) // in - const int in_len
, (const char *) tmpBuf // out - const char *out_bufr
, 10*BYTES_PER_NAWCHAR // in - const int out_bufr_size_in_bytes
, cnv_UTF8 // in - enum cnv_charset charset of source
, p1stUnstranslatedChar // out - char * & first_untranslated_char
, &iOutLenInBytesIncludingNull // out - unsigned int *output_data_len_p
, 0 // in - const int cnv_flags
, (Int32) TRUE // in - const int addNullAtEnd_flag
, &iTranslatedCharCount // out - unsigned int * translated_char_cnt_p
, 1 // in - unsigned int max_chars_to_convert
);
// NOTE: No errors should be possible -- string has been converted before.
// ComASSERT(cnvErrStatus == 0 && iTranslatedCharCount == 1);
if (cnvErrStatus EQU 0 AND (UInt32)tmpBuf[0] >= 0x80 AND (UInt32)tmpBuf[0] <= 0xFF)
{
isInLatin1ExtendedHalf = TRUE;
curr = (unsigned char)tmpBuf[0];
s++; // The character stored in curr requires two bytes in UTF-8 encoding value
}
}
if (isAlphaIsoMapCS((unsigned char)curr) || curr == '_' || curr == ':' ||
curr == '?' || curr == '$' || curr == '\\') // non-Ansi '\nsk.$vol' extension
{
if (toktype != ALPHA)
{
toktype = ALPHA;
// 123.E+10 and 123. E+10 are numeric.
// ABC.E+10 is ABC.E + 10 (efftoktype is alpha, thus E is).
// Note: need to test both prevResultChar and efftoktype here
// since rparen sets what can become our efftoktype to DIGIT;
// probably unnecessary since
// A)E+10 is not legal (derived-table-rename plus number?).
if (curr == 'E' || curr == 'e')
if (s[1] == '+' || s[1] == '-' || isDigit8859_1((unsigned char)s[1]))
if (isDigit8859_1((unsigned char)prevResultChar) || prevResultChar == '.')
if (efftoktype == DIGIT)
{
// Avoid getting into toktype PUNC next loop iter:
if (s[1] == '+' || s[1] == '-')
{
result.append("E");
curr = *++s;
}
toktype = DIGIT;
}
if (toktype == ALPHA)
{
if (prev != ' ') result.append(" ");
prevalphapos = alphapos;
alphapos = result.length();
}
}
}
else if (isDigit8859_1((unsigned char)curr))
{
if (toktype != DIGIT && toktype != ALPHA)
{
if (prev != ' ') result.append(" ");
toktype = DIGIT;
}
}
else if (curr == '(')
{
if (toktype != LPAREN)
{
if (prev != ' ') result.append(" ");
toktype = LPAREN;
if (prevResultCharIs(' ')) // note: wrong to test (prev==' ')
if (tokIsFuncOrParenKeyword(result, alphapos, prevalphapos))
result.remove(result.length() - 1);
}
}
else if (curr == '.')
{
if (efftoktype == ALPHA || efftoktype == DELIM)
{
if (prevResultCharIs(' ')) // note: wrong to test (prev==' ')
result.remove(result.length() - 1);
}
else if (toktype != DIGIT) // not efftoktype!
{
if (prev != ' ') result.append(" ");
toktype = DIGIT;
}
}
else if (curr == ')' || curr == ',' || curr == ';')
{
// Remove any preceding space for this kind of punctuation.
if (prevResultCharIs(' ')) // note: wrong to test (prev==' ')
result.remove(result.length() - 1);
// Set prevtoktype for UNARYOP determination.
// Note that input of (alpha)-1,(digit)+-1,isp()- -1
// will thus display as (alpha) - 1, (digit) + -1, isp () - -1
prevtoktype = curr == ')' ? DIGIT : SPACE;
toktype = SPACE;
}
else
{
// Currently not dealing with Sql characters '&', '[', ']'
// because we don't expect them to appear in the text this
// procedure ever encounters.
if (toktype != PUNC)
{
// Put a space before first of a new series of punc
// and before subsequent unary minuses in a series of uminus
// (i.e. "x + - -y", not "x + --y", as "--" is Sql comment).
// Note: wrong to test (prev=='-'); see curr/prev set below.
if (prev != ' ' ||
(curr == '-' && prevResultCharIs('-')))
result.append(" ");
// '+ or -' is unary if it follows a reserved word, a comma,
// an LPAREN (but not an rparen!), or any PUNC or UNARYOP.
// E.g., select +1 + -2 from t where -3 < -col - (-5 + 6) - 7;
if (curr == '+' || curr == '-')
{
if (efftoktype == ALPHA)
{
NAString tok;
tok += &result.data()[alphapos];
tok[tok.length()-1] = 0;
toktype = IsSqlReservedWord(tok.data()) ? UNARYOP : PUNC;
}
else
toktype = (efftoktype != DELIM && efftoktype != DIGIT) ?
UNARYOP : PUNC;
}
else
{
toktype = PUNC;
// If "CREATE ... LOCATION /G/directory ...",
// need to treat from the slash thru the next space
// as if they were quoted -- don't insert spaces
// and don't uppercase.
if (curr == '/' && efftoktype == ALPHA)
{
NAString tok(&result.data()[alphapos]);
if (tok == "LOCATION ")
{
tok = result;
tok.remove(7);
if (tok == "CREATE ")
quote_seen = ' '; // will consume till ' '
}
}
}
}
else if (curr == '+' || curr == '-')
{
// Put a space before a unary operator.
if (!prevResultCharIs(' ')) result.append(" ");
toktype = UNARYOP;
}
}
// Preceding spaces have been dealt with; now add the current char.
if (isInLatin1ExtendedHalf)
{
NAString ns(curr);
ns.toUpper8859_1();
curr = (char)ns.data()[0];
char utf8Buf[20];
char * p1stUnstranslatedChar = NULL;
UInt32 utf8OutLenInBytesIncludingNull = 0;
UInt32 charCount = 0;
Int32 returnCode = LocaleToUTF8
( cnv_version1
, (const char*) ns.data() // source
, (const Int32) ns.length() // source len in bytes - should be 1
, (const char*) utf8Buf // output buffer for target
, (const Int32) 20 // output buffer size in bytes
, cnv_ISO88591 // source char set
, p1stUnstranslatedChar // char * & first_untranslated_char
, &utf8OutLenInBytesIncludingNull // unsigned int * output_data_len_p in bytes including '\0' terminator
, (const Int32)TRUE // const int addNullAtEnd_flag
, &charCount // unsigned int * translated_char_cnt_p - should be 1
);
ComASSERT(returnCode == 0 && charCount == 1 && utf8OutLenInBytesIncludingNull == 3);
// Exclude the NULL terminator added at the end (addNullAtEnd_flag was set to TRUE in the above call)
// from the count.
UInt32 utf8StrLenInBytes = 0;
if ((Int32)utf8OutLenInBytesIncludingNull >= CharInfo::minBytesPerChar(CharInfo::UTF8))
utf8StrLenInBytes = utf8OutLenInBytesIncludingNull - CharInfo::minBytesPerChar(CharInfo::UTF8);
ComASSERT(utf8StrLenInBytes > 0);
result.append(utf8Buf, utf8StrLenInBytes);
}
else if ((UInt32)curr < 0x80) // is a 7-bit ASCII character
{
curr = (char)toupper(curr);
result.append(&curr, 1);
}
// This kind of punctuation does not want spaces following it --
// except, do not collapse "* *" (as in "CONTROL TABLE * * RESET;")
// into "**" exponentiation operator.
// (Note that '?' does not appear here: Tdm named parameters
// must have the name immediately following the '?', to distinguish
// from Ansi unnamed params. Also, Tdm '\nsk.$vol' punc is never
// allowed with trailing spaces on input, so no need here to
// scan ahead and remove it.)
// Remain in same toktype (space/lparen/alpha/etc).
if (curr == '.' || curr == '(' || curr == ':' ||
toktype == PUNC || toktype == UNARYOP)
{
if (curr != '*')
while (isSpace8859_1((unsigned char)s[1])) s++; // throw away following spaces
// Set curr to space, which next sets prev to space, which
// in the next loop iter will prevent a space from being appended.
// Thus, CAT.SCH.TBL.FLTCOL > 1.5 + -7 will display correctly.
if (toktype != PUNC) curr = ' ';
}
} // unquoted and not a space
prev = curr;
} // loop over sqlText
sqlText = result;
if (quote_seen) {
return -15005; // Unmatched quote
}
size_t len = sqlText.length();
if (len) {
len--;
if (sqlText[len] == ' ') sqlText.remove(len--); // trim final space
// These lines commented out because space already removed before ';' above...
// if (sqlText[len] == ';' && len--)
// if (sqlText[len] == ' ') sqlText.remove(len,1); // trim space before ';'
}
return 0; // no error
} // PrettifySqlText
// SQL/MX Regression Test Support
// Calculate an increased max output line length to accommodate schema names
// longer than 'SCH', that are used when regression test suites are executed
// concurrently. The number of characters that schemaName is longer than
// 'SCH' is referred to as the "excess character count" in the following
// description. The increase beyond maxLineLen is calculated as the excess
// character count times the number of times schemaName occurs in the search
// substring of sqlText starting at pos. The search substring length is
// increased by the excess character count each time schemaName is found in
// the search substring.
//
size_t adjustedMaxLen(const NAString &sqlText, size_t pos, size_t maxLineLen,
const char *schemaName)
{
const size_t SCH_LEN = 3; // Number of chars in 'SCH'
size_t schemaNameLen = schemaName ? strlen(schemaName) : 0;
if (schemaNameLen > SCH_LEN)
{
size_t excessCharCount = schemaNameLen - SCH_LEN;
size_t sqlTextLen = sqlText.length();
size_t maxSearchPos = pos + maxLineLen - 1;
size_t occurs = 0;
while (TRUE)
{
if (maxSearchPos >= sqlTextLen)
maxSearchPos = sqlTextLen - 1;
pos = sqlText.index(schemaName, pos, NAString::ignoreCase);
if ((pos == NA_NPOS) || (pos > maxSearchPos))
break;
occurs++;
pos += schemaNameLen;
maxSearchPos += excessCharCount;
}
return maxLineLen + (occurs * excessCharCount);
}
else
return maxLineLen;
}
// Called by SHOWDDL command (CmpDescribe.C).
// Inserts linebreaks at word boundaries in order to keep tokens whole --
// to make it easier for a user to cut SHOWDDL output text and paste it
// into SQLCI as a new command.
//
// The optional schemaName argument provides SQL/MX regression test
// support. When a schemaName is provided, the maximum output line
// length is adjusted so that lines are broken in the same logical place
// they would be if the deafult schema was 'SCH'.
//
size_t LineBreakSqlText(NAString &sqlText,
NABoolean showddlView,
size_t maxlen,
size_t pfxlen,
size_t pfxinitlen,
char pfxchar,
const char * schemaName,
NABoolean commentOut)
{
if (commentOut && pfxchar != '-')
{
// Make sure that the prefixes have enough room
// for the leading "--" comment prefix.
if (pfxlen < 2)
pfxlen += (2 - pfxlen);
if (pfxinitlen < 2)
pfxinitlen += (2 - pfxinitlen);
}
// The initial line can be indented differently from subsequent lines.
if (maxlen == 0 || maxlen <= pfxlen) maxlen = pfxlen + 1;
size_t maxinitlen = (maxlen <= pfxinitlen) ? pfxinitlen + 1 : maxlen;
maxlen -= pfxlen;
maxinitlen -= pfxinitlen;
size_t maxcurrlen = adjustedMaxLen(sqlText, 0, maxinitlen, schemaName);
NAString result(pfxchar, pfxinitlen);
NAString pfx(pfxchar, pfxlen);
if (commentOut && pfxchar != '-')
{
result[(size_t)0] = '-';
result[(size_t)1] = '-';
pfx[(size_t)0] = '-';
pfx[(size_t)1] = '-';
}
pfx.prepend("\n");
NABoolean showddlViewAS = FALSE;
char quote_seen = '\0';
size_t cnt = 0, space = 0, dot[3]; // C.S.T.COL ref has max 3 dots
size_t sqlNextPos = 0;
dot[0] = 0;
dot[1] = 0;
sqlText += "\n"; // sentinel (newline, for "--")
for (const char *s = sqlText.data(); *s; s++)
{
if (quote_seen)
if (*s == quote_seen)
quote_seen = '\0';
else
{ /*consume quoted character*/ }
else if (*s == '\'' || *s == '"')
quote_seen = *s;
else if (*s == '-' && s[1] == '-') // SQL comment: "--" to eol
quote_seen = '\n';
result.append(s, 1);
cnt++;
sqlNextPos++;
if (!quote_seen)
if (isSpace8859_1((unsigned char)*s)) // sentinel ensures we get here
{
if (showddlView) // look for keyword "AS"
if ((space && result.length() - space == 3) ||
(!space && cnt == 3))
if (result[result.length()-3] == 'A' &&
result[result.length()-2] == 'S')
showddlViewAS = TRUE;
if (cnt < maxcurrlen)
{
space = result.length(); // thus space > pfxlen
}
else
{
if (cnt > maxcurrlen + 1 && space > 0) // linebreak on space
{
result.replace(space - 1, 1, pfx);
cnt = result.length() - space - pfxlen;
space = result.length();
Int32 i=0;
for (; i<3; i++)
if (!dot[i]) break;
else dot[i] += pfxlen; // replaced 1 with p+1
maxcurrlen = adjustedMaxLen(sqlText, sqlNextPos - cnt,
maxlen, schemaName);
// fall through to while loop
}
while (cnt > maxcurrlen + 1)
{
NABoolean dotfound = FALSE;
size_t dotdiff;
Int32 i = 0;
for (i=0; i<3; i++)
if (!dot[i]) break;
else
{
dotdiff = dot[i] - (result.length() - cnt);
if (dotdiff <= maxcurrlen) { dotfound = TRUE; break; }
}
if (dotfound) // linebreak after dot
{ // and don't forget to dot your i's!
result.insert(dot[i], pfx);
cnt -= dotdiff;
dot[i] = 0;
for ( ; i--; ) dot[i] += pfxlen + 1; // inserted p+1
}
else // linebreak after too-long unspaced token
{
result.remove(result.length() - 1);
result += pfx;
cnt = space = 0;
}
maxcurrlen = adjustedMaxLen(sqlText, sqlNextPos - cnt, maxlen,
schemaName);
} // while still too long
if (cnt >= maxcurrlen)
{
result.remove(result.length() - 1);
result += pfx;
cnt = space = 0;
maxcurrlen = adjustedMaxLen(sqlText, sqlNextPos, maxlen,
schemaName);
}
else if (cnt && result[result.length() - 1] == ' ')
space = result.length();
} // need to linebreak
dot[0] = 0;
if (showddlViewAS) // it was the keyword "AS"
{
size_t i;
for (i = result.length() - 1;
result[i] == ' ' || result[i] == pfxchar; i--)
result.remove(i);
if (result[i] != '\n') result += "\n"; // newline, no pfx chars
if (result.length() <= maxinitlen + 1)
pfxlen = pfxinitlen; // still the first line
NAString queryText(++s); // get past the current space
cnt = LineBreakSqlText(queryText, FALSE,
maxlen+pfxlen, pfxlen+4, pfxlen+2,
pfxchar, schemaName);
result += queryText;
s = &sqlText.data()[sqlText.length() - 1]; // will exit loop
}
} // unquoted space
else if ((*s == '.') &&
(!isDigit8859_1((unsigned char)s[1]))) // ignore dots in numeric constants
{
dot[2] = dot[1];
dot[1] = dot[0];
dot[0] = result.length();
} // unquoted dot
} // loop over sqlText
sqlText = result;
TrimNAStringSpace(sqlText, FALSE, TRUE); // remove trailing sentinel char
return cnt;
} // LineBreakSqlText
void GetSimplePosixFilename(NAString &filename, NABoolean doLower)
{
// Remove any preceding directory path
const char *fslash = strrchr(filename, '/'),
*bslash = strrchr(filename, '\\'),
*dirpathpunc;
if (fslash && bslash)
dirpathpunc = fslash > bslash ? fslash : bslash;
else
dirpathpunc = fslash ? fslash : bslash;
if (dirpathpunc) filename = ++dirpathpunc;
#ifdef NA_CASE_INSENSITIVE_FILENAMES
if (doLower) filename.toLower();
#endif
} // GetSimplePosixFilename
void FUNNY_ANSI_IDENT_REMOVE_PREFIX(NAString &str, const char *pfx)
{
// Say str is "PACKED__@T" and pfx is PACKED__@
str.remove(1, strlen(pfx)); // str is now "T" (dquotes kept)
ToInternalIdentifier(str); // str is now T
str = ToAnsiIdentifier(str); // str remains T
assert(!str.isNull());
}
NAString Latin1StrToUTF8(const NAString & latin1Str, NAMemory * heap)
{
if (latin1Str.isNull())
return NAString();
char buffer[3008]; // allocate a few extra bytes to make me feel better
char * target = &buffer[0];
bool isBufferAllocatedFromProcessHeap(FALSE);
Lng32 targetBufferLen = (Lng32)(latin1Str.length() * 4 /* SQL_UTF8_CHAR_MAXSIZE */ + 2);
if ( targetBufferLen > 3000 )
{
isBufferAllocatedFromProcessHeap = TRUE;
target = new (heap) char[targetBufferLen + 2]; // allocate a couple extra bytes ...
}
char * p1stUnstranslatedChar = NULL;
UInt32 utf8StrLenInBytes = 0;
UInt32 charCount = 0; // number of characters translated/converted
Int32 errorCode = LocaleToUTF8 ( cnv_version1
, latin1Str.data() // in - const char * srcStr
, (Int32)latin1Str.length() // in - const int srcStrLen
, (const char*)target // out - const char * bufferForTargetStr
, (Int32)targetBufferLen // in - const in targetBufferSizeInBytes
, cnv_ISO88591 // in - cnv_charset srcCharset
, p1stUnstranslatedChar // out - char* & first_untranslated_char
, &utf8StrLenInBytes // out - unsigned int * output_data_len_p
, (const Int32)TRUE // in - const int addNullAtEnd_flag
, &charCount // out - unsigned int * translated_char_cnt_p
);
// Exclude the NULL terminator added to the end (addNullAtEnd_flag was set to TRUE in the above call)
// from the count.
if ((Int32)utf8StrLenInBytes >= CharInfo::minBytesPerChar(CharInfo::UTF8))
utf8StrLenInBytes -= (UInt32)CharInfo::minBytesPerChar(CharInfo::UTF8);
else
utf8StrLenInBytes = 0;
NAString result;
if (utf8StrLenInBytes > 0)
result.append(target, (size_t)utf8StrLenInBytes);
if (isBufferAllocatedFromProcessHeap)
NADELETEBASIC(target, heap);
return result;
}
// -----------------------------------------------------------------------
// StatementHeap-related stuff
// -----------------------------------------------------------------------
static NAMemory *TheStatementHeap = NASTRING_UNINIT_HEAP_PTR;