blob: 0864a4e373c19293431e533120ef4d283c51e83a [file] [log] [blame]
/**********************************************************************
// @@@ START COPYRIGHT @@@
//
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
//
// @@@ END COPYRIGHT @@@
**********************************************************************/
#ifndef CHARINFO_H
#define CHARINFO_H
/* -*-C++-*-
*****************************************************************************
*
* File: CharInfo.h
* Description: The header file for the CharInfo class. This class defines
* and provides features/information about character sets,
* collations and coercibility that are supported by SQL/MX.
*
*
* Created: 7/8/98
* Modified: $Date: 2007/10/09 19:38:37 $ (GMT)
* Language: C++
* Status: $State: Exp $
*
*
*
*****************************************************************************
*/
#include "Platform.h"
#include "Collections.h" // for LIST in CollationDB
#include "NABoolean.h"
#include "Platform.h"
#include "NAWinNT.h"
#include "ComCharSetDefs.h"
#include "sql_charset_strings.h"
// Forward references
class ComMPLoc;
class QualifiedName;
class SchemaName;
// Contents of this file
class CharInfo;
class CollationInfo;
class CollationDB;
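// Notice how we currently equate SQL_TEXT with ISO88591.
// NOTE(review): the CharInfo::CharSet enum in this file defines
// SQL_TEXT = UNICODE, so this statement and the discussion below appear to
// describe the situation before that switch -- confirm and update.
// This is used by StaticCompiler.cpp, in the MODULE statement.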
//
// ***HOWEVER:*** ##
// Ansi 4.2 specifies that SQL_TEXT has to be the union of all char-sets
// implemented by an implementation. In our case, it should be UNICODE.
// So I am not sure if we should disallow 'MODULE...NAMES ARE SQL_TEXT'
// and instead promote '...NAMES ARE ISO88591' instead.
//
// Taking a UNICODE module can be done, except that we can not
// effectively spit out the error msgs as we do not know the locale
// into which error msg is converted
// (NT does not have a font that would cover all UNICODE characters!).
//
// Disallowing 'MODULE...NAMES ARE SQL_TEXT' anytime soon would cause
// disruption to our Beta sites.
// I propose that, whenever we do fully support a UNICODE module file,
// then we switch the CharInfo enum SQL_TEXT to equal UNICODE instead
// of ISO88591 (and deal with any disruption then...).
#undef UNICODE
// conflict with definition in file sqlmxevents/zmxc
#undef SQL_TEXT
#undef MAX_CHAR_SET_STRING_LENGTH
#define MAX_CHAR_SET_STRING_LENGTH 128 // keep in sync with w:/cli/sqlcli.h
// and with w:/common/ComSizeDefs.h
// ComMAX_ANSI_IDENTIFIER_INTERNAL_LEN_IN_NAWCHARS
#define MAXNPASSES 4 // Maximum number of Passes for any collation
// Per-collation parameter tables for the system collations.
// Both tables are indexed by CollationInfo::getCollationParamsIndex(), i.e.
// (collation - CharInfo::FIRST_SYS_COLLATION - 1), so slot 0 belongs to
// CharInfo::CZECH_COLLATION and slot 1 to CharInfo::CZECH_COLLATION_CI.
// A new system collation needs a new entry in each table.
//
// Number of passes per system collation (see MAXNPASSES for the upper bound).
static const short collationNPasses[] = {2,2};
// Value handed back by CollationInfo::getCollationMaxChar() per system collation.
static const unsigned char collationMaxChar[] = {0x81,0x81};
class CharInfo
{
public:
enum CharSet // keep in sync with w:/cli/sqlcli.h enum SQLCHARSET_CODE !
{ // and with the charset map table in CharInfo.cpp !
CHARSET_MIN = -2,
KSC5601_MP = -2, // an MX Unicode encoding could be named KSX5601.
KANJI_MP = -1, // logically equiv to SJIS, physically diff encoding.
// the defines below are defined in common/ComCharSetDefs.h, so
// that they can be used by other components (ODBC, utilities?)
// without sourcing in this file
UnknownCharSet = SQLCHARSETCODE_UNKNOWN, // 0
ISO88591 = SQLCHARSETCODE_ISO88591, // 1
ISO88592 = 2,
ISO88593 = 3,
ISO88594 = 4,
ISO88595 = 5,
ISO88596 = 6,
ISO88597 = 7,
ISO88598 = 8,
ISO88599 = 9,
SJIS = SQLCHARSETCODE_SJIS, // 10
UNICODE = SQLCHARSETCODE_UCS2, // 11
EUCJP = SQLCHARSETCODE_EUCJP, // 12
BIG5 = SQLCHARSETCODE_BIG5, // 13
GB18030 = SQLCHARSETCODE_GB18030, // 14
UTF8 = SQLCHARSETCODE_UTF8, // 15
KSC5601 = SQLCHARSETCODE_MB_KSC5601, // 16
GB2312 = SQLCHARSETCODE_GB2312, // 17
GBK = SQLCHARSETCODE_GBK, // 18
//
// synonyms
SQL_TEXT = UNICODE,
UCS2 = UNICODE,
DefaultCharSet = ISO88591,
CHARSET_MAX = 18,
// for internal use only
ISO_MAPPING_CODE = SQLCHARSETCODE_ISO_MAPPING // 9999
};
enum Collation { UNKNOWN_COLLATION = 0,
DefaultCollation = 1,
SJIS_COLLATION = SJIS, // to prevent coding bugs!// SJIIS= 10
FIRST_SYS_COLLATION =100, //system collations start at 101 to Last_sys_collation
CZECH_COLLATION =101,
CZECH_COLLATION_CI =102,
LAST_SYS_COLLATION ,
FIRST_USER_DEFINED_COLLATION = 1000
};
enum Coercibility { NO_COLLATING_SEQUENCE = 0, COERCIBLE, IMPLICIT, EXPLICIT
};
static CharSet getCharSetEnum(const char* name);
static const char* getCharSetName(CharSet cs,
NABoolean retUnkAsBlank = FALSE);
static NABoolean isCharSetSupported(CharSet cs);
static NABoolean isCharSetSupported(const char* name)
{ return isCharSetSupported(getCharSetEnum(name)); }
static NABoolean isCharSetFullySupported(CharSet cs);
static NABoolean isCharSetFullySupported(const char* name)
{ return isCharSetFullySupported(getCharSetEnum(name)); }
static NABoolean isOnlySingleByteCharacters(CharSet cs);
static NABoolean isOnlySingleByteCharacters(const char* name)
{ return isOnlySingleByteCharacters(getCharSetEnum(name)); }
static NABoolean isModuleCharSetSupported(CharSet cs)
{ return cs == CharInfo::ISO88591; };
static NABoolean isHexFormatSupported(CharSet cs);
static NABoolean isHexFormatSupported(const char* name)
{ return isHexFormatSupported(getCharSetEnum(name)); }
static NABoolean isTerminalCharSetSupported(CharSet cs);
static NABoolean isTerminalCharSetSupported(const char* name)
{ return isTerminalCharSetSupported(getCharSetEnum(name)); }
static NABoolean isMsgCharSetSupported(CharSet cs);
static NABoolean isMsgCharSetSupported(const char* name)
{ return isMsgCharSetSupported(getCharSetEnum(name)); }
static void toggleCharSetSupport(CharSet cs); // debugging only
static Int32 minBytesPerChar(CharSet cs);
static Int32 maxBytesPerChar(CharSet cs);
static Int32 bytesPerChar(CharSet cs)
{ return maxBytesPerChar(cs); }
static NABoolean isSingleByteCharSet(CharSet cs)
{ return maxBytesPerChar(cs) == 1
|| cs == CharInfo::UTF8 // is variable-length/width multi-byte char-set but treat it as a C/C++ string
; }
static NABoolean isVariableWidthMultiByteCharSet(CharSet cs);
static NABoolean is_NCHAR_MP(CharSet cs)
{ return cs == KANJI_MP || cs == KSC5601_MP; }
static Int32 getFSTypeFixedChar(CharSet cs);
static Int32 getFSTypeVarChar(CharSet cs);
static Int32 getFSTypeANSIChar(CharSet cs);
static const char* getReplacementCharacter(CharSet cs); // replacement for untranslatable chars
// get the encoding charset for a (logical/SQL) charset
static CharInfo::CharSet getEncoding(const CharInfo::CharSet);
static Collation getCollationEnum(const char *name,
NABoolean formatNSK = FALSE,
size_t maxlen = 0);
static const char* getCollationName(Collation co,
NABoolean retUnkAsBlank = FALSE);
static Int32 getCollationFlags(Collation co);
static NABoolean isCollationUserDefined(Collation co)
{ return co >= FIRST_USER_DEFINED_COLLATION; } // watch out for UNKNOWN_COLLATION!
static const char* getCoercibilityText(Coercibility ce);
static NABoolean compareCoercibility(Coercibility ce1, Coercibility ce2);
// check if the code point value for each character in the input string is
// a valid UCS2 character
static NABoolean checkCodePoint(const NAWchar *inputStr, Int32 inputLen, CharInfo::CharSet cs);
// Convert the int value returned by MBCS_DEFAULTCHARSET_()
// to MX enum value. See Guardian Procedure Calls Ref Manual.
static CharSet getCharSetEnumFromNSK_MBCS(Int32 n)
{
switch (n) {
case 1: return KANJI_MP;
case 12: return KSC5601_MP;
default: return UnknownCharSet; // we don't support other magic numbers
} // like Hangul, Big5, Chinese PC
}
// Obtain the character set locale information about locale machine.
// The result is one of the SQLCHARSETCODE_xxx values defined in SQLCLI.h,
// or "UNKNOWN" for a unknown locale.
// Used by the sql_id and Formatter classes.
static Lng32 findLocaleCharSet();
static const char* getLocaleCharSetAsString();
static Int32 getTargetCharTypeFromLocale();
// check whether the client character set (e.g., the cs of a hostvar) is
// assignment compatible with the MX one (e.g., the cs of a column).
static NABoolean isAssignmentCompatible(CharSet clientCS, CharSet mxCS)
{
return clientCS == mxCS ||
(clientCS == CharInfo::UNICODE && // relaxation
mxCS == CharInfo::ISO88591);
};
// for an arbitrary string encoded in "sourceCS", with length
// "sourceLenInBytes", what is the max. length in bytes of this
// string after converting it to "targetCS"?
static Int32 getMaxConvertedLenInBytes(CharSet sourceCS,
Int32 sourceLenInBytes,
CharSet targetCS);
static const CollationDB *builtinCollationDB();
private:
friend class CollationDB; // needs to access builtinCDB_
static const char* const localeCharSet_;
static const CollationDB *builtinCollationDB_;
}; // CharInfo
// Plain aggregate pairing a collation code with a coercibility level.
// Exists for the convenience of SqlParser.y and ItemExpr::bindNode().
struct CollationAndCoercibility {
// The collation code.
CharInfo::Collation collation_;
// The coercibility level that goes with collation_.
CharInfo::Coercibility coercibility_;
};
// CollationInfo
//
// One entry describing a collation: its CharInfo::Collation code, its
// (possibly multi-part) name, and a set of CollationFlags.
// Instances are managed by CollationDB, which is a friend so that it can
// reach the private flag setters and name bookkeeping.
class CollationInfo : public NABasicObject
{
  friend class CollationDB;  // for robust security/validity: see private methods!

public:

  enum CollationFlags
  {
    // external (governing SQL syntax and what MX supports) flags
    NO_FLAGS                  = 0,
    NO_PAD                    = 0x1,   // Ansi NO PAD (vs. PAD SPACE)
    ORDERED_CMP_ILLEGAL       = 0x10,  // MP collations in MX-NSK-R1
    EQ_NE_CMP_ILLEGAL         = 0x20,  // MP collations in MX-NSK-R1
    ALL_CMP_ILLEGAL           = EQ_NE_CMP_ILLEGAL | ORDERED_CMP_ILLEGAL,
    ALL_NEGATIVE_SYNTAX_FLAGS = 0x0FFFFFFF,

    // internal (class-implementation) flags
    NO_ALLOC_AND_COPY_IN_CTOR = 0x10000000,
    ALL_NEGATIVE_PLUS_STATIC  = ALL_NEGATIVE_SYNTAX_FLAGS | NO_ALLOC_AND_COPY_IN_CTOR
  };

  // Sizes of the name-part bookkeeping arrays.
  enum CollationMisc
  {
    MAX_NAME_PARTS   = 4,
    SIZEARRAY_SIZE   = MAX_NAME_PARTS + 1,
    OFFSETARRAY_SIZE = MAX_NAME_PARTS - 1
  };

  enum Pass
  {
    FirstPass  = 0,
    SecondPass = 1,
    ThirdPass  = 2,
    FourthPass = 3
  };

  enum CollationType
  {
    Sort    = 0, // Used for sort. When the collation type is Sort, the encoded
                 // value of a nullable column is prepended with 2 bytes
                 // indicating whether the value is null or not null.
    Compare = 1, // Used to compare two character strings. The main difference
                 // with the sort type is that the encoded value of a null value
                 // is actually a null value.
    Search  = 2  // Used to do string search.
  };

  enum SortDirection
  {
    DefaultDir = 0,
    Ascending  = 1,
    Descending = 2
  };

  // h        - heap handed to the constructor (its use is defined in the .cpp)
  // co       - collation code for this entry
  // name     - collation name
  // flags    - initial CollationFlags
  // sizArray - optional array[SIZEARRAY_SIZE]
  CollationInfo(CollHeap *h, CharInfo::Collation co, const char *name,
                CollationFlags flags = NO_FLAGS,
                size_t *sizArray = NULL);  /* array[SIZEARRAY_SIZE] */

  // Releases name_ unless the entry is flagged NO_ALLOC_AND_COPY_IN_CTOR.
  ~CollationInfo()
  {
    // The flag must be tested with '&'. OR-ing flags_ with a nonzero
    // constant is always nonzero, which made this condition always false
    // and the delete unreachable, so the name was never released.
    // NOTE(review): the constructor body is not visible from this header;
    // confirm that plain 'delete' matches the way it allocates name_.
    if (!(flags_ & NO_ALLOC_AND_COPY_IN_CTOR)) delete (char*)name_;
  }

  CharInfo::Collation getCollationEnum() const  { return co_; }
  const char *        getCollationName() const  { return name_; }
  Int32               getCollationFlags() const { return flags_; }

  // for debugging
  void display() const;

  // True when the code lies strictly between FIRST_SYS_COLLATION and
  // LAST_SYS_COLLATION.
  static NABoolean isSystemCollation(const CharInfo::Collation collation)
  {
    return (collation > CharInfo::FIRST_SYS_COLLATION && collation < CharInfo::LAST_SYS_COLLATION);
  }

  // Zero-based index of a system collation into the collationNPasses /
  // collationMaxChar tables. No range check is made here.
  static short getCollationParamsIndex(const CharInfo::Collation collation)
  {
    return (collation - CharInfo::FIRST_SYS_COLLATION - 1);
  }

  // Table lookup in collationNPasses; the index is not range-checked.
  inline static short getCollationNPasses(const CharInfo::Collation collation)
  {
    return collationNPasses[getCollationParamsIndex(collation)];
  }

  // Table lookup in collationMaxChar; the index is not range-checked.
  inline static unsigned char getCollationMaxChar(const CharInfo::Collation collation)
  {
    return collationMaxChar[getCollationParamsIndex(collation)];
  }

private:

  void setFlags(CollationFlags f) { flags_ |= f; }
  void clrFlags(CollationFlags f) { flags_ &= ~f; }

  // See ComMPLoc::getMPName(size_t *)
  // and QualifiedName::getQualifiedNameAsAnsiString(size_t *)
  // -- an MP name can have up to 4 parts (MAX_NAME_PARTS),
  // and an Ansi name up to 3.
  // Hence, here we have:
  // - one fixed string for the whole name,
  // - the length of the whole,
  // and, where the caller (CollationDB::insert methods)
  // has determined that valid synonyms exist based on current defaults,
  // - offsets into the string pointing to those synonyms
  //   (an offset value of zero means no synonym!).
  //
  // E.g., for name "\X.$Y.Z.W", if default MPLoc was "\X.$Y.DIFFSV",
  //                 012345678
  // here the namelen would of course be 9,
  // and the three offsets would be 3, 6, and 0.
  // Put another way, the string starting at name_[3] is "$Y.Z.W",
  // the string at name_[6] is "Z.W",
  // both of which are equivalent to the full name, GIVEN THE DEFAULTS.
  // The string at name_[8] is "W", which by the given defaults
  // is equivalent to the different full name "\X.$Y.DIFFSV.W";
  // hence the third offset is 0 in this example.
  //
  CharInfo::Collation co_;
  Int32               flags_;
  const char*         name_;
  size_t              namelen_;
  size_t              synonymOffset_[OFFSETARRAY_SIZE];

}; // CollationInfo
// List of pointers to const CollationInfo entries; CollationDB derives
// privately from this type.
typedef LIST(const CollationInfo *) CollationDBSupertype;
// List of pointers to CollationDB objects.
typedef LIST(CollationDB *) CollationDBList;
class CollationDB : private CollationDBSupertype
{
public:
CollationDB(CollHeap *h);
CollationDB(CollHeap *h, const CollationInfo *co, size_t count);
~CollationDB();
void clearAndReset()
{
for (CollIndex i = entries(); i--; ) {
delete at(i);
at(i) = NULL; // (just in case)
}
clear();
}
// for debugging only
void display() const;
static void Display();
NABoolean &refreshNeeded() { return refreshNeeded_; }
// The insert methods assume you've already done an unsuccessful lookup.
//
// They return a newly generated CharInfo::Collation value for the
// user-defined collation just inserted.
CharInfo::Collation insert(ComMPLoc &loc,
const ComMPLoc *defaultMPLoc,
CollationInfo::CollationFlags
flags=CollationInfo::NO_FLAGS);
CharInfo::Collation insert(QualifiedName &qn,
const SchemaName *defaultSchema,
CollationInfo::CollationFlags
flags=CollationInfo::NO_FLAGS);
private:
friend class CharInfo; // its static funx should be the only callers of these:
CharInfo::Collation insert(const char *nam,
size_t *sizArray, /* array[SIZEARRAY_SIZE] */
CollationInfo::CollationFlags flags,
Int32 defaultMatchCount);
inline
CollationDB * nextCDB() const;
const CollationInfo * getCollationInfo(CharInfo::Collation co) const;
CharInfo::Collation getCollationEnum(const char *name,
NABoolean formatNSK,
size_t namlen) const;
const char * getCollationName(CharInfo::Collation co,
NABoolean retUnkAsBlank) const;
Int32 getCollationFlags(CharInfo::Collation co) const;
// data members
CollHeap * heap_;
NABoolean refreshNeeded_;
static Lng32 nextUserCo_;
}; // CollationDB
#endif //CHARINFO_H