blob: 3b013cfa7d36b4e0f1e17c8d629b59dfaec238b9 [file] [log] [blame]
#include <iostream>
#include "stream.h"
#ifndef YAML_PREFETCH_SIZE
#define YAML_PREFETCH_SIZE 2048
#endif
#define S_ARRAY_SIZE(A) (sizeof(A) / sizeof(*(A)))
#define S_ARRAY_END(A) ((A) + S_ARRAY_SIZE(A))
#define CP_REPLACEMENT_CHARACTER (0xFFFD)
namespace YAML {
enum UtfIntroState {
uis_start,
uis_utfbe_b1,
uis_utf32be_b2,
uis_utf32be_bom3,
uis_utf32be,
uis_utf16be,
uis_utf16be_bom1,
uis_utfle_bom1,
uis_utf16le_bom2,
uis_utf32le_bom3,
uis_utf16le,
uis_utf32le,
uis_utf8_imp,
uis_utf16le_imp,
uis_utf32le_imp3,
uis_utf8_bom1,
uis_utf8_bom2,
uis_utf8,
uis_error
};
enum UtfIntroCharType {
uict00,
uictBB,
uictBF,
uictEF,
uictFE,
uictFF,
uictAscii,
uictOther,
uictMax
};
static bool s_introFinalState[] = {
false, // uis_start
false, // uis_utfbe_b1
false, // uis_utf32be_b2
false, // uis_utf32be_bom3
true, // uis_utf32be
true, // uis_utf16be
false, // uis_utf16be_bom1
false, // uis_utfle_bom1
false, // uis_utf16le_bom2
false, // uis_utf32le_bom3
true, // uis_utf16le
true, // uis_utf32le
false, // uis_utf8_imp
false, // uis_utf16le_imp
false, // uis_utf32le_imp3
false, // uis_utf8_bom1
false, // uis_utf8_bom2
true, // uis_utf8
true, // uis_error
};
static UtfIntroState s_introTransitions[][uictMax] = {
// uict00, uictBB, uictBF, uictEF,
// uictFE, uictFF, uictAscii, uictOther
{uis_utfbe_b1, uis_utf8, uis_utf8, uis_utf8_bom1, uis_utf16be_bom1,
uis_utfle_bom1, uis_utf8_imp, uis_utf8},
{uis_utf32be_b2, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8,
uis_utf16be, uis_utf8},
{uis_utf32be, uis_utf8, uis_utf8, uis_utf8, uis_utf32be_bom3, uis_utf8,
uis_utf8, uis_utf8},
{uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf32be, uis_utf8,
uis_utf8},
{uis_utf32be, uis_utf32be, uis_utf32be, uis_utf32be, uis_utf32be,
uis_utf32be, uis_utf32be, uis_utf32be},
{uis_utf16be, uis_utf16be, uis_utf16be, uis_utf16be, uis_utf16be,
uis_utf16be, uis_utf16be, uis_utf16be},
{uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf16be, uis_utf8,
uis_utf8},
{uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf16le_bom2, uis_utf8,
uis_utf8, uis_utf8},
{uis_utf32le_bom3, uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le,
uis_utf16le, uis_utf16le, uis_utf16le},
{uis_utf32le, uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le,
uis_utf16le, uis_utf16le, uis_utf16le},
{uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le,
uis_utf16le, uis_utf16le, uis_utf16le},
{uis_utf32le, uis_utf32le, uis_utf32le, uis_utf32le, uis_utf32le,
uis_utf32le, uis_utf32le, uis_utf32le},
{uis_utf16le_imp, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8,
uis_utf8, uis_utf8},
{uis_utf32le_imp3, uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le,
uis_utf16le, uis_utf16le, uis_utf16le},
{uis_utf32le, uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le,
uis_utf16le, uis_utf16le, uis_utf16le},
{uis_utf8, uis_utf8_bom2, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8,
uis_utf8},
{uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8,
uis_utf8},
{uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8,
uis_utf8},
};
static char s_introUngetCount[][uictMax] = {
// uict00, uictBB, uictBF, uictEF, uictFE, uictFF, uictAscii, uictOther
{0, 1, 1, 0, 0, 0, 0, 1},
{0, 2, 2, 2, 2, 2, 2, 2},
{3, 3, 3, 3, 0, 3, 3, 3},
{4, 4, 4, 4, 4, 0, 4, 4},
{1, 1, 1, 1, 1, 1, 1, 1},
{1, 1, 1, 1, 1, 1, 1, 1},
{2, 2, 2, 2, 2, 0, 2, 2},
{2, 2, 2, 2, 0, 2, 2, 2},
{0, 1, 1, 1, 1, 1, 1, 1},
{0, 2, 2, 2, 2, 2, 2, 2},
{1, 1, 1, 1, 1, 1, 1, 1},
{1, 1, 1, 1, 1, 1, 1, 1},
{0, 2, 2, 2, 2, 2, 2, 2},
{0, 3, 3, 3, 3, 3, 3, 3},
{4, 4, 4, 4, 4, 4, 4, 4},
{2, 0, 2, 2, 2, 2, 2, 2},
{3, 3, 0, 3, 3, 3, 3, 3},
{1, 1, 1, 1, 1, 1, 1, 1},
};
inline UtfIntroCharType IntroCharTypeOf(std::istream::int_type ch) {
if (std::istream::traits_type::eof() == ch) {
return uictOther;
}
switch (ch) {
case 0:
return uict00;
case 0xBB:
return uictBB;
case 0xBF:
return uictBF;
case 0xEF:
return uictEF;
case 0xFE:
return uictFE;
case 0xFF:
return uictFF;
}
if ((ch > 0) && (ch < 0xFF)) {
return uictAscii;
}
return uictOther;
}
inline char Utf8Adjust(unsigned long ch, unsigned char lead_bits,
unsigned char rshift) {
const unsigned char header = ((1 << lead_bits) - 1) << (8 - lead_bits);
const unsigned char mask = (0xFF >> (lead_bits + 1));
return static_cast<char>(
static_cast<unsigned char>(header | ((ch >> rshift) & mask)));
}
inline void QueueUnicodeCodepoint(std::deque<char>& q, unsigned long ch) {
// We are not allowed to queue the Stream::eof() codepoint, so
// replace it with CP_REPLACEMENT_CHARACTER
if (static_cast<unsigned long>(Stream::eof()) == ch) {
ch = CP_REPLACEMENT_CHARACTER;
}
if (ch < 0x80) {
q.push_back(Utf8Adjust(ch, 0, 0));
} else if (ch < 0x800) {
q.push_back(Utf8Adjust(ch, 2, 6));
q.push_back(Utf8Adjust(ch, 1, 0));
} else if (ch < 0x10000) {
q.push_back(Utf8Adjust(ch, 3, 12));
q.push_back(Utf8Adjust(ch, 1, 6));
q.push_back(Utf8Adjust(ch, 1, 0));
} else {
q.push_back(Utf8Adjust(ch, 4, 18));
q.push_back(Utf8Adjust(ch, 1, 12));
q.push_back(Utf8Adjust(ch, 1, 6));
q.push_back(Utf8Adjust(ch, 1, 0));
}
}
Stream::Stream(std::istream& input)
: m_input(input),
m_pPrefetched(new unsigned char[YAML_PREFETCH_SIZE]),
m_nPrefetchedAvailable(0),
m_nPrefetchedUsed(0) {
typedef std::istream::traits_type char_traits;
if (!input)
return;
// Determine (or guess) the character-set by reading the BOM, if any. See
// the YAML specification for the determination algorithm.
char_traits::int_type intro[4];
int nIntroUsed = 0;
UtfIntroState state = uis_start;
for (; !s_introFinalState[state];) {
std::istream::int_type ch = input.get();
intro[nIntroUsed++] = ch;
UtfIntroCharType charType = IntroCharTypeOf(ch);
UtfIntroState newState = s_introTransitions[state][charType];
int nUngets = s_introUngetCount[state][charType];
if (nUngets > 0) {
input.clear();
for (; nUngets > 0; --nUngets) {
if (char_traits::eof() != intro[--nIntroUsed])
input.putback(char_traits::to_char_type(intro[nIntroUsed]));
}
}
state = newState;
}
switch (state) {
case uis_utf8:
m_charSet = utf8;
break;
case uis_utf16le:
m_charSet = utf16le;
break;
case uis_utf16be:
m_charSet = utf16be;
break;
case uis_utf32le:
m_charSet = utf32le;
break;
case uis_utf32be:
m_charSet = utf32be;
break;
default:
m_charSet = utf8;
break;
}
ReadAheadTo(0);
}
Stream::~Stream() { delete[] m_pPrefetched; }
char Stream::peek() const {
if (m_readahead.empty()) {
return Stream::eof();
}
return m_readahead[0];
}
Stream::operator bool() const {
return m_input.good() ||
(!m_readahead.empty() && m_readahead[0] != Stream::eof());
}
// get
// . Extracts a character from the stream and updates our position
char Stream::get() {
char ch = peek();
AdvanceCurrent();
m_mark.column++;
if (ch == '\n') {
m_mark.column = 0;
m_mark.line++;
}
return ch;
}
// get
// . Extracts 'n' characters from the stream and updates our position
std::string Stream::get(int n) {
std::string ret;
ret.reserve(n);
for (int i = 0; i < n; i++)
ret += get();
return ret;
}
// eat
// . Eats 'n' characters and updates our position.
void Stream::eat(int n) {
for (int i = 0; i < n; i++)
get();
}
void Stream::AdvanceCurrent() {
if (!m_readahead.empty()) {
m_readahead.pop_front();
m_mark.pos++;
}
ReadAheadTo(0);
}
bool Stream::_ReadAheadTo(size_t i) const {
while (m_input.good() && (m_readahead.size() <= i)) {
switch (m_charSet) {
case utf8:
StreamInUtf8();
break;
case utf16le:
StreamInUtf16();
break;
case utf16be:
StreamInUtf16();
break;
case utf32le:
StreamInUtf32();
break;
case utf32be:
StreamInUtf32();
break;
}
}
// signal end of stream
if (!m_input.good())
m_readahead.push_back(Stream::eof());
return m_readahead.size() > i;
}
void Stream::StreamInUtf8() const {
unsigned char b = GetNextByte();
if (m_input.good()) {
m_readahead.push_back(b);
}
}
void Stream::StreamInUtf16() const {
unsigned long ch = 0;
unsigned char bytes[2];
int nBigEnd = (m_charSet == utf16be) ? 0 : 1;
bytes[0] = GetNextByte();
bytes[1] = GetNextByte();
if (!m_input.good()) {
return;
}
ch = (static_cast<unsigned long>(bytes[nBigEnd]) << 8) |
static_cast<unsigned long>(bytes[1 ^ nBigEnd]);
if (ch >= 0xDC00 && ch < 0xE000) {
// Trailing (low) surrogate...ugh, wrong order
QueueUnicodeCodepoint(m_readahead, CP_REPLACEMENT_CHARACTER);
return;
} else if (ch >= 0xD800 && ch < 0xDC00) {
// ch is a leading (high) surrogate
// Four byte UTF-8 code point
// Read the trailing (low) surrogate
for (;;) {
bytes[0] = GetNextByte();
bytes[1] = GetNextByte();
if (!m_input.good()) {
QueueUnicodeCodepoint(m_readahead, CP_REPLACEMENT_CHARACTER);
return;
}
unsigned long chLow = (static_cast<unsigned long>(bytes[nBigEnd]) << 8) |
static_cast<unsigned long>(bytes[1 ^ nBigEnd]);
if (chLow < 0xDC00 || chLow >= 0xE000) {
// Trouble...not a low surrogate. Dump a REPLACEMENT CHARACTER into the
// stream.
QueueUnicodeCodepoint(m_readahead, CP_REPLACEMENT_CHARACTER);
// Deal with the next UTF-16 unit
if (chLow < 0xD800 || chLow >= 0xE000) {
// Easiest case: queue the codepoint and return
QueueUnicodeCodepoint(m_readahead, ch);
return;
} else {
// Start the loop over with the new high surrogate
ch = chLow;
continue;
}
}
// Select the payload bits from the high surrogate
ch &= 0x3FF;
ch <<= 10;
// Include bits from low surrogate
ch |= (chLow & 0x3FF);
// Add the surrogacy offset
ch += 0x10000;
break;
}
}
QueueUnicodeCodepoint(m_readahead, ch);
}
inline char* ReadBuffer(unsigned char* pBuffer) {
return reinterpret_cast<char*>(pBuffer);
}
unsigned char Stream::GetNextByte() const {
if (m_nPrefetchedUsed >= m_nPrefetchedAvailable) {
std::streambuf* pBuf = m_input.rdbuf();
m_nPrefetchedAvailable = static_cast<std::size_t>(
pBuf->sgetn(ReadBuffer(m_pPrefetched), YAML_PREFETCH_SIZE));
m_nPrefetchedUsed = 0;
if (!m_nPrefetchedAvailable) {
m_input.setstate(std::ios_base::eofbit);
}
if (0 == m_nPrefetchedAvailable) {
return 0;
}
}
return m_pPrefetched[m_nPrefetchedUsed++];
}
void Stream::StreamInUtf32() const {
static int indexes[2][4] = {{3, 2, 1, 0}, {0, 1, 2, 3}};
unsigned long ch = 0;
unsigned char bytes[4];
int* pIndexes = (m_charSet == utf32be) ? indexes[1] : indexes[0];
bytes[0] = GetNextByte();
bytes[1] = GetNextByte();
bytes[2] = GetNextByte();
bytes[3] = GetNextByte();
if (!m_input.good()) {
return;
}
for (int i = 0; i < 4; ++i) {
ch <<= 8;
ch |= bytes[pIndexes[i]];
}
QueueUnicodeCodepoint(m_readahead, ch);
}
}