blob: 447c0b0df32407ee2be76c232d9a40f036ba2d26 [file] [log] [blame]
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* https://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef avro_json_JsonIO_hh__
#define avro_json_JsonIO_hh__
#include <boost/lexical_cast.hpp>
#include <boost/math/special_functions/fpclassify.hpp>
#include <boost/utility.hpp>
#include <locale>
#include <sstream>
#include <stack>
#include <string>
#include "Config.hh"
#include "Stream.hh"
namespace avro {
namespace json {
inline char toHex(unsigned int n) {
return (n < 10) ? (n + '0') : (n + 'a' - 10);
}
class AVRO_DECL JsonParser : boost::noncopyable {
public:
enum class Token {
Null,
Bool,
Long,
Double,
String,
ArrayStart,
ArrayEnd,
ObjectStart,
ObjectEnd
};
size_t line() const { return line_; }
private:
enum State {
stValue, // Expect a data type
stArray0, // Expect a data type or ']'
stArrayN, // Expect a ',' or ']'
stObject0, // Expect a string or a '}'
stObjectN, // Expect a ',' or '}'
stKey // Expect a ':'
};
std::stack<State> stateStack;
State curState;
bool hasNext;
char nextChar;
bool peeked;
StreamReader in_;
Token curToken;
bool bv;
int64_t lv;
double dv;
std::string sv;
size_t line_;
Token doAdvance();
Token tryLiteral(const char exp[], size_t n, Token tk);
Token tryNumber(char ch);
Token tryString();
static Exception unexpected(unsigned char ch);
char next();
static std::string decodeString(const std::string &s, bool binary);
public:
JsonParser() : curState(stValue), hasNext(false), nextChar(0), peeked(false),
curToken(Token::Null), bv(false), lv(0), dv(0), line_(1) {}
void init(InputStream &is) {
// Clear by swapping with an empty stack
std::stack<State>().swap(stateStack);
curState = stValue;
hasNext = false;
peeked = false;
line_ = 1;
in_.reset(is);
}
Token advance() {
if (!peeked) {
curToken = doAdvance();
} else {
peeked = false;
}
return curToken;
}
Token peek() {
if (!peeked) {
curToken = doAdvance();
peeked = true;
}
return curToken;
}
void expectToken(Token tk);
bool boolValue() const {
return bv;
}
Token cur() const {
return curToken;
}
double doubleValue() const {
return dv;
}
int64_t longValue() const {
return lv;
}
const std::string &rawString() const {
return sv;
}
std::string stringValue() const {
return decodeString(sv, false);
}
std::string bytesValue() const {
return decodeString(sv, true);
}
void drain() {
if (!stateStack.empty() || peeked) {
throw Exception("Invalid state for draining");
}
in_.drain(hasNext);
hasNext = false;
}
/**
* Return UTF-8 encoded string value.
*/
static std::string toStringValue(const std::string &sv) {
return decodeString(sv, false);
}
/**
* Return byte-encoded string value. It is an error if the input
* JSON string contained unicode characters more than "\u00ff'.
*/
static std::string toBytesValue(const std::string &sv) {
return decodeString(sv, true);
}
static const char *const tokenNames[];
static const char *toString(Token tk) {
return tokenNames[static_cast<size_t>(tk)];
}
};
class AVRO_DECL JsonNullFormatter {
public:
explicit JsonNullFormatter(StreamWriter &) {}
void handleObjectStart() {}
void handleObjectEnd() {}
void handleValueEnd() {}
void handleColon() {}
};
class AVRO_DECL JsonPrettyFormatter {
StreamWriter &out_;
size_t level_;
std::vector<uint8_t> indent_;
static const int CHARS_PER_LEVEL = 2;
void printIndent() {
size_t charsToIndent = level_ * CHARS_PER_LEVEL;
if (indent_.size() < charsToIndent) {
indent_.resize(charsToIndent * 2, ' ');
}
out_.writeBytes(indent_.data(), charsToIndent);
}
public:
explicit JsonPrettyFormatter(StreamWriter &out) : out_(out), level_(0), indent_(10, ' ') {}
void handleObjectStart() {
out_.write('\n');
++level_;
printIndent();
}
void handleObjectEnd() {
out_.write('\n');
--level_;
printIndent();
}
void handleValueEnd() {
out_.write('\n');
printIndent();
}
void handleColon() {
out_.write(' ');
}
};
template<class F>
class AVRO_DECL JsonGenerator {
StreamWriter out_;
F formatter_;
enum State {
stStart,
stArray0,
stArrayN,
stMap0,
stMapN,
stKey,
};
std::stack<State> stateStack;
State top;
void write(const char *b, const char *p) {
if (b != p) {
out_.writeBytes(reinterpret_cast<const uint8_t *>(b), p - b);
}
}
void escape(char c, const char *b, const char *p) {
write(b, p);
out_.write('\\');
out_.write(c);
}
void escapeCtl(char c) {
escapeUnicode(static_cast<uint8_t>(c));
}
void writeHex(char c) {
out_.write(toHex((static_cast<unsigned char>(c)) / 16));
out_.write(toHex((static_cast<unsigned char>(c)) % 16));
}
void escapeUnicode16(uint32_t c) {
out_.write('\\');
out_.write('u');
writeHex((c >> 8) & 0xff);
writeHex(c & 0xff);
}
void escapeUnicode(uint32_t c) {
if (c < 0x10000) {
escapeUnicode16(c);
} else if (c < 0x110000) {
c -= 0x10000;
escapeUnicode16(((c >> 10) & 0x3ff) | 0xd800);
escapeUnicode16((c & 0x3ff) | 0xdc00);
} else {
throw Exception(boost::format("Invalid code-point: %1%") % c);
}
}
void doEncodeString(const char *b, size_t len, bool binary) {
const char *e = b + len;
out_.write('"');
for (const char *p = b; p != e; p++) {
if ((*p & 0x80) != 0) {
write(b, p);
if (binary) {
escapeCtl(*p);
} else if ((*p & 0x40) == 0) {
throw Exception("Invalid UTF-8 sequence");
} else {
int more = 1;
uint32_t value;
if ((*p & 0x20) != 0) {
more++;
if ((*p & 0x10) != 0) {
more++;
if ((*p & 0x08) != 0) {
throw Exception("Invalid UTF-8 sequence");
} else {
value = *p & 0x07;
}
} else {
value = *p & 0x0f;
}
} else {
value = *p & 0x1f;
}
for (int i = 0; i < more; ++i) {
if (++p == e || (*p & 0xc0) != 0x80) {
throw Exception("Invalid UTF-8 sequence");
}
value <<= 6;
value |= *p & 0x3f;
}
escapeUnicode(value);
}
} else {
switch (*p) {
case '\\':
case '"':
case '/':
escape(*p, b, p);
break;
case '\b':
escape('b', b, p);
break;
case '\f':
escape('f', b, p);
break;
case '\n':
escape('n', b, p);
break;
case '\r':
escape('r', b, p);
break;
case '\t':
escape('t', b, p);
break;
default:
if (std::iscntrl(*p, std::locale::classic())) {
write(b, p);
escapeCtl(*p);
break;
} else {
continue;
}
}
}
b = p + 1;
}
write(b, e);
out_.write('"');
}
void sep() {
if (top == stArrayN) {
out_.write(',');
formatter_.handleValueEnd();
} else if (top == stArray0) {
top = stArrayN;
}
}
void sep2() {
if (top == stKey) {
top = stMapN;
}
}
public:
JsonGenerator() : formatter_(out_), top(stStart) {}
void init(OutputStream &os) {
out_.reset(os);
}
void flush() {
out_.flush();
}
int64_t byteCount() const {
return out_.byteCount();
}
void encodeNull() {
sep();
out_.writeBytes(reinterpret_cast<const uint8_t *>("null"), 4);
sep2();
}
void encodeBool(bool b) {
sep();
if (b) {
out_.writeBytes(reinterpret_cast<const uint8_t *>("true"), 4);
} else {
out_.writeBytes(reinterpret_cast<const uint8_t *>("false"), 5);
}
sep2();
}
template<typename T>
void encodeNumber(T t) {
sep();
std::ostringstream oss;
oss << boost::lexical_cast<std::string>(t);
const std::string s = oss.str();
out_.writeBytes(reinterpret_cast<const uint8_t *>(s.data()), s.size());
sep2();
}
void encodeNumber(double t) {
sep();
std::ostringstream oss;
if (boost::math::isfinite(t)) {
oss << boost::lexical_cast<std::string>(t);
} else if (boost::math::isnan(t)) {
oss << "NaN";
} else if (t == std::numeric_limits<double>::infinity()) {
oss << "Infinity";
} else {
oss << "-Infinity";
}
const std::string s = oss.str();
out_.writeBytes(reinterpret_cast<const uint8_t *>(s.data()), s.size());
sep2();
}
void encodeString(const std::string &s) {
if (top == stMap0) {
top = stKey;
} else if (top == stMapN) {
out_.write(',');
formatter_.handleValueEnd();
top = stKey;
} else if (top == stKey) {
top = stMapN;
} else {
sep();
}
doEncodeString(s.c_str(), s.size(), false);
if (top == stKey) {
out_.write(':');
formatter_.handleColon();
}
}
void encodeBinary(const uint8_t *bytes, size_t len) {
sep();
doEncodeString(reinterpret_cast<const char *>(bytes), len, true);
sep2();
}
void arrayStart() {
sep();
stateStack.push(top);
top = stArray0;
out_.write('[');
formatter_.handleObjectStart();
}
void arrayEnd() {
top = stateStack.top();
stateStack.pop();
formatter_.handleObjectEnd();
out_.write(']');
sep2();
}
void objectStart() {
sep();
stateStack.push(top);
top = stMap0;
out_.write('{');
formatter_.handleObjectStart();
}
void objectEnd() {
top = stateStack.top();
stateStack.pop();
formatter_.handleObjectEnd();
out_.write('}');
sep2();
}
};
} // namespace json
} // namespace avro
#endif