blob: 9c9880b2ae313d5b61a8c26a5326fc0b311ddd00 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#ifndef GEODE_UTIL_JAVAMODIFIEDUTF8_H_
#define GEODE_UTIL_JAVAMODIFIEDUTF8_H_
#include <codecvt>
#include <locale>
#include <string>
#include "string.hpp"
namespace apache {
namespace geode {
namespace client {
namespace internal {
struct JavaModifiedUtf8 {
/**
* Calculate the length of the given UTF-8 string when encoded in Java
* Modified UTF-8.
*/
inline static size_t encodedLength(const std::string& utf8) {
if (utf8.empty()) {
return 0;
}
// TODO string optimize for direct calculation
return encodedLength(to_utf16(utf8));
}
/**
* Calculate the length of the given UTF-16 string when encoded in Java
* Modified UTF-8.
*/
inline static size_t encodedLength(const std::u16string& utf16) {
return encodedLength(utf16.data(), utf16.length());
}
inline static size_t encodedLength(const char16_t* data, size_t length) {
size_t encodedLen = 0;
while (length-- > 0) {
const char16_t c = *(data++);
if (c == 0) {
// NUL
encodedLen += 2;
} else if (c < 0x80) {
// ASCII
encodedLen++;
} else if (c < 0x800) {
encodedLen += 2;
} else {
encodedLen += 3;
}
}
return encodedLen;
}
/**
* Converts given UTF-8 string to Java Modified UTF-8 string.
*/
inline static std::string fromString(const std::string& utf8) {
return fromString(to_utf16(utf8));
}
/**
* Converts given UTF-16 string to Java Modified UTF-8 string.
*/
inline static std::string fromString(const std::u16string& utf16) {
std::string jmutf8;
jmutf8.reserve(utf16.length());
for (auto&& c : utf16) {
encode(c, jmutf8);
}
return jmutf8;
}
/**
* Converts a single UTF-16 code unit into Java Modified UTF-8 code units.
*/
inline static void encode(const char16_t c, std::string& jmutf8) {
if (c == 0) {
// NUL
jmutf8 += static_cast<uint8_t>(0xc0);
jmutf8 += static_cast<uint8_t>(0x80);
} else if (c < 0x80) {
// ASCII character
jmutf8 += static_cast<uint8_t>(c);
} else if (c < 0x800) {
jmutf8 += static_cast<uint8_t>(0xC0 | c >> 6);
jmutf8 += static_cast<uint8_t>(0x80 | (c & 0x3F));
} else {
jmutf8 += static_cast<uint8_t>(0xE0 | c >> 12);
jmutf8 += static_cast<uint8_t>(0x80 | ((c >> 6) & 0x3F));
jmutf8 += static_cast<uint8_t>(0x80 | (c & 0x3F));
}
}
inline static std::u16string decode(const char* buf, uint16_t len) {
std::u16string value;
const auto end = buf + len;
while (buf < end) {
value += decodeJavaModifiedUtf8Char(&buf);
}
return value;
}
inline static char16_t decodeJavaModifiedUtf8Char(const char** pbuf) {
char16_t c;
// get next byte unsigned
int32_t b = **pbuf & 0xff;
(*pbuf)++;
int32_t k = b >> 5;
// classify based on the high order 3 bits
switch (k) {
case 6: {
// two byte encoding
// 110yyyyy 10xxxxxx
// use low order 6 bits
int32_t y = b & 0x1f;
// use low order 6 bits of the next byte
// It should have high order bits 10, which we don't check.
int32_t x = **pbuf & 0x3f;
(*pbuf)++;
// 00000yyy yyxxxxxx
c = (y << 6 | x);
break;
}
case 7: {
// three byte encoding
// 1110zzzz 10yyyyyy 10xxxxxx
// use low order 4 bits
int32_t z = b & 0x0f;
// use low order 6 bits of the next byte
// It should have high order bits 10, which we don't check.
int32_t y = **pbuf & 0x3f;
(*pbuf)++;
// use low order 6 bits of the next byte
// It should have high order bits 10, which we don't check.
int32_t x = **pbuf & 0x3f;
(*pbuf)++;
// zzzzyyyy yyxxxxxx
c = static_cast<char16_t>(z << 12 | y << 6 | x);
break;
}
default:
// one byte encoding
// 0xxxxxxx
// use just low order 7 bits
// 00000000 0xxxxxxx
c = static_cast<char16_t>(b & 0x7f);
break;
}
return c;
}
};
} // namespace internal
} // namespace client
} // namespace geode
} // namespace apache
#endif // GEODE_UTIL_JAVAMODIFIEDUTF8_H_