blob: ac17badf851bdce8b095735195b63883e079233f [file] [log] [blame]
/* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#define C_CFISH_STRING
#define C_CFISH_STRINGITERATOR
#define CFISH_USE_SHORT_NAMES
#include <string.h>
#include <stdio.h>
#include <stdlib.h>
#include <ctype.h>
#include "Clownfish/Class.h"
#include "Clownfish/String.h"
#include "Clownfish/ByteBuf.h"
#include "Clownfish/CharBuf.h"
#include "Clownfish/Err.h"
#include "Clownfish/Util/Memory.h"
#include "Clownfish/Util/StringHelper.h"
// Create a StringIterator in storage obtained from alloca() and initialize
// it with S_new_stack_iter().  Because the storage comes from alloca(), the
// iterator is only usable until the function that expanded the macro
// returns.
#define STACK_ITER(string, byte_offset) \
S_new_stack_iter(alloca(sizeof(StringIterator)), string, byte_offset)
// Helper function for throwing invalid UTF-8 error. Since THROW uses
// a String internally, calling THROW with invalid UTF-8 would create an
// infinite loop -- so we fwrite some of the bogus text to stderr and
// invoke THROW with a generic message.
#define DIE_INVALID_UTF8(text, size) \
S_die_invalid_utf8(text, size, __FILE__, __LINE__, CFISH_ERR_FUNC_MACRO)
// Dump (part of) the offending bytes to stderr, then throw a generic
// "Invalid UTF-8" error attributed to `file`, `line` and `func`.
static void
S_die_invalid_utf8(const char *text, size_t size, const char *file, int line,
const char *func);
// Locate the first occurrence of the `size` bytes at `substring` inside the
// buffer of `self`.  Returns a pointer into that buffer, or NULL when there
// is no match.
static const char*
S_memmem(String *self, const char *substring, size_t size);
// Initialize a StringIterator inside caller-supplied storage without taking
// a reference on `string`.
static StringIterator*
S_new_stack_iter(void *allocation, String *string, size_t byte_offset);
// Create a String that owns a private copy of the `size` bytes at `utf8`.
// The bytes are checked with StrHelp_utf8_valid() first; invalid input is
// reported through DIE_INVALID_UTF8.
String*
Str_new_from_utf8(const char *utf8, size_t size) {
    const bool is_valid = StrHelp_utf8_valid(utf8, size);
    if (!is_valid) {
        DIE_INVALID_UTF8(utf8, size);
    }

    String *str = (String*)Class_Make_Obj(STRING);
    return Str_init_from_trusted_utf8(str, utf8, size);
}
// Create a String that owns a private copy of the `size` bytes at `utf8`.
// No UTF-8 validation is performed here.
String*
Str_new_from_trusted_utf8(const char *utf8, size_t size) {
    String *str = (String*)Class_Make_Obj(STRING);
    str = Str_init_from_trusted_utf8(str, utf8, size);
    return str;
}
// Initialize `self` with a freshly allocated, NUL-terminated copy of the
// `size` bytes at `utf8`.  No validation is performed.  Returns `self`.
String*
Str_init_from_trusted_utf8(String *self, const char *utf8, size_t size) {
    // One extra byte so the private copy can be NUL-terminated.
    char *copy = (char*)MALLOCATE(size + 1);
    memcpy(copy, utf8, size);
    copy[size] = '\0';

    // origin == self is what Str_Destroy_IMP checks before freeing the
    // buffer.
    self->ptr    = copy;
    self->size   = size;
    self->origin = self;

    return self;
}
// Create a String that adopts the buffer `utf8` (no copy is made) after
// checking that its `size` bytes are valid UTF-8.  Invalid input is reported
// through DIE_INVALID_UTF8.
String*
Str_new_steal_utf8(char *utf8, size_t size) {
    const bool is_valid = StrHelp_utf8_valid(utf8, size);
    if (!is_valid) {
        DIE_INVALID_UTF8(utf8, size);
    }

    String *str = (String*)Class_Make_Obj(STRING);
    return Str_init_steal_trusted_utf8(str, utf8, size);
}
// Create a String that adopts the buffer `utf8` without copying or
// validating it.
String*
Str_new_steal_trusted_utf8(char *utf8, size_t size) {
    String *str = (String*)Class_Make_Obj(STRING);
    str = Str_init_steal_trusted_utf8(str, utf8, size);
    return str;
}
// Initialize `self` so that it refers directly to the buffer `utf8` of
// `size` bytes.  Setting origin to `self` makes Str_Destroy_IMP free the
// buffer later.  Returns `self`.
String*
Str_init_steal_trusted_utf8(String *self, char *utf8, size_t size) {
    self->origin = self;
    self->size   = size;
    self->ptr    = utf8;
    return self;
}
// Create a String that merely points at the external buffer `utf8` (no
// copy, no ownership) after checking that its `size` bytes are valid UTF-8.
// Invalid input is reported through DIE_INVALID_UTF8.
String*
Str_new_wrap_utf8(const char *utf8, size_t size) {
    const bool is_valid = StrHelp_utf8_valid(utf8, size);
    if (!is_valid) {
        DIE_INVALID_UTF8(utf8, size);
    }

    String *str = (String*)Class_Make_Obj(STRING);
    return Str_init_wrap_trusted_utf8(str, utf8, size);
}
// Create a String that points at the external buffer `utf8` without copying
// or validating it.
String*
Str_new_wrap_trusted_utf8(const char *utf8, size_t size) {
    String *str = (String*)Class_Make_Obj(STRING);
    str = Str_init_wrap_trusted_utf8(str, utf8, size);
    return str;
}
// Set up a String object inside the caller-provided `allocation` and make it
// wrap the external buffer `utf8` of `size` bytes (no copy, no validation).
String*
Str_init_stack_string(void *allocation, const char *utf8, size_t size) {
    String *str = (String*)Class_Init_Obj(STRING, allocation);
    str = Str_init_wrap_trusted_utf8(str, utf8, size);
    return str;
}
// Initialize `self` as a wrapper around the external buffer `ptr` of `size`
// bytes.  A NULL origin marks the string as not owning its buffer.  Returns
// `self`.
String*
Str_init_wrap_trusted_utf8(String *self, const char *ptr, size_t size) {
    self->origin = NULL;
    self->size   = size;
    self->ptr    = ptr;
    return self;
}
// Create a String consisting of the UTF-8 encoding of a single code point,
// as produced by StrHelp_encode_utf8_char().
String*
Str_new_from_char(int32_t code_point) {
    // Room for the longest encoding this function reserves (4 bytes) plus
    // the terminating NUL.
    enum { MAX_UTF8_BYTES = 4 };
    char *encoded = (char*)MALLOCATE(MAX_UTF8_BYTES + 1);

    const size_t encoded_len
        = StrHelp_encode_utf8_char(code_point, (uint8_t*)encoded);
    encoded[encoded_len] = '\0';

    // The new string owns the buffer (origin == self).
    String *str = (String*)Class_Make_Obj(STRING);
    str->ptr    = encoded;
    str->size   = encoded_len;
    str->origin = str;
    return str;
}
// Build a String from a format pattern and variadic arguments.  The
// formatting itself is delegated to CB_VCatF() on a temporary CharBuf.
String*
Str_newf(const char *pattern, ...) {
    // The pattern length is passed to CB_new(); presumably an initial
    // capacity -- confirm against CharBuf.
    CharBuf *scratch = CB_new(strlen(pattern));

    va_list ap;
    va_start(ap, pattern);
    CB_VCatF(scratch, pattern, ap);
    va_end(ap);

    String *result = CB_Yield_String(scratch);
    DECREF(scratch);
    return result;
}
// Create a String covering `size` bytes of `string` starting at
// `byte_offset`.  If the source is a wrapper (origin == NULL) the bytes are
// copied; otherwise the new string shares the buffer and holds a reference
// on the source's origin.
static String*
S_new_substring(String *string, size_t byte_offset, size_t size) {
    String *sub = (String*)Class_Make_Obj(STRING);
    const char *start = string->ptr + byte_offset;

    if (string->origin != NULL) {
        // Share the buffer; the reference on origin is released again in
        // Str_Destroy_IMP.
        sub->ptr    = start;
        sub->size   = size;
        sub->origin = (String*)INCREF(string->origin);
        return sub;
    }

    // Copy substring of wrapped strings.
    Str_init_from_trusted_utf8(sub, start, size);
    return sub;
}
// Report whether `self` is a wrapper string, i.e. one whose origin is NULL.
bool
Str_Is_Copy_On_IncRef_IMP(String *self) {
    const bool is_wrapper = (NULL == self->origin);
    return is_wrapper;
}
// Release the resources held by `self`: free the buffer when the string is
// its own origin, otherwise DECREF the origin; then chain to the parent
// destructor.
void
Str_Destroy_IMP(String *self) {
    const bool owns_buffer = (self->origin == self);
    if (!owns_buffer) {
        DECREF(self->origin);
    }
    else {
        FREEMEM((char*)self->ptr);
    }
    SUPER_DESTROY(self, STRING);
}
// Compute a hash over the code points of `self`: start at 5381 and, for each
// code point, multiply the running value by 33 and XOR the code point in.
size_t
Str_Hash_Sum_IMP(String *self) {
    // Resolve the method pointer once instead of dispatching per character.
    const StrIter_Next_t next_cp
        = METHOD_PTR(STRINGITERATOR, CFISH_StrIter_Next);
    StringIterator *iter = STACK_ITER(self, 0);
    size_t hash = 5381;

    for (int32_t cp = next_cp(iter); cp != STR_OOB; cp = next_cp(iter)) {
        hash = (hash * 33) ^ cp;
    }

    return hash;
}
// Write a diagnostic containing at most the first 200 bytes of `text` (plus
// a "[...]" marker when truncated) and its total length to stderr, then
// throw a generic "Invalid UTF-8" error attributed to `file`/`line`/`func`.
static void
S_die_invalid_utf8(const char *text, size_t size, const char *file, int line,
                   const char *func) {
    const size_t max_shown = 200;
    const bool   truncated = size > max_shown;

    fputs("Invalid UTF-8, aborting: '", stderr);
    fwrite(text, sizeof(char), truncated ? max_shown : size, stderr);
    if (truncated) {
        fwrite("[...]", sizeof(char), 5, stderr);
    }
    fprintf(stderr, "' (length %lu)\n", (unsigned long)size);

    Err_throw_at(ERR, file, line, func, "Invalid UTF-8");
}
// Return `self` via INCREF rather than creating a new object.
String*
Str_To_String_IMP(String *self) {
    String *same = (String*)INCREF(self);
    return same;
}
// Parse `self` as a base-10 integer by delegating to Str_BaseX_To_I64.
int64_t
Str_To_I64_IMP(String *self) {
    const uint32_t decimal = 10;
    return Str_BaseX_To_I64(self, decimal);
}
// Parse the leading portion of `self` as an integer in the given `base`.
//
// An optional leading '-' is honored.  Digits are the ASCII characters
// 0-9, a-z and A-Z (letters count from 10 upwards, case-insensitively).
// Parsing stops at the end of the string, at the first character that is
// not such a digit, or at the first digit whose value is not below `base`.
// A string with no leading digits yields 0.
//
// Characters are classified with explicit ASCII range checks rather than
// <ctype.h>: the iterator yields full Unicode code points, and passing
// values that are not representable as unsigned char to isalnum() and
// friends is undefined behavior (and locale dependent).
//
// The value is accumulated in an unsigned 64-bit integer so that overly long
// inputs wrap around instead of triggering signed-overflow undefined
// behavior.
int64_t
Str_BaseX_To_I64_IMP(String *self, uint32_t base) {
    StringIterator *iter = STACK_ITER(self, 0);
    uint64_t magnitude   = 0;
    bool     is_negative = false;
    int32_t  code_point  = StrIter_Next(iter);

    // Advance past minus sign.
    if (code_point == '-') {
        code_point = StrIter_Next(iter);
        is_negative = true;
    }

    // Accumulate.
    while (code_point != STR_OOB) {
        int32_t addend;
        if (code_point >= '0' && code_point <= '9') {
            addend = code_point - '0';
        }
        else if (code_point >= 'a' && code_point <= 'z') {
            addend = code_point - 'a' + 10;
        }
        else if (code_point >= 'A' && code_point <= 'Z') {
            addend = code_point - 'A' + 10;
        }
        else {
            break;
        }
        if (addend >= (int32_t)base) { break; }
        magnitude = magnitude * base + (uint64_t)addend;
        code_point = StrIter_Next(iter);
    }

    // Apply minus sign (modular negation, well defined for unsigned types).
    if (is_negative) { magnitude = 0 - magnitude; }

    return (int64_t)magnitude;
}
// Convert `self` to a double with strtod().  Only the first 511 bytes of the
// string are considered, because they are copied into a fixed 512-byte
// NUL-terminated scratch buffer first.
double
Str_To_F64_IMP(String *self) {
    char scratch[512];
    const size_t limit = sizeof(scratch) - 1;

    size_t len = self->size;
    if (len > limit) {
        len = limit;
    }

    memcpy(scratch, self->ptr, len);
    scratch[len] = '\0';
    return strtod(scratch, NULL);
}
// Return a NUL-terminated copy of the string's bytes in a buffer obtained
// from malloc().  This function keeps no pointer to the buffer, so releasing
// it is left to the caller.  Throws if the allocation fails instead of
// handing a NULL pointer to memcpy().
char*
Str_To_Utf8_IMP(String *self) {
    const size_t size = self->size;
    char *buf = (char*)malloc(size + 1);
    if (buf == NULL) {
        THROW(ERR, "Str_To_Utf8: Out of memory");
        UNREACHABLE_RETURN(char*);
    }
    memcpy(buf, self->ptr, size);
    buf[size] = '\0'; // NUL-terminate.
    return buf;
}
// Create a ByteBuf from the string's bytes via BB_new_bytes().
ByteBuf*
Str_To_ByteBuf_IMP(String *self) {
    ByteBuf *bytes = BB_new_bytes(self->ptr, self->size);
    return bytes;
}
// "Clone" the string by returning `self` via INCREF; no data is copied.
String*
Str_Clone_IMP(String *self) {
    String *same = (String*)INCREF(self);
    return same;
}
// Return a new String holding the contents of `self` followed by those of
// `other`.  `other` is a String already, so its bytes are not re-validated.
String*
Str_Cat_IMP(String *self, String *other) {
    const char  *tail_ptr  = other->ptr;
    const size_t tail_size = other->size;
    return Str_Cat_Trusted_Utf8(self, tail_ptr, tail_size);
}
// Return a new String holding the contents of `self` followed by the `size`
// bytes at `ptr`, after checking that those bytes are valid UTF-8.  Invalid
// input is reported through DIE_INVALID_UTF8.
String*
Str_Cat_Utf8_IMP(String *self, const char* ptr, size_t size) {
    const bool is_valid = StrHelp_utf8_valid(ptr, size);
    if (!is_valid) {
        DIE_INVALID_UTF8(ptr, size);
    }
    return Str_Cat_Trusted_Utf8(self, ptr, size);
}
// Return a new String holding the contents of `self` followed by the `size`
// bytes at `ptr`.  The appended bytes are not validated.  The result owns a
// freshly allocated, NUL-terminated buffer.
//
// Throws if the combined length (plus the terminating NUL) would not fit in
// a size_t; without this check the addition would wrap around and the
// copies below would overrun an undersized allocation.
String*
Str_Cat_Trusted_Utf8_IMP(String *self, const char* ptr, size_t size) {
    // Equivalent to `self->size + size + 1 > SIZE_MAX`, written so that the
    // test itself cannot overflow.
    if (size >= SIZE_MAX - self->size) {
        THROW(ERR, "Str_Cat: string size overflow");
        UNREACHABLE_RETURN(String*);
    }

    size_t  result_size = self->size + size;
    char   *result_ptr  = (char*)MALLOCATE(result_size + 1);
    memcpy(result_ptr, self->ptr, self->size);
    memcpy(result_ptr + self->size, ptr, size);
    result_ptr[result_size] = '\0';

    String *result = (String*)Class_Make_Obj(STRING);
    return Str_init_steal_trusted_utf8(result, result_ptr, result_size);
}
// Report whether `self` begins with the bytes of `prefix`.
bool
Str_Starts_With_IMP(String *self, String *prefix) {
    const char  *prefix_ptr  = prefix->ptr;
    const size_t prefix_size = prefix->size;
    return Str_Starts_With_Utf8_IMP(self, prefix_ptr, prefix_size);
}
// Report whether the first `size` bytes of `self` equal the bytes at
// `prefix`.  A prefix longer than the string never matches.
bool
Str_Starts_With_Utf8_IMP(String *self, const char *prefix, size_t size) {
    if (size > self->size) {
        return false;
    }
    return memcmp(self->ptr, prefix, size) == 0;
}
// Report whether `other` is a String with exactly the same bytes as `self`.
// Identity short-circuits to true; non-String objects compare unequal.
bool
Str_Equals_IMP(String *self, Obj *other) {
    if ((Obj*)self == other) {
        return true;
    }
    if (!Obj_is_a(other, STRING)) {
        return false;
    }
    String *const twin = (String*)other;
    return Str_Equals_Utf8_IMP(self, twin->ptr, twin->size);
}
// Three-way comparison of `self` against `other`, which must be a String
// (enforced by CERTIFY).  The common-length prefix is compared bytewise with
// memcmp(); if it is equal, the shorter string sorts first.  Returns -1, 0
// or 1.
int32_t
Str_Compare_To_IMP(String *self, Obj *other) {
    String *twin = (String*)CERTIFY(other, STRING);

    const size_t self_size = self->size;
    const size_t twin_size = twin->size;
    const size_t common    = self_size <= twin_size ? self_size : twin_size;

    const int order = memcmp(self->ptr, twin->ptr, common);
    if (order != 0) {
        return order < 0 ? -1 : 1;
    }

    // Shared prefix is identical: decide by length.
    if (self_size == twin_size) {
        return 0;
    }
    return self_size < twin_size ? -1 : 1;
}
// Report whether `self` consists of exactly the `size` bytes at `ptr`.
bool
Str_Equals_Utf8_IMP(String *self, const char *ptr, size_t size) {
    // Lengths must agree before the contents are worth comparing.
    return self->size == size
           && memcmp(self->ptr, ptr, self->size) == 0;
}
// Report whether `self` ends with the bytes of `suffix`.
bool
Str_Ends_With_IMP(String *self, String *suffix) {
    const char  *suffix_ptr  = suffix->ptr;
    const size_t suffix_size = suffix->size;
    return Str_Ends_With_Utf8_IMP(self, suffix_ptr, suffix_size);
}
// Report whether the last `suffix_len` bytes of `self` equal the bytes at
// `suffix`.  A suffix longer than the string never matches.
bool
Str_Ends_With_Utf8_IMP(String *self, const char *suffix, size_t suffix_len) {
    if (suffix_len > self->size) {
        return false;
    }
    const char *tail = self->ptr + self->size - suffix_len;
    return memcmp(tail, suffix, suffix_len) == 0;
}
// Report whether the bytes of `substring` occur anywhere in `self`.
bool
Str_Contains_IMP(String *self, String *substring) {
    const char *hit = S_memmem(self, substring->ptr, substring->size);
    return hit != NULL;
}
// Report whether the `size` bytes at `substring` occur anywhere in `self`.
bool
Str_Contains_Utf8_IMP(String *self, const char *substring, size_t size) {
    const char *hit = S_memmem(self, substring, size);
    return hit != NULL;
}
// Locate the first occurrence of `substring` in `self` by delegating to
// Str_Find_Utf8 with the substring's bytes.
StringIterator*
Str_Find_IMP(String *self, String *substring) {
    const char  *needle      = substring->ptr;
    const size_t needle_size = substring->size;
    return Str_Find_Utf8(self, needle, needle_size);
}
// Locate the first occurrence of the `size` bytes at `substring` in `self`.
// Returns a new StringIterator positioned at the start of the match, or
// NULL when there is no match.
StringIterator*
Str_Find_Utf8_IMP(String *self, const char *substring, size_t size) {
    const char *hit = S_memmem(self, substring, size);
    if (hit == NULL) {
        return NULL;
    }
    // Convert the match pointer into a byte offset within the string.
    return StrIter_new(self, hit - self->ptr);
}
// Find the first occurrence of the `size` bytes at `substring` inside the
// buffer of `self`.  An empty needle matches at the very start.  Returns a
// pointer into the buffer of `self`, or NULL when there is no match.
static const char*
S_memmem(String *self, const char *substring, size_t size) {
    const char *cursor = self->ptr;
    if (size == 0)         { return cursor; }
    if (size > self->size) { return NULL; }

    // One past the last position at which a full match could still start.
    const char *const limit = cursor + self->size - size + 1;
    const char lead = substring[0];

    // Naive search: jump to each candidate first byte with memchr(), then
    // verify the whole needle with memcmp().
    for (;;) {
        cursor = (const char*)memchr(cursor, lead, limit - cursor);
        if (cursor == NULL) {
            return NULL;
        }
        if (memcmp(cursor, substring, size) == 0) {
            return cursor;
        }
        cursor++;
    }
}
// Return a String with leading and trailing whitespace removed.
String*
Str_Trim_IMP(String *self) {
    StringIterator *head = STACK_ITER(self, 0);
    StrIter_Skip_Whitespace(head);

    // If skipping forward reached the end, there is nothing left to trim
    // from the back; crop with a NULL tail.
    if (head->byte_offset >= self->size) {
        return StrIter_crop(head, NULL);
    }

    StringIterator *back = STACK_ITER(self, self->size);
    StrIter_Skip_Whitespace_Back(back);
    return StrIter_crop(head, back);
}
// Return a String with leading whitespace removed.
String*
Str_Trim_Top_IMP(String *self) {
    StringIterator *head = STACK_ITER(self, 0);
    StrIter_Skip_Whitespace(head);
    // A NULL tail makes StrIter_crop keep everything up to the end.
    String *trimmed = StrIter_crop(head, NULL);
    return trimmed;
}
// Return a String with trailing whitespace removed.
String*
Str_Trim_Tail_IMP(String *self) {
    StringIterator *back = STACK_ITER(self, self->size);
    StrIter_Skip_Whitespace_Back(back);
    // A NULL top makes StrIter_crop start at the beginning of the string.
    String *trimmed = StrIter_crop(NULL, back);
    return trimmed;
}
// Return the length of `self` in code points, obtained by advancing an
// iterator from the start as far as it will go and counting the steps.
size_t
Str_Length_IMP(String *self) {
    StringIterator *iter = STACK_ITER(self, 0);
    const size_t num_code_points = StrIter_Advance(iter, SIZE_MAX);
    return num_code_points;
}
// Return the code point at index `tick`, counted in code points from the
// start of the string.  The result is whatever StrIter_Next yields after
// skipping `tick` code points.
int32_t
Str_Code_Point_At_IMP(String *self, size_t tick) {
    StringIterator *iter = STACK_ITER(self, 0);
    StrIter_Advance(iter, tick);
    const int32_t code_point = StrIter_Next(iter);
    return code_point;
}
// Return the code point `tick` positions from the end of the string, where
// tick 1 denotes the last code point.  A tick of 0 yields STR_OOB.
int32_t
Str_Code_Point_From_IMP(String *self, size_t tick) {
    if (tick == 0) {
        return STR_OOB;
    }

    StringIterator *iter = STACK_ITER(self, self->size);
    // Step back over the code points after the wanted one, then read the
    // wanted one with StrIter_Prev.
    const size_t steps_back = tick - 1;
    StrIter_Recede(iter, steps_back);
    return StrIter_Prev(iter);
}
// Return the substring that starts `offset` code points into `self` and
// spans up to `len` code points.  Both values are translated to byte
// positions by advancing an iterator.
String*
Str_SubString_IMP(String *self, size_t offset, size_t len) {
    StringIterator *iter = STACK_ITER(self, 0);

    StrIter_Advance(iter, offset);
    const size_t begin = iter->byte_offset;

    StrIter_Advance(iter, len);
    const size_t end = iter->byte_offset;

    return S_new_substring(self, begin, end - begin);
}
// Return the size of the string's buffer in bytes.
size_t
Str_Get_Size_IMP(String *self) {
    const size_t num_bytes = self->size;
    return num_bytes;
}
// Return a pointer to the string's internal byte buffer.
const char*
Str_Get_Ptr8_IMP(String *self) {
    const char *bytes = self->ptr;
    return bytes;
}
// Return a new StringIterator positioned at the start of `self`.
StringIterator*
Str_Top_IMP(String *self) {
    const size_t start_offset = 0;
    return StrIter_new(self, start_offset);
}
// Return a new StringIterator positioned at the end of `self`.
StringIterator*
Str_Tail_IMP(String *self) {
    const size_t end_offset = self->size;
    return StrIter_new(self, end_offset);
}
/*****************************************************************/
// Create a heap-allocated StringIterator over `string`, positioned at
// `byte_offset`.  The iterator takes its own reference on the string, which
// StrIter_Destroy_IMP releases.  The offset is stored as given.
StringIterator*
StrIter_new(String *string, size_t byte_offset) {
    StringIterator *iter = (StringIterator*)Class_Make_Obj(STRINGITERATOR);
    iter->byte_offset = byte_offset;
    iter->string      = (String*)INCREF(string);
    return iter;
}
// Initialize a StringIterator inside the caller-provided `allocation`,
// positioned at `byte_offset` of `string`.
static StringIterator*
S_new_stack_iter(void *allocation, String *string, size_t byte_offset) {
    StringIterator *iter
        = (StringIterator*)Class_Init_Obj(STRINGITERATOR, allocation);

    // No INCREF here: the string is assumed to stay alive for as long as
    // the iterator is in use.
    iter->byte_offset = byte_offset;
    iter->string      = string;
    return iter;
}
// Return the substring delimited by two iterators over the same string.
// A NULL `top` means "from the start"; a NULL `tail` means "to the end".
// Throws if both are NULL, if they refer to different strings, or if `top`
// lies behind `tail`.
String*
StrIter_crop(StringIterator *top, StringIterator *tail) {
    if (tail == NULL && top == NULL) {
        THROW(ERR, "StrIter_crop: Both top and tail are NULL");
        UNREACHABLE_RETURN(String*);
    }
    if (tail != NULL && top != NULL && tail->string != top->string) {
        THROW(ERR, "StrIter_crop: strings don't match");
        UNREACHABLE_RETURN(String*);
    }

    // At least one iterator is non-NULL at this point.
    String *string     = tail != NULL ? tail->string      : top->string;
    size_t tail_offset = tail != NULL ? tail->byte_offset : string->size;
    size_t top_offset  = top  != NULL ? top->byte_offset  : 0;

    if (top_offset > tail_offset) {
        THROW(ERR, "StrIter_crop: top is behind tail");
        UNREACHABLE_RETURN(String*);
    }

    return S_new_substring(string, top_offset, tail_offset - top_offset);
}
// Return a new heap-allocated iterator over the same string at the same
// byte offset as `self`.
StringIterator*
StrIter_Clone_IMP(StringIterator *self) {
    StringIterator *twin = StrIter_new(self->string, self->byte_offset);
    return twin;
}
// Make `self` refer to the same string and position as `other`.  The string
// reference is only swapped when the two iterators refer to different
// strings.
void
StrIter_Assign_IMP(StringIterator *self, StringIterator *other) {
    String *incoming = other->string;
    if (incoming != self->string) {
        DECREF(self->string);
        self->string = (String*)INCREF(incoming);
    }
    self->byte_offset = other->byte_offset;
}
// Report whether `other` is a StringIterator over the very same String
// object (pointer identity) at the same byte offset.
bool
StrIter_Equals_IMP(StringIterator *self, Obj *other) {
    if ((Obj*)self == other) {
        return true;
    }
    if (!Obj_is_a(other, STRINGITERATOR)) {
        return false;
    }

    StringIterator *const twin = (StringIterator*)other;
    if (self->string != twin->string) {
        return false;
    }
    return self->byte_offset == twin->byte_offset;
}
// Three-way comparison of the positions of two iterators over the same
// String object.  `other` must be a StringIterator (enforced by CERTIFY);
// throws if the iterators refer to different strings.  Returns -1, 0 or 1.
int32_t
StrIter_Compare_To_IMP(StringIterator *self, Obj *other) {
    StringIterator *twin = (StringIterator*)CERTIFY(other, STRINGITERATOR);
    if (self->string != twin->string) {
        THROW(ERR, "Can't compare iterators of different strings");
        UNREACHABLE_RETURN(int32_t);
    }

    const size_t mine   = self->byte_offset;
    const size_t theirs = twin->byte_offset;
    if (mine == theirs) {
        return 0;
    }
    return mine < theirs ? -1 : 1;
}
// Report whether the iterator is positioned before the end of its string.
bool
StrIter_Has_Next_IMP(StringIterator *self) {
    const size_t end = self->string->size;
    return self->byte_offset < end;
}
// Report whether the iterator is positioned after the start of its string.
bool
StrIter_Has_Prev_IMP(StringIterator *self) {
    const size_t pos = self->byte_offset;
    return pos > 0;
}
// Decode the code point that starts at the iterator's byte offset and move
// the iterator past it.  Returns STR_OOB, leaving the iterator where it is,
// when the offset is at or beyond the end of the string.  Throws if a
// multi-byte sequence would continue past the end of the buffer.
int32_t
StrIter_Next_IMP(StringIterator *self) {
String *string = self->string;
size_t byte_offset = self->byte_offset;
size_t size = string->size;
if (byte_offset >= size) { return STR_OOB; }
// Read bytes as unsigned so that values >= 0x80 do not turn negative.
const uint8_t *const ptr = (const uint8_t*)string->ptr;
int32_t retval = ptr[byte_offset++];
// Bytes below 0x80 are complete code points and need no further decoding.
if (retval >= 0x80) {
/*
* The 'mask' bit is tricky. In each iteration, 'retval' is
* left-shifted by 6 and 'mask' by 5 bits. So relative to the first
* byte of the sequence, 'mask' moves one bit to the right.
*
* The possible outcomes after the loop are:
*
* Two byte sequence
* retval: 110aaaaa bbbbbb
* mask: 00100000 000000
*
* Three byte sequence
* retval: 1110aaaa bbbbbb cccccc
* mask: 00010000 000000 000000
*
* Four byte sequence
* retval: 11110aaa bbbbbb cccccc dddddd
* mask: 00001000 000000 000000 000000
*
* This also illustrates why the exit condition (retval & mask)
* works. After the first iteration, the third most significant bit
* is tested. After the second iteration, the fourth, and so on.
*/
int32_t mask = 1 << 6;
do {
// A continuation byte is expected but the buffer is exhausted.
if (byte_offset >= size) {
THROW(ERR, "StrIter_Next: Invalid UTF-8");
UNREACHABLE_RETURN(int32_t);
}
// Append the six payload bits of the next byte.
retval = (retval << 6) | (ptr[byte_offset++] & 0x3F);
mask <<= 5;
} while (retval & mask);
// Clear the length-marker bits of the first byte, which sit at and above
// the position of 'mask'.
retval &= mask - 1;
}
// The new position is stored only after the whole sequence was read.
self->byte_offset = byte_offset;
return retval;
}
// Decode the code point that ends just before the iterator's byte offset and
// move the iterator to the start of that code point.  Returns STR_OOB,
// leaving the iterator where it is, when the offset is 0.  Throws if the
// start of the buffer is reached while still inside a multi-byte sequence.
int32_t
StrIter_Prev_IMP(StringIterator *self) {
size_t byte_offset = self->byte_offset;
if (byte_offset == 0) { return STR_OOB; }
// Read bytes as unsigned so that values >= 0x80 do not turn negative.
const uint8_t *const ptr = (const uint8_t*)self->string->ptr;
int32_t retval = ptr[--byte_offset];
// Bytes below 0x80 are complete code points and need no further decoding.
if (retval >= 0x80) {
// Construct the result from right to left.
if (byte_offset == 0) {
THROW(ERR, "StrIter_Prev: Invalid UTF-8");
UNREACHABLE_RETURN(int32_t);
}
// Keep the six payload bits of the last byte of the sequence.
retval &= 0x3F;
int shift = 6;
// Payload mask for the first byte: 0x1F (five bits) for a two-byte
// sequence, narrowed by one bit per additional continuation byte below.
int32_t first_byte_mask = 0x1F;
int32_t byte = ptr[--byte_offset];
// Bytes of the form 10xxxxxx are continuation bytes; keep walking back
// until the first byte of the sequence is found.
while ((byte & 0xC0) == 0x80) {
if (byte_offset == 0) {
THROW(ERR, "StrIter_Prev: Invalid UTF-8");
UNREACHABLE_RETURN(int32_t);
}
retval |= (byte & 0x3F) << shift;
shift += 6;
first_byte_mask >>= 1;
byte = ptr[--byte_offset];
}
// Merge the payload bits of the first byte into the top of the result.
retval |= (byte & first_byte_mask) << shift;
}
// The new position is stored only after the whole sequence was read.
self->byte_offset = byte_offset;
return retval;
}
// Move the iterator forward by up to `num` code points, stopping early at
// the end of the string.  Each step uses the StrHelp_UTF8_COUNT table entry
// for the current byte as the step width.  Throws, without moving the
// iterator, if the last step would land beyond the end of the buffer.
// Returns the number of code points actually skipped.
size_t
StrIter_Advance_IMP(StringIterator *self, size_t num) {
    String *string = self->string;
    const uint8_t *const bytes = (const uint8_t*)string->ptr;
    const size_t end = string->size;

    size_t pos   = self->byte_offset;
    size_t count = 0;
    for (; count < num && pos < end; ++count) {
        pos += StrHelp_UTF8_COUNT[bytes[pos]];
    }

    if (pos > end) {
        THROW(ERR, "StrIter_Advance: Invalid UTF-8");
        UNREACHABLE_RETURN(size_t);
    }

    self->byte_offset = pos;
    return count;
}
// Move the iterator backward by up to `num` code points, stopping early at
// the start of the string.  Throws, without moving the iterator, if the
// start of the buffer is reached while still inside a multi-byte sequence.
// Returns the number of code points actually skipped.
size_t
StrIter_Recede_IMP(StringIterator *self, size_t num) {
    const uint8_t *const bytes = (const uint8_t*)self->string->ptr;
    size_t pos   = self->byte_offset;
    size_t count = 0;

    while (count < num && pos != 0) {
        // Walk back over continuation bytes (10xxxxxx) until the first byte
        // of the sequence has been consumed.
        for (;;) {
            if (pos == 0) {
                THROW(ERR, "StrIter_Recede: Invalid UTF-8");
                UNREACHABLE_RETURN(size_t);
            }
            const uint8_t byte = bytes[--pos];
            if ((byte & 0xC0) != 0x80) {
                break;
            }
        }
        ++count;
    }

    self->byte_offset = pos;
    return count;
}
// Move the iterator forward past any run of whitespace code points (as
// classified by StrHelp_is_whitespace) and return how many were skipped.
// On return the iterator sits on the first non-whitespace code point, or at
// the end of the string.
size_t
StrIter_Skip_Whitespace_IMP(StringIterator *self) {
    size_t count = 0;

    for (;;) {
        // Remember where this code point starts so that the position can be
        // restored if it turns out not to be whitespace.
        const size_t resume_at = self->byte_offset;
        const int32_t code_point = StrIter_Next(self);

        if (code_point == STR_OOB || !StrHelp_is_whitespace(code_point)) {
            self->byte_offset = resume_at;
            return count;
        }
        ++count;
    }
}
// Move the iterator backward past any run of whitespace code points (as
// classified by StrHelp_is_whitespace) and return how many were skipped.
// On return the iterator sits just after the last non-whitespace code point
// before its original position, or at the start of the string.
size_t
StrIter_Skip_Whitespace_Back_IMP(StringIterator *self) {
    size_t count = 0;

    for (;;) {
        // Remember the current position so that it can be restored if the
        // preceding code point turns out not to be whitespace.
        const size_t resume_at = self->byte_offset;
        const int32_t code_point = StrIter_Prev(self);

        if (code_point == STR_OOB || !StrHelp_is_whitespace(code_point)) {
            self->byte_offset = resume_at;
            return count;
        }
        ++count;
    }
}
// Report whether the text at the iterator's position begins with the bytes
// of `prefix`.
bool
StrIter_Starts_With_IMP(StringIterator *self, String *prefix) {
    const char  *prefix_ptr  = prefix->ptr;
    const size_t prefix_size = prefix->size;
    return StrIter_Starts_With_Utf8_IMP(self, prefix_ptr, prefix_size);
}
// Report whether the `size` bytes at `prefix` occur in the string exactly at
// the iterator's current byte offset.
//
// Throws if the iterator's offset lies beyond the end of the string.
// StrIter_new() stores whatever offset it is given, and without this check
// the unsigned subtraction below would wrap around and let memcmp() read
// outside the string's buffer.
bool
StrIter_Starts_With_Utf8_IMP(StringIterator *self, const char *prefix,
                             size_t size) {
    String *string      = self->string;
    size_t  byte_offset = self->byte_offset;

    if (byte_offset > string->size) {
        THROW(ERR, "Invalid StringIterator offset");
        UNREACHABLE_RETURN(bool);
    }

    // Not enough bytes left after the iterator for the prefix to fit.
    if (string->size - byte_offset < size) { return false; }

    return memcmp(string->ptr + byte_offset, prefix, size) == 0;
}
// Report whether the text before the iterator's position ends with the
// bytes of `suffix`.
bool
StrIter_Ends_With_IMP(StringIterator *self, String *suffix) {
    const char  *suffix_ptr  = suffix->ptr;
    const size_t suffix_size = suffix->size;
    return StrIter_Ends_With_Utf8_IMP(self, suffix_ptr, suffix_size);
}
// Report whether the `size` bytes at `suffix` occur in the string
// immediately before the iterator's current byte offset.
//
// Throws if the iterator's offset lies beyond the end of the string.
// StrIter_new() stores whatever offset it is given, and without this check
// memcmp() could read past the end of the string's buffer.
bool
StrIter_Ends_With_Utf8_IMP(StringIterator *self, const char *suffix,
                           size_t size) {
    String *string      = self->string;
    size_t  byte_offset = self->byte_offset;

    if (byte_offset > string->size) {
        THROW(ERR, "Invalid StringIterator offset");
        UNREACHABLE_RETURN(bool);
    }

    // Not enough bytes before the iterator for the suffix to fit.
    if (byte_offset < size) { return false; }

    return memcmp(string->ptr + byte_offset - size, suffix, size) == 0;
}
// Release the iterator's reference on its string, then chain to the parent
// destructor.
void
StrIter_Destroy_IMP(StringIterator *self) {
    String *held = self->string;
    DECREF(held);
    SUPER_DESTROY(self, STRINGITERATOR);
}