| /* Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| parcel Clownfish; |
| |
| __C__ |
| |
| // For strlen |
| #include <string.h> |
| |
| // For CFISH_ALLOCA_OBJ. |
| #include "Clownfish/Class.h" |
| |
| // For CFISH_ERR_FUNC_MACRO. |
| #include "Clownfish/Err.h" |
| |
| __END_C__ |
| |
| /** |
| * Immutable string holding Unicode characters. |
| * |
| * Character data enters and leaves the class as UTF-8. Functions that take |
| * raw UTF-8 expect a pointer plus a size in bytes, while `Length`, |
| * `Code_Point_At`, `Code_Point_From` and `SubString` count in Unicode code |
| * points. Positions inside a String are represented by |
| * [](StringIterator) objects, as returned by `Top`, `Tail` and `Find`. |
| */ |
| |
| public final class Clownfish::String nickname Str |
| inherits Clownfish::Obj { |
| |
| const char *ptr; |
| size_t size; |
| String *origin; |
| |
| /** Return true if the string is valid UTF-8, false otherwise. |
| * |
| * @param ptr Pointer to the character data to check. |
| * @param len Size of the character data in bytes. |
| */ |
| public inert bool |
| utf8_valid(const char *ptr, size_t len); |
| |
| /** Throws an error if the string isn't valid UTF-8. The |
| * `CFISH_VALIDATE_UTF8` macro calls this function and fills in the |
| * last three arguments for the call site. |
| * |
| * @param text Pointer to the character data to validate. |
| * @param size Size of the character data in bytes. |
| * @param file Source file of the call site (`__FILE__` in the macro). |
| * @param line Source line of the call site (`__LINE__` in the macro). |
| * @param func Function of the call site (`CFISH_ERR_FUNC_MACRO` in the |
| * macro). |
| */ |
| public inert void |
| validate_utf8(const char *text, size_t size, const char *file, int line, |
| const char *func); |
| |
| /** Returns true if the code point qualifies as Unicode whitespace. |
| */ |
| public inert bool |
| is_whitespace(int32_t code_point); |
| |
| /** Encode a Unicode code point to a UTF-8 sequence. |
| * |
| * @param code_point A legal unicode code point. |
| * @param buffer Write buffer which must hold at least 4 bytes (the |
| * maximum legal length for a UTF-8 char). |
| */ |
| inert uint32_t |
| encode_utf8_char(int32_t code_point, void *buffer); |
| |
| /** Return a String which holds a copy of the supplied UTF-8 character |
| * data after checking for validity. |
| * |
| * @param utf8 Pointer to UTF-8 character data. |
| * @param size Size of UTF-8 character data in bytes. |
| */ |
| public inert incremented String* |
| new_from_utf8(const char *utf8, size_t size); |
| |
| /** Return a String which holds a copy of the supplied UTF-8 character |
| * data, skipping validity checks. |
| * |
| * @param utf8 Pointer to UTF-8 character data. |
| * @param size Size of UTF-8 character data in bytes. |
| */ |
| public inert incremented String* |
| new_from_trusted_utf8(const char *utf8, size_t size); |
| |
| /** Initialize a String which holds a copy of the supplied UTF-8 character |
| * data, skipping validity checks. |
| * |
| * @param utf8 Pointer to UTF-8 character data. |
| * @param size Size of UTF-8 character data in bytes. |
| */ |
| public inert String* |
| init_from_trusted_utf8(String *self, const char *utf8, size_t size); |
| |
| /** Return a String which assumes ownership of the supplied buffer |
| * containing UTF-8 character data after checking for validity. |
| * |
| * @param utf8 Pointer to UTF-8 character data. |
| * @param size Size of UTF-8 character data in bytes. |
| */ |
| public inert incremented String* |
| new_steal_utf8(char *utf8, size_t size); |
| |
| /** Return a String which assumes ownership of the supplied buffer |
| * containing UTF-8 character data, skipping validity checks. |
| * |
| * @param utf8 Pointer to UTF-8 character data. |
| * @param size Size of UTF-8 character data in bytes. |
| */ |
| public inert incremented String* |
| new_steal_trusted_utf8(char *utf8, size_t size); |
| |
| /** Initialize a String which assumes ownership of the supplied buffer |
| * containing UTF-8 character data, skipping validity checks. |
| * |
| * @param utf8 Pointer to UTF-8 character data. |
| * @param size Size of UTF-8 character data in bytes. |
| */ |
| public inert String* |
| init_steal_trusted_utf8(String *self, char *utf8, size_t size); |
| |
| /** Return a String which wraps an external buffer containing UTF-8 |
| * character data after checking for validity. The buffer must stay |
| * unchanged for the lifetime of the String. |
| * |
| * @param utf8 Pointer to UTF-8 character data. |
| * @param size Size of UTF-8 character data in bytes. |
| */ |
| public inert incremented String* |
| new_wrap_utf8(const char *utf8, size_t size); |
| |
| /** Return a String which wraps an external buffer containing UTF-8 |
| * character data, skipping validity checks. The buffer must stay |
| * unchanged for the lifetime of the String. |
| * |
| * @param utf8 Pointer to UTF-8 character data. |
| * @param size Size of UTF-8 character data in bytes. |
| */ |
| public inert incremented String* |
| new_wrap_trusted_utf8(const char *utf8, size_t size); |
| |
| inert incremented String* |
| init_stack_string(void *allocation, const char *utf8, size_t size); |
| |
| /** Initialize a String which wraps an external buffer containing UTF-8 |
| * character data, skipping validity checks. The buffer must stay |
| * unchanged for the lifetime of the String. |
| * |
| * @param utf8 Pointer to UTF-8 character data. |
| * @param size Size of UTF-8 character data in bytes. |
| */ |
| public inert String* |
| init_wrap_trusted_utf8(String *self, const char *utf8, size_t size); |
| |
| /** Return a String which holds a single character. |
| * |
| * @param code_point Unicode code point of the character. |
| */ |
| public inert incremented String* |
| new_from_char(int32_t code_point); |
| |
| /** Return a String with content expanded from a pattern and arguments |
| * conforming to the spec defined by [](CharBuf.VCatF). |
| * |
| * Note: a user-supplied `pattern` string is a security hole |
| * and must not be allowed. |
| * |
| * @param pattern A format string. |
| */ |
| public inert incremented String* |
| newf(const char *pattern, ...); |
| |
| void* |
| To_Host(String *self, void *vcache); |
| |
| /** Return the concatenation of the String and `other`. |
| */ |
| public incremented String* |
| Cat(String *self, String *other); |
| |
| /** Return the concatenation of the String and the supplied UTF-8 |
| * character data after checking for validity. |
| * |
| * @param utf8 Pointer to UTF-8 character data. |
| * @param size Size of UTF-8 character data in bytes. |
| */ |
| public incremented String* |
| Cat_Utf8(String *self, const char *utf8, size_t size); |
| |
| /** Return the concatenation of the String and the supplied UTF-8 |
| * character data, skipping validity checks. |
| * |
| * @param utf8 Pointer to UTF-8 character data. |
| * @param size Size of UTF-8 character data in bytes. |
| */ |
| public incremented String* |
| Cat_Trusted_Utf8(String *self, const char *utf8, size_t size); |
| |
| /** Extract a 64-bit integer from a decimal string. See [](.BaseX_To_I64) |
| * for details. |
| */ |
| public int64_t |
| To_I64(String *self); |
| |
| /** Extract a 64-bit integer from a variable-base stringified version. |
| * Expects an optional minus sign followed by base-x digits, stopping at |
| * any non-digit character. Returns zero if no digits are found. If the |
| * value exceeds the range of an `int64_t`, the result is undefined. |
| * |
| * @param base A base between 2 and 36. |
| */ |
| public int64_t |
| BaseX_To_I64(String *self, uint32_t base); |
| |
| /** Convert a string to a floating-point number using the C library |
| * function `strtod`. |
| */ |
| public double |
| To_F64(String *self); |
| |
| /** Test whether the String starts with `prefix`. |
| */ |
| public bool |
| Starts_With(String *self, String *prefix); |
| |
| /** Test whether the String starts with a prefix supplied as raw UTF-8. |
| * |
| * @param utf8 Pointer to UTF-8 character data. |
| * @param size Size of UTF-8 character data in bytes. |
| */ |
| public bool |
| Starts_With_Utf8(String *self, const char *utf8, size_t size); |
| |
| /** Test whether the String ends with `suffix`. |
| */ |
| public bool |
| Ends_With(String *self, String *suffix); |
| |
| /** Test whether the String ends with a suffix supplied as raw UTF-8. |
| * |
| * @param utf8 Pointer to UTF-8 character data. |
| * @param size Size of UTF-8 character data in bytes. |
| */ |
| public bool |
| Ends_With_Utf8(String *self, const char *utf8, size_t size); |
| |
| /** Test whether the String contains `substring`. |
| */ |
| public bool |
| Contains(String *self, String *substring); |
| |
| /** Test whether the String contains a substring supplied as raw UTF-8. |
| * |
| * @param utf8 Pointer to UTF-8 character data. |
| * @param size Size of UTF-8 character data in bytes. |
| */ |
| public bool |
| Contains_Utf8(String *self, const char *utf8, size_t size); |
| |
| /** Return a [](StringIterator) pointing to the first occurrence of |
| * `substring` within the String, or [](@null) if the substring does not |
| * match. |
| */ |
| public incremented nullable StringIterator* |
| Find(String *self, String *substring); |
| |
| /** Return a [](StringIterator) pointing to the first occurrence of the |
| * substring within the String, or [](@null) if the substring does not |
| * match. The substring is supplied as raw UTF-8. |
| * |
| * @param utf8 Pointer to UTF-8 character data. |
| * @param size Size of UTF-8 character data in bytes. |
| */ |
| public incremented nullable StringIterator* |
| Find_Utf8(String *self, const char *utf8, size_t size); |
| |
| /** Equality test. |
| * |
| * @return true if `other` is a String with the same character data as |
| * `self`. |
| */ |
| public bool |
| Equals(String *self, Obj *other); |
| |
| /** Test whether the String matches the supplied UTF-8 character data. |
| * |
| * @param utf8 Pointer to UTF-8 character data. |
| * @param size Size of UTF-8 character data in bytes. |
| */ |
| public bool |
| Equals_Utf8(String *self, const char *utf8, size_t size); |
| |
| /** Return the number of Unicode code points the String contains. |
| */ |
| public size_t |
| Length(String *self); |
| |
| /** Return the number of bytes occupied by the String's internal content. |
| */ |
| public size_t |
| Get_Size(String *self); |
| |
| /** Return the internal backing array for the String if its internal |
| * encoding is UTF-8. If it is not encoded as UTF-8 throw an exception. |
| * The character data is not null-terminated. |
| */ |
| public const char* |
| Get_Ptr8(String *self); |
| |
| /** Return a NULL-terminated copy of the string data in UTF-8 encoding. |
| * The buffer must be freed by the caller. |
| */ |
| public char* |
| To_Utf8(String *self); |
| |
| /** Return a ByteBuf which holds a copy of the String. |
| */ |
| public incremented ByteBuf* |
| To_ByteBuf(String *self); |
| |
| public incremented String* |
| Clone(String *self); |
| |
| bool |
| Is_Copy_On_IncRef(String *self); |
| |
| /** Indicate whether one String is less than, equal to, or greater than |
| * another. The Unicode code points of the Strings are compared |
| * lexicographically. Throws an exception if `other` is not a String. |
| * |
| * @return 0 if the Strings are equal, a negative number if `self` is less |
| * than `other`, and a positive number if `self` is greater than `other`. |
| */ |
| public int32_t |
| Compare_To(String *self, Obj *other); |
| |
| /** Return a hash code for the string. |
| */ |
| size_t |
| Hash_Sum(String *self); |
| |
| /** Return a copy of the String. |
| */ |
| public incremented String* |
| To_String(String *self); |
| |
| /** Return a copy of the String with Unicode whitespace characters |
| * removed from both top and tail. Whitespace is any character that has |
| * the Unicode property `White_Space`. |
| */ |
| public incremented String* |
| Trim(String *self); |
| |
| /** Return a copy of the String with leading Unicode whitespace |
| * removed. Whitespace is any character that has the Unicode property |
| * `White_Space`. |
| */ |
| public incremented String* |
| Trim_Top(String *self); |
| |
| /** Return a copy of the String with trailing Unicode whitespace |
| * removed. Whitespace is any character that has the Unicode property |
| * `White_Space`. |
| */ |
| public incremented String* |
| Trim_Tail(String *self); |
| |
| /** Return the Unicode code point located `tick` code points in from the |
| * top. Return `CFISH_STR_OOB` if out of bounds. |
| */ |
| public int32_t |
| Code_Point_At(String *self, size_t tick); |
| |
| /** Return the Unicode code point located `tick` code points counting |
| * backwards from the end. Return `CFISH_STR_OOB` if out of bounds. |
| */ |
| public int32_t |
| Code_Point_From(String *self, size_t tick); |
| |
| /** Return a new substring containing a copy of the specified range. |
| * |
| * @param offset Offset from the top, in code points. |
| * @param length The desired length of the substring, in code points. |
| */ |
| public incremented String* |
| SubString(String *self, size_t offset, size_t length); |
| |
| /** Return an iterator initialized to the start of the string. |
| */ |
| public incremented StringIterator* |
| Top(String *self); |
| |
| /** Return an iterator initialized to the end of the string. |
| */ |
| public incremented StringIterator* |
| Tail(String *self); |
| |
| public void |
| Destroy(String *self); |
| } |
| |
| /** |
| * Iterate Unicode code points in a String. |
| * |
| * An iterator marks a position within its source string: `Next` returns |
| * the code point after that position and moves forward, `Prev` returns |
| * the code point before it and moves back. `Next` at the end of the |
| * string and `Prev` at the start of the string return `CFISH_STR_OOB`. |
| */ |
| |
| public final class Clownfish::StringIterator nickname StrIter |
| inherits Clownfish::Obj { |
| |
| String *string; |
| size_t byte_offset; |
| |
| inert incremented StringIterator* |
| new(String *string, size_t byte_offset); |
| |
| /** Return the substring between the top and tail iterators. |
| * |
| * @param top Top iterator. Use start of string if [](@null). |
| * @param tail Tail iterator. Use end of string if [](@null). |
| */ |
| public inert incremented String* |
| crop(StringIterator *top, StringIterator *tail); |
| |
| public incremented StringIterator* |
| Clone(StringIterator *self); |
| |
| /** Assign the source string and current position of `other` to `self`. |
| */ |
| public void |
| Assign(StringIterator *self, StringIterator *other); |
| |
| /** Equality test. |
| * |
| * @return true if `other` is a StringIterator with the same source |
| * string and character position as `self`. |
| */ |
| public bool |
| Equals(StringIterator *self, Obj *other); |
| |
| /** Indicate whether one StringIterator is less than, equal to, or |
| * greater than another by comparing their character positions. Throws an |
| * exception if `other` is not a StringIterator pointing to the same |
| * source string as `self`. |
| * |
| * @return 0 if the StringIterators are equal, a negative number if `self` |
| * is less than `other`, and a positive number if `self` is greater than |
| * `other`. |
| */ |
| public int32_t |
| Compare_To(StringIterator *self, Obj *other); |
| |
| /** Return true if the iterator is not at the end of the string. |
| */ |
| public bool |
| Has_Next(StringIterator *self); |
| |
| /** Return true if the iterator is not at the start of the string. |
| */ |
| public bool |
| Has_Prev(StringIterator *self); |
| |
| /** Return the code point after the current position and advance the |
| * iterator. Return `CFISH_STR_OOB` at the end of the string. |
| */ |
| public int32_t |
| Next(StringIterator *self); |
| |
| /** Return the code point before the current position and go one step back. |
| * Return `CFISH_STR_OOB` at the start of the string. |
| */ |
| public int32_t |
| Prev(StringIterator *self); |
| |
| /** Skip code points. |
| * |
| * @param num The number of code points to skip. |
| * @return the number of code points actually skipped. This can be less |
| * than the requested number if the end of the string is reached. |
| */ |
| public size_t |
| Advance(StringIterator *self, size_t num); |
| |
| /** Skip code points backward. |
| * |
| * @param num The number of code points to skip. |
| * @return the number of code points actually skipped. This can be less |
| * than the requested number if the start of the string is reached. |
| */ |
| public size_t |
| Recede(StringIterator *self, size_t num); |
| |
| /** Skip whitespace. Whitespace is any character that has the Unicode |
| * property `White_Space`. |
| * |
| * @return the number of code points skipped. |
| */ |
| public size_t |
| Skip_Whitespace(StringIterator *self); |
| |
| /** Skip whitespace backward. Whitespace is any character that has the |
| * Unicode property `White_Space`. |
| * |
| * @return the number of code points skipped. |
| */ |
| public size_t |
| Skip_Whitespace_Back(StringIterator *self); |
| |
| /** Test whether the content after the iterator starts with `prefix`. |
| */ |
| public bool |
| Starts_With(StringIterator *self, String *prefix); |
| |
| /** Test whether the content after the iterator starts with a prefix |
| * supplied as raw UTF-8. |
| * |
| * @param utf8 Pointer to UTF-8 character data. |
| * @param size Size of UTF-8 character data in bytes. |
| */ |
| public bool |
| Starts_With_Utf8(StringIterator *self, const char *utf8, size_t size); |
| |
| /** Test whether the content before the iterator ends with `suffix`. |
| */ |
| public bool |
| Ends_With(StringIterator *self, String *suffix); |
| |
| /** Test whether the content before the iterator ends with a suffix |
| * supplied as raw UTF-8. |
| * |
| * @param utf8 Pointer to UTF-8 character data. |
| * @param size Size of UTF-8 character data in bytes. |
| */ |
| public bool |
| Ends_With_Utf8(StringIterator *self, const char *utf8, size_t size); |
| |
| public void |
| Destroy(StringIterator *self); |
| } |
| |
| __C__ |
| |
| // Validate UTF-8 character data through cfish_Str_validate_utf8, passing |
| // the file, line and function of the call site. |
| #define CFISH_VALIDATE_UTF8(text, size) \ |
| cfish_Str_validate_utf8(text, size, \ |
| __FILE__, __LINE__, CFISH_ERR_FUNC_MACRO) |
| |
| // The CFISH_SSTR_* macros below initialize a String through |
| // cfish_Str_init_stack_string in memory obtained from |
| // CFISH_ALLOCA_OBJ(CFISH_STRING). |
| // NOTE(review): that allocation is presumably stack-based, so such a |
| // String should not be used after the calling function returns -- confirm |
| // against CFISH_ALLOCA_OBJ in Clownfish/Class.h. |
| |
| // Stack string with empty content. |
| #define CFISH_SSTR_BLANK() \ |
| cfish_Str_init_stack_string(CFISH_ALLOCA_OBJ(CFISH_STRING), "", 0) |
| |
| // Stack string for a NUL-terminated C string. `ptr` is expanded twice |
| // (once as the data pointer, once for strlen), so the argument must not |
| // have side effects. |
| #define CFISH_SSTR_WRAP_C(ptr) \ |
| cfish_Str_init_stack_string(CFISH_ALLOCA_OBJ(CFISH_STRING), ptr, \ |
| strlen(ptr)) |
| |
| // Stack string for `size` bytes of UTF-8 character data at `ptr`. |
| #define CFISH_SSTR_WRAP_UTF8(ptr, size) \ |
| cfish_Str_init_stack_string(CFISH_ALLOCA_OBJ(CFISH_STRING), ptr, size) |
| |
| // Out-of-bounds marker returned by Str_Code_Point_At, |
| // Str_Code_Point_From, StrIter_Next and StrIter_Prev. Parenthesized so the |
| // negative literal always expands as a single operand. |
| #define CFISH_STR_OOB (-1) |
| |
| #ifdef CFISH_USE_SHORT_NAMES |
| #define VALIDATE_UTF8 CFISH_VALIDATE_UTF8 |
| #define SSTR_BLANK CFISH_SSTR_BLANK |
| #define SSTR_WRAP_C CFISH_SSTR_WRAP_C |
| #define SSTR_WRAP_UTF8 CFISH_SSTR_WRAP_UTF8 |
| #define STR_OOB CFISH_STR_OOB |
| #endif |
| __END_C__ |
| |
| |