blob: 46b3929b565a7ddbc86c8f1e48d9e6aea3b1ca45 [file] [log] [blame]
/* prefix_string.c --- implement strings based on a prefix tree
*
* ====================================================================
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
* ====================================================================
*/
#include <assert.h>
#include "private/svn_string_private.h"
/* A node in the tree represents a common prefix. The root node is the
* empty prefix. Nodes may have up to 256 sub-nodes, each starting with
* a different character (possibly '\0').
*
* The nodes in the tree store only up to 8 chars of the respective common
* prefix, i.e. longer common prefixes must be drawn out over multiple
* hierarchy levels. This is a space <-> efficiency trade-off.
*
* Strings are the leaf nodes in the tree and use a specialized, smaller
* data structure. They may add 0 to 7 extra chars to the prefix. Both
* data types can be discerned by the last char in the data buffer. This
* must be 0 for strings (leaves) and non-0 otherwise. Please note that
* ordinary nodes have a length information so that no terminating 0 is
* required for them.
*/
/* forward declaration */
typedef struct node_t node_t;
/* String type and tree leaf.
*/
struct svn_prefix_string__t
{
/* mandatory prefix */
node_t *prefix;
/* 0 ..7 chars to add the the prefix.
*
* NUL-terminated, if this is indeed a tree leaf. We use the same struct
* within node_t for inner tree nodes, too. There, DATA[7] is not NUL,
* meaning DATA may or may not be NUL terminated. The actual length is
* provided by the node_t.length field (minus parent node length). */
char data[8];
};
/* A node inside the tree, i.e. not a string and not a leaf (unless this is
* the root node).
*
* Note: keep the ordering to minimize size / alignment overhead on 64 bit
* machines.
*/
struct node_t
{
/* pointer to the parent prefix plus the 1 .. 8 extra chars.
* Only the root will provide 0 extra chars. */
svn_prefix_string__t key;
/* Length of the prefix from the root down to and including this one.
* 0 for the root node. Only then will key.prefix be NULL. */
apr_uint32_t length;
/* Number of entries used in SUB_NODES. */
apr_uint32_t sub_node_count;
/* The sub-nodes, ordered by first char. node_t and svn_prefix_string__t
* may be mixed here. May be NULL.
* The number of allocated entries is always a power-of-two and only
* given implicitly by SUB_NODE_COUNT. */
struct node_t **sub_nodes;
};
/* The actual tree structure. */
struct svn_prefix_tree__t
{
/* the common tree root (represents the empty prefix). */
node_t *root;
/* all sub-nodes & strings will be allocated from this pool */
apr_pool_t *pool;
};
/* Return TRUE, iff NODE is a leaf node.
*/
static svn_boolean_t
is_leaf(node_t *node)
{
/* If this NOT a leaf node and this node has ...
* ... 8 chars, data[7] will not be NUL because we only support
* NUL-*terminated* strings.
* ... less than 8 chars, this will be set to 0xff
* (any other non-NUL would do as well but this is not valid UTF8
* making it easy to recognize during debugging etc.) */
return node->key.data[7] == 0;
}
/* Ensure that the sub-nodes array of NODE within TREE has at least one
* unused entry. Re-allocate as necessary.
*/
static void
auto_realloc_sub_nodes(svn_prefix_tree__t *tree,
node_t *node)
{
if (node->sub_node_count & (node->sub_node_count - 1))
return;
if (node->sub_node_count == 0)
{
node->sub_nodes = apr_pcalloc(tree->pool, sizeof(*node->sub_nodes));
}
else
{
node_t **sub_nodes
= apr_pcalloc(tree->pool,
2 * node->sub_node_count * sizeof(*sub_nodes));
memcpy(sub_nodes, node->sub_nodes,
node->sub_node_count * sizeof(*sub_nodes));
node->sub_nodes = sub_nodes;
}
}
/* Given the COUNT pointers in the SUB_NODES array, return the location at
* which KEY is either located or would be inserted.
*/
static int
search_lower_bound(node_t **sub_nodes,
unsigned char key,
int count)
{
int lower = 0;
int upper = count - 1;
/* Binary search for the lowest position at which to insert KEY. */
while (lower <= upper)
{
int current = lower + (upper - lower) / 2;
if ((unsigned char)sub_nodes[current]->key.data[0] < key)
lower = current + 1;
else
upper = current - 1;
}
return lower;
}
svn_prefix_tree__t *
svn_prefix_tree__create(apr_pool_t *pool)
{
svn_prefix_tree__t *tree = apr_pcalloc(pool, sizeof(*tree));
tree->pool = pool;
tree->root = apr_pcalloc(pool, sizeof(*tree->root));
tree->root->key.data[7] = '\xff'; /* This is not a leaf. See is_leaf(). */
return tree;
}
svn_prefix_string__t *
svn_prefix_string__create(svn_prefix_tree__t *tree,
const char *s)
{
svn_prefix_string__t *new_string;
apr_size_t len = strlen(s);
node_t *node = tree->root;
node_t *new_node;
int idx;
/* walk the existing tree until we either find S or the node at which S
* has to be inserted */
while (TRUE)
{
node_t *sub_node;
int match = 1;
/* index of the matching sub-node */
idx = node->sub_node_count
? search_lower_bound(node->sub_nodes,
(unsigned char)s[node->length],
node->sub_node_count)
: 0;
/* any (partially) matching sub-nodes? */
if (idx == (int)node->sub_node_count
|| node->sub_nodes[idx]->key.data[0] != s[node->length])
break;
/* Yes, it matches - at least the first character does. */
sub_node = node->sub_nodes[idx];
/* fully matching sub-node? */
if (is_leaf(sub_node))
{
if (strcmp(sub_node->key.data, s + node->length) == 0)
return &sub_node->key;
}
else
{
/* The string formed by the path from the root down to
* SUB_NODE differs from S.
*
* Is it a prefix? In that case, the chars added by SUB_NODE
* must fully match the respective chars in S. */
apr_size_t sub_node_len = sub_node->length - node->length;
if (strncmp(sub_node->key.data, s + node->length,
sub_node_len) == 0)
{
node = sub_node;
continue;
}
}
/* partial match -> split
*
* At this point, S may either be a prefix to the string represented
* by SUB_NODE, or they may diverge at some offset with
* SUB_NODE->KEY.DATA .
*
* MATCH starts with 1 here b/c we already know that at least one
* char matches. Also, the loop will terminate because the strings
* differ before SUB_NODE->KEY.DATA - either at the NUL terminator
* of S or some char before that.
*/
while (sub_node->key.data[match] == s[node->length + match])
++match;
new_node = apr_pcalloc(tree->pool, sizeof(*new_node));
new_node->key = sub_node->key;
new_node->length = node->length + match;
new_node->key.data[7] = '\xff'; /* This is not a leaf. See is_leaf(). */
new_node->sub_node_count = 1;
new_node->sub_nodes = apr_palloc(tree->pool, sizeof(node_t *));
new_node->sub_nodes[0] = sub_node;
memmove(sub_node->key.data, sub_node->key.data + match, 8 - match);
/* replace old sub-node with new one and continue lookup */
sub_node->key.prefix = new_node;
node->sub_nodes[idx] = new_node;
node = new_node;
}
/* add sub-node(s) and final string */
while (len - node->length > 7)
{
new_node = apr_pcalloc(tree->pool, sizeof(*new_node));
new_node->key.prefix = node;
new_node->length = node->length + 8;
memcpy(new_node->key.data, s + node->length, 8);
auto_realloc_sub_nodes(tree, node);
memmove(node->sub_nodes + idx + 1, node->sub_nodes + idx,
(node->sub_node_count - idx) * sizeof(node_t *));
/* replace old sub-node with new one and continue lookup */
node->sub_nodes[idx] = new_node;
node->sub_node_count++;
node = new_node;
idx = 0;
}
new_string = apr_pcalloc(tree->pool, sizeof(*new_string));
new_string->prefix = node;
memcpy(new_string->data, s + node->length, len - node->length);
auto_realloc_sub_nodes(tree, node);
memmove(node->sub_nodes + idx + 1, node->sub_nodes + idx,
(node->sub_node_count - idx) * sizeof(node_t *));
node->sub_nodes[idx] = (node_t *)new_string;
node->sub_node_count++;
return new_string;
}
svn_string_t *
svn_prefix_string__expand(const svn_prefix_string__t *s,
apr_pool_t *pool)
{
apr_size_t s_len = strlen(s->data);
apr_size_t len = s->prefix->length + s_len;
char *buffer = apr_palloc(pool, len + 1);
svn_string_t *result = apr_pcalloc(pool, sizeof(*result));
result->data = buffer;
result->len = len;
buffer[len] = '\0';
while (s->prefix)
{
memcpy(buffer + s->prefix->length, s->data, len - s->prefix->length);
len = s->prefix->length;
s = &s->prefix->key;
}
return result;
}
int
svn_prefix_string__compare(const svn_prefix_string__t *lhs,
const svn_prefix_string__t *rhs)
{
const node_t *lhs_parent = lhs->prefix;
const node_t *rhs_parent = rhs->prefix;
if (lhs == rhs)
return 0;
/* find the common root */
while (lhs_parent != rhs_parent)
{
if (lhs_parent->length <= rhs_parent->length)
{
rhs = &rhs_parent->key;
rhs_parent = rhs_parent->key.prefix;
}
else if (rhs_parent->length <= lhs_parent->length)
{
lhs = &lhs_parent->key;
lhs_parent = lhs_parent->key.prefix;
}
/* same tree? */
assert(lhs_parent && rhs_parent);
}
/* at the common root, strings will differ in the first follow-up char */
return (int)(unsigned char)lhs->data[0] - (int)(unsigned char)rhs->data[0];
}