blob: aed9700499bbfb3cb05fcd2049cf769da8b0f831 [file] [log] [blame]
#!/usr/bin/python2.4
#
# Copyright 2011 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Builds the string table. See class comment for more details."""
__author__ = 'bmcquade@google.com (Bryan McQuade)'
class StringTableBuilder(object):
"""Builds a string table of hostname parts from the given suffix trie.
StringTableBuilder constructs the string table and provides an API
to find the index of a hostname-part within the string table.
The string table is an array of null-separated unique string
suffixes, combined with a map that allows for efficient lookup of
the offset of a string suffix within the array.
"""
def __init__(self):
# The generated string table containing one character per element,
# e.g. for "com\0edu\0au\0 the table would contain: ['c', 'o',
# 'm', '\0', 'e', 'd', 'u', '\0', 'a', 'u', '\0'].
self._string_table = []
# Map from a hostname-part (e.g. 'com' or 'edu') to its offset in
# the string table.
self._hostname_part_map = {}
def BuildStringTable(self, hostname_part_node, root_suffix_node):
"""Constructs the string table using the specified tries.
Args:
hostname_part_node: TrieNode for the hostname part being added to
the string table.
root_suffix_node: Root TrieNode containing all unique hostname part
string suffixes.
"""
children = hostname_part_node.GetChildren()
# Iterate over all of the children in the hostname part node, and
# add each to the string table.
for child in children:
node = root_suffix_node
# NOTE: iterating characters in reverse order assumes that there
# are no multibyte characters in the stream.
for char in reversed(child.GetName()):
if ord(char) > 127:
raise ValueError("Encountered unexpected multibyte character.")
# Get the node associated with the character. Since
# root_suffix_node was already populated with all of the
# hostname-parts, this should always succeed. If there is no
# child node for the given character, it's an error, and
# GetChild will raise a ValueError.
node = node.GetChild(char)
self._EmitHostnamePart(node)
# Now recursively add all of the children of the
# hostname_part_node to the string table.
for child in children:
self.BuildStringTable(child, root_suffix_node)
def GetStringTable(self):
"""Return the generated string table."""
return self._string_table
def GetHostnamePartOffset(self, name):
"""Get the offset of the given hostname-part in the string table."""
return self._hostname_part_map[name]
def _EmitHostnamePart(self, node):
"""Emit the hostname-part for the node to the string table.
Args:
node: The string suffix node to emit. Each node in the chain
contains a single character to be emitted.
"""
# Find the most shallow leaf under this node and emit it
# instead. By emitting only leaves, we compress hostname-parts
# that are themselves suffixes of other hostname-parts. For
# instance, we do not emit 'missoula' because it is a suffix of
# another hostname-part 'fortmissoula'. We can instead emit
# 'fortmissoula' and refer to 'missoula' as the offset of
# 'fortmissoula' plus 4 characters. This reduces the size of the
# string table by ~1kB.
node = StringTableBuilder._FindMostShallowLeafNode(node)
hostname_part = node.GetIdentifier()
if hostname_part in self._hostname_part_map:
# This hostname part has already been emitted during a previous
# invocation, so there is nothing more to do.
return
# Store the offset of the hostname part we're about to emit.
self._hostname_part_map[hostname_part] = len(self._string_table)
# Append the characters for this hostname_part to the string table
# as well as a trailing null byte.
self._string_table.extend(hostname_part)
self._string_table.append('\0')
# Next, walk up the parent chain, looking for additional terminal
# nodes. For each terminal node, create an entry in the
# hostname_parts map. Since each parent that is a terminal node is
# a suffix of the hostname_part we just added (e.g. if we just
# added 'hello' then our parent is 'ello', the next parent is
# 'llo', etc), the index in the hostname_part map should point
# into the string we just added to the string table. For instance
# if we just added 'hello' at index 500 and 'llo' is a terminal
# node, we should add an entry for 'llo' at index 502.
# Remember the index of the hostname_part just added, as well as
# its length, so we can add parent terminal nodes relative to it.
leaf_hostname_part_index = self._hostname_part_map[hostname_part]
leaf_hostname_part_len = len(hostname_part)
parent = node.GetParent()
while True:
if parent.IsTerminalNode():
hostname_part = parent.GetIdentifier()
if hostname_part not in self._hostname_part_map:
offset = leaf_hostname_part_len - len(hostname_part)
self._hostname_part_map[hostname_part] = (
leaf_hostname_part_index + offset)
if parent.IsRoot():
break
parent = parent.GetParent()
@staticmethod
def _FindMostShallowLeafNode(node):
"""Return the most shallow leaf TrieNode under the the given node.
Returns the given node if that node is a leaf node.
Args:
node: The TrieNode to begin searching from.
"""
# Suppress warning "Object (best) has no attribute (GetParentChain)"
__pychecker__ = 'no-objattrs'
if not node.HasChildren():
return node
candidates = (StringTableBuilder._FindMostShallowLeafNode(child)
for child in node.GetChildren())
# Return the candidate node with the smallest chain length.
# oschaaf(XXX):
# We sort on x.GetIdentifier() too to avoid inconsistent results between
# 32 bits and 64 bits systems running this script in case multiple
# candidates have the same (minimum) chain length.
# Though not sorting on x.GetIdentifier() doesn't seem to affect validity
# the consistency is helpful in that it will avoid future debugging
# sessions.
s = sorted(candidates,
key = lambda x: (len(x.GetParentChain()), x.GetIdentifier()))
return s[0]