subversion/libsvn_delta/vdelta.c - subversion - Git at Google

 /*
  * vdelta.c:  vdelta generator.
  *
  * ====================================================================
  * Copyright (c) 2000-2002 CollabNet.  All rights reserved.
  *
  * This software is licensed as described in the file COPYING, which
  * you should have received as part of this distribution.  The terms
  * are also available at http://subversion.tigris.org/license-1.html.
  * If newer versions of this license are posted there, you may use a
  * newer version instead, at your option.
  *
  * This software consists of voluntary contributions made by many
  * individuals.  For exact contribution history, see the revision
  * history and logs, available at http://subversion.tigris.org/.
  * ====================================================================
  */


 #include <assert.h>

 #include <apr_general.h>        /* for APR_INLINE */

 #include "svn_delta.h"
 #include "delta.h"


 /* ==================================================================== */
 /* Hash table for vdelta hashing.

    Each hash bucket is a chain of slots. The index of the slot in
    the slots array is also the index of the key string in the
    current window's data stream. The hash table implements a multimap
    (i.e., hash and key collisions are allowed).

    To store a key->index mapping, just add slot[index] to the slot
    chain tn key's bucket (see store_mapping).

    For a given key, you can traverse the list of match candidates (some
    of which may be hash collisions) like this:

    for (slot = buckets[get_bucket(key)]; slot != NULL; slot = slot->next)
      {
        index = slot - slots;
        ...
      }

   This hash table implementation os based on the description in
   http://subversion.tigris.org/subversion-dev/current/msg00152.html. */


 /* Size of a vdelta hash key. */
 #define VD_KEY_SIZE 4


 /* Hash slot. */
 typedef struct hash_slot_t {
   struct hash_slot_t *next;
 } hash_slot_t;

 /* Hash table. */
 typedef struct hash_table_t {
   int num_buckets;              /* Number of buckets in the table. */
   hash_slot_t **buckets;        /* Bucket array. */
   hash_slot_t *slots;           /* Slots array. */
 } hash_table_t;


 /* Create a hash table with NUM_SLOTS slots. NUM_SLOTS should be the sum
    of the size of the source and target parts of the delta window.
    Allocate from POOL. */
 static hash_table_t *
 create_hash_table (apr_size_t num_slots, apr_pool_t *pool)
 {
   int i;
   apr_size_t j;
   hash_table_t* table = apr_palloc (pool, sizeof (*table));

   /* This should be a reasonable number of buckets ... */
   table->num_buckets = (num_slots / 3) | 1;
   table->buckets = apr_palloc (pool, (table->num_buckets
                                       * sizeof (*table->buckets)));
   for (i = 0; i < table->num_buckets; ++i)
     table->buckets[i] = NULL;

   table->slots = apr_palloc (pool, num_slots * sizeof (*table->slots));
   for (j = 0; j < num_slots; ++j)
     table->slots[j].next = NULL;

   return table;
 }


 /* Convert a key to a pointer to the key's hash bucket.
    We use a 2-universal multiplicative hash function. If you're
    windering about the selected multiplier, take a look at the
    comments in apr/tables/apr_hash.c:find_entry for a discussion
    on fast string hashes; it's very illuminating.

    [ We use 127 instead of 33 here because I happen to like
      interesting prime numbers, so there. --xbc ] */
 static APR_INLINE apr_uint32_t
 get_bucket (const hash_table_t *table, const char *key)
 {
   int i;
   apr_uint32_t hash = 0;
   for (i = 0; i < VD_KEY_SIZE; ++i)
     hash = hash * 127 + *key++;
   return hash % table->num_buckets;
 }


 /* Store a key->index mapping into the hash table. */
 static APR_INLINE void
 store_mapping (hash_table_t *table, const char* key, apr_off_t idx)
 {
   apr_uint32_t bucket = get_bucket (table, key);;
   assert (table->slots[idx].next == NULL);
   table->slots[idx].next = table->buckets[bucket];
   table->buckets[bucket] = &table->slots[idx];
 }


 /* ==================================================================== */
 /* Vdelta generator.

    The article "Delta Algorithms: An Empirical Analysis" by Hunt,
    Vo and Tichy contains a description of the vdelta algorithm,
    but it's incomplete. Here's a detailed description (see also
    http://subversion.tigris.org/subversion-dev/current/msg00158.html
    in the mailing list archives):

      1. Look up the four bytes starting at the current position
         pointer.  If there are no matches for those four bytes,
         output an insert, move the position pointer forward by one,
         and go back to step 1.

      2. Determine which of the candidates yields the longest
         extension.  This will be called the "current match".

      3. Look up the last three bytes of the current match plus one
         unmatched byte.  If there is no match for those four bytes,
         the current match is the best match; go to step 6.

      4. For each candidate, check backwards to see if it matches
         the entire match so far.  If no candidates satisfy that
         constraint, the current match is the best match; go to step 6.

      5. Among the candidates which do satisfy the constraint,
         determine which one yields the longest extension.  This
         will be the new "current match."  Go back to step 3.

      6. Output a block copy instruction, add indexes for the last
         three positions of the matched data, advance the position
         pointer by the length of the match, and go back to step 1.

    Inserts and copies are generated only when the current position
    is within the target data.

    Note that the vdelta algorithm allows copies that cross the
    source/target data boundary. Because our internal delta
    representation has different opcodes for source and terget copies,
    we split them in two. This means that the opcode stream in the
    delta window can contain copies shorter than VD_KEY_SIZE. These
    could be represented by insert ops instead, but we'll leave them
    in, so that we can merge them again when we convert the delta
    window to an external format like vcdiff that supports cross
    -boundary copies. */


 /* Find the length of a match within the data window.
    Note that (match < from && from <= end) must always be true here. */

 static APR_INLINE int
 find_match_len (const char *match, const char *from, const char *end)
 {
   const char *here = from;
   while (here < end && *match == *here)
     {
       ++match;
       ++here;
     }
   return here - from;
 }


 /* This is the main vdelta generator. */

 static void
 vdelta (struct build_ops_baton_t *build_baton,
         const char *data,
         const char *start,
         const char *end,
         svn_boolean_t outputflag,
         hash_table_t *table,
         apr_pool_t *pool)
 {
   const char *here = start;     /* Current position in the buffer. */
   const char *insert_from = NULL; /* Start of byte range for insertion. */

   for (;;)
     {
       const char *current_match, *key;
       apr_size_t current_match_len = 0;
       hash_slot_t *slot;
       svn_boolean_t progress;

       /* If we're near the end, just insert the last few bytes. */
       if (end - here < VD_KEY_SIZE)
         {
           const char *from = ((insert_from != NULL) ? insert_from : here);

           if (outputflag && from < end)
             svn_txdelta__insert_op (build_baton, svn_txdelta_new, 0,
                                     end - from, from, pool);
           return;
         }

       /* Search for the longest match.  */
       current_match = NULL;
       current_match_len = 0;
       key = here;
       do
         {
           /* Try to extend the current match.  Our key is the last
              three matched bytes plus one unmatched byte if we already
              have a current match, or just the four bytes where we are
              if we don't have a current match yet.  See which mapping
              yields the longest extension.  */
           progress = FALSE;
           for (slot = table->buckets[get_bucket (table, key)];
                slot != NULL;
                slot = slot->next)
             {
               const char *match;
               apr_size_t match_len;

               if (slot - table->slots < key - here) /* Too close to start */
                 continue;
               match = data + (slot - table->slots) - (key - here);
               match_len = find_match_len (match, here, end);

               /* We can only copy from the source or from the target, so
                  don't let the match cross START.  */
               if (match < start && match + match_len > start)
                 match_len = start - match;

               if (match_len >= VD_KEY_SIZE && match_len > current_match_len)
                 {
                   /* We have a longer match; record it.  */
                   current_match = match;
                   current_match_len = match_len;
                   progress = TRUE;
                 }
             }
           if (progress)
             key = here + current_match_len - (VD_KEY_SIZE - 1);
         }
       while (progress && end - key >= VD_KEY_SIZE);

       if (current_match_len < VD_KEY_SIZE)
         {
           /* There is no match here; store a mapping and insert this byte. */
           store_mapping (table, here, here - data);
           if (insert_from == NULL)
             insert_from = here;
           here++;
           continue;
         }
       else if (outputflag)
         {
           if (insert_from != NULL)
             {
               /* Commit the pending insert. */
               svn_txdelta__insert_op (build_baton, svn_txdelta_new,
                                       0, here - insert_from,
                                       insert_from, pool);
               insert_from = NULL;
             }
           if (current_match < start) /* Copy from source. */
             svn_txdelta__insert_op (build_baton, svn_txdelta_source,
                                     current_match - data,
                                     current_match_len,
                                     NULL, pool);
           else                       /* Copy from target */
             svn_txdelta__insert_op (build_baton, svn_txdelta_target,
                                     current_match - start,
                                     current_match_len,
                                     NULL, pool);
         }

       /* Adjust the current position and insert mappings for the
          last three bytes of the match. */
       here += current_match_len;
       if (end - here >= VD_KEY_SIZE)
         {
           char const *last = here - (VD_KEY_SIZE - 1);
           for (; last < here; ++last)
             store_mapping (table, last, last - data);
         }
     }
 }


 void
 svn_txdelta__vdelta (struct build_ops_baton_t *build_baton,
                      const char *data,
                      apr_size_t source_len,
                      apr_size_t target_len,
                      apr_pool_t *pool)
 {
   hash_table_t *table = create_hash_table (source_len + target_len, pool);

   vdelta (build_baton, data, data, data + source_len, FALSE, table, pool);
   vdelta (build_baton, data, data + source_len, data + source_len + target_len,
           TRUE, table, pool);

 #if 0
   /* This bit of code calculates the hash load and the
      number of collisions. Please note that a the number
      of collisions per bucket is one less than the length
      of the chain. :-)  --xbc */
   {
     int i;
     int empty = 0;
     int collisions = 0;
     for (i = 0; i < table->num_buckets; ++i)
     {
       hash_slot_t *slot = table->buckets[i];
       if (!slot)
         ++empty;
       else
       {
         slot = slot->next;
         while (slot != NULL)
         {
           ++collisions;
           slot = slot->next;
         }
       }
     }
     fprintf (stderr, "Hash stats: load %d, collisions %d\n",
              100 - 100 * empty / table->num_buckets, collisions);
   }
 #endif
 }


 /*
  * local variables:
  * eval: (load-file "../../tools/dev/svn-dev.el")
  * end:
  */
	/*
	* vdelta.c: vdelta generator.
	*
	* ====================================================================
	* Copyright (c) 2000-2002 CollabNet. All rights reserved.
	*
	* This software is licensed as described in the file COPYING, which
	* you should have received as part of this distribution. The terms
	* are also available at http://subversion.tigris.org/license-1.html.
	* If newer versions of this license are posted there, you may use a
	* newer version instead, at your option.
	*
	* This software consists of voluntary contributions made by many
	* individuals. For exact contribution history, see the revision
	* history and logs, available at http://subversion.tigris.org/.
	* ====================================================================
	*/


	#include <assert.h>

	#include <apr_general.h> /* for APR_INLINE */

	#include "svn_delta.h"
	#include "delta.h"


	/* ==================================================================== */
	/* Hash table for vdelta hashing.

	Each hash bucket is a chain of slots. The index of the slot in
	the slots array is also the index of the key string in the
	current window's data stream. The hash table implements a multimap
	(i.e., hash and key collisions are allowed).

	To store a key->index mapping, just add slot[index] to the slot
	chain tn key's bucket (see store_mapping).

	For a given key, you can traverse the list of match candidates (some
	of which may be hash collisions) like this:

	for (slot = buckets[get_bucket(key)]; slot != NULL; slot = slot->next)
	{
	index = slot - slots;
	...
	}

	This hash table implementation os based on the description in
	http://subversion.tigris.org/subversion-dev/current/msg00152.html. */


	/* Size of a vdelta hash key. */
	#define VD_KEY_SIZE 4


	/* Hash slot. */
	typedef struct hash_slot_t {
	struct hash_slot_t *next;
	} hash_slot_t;

	/* Hash table. */
	typedef struct hash_table_t {
	int num_buckets; /* Number of buckets in the table. */
	hash_slot_t *buckets; / Bucket array. */
	hash_slot_t slots; / Slots array. */
	} hash_table_t;


	/* Create a hash table with NUM_SLOTS slots. NUM_SLOTS should be the sum
	of the size of the source and target parts of the delta window.
	Allocate from POOL. */
	static hash_table_t *
	create_hash_table (apr_size_t num_slots, apr_pool_t *pool)
	{
	int i;
	apr_size_t j;
	hash_table_t* table = apr_palloc (pool, sizeof (*table));

	/* This should be a reasonable number of buckets ... */
	table->num_buckets = (num_slots / 3) \| 1;
	table->buckets = apr_palloc (pool, (table->num_buckets
	* sizeof (*table->buckets)));
	for (i = 0; i < table->num_buckets; ++i)
	table->buckets[i] = NULL;

	table->slots = apr_palloc (pool, num_slots * sizeof (*table->slots));
	for (j = 0; j < num_slots; ++j)
	table->slots[j].next = NULL;

	return table;
	}


	/* Convert a key to a pointer to the key's hash bucket.
	We use a 2-universal multiplicative hash function. If you're
	windering about the selected multiplier, take a look at the
	comments in apr/tables/apr_hash.c:find_entry for a discussion
	on fast string hashes; it's very illuminating.

	[ We use 127 instead of 33 here because I happen to like
	interesting prime numbers, so there. --xbc ] */
	static APR_INLINE apr_uint32_t
	get_bucket (const hash_table_t table, const char key)
	{
	int i;
	apr_uint32_t hash = 0;
	for (i = 0; i < VD_KEY_SIZE; ++i)
	hash = hash * 127 + *key++;
	return hash % table->num_buckets;
	}


	/* Store a key->index mapping into the hash table. */
	static APR_INLINE void
	store_mapping (hash_table_t table, const char key, apr_off_t idx)
	{
	apr_uint32_t bucket = get_bucket (table, key);;
	assert (table->slots[idx].next == NULL);
	table->slots[idx].next = table->buckets[bucket];
	table->buckets[bucket] = &table->slots[idx];
	}



	/* ==================================================================== */
	/* Vdelta generator.

	The article "Delta Algorithms: An Empirical Analysis" by Hunt,
	Vo and Tichy contains a description of the vdelta algorithm,
	but it's incomplete. Here's a detailed description (see also
	http://subversion.tigris.org/subversion-dev/current/msg00158.html
	in the mailing list archives):

	1. Look up the four bytes starting at the current position
	pointer. If there are no matches for those four bytes,
	output an insert, move the position pointer forward by one,
	and go back to step 1.

	2. Determine which of the candidates yields the longest
	extension. This will be called the "current match".

	3. Look up the last three bytes of the current match plus one
	unmatched byte. If there is no match for those four bytes,
	the current match is the best match; go to step 6.

	4. For each candidate, check backwards to see if it matches
	the entire match so far. If no candidates satisfy that
	constraint, the current match is the best match; go to step 6.

	5. Among the candidates which do satisfy the constraint,
	determine which one yields the longest extension. This
	will be the new "current match." Go back to step 3.

	6. Output a block copy instruction, add indexes for the last
	three positions of the matched data, advance the position
	pointer by the length of the match, and go back to step 1.

	Inserts and copies are generated only when the current position
	is within the target data.

	Note that the vdelta algorithm allows copies that cross the
	source/target data boundary. Because our internal delta
	representation has different opcodes for source and terget copies,
	we split them in two. This means that the opcode stream in the
	delta window can contain copies shorter than VD_KEY_SIZE. These
	could be represented by insert ops instead, but we'll leave them
	in, so that we can merge them again when we convert the delta
	window to an external format like vcdiff that supports cross
	-boundary copies. */


	/* Find the length of a match within the data window.
	Note that (match < from && from <= end) must always be true here. */

	static APR_INLINE int
	find_match_len (const char match, const char from, const char *end)
	{
	const char *here = from;
	while (here < end && match == here)
	{
	++match;
	++here;
	}
	return here - from;
	}


	/* This is the main vdelta generator. */

	static void
	vdelta (struct build_ops_baton_t *build_baton,
	const char *data,
	const char *start,
	const char *end,
	svn_boolean_t outputflag,
	hash_table_t *table,
	apr_pool_t *pool)
	{
	const char here = start; / Current position in the buffer. */
	const char insert_from = NULL; / Start of byte range for insertion. */

	for (;;)
	{
	const char current_match, key;
	apr_size_t current_match_len = 0;
	hash_slot_t *slot;
	svn_boolean_t progress;

	/* If we're near the end, just insert the last few bytes. */
	if (end - here < VD_KEY_SIZE)
	{
	const char *from = ((insert_from != NULL) ? insert_from : here);

	if (outputflag && from < end)
	svn_txdelta__insert_op (build_baton, svn_txdelta_new, 0,
	end - from, from, pool);
	return;
	}

	/* Search for the longest match. */
	current_match = NULL;
	current_match_len = 0;
	key = here;
	do
	{
	/* Try to extend the current match. Our key is the last
	three matched bytes plus one unmatched byte if we already
	have a current match, or just the four bytes where we are
	if we don't have a current match yet. See which mapping
	yields the longest extension. */
	progress = FALSE;
	for (slot = table->buckets[get_bucket (table, key)];
	slot != NULL;
	slot = slot->next)
	{
	const char *match;
	apr_size_t match_len;

	if (slot - table->slots < key - here) /* Too close to start */
	continue;
	match = data + (slot - table->slots) - (key - here);
	match_len = find_match_len (match, here, end);

	/* We can only copy from the source or from the target, so
	don't let the match cross START. */
	if (match < start && match + match_len > start)
	match_len = start - match;

	if (match_len >= VD_KEY_SIZE && match_len > current_match_len)
	{
	/* We have a longer match; record it. */
	current_match = match;
	current_match_len = match_len;
	progress = TRUE;
	}
	}
	if (progress)
	key = here + current_match_len - (VD_KEY_SIZE - 1);
	}
	while (progress && end - key >= VD_KEY_SIZE);

	if (current_match_len < VD_KEY_SIZE)
	{
	/* There is no match here; store a mapping and insert this byte. */
	store_mapping (table, here, here - data);
	if (insert_from == NULL)
	insert_from = here;
	here++;
	continue;
	}
	else if (outputflag)
	{
	if (insert_from != NULL)
	{
	/* Commit the pending insert. */
	svn_txdelta__insert_op (build_baton, svn_txdelta_new,
	0, here - insert_from,
	insert_from, pool);
	insert_from = NULL;
	}
	if (current_match < start) /* Copy from source. */
	svn_txdelta__insert_op (build_baton, svn_txdelta_source,
	current_match - data,
	current_match_len,
	NULL, pool);
	else /* Copy from target */
	svn_txdelta__insert_op (build_baton, svn_txdelta_target,
	current_match - start,
	current_match_len,
	NULL, pool);
	}

	/* Adjust the current position and insert mappings for the
	last three bytes of the match. */
	here += current_match_len;
	if (end - here >= VD_KEY_SIZE)
	{
	char const *last = here - (VD_KEY_SIZE - 1);
	for (; last < here; ++last)
	store_mapping (table, last, last - data);
	}
	}
	}


	void
	svn_txdelta__vdelta (struct build_ops_baton_t *build_baton,
	const char *data,
	apr_size_t source_len,
	apr_size_t target_len,
	apr_pool_t *pool)
	{
	hash_table_t *table = create_hash_table (source_len + target_len, pool);

	vdelta (build_baton, data, data, data + source_len, FALSE, table, pool);
	vdelta (build_baton, data, data + source_len, data + source_len + target_len,
	TRUE, table, pool);

	#if 0
	/* This bit of code calculates the hash load and the
	number of collisions. Please note that a the number
	of collisions per bucket is one less than the length
	of the chain. :-) --xbc */
	{
	int i;
	int empty = 0;
	int collisions = 0;
	for (i = 0; i < table->num_buckets; ++i)
	{
	hash_slot_t *slot = table->buckets[i];
	if (!slot)
	++empty;
	else
	{
	slot = slot->next;
	while (slot != NULL)
	{
	++collisions;
	slot = slot->next;
	}
	}
	}
	fprintf (stderr, "Hash stats: load %d, collisions %d\n",
	100 - 100 * empty / table->num_buckets, collisions);
	}
	#endif
	}



	/*
	* local variables:
	* eval: (load-file "../../tools/dev/svn-dev.el")
	* end:
	*/