blob: a26a79dabb2901f29e7453485f807ae15984b0dc [file] [log] [blame]
/* stats.c -- implements the svn_fs_fs__get_stats private API.
*
* ====================================================================
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
* ====================================================================
*/
#include "svn_dirent_uri.h"
#include "svn_fs.h"
#include "svn_pools.h"
#include "svn_sorts.h"
#include "private/svn_cache.h"
#include "private/svn_sorts_private.h"
#include "private/svn_string_private.h"
#include "index.h"
#include "pack.h"
#include "rev_file.h"
#include "util.h"
#include "fs_fs.h"
#include "cached_data.h"
#include "low_level.h"
#include "revprops.h"
#include "../libsvn_fs/fs-loader.h"
#include "svn_private_config.h"
/* We group representations into 2x2 different kinds plus one default:
* [dir / file] x [text / prop]. The assignment is done by the first node
* that references the respective representation.
*/
typedef enum rep_kind_t
{
/* The representation is not used _directly_, i.e. not referenced by any
* noderev. However, some other representation may use it as delta base.
* Null value. Should not occur in real-word repositories. */
unused_rep,
/* a properties on directory rep */
dir_property_rep,
/* a properties on file rep */
file_property_rep,
/* a directory rep */
dir_rep,
/* a file rep */
file_rep
} rep_kind_t;
/* A representation fragment.
*/
typedef struct rep_stats_t
{
/* offset in the revision file (phys. addressing) /
* item index within REVISION (log. addressing) */
apr_uint64_t item_index;
/* item length in bytes */
apr_uint64_t size;
/* item length after de-deltification */
apr_uint64_t expanded_size;
/* revision that contains this representation
* (may be referenced by other revisions, though) */
svn_revnum_t revision;
/* number of nodes that reference this representation */
apr_uint32_t ref_count;
/* length of the PLAIN / DELTA line in the source file in bytes */
apr_uint16_t header_size;
/* classification of the representation. values of rep_kind_t */
char kind;
/* length of the delta chain, including this representation,
* saturated to 255 - if need be */
apr_byte_t chain_length;
} rep_stats_t;
/* Represents a link in the rep delta chain. REVISION + ITEM_INDEX points
* to BASE_REVISION + BASE_ITEM_INDEX. We collect this info while scanning
* a f7 repo in a single pass and resolve it afterwards. */
typedef struct rep_ref_t
{
/* Revision that contains this representation. */
svn_revnum_t revision;
/* Item index of this rep within REVISION. */
apr_uint64_t item_index;
/* Revision of the representation we deltified against.
* -1 if this representation is either PLAIN or a self-delta. */
svn_revnum_t base_revision;
/* Item index of that rep within BASE_REVISION. */
apr_uint64_t base_item_index;
/* Length of the PLAIN / DELTA line in the source file in bytes.
* We use this to update the info in the rep stats after scanning the
* whole file. */
apr_uint16_t header_size;
} rep_ref_t;
/* Represents a single revision.
* There will be only one instance per revision. */
typedef struct revision_info_t
{
/* number of this revision */
svn_revnum_t revision;
/* pack file offset (manifest value), 0 for non-packed files */
apr_off_t offset;
/* length of the changes list on bytes */
apr_uint64_t changes_len;
/* offset of the changes list relative to OFFSET */
apr_uint64_t change_count;
/* first offset behind the revision data in the pack file (file length
* for non-packed revs) */
apr_off_t end;
/* number of directory noderevs in this revision */
apr_uint64_t dir_noderev_count;
/* number of file noderevs in this revision */
apr_uint64_t file_noderev_count;
/* total size of directory noderevs (i.e. the structs - not the rep) */
apr_uint64_t dir_noderev_size;
/* total size of file noderevs (i.e. the structs - not the rep) */
apr_uint64_t file_noderev_size;
/* all rep_stats_t of this revision (in no particular order),
* i.e. those that point back to this struct */
apr_array_header_t *representations;
/* Temporary rev / pack file access object, used in phys. addressing
* mode only. NULL when done reading this revision. */
svn_fs_fs__revision_file_t *rev_file;
} revision_info_t;
/* Root data structure containing all information about a given repository.
* We use it as a wrapper around svn_fs_t and pass it around where we would
* otherwise just use a svn_fs_t.
*/
typedef struct query_t
{
/* FS API object*/
svn_fs_t *fs;
/* The HEAD revision. */
svn_revnum_t head;
/* Number of revs per shard; 0 for non-sharded repos. */
int shard_size;
/* First non-packed revision. */
svn_revnum_t min_unpacked_rev;
/* all revisions */
apr_array_header_t *revisions;
/* empty representation.
* Used as a dummy base for DELTA reps without base. */
rep_stats_t *null_base;
/* collected statistics */
svn_fs_fs__stats_t *stats;
/* Progress notification callback to call after each shard. May be NULL. */
svn_fs_progress_notify_func_t progress_func;
/* Baton for PROGRESS_FUNC. */
void *progress_baton;
/* Cancellation support callback to call once in a while. May be NULL. */
svn_cancel_func_t cancel_func;
/* Baton for CANCEL_FUNC. */
void *cancel_baton;
} query_t;
/* Initialize the LARGEST_CHANGES member in STATS with a capacity of COUNT
* entries. Allocate the result in RESULT_POOL.
*/
static void
initialize_largest_changes(svn_fs_fs__stats_t *stats,
apr_size_t count,
apr_pool_t *result_pool)
{
apr_size_t i;
stats->largest_changes = apr_pcalloc(result_pool,
sizeof(*stats->largest_changes));
stats->largest_changes->count = count;
stats->largest_changes->min_size = 1;
stats->largest_changes->changes
= apr_palloc(result_pool, count * sizeof(*stats->largest_changes->changes));
/* allocate *all* entries before the path stringbufs. This increases
* cache locality and enhances performance significantly. */
for (i = 0; i < count; ++i)
stats->largest_changes->changes[i]
= apr_palloc(result_pool, sizeof(**stats->largest_changes->changes));
/* now initialize them and allocate the stringbufs */
for (i = 0; i < count; ++i)
{
stats->largest_changes->changes[i]->size = 0;
stats->largest_changes->changes[i]->revision = SVN_INVALID_REVNUM;
stats->largest_changes->changes[i]->path
= svn_stringbuf_create_ensure(1024, result_pool);
}
}
/* Add entry for SIZE to HISTOGRAM.
*/
static void
add_to_histogram(svn_fs_fs__histogram_t *histogram,
apr_int64_t size)
{
apr_int64_t shift = 0;
while (((apr_int64_t)(1) << shift) <= size)
shift++;
histogram->total.count++;
histogram->total.sum += size;
histogram->lines[(apr_size_t)shift].count++;
histogram->lines[(apr_size_t)shift].sum += size;
}
/* Update data aggregators in STATS with this representation of type KIND,
* on-disk REP_SIZE and expanded node size EXPANDED_SIZE for PATH in REVSION.
* PLAIN_ADDED indicates whether the node has a deltification predecessor.
*/
static void
add_change(svn_fs_fs__stats_t *stats,
apr_uint64_t rep_size,
apr_uint64_t expanded_size,
svn_revnum_t revision,
const char *path,
rep_kind_t kind,
svn_boolean_t plain_added)
{
/* identify largest reps */
if (rep_size >= stats->largest_changes->min_size)
{
apr_size_t i;
svn_fs_fs__largest_changes_t *largest_changes = stats->largest_changes;
svn_fs_fs__large_change_info_t *info
= largest_changes->changes[largest_changes->count - 1];
info->size = rep_size;
info->revision = revision;
svn_stringbuf_set(info->path, path);
/* linear insertion but not too bad since count is low and insertions
* near the end are more likely than close to front */
for (i = largest_changes->count - 1; i > 0; --i)
if (largest_changes->changes[i-1]->size >= rep_size)
break;
else
largest_changes->changes[i] = largest_changes->changes[i-1];
largest_changes->changes[i] = info;
largest_changes->min_size
= largest_changes->changes[largest_changes->count-1]->size;
}
/* global histograms */
add_to_histogram(&stats->rep_size_histogram, rep_size);
add_to_histogram(&stats->node_size_histogram, expanded_size);
if (plain_added)
{
add_to_histogram(&stats->added_rep_size_histogram, rep_size);
add_to_histogram(&stats->added_node_size_histogram, expanded_size);
}
/* specific histograms by type */
switch (kind)
{
case unused_rep:
add_to_histogram(&stats->unused_rep_histogram, rep_size);
break;
case dir_property_rep:
add_to_histogram(&stats->dir_prop_rep_histogram, rep_size);
add_to_histogram(&stats->dir_prop_histogram, expanded_size);
break;
case file_property_rep:
add_to_histogram(&stats->file_prop_rep_histogram, rep_size);
add_to_histogram(&stats->file_prop_histogram, expanded_size);
break;
case dir_rep:
add_to_histogram(&stats->dir_rep_histogram, rep_size);
add_to_histogram(&stats->dir_histogram, expanded_size);
break;
case file_rep:
add_to_histogram(&stats->file_rep_histogram, rep_size);
add_to_histogram(&stats->file_histogram, expanded_size);
break;
}
/* by extension */
if (kind == file_rep)
{
/* determine extension */
svn_fs_fs__extension_info_t *info;
const char * file_name = strrchr(path, '/');
const char * extension = file_name ? strrchr(file_name, '.') : NULL;
if (extension == NULL || extension == file_name + 1)
extension = "(none)";
/* get / auto-insert entry for this extension */
info = apr_hash_get(stats->by_extension, extension, APR_HASH_KEY_STRING);
if (info == NULL)
{
apr_pool_t *pool = apr_hash_pool_get(stats->by_extension);
info = apr_pcalloc(pool, sizeof(*info));
info->extension = apr_pstrdup(pool, extension);
apr_hash_set(stats->by_extension, info->extension,
APR_HASH_KEY_STRING, info);
}
/* update per-extension histogram */
add_to_histogram(&info->node_histogram, expanded_size);
add_to_histogram(&info->rep_histogram, rep_size);
}
}
/* Comparator used for binary search comparing the absolute file offset
* of a representation to some other offset. DATA is a *rep_stats_t,
* KEY is a pointer to an apr_uint64_t.
*/
static int
compare_representation_item_index(const void *data, const void *key)
{
apr_uint64_t lhs = (*(const rep_stats_t *const *)data)->item_index;
apr_uint64_t rhs = *(const apr_uint64_t *)key;
if (lhs < rhs)
return -1;
return (lhs > rhs ? 1 : 0);
}
/* Find the revision_info_t object to the given REVISION in QUERY and
* return it in *REVISION_INFO. For performance reasons, we skip the
* lookup if the info is already provided.
*
* In that revision, look for the rep_stats_t object for item ITEM_INDEX.
* If it already exists, set *IDX to its index in *REVISION_INFO's
* representations list and return the representation object. Otherwise,
* set the index to where it must be inserted and return NULL.
*/
static rep_stats_t *
find_representation(int *idx,
query_t *query,
revision_info_t **revision_info,
svn_revnum_t revision,
apr_uint64_t item_index)
{
revision_info_t *info;
*idx = -1;
/* first let's find the revision */
info = revision_info ? *revision_info : NULL;
if (info == NULL || info->revision != revision)
{
info = APR_ARRAY_IDX(query->revisions, revision, revision_info_t*);
if (revision_info)
*revision_info = info;
}
/* not found -> no result */
if (info == NULL)
return NULL;
/* look for the representation */
*idx = svn_sort__bsearch_lower_bound(info->representations,
&item_index,
compare_representation_item_index);
if (*idx < info->representations->nelts)
{
/* return the representation, if this is the one we were looking for */
rep_stats_t *result
= APR_ARRAY_IDX(info->representations, *idx, rep_stats_t *);
if (result->item_index == item_index)
return result;
}
/* not parsed, yet */
return NULL;
}
/* Find / auto-construct the representation stats for REP in QUERY and
* return it in *REPRESENTATION.
*
* If necessary, allocate the result in RESULT_POOL; use SCRATCH_POOL for
* temporary allocations.
*/
static svn_error_t *
parse_representation(rep_stats_t **representation,
query_t *query,
representation_t *rep,
revision_info_t *revision_info,
apr_pool_t *result_pool,
apr_pool_t *scratch_pool)
{
rep_stats_t *result;
int idx;
/* read location (revision, offset) and size */
/* look it up */
result = find_representation(&idx, query, &revision_info, rep->revision,
rep->item_index);
if (!result)
{
/* not parsed, yet (probably a rep in the same revision).
* Create a new rep object and determine its base rep as well.
*/
result = apr_pcalloc(result_pool, sizeof(*result));
result->revision = rep->revision;
result->expanded_size = rep->expanded_size;
result->item_index = rep->item_index;
result->size = rep->size;
/* In phys. addressing mode, follow link to the actual representation.
* In log. addressing mode, we will find it already as part of our
* linear walk through the whole file. */
if (!svn_fs_fs__use_log_addressing(query->fs))
{
svn_fs_fs__rep_header_t *header;
apr_off_t offset = revision_info->offset
+ (apr_off_t)rep->item_index;
SVN_ERR_ASSERT(revision_info->rev_file);
SVN_ERR(svn_io_file_seek(revision_info->rev_file->file, APR_SET,
&offset, scratch_pool));
SVN_ERR(svn_fs_fs__read_rep_header(&header,
revision_info->rev_file->stream,
scratch_pool, scratch_pool));
result->header_size = header->header_size;
/* Determine length of the delta chain. */
if (header->type == svn_fs_fs__rep_delta)
{
int base_idx;
rep_stats_t *base_rep
= find_representation(&base_idx, query, NULL,
header->base_revision,
header->base_item_index);
result->chain_length = 1 + MIN(base_rep->chain_length,
(apr_byte_t)0xfe);
}
else
{
result->chain_length = 1;
}
}
svn_sort__array_insert(revision_info->representations, &result, idx);
}
*representation = result;
return SVN_NO_ERROR;
}
/* forward declaration */
static svn_error_t *
read_noderev(query_t *query,
svn_stringbuf_t *noderev_str,
revision_info_t *revision_info,
apr_pool_t *result_pool,
apr_pool_t *scratch_pool);
/* Read the noderev item at OFFSET in REVISION_INFO from the filesystem
* provided by QUERY. Return it in *NODEREV, allocated in RESULT_POOL.
* Use SCRATCH_POOL for temporary allocations.
*
* The textual representation of the noderev will be used to determine
* the on-disk size of the noderev. Only called in phys. addressing mode.
*/
static svn_error_t *
read_phsy_noderev(svn_stringbuf_t **noderev,
query_t *query,
apr_off_t offset,
revision_info_t *revision_info,
apr_pool_t *result_pool,
apr_pool_t *scratch_pool)
{
svn_stringbuf_t *noderev_str = svn_stringbuf_create_empty(result_pool);
svn_stringbuf_t *line;
svn_boolean_t eof;
apr_pool_t *iterpool = svn_pool_create(scratch_pool);
/* Navigate the file stream to the start of noderev. */
SVN_ERR_ASSERT(revision_info->rev_file);
offset += revision_info->offset;
SVN_ERR(svn_io_file_seek(revision_info->rev_file->file, APR_SET,
&offset, scratch_pool));
/* Read it (terminated by an empty line) */
do
{
svn_pool_clear(iterpool);
SVN_ERR(svn_stream_readline(revision_info->rev_file->stream, &line,
"\n", &eof, iterpool));
svn_stringbuf_appendstr(noderev_str, line);
svn_stringbuf_appendbyte(noderev_str, '\n');
}
while (line->len > 0 && !eof);
/* Return the result. */
*noderev = noderev_str;
svn_pool_destroy(iterpool);
return SVN_NO_ERROR;
}
/* Starting at the directory in NODEREV's text, read all DAG nodes,
* directories and representations linked in that tree structure.
* Store them in QUERY and REVISION_INFO. Also, read them only once.
*
* Use RESULT_POOL for persistent allocations and SCRATCH_POOL for
* temporaries.
*/
static svn_error_t *
parse_dir(query_t *query,
node_revision_t *noderev,
revision_info_t *revision_info,
apr_pool_t *result_pool,
apr_pool_t *scratch_pool)
{
apr_pool_t *iterpool = svn_pool_create(scratch_pool);
int i;
apr_array_header_t *entries;
SVN_ERR(svn_fs_fs__rep_contents_dir(&entries, query->fs, noderev,
scratch_pool, scratch_pool));
for (i = 0; i < entries->nelts; ++i)
{
svn_fs_dirent_t *dirent = APR_ARRAY_IDX(entries, i, svn_fs_dirent_t *);
if (svn_fs_fs__id_rev(dirent->id) == revision_info->revision)
{
svn_stringbuf_t *noderev_str;
svn_pool_clear(iterpool);
SVN_ERR(read_phsy_noderev(&noderev_str, query,
svn_fs_fs__id_item(dirent->id),
revision_info, iterpool, iterpool));
SVN_ERR(read_noderev(query, noderev_str, revision_info,
result_pool, iterpool));
}
}
svn_pool_destroy(iterpool);
return SVN_NO_ERROR;
}
/* Parse the noderev given as NODEREV_STR and store the info in QUERY and
* REVISION_INFO. In phys. addressing mode, continue reading all DAG nodes,
* directories and representations linked in that tree structure.
*
* Use RESULT_POOL for persistent allocations and SCRATCH_POOL for
* temporaries.
*/
static svn_error_t *
read_noderev(query_t *query,
svn_stringbuf_t *noderev_str,
revision_info_t *revision_info,
apr_pool_t *result_pool,
apr_pool_t *scratch_pool)
{
rep_stats_t *text = NULL;
rep_stats_t *props = NULL;
node_revision_t *noderev;
svn_stream_t *stream = svn_stream_from_stringbuf(noderev_str, scratch_pool);
SVN_ERR(svn_fs_fs__read_noderev(&noderev, stream, scratch_pool,
scratch_pool));
SVN_ERR(svn_fs_fs__fixup_expanded_size(query->fs, noderev->data_rep,
scratch_pool));
SVN_ERR(svn_fs_fs__fixup_expanded_size(query->fs, noderev->prop_rep,
scratch_pool));
if (noderev->data_rep)
{
SVN_ERR(parse_representation(&text, query,
noderev->data_rep, revision_info,
result_pool, scratch_pool));
/* if we are the first to use this rep, mark it as "text rep" */
if (++text->ref_count == 1)
text->kind = noderev->kind == svn_node_dir ? dir_rep : file_rep;
}
if (noderev->prop_rep)
{
SVN_ERR(parse_representation(&props, query,
noderev->prop_rep, revision_info,
result_pool, scratch_pool));
/* if we are the first to use this rep, mark it as "prop rep" */
if (++props->ref_count == 1)
props->kind = noderev->kind == svn_node_dir ? dir_property_rep
: file_property_rep;
}
/* record largest changes */
if (text && text->ref_count == 1)
add_change(query->stats, text->size, text->expanded_size, text->revision,
noderev->created_path, text->kind, !noderev->predecessor_id);
if (props && props->ref_count == 1)
add_change(query->stats, props->size, props->expanded_size,
props->revision, noderev->created_path, props->kind,
!noderev->predecessor_id);
/* if this is a directory and has not been processed, yet, read and
* process it recursively */
if ( noderev->kind == svn_node_dir && text && text->ref_count == 1
&& !svn_fs_fs__use_log_addressing(query->fs))
SVN_ERR(parse_dir(query, noderev, revision_info, result_pool,
scratch_pool));
/* update stats */
if (noderev->kind == svn_node_dir)
{
revision_info->dir_noderev_size += noderev_str->len;
revision_info->dir_noderev_count++;
}
else
{
revision_info->file_noderev_size += noderev_str->len;
revision_info->file_noderev_count++;
}
return SVN_NO_ERROR;
}
/* For the revision given as REVISION_INFO within QUERY, determine the number
* of entries in its changed paths list and store that info in REVISION_INFO.
* Use SCRATCH_POOL for temporary allocations.
*/
static svn_error_t *
get_phys_change_count(query_t *query,
revision_info_t *revision_info,
apr_pool_t *scratch_pool)
{
apr_pool_t *iterpool = svn_pool_create(scratch_pool);
svn_fs_fs__changes_context_t *context;
/* Fetch the first block of data. */
SVN_ERR(svn_fs_fs__create_changes_context(&context, query->fs,
revision_info->revision,
scratch_pool));
revision_info->change_count = 0;
while (!context->eol)
{
apr_array_header_t *changes;
svn_pool_clear(iterpool);
SVN_ERR(svn_fs_fs__get_changes(&changes, context, iterpool, iterpool));
revision_info->change_count = changes->nelts;
}
svn_pool_destroy(iterpool);
return SVN_NO_ERROR;
}
/* Read header information for the revision stored in FILE_CONTENT (one
* whole revision). Return the offsets within FILE_CONTENT for the
* *ROOT_NODEREV, the list of *CHANGES and its len in *CHANGES_LEN.
* Use POOL for temporary allocations. */
static svn_error_t *
read_phys_revision(query_t *query,
revision_info_t *info,
apr_pool_t *result_pool,
apr_pool_t *scratch_pool)
{
char buf[64];
apr_off_t root_node_offset;
apr_off_t changes_offset;
svn_stringbuf_t *trailer;
svn_stringbuf_t *noderev_str;
/* Read the last 64 bytes of the revision (if long enough). */
apr_off_t start = MAX(info->offset, info->end - sizeof(buf));
apr_size_t len = (apr_size_t)(info->end - start);
SVN_ERR(svn_io_file_seek(info->rev_file->file, APR_SET, &start,
scratch_pool));
SVN_ERR(svn_io_file_read_full2(info->rev_file->file, buf, len, NULL, NULL,
scratch_pool));
trailer = svn_stringbuf_ncreate(buf, len, scratch_pool);
/* Parse that trailer. */
SVN_ERR(svn_fs_fs__parse_revision_trailer(&root_node_offset,
&changes_offset, trailer,
info->revision));
SVN_ERR(get_phys_change_count(query, info, scratch_pool));
/* Calculate the length of the changes list. */
trailer = svn_fs_fs__unparse_revision_trailer(root_node_offset,
changes_offset,
scratch_pool);
info->changes_len = info->end - info->offset - changes_offset
- trailer->len;
/* Recursively read nodes added in this rev. */
SVN_ERR(read_phsy_noderev(&noderev_str, query, root_node_offset, info,
scratch_pool, scratch_pool));
SVN_ERR(read_noderev(query, noderev_str, info, result_pool, scratch_pool));
return SVN_NO_ERROR;
}
/* Read the content of the pack file staring at revision BASE physical
* addressing mode and store it in QUERY.
*
* Use RESULT_POOL for persistent allocations and SCRATCH_POOL for
* temporaries.
*/
static svn_error_t *
read_phys_pack_file(query_t *query,
svn_revnum_t base,
apr_pool_t *result_pool,
apr_pool_t *scratch_pool)
{
apr_pool_t *iterpool = svn_pool_create(scratch_pool);
int i;
svn_filesize_t file_size = 0;
svn_fs_fs__revision_file_t *rev_file;
SVN_ERR(svn_fs_fs__open_pack_or_rev_file(&rev_file, query->fs, base,
scratch_pool, scratch_pool));
SVN_ERR(svn_io_file_size_get(&file_size, rev_file->file, scratch_pool));
/* process each revision in the pack file */
for (i = 0; i < query->shard_size; ++i)
{
revision_info_t *info;
/* cancellation support */
if (query->cancel_func)
SVN_ERR(query->cancel_func(query->cancel_baton));
/* create the revision info for the current rev */
info = apr_pcalloc(result_pool, sizeof(*info));
info->representations = apr_array_make(result_pool, 4,
sizeof(rep_stats_t*));
info->rev_file = rev_file;
info->revision = base + i;
SVN_ERR(svn_fs_fs__get_packed_offset(&info->offset, query->fs, base + i,
iterpool));
if (i + 1 == query->shard_size)
info->end = file_size;
else
SVN_ERR(svn_fs_fs__get_packed_offset(&info->end, query->fs,
base + i + 1, iterpool));
SVN_ERR(read_phys_revision(query, info, result_pool, iterpool));
info->representations = apr_array_copy(result_pool,
info->representations);
/* Done with this revision. */
info->rev_file = NULL;
/* put it into our container */
APR_ARRAY_PUSH(query->revisions, revision_info_t*) = info;
/* destroy temps */
svn_pool_clear(iterpool);
}
/* Done with this pack file. */
SVN_ERR(svn_fs_fs__close_revision_file(rev_file));
/* one more pack file processed */
if (query->progress_func)
query->progress_func(base, query->progress_baton, scratch_pool);
return SVN_NO_ERROR;
}
/* Read the content of the file for REVISION in physical addressing mode
* and store its contents in QUERY.
*
* Use RESULT_POOL for persistent allocations and SCRATCH_POOL for
* temporaries.
*/
static svn_error_t *
read_phys_revision_file(query_t *query,
svn_revnum_t revision,
apr_pool_t *result_pool,
apr_pool_t *scratch_pool)
{
revision_info_t *info = apr_pcalloc(result_pool, sizeof(*info));
svn_filesize_t file_size = 0;
svn_fs_fs__revision_file_t *rev_file;
/* cancellation support */
if (query->cancel_func)
SVN_ERR(query->cancel_func(query->cancel_baton));
/* read the whole pack file into memory */
SVN_ERR(svn_fs_fs__open_pack_or_rev_file(&rev_file, query->fs, revision,
scratch_pool, scratch_pool));
SVN_ERR(svn_io_file_size_get(&file_size, rev_file->file, scratch_pool));
/* create the revision info for the current rev */
info->representations = apr_array_make(result_pool, 4, sizeof(rep_stats_t*));
info->rev_file = rev_file;
info->revision = revision;
info->offset = 0;
info->end = file_size;
SVN_ERR(read_phys_revision(query, info, result_pool, scratch_pool));
/* Done with this revision. */
SVN_ERR(svn_fs_fs__close_revision_file(rev_file));
info->rev_file = NULL;
/* put it into our container */
APR_ARRAY_PUSH(query->revisions, revision_info_t*) = info;
/* show progress every 1000 revs or so */
if (query->progress_func)
{
if (query->shard_size && (revision % query->shard_size == 0))
query->progress_func(revision, query->progress_baton, scratch_pool);
if (!query->shard_size && (revision % 1000 == 0))
query->progress_func(revision, query->progress_baton, scratch_pool);
}
return SVN_NO_ERROR;
}
/* Given the unparsed changes list in CHANGES with LEN chars, return the
* number of changed paths encoded in it. Only used in log. addressing
* mode.
*/
static apr_uint64_t
get_log_change_count(const char *changes,
apr_size_t len)
{
apr_size_t lines = 0;
const char *end = changes + len;
/* line count */
for (; changes < end; ++changes)
if (*changes == '\n')
++lines;
/* two lines per change */
return lines / 2;
}
/* Read the item described by ENTRY from the REV_FILE and return the
* respective byte sequence in *CONTENTS, allocated in RESULT_POOL.
* Use SCRATCH_POOL for temporary allocations
*/
static svn_error_t *
read_item(svn_stringbuf_t **contents,
svn_fs_fs__revision_file_t *rev_file,
svn_fs_fs__p2l_entry_t *entry,
apr_pool_t *result_pool,
apr_pool_t *scratch_pool)
{
svn_stringbuf_t *item = svn_stringbuf_create_ensure(entry->size,
result_pool);
item->len = entry->size;
item->data[item->len] = 0;
SVN_ERR(svn_io_file_aligned_seek(rev_file->file, rev_file->block_size,
NULL, entry->offset, scratch_pool));
SVN_ERR(svn_io_file_read_full2(rev_file->file, item->data, item->len,
NULL, NULL, scratch_pool));
*contents = item;
return SVN_NO_ERROR;
}
/* Predicate comparing the two rep_ref_t** LHS and RHS by the respective
* representation's revision.
*/
static int
compare_representation_refs(const void *lhs, const void *rhs)
{
svn_revnum_t lhs_rev = (*(const rep_ref_t *const *)lhs)->revision;
svn_revnum_t rhs_rev = (*(const rep_ref_t *const *)rhs)->revision;
if (lhs_rev < rhs_rev)
return -1;
return (lhs_rev > rhs_rev ? 1 : 0);
}
/* Given all the presentations found in a single rev / pack file as
* rep_ref_t * in REP_REFS, update the delta chain lengths in QUERY.
* REP_REFS and its contents can then be discarded.
*/
static svn_error_t *
resolve_representation_refs(query_t *query,
apr_array_header_t *rep_refs)
{
int i;
/* Because delta chains can only point to previous revs, after sorting
* REP_REFS, all base refs have already been updated. */
svn_sort__array(rep_refs, compare_representation_refs);
/* Build up the CHAIN_LENGTH values. */
for (i = 0; i < rep_refs->nelts; ++i)
{
int idx;
rep_ref_t *ref = APR_ARRAY_IDX(rep_refs, i, rep_ref_t *);
rep_stats_t *rep = find_representation(&idx, query, NULL,
ref->revision, ref->item_index);
/* No dangling pointers and all base reps have been processed. */
SVN_ERR_ASSERT(rep);
SVN_ERR_ASSERT(!rep->chain_length);
/* Set the HEADER_SIZE as we found it during the scan. */
rep->header_size = ref->header_size;
/* The delta chain got 1 element longer. */
if (ref->base_revision == SVN_INVALID_REVNUM)
{
rep->chain_length = 1;
}
else
{
rep_stats_t *base;
base = find_representation(&idx, query, NULL, ref->base_revision,
ref->base_item_index);
SVN_ERR_ASSERT(base);
SVN_ERR_ASSERT(base->chain_length);
rep->chain_length = 1 + MIN(base->chain_length, (apr_byte_t)0xfe);
}
}
return SVN_NO_ERROR;
}
/* Process the logically addressed revision contents of revisions BASE to
* BASE + COUNT - 1 in QUERY.
*
* Use RESULT_POOL for persistent allocations and SCRATCH_POOL for
* temporaries.
*/
static svn_error_t *
read_log_rev_or_packfile(query_t *query,
svn_revnum_t base,
int count,
apr_pool_t *result_pool,
apr_pool_t *scratch_pool)
{
fs_fs_data_t *ffd = query->fs->fsap_data;
apr_pool_t *iterpool = svn_pool_create(scratch_pool);
apr_off_t max_offset;
apr_off_t offset = 0;
int i;
svn_fs_fs__revision_file_t *rev_file;
/* We collect the delta chain links as we scan the file. Afterwards,
* we determine the lengths of those delta chains and throw this
* temporary container away. */
apr_array_header_t *rep_refs = apr_array_make(scratch_pool, 64,
sizeof(rep_ref_t *));
/* we will process every revision in the rev / pack file */
for (i = 0; i < count; ++i)
{
/* create the revision info for the current rev */
revision_info_t *info = apr_pcalloc(result_pool, sizeof(*info));
info->representations = apr_array_make(result_pool, 4,
sizeof(rep_stats_t*));
info->revision = base + i;
APR_ARRAY_PUSH(query->revisions, revision_info_t*) = info;
}
/* open the pack / rev file that is covered by the p2l index */
SVN_ERR(svn_fs_fs__open_pack_or_rev_file(&rev_file, query->fs, base,
scratch_pool, iterpool));
SVN_ERR(svn_fs_fs__p2l_get_max_offset(&max_offset, query->fs, rev_file,
base, scratch_pool));
/* record the whole pack size in the first rev so the total sum will
still be correct */
APR_ARRAY_IDX(query->revisions, base, revision_info_t*)->end = max_offset;
/* for all offsets in the file, get the P2L index entries and process
the interesting items (change lists, noderevs) */
for (offset = 0; offset < max_offset; )
{
apr_array_header_t *entries;
svn_pool_clear(iterpool);
/* cancellation support */
if (query->cancel_func)
SVN_ERR(query->cancel_func(query->cancel_baton));
/* get all entries for the current block */
SVN_ERR(svn_fs_fs__p2l_index_lookup(&entries, query->fs, rev_file, base,
offset, ffd->p2l_page_size,
iterpool, iterpool));
/* process all entries (and later continue with the next block) */
for (i = 0; i < entries->nelts; ++i)
{
svn_stringbuf_t *item;
revision_info_t *info;
svn_fs_fs__p2l_entry_t *entry
= &APR_ARRAY_IDX(entries, i, svn_fs_fs__p2l_entry_t);
/* skip bits we previously processed */
if (i == 0 && entry->offset < offset)
continue;
/* skip zero-sized entries */
if (entry->size == 0)
continue;
/* read and process interesting items */
info = APR_ARRAY_IDX(query->revisions, entry->item.revision,
revision_info_t*);
if (entry->type == SVN_FS_FS__ITEM_TYPE_NODEREV)
{
SVN_ERR(read_item(&item, rev_file, entry, iterpool, iterpool));
SVN_ERR(read_noderev(query, item, info, result_pool, iterpool));
}
else if (entry->type == SVN_FS_FS__ITEM_TYPE_CHANGES)
{
SVN_ERR(read_item(&item, rev_file, entry, iterpool, iterpool));
info->change_count
= get_log_change_count(item->data + 0, item->len);
info->changes_len += entry->size;
}
else if ( (entry->type == SVN_FS_FS__ITEM_TYPE_FILE_REP)
|| (entry->type == SVN_FS_FS__ITEM_TYPE_DIR_REP)
|| (entry->type == SVN_FS_FS__ITEM_TYPE_FILE_PROPS)
|| (entry->type == SVN_FS_FS__ITEM_TYPE_DIR_PROPS))
{
/* Collect the delta chain link. */
svn_fs_fs__rep_header_t *header;
rep_ref_t *ref = apr_pcalloc(scratch_pool, sizeof(*ref));
SVN_ERR(svn_io_file_aligned_seek(rev_file->file,
rev_file->block_size,
NULL, entry->offset,
iterpool));
SVN_ERR(svn_fs_fs__read_rep_header(&header,
rev_file->stream,
iterpool, iterpool));
ref->header_size = header->header_size;
ref->revision = entry->item.revision;
ref->item_index = entry->item.number;
if (header->type == svn_fs_fs__rep_delta)
{
ref->base_item_index = header->base_item_index;
ref->base_revision = header->base_revision;
}
else
{
ref->base_item_index = SVN_FS_FS__ITEM_INDEX_UNUSED;
ref->base_revision = SVN_INVALID_REVNUM;
}
APR_ARRAY_PUSH(rep_refs, rep_ref_t *) = ref;
}
/* advance offset */
offset += entry->size;
}
}
/* Resolve the delta chain links. */
SVN_ERR(resolve_representation_refs(query, rep_refs));
/* clean up and close file handles */
svn_pool_destroy(iterpool);
return SVN_NO_ERROR;
}
/* Read the content of the pack file staring at revision BASE logical
* addressing mode and store it in QUERY.
*
* Use RESULT_POOL for persistent allocations and SCRATCH_POOL for
* temporaries.
*/
static svn_error_t *
read_log_pack_file(query_t *query,
svn_revnum_t base,
apr_pool_t *result_pool,
apr_pool_t *scratch_pool)
{
SVN_ERR(read_log_rev_or_packfile(query, base, query->shard_size,
result_pool, scratch_pool));
/* one more pack file processed */
if (query->progress_func)
query->progress_func(base, query->progress_baton, scratch_pool);
return SVN_NO_ERROR;
}
/* Read the content of the file for REVISION in logical addressing mode
* and store its contents in QUERY.
*
* Use RESULT_POOL for persistent allocations and SCRATCH_POOL for
* temporaries.
*/
static svn_error_t *
read_log_revision_file(query_t *query,
svn_revnum_t revision,
apr_pool_t *result_pool,
apr_pool_t *scratch_pool)
{
SVN_ERR(read_log_rev_or_packfile(query, revision, 1,
result_pool, scratch_pool));
/* show progress every 1000 revs or so */
if (query->progress_func)
{
if (query->shard_size && (revision % query->shard_size == 0))
query->progress_func(revision, query->progress_baton, scratch_pool);
if (!query->shard_size && (revision % 1000 == 0))
query->progress_func(revision, query->progress_baton, scratch_pool);
}
return SVN_NO_ERROR;
}
/* Read the repository and collect the stats info in QUERY.
*
* Use RESULT_POOL for persistent allocations and SCRATCH_POOL for
* temporaries.
*/
static svn_error_t *
read_revisions(query_t *query,
apr_pool_t *result_pool,
apr_pool_t *scratch_pool)
{
apr_pool_t *iterpool = svn_pool_create(scratch_pool);
svn_revnum_t revision;
/* read all packed revs */
for ( revision = 0
; revision < query->min_unpacked_rev
; revision += query->shard_size)
{
svn_pool_clear(iterpool);
if (svn_fs_fs__use_log_addressing(query->fs))
SVN_ERR(read_log_pack_file(query, revision, result_pool, iterpool));
else
SVN_ERR(read_phys_pack_file(query, revision, result_pool, iterpool));
}
/* read non-packed revs */
for ( ; revision <= query->head; ++revision)
{
svn_pool_clear(iterpool);
if (svn_fs_fs__use_log_addressing(query->fs))
SVN_ERR(read_log_revision_file(query, revision, result_pool,
iterpool));
else
SVN_ERR(read_phys_revision_file(query, revision, result_pool,
iterpool));
}
svn_pool_destroy(iterpool);
return SVN_NO_ERROR;
}
/* Accumulate stats of REP in STATS.
*/
static void
add_rep_pack_stats(svn_fs_fs__rep_pack_stats_t *stats,
rep_stats_t *rep)
{
stats->count++;
stats->packed_size += rep->size;
stats->expanded_size += rep->expanded_size;
stats->overhead_size += rep->header_size + 7 /* ENDREP\n */;
}
/* Accumulate stats of REP in STATS.
*/
static void
add_rep_stats(svn_fs_fs__representation_stats_t *stats,
rep_stats_t *rep)
{
add_rep_pack_stats(&stats->total, rep);
if (rep->ref_count == 1)
add_rep_pack_stats(&stats->uniques, rep);
else
add_rep_pack_stats(&stats->shared, rep);
stats->references += rep->ref_count;
stats->expanded_size += rep->ref_count * rep->expanded_size;
stats->chain_len += rep->chain_length;
}
/* Aggregate the info the in revision_info_t * array REVISIONS into the
* respectve fields of STATS.
*/
static void
aggregate_stats(const apr_array_header_t *revisions,
svn_fs_fs__stats_t *stats)
{
int i, k;
/* aggregate info from all revisions */
stats->revision_count = revisions->nelts;
for (i = 0; i < revisions->nelts; ++i)
{
revision_info_t *revision = APR_ARRAY_IDX(revisions, i,
revision_info_t *);
/* data gathered on a revision level */
stats->change_count += revision->change_count;
stats->change_len += revision->changes_len;
stats->total_size += revision->end - revision->offset;
stats->dir_node_stats.count += revision->dir_noderev_count;
stats->dir_node_stats.size += revision->dir_noderev_size;
stats->file_node_stats.count += revision->file_noderev_count;
stats->file_node_stats.size += revision->file_noderev_size;
stats->total_node_stats.count += revision->dir_noderev_count
+ revision->file_noderev_count;
stats->total_node_stats.size += revision->dir_noderev_size
+ revision->file_noderev_size;
/* process representations */
for (k = 0; k < revision->representations->nelts; ++k)
{
rep_stats_t *rep = APR_ARRAY_IDX(revision->representations, k,
rep_stats_t *);
/* accumulate in the right bucket */
switch(rep->kind)
{
case file_rep:
add_rep_stats(&stats->file_rep_stats, rep);
break;
case dir_rep:
add_rep_stats(&stats->dir_rep_stats, rep);
break;
case file_property_rep:
add_rep_stats(&stats->file_prop_rep_stats, rep);
break;
case dir_property_rep:
add_rep_stats(&stats->dir_prop_rep_stats, rep);
break;
default:
break;
}
add_rep_stats(&stats->total_rep_stats, rep);
}
}
}
/* Return a new svn_fs_fs__stats_t instance, allocated in RESULT_POOL.
*/
static svn_fs_fs__stats_t *
create_stats(apr_pool_t *result_pool)
{
svn_fs_fs__stats_t *stats = apr_pcalloc(result_pool, sizeof(*stats));
initialize_largest_changes(stats, 64, result_pool);
stats->by_extension = apr_hash_make(result_pool);
return stats;
}
/* Create a *QUERY, allocated in RESULT_POOL, reading filesystem FS and
* collecting results in STATS. Store the optional PROCESS_FUNC and
* PROGRESS_BATON as well as CANCEL_FUNC and CANCEL_BATON in *QUERY, too.
* Use SCRATCH_POOL for temporary allocations.
*/
static svn_error_t *
create_query(query_t **query,
svn_fs_t *fs,
svn_fs_fs__stats_t *stats,
svn_fs_progress_notify_func_t progress_func,
void *progress_baton,
svn_cancel_func_t cancel_func,
void *cancel_baton,
apr_pool_t *result_pool,
apr_pool_t *scratch_pool)
{
*query = apr_pcalloc(result_pool, sizeof(**query));
/* Read repository dimensions. */
(*query)->shard_size = svn_fs_fs__shard_size(fs);
SVN_ERR(svn_fs_fs__youngest_rev(&(*query)->head, fs, scratch_pool));
SVN_ERR(svn_fs_fs__min_unpacked_rev(&(*query)->min_unpacked_rev, fs,
scratch_pool));
/* create data containers and caches
* Note: this assumes that int is at least 32-bits and that we only support
* 32-bit wide revision numbers (actually 31-bits due to the signedness
* of both the nelts field of the array and our revision numbers). This
* means this code will fail on platforms where int is less than 32-bits
* and the repository has more revisions than int can hold. */
(*query)->revisions = apr_array_make(result_pool, (int) (*query)->head + 1,
sizeof(revision_info_t *));
(*query)->null_base = apr_pcalloc(result_pool,
sizeof(*(*query)->null_base));
/* Store other parameters */
(*query)->fs = fs;
(*query)->stats = stats;
(*query)->progress_func = progress_func;
(*query)->progress_baton = progress_baton;
(*query)->cancel_func = cancel_func;
(*query)->cancel_baton = cancel_baton;
return SVN_NO_ERROR;
}
svn_error_t *
svn_fs_fs__get_stats(svn_fs_fs__stats_t **stats,
svn_fs_t *fs,
svn_fs_progress_notify_func_t progress_func,
void *progress_baton,
svn_cancel_func_t cancel_func,
void *cancel_baton,
apr_pool_t *result_pool,
apr_pool_t *scratch_pool)
{
query_t *query;
*stats = create_stats(result_pool);
SVN_ERR(create_query(&query, fs, *stats, progress_func, progress_baton,
cancel_func, cancel_baton, scratch_pool,
scratch_pool));
SVN_ERR(read_revisions(query, scratch_pool, scratch_pool));
aggregate_stats(query->revisions, *stats);
return SVN_NO_ERROR;
}
/* Baton for rev_size_index_entry_cb. */
struct rev_size_baton_t {
svn_revnum_t revision;
apr_off_t rev_size;
};
/* Implements svn_fs_fs__dump_index_func_t, summing object sizes for
* revision BATON->revision into BATON->rev_size.
*/
static svn_error_t *
rev_size_index_entry_cb(const svn_fs_fs__p2l_entry_t *entry,
void *baton,
apr_pool_t *scratch_pool)
{
struct rev_size_baton_t *b = baton;
if (entry->item.revision == b->revision)
b->rev_size += entry->size;
return SVN_NO_ERROR;
}
svn_error_t *
svn_fs_fs__revision_size(apr_off_t *rev_size,
svn_fs_t *fs,
svn_revnum_t revision,
apr_pool_t *scratch_pool)
{
/* Get the size of the revision (excluding rev-props) */
if (svn_fs_fs__use_log_addressing(fs))
{
/* This works for a packed or a non-packed revision.
We could provide an optimized case for a non-packed revision
using svn_fs_fs__p2l_get_max_offset(). */
struct rev_size_baton_t b = { 0, 0 };
b.revision = revision;
SVN_ERR(svn_fs_fs__dump_index(fs, revision,
rev_size_index_entry_cb, &b,
NULL, NULL, scratch_pool));
*rev_size = b.rev_size;
}
else
{
svn_fs_fs__revision_file_t *rev_file;
svn_revnum_t min_unpacked_rev;
SVN_ERR(svn_fs_fs__open_pack_or_rev_file(&rev_file, fs, revision,
scratch_pool, scratch_pool));
SVN_ERR(svn_fs_fs__min_unpacked_rev(&min_unpacked_rev, fs,
scratch_pool));
if (revision < min_unpacked_rev)
{
int shard_size = svn_fs_fs__shard_size(fs);
apr_off_t start_offset, end_offset;
SVN_ERR(svn_fs_fs__get_packed_offset(&start_offset, fs, revision,
scratch_pool));
if (((revision + 1) % shard_size) == 0)
{
svn_filesize_t file_size;
SVN_ERR(svn_io_file_size_get(&file_size, rev_file->file, scratch_pool));
end_offset = (apr_off_t)file_size;
}
else
{
SVN_ERR(svn_fs_fs__get_packed_offset(&end_offset, fs,
revision + 1, scratch_pool));
}
*rev_size = (end_offset - start_offset);
}
else
{
svn_filesize_t file_size;
SVN_ERR(svn_io_file_size_get(&file_size, rev_file->file, scratch_pool));
*rev_size = (apr_off_t)file_size;
}
SVN_ERR(svn_fs_fs__close_revision_file(rev_file));
}
/* Add the size of the rev-props */
{
apr_off_t size;
SVN_ERR(svn_fs_fs__get_revision_props_size(&size, fs, revision, scratch_pool));
*rev_size += size;
}
return SVN_NO_ERROR;
}