deltas: Compute rollsum targets

This commit is contained in:
Colin Walters 2015-01-29 17:39:34 -05:00
parent d749932f6b
commit 3df8be0d92
2 changed files with 220 additions and 6 deletions

View File

@ -49,6 +49,8 @@ libostree_1_la_SOURCES = \
src/libostree/ostree-lzma-compressor.h \
src/libostree/ostree-lzma-decompressor.c \
src/libostree/ostree-lzma-decompressor.h \
src/libostree/bupsplit.h \
src/libostree/bupsplit.c \
src/libostree/ostree-varint.h \
src/libostree/ostree-varint.c \
src/libostree/ostree-linuxfsutil.h \

View File

@ -21,6 +21,7 @@
#include "config.h"
#include <string.h>
#include <zlib.h>
#include "ostree-core-private.h"
#include "ostree-repo-private.h"
@ -379,6 +380,185 @@ process_one_object (OstreeRepo *repo,
return ret;
}
typedef struct {
GPtrArray *keys;
GHashTable *values;
} OrderedRollsums;
static void
ordered_rollsums_free (OrderedRollsums *ohash)
{
g_ptr_array_unref (ohash->keys);
g_hash_table_unref (ohash->values);
g_free (ohash);
}
static gboolean
rollsum_chunks_crc32 (GInputStream *istream,
OrderedRollsums **out_rollsums,
GCancellable *cancellable,
GError **error)
{
gboolean ret = FALSE;
gsize start = 0;
gboolean rollsum_end = FALSE;
OrderedRollsums *ret_rollsums = g_new0 (OrderedRollsums, 1);
gs_unref_object GBufferedInputStream *bufinput =
(GBufferedInputStream*) g_buffered_input_stream_new_sized (istream, ROLLSUM_BLOB_MAX);
ret_rollsums->keys = g_ptr_array_new_with_free_func ((GDestroyNotify)g_variant_unref);
ret_rollsums->values = g_hash_table_new (NULL, NULL);
while (TRUE)
{
gssize bytes_read;
const guint8 *buf;
gsize bufsize;
int offset, bits;
bytes_read = g_buffered_input_stream_fill (bufinput, -1, cancellable, error);
if (bytes_read == -1)
goto out;
if (bytes_read == 0)
break;
buf = g_buffered_input_stream_peek_buffer (bufinput, &bufsize);
if (!rollsum_end)
{
offset = bupsplit_find_ofs (buf, MIN(G_MAXINT32, bufsize), &bits);
if (offset == 0)
{
rollsum_end = TRUE;
offset = MIN(ROLLSUM_BLOB_MAX, bufsize);
}
else if (offset > ROLLSUM_BLOB_MAX)
offset = ROLLSUM_BLOB_MAX;
}
else
offset = MIN(ROLLSUM_BLOB_MAX, bufsize);
if (!g_input_stream_skip ((GInputStream*)bufinput, bufsize, cancellable, error))
goto out;
/* Use zlib's crc32 */
{ guint32 crc = crc32 (0L, NULL, 0);
GVariant *val;
crc = crc32 (crc, buf, offset);
val = g_variant_ref_sink (g_variant_new ("(utt)", crc, (guint64) start, (guint64)offset));
g_ptr_array_add (ret_rollsums->keys, val);
g_hash_table_insert (ret_rollsums->values, GUINT_TO_POINTER (crc), val);
}
start += offset;
}
ret = TRUE;
gs_transfer_out_value (out_rollsums, &ret_rollsums);
out:
if (ret_rollsums)
ordered_rollsums_free (ret_rollsums);
return ret;
}
typedef struct {
char *from_checksum;
OrderedRollsums *from_rollsums;
OrderedRollsums *to_rollsums;
guint match_ratio;
} ContentRollsum;
static void
content_rollsums_free (ContentRollsum *rollsum)
{
g_free (rollsum->from_checksum);
ordered_rollsums_free (rollsum->from_rollsums);
ordered_rollsums_free (rollsum->to_rollsums);
g_free (rollsum);
}
static gboolean
try_content_rollsum (OstreeRepo *repo,
const char *from,
const char *to,
ContentRollsum **out_rollsum,
GCancellable *cancellable,
GError **error)
{
gboolean ret = FALSE;
OrderedRollsums *from_rollsum = NULL;
OrderedRollsums *to_rollsum = NULL;
gs_unref_object GInputStream *from_istream = NULL;
gs_unref_object GFileInfo *from_finfo = NULL;
gs_unref_object GInputStream *to_istream = NULL;
gs_unref_object GFileInfo *to_finfo = NULL;
ContentRollsum *ret_rollsum = NULL;
guint total = 0;
guint matches = 0;
guint match_ratio = 0;
gpointer hkey, hvalue;
GHashTableIter hiter;
*out_rollsum = NULL;
if (!ostree_repo_load_file (repo, from, &from_istream, &from_finfo, NULL,
cancellable, error))
goto out;
if (!ostree_repo_load_file (repo, to, &to_istream, &to_finfo, NULL,
cancellable, error))
goto out;
/* Only try to rollsum regular files obviously */
if (!(g_file_info_get_file_type (from_finfo) == G_FILE_TYPE_REGULAR
&& g_file_info_get_file_type (to_finfo) == G_FILE_TYPE_REGULAR))
{
ret = TRUE;
goto out;
}
g_assert (from_istream && to_istream);
if (!rollsum_chunks_crc32 (from_istream, &from_rollsum, cancellable, error))
goto out;
if (!rollsum_chunks_crc32 (to_istream, &to_rollsum, cancellable, error))
goto out;
g_clear_object (&from_istream);
g_clear_object (&to_istream);
g_hash_table_iter_init (&hiter, to_rollsum->values);
while (g_hash_table_iter_next (&hiter, &hkey, &hvalue))
{
if (g_hash_table_contains (from_rollsum->values, hkey))
matches++;
total++;
}
match_ratio = (matches*100)/total;
/* Only proceed if the file contains (arbitrary) more than 25% of
* the previous chunks.
*/
if (match_ratio < 25)
{
ret = TRUE;
goto out;
}
ret_rollsum = g_new0 (ContentRollsum, 1);
ret_rollsum->match_ratio = match_ratio;
ret_rollsum->from_checksum = g_strdup (from);
ret_rollsum->from_rollsums = from_rollsum; from_rollsum = NULL;
ret_rollsum->to_rollsums = to_rollsum; to_rollsum = NULL;
ret = TRUE;
gs_transfer_out_value (out_rollsum, &ret_rollsum);
out:
return ret;
}
static gboolean
generate_delta_lowlatency (OstreeRepo *repo,
const char *from,
@ -402,6 +582,7 @@ generate_delta_lowlatency (OstreeRepo *repo,
gs_unref_hashtable GHashTable *new_reachable_metadata = NULL;
gs_unref_hashtable GHashTable *new_reachable_content = NULL;
gs_unref_hashtable GHashTable *modified_content_objects = NULL;
gs_unref_hashtable GHashTable *rollsum_optimized_content_objects = NULL;
gs_unref_hashtable GHashTable *content_object_to_size = NULL;
if (from != NULL)
@ -424,15 +605,21 @@ generate_delta_lowlatency (OstreeRepo *repo,
cancellable, error))
goto out;
modified_content_objects = g_hash_table_new_full (ostree_hash_object_name, g_variant_equal,
NULL,
(GDestroyNotify) g_variant_unref);
modified_content_objects = g_hash_table_new_full (g_str_hash, g_str_equal,
g_free, g_free);
for (i = 0; i < modified->len; i++)
{
OstreeDiffItem *diffitem = modified->pdata[i];
GVariant *objname = ostree_object_name_serialize (diffitem->target_checksum,
OSTREE_OBJECT_TYPE_FILE);
g_hash_table_add (modified_content_objects, objname);
/* Theoretically, a target file could replace multiple source
* files. That could happen if say a project changed from having
* multiple binaries to one binary.
*
* In that case, we have last one wins behavior. For ELF rollsum
* tends to be useless unless there's a large static data blob.
*/
g_hash_table_replace (modified_content_objects,
g_strdup (diffitem->target_checksum),
g_strdup (diffitem->src_checksum));
}
if (from)
@ -478,6 +665,31 @@ generate_delta_lowlatency (OstreeRepo *repo,
g_hash_table_remove (new_reachable_metadata,
ostree_object_name_serialize (to, OSTREE_OBJECT_TYPE_COMMIT));
rollsum_optimized_content_objects = g_hash_table_new_full (g_str_hash, g_str_equal,
g_free,
(GDestroyNotify) content_rollsums_free);
g_hash_table_iter_init (&hashiter, modified_content_objects);
while (g_hash_table_iter_next (&hashiter, &key, &value))
{
const char *to_checksum = key;
const char *from_checksum = value;
ContentRollsum *rollsum;
if (!try_content_rollsum (repo, from_checksum, to_checksum,
&rollsum, cancellable, error))
goto out;
if (!rollsum)
continue;
g_hash_table_insert (rollsum_optimized_content_objects, g_strdup (to_checksum), rollsum);
}
g_printerr ("rollsum for %u/%u modified\n",
g_hash_table_size (rollsum_optimized_content_objects),
g_hash_table_size (modified_content_objects));
/* Scan for large objects, so we can fall back to plain HTTP-based
* fetch. In the future this should come after an rsync-style
* rolling delta check for modified files.