deltas: Implement rollsums

This does an rsync-style prepared delta basically.  On my test data,
it shaves ~6MB of uncompressed data.  Not a huge amount, but I expect
this to be more useful for things like binaries which embed data, etc.
This commit is contained in:
Colin Walters 2015-01-30 10:13:07 -05:00
parent 345754a564
commit 9aa7e30b38
8 changed files with 642 additions and 338 deletions

View File

@ -51,6 +51,8 @@ libostree_1_la_SOURCES = \
src/libostree/ostree-lzma-decompressor.h \ src/libostree/ostree-lzma-decompressor.h \
src/libostree/bupsplit.h \ src/libostree/bupsplit.h \
src/libostree/bupsplit.c \ src/libostree/bupsplit.c \
src/libostree/ostree-rollsum.h \
src/libostree/ostree-rollsum.c \
src/libostree/ostree-varint.h \ src/libostree/ostree-varint.h \
src/libostree/ostree-varint.c \ src/libostree/ostree-varint.c \
src/libostree/ostree-linuxfsutil.h \ src/libostree/ostree-linuxfsutil.h \

View File

@ -87,12 +87,6 @@ INSTALL_DATA_HOOKS += install-gpg-data-hook
echo 'Output=TAP' >> $@.tmp; \ echo 'Output=TAP' >> $@.tmp; \
mv $@.tmp $@) mv $@.tmp $@)
%.test: tests/%.js Makefile
$(AM_V_GEN) (echo '[Test]' > $@.tmp; \
echo 'Exec=env TESTDATADIR=$(pkglibexecdir)/installed-tests $(pkglibexecdir)/installed-tests/$(notdir $<)' >> $@.tmp; \
echo 'Type=session' >> $@.tmp; \
mv $@.tmp $@)
if BUILDOPT_GJS if BUILDOPT_GJS
insttest_SCRIPTS += tests/test-core.js \ insttest_SCRIPTS += tests/test-core.js \
tests/test-sizes.js \ tests/test-sizes.js \
@ -109,7 +103,7 @@ check_PROGRAMS = tests/test-rollsum tests/test-varint tests/test-ot-unix-utils
tests_test_ot_unix_utils_CFLAGS = $(ostree_bin_shared_cflags) $(OT_INTERNAL_GIO_UNIX_CFLAGS) tests_test_ot_unix_utils_CFLAGS = $(ostree_bin_shared_cflags) $(OT_INTERNAL_GIO_UNIX_CFLAGS)
tests_test_ot_unix_utils_LDADD = $(ostree_bin_shared_ldadd) $(OT_INTERNAL_GIO_UNIX_LIBS) tests_test_ot_unix_utils_LDADD = $(ostree_bin_shared_ldadd) $(OT_INTERNAL_GIO_UNIX_LIBS)
tests_test_rollsum_SOURCES = src/libostree/bupsplit.c tests/test-rollsum.c tests_test_rollsum_SOURCES = src/libostree/bupsplit.c src/libostree/ostree-rollsum.c tests/test-rollsum.c
tests_test_rollsum_CFLAGS = $(ostree_bin_shared_cflags) $(OT_INTERNAL_GIO_UNIX_CFLAGS) tests_test_rollsum_CFLAGS = $(ostree_bin_shared_cflags) $(OT_INTERNAL_GIO_UNIX_CFLAGS)
tests_test_rollsum_LDADD = $(ostree_bin_shared_ldadd) $(OT_INTERNAL_GIO_UNIX_LIBS) tests_test_rollsum_LDADD = $(ostree_bin_shared_ldadd) $(OT_INTERNAL_GIO_UNIX_LIBS)

View File

@ -21,18 +21,16 @@
#include "config.h" #include "config.h"
#include <string.h> #include <string.h>
#include <zlib.h> #include <gio/gunixoutputstream.h>
#include "ostree-core-private.h" #include "ostree-core-private.h"
#include "ostree-repo-private.h" #include "ostree-repo-private.h"
#include "ostree-lzma-compressor.h" #include "ostree-lzma-compressor.h"
#include "ostree-repo-static-delta-private.h" #include "ostree-repo-static-delta-private.h"
#include "ostree-diff.h" #include "ostree-diff.h"
#include "ostree-rollsum.h"
#include "otutil.h" #include "otutil.h"
#include "ostree-varint.h" #include "ostree-varint.h"
#include "bupsplit.h"
#define ROLLSUM_BLOB_MAX (8192*4)
typedef struct { typedef struct {
guint64 uncompressed_size; guint64 uncompressed_size;
@ -260,6 +258,36 @@ splice_stream_to_payload (OstreeStaticDeltaPartBuilder *current_part,
return ret; return ret;
} }
/* Store the (uid, gid, mode) triple and the xattrs of a content object in
 * @current_part and report where each one ended up.
 *
 * The three ownership/mode values are read from @content_finfo, converted to
 * big-endian and packed as a "(uuu)" variant.  Both that variant and
 * @content_xattrs are handed to write_unique_variant_chunk(), whose return
 * values are stored in @out_mode_offset and @out_xattr_offset respectively.
 * (Presumably that helper de-duplicates identical variants within the part;
 * confirm against its definition.)
 *
 * @repo is not used by this function.
 */
static void
write_content_mode_xattrs (OstreeRepo                       *repo,
                           OstreeStaticDeltaPartBuilder     *current_part,
                           GFileInfo                        *content_finfo,
                           GVariant                         *content_xattrs,
                           gsize                            *out_mode_offset,
                           gsize                            *out_xattr_offset)
{
  gs_unref_variant GVariant *ownership_and_mode = NULL;
  const guint32 owner_uid =
    g_file_info_get_attribute_uint32 (content_finfo, "unix::uid");
  const guint32 owner_gid =
    g_file_info_get_attribute_uint32 (content_finfo, "unix::gid");
  const guint32 file_mode =
    g_file_info_get_attribute_uint32 (content_finfo, "unix::mode");

  /* The variant is created floating; sink it so the cleanup attribute owns
   * exactly one reference.
   */
  ownership_and_mode = g_variant_new ("(uuu)",
                                      GUINT32_TO_BE (owner_uid),
                                      GUINT32_TO_BE (owner_gid),
                                      GUINT32_TO_BE (file_mode));
  g_variant_ref_sink (ownership_and_mode);

  /* Mode chunk first, xattr chunk second: same order as before the helper
   * was split out of process_one_object().
   */
  *out_mode_offset = write_unique_variant_chunk (current_part,
                                                 current_part->mode_set,
                                                 current_part->modes,
                                                 ownership_and_mode);
  *out_xattr_offset = write_unique_variant_chunk (current_part,
                                                  current_part->xattr_set,
                                                  current_part->xattrs,
                                                  content_xattrs);
}
static gboolean static gboolean
process_one_object (OstreeRepo *repo, process_one_object (OstreeRepo *repo,
OstreeStaticDeltaBuilder *builder, OstreeStaticDeltaBuilder *builder,
@ -327,26 +355,12 @@ process_one_object (OstreeRepo *repo,
else else
{ {
gsize mode_offset, xattr_offset, content_offset; gsize mode_offset, xattr_offset, content_offset;
guint32 uid = guint32 mode;
g_file_info_get_attribute_uint32 (content_finfo, "unix::uid");
guint32 gid =
g_file_info_get_attribute_uint32 (content_finfo, "unix::gid");
guint32 mode =
g_file_info_get_attribute_uint32 (content_finfo, "unix::mode");
gs_unref_variant GVariant *modev
= g_variant_ref_sink (g_variant_new ("(uuu)",
GUINT32_TO_BE (uid),
GUINT32_TO_BE (gid),
GUINT32_TO_BE (mode)));
mode_offset = write_unique_variant_chunk (current_part, mode = g_file_info_get_attribute_uint32 (content_finfo, "unix::mode");
current_part->mode_set,
current_part->modes, write_content_mode_xattrs (repo, current_part, content_finfo, content_xattrs,
modev); &mode_offset, &xattr_offset);
xattr_offset = write_unique_variant_chunk (current_part,
current_part->xattr_set,
current_part->xattrs,
content_xattrs);
if (S_ISLNK (mode)) if (S_ISLNK (mode))
{ {
@ -381,106 +395,75 @@ process_one_object (OstreeRepo *repo,
return ret; return ret;
} }
/* Chunk checksums of one stream, as produced by rollsum_chunks_crc32():
 * `keys` holds the "(utt)" chunk records (crc, start, length) in the order
 * they were produced and owns them; `values` maps a crc to one of those
 * records without holding a reference of its own.
 */
typedef struct {
GPtrArray *keys;
GHashTable *values;
} OrderedRollsums;
/* Release an OrderedRollsums: drop both containers, then the struct itself.
 * @ohash must be non-NULL.
 */
static void
ordered_rollsums_free (OrderedRollsums *ohash)
{
  GPtrArray *chunk_records = ohash->keys;
  GHashTable *by_crc = ohash->values;

  g_ptr_array_unref (chunk_records);
  g_hash_table_unref (by_crc);
  g_free (ohash);
}
/* Split @istream into chunks and record a CRC32 for each one.
 *
 * The stream is wrapped in a buffered stream of ROLLSUM_BLOB_MAX bytes.  On
 * each iteration the buffer is filled, bupsplit_find_ofs() is asked for a
 * split point (capped at ROLLSUM_BLOB_MAX; once it returns 0 the rest of the
 * input is cut into fixed-size pieces), and a "(utt)" record of
 * (crc, start, length) is appended to `keys` and inserted into `values`
 * under its crc.
 *
 * On success ownership of the result moves to @out_rollsums and TRUE is
 * returned; on a stream error FALSE is returned with @error set and the
 * partial result is freed.
 */
static gboolean
rollsum_chunks_crc32 (GInputStream *istream,
OrderedRollsums **out_rollsums,
GCancellable *cancellable,
GError **error)
{
gboolean ret = FALSE;
gsize start = 0;
gboolean rollsum_end = FALSE;
OrderedRollsums *ret_rollsums = g_new0 (OrderedRollsums, 1);
gs_unref_object GBufferedInputStream *bufinput =
(GBufferedInputStream*) g_buffered_input_stream_new_sized (istream, ROLLSUM_BLOB_MAX);
/* `keys` owns the chunk records; `values` is created without destroy
 * functions and only borrows them.
 */
ret_rollsums->keys = g_ptr_array_new_with_free_func ((GDestroyNotify)g_variant_unref);
ret_rollsums->values = g_hash_table_new (NULL, NULL);
while (TRUE)
{
gssize bytes_read;
const guint8 *buf;
gsize bufsize;
int offset, bits;
bytes_read = g_buffered_input_stream_fill (bufinput, -1, cancellable, error);
if (bytes_read == -1)
goto out;
if (bytes_read == 0)
break;
buf = g_buffered_input_stream_peek_buffer (bufinput, &bufsize);
if (!rollsum_end)
{
offset = bupsplit_find_ofs (buf, MIN(G_MAXINT32, bufsize), &bits);
if (offset == 0)
{
/* No split point found: from here on use fixed-size chunks. */
rollsum_end = TRUE;
offset = MIN(ROLLSUM_BLOB_MAX, bufsize);
}
else if (offset > ROLLSUM_BLOB_MAX)
offset = ROLLSUM_BLOB_MAX;
}
else
offset = MIN(ROLLSUM_BLOB_MAX, bufsize);
/* NOTE(review): this consumes `bufsize` bytes, but only `offset` bytes are
 * checksummed below and `start` advances by `offset` -- confirm whether
 * skipping `offset` was intended.
 */
if (!g_input_stream_skip ((GInputStream*)bufinput, bufsize, cancellable, error))
goto out;
/* Use zlib's crc32 */
{ guint32 crc = crc32 (0L, NULL, 0);
GVariant *val;
/* NOTE(review): `buf` was obtained from peek_buffer() before the skip
 * above -- confirm the peeked pointer is still valid at this point.
 */
crc = crc32 (crc, buf, offset);
val = g_variant_ref_sink (g_variant_new ("(utt)", crc, (guint64) start, (guint64)offset));
g_ptr_array_add (ret_rollsums->keys, val);
/* A later chunk with the same crc replaces the earlier table entry. */
g_hash_table_insert (ret_rollsums->values, GUINT_TO_POINTER (crc), val);
}
start += offset;
}
ret = TRUE;
gs_transfer_out_value (out_rollsums, &ret_rollsums);
out:
/* Non-NULL here only when the result was not transferred to the caller. */
if (ret_rollsums)
ordered_rollsums_free (ret_rollsums);
return ret;
}
typedef struct { typedef struct {
char *from_checksum; char *from_checksum;
OrderedRollsums *from_rollsums; OstreeRollsumMatches *matches;
OrderedRollsums *to_rollsums; GBytes *tmp_to;
guint match_ratio;
guint64 match_size;
} ContentRollsum; } ContentRollsum;
static void static void
content_rollsums_free (ContentRollsum *rollsum) content_rollsums_free (ContentRollsum *rollsum)
{ {
g_free (rollsum->from_checksum); g_free (rollsum->from_checksum);
ordered_rollsums_free (rollsum->from_rollsums); _ostree_rollsum_matches_free (rollsum->matches);
ordered_rollsums_free (rollsum->to_rollsums); g_bytes_unref (rollsum->tmp_to);
g_free (rollsum); g_free (rollsum);
} }
/* Load the content object @checksum and, if it is a regular file, splice its
 * uncompressed content into an unlinked temporary file that is then mmap()'d,
 * so callers get a flat buffer they can seek around in.
 *
 * @out_content: (out) (allow-none): the mapped content; set to NULL when the
 *   object is not a regular file (this is how callers tell the two cases
 *   apart).
 * @out_finfo: (out) (allow-none): the file info returned by
 *   ostree_repo_load_file().
 *
 * Returns TRUE on success -- including the "not a regular file" case -- and
 * FALSE with @error set if creating the temporary file, loading the object,
 * splicing, or mapping fails.
 */
static gboolean
get_unpacked_unlinked_content (OstreeRepo       *repo,
                               const char       *checksum,
                               GBytes          **out_content,
                               GFileInfo       **out_finfo,
                               GCancellable     *cancellable,
                               GError          **error)
{
  gboolean ret = FALSE;
  gs_free char *tmpname = g_strdup ("tmpostree-deltaobj-XXXXXX");
  gs_fd_close int fd = -1;
  gs_unref_bytes GBytes *ret_content = NULL;
  gs_unref_object GInputStream *istream = NULL;
  gs_unref_object GFileInfo *ret_finfo = NULL;
  gs_unref_object GOutputStream *out = NULL;

  fd = g_mkstemp (tmpname);
  if (fd == -1)
    {
      gs_set_error_from_errno (error, errno);
      goto out;
    }
  /* Doesn't need a name */
  (void) unlink (tmpname);

  if (!ostree_repo_load_file (repo, checksum, &istream, &ret_finfo, NULL,
                              cancellable, error))
    goto out;

  if (g_file_info_get_file_type (ret_finfo) == G_FILE_TYPE_REGULAR)
    {
      GMappedFile *mfile;

      /* The stream does not own the descriptor (close_fd == FALSE), so it is
       * still open for mapping after the splice closes the stream.
       */
      out = g_unix_output_stream_new (fd, FALSE);
      if (g_output_stream_splice (out, istream, G_OUTPUT_STREAM_SPLICE_CLOSE_TARGET,
                                  cancellable, error) < 0)
        goto out;

      /* Mapping can fail; previously a NULL result was passed straight on
       * to g_mapped_file_get_bytes().
       */
      mfile = g_mapped_file_new_from_fd (fd, FALSE, error);
      if (mfile == NULL)
        goto out;
      ret_content = g_mapped_file_get_bytes (mfile);
      g_mapped_file_unref (mfile);
    }

  ret = TRUE;
  /* For non-regular objects ret_content is still NULL here. */
  gs_transfer_out_value (out_content, &ret_content);
  gs_transfer_out_value (out_finfo, &ret_finfo);
 out:
  return ret;
}
static gboolean static gboolean
try_content_rollsum (OstreeRepo *repo, try_content_rollsum (OstreeRepo *repo,
const char *from, const char *from,
@ -490,85 +473,203 @@ try_content_rollsum (OstreeRepo *repo,
GError **error) GError **error)
{ {
gboolean ret = FALSE; gboolean ret = FALSE;
OrderedRollsums *from_rollsum = NULL; gs_unref_hashtable GHashTable *from_rollsum = NULL;
OrderedRollsums *to_rollsum = NULL; gs_unref_hashtable GHashTable *to_rollsum = NULL;
gs_unref_object GInputStream *from_istream = NULL; gs_unref_bytes GBytes *tmp_from = NULL;
gs_unref_bytes GBytes *tmp_to = NULL;
gs_unref_object GFileInfo *from_finfo = NULL; gs_unref_object GFileInfo *from_finfo = NULL;
gs_unref_object GInputStream *to_istream = NULL;
gs_unref_object GFileInfo *to_finfo = NULL; gs_unref_object GFileInfo *to_finfo = NULL;
OstreeRollsumMatches *matches;
ContentRollsum *ret_rollsum = NULL; ContentRollsum *ret_rollsum = NULL;
guint total = 0;
guint matches = 0;
guint match_ratio = 0;
guint64 match_size = 0;
gpointer hkey, hvalue;
GHashTableIter hiter;
*out_rollsum = NULL; *out_rollsum = NULL;
if (!ostree_repo_load_file (repo, from, &from_istream, &from_finfo, NULL, /* Load the content objects, splice them to uncompressed temporary files that
cancellable, error)) * we can just mmap() and seek around in conveniently.
*/
if (!get_unpacked_unlinked_content (repo, from, &tmp_from, &from_finfo,
cancellable, error))
goto out; goto out;
if (!ostree_repo_load_file (repo, to, &to_istream, &to_finfo, NULL, if (!get_unpacked_unlinked_content (repo, to, &tmp_to, &to_finfo,
cancellable, error)) cancellable, error))
goto out; goto out;
/* Only try to rollsum regular files obviously */ /* Only try to rollsum regular files obviously */
if (!(g_file_info_get_file_type (from_finfo) == G_FILE_TYPE_REGULAR if (!(tmp_from && tmp_to))
&& g_file_info_get_file_type (to_finfo) == G_FILE_TYPE_REGULAR))
{ {
ret = TRUE; ret = TRUE;
goto out; goto out;
} }
g_assert (from_istream && to_istream); matches = _ostree_compute_rollsum_matches (tmp_from, tmp_to);
if (!rollsum_chunks_crc32 (from_istream, &from_rollsum, cancellable, error)) { guint match_ratio = (matches->bufmatches*100)/matches->total;
goto out;
if (!rollsum_chunks_crc32 (to_istream, &to_rollsum, cancellable, error))
goto out;
g_clear_object (&from_istream); /* Only proceed if the file contains (arbitrary) more than 25% of
g_clear_object (&to_istream); * the previous chunks.
*/
if (match_ratio < 25)
{
ret = TRUE;
goto out;
}
}
g_hash_table_iter_init (&hiter, to_rollsum->values); g_printerr ("rollsum for %s; crcs=%u bufs=%u total=%u matchsize=%llu\n",
while (g_hash_table_iter_next (&hiter, &hkey, &hvalue)) to, matches->crcmatches,
{ matches->bufmatches,
GVariant *chunk = hvalue; matches->total, (unsigned long long)matches->match_size);
if (g_hash_table_contains (from_rollsum->values, hkey))
{
guint64 offset;
g_variant_get (chunk, "(utt)", NULL, NULL, &offset);
matches++;
match_size += offset;
}
total++;
}
match_ratio = (matches*100)/total;
/* Only proceed if the file contains (arbitrary) more than 25% of
* the previous chunks.
*/
if (match_ratio < 25)
{
ret = TRUE;
goto out;
}
ret_rollsum = g_new0 (ContentRollsum, 1); ret_rollsum = g_new0 (ContentRollsum, 1);
ret_rollsum->match_ratio = match_ratio;
ret_rollsum->match_size = match_size;
ret_rollsum->from_checksum = g_strdup (from); ret_rollsum->from_checksum = g_strdup (from);
ret_rollsum->from_rollsums = from_rollsum; from_rollsum = NULL; ret_rollsum->matches = matches; matches = NULL;
ret_rollsum->to_rollsums = to_rollsum; to_rollsum = NULL; ret_rollsum->tmp_to = tmp_to; tmp_to = NULL;
ret = TRUE; ret = TRUE;
gs_transfer_out_value (out_rollsum, &ret_rollsum); gs_transfer_out_value (out_rollsum, &ret_rollsum);
out: out:
if (matches)
_ostree_rollsum_matches_free (matches);
return ret; return ret;
} }
/* Append @offset bytes starting at @buf to the payload of @current_part and
 * emit a WRITE opcode for them.
 *
 * Despite its name, @offset is the byte count of the chunk.  The opcode's
 * operands are written as two varuint64s: the byte count, then the position
 * within the payload where the chunk begins.
 */
static void
append_payload_chunk_and_write (OstreeStaticDeltaPartBuilder *current_part,
                                const guint8 *buf,
                                guint64 offset)
{
  /* The chunk lands at the current end of the payload, so capture that
   * position before growing it.
   */
  const guint64 chunk_position = current_part->payload->len;

  g_string_append_len (current_part->payload, (char*)buf, offset);

  g_string_append_c (current_part->operations, (gchar)OSTREE_STATIC_DELTA_OP_WRITE);
  _ostree_write_varuint64 (current_part->operations, offset);
  _ostree_write_varuint64 (current_part->operations, chunk_position);
}
/* Emit the delta operations that rebuild content object @to_checksum from
 * the rollsum matches in @rollsum.
 *
 * Emitted sequence: OPEN (mode offset, xattr offset, content size), then for
 * each match in ascending target order a WRITE of any unmatched bytes that
 * precede it (appended to the part payload) followed by a WRITE whose
 * operands are the match length and its start in the source object, then a
 * WRITE for any unmatched tail, then CLOSE.  SET_READ_SOURCE and
 * UNSET_READ_SOURCE are emitted whenever the stream alternates between the
 * two kinds of WRITE.
 *
 * @current_part_val is updated in place if a new part has to be started.
 * Returns FALSE with @error set if the object cannot be loaded.
 */
static gboolean
process_one_rollsum (OstreeRepo *repo,
OstreeStaticDeltaBuilder *builder,
OstreeStaticDeltaPartBuilder **current_part_val,
const char *to_checksum,
ContentRollsum *rollsum,
GCancellable *cancellable,
GError **error)
{
gboolean ret = FALSE;
guint64 content_size;
/* content_stream is filled in by ostree_repo_load_file() but not read in
 * this function; the content comes from rollsum->tmp_to.
 */
gs_unref_object GInputStream *content_stream = NULL;
gs_unref_object GFileInfo *content_finfo = NULL;
gs_unref_variant GVariant *content_xattrs = NULL;
OstreeStaticDeltaPartBuilder *current_part = *current_part_val;
const guint8 *tmp_to_buf;
gsize tmp_to_len;
/* Check to see if this delta has gone over maximum size */
if (current_part->objects->len > 0 &&
current_part->payload->len > builder->max_chunk_size_bytes)
{
*current_part_val = current_part = allocate_part (builder);
}
tmp_to_buf = g_bytes_get_data (rollsum->tmp_to, &tmp_to_len);
if (!ostree_repo_load_file (repo, to_checksum, &content_stream,
&content_finfo, &content_xattrs,
cancellable, error))
goto out;
content_size = g_file_info_get_size (content_finfo);
/* The buffer held in the rollsum must be exactly the object's content. */
g_assert_cmpint (tmp_to_len, ==, content_size);
current_part->uncompressed_size += content_size;
g_ptr_array_add (current_part->objects, ostree_object_name_serialize (to_checksum, OSTREE_OBJECT_TYPE_FILE));
{ gsize mode_offset, xattr_offset, from_csum_offset;
/* TRUE while the last read-source opcode emitted was UNSET (or none has
 * been emitted yet); FALSE after SET_READ_SOURCE.
 */
gboolean reading_payload = TRUE;
guchar source_csum[32];
guint i;
write_content_mode_xattrs (repo, current_part, content_finfo, content_xattrs,
&mode_offset, &xattr_offset);
/* Write the origin checksum */
ostree_checksum_inplace_to_bytes (rollsum->from_checksum, source_csum);
/* Remember where the source checksum starts in the payload; it is the
 * operand of every SET_READ_SOURCE emitted below.
 */
from_csum_offset = current_part->payload->len;
g_string_append_len (current_part->payload, (char*)source_csum, sizeof (source_csum));
g_string_append_c (current_part->operations, (gchar)OSTREE_STATIC_DELTA_OP_OPEN);
_ostree_write_varuint64 (current_part->operations, mode_offset);
_ostree_write_varuint64 (current_part->operations, xattr_offset);
_ostree_write_varuint64 (current_part->operations, content_size);
/* writing_offset counts how many bytes of the target have been accounted
 * for by the WRITE opcodes emitted so far.
 */
{ guint64 writing_offset = 0;
guint64 offset = 0, to_start = 0, from_start = 0;
GPtrArray *matchlist = rollsum->matches->matches;
g_assert (matchlist->len > 0);
for (i = 0; i < matchlist->len; i++)
{
GVariant *match = matchlist->pdata[i];
guint32 crc;
guint64 prefix;
/* Match tuple: (crc, length, start in target, start in source). */
g_variant_get (match, "(uttt)", &crc, &offset, &to_start, &from_start);
/* Target bytes between the previous match and this one have no
 * counterpart in the source and are shipped literally in the payload.
 * NOTE(review): relies on matches being sorted by to_start and not
 * overlapping; otherwise this unsigned subtraction would wrap.
 */
prefix = to_start - writing_offset;
if (prefix > 0)
{
if (!reading_payload)
{
g_string_append_c (current_part->operations, (gchar)OSTREE_STATIC_DELTA_OP_UNSET_READ_SOURCE);
reading_payload = TRUE;
}
g_assert_cmpint (writing_offset + prefix, <=, tmp_to_len);
append_payload_chunk_and_write (current_part, tmp_to_buf + writing_offset, prefix);
writing_offset += prefix;
}
if (reading_payload)
{
g_string_append_c (current_part->operations, (gchar)OSTREE_STATIC_DELTA_OP_SET_READ_SOURCE);
_ostree_write_varuint64 (current_part->operations, from_csum_offset);
reading_payload = FALSE;
}
/* The matched chunk itself: length, then its start in the source object. */
g_string_append_c (current_part->operations, (gchar)OSTREE_STATIC_DELTA_OP_WRITE);
_ostree_write_varuint64 (current_part->operations, offset);
_ostree_write_varuint64 (current_part->operations, from_start);
writing_offset += offset;
}
if (!reading_payload)
{
g_string_append_c (current_part->operations, (gchar)OSTREE_STATIC_DELTA_OP_UNSET_READ_SOURCE);
reading_payload = TRUE;
}
/* Whatever follows the last match is shipped literally as well. */
{ guint64 remainder = tmp_to_len - writing_offset;
if (remainder > 0)
append_payload_chunk_and_write (current_part, tmp_to_buf + writing_offset, remainder);
writing_offset += remainder;
g_assert_cmpint (writing_offset, ==, tmp_to_len);
}
g_assert_cmpint (writing_offset, ==, content_size);
}
g_string_append_c (current_part->operations, (gchar)OSTREE_STATIC_DELTA_OP_CLOSE);
}
ret = TRUE;
out:
return ret;
}
static gboolean static gboolean
generate_delta_lowlatency (OstreeRepo *repo, generate_delta_lowlatency (OstreeRepo *repo,
const char *from, const char *from,
@ -694,46 +795,13 @@ generate_delta_lowlatency (OstreeRepo *repo,
continue; continue;
g_hash_table_insert (rollsum_optimized_content_objects, g_strdup (to_checksum), rollsum); g_hash_table_insert (rollsum_optimized_content_objects, g_strdup (to_checksum), rollsum);
builder->rollsum_size += rollsum->match_size; builder->rollsum_size += rollsum->matches->match_size;
} }
g_printerr ("rollsum for %u/%u modified\n", g_printerr ("rollsum for %u/%u modified\n",
g_hash_table_size (rollsum_optimized_content_objects), g_hash_table_size (rollsum_optimized_content_objects),
g_hash_table_size (modified_content_objects)); g_hash_table_size (modified_content_objects));
/* Scan for large objects, so we can fall back to plain HTTP-based
* fetch. In the future this should come after an rsync-style
* rolling delta check for modified files.
*/
g_hash_table_iter_init (&hashiter, new_reachable_content);
while (g_hash_table_iter_next (&hashiter, &key, &value))
{
GVariant *serialized_key = key;
const char *checksum;
OstreeObjectType objtype;
guint64 uncompressed_size;
gboolean fallback = FALSE;
ostree_object_name_deserialize (serialized_key, &checksum, &objtype);
if (!ostree_repo_load_object_stream (repo, objtype, checksum,
NULL, &uncompressed_size,
cancellable, error))
goto out;
if (uncompressed_size > builder->min_fallback_size_bytes)
fallback = TRUE;
if (fallback)
{
gs_free char *size = g_format_size (uncompressed_size);
g_printerr ("fallback for %s (%s)\n",
ostree_object_to_string (checksum, objtype), size);
g_ptr_array_add (builder->fallback_objects,
g_variant_ref (serialized_key));
g_hash_table_iter_remove (&hashiter);
}
}
current_part = allocate_part (builder); current_part = allocate_part (builder);
/* Pack the metadata first */ /* Pack the metadata first */
@ -752,7 +820,57 @@ generate_delta_lowlatency (OstreeRepo *repo,
goto out; goto out;
} }
/* Now content */ /* Now do rollsummed objects */
g_hash_table_iter_init (&hashiter, rollsum_optimized_content_objects);
while (g_hash_table_iter_next (&hashiter, &key, &value))
{
const char *checksum = key;
ContentRollsum *rollsum = value;
if (!process_one_rollsum (repo, builder, &current_part,
checksum, rollsum,
cancellable, error))
goto out;
}
/* Scan for large objects, so we can fall back to plain HTTP-based
* fetch.
*/
g_hash_table_iter_init (&hashiter, new_reachable_content);
while (g_hash_table_iter_next (&hashiter, &key, &value))
{
GVariant *serialized_key = key;
const char *checksum;
OstreeObjectType objtype;
guint64 uncompressed_size;
gboolean fallback = FALSE;
ostree_object_name_deserialize (serialized_key, &checksum, &objtype);
/* Skip content objects we rollsum'd */
if (g_hash_table_contains (rollsum_optimized_content_objects, checksum))
continue;
if (!ostree_repo_load_object_stream (repo, objtype, checksum,
NULL, &uncompressed_size,
cancellable, error))
goto out;
if (uncompressed_size > builder->min_fallback_size_bytes)
fallback = TRUE;
if (fallback)
{
gs_free char *size = g_format_size (uncompressed_size);
g_printerr ("fallback for %s (%s)\n",
ostree_object_to_string (checksum, objtype), size);
g_ptr_array_add (builder->fallback_objects,
g_variant_ref (serialized_key));
g_hash_table_iter_remove (&hashiter);
}
}
/* Now non-rollsummed content */
g_hash_table_iter_init (&hashiter, new_reachable_content); g_hash_table_iter_init (&hashiter, new_reachable_content);
while (g_hash_table_iter_next (&hashiter, &key, &value)) while (g_hash_table_iter_next (&hashiter, &key, &value))
{ {
@ -762,6 +880,10 @@ generate_delta_lowlatency (OstreeRepo *repo,
ostree_object_name_deserialize (serialized_key, &checksum, &objtype); ostree_object_name_deserialize (serialized_key, &checksum, &objtype);
/* Skip content objects we rollsum'd */
if (g_hash_table_contains (rollsum_optimized_content_objects, checksum))
continue;
if (!process_one_object (repo, builder, &current_part, if (!process_one_object (repo, builder, &current_part,
checksum, objtype, checksum, objtype,
cancellable, error)) cancellable, error))

View File

@ -135,7 +135,8 @@ typedef enum {
OSTREE_STATIC_DELTA_OP_OPEN_SPLICE_AND_CLOSE = 'S', OSTREE_STATIC_DELTA_OP_OPEN_SPLICE_AND_CLOSE = 'S',
OSTREE_STATIC_DELTA_OP_OPEN = 'o', OSTREE_STATIC_DELTA_OP_OPEN = 'o',
OSTREE_STATIC_DELTA_OP_WRITE = 'w', OSTREE_STATIC_DELTA_OP_WRITE = 'w',
OSTREE_STATIC_DELTA_OP_SET_READ_SOURCE = 'R', OSTREE_STATIC_DELTA_OP_SET_READ_SOURCE = 'r',
OSTREE_STATIC_DELTA_OP_UNSET_READ_SOURCE = 'R',
OSTREE_STATIC_DELTA_OP_CLOSE = 'c' OSTREE_STATIC_DELTA_OP_CLOSE = 'c'
} OstreeStaticDeltaOpCode; } OstreeStaticDeltaOpCode;

View File

@ -98,6 +98,7 @@ OPPROTO(open_splice_and_close)
OPPROTO(open) OPPROTO(open)
OPPROTO(write) OPPROTO(write)
OPPROTO(set_read_source) OPPROTO(set_read_source)
OPPROTO(unset_read_source)
OPPROTO(close) OPPROTO(close)
#undef OPPROTO #undef OPPROTO
@ -250,6 +251,10 @@ _ostree_static_delta_part_execute_raw (OstreeRepo *repo,
if (!dispatch_set_read_source (repo, state, cancellable, error)) if (!dispatch_set_read_source (repo, state, cancellable, error))
goto out; goto out;
break; break;
case OSTREE_STATIC_DELTA_OP_UNSET_READ_SOURCE:
if (!dispatch_unset_read_source (repo, state, cancellable, error))
goto out;
break;
case OSTREE_STATIC_DELTA_OP_CLOSE: case OSTREE_STATIC_DELTA_OP_CLOSE:
if (!dispatch_close (repo, state, cancellable, error)) if (!dispatch_close (repo, state, cancellable, error))
goto out; goto out;
@ -491,7 +496,7 @@ dispatch_open_splice_and_close (OstreeRepo *repo,
if (!open_output_target (state, cancellable, error)) if (!open_output_target (state, cancellable, error))
goto out; goto out;
if (OSTREE_OBJECT_TYPE_IS_META (state->output_objtype)) if (OSTREE_OBJECT_TYPE_IS_META (state->output_objtype))
{ {
gs_unref_variant GVariant *metadata = NULL; gs_unref_variant GVariant *metadata = NULL;
@ -519,7 +524,6 @@ dispatch_open_splice_and_close (OstreeRepo *repo,
{ {
guint64 content_offset; guint64 content_offset;
guint64 objlen; guint64 objlen;
guint64 content_size;
gsize bytes_written; gsize bytes_written;
gs_unref_object GInputStream *object_input = NULL; gs_unref_object GInputStream *object_input = NULL;
gs_unref_object GInputStream *memin = NULL; gs_unref_object GInputStream *memin = NULL;
@ -527,11 +531,11 @@ dispatch_open_splice_and_close (OstreeRepo *repo,
if (!do_content_open_generic (repo, state, cancellable, error)) if (!do_content_open_generic (repo, state, cancellable, error))
goto out; goto out;
if (!read_varuint64 (state, &content_size, error)) if (!read_varuint64 (state, &state->content_size, error))
goto out; goto out;
if (!read_varuint64 (state, &content_offset, error)) if (!read_varuint64 (state, &content_offset, error))
goto out; goto out;
if (!validate_ofs (state, content_offset, content_size, error)) if (!validate_ofs (state, content_offset, state->content_size, error))
goto out; goto out;
/* Fast path for regular files to bare repositories */ /* Fast path for regular files to bare repositories */
@ -551,7 +555,7 @@ dispatch_open_splice_and_close (OstreeRepo *repo,
{ {
if (!g_output_stream_write_all (state->content_out, if (!g_output_stream_write_all (state->content_out,
state->payload_data + content_offset, state->payload_data + content_offset,
content_size, state->content_size,
&bytes_written, &bytes_written,
cancellable, error)) cancellable, error))
goto out; goto out;
@ -567,14 +571,14 @@ dispatch_open_splice_and_close (OstreeRepo *repo,
if (S_ISLNK (state->mode)) if (S_ISLNK (state->mode))
{ {
gs_free char *nulterminated_target = gs_free char *nulterminated_target =
g_strndup ((char*)state->payload_data + content_offset, content_size); g_strndup ((char*)state->payload_data + content_offset, state->content_size);
g_file_info_set_symlink_target (finfo, nulterminated_target); g_file_info_set_symlink_target (finfo, nulterminated_target);
} }
else else
{ {
g_assert (S_ISREG (state->mode)); g_assert (S_ISREG (state->mode));
g_file_info_set_size (finfo, content_size); g_file_info_set_size (finfo, state->content_size);
memin = g_memory_input_stream_new_from_data (state->payload_data + content_offset, content_size, NULL); memin = g_memory_input_stream_new_from_data (state->payload_data + content_offset, state->content_size, NULL);
} }
if (!ostree_raw_file_to_content_stream (memin, finfo, state->xattrs, if (!ostree_raw_file_to_content_stream (memin, finfo, state->xattrs,
@ -621,6 +625,9 @@ dispatch_open (OstreeRepo *repo,
if (!do_content_open_generic (repo, state, cancellable, error)) if (!do_content_open_generic (repo, state, cancellable, error))
goto out; goto out;
if (!read_varuint64 (state, &state->content_size, error))
goto out;
if (!_ostree_repo_open_trusted_content_bare (repo, state->checksum, if (!_ostree_repo_open_trusted_content_bare (repo, state->checksum,
state->content_size, state->content_size,
&state->barecommitstate, &state->barecommitstate,
@ -651,8 +658,6 @@ dispatch_write (OstreeRepo *repo,
goto out; goto out;
if (!read_varuint64 (state, &content_offset, error)) if (!read_varuint64 (state, &content_offset, error))
goto out; goto out;
if (!validate_ofs (state, content_offset, content_size, error))
goto out;
if (!state->have_obj) if (!state->have_obj)
{ {
@ -695,6 +700,9 @@ dispatch_write (OstreeRepo *repo,
} }
else else
{ {
if (!validate_ofs (state, content_offset, content_size, error))
goto out;
if (!g_output_stream_write_all (state->content_out, if (!g_output_stream_write_all (state->content_out,
state->payload_data + content_offset, state->payload_data + content_offset,
content_size, content_size,
@ -745,6 +753,29 @@ dispatch_set_read_source (OstreeRepo *repo,
return ret; return ret;
} }
/* Handler for OSTREE_STATIC_DELTA_OP_UNSET_READ_SOURCE: close the current
 * read-source descriptor, if one is held, and forget the read-source object.
 *
 * Safe to call repeatedly; dispatch_close() calls it again after the opcode
 * stream may already have done so.  This handler cannot fail: it always
 * returns TRUE and never sets @error.  @repo and @cancellable are unused and
 * are present only to match the common dispatch_* signature.
 */
static gboolean
dispatch_unset_read_source (OstreeRepo                 *repo,
                            StaticDeltaExecutionState  *state,
                            GCancellable               *cancellable,
                            GError                    **error)
{
  /* -1 is the "closed" sentinel stored below.  A plain truthiness test
   * treats it as an open descriptor and issues close(-1) on every repeated
   * call.  0 is still skipped, exactly as before.
   */
  if (state->read_source_fd > 0)
    {
      (void) close (state->read_source_fd);
      state->read_source_fd = -1;
    }

  g_clear_pointer (&state->read_source_object, g_free);

  return TRUE;
}
static gboolean static gboolean
dispatch_close (OstreeRepo *repo, dispatch_close (OstreeRepo *repo,
StaticDeltaExecutionState *state, StaticDeltaExecutionState *state,
@ -765,14 +796,10 @@ dispatch_close (OstreeRepo *repo,
goto out; goto out;
} }
if (state->read_source_fd) if (!dispatch_unset_read_source (repo, state, cancellable, error))
{ goto out;
(void) close (state->read_source_fd);
state->read_source_fd = -1;
}
g_clear_pointer (&state->xattrs, g_variant_unref); g_clear_pointer (&state->xattrs, g_variant_unref);
g_clear_pointer (&state->read_source_object, g_free);
g_clear_object (&state->content_out); g_clear_object (&state->content_out);
state->checksum_index++; state->checksum_index++;

View File

@ -0,0 +1,201 @@
/* -*- mode: C; c-file-style: "gnu"; indent-tabs-mode: nil; -*-
*
* Copyright (C) 2015 Colin Walters <walters@verbum.org>
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 02111-1307, USA.
*/
#include "config.h"
#include <string.h>
#include <zlib.h>
#include "ostree-rollsum.h"
#include "libgsystem.h"
#include "bupsplit.h"
#define ROLLSUM_BLOB_MAX (8192*4)
/* Split @bytes into chunks and index them by the CRC32 of their contents.
 *
 * Split points come from bupsplit_find_ofs(), capped at ROLLSUM_BLOB_MAX
 * bytes per chunk.  Once it returns 0 (no further split point), the rest of
 * the buffer is cut into ROLLSUM_BLOB_MAX-sized pieces.
 *
 * Returns: a new hash table mapping crc (as GUINT_TO_POINTER) to a GPtrArray
 * of "(utt)" variants (crc, chunk start, chunk length), one per chunk with
 * that crc.  The table owns the arrays and the arrays own the variants.
 * Empty input yields an empty table.
 */
static GHashTable *
rollsum_chunks_crc32 (GBytes *bytes)
{
  gsize start = 0;
  gboolean rollsum_end = FALSE;
  GHashTable *ret_rollsums = NULL;
  const guint8 *buf;
  gsize buflen;
  gsize remaining;

  ret_rollsums = g_hash_table_new_full (NULL, NULL, NULL, (GDestroyNotify)g_ptr_array_unref);

  buf = g_bytes_get_data (bytes, &buflen);

  remaining = buflen;
  while (remaining > 0)
    {
      int offset, bits;

      if (!rollsum_end)
        {
          offset = bupsplit_find_ofs (buf + start, MIN(G_MAXINT32, remaining), &bits);
          if (offset == 0)
            {
              rollsum_end = TRUE;
              offset = MIN(ROLLSUM_BLOB_MAX, remaining);
            }
          else if (offset > ROLLSUM_BLOB_MAX)
            offset = ROLLSUM_BLOB_MAX;
        }
      else
        offset = MIN(ROLLSUM_BLOB_MAX, remaining);

      /* Use zlib's crc32 */
      { guint32 crc = crc32 (0L, NULL, 0);
        GVariant *val;
        GPtrArray *matches;

        /* Checksum the chunk itself, i.e. the bytes at buf + start.  Using
         * plain `buf` would checksum the beginning of the file for every
         * chunk, so the key would not describe the chunk's contents.
         */
        crc = crc32 (crc, buf + start, offset);

        val = g_variant_ref_sink (g_variant_new ("(utt)", crc, (guint64) start, (guint64)offset));
        matches = g_hash_table_lookup (ret_rollsums, GUINT_TO_POINTER (crc));
        if (!matches)
          {
            matches = g_ptr_array_new_with_free_func ((GDestroyNotify)g_variant_unref);
            g_hash_table_insert (ret_rollsums, GUINT_TO_POINTER (crc), matches);
          }
        g_ptr_array_add (matches, val);
      }

      start += offset;
      remaining -= offset;
    }

  return ret_rollsums;
}
static gint
compare_matches (const void *app,
const void *bpp)
{
GVariant **avpp = (GVariant**)app;
GVariant *a = *avpp;
GVariant **bvpp = (GVariant**)bpp;
GVariant *b = *bvpp;
guint64 a_start, b_start;
g_variant_get_child (a, 2, "t", &a_start);
g_variant_get_child (b, 2, "t", &b_start);
g_assert_cmpint (a_start, !=, b_start);
if (a_start < b_start)
return -1;
return 1;
}
/* Work out which chunks of @to also occur, byte for byte, in @from.
 *
 * Both buffers are chunked with rollsum_chunks_crc32().  For every chunk of
 * @to whose crc also occurs in @from, the candidate chunks of @from are
 * compared with memcmp(); the first identical one is recorded as a
 * "(uttt)" match of (crc, length, start in @to, start in @from).
 *
 * Returns: a newly allocated OstreeRollsumMatches (free with
 * _ostree_rollsum_matches_free()) in which
 *   - total       is the number of chunks in @to (0 if @to is empty, so
 *                 callers computing a ratio must guard against that),
 *   - crcmatches  is the number of distinct crcs of @to also present in @from,
 *   - bufmatches  is the number of chunks of @to with a verified match,
 *   - match_size  is the total length of those chunks,
 *   - matches     holds the match variants sorted by start position in @to.
 */
OstreeRollsumMatches *
_ostree_compute_rollsum_matches (GBytes                           *from,
                                 GBytes                           *to)
{
  OstreeRollsumMatches *ret_rollsum = NULL;
  gs_unref_hashtable GHashTable *from_rollsum = NULL;
  gs_unref_hashtable GHashTable *to_rollsum = NULL;
  gs_unref_ptrarray GPtrArray *matches = NULL;
  const guint8 *from_buf;
  const guint8 *to_buf;
  gpointer hkey, hvalue;
  GHashTableIter hiter;

  ret_rollsum = g_new0 (OstreeRollsumMatches, 1);

  matches = g_ptr_array_new_with_free_func ((GDestroyNotify)g_variant_unref);

  from_buf = g_bytes_get_data (from, NULL);
  to_buf = g_bytes_get_data (to, NULL);

  from_rollsum = rollsum_chunks_crc32 (from);
  to_rollsum = rollsum_chunks_crc32 (to);

  g_hash_table_iter_init (&hiter, to_rollsum);
  while (g_hash_table_iter_next (&hiter, &hkey, &hvalue))
    {
      GPtrArray *to_chunks = hvalue;
      GPtrArray *from_chunks;

      from_chunks = g_hash_table_lookup (from_rollsum, hkey);
      if (from_chunks != NULL)
        {
          guint i;

          ret_rollsum->crcmatches++;

          for (i = 0; i < to_chunks->len; i++)
            {
              GVariant *to_chunk = to_chunks->pdata[i];
              guint64 to_start, to_offset;
              guint32 tocrc;
              guint j;

              g_variant_get (to_chunk, "(utt)", &tocrc, &to_start, &to_offset);

              for (j = 0; j < from_chunks->len; j++)
                {
                  GVariant *from_chunk = from_chunks->pdata[j];
                  guint32 fromcrc;
                  guint64 from_start, from_offset;

                  g_variant_get (from_chunk, "(utt)", &fromcrc, &from_start, &from_offset);

                  /* Both arrays were found under the same table key. */
                  g_assert (fromcrc == tocrc);

                  /* CRC32 is not collision free: two chunks of different
                   * length can share a crc.  That is ordinary input data,
                   * not a programming error, so skip the candidate instead
                   * of aborting.
                   */
                  if (to_offset != from_offset)
                    continue;

                  /* Rsync uses a cryptographic checksum, but let's be
                   * very conservative here and just memcmp.
                   */
                  if (memcmp (from_buf + from_start, to_buf + to_start, to_offset) == 0)
                    {
                      GVariant *match = g_variant_new ("(uttt)", fromcrc, to_offset, to_start, from_start);

                      ret_rollsum->bufmatches++;
                      ret_rollsum->match_size += to_offset;
                      g_ptr_array_add (matches, g_variant_ref_sink (match));
                      break; /* Don't need any more matches */
                    }
                }
            }
        }

      ret_rollsum->total += to_chunks->len;
    }

  g_ptr_array_sort (matches, compare_matches);

  /* Hand the containers over to the result; clearing the locals keeps the
   * cleanup attributes from releasing them.
   */
  ret_rollsum->from_rollsums = from_rollsum; from_rollsum = NULL;
  ret_rollsum->to_rollsums = to_rollsum; to_rollsum = NULL;
  ret_rollsum->matches = matches; matches = NULL;

  return ret_rollsum;
}
/*
 * _ostree_rollsum_matches_free:
 * @rollsum: result previously returned by _ostree_compute_rollsum_matches()
 *
 * Drops the references held on the match array and on both chunk tables,
 * then frees the structure itself.
 */
void
_ostree_rollsum_matches_free (OstreeRollsumMatches *rollsum)
{
  g_ptr_array_unref (rollsum->matches);
  g_hash_table_unref (rollsum->from_rollsums);
  g_hash_table_unref (rollsum->to_rollsums);

  g_free (rollsum);
}

View File

@ -0,0 +1,44 @@
/* -*- mode: C; c-file-style: "gnu"; indent-tabs-mode: nil; -*-
*
* Copyright (C) 2015 Colin Walters <walters@verbum.org>
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 02111-1307, USA.
*/
#pragma once
#include <gio/gio.h>
G_BEGIN_DECLS
/* Result of _ostree_compute_rollsum_matches(): the chunk tables computed for
 * both inputs plus statistics and the list of matching chunks.
 */
typedef struct {
/* Chunk table of the "from" input: crc32 -> GPtrArray of "(utt)" GVariants
 * (crc32, start offset, length). */
GHashTable *from_rollsums;
/* Chunk table of the "to" input, same layout as from_rollsums. */
GHashTable *to_rollsums;
/* Number of crc32 keys of the "to" table that also exist in the "from" table. */
guint crcmatches;
/* Number of "to" chunks for which a byte-identical "from" chunk was found. */
guint bufmatches;
/* Total number of chunks in the "to" input. */
guint total;
/* Sum of the lengths, in bytes, of all matched chunks. */
guint64 match_size;
/* GPtrArray of "(uttt)" GVariants: (crc32, length, offset in "to",
 * offset in "from"), sorted by offset in "to". */
GPtrArray *matches;
} OstreeRollsumMatches;
/* Chunks @from and @to and records which chunks of @to have byte-identical
 * counterparts in @from.  Returns a newly allocated result that must be
 * released with _ostree_rollsum_matches_free().
 */
OstreeRollsumMatches *
_ostree_compute_rollsum_matches (GBytes *from,
GBytes *to);
/* Releases @rollsum together with the hash tables and match array it holds. */
void _ostree_rollsum_matches_free (OstreeRollsumMatches *rollsum);
G_END_DECLS

View File

@ -20,138 +20,51 @@
#include "config.h" #include "config.h"
#include "ostree-rollsum.h"
#include <unistd.h>
#include <stdlib.h>
#include "libgsystem.h" #include "libgsystem.h"
#include "bupsplit.h"
#define BLOB_MAX (8192*4)
/*
 * Splits @bytes into chunks at the offsets reported by bupsplit_find_ofs(),
 * limiting every chunk to BLOB_MAX bytes, and computes the SHA256 of each
 * chunk.
 *
 * Returns a new GPtrArray of GVariant "(st)" tuples (hex checksum, chunk
 * length), one per chunk in input order.
 */
static GPtrArray *
rollsum_checksums_for_data (GBytes *bytes)
{
  GPtrArray *chunks = g_ptr_array_new_with_free_func ((GDestroyNotify)g_variant_unref);
  gboolean fixed_size_mode = FALSE;
  const guint8 *cursor;
  gsize remaining;

  cursor = g_bytes_get_data (bytes, &remaining);

  while (remaining > 0)
    {
      int chunk_len = 0;
      int bits;
      char *digest;

      if (!fixed_size_mode)
        {
          chunk_len = bupsplit_find_ofs (cursor, MIN(G_MAXINT32, remaining), &bits);
          /* Once bupsplit reports 0, never consult it again. */
          if (chunk_len == 0)
            fixed_size_mode = TRUE;
        }

      if (fixed_size_mode)
        chunk_len = MIN(BLOB_MAX, remaining);
      else if (chunk_len > BLOB_MAX)
        chunk_len = BLOB_MAX;

      digest = g_compute_checksum_for_data (G_CHECKSUM_SHA256, cursor, chunk_len);
      g_ptr_array_add (chunks,
                       g_variant_ref_sink (g_variant_new ("(st)", digest, (guint64)chunk_len)));
      g_free (digest);

      cursor += chunk_len;
      remaining -= chunk_len;
    }

  return chunks;
}
/*
 * Prints one "chunk <checksum> <length>" line to stdout for every "(st)"
 * tuple stored in @rollsums.
 */
static void
print_rollsums (GPtrArray *rollsums)
{
  guint idx = 0;

  while (idx < rollsums->len)
    {
      const char *digest;
      guint64 length;

      /* "&s" borrows the string from the variant; nothing to free. */
      g_variant_get ((GVariant *) rollsums->pdata[idx], "(&st)", &digest, &length);
      g_print ("chunk %s %" G_GUINT64_FORMAT "\n", digest, length);
      idx++;
    }
}
int int
main (int argc, char **argv) main (int argc, char **argv)
{ {
GCancellable *cancellable = NULL;
GError *local_error = NULL; GError *local_error = NULL;
GError **error = &local_error; GError **error = &local_error;
gs_unref_object GFile *path = NULL; GBytes *from_bytes = NULL;
GBytes *bytes = NULL; GBytes *to_bytes = NULL;
const char *from_path;
const char *to_path;
OstreeRollsumMatches *matches;
GMappedFile *mfile;
g_setenv ("GIO_USE_VFS", "local", TRUE); g_setenv ("GIO_USE_VFS", "local", TRUE);
if (argc == 2) if (argc < 3)
{ exit (1);
gs_unref_ptrarray GPtrArray *rollsums = NULL;
path = g_file_new_for_path (argv[1]); from_path = argv[1];
bytes = gs_file_map_readonly (path, cancellable, error); to_path = argv[2];
if (!bytes)
goto out;
rollsums = rollsum_checksums_for_data (bytes); mfile = g_mapped_file_new (from_path, FALSE, error);
print_rollsums (rollsums); if (!mfile)
} goto out;
else if (argc > 2) from_bytes = g_mapped_file_get_bytes (mfile);
{ g_mapped_file_unref (mfile);
guint i; mfile = g_mapped_file_new (to_path, FALSE, error);
gs_unref_hashtable GHashTable *sums = g_hash_table_new_full (g_str_hash, g_str_equal, g_free, NULL); if (!mfile)
guint64 input_size = 0; goto out;
guint64 rollsum_size = 0; to_bytes = g_mapped_file_get_bytes (mfile);
g_mapped_file_unref (mfile);
for (i = 1; i < argc; i++) matches = _ostree_compute_rollsum_matches (from_bytes, to_bytes);
{
guint j;
gs_unref_ptrarray GPtrArray *rollsums = NULL;
guint64 this_rollsum_size = 0;
path = g_file_new_for_path (argv[i]); g_printerr ("rollsum crcs=%u bufs=%u total=%u matchsize=%llu\n",
bytes = gs_file_map_readonly (path, cancellable, error); matches->crcmatches,
if (!bytes) matches->bufmatches,
goto out; matches->total, (unsigned long long)matches->match_size);
input_size += g_bytes_get_size (bytes);
g_print ("input: %s size: %" G_GUINT64_FORMAT "\n", argv[i], g_bytes_get_size (bytes));
rollsums = rollsum_checksums_for_data (bytes);
print_rollsums (rollsums);
for (j = 0; j < rollsums->len; j++)
{
GVariant *sum = rollsums->pdata[j];
const char *csum;
guint64 ofs;
g_variant_get (sum, "(&st)", &csum, &ofs);
if (!g_hash_table_contains (sums, csum))
{
g_hash_table_add (sums, g_strdup (csum));
rollsum_size += ofs;
}
this_rollsum_size += ofs;
}
g_print ("input: rollsum size: %" G_GUINT64_FORMAT "\n", this_rollsum_size);
}
g_print ("rollsum total:%u input:%" G_GUINT64_FORMAT " output: %" G_GUINT64_FORMAT " speedup:%f\n",
g_hash_table_size (sums), input_size, rollsum_size,
(((double)(input_size+1)) / ((double) rollsum_size + 1)));
}
else
{
bupsplit_selftest ();
}
out: out:
g_clear_pointer (&bytes, g_bytes_unref);
if (local_error) if (local_error)
{ {
g_printerr ("%s\n", local_error->message); g_printerr ("%s\n", local_error->message);