diff --git a/Makefile-libostree.am b/Makefile-libostree.am
index f77a36a7..a8ba8bc9 100644
--- a/Makefile-libostree.am
+++ b/Makefile-libostree.am
@@ -91,6 +91,7 @@ libostree_1_la_SOURCES = \
 	src/libostree/ostree-repo-static-delta-core.c \
 	src/libostree/ostree-repo-static-delta-processing.c \
 	src/libostree/ostree-repo-static-delta-compilation.c \
+	src/libostree/ostree-repo-static-delta-compilation-analysis.c \
 	src/libostree/ostree-repo-static-delta-private.h \
 	$(NULL)
 if USE_LIBARCHIVE
diff --git a/src/libostree/ostree-repo-static-delta-compilation-analysis.c b/src/libostree/ostree-repo-static-delta-compilation-analysis.c
new file mode 100644
index 00000000..39c818fe
--- /dev/null
+++ b/src/libostree/ostree-repo-static-delta-compilation-analysis.c
@@ -0,0 +1,333 @@
+/* -*- mode: C; c-file-style: "gnu"; indent-tabs-mode: nil; -*-
+ *
+ * Copyright (C) 2015 Colin Walters
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#include "config.h"
+
+#include <string.h>
+#include <gio/gunixoutputstream.h>
+
+#include "ostree-core-private.h"
+#include "ostree-repo-private.h"
+#include "ostree-lzma-compressor.h"
+#include "ostree-repo-static-delta-private.h"
+#include "ostree-diff.h"
+#include "ostree-rollsum.h"
+#include "otutil.h"
+#include "ostree-varint.h"
+
+/* Free an OstreeDeltaContentSizeNames together with its checksum string
+ * and basenames array.  The basenames array may still be NULL if the
+ * entry was only partially constructed before an error.
+ */
+void
+_ostree_delta_content_sizenames_free (gpointer v)
+{
+  OstreeDeltaContentSizeNames *ce = v;
+  g_free (ce->checksum);
+  if (ce->basenames)
+    g_ptr_array_unref (ce->basenames);
+  g_free (ce);
+}
+
+/* Walk every entry produced by @iter, descending into subdirectories,
+ * and record in @sizenames_map (checksum -> OstreeDeltaContentSizeNames)
+ * the size and all basenames seen for each file object.  When
+ * @include_only_objects is non-NULL, files whose checksum is not a key
+ * of that table are skipped.
+ */
+static gboolean
+build_content_sizenames_recurse (OstreeRepo                     *repo,
+                                 OstreeRepoCommitTraverseIter   *iter,
+                                 GHashTable                     *sizenames_map,
+                                 GHashTable                     *include_only_objects,
+                                 GCancellable                   *cancellable,
+                                 GError                        **error)
+{
+  gboolean ret = FALSE;
+
+  while (TRUE)
+    {
+      OstreeRepoCommitIterResult iterres =
+        ostree_repo_commit_traverse_iter_next (iter, cancellable, error);
+
+      if (iterres == OSTREE_REPO_COMMIT_ITER_RESULT_ERROR)
+        goto out;
+      else if (iterres == OSTREE_REPO_COMMIT_ITER_RESULT_END)
+        break;
+      else if (iterres == OSTREE_REPO_COMMIT_ITER_RESULT_FILE)
+        {
+          char *name;
+          char *checksum;
+          OstreeDeltaContentSizeNames *csizenames;
+
+          ostree_repo_commit_traverse_iter_get_file (iter, &name, &checksum);
+
+          if (include_only_objects && !g_hash_table_contains (include_only_objects, checksum))
+            continue;
+
+          csizenames = g_hash_table_lookup (sizenames_map, checksum);
+          if (!csizenames)
+            {
+              gs_unref_object GFileInfo *finfo = NULL;
+
+              csizenames = g_new0 (OstreeDeltaContentSizeNames, 1);
+              csizenames->checksum = g_strdup (checksum);
+
+              /* Transfer ownership so things get cleaned up if we
+               * throw an exception below.
+               */
+              g_hash_table_replace (sizenames_map, csizenames->checksum, csizenames);
+
+              if (!ostree_repo_load_file (repo, checksum,
+                                          NULL, &finfo, NULL,
+                                          cancellable, error))
+                goto out;
+
+              csizenames->size = g_file_info_get_size (finfo);
+            }
+
+          if (!csizenames->basenames)
+            csizenames->basenames = g_ptr_array_new_with_free_func (g_free);
+          g_ptr_array_add (csizenames->basenames, g_strdup (name));
+        }
+      else if (iterres == OSTREE_REPO_COMMIT_ITER_RESULT_DIR)
+        {
+          char *name;
+          char *content_checksum;
+          char *meta_checksum;
+          gs_unref_variant GVariant *dirtree = NULL;
+          ostree_cleanup_repo_commit_traverse_iter
+            OstreeRepoCommitTraverseIter subiter = { 0, };
+
+          ostree_repo_commit_traverse_iter_get_dir (iter, &name, &content_checksum, &meta_checksum);
+
+          if (!ostree_repo_load_variant (repo, OSTREE_OBJECT_TYPE_DIR_TREE,
+                                         content_checksum, &dirtree,
+                                         error))
+            goto out;
+
+          if (!ostree_repo_commit_traverse_iter_init_dirtree (&subiter, repo, dirtree,
+                                                              OSTREE_REPO_COMMIT_TRAVERSE_FLAG_NONE,
+                                                              error))
+            goto out;
+
+          if (!build_content_sizenames_recurse (repo, &subiter,
+                                                sizenames_map, include_only_objects,
+                                                cancellable, error))
+            goto out;
+        }
+      else
+        g_assert_not_reached ();
+    }
+  ret = TRUE;
+ out:
+  return ret;
+}
+
+/* Comparator for g_ptr_array_sort() ordering OstreeDeltaContentSizeNames
+ * by ascending size.  The sizes are 64-bit, so compare them explicitly
+ * instead of returning their difference truncated to an int.
+ */
+static int
+compare_sizenames (const void *a,
+                   const void *b)
+{
+  OstreeDeltaContentSizeNames *sn_a = *(OstreeDeltaContentSizeNames**)(void*)a;
+  OstreeDeltaContentSizeNames *sn_b = *(OstreeDeltaContentSizeNames**)(void*)b;
+
+  if (sn_a->size < sn_b->size)
+    return -1;
+  if (sn_a->size > sn_b->size)
+    return 1;
+  return 0;
+}
+
+/**
+ * Generate an array of [(checksum: str, size: uint64, names: array[string]), ...],
+ * sorted by ascending size, for the file content reachable from @commit.
+ * If @include_only_objects is non-NULL, only checksums present as keys
+ * in it are included.
+ */
+static gboolean
+build_content_sizenames_filtered (OstreeRepo              *repo,
+                                  GVariant                *commit,
+                                  GHashTable              *include_only_objects,
+                                  GPtrArray              **out_sizenames,
+                                  GCancellable            *cancellable,
+                                  GError                 **error)
+{
+  gboolean ret = FALSE;
+  gs_unref_ptrarray GPtrArray *ret_sizenames =
+    g_ptr_array_new_with_free_func (_ostree_delta_content_sizenames_free);
+  gs_unref_hashtable GHashTable *sizenames_map =
+    g_hash_table_new_full (g_str_hash, g_str_equal, NULL, _ostree_delta_content_sizenames_free);
+  ostree_cleanup_repo_commit_traverse_iter
+    OstreeRepoCommitTraverseIter iter = { 0, };
+
+  if (!ostree_repo_commit_traverse_iter_init_commit (&iter, repo, commit,
+                                                     OSTREE_REPO_COMMIT_TRAVERSE_FLAG_NONE,
+                                                     error))
+    goto out;
+
+  if (!build_content_sizenames_recurse (repo, &iter, sizenames_map, include_only_objects,
+                                        cancellable, error))
+    goto out;
+
+  /* Steal each entry from the map (so it is not freed there) and hand
+   * ownership to the result array. */
+  { GHashTableIter hashiter;
+    gpointer hkey, hvalue;
+
+    g_hash_table_iter_init (&hashiter, sizenames_map);
+    while (g_hash_table_iter_next (&hashiter, &hkey, &hvalue))
+      {
+        g_hash_table_iter_steal (&hashiter);
+        g_ptr_array_add (ret_sizenames, hvalue);
+      }
+  }
+
+  g_ptr_array_sort (ret_sizenames, compare_sizenames);
+
+  ret = TRUE;
+  gs_transfer_out_value (out_sizenames, &ret_sizenames);
+ out:
+  return ret;
+}
+
+/* Return TRUE if the string arrays @a and @b share at least one element. */
+static gboolean
+string_array_nonempty_intersection (GPtrArray    *a,
+                                    GPtrArray    *b)
+{
+  guint i;
+  for (i = 0; i < a->len; i++)
+    {
+      guint j;
+      const char *a_str = a->pdata[i];
+      for (j = 0; j < b->len; j++)
+        {
+          const char *b_str = b->pdata[j];
+          if (strcmp (a_str, b_str) == 0)
+            return TRUE;
+        }
+    }
+  return FALSE;
+}
+
+/*
+ * Build up a map of files with matching basenames and similar size,
+ * and use it to find apparently similar objects.
+ *
+ * @new_reachable_regfile_content is a set of checksums of new regular
+ * file objects.
+ *
+ * @similarity_percent_threshold is the maximum size difference, as a
+ * percentage of the new object's size, for two objects to be candidates.
+ *
+ * Currently, @out_modified_regfile_content will be a map from the new
+ * (to) object checksum to the original (from) object checksum; both
+ * keys and values are owned strings.  However in the future it would be
+ * easy to have this function return multiple candidate matches.  The
+ * hard part would be changing the delta compiler to iterate over all
+ * matches, determine a cost for each one, then pick the best.
+ */
+gboolean
+_ostree_delta_compute_similar_objects (OstreeRepo                 *repo,
+                                       GVariant                   *from_commit,
+                                       GVariant                   *to_commit,
+                                       GHashTable                 *new_reachable_regfile_content,
+                                       guint                       similarity_percent_threshold,
+                                       GHashTable                **out_modified_regfile_content,
+                                       GCancellable               *cancellable,
+                                       GError                    **error)
+{
+  gboolean ret = FALSE;
+  gs_unref_hashtable GHashTable *ret_modified_regfile_content =
+    g_hash_table_new_full (g_str_hash, g_str_equal, g_free, g_free);
+  gs_unref_ptrarray GPtrArray *from_sizes = NULL;
+  gs_unref_ptrarray GPtrArray *to_sizes = NULL;
+  guint i, j;
+  guint lower;
+  guint upper;
+
+  if (!build_content_sizenames_filtered (repo, from_commit, NULL,
+                                         &from_sizes,
+                                         cancellable, error))
+    goto out;
+
+  if (!build_content_sizenames_filtered (repo, to_commit, new_reachable_regfile_content,
+                                         &to_sizes,
+                                         cancellable, error))
+    goto out;
+
+  /* Iterate over all newly added objects, find objects which have
+   * similar basename and sizes.
+   *
+   * Because the arrays are sorted by size, we can maintain a `lower`
+   * bound on the original (from) objects to start searching.
+   */
+  lower = 0;
+  upper = from_sizes->len;
+  for (i = 0; i < to_sizes->len; i++)
+    {
+      OstreeDeltaContentSizeNames *to_sizenames = to_sizes->pdata[i];
+      const guint64 min_threshold = to_sizenames->size *
+        (1.0-similarity_percent_threshold/100.0);
+      const guint64 max_threshold = to_sizenames->size *
+        (1.0+similarity_percent_threshold/100.0);
+
+      /* Don't build candidates for the empty object */
+      if (to_sizenames->size == 0)
+        continue;
+
+      for (j = lower; j < upper; j++)
+        {
+          OstreeDeltaContentSizeNames *from_sizenames = from_sizes->pdata[j];
+
+          /* Don't build candidates for the empty object */
+          if (from_sizenames->size == 0)
+            continue;
+
+          if (from_sizenames->size < min_threshold)
+            {
+              lower++;
+              continue;
+            }
+
+          if (from_sizenames->size > max_threshold)
+            break;
+
+          if (!string_array_nonempty_intersection (from_sizenames->basenames, to_sizenames->basenames))
+            continue;
+
+          /* Only one candidate right now */
+          g_hash_table_insert (ret_modified_regfile_content,
+                               g_strdup (to_sizenames->checksum),
+                               g_strdup (from_sizenames->checksum));
+          break;
+        }
+    }
+
+  ret = TRUE;
+  gs_transfer_out_value (out_modified_regfile_content, &ret_modified_regfile_content);
+ out:
+  return ret;
+}
+
diff --git a/src/libostree/ostree-repo-static-delta-compilation.c b/src/libostree/ostree-repo-static-delta-compilation.c
index 62d006ef..b3ce797c 100644
--- a/src/libostree/ostree-repo-static-delta-compilation.c
+++ b/src/libostree/ostree-repo-static-delta-compilation.c
@@ -32,6 +32,8 @@
 #include "otutil.h"
 #include "ostree-varint.h"
 
+#define CONTENT_SIZE_SIMILARITY_THRESHOLD_PERCENT (30)
+
 typedef struct {
   guint64 uncompressed_size;
   GPtrArray *objects;
@@ -479,7 +481,7 @@ try_content_rollsum (OstreeRepo *repo,
   gs_unref_bytes GBytes *tmp_to = NULL;
   gs_unref_object GFileInfo *from_finfo = NULL;
   gs_unref_object GFileInfo *to_finfo = NULL;
-  OstreeRollsumMatches *matches;
+  OstreeRollsumMatches *matches = NULL;
   ContentRollsum *ret_rollsum = NULL;
 
   *out_rollsum = NULL;
@@ -669,7 +671,6 @@ process_one_rollsum (OstreeRepo *repo,
   return ret;
 }
 
-
 static gboolean 
 generate_delta_lowlatency (OstreeRepo *repo,
                            const char *from,
@@ -681,18 +682,18 @@ generate_delta_lowlatency (OstreeRepo *repo,
   gboolean ret = FALSE;
   GHashTableIter hashiter;
   gpointer key, value;
-  guint i;
   OstreeStaticDeltaPartBuilder *current_part = NULL;
   gs_unref_object GFile *root_from = NULL;
+  gs_unref_variant GVariant *from_commit = NULL;
   gs_unref_object GFile *root_to = NULL;
-  gs_unref_ptrarray GPtrArray *modified = NULL;
-  gs_unref_ptrarray GPtrArray *removed = NULL;
-  gs_unref_ptrarray GPtrArray *added = NULL;
+  gs_unref_variant GVariant *to_commit = NULL;
   gs_unref_hashtable GHashTable *to_reachable_objects = NULL;
   gs_unref_hashtable GHashTable *from_reachable_objects = NULL;
+  gs_unref_hashtable GHashTable *from_regfile_content = NULL;
   gs_unref_hashtable GHashTable *new_reachable_metadata = NULL;
-  gs_unref_hashtable GHashTable *new_reachable_content = NULL;
-  gs_unref_hashtable GHashTable *modified_content_objects = NULL;
+  gs_unref_hashtable GHashTable *new_reachable_regfile_content = NULL;
+  gs_unref_hashtable GHashTable *new_reachable_symlink_content = NULL;
+  gs_unref_hashtable GHashTable *modified_regfile_content = NULL;
   gs_unref_hashtable GHashTable *rollsum_optimized_content_objects = NULL;
   gs_unref_hashtable GHashTable *content_object_to_size = NULL;
 
@@ -701,51 +702,30 @@ generate_delta_lowlatency (OstreeRepo *repo,
       if (!ostree_repo_read_commit (repo, from, &root_from, NULL,
                                     cancellable, error))
         goto out;
-    }
-  if (!ostree_repo_read_commit (repo, to, &root_to, NULL,
-                                cancellable, error))
-    goto out;
 
-  /* Gather a filesystem level diff; when we do heuristics to ship
-   * just parts of changed files, we can make use of this data.
-   */
-  modified = g_ptr_array_new_with_free_func ((GDestroyNotify) ostree_diff_item_unref);
-  removed = g_ptr_array_new_with_free_func ((GDestroyNotify) g_object_unref);
-  added = g_ptr_array_new_with_free_func ((GDestroyNotify) g_object_unref);
-  if (!ostree_diff_dirs (OSTREE_DIFF_FLAGS_NONE, root_from, root_to, modified, removed, added,
-                         cancellable, error))
-    goto out;
+      if (!ostree_repo_load_variant (repo, OSTREE_OBJECT_TYPE_COMMIT, from,
+                                     &from_commit, error))
+        goto out;
 
-  modified_content_objects = g_hash_table_new_full (g_str_hash, g_str_equal,
-                                                    g_free, g_free);
-  for (i = 0; i < modified->len; i++)
-    {
-      OstreeDiffItem *diffitem = modified->pdata[i];
-      /* Theoretically, a target file could replace multiple source
-       * files.  That could happen if say a project changed from having
-       * multiple binaries to one binary.
-       *
-       * In that case, we have last one wins behavior.  For ELF rollsum
-       * tends to be useless unless there's a large static data blob.
-       */
-      g_hash_table_replace (modified_content_objects,
-                            g_strdup (diffitem->target_checksum),
-                            g_strdup (diffitem->src_checksum));
-    }
-
-  if (from)
-    {
       if (!ostree_repo_traverse_commit (repo, from, 0, &from_reachable_objects,
                                         cancellable, error))
         goto out;
     }
 
+  if (!ostree_repo_read_commit (repo, to, &root_to, NULL,
+                                cancellable, error))
+    goto out;
+  if (!ostree_repo_load_variant (repo, OSTREE_OBJECT_TYPE_COMMIT, to,
+                                 &to_commit, error))
+    goto out;
+
   if (!ostree_repo_traverse_commit (repo, to, 0, &to_reachable_objects,
                                     cancellable, error))
     goto out;
 
   new_reachable_metadata = ostree_repo_traverse_new_reachable ();
-  new_reachable_content = ostree_repo_traverse_new_reachable ();
+  new_reachable_regfile_content = g_hash_table_new_full (g_str_hash, g_str_equal, NULL, g_free);
+  new_reachable_symlink_content = g_hash_table_new_full (g_str_hash, g_str_equal, NULL, g_free);
 
   g_hash_table_iter_init (&hashiter, to_reachable_objects);
   while (g_hash_table_iter_next (&hashiter, &key, &value))
@@ -763,14 +743,41 @@ generate_delta_lowlatency (OstreeRepo *repo,
       if (OSTREE_OBJECT_TYPE_IS_META (objtype))
         g_hash_table_add (new_reachable_metadata, serialized_key);
       else
-        g_hash_table_add (new_reachable_content, serialized_key);
+        {
+          gs_unref_object GFileInfo *finfo = NULL;
+          GFileType ftype;
+
+          if (!ostree_repo_load_file (repo, checksum, NULL, &finfo, NULL,
+                                      cancellable, error))
+            goto out;
+
+          ftype = g_file_info_get_file_type (finfo);
+          if (ftype == G_FILE_TYPE_REGULAR)
+            g_hash_table_add (new_reachable_regfile_content, g_strdup (checksum));
+          else if (ftype == G_FILE_TYPE_SYMBOLIC_LINK)
+            g_hash_table_add (new_reachable_symlink_content, g_strdup (checksum));
+          else
+            g_assert_not_reached ();
+        }
     }
-  
-  g_printerr ("modified: %u removed: %u added: %u\n",
-              modified->len, removed->len, added->len);
-  g_printerr ("new reachable: metadata=%u content=%u\n",
+
+  if (from_commit)
+    {
+      if (!_ostree_delta_compute_similar_objects (repo, from_commit, to_commit,
+                                                  new_reachable_regfile_content,
+                                                  CONTENT_SIZE_SIMILARITY_THRESHOLD_PERCENT,
+                                                  &modified_regfile_content,
+                                                  cancellable, error))
+        goto out;
+    }
+  else
+    modified_regfile_content = g_hash_table_new (g_str_hash, g_str_equal);
+
+  g_printerr ("modified: %u\n", g_hash_table_size (modified_regfile_content));
+  g_printerr ("new reachable: metadata=%u content regular=%u symlink=%u\n",
               g_hash_table_size (new_reachable_metadata),
-              g_hash_table_size (new_reachable_content));
+              g_hash_table_size (new_reachable_regfile_content),
+              g_hash_table_size (new_reachable_symlink_content));
 
   /* We already ship the to commit in the superblock, don't ship it twice */
   g_hash_table_remove (new_reachable_metadata,
@@ -780,7 +787,7 @@ generate_delta_lowlatency (OstreeRepo *repo,
                                                              g_free,
                                                              (GDestroyNotify) content_rollsums_free);
 
-  g_hash_table_iter_init (&hashiter, modified_content_objects);
+  g_hash_table_iter_init (&hashiter, modified_regfile_content);
   while (g_hash_table_iter_next (&hashiter, &key, &value))
     {
       const char *to_checksum = key;
@@ -800,7 +807,7 @@ generate_delta_lowlatency (OstreeRepo *repo,
 
   g_printerr ("rollsum for %u/%u modified\n",
               g_hash_table_size (rollsum_optimized_content_objects),
-              g_hash_table_size (modified_content_objects));
+              g_hash_table_size (modified_regfile_content));
 
   current_part = allocate_part (builder);
 
@@ -837,22 +844,18 @@ generate_delta_lowlatency (OstreeRepo *repo,
   /* Scan for large objects, so we can fall back to plain HTTP-based
    * fetch.
    */
-  g_hash_table_iter_init (&hashiter, new_reachable_content);
+  g_hash_table_iter_init (&hashiter, new_reachable_regfile_content);
   while (g_hash_table_iter_next (&hashiter, &key, &value))
     {
-      GVariant *serialized_key = key;
-      const char *checksum;
-      OstreeObjectType objtype;
+      const char *checksum = key;
       guint64 uncompressed_size;
       gboolean fallback = FALSE;
 
-      ostree_object_name_deserialize (serialized_key, &checksum, &objtype);
-
       /* Skip content objects we rollsum'd */
       if (g_hash_table_contains (rollsum_optimized_content_objects, checksum))
         continue;
 
-      if (!ostree_repo_load_object_stream (repo, objtype, checksum,
+      if (!ostree_repo_load_object_stream (repo, OSTREE_OBJECT_TYPE_FILE, checksum,
                                            NULL, &uncompressed_size,
                                            cancellable, error))
         goto out;
@@ -862,30 +865,37 @@ generate_delta_lowlatency (OstreeRepo *repo,
       if (fallback)
         {
           gs_free char *size = g_format_size (uncompressed_size);
-          g_printerr ("fallback for %s (%s)\n",
-                      ostree_object_to_string (checksum, objtype), size);
+          g_printerr ("fallback for %s (%s)\n", checksum, size);
           g_ptr_array_add (builder->fallback_objects,
-                           g_variant_ref (serialized_key));
+                           ostree_object_name_serialize (checksum, OSTREE_OBJECT_TYPE_FILE));
           g_hash_table_iter_remove (&hashiter);
         }
     }
 
-  /* Now non-rollsummed content */
-  g_hash_table_iter_init (&hashiter, new_reachable_content);
+  /* Now non-rollsummed regular file content */
+  g_hash_table_iter_init (&hashiter, new_reachable_regfile_content);
   while (g_hash_table_iter_next (&hashiter, &key, &value))
     {
-      GVariant *serialized_key = key;
-      const char *checksum;
-      OstreeObjectType objtype;
-
-      ostree_object_name_deserialize (serialized_key, &checksum, &objtype);
+      const char *checksum = key;
 
       /* Skip content objects we rollsum'd */
       if (g_hash_table_contains (rollsum_optimized_content_objects, checksum))
         continue;
 
       if (!process_one_object (repo, builder, &current_part,
-                               checksum, objtype,
+                               checksum, OSTREE_OBJECT_TYPE_FILE,
+                               cancellable, error))
+        goto out;
+    }
+
+  /* Now symlinks */
+  g_hash_table_iter_init (&hashiter, new_reachable_symlink_content);
+  while (g_hash_table_iter_next (&hashiter, &key, &value))
+    {
+      const char *checksum = key;
+
+      if (!process_one_object (repo, builder, &current_part,
+                               checksum, OSTREE_OBJECT_TYPE_FILE,
                                cancellable, error))
         goto out;
     }
diff --git a/src/libostree/ostree-repo-static-delta-private.h b/src/libostree/ostree-repo-static-delta-private.h
index 2478f167..55ef437f 100644
--- a/src/libostree/ostree-repo-static-delta-private.h
+++ b/src/libostree/ostree-repo-static-delta-private.h
@@ -152,4 +152,23 @@ _ostree_repo_static_delta_part_have_all_objects (OstreeRepo *repo,
                                                  gboolean            *out_have_all,
                                                  GCancellable        *cancellable,
                                                  GError             **error);
+
+typedef struct {
+  char *checksum;
+  guint64 size;
+  GPtrArray *basenames;
+} OstreeDeltaContentSizeNames;
+
+void _ostree_delta_content_sizenames_free (gpointer v);
+
+gboolean
+_ostree_delta_compute_similar_objects (OstreeRepo                 *repo,
+                                       GVariant                   *from_commit,
+                                       GVariant                   *to_commit,
+                                       GHashTable                 *new_reachable_regfile_content,
+                                       guint                       similarity_percent_threshold,
+                                       GHashTable                **out_modified_regfile_content,
+                                       GCancellable               *cancellable,
+                                       GError                    **error);
+
 G_END_DECLS