From 844c5ea652251d57f8afbd3ca133d7fc3ce8fc50 Mon Sep 17 00:00:00 2001 From: Colin Walters Date: Thu, 15 Aug 2013 09:14:26 -0400 Subject: [PATCH] core: Import bup's "rollsum" code, add a test case For static deltas, one strategy that will be employed is to split each object into chunks, and only include changed chunks in the deltas. --- Makefile-tests.am | 5 ++ src/libostree/bupsplit.c | 152 +++++++++++++++++++++++++++++++++++++++ src/libostree/bupsplit.h | 49 +++++++++++++ tests/test-rollsum.c | 79 ++++++++++++++++++++ 4 files changed, 285 insertions(+) create mode 100644 src/libostree/bupsplit.c create mode 100644 src/libostree/bupsplit.h create mode 100644 tests/test-rollsum.c diff --git a/Makefile-tests.am b/Makefile-tests.am index 442b6035..e9f45f3a 100644 --- a/Makefile-tests.am +++ b/Makefile-tests.am @@ -87,6 +87,11 @@ test_varint_CFLAGS = $(ostree_bin_shared_cflags) $(OT_INTERNAL_GIO_UNIX_CFLAGS) test_varint_LDADD = $(ostree_bin_shared_ldadd) $(OT_INTERNAL_GIO_UNIX_LIBS) testmeta_DATA += test-varint.test +insttest_PROGRAMS += test-rollsum +test_rollsum_SOURCES = src/libostree/bupsplit.c tests/test-rollsum.c +test_rollsum_CFLAGS = $(ostree_bin_shared_cflags) $(OT_INTERNAL_GIO_UNIX_CFLAGS) +test_rollsum_LDADD = $(ostree_bin_shared_ldadd) $(OT_INTERNAL_GIO_UNIX_LIBS) + if BUILDOPT_GJS insttest_SCRIPTS += tests/test-core.js \ tests/test-sizes.js \ diff --git a/src/libostree/bupsplit.c b/src/libostree/bupsplit.c new file mode 100644 index 00000000..067b2e96 --- /dev/null +++ b/src/libostree/bupsplit.c @@ -0,0 +1,152 @@ +/* + * Copyright 2011 Avery Pennarun. All rights reserved. + * + * (This license applies to bupsplit.c and bupsplit.h only.) + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY AVERY PENNARUN ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +#include "bupsplit.h" +#include +#include +#include +#include + +// According to librsync/rollsum.h: +// "We should make this something other than zero to improve the +// checksum algorithm: tridge suggests a prime number." +// apenwarr: I unscientifically tried 0 and 7919, and they both ended up +// slightly worse than the librsync value of 31 for my arbitrary test data. +#define ROLLSUM_CHAR_OFFSET 31 + +typedef struct { + unsigned s1, s2; + uint8_t window[BUP_WINDOWSIZE]; + int wofs; +} Rollsum; + + +// These formulas are based on rollsum.h in the librsync project. +static void rollsum_add(Rollsum *r, uint8_t drop, uint8_t add) +{ + r->s1 += add - drop; + r->s2 += r->s1 - (BUP_WINDOWSIZE * (drop + ROLLSUM_CHAR_OFFSET)); +} + + +static void rollsum_init(Rollsum *r) +{ + r->s1 = BUP_WINDOWSIZE * ROLLSUM_CHAR_OFFSET; + r->s2 = BUP_WINDOWSIZE * (BUP_WINDOWSIZE-1) * ROLLSUM_CHAR_OFFSET; + r->wofs = 0; + memset(r->window, 0, BUP_WINDOWSIZE); +} + + +// For some reason, gcc 4.3 (at least) optimizes badly if find_ofs() +// is static and rollsum_roll is an inline function. Let's use a macro +// here instead to help out the optimizer. +#define rollsum_roll(r, ch) do { \ + rollsum_add((r), (r)->window[(r)->wofs], (ch)); \ + (r)->window[(r)->wofs] = (ch); \ + (r)->wofs = ((r)->wofs + 1) % BUP_WINDOWSIZE; \ +} while (0) + + +static uint32_t rollsum_digest(Rollsum *r) +{ + return (r->s1 << 16) | (r->s2 & 0xffff); +} + + +static uint32_t rollsum_sum(uint8_t *buf, size_t ofs, size_t len) +{ + size_t count; + Rollsum r; + rollsum_init(&r); + for (count = ofs; count < len; count++) + rollsum_roll(&r, buf[count]); + return rollsum_digest(&r); +} + + +int bupsplit_find_ofs(const unsigned char *buf, int len, int *bits) +{ + Rollsum r; + int count; + + rollsum_init(&r); + for (count = 0; count < len; count++) + { + rollsum_roll(&r, buf[count]); + if ((r.s2 & (BUP_BLOBSIZE-1)) == ((~0) & (BUP_BLOBSIZE-1))) + { + if (bits) + { + unsigned rsum = rollsum_digest(&r); + *bits = BUP_BLOBBITS; + rsum >>= BUP_BLOBBITS; + for (*bits = BUP_BLOBBITS; (rsum >>= 1) & 1; (*bits)++) + ; + } + return count+1; + } + } + return 0; +} + + +#ifndef BUP_NO_SELFTEST +#define BUP_SELFTEST_SIZE 100000 + +int bupsplit_selftest() +{ + uint8_t *buf = malloc(BUP_SELFTEST_SIZE); + uint32_t sum1a, sum1b, sum2a, sum2b, sum3a, sum3b; + unsigned count; + + srandom(1); + for (count = 0; count < BUP_SELFTEST_SIZE; count++) + buf[count] = random(); + + sum1a = rollsum_sum(buf, 0, BUP_SELFTEST_SIZE); + sum1b = rollsum_sum(buf, 1, BUP_SELFTEST_SIZE); + sum2a = rollsum_sum(buf, BUP_SELFTEST_SIZE - BUP_WINDOWSIZE*5/2, + BUP_SELFTEST_SIZE - BUP_WINDOWSIZE); + sum2b = rollsum_sum(buf, 0, BUP_SELFTEST_SIZE - BUP_WINDOWSIZE); + sum3a = rollsum_sum(buf, 0, BUP_WINDOWSIZE+3); + sum3b = rollsum_sum(buf, 3, BUP_WINDOWSIZE+3); + + fprintf(stderr, "sum1a = 0x%08x\n", sum1a); + fprintf(stderr, "sum1b = 0x%08x\n", sum1b); + fprintf(stderr, "sum2a = 0x%08x\n", sum2a); + fprintf(stderr, "sum2b = 0x%08x\n", sum2b); + fprintf(stderr, "sum3a = 0x%08x\n", sum3a); + fprintf(stderr, "sum3b = 0x%08x\n", sum3b); + + free(buf); + return sum1a!=sum1b || sum2a!=sum2b || sum3a!=sum3b; +} + +#endif // !BUP_NO_SELFTEST diff --git a/src/libostree/bupsplit.h b/src/libostree/bupsplit.h new file mode 100644 index 00000000..e37a56ab --- /dev/null +++ b/src/libostree/bupsplit.h @@ -0,0 +1,49 @@ +/* + * Copyright 2011 Avery Pennarun. All rights reserved. + * + * (This license applies to bupsplit.c and bupsplit.h only.) + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY AVERY PENNARUN ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +#ifndef __BUPSPLIT_H +#define __BUPSPLIT_H + +#define BUP_BLOBBITS (13) +#define BUP_BLOBSIZE (1< + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 02111-1307, USA. + */ + +#include "config.h" + +#include "libgsystem.h" + +#include "bupsplit.h" + +#define BLOB_MAX (8192*4) +#define BLOB_READ_SIZE (1024*1024) + +int +main (int argc, char **argv) +{ + GCancellable *cancellable = NULL; + GError *local_error = NULL; + GError **error = &local_error; + gs_unref_object GFile *path = NULL; + GBytes *bytes = NULL; + + g_setenv ("GIO_USE_VFS", "local", TRUE); + + if (argc > 1) + { + const guint8 *start; + gsize len; + + path = g_file_new_for_path (argv[1]); + bytes = gs_file_map_readonly (path, cancellable, error); + if (!bytes) + goto out; + + start = g_bytes_get_data (bytes, &len); + while (TRUE) + { + int offset, bits; + offset = bupsplit_find_ofs (start, MIN(G_MAXINT32, len), &bits); + if (offset == 0) + break; + if (offset > BLOB_MAX) + offset = BLOB_MAX; + g_print ("%" G_GUINT64_FORMAT "\n", (guint64)offset); + start += offset; + len -= offset; + } + } + else + { + bupsplit_selftest (); + } + + out: + g_clear_pointer (&bytes, g_bytes_unref); + if (local_error) + { + g_printerr ("%s\n", local_error->message); + g_error_free (local_error); + return 1; + } + return 0; +}