From 48326a942a0e677f8ae5c1ee7fe26fa82fd5ea80 Mon Sep 17 00:00:00 2001
From: Donald Buczek <buczek@molgen.mpg.de>
Date: Tue, 24 Oct 2023 15:49:36 +0200
Subject: [PATCH] Add tool beeindextr

The bee inventory file `/var/cache/bee/bee-cache/INVENTORY` contains the
filenames of the files installed from bee.

One problem is that a file can have more than one name because of
symlinks. For example, we currently have a symlink `/usr/doc ->
share/doc` and the inventory currently contains this entry:

   cunit-2.1_3-1.x86_64 1619448799 0 0 0100644 683 7eb1686ef4ebac0f4743c407da5aab52 /usr/doc/CUnit/CUnit_doc.css

So the file is not registered by its canonical name
`/usr/share/doc/CUnit/CUnit_doc.css` but by the alias name
`/usr/doc/CUnit/CUnit_doc.css`. This can happen, if a package is
installed "through" a symlink in the system.

The problem with multiple paths to the same file is that if one version
of a package installs a file through one path and another version of a
package installs the same file through another path, then the file is
lost after "bee update".

This is because all files from the old package, which are not
registered by the new version of the same package (or any other
installed package) are removed. While the removal works through a
symlink, the protection by its single registered filename does not.

To mitigate this problem, bee should protect the file itself during
certain operations like `bee update`. It is not enough to protect the
name variant used in the inventory.

We want to achieve this by translating the directory names used in the
inventory to their canonical form (if possible).

This patch adds a filter tool for the bee index format, which translates
the filenames into a canonical form.

Don't just use realpath(3) or canonicalize_file_name(3) for that,
because this would be much too slow. These functions do readlink() for
every path component of the provided name. As we have to do this for
every file of the inventory, the system call usage and file system
access would explode. Processing the inventory file this way took more
than two minutes.

Instead, cache the results of readlink for a single invocation of the
tool, so that the operation is done only once per file. Use another
cache for complete translated path names to reduce the load to the
readlink-cache. This way, the same result can be achieved in 0.3 seconds.

    $ ls -ld /usr/doc
    lrwxrwxrwx 1 root root 9 Mar  7  2011 /usr/doc -> share/doc
    $ grep CUnit_doc.css /var/cache/bee/bee-cache/INVENTORY
    cunit-2.1_3-1.x86_64 1619448799 0 0 0100644 683 7eb1686ef4ebac0f4743c407da5aab52 /usr/doc/CUnit/CUnit_doc.css
    $ ./beeindextr /var/cache/bee/bee-cache/INVENTORY | grep CUnit_doc.css
    cunit-2.1_3-1.x86_64 1619448799 0 0 0100644 683 7eb1686ef4ebac0f4743c407da5aab52 /usr/share/doc/CUnit/CUnit_doc.css
---
 .gitignore       |   1 +
 Makefile         |   5 +
 src/beeindextr.c | 473 +++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 479 insertions(+)
 create mode 100644 src/beeindextr.c

diff --git a/.gitignore b/.gitignore
index 695fe88..a8fbe19 100644
--- a/.gitignore
+++ b/.gitignore
@@ -16,6 +16,7 @@
 /beeversion
 /beegetopt
 /beeflock
+/beeindextr
 /beelib.config.sh
 /bee.1
 /bee-check.1
diff --git a/Makefile b/Makefile
index 5b12cf4..f1548e9 100644
--- a/Makefile
+++ b/Makefile
@@ -68,6 +68,7 @@ HELPER_BEE_SHELL+=bee-remove
 HELPER_BEE_SHELL+=bee-update
 
 HELPER_C+=bee-cache-inventory
+HELPER_C+=beeindextr
 
 HELPER_SHELL+=compat-filesfile2contentfile
 HELPER_SHELL+=compat-fixmetadir
@@ -134,6 +135,7 @@ BEESORT_OBJECTS=bee_tree.o bee_version_compare.o bee_version_output.o bee_versio
 BEEGETOPT_OBJECTS=bee_getopt.o beegetopt.o
 BEEFLOCK_OBJECTS=bee_getopt.o beeflock.o
 BEECACHEINVENTORY_OBJECTS=bee-cache-inventory.o bee_getopt.o
+BEEICANONDIRS_OBJECTS=beeindextr.o
 
 bee_BUILDTYPES=$(addsuffix .sh,$(addprefix buildtypes/,$(BUILDTYPES)))
 
@@ -166,6 +168,9 @@ beeflock: $(addprefix src/, ${BEEFLOCK_OBJECTS})
 bee-cache-inventory: $(addprefix src/, ${BEECACHEINVENTORY_OBJECTS})
 	$(call quiet-command,${CC} ${LDFLAGS} -o $@ $^,"LD	$@")
 
+beeindextr: $(addprefix src/, ${BEEICANONDIRS_OBJECTS})
+	$(call quiet-command,${CC} ${LDFLAGS} -o $@ $^,"LD	$@")
+
 %.o: %.c
 	$(call quiet-command,${CC} ${CFLAGS} -o $@ -c $^,"CC	$@")
 
diff --git a/src/beeindextr.c b/src/beeindextr.c
new file mode 100644
index 0000000..e3f01aa
--- /dev/null
+++ b/src/beeindextr.c
@@ -0,0 +1,473 @@
+#define _GNU_SOURCE
+#include <stdio.h>
+#include <stdarg.h>
+#include <unistd.h>
+#include <stdio.h>
+#include <errno.h>
+#include <stdlib.h>
+#include <string.h>
+#include <libgen.h>
+#include <stdint.h>
+#include <limits.h>
+#include <assert.h>
+#include <ctype.h>
+
+__attribute__((format (printf, 1, 2)))              // let the compiler check fmt against the variadic arguments
+static void die(const char *restrict fmt, ...)  {   // print a printf-style message to stderr and terminate with status 1
+    va_list ap;
+    va_start(ap, fmt);
+    vfprintf(stderr, fmt, ap);
+    va_end(ap);
+    _exit(1);                                       // NOTE(review): _exit() does not flush stdio, so output still buffered for stdout is presumably dropped here - confirm this is intended
+}
+
+#define offsetof(type, member)  __builtin_offsetof (type, member)   // byte offset of member inside type, via the compiler builtin
+// container_of: from a pointer to "member", compute the pointer to the "type" object that embeds it
+#define container_of(ptr, type, member) \
+        ((type *) ((char *) (ptr) - offsetof(type, member)))
+
+static void *malloc_nofail(size_t size) {   // malloc() wrapper: terminates the program via die() instead of returning NULL
+    void *ret = malloc(size);
+    if (ret == NULL)
+        die("%m\n");                        // %m is the glibc conversion for the error text of errno
+    return ret;
+}
+
+static void *zmalloc_nofail(size_t size) {  // like malloc_nofail(), but the returned memory is filled with zero bytes
+    void *ret = malloc_nofail(size);
+    memset(ret, 0, size);
+    return ret;
+}
+
+static char *strdup_nofail(const char *s) { // strdup() wrapper: terminates the program via die() instead of returning NULL
+    char *dst = strdup(s);
+    if (dst == NULL)
+        die("%m\n");                        // %m is the glibc conversion for the error text of errno
+    return dst;                             // the caller owns the copy and has to free() it
+}
+
+/***  Simple string to u32 hash  implementation ***/
+
+static uint32_t naive_hash(const char *s) {     // 32-bit hash of the zero-terminated string s; the caches use it to skip most strcmp() calls
+    uint32_t h = 0;
+    while (*s) {
+        uint32_t highbits = h & 0xf8000000;     // the top five bits, which the shift below pushes out
+        h = h << 5;
+        h = h ^ (highbits >> 27);               // put them back in at the bottom: both steps together rotate h left by 5
+        h = h ^ *s++;                           // mix in the next character
+    }
+    return h;
+}
+
+/************ simple readlink cache *************/
+
+struct readlink_cache_entry {   // the remembered outcome of one readlink() call
+    char *name;                 // pathname the call was made for (malloced copy)
+    uint32_t hash;              // naive_hash(name), compared before the name itself
+    ssize_t result;             // return value of the readlink() call
+    int saved_errno;            // valid if result < 0 otherwise 0
+    char *target;               // not zero terminated , NULL when unused
+};
+
+static struct readlink_cache {              // growable array of readlink() outcomes, searched linearly
+    int slots;                              // number of elements allocated for entry[]
+    int used;                               // number of elements filled in so far
+    struct readlink_cache_entry *entry;     // NULL until the first entry is added
+} readlink_cache;
+
+__attribute__((unused))                     // the only caller, in main(), is compiled out when NDEBUG is defined
+static void readlink_cache_free() {         // release everything the readlink cache holds and reset it to the empty state
+    struct readlink_cache *c = &readlink_cache;
+    for (int i=0 ; i < c->used ; i++) {
+        free(c->entry[i].name);
+        free(c->entry[i].target);           // NULL for entries that carry no link target
+    }
+    c->slots = 0;
+    c->used = 0;
+    free(c->entry);
+    c->entry = NULL;
+}
+
+static void readlink_cache_add(const char *name, ssize_t result, int saved_errno, char *target) {   // remember the outcome (result, saved_errno, target bytes) of readlink(name)
+    struct readlink_cache *c = &readlink_cache;
+    if (c->slots == 0) {
+        c->slots = 200;                                                 // first use: start with room for 200 entries
+        c->entry = zmalloc_nofail(200 * sizeof(*c->entry));
+    } else if (c->used >= c->slots) {
+        int new_slots = c->slots + c->slots;                            // array is full: double it
+        struct readlink_cache_entry *new_entry = zmalloc_nofail(new_slots * sizeof(*new_entry));
+        memcpy(new_entry, c->entry, c->slots * sizeof(*new_entry));
+        // the elements behind the copied ones need no clearing: zmalloc_nofail() returned zeroed memory
+        c->slots = new_slots;
+        free(c->entry);
+        c->entry = new_entry;
+    }
+    c->entry[c->used].name = strdup_nofail(name);                       // the cache keeps its own copy of the name
+    c->entry[c->used].hash = naive_hash(name);
+    c->entry[c->used].result = result;
+    c->entry[c->used].saved_errno = saved_errno;
+    if (result > 0) {                                                   // otherwise target stays NULL from the zeroed allocation
+        c->entry[c->used].target = malloc_nofail(result);               // exactly result bytes, no terminating '\0'
+        memcpy(c->entry[c->used].target, target, result);
+    }
+    c->used++;
+}
+
+static ssize_t readlink_cache_readlink(const char *restrict pathname, char *restrict buf, size_t bufsize) {   // readlink() replacement that asks the system only once per pathname
+    struct readlink_cache *c = &readlink_cache;
+    uint32_t hash = naive_hash(pathname);
+    // Returns what readlink() returned for pathname; the bytes placed in buf are not zero terminated.
+    for (int i = c->used-1 ; i >= 0 ; i--) {                // linear search, most recently added entry first
+        if (c->entry[i].hash == hash && strcmp(c->entry[i].name, pathname) == 0) {
+            ssize_t result = c->entry[i].result;
+            if (result < 0) {
+                errno = c->entry[i].saved_errno;            // replay the errno of the original call
+            } else if (result > 0) {
+                if ((unsigned)result > bufsize)
+                    result = bufsize;                       // hand out no more than the caller's buffer can take
+                memcpy(buf, c->entry[i].target, result);
+            }
+            return result;
+        }
+    }
+    ssize_t result = readlink(pathname, buf, bufsize);      // not cached yet: do the real call ...
+    int saved_errno = 0;
+    if (result < 0)
+        saved_errno = errno;
+    readlink_cache_add(pathname, result, saved_errno, buf); // ... and remember its outcome, failures included
+    return result;
+}
+
+__attribute__((unused))                     // debugging aid, not called anywhere in the code visible here
+static void readlink_cache_dump() {         // print the complete state of the readlink cache to stdout
+    printf("CACHE:\n");
+    struct readlink_cache *c = &readlink_cache;
+    printf("   slots: %d\n", c->slots);
+    printf("   used:  %d\n", c->used);
+    printf("   entry: %p\n", c->entry);
+    for (int i=0 ; i < c->slots ; i++) {    // NOTE(review): walks all slots, not only the used ones, so name and target can be NULL below - confirm printing NULL via %s is acceptable here
+            printf("      entry[%d].name:        %s\n", i, c->entry[i].name);
+            printf("      entry[%d].hash:        %08x\n", i, c->entry[i].hash);
+            printf("      entry[%d].result:      %ld\n", i, c->entry[i].result);
+            printf("      entry[%d].saved_errno: %d\n", i, c->entry[i].saved_errno);
+            printf("      entry[%d].target:      %.*s\n", i, (int)c->entry[i].result, c->entry[i].target);   // target is not zero terminated, hence the explicit length
+        }
+}
+
+/************* canondir cache ***********/
+
+struct cdir_cache_entry {   // one directory name together with its canonicalized form
+    char *name;             // directory name as it was looked up (malloced copy)
+    uint32_t hash;          // naive_hash(name), compared before the name itself
+    char *dst;              // NULL for equal src and destination string
+};
+
+static struct cdir_cache {              // growable array of directory translations, searched linearly
+    int slots;                          // number of elements allocated for entry[]
+    int used;                           // number of elements filled in so far
+    struct cdir_cache_entry *entry;     // NULL until the first entry is added
+} cdir_cache;
+
+__attribute__((unused))                 // the only caller, in main(), is compiled out when NDEBUG is defined
+static void cdir_cache_free() {         // release everything the directory cache holds and reset it to the empty state
+    struct cdir_cache *c = &cdir_cache;
+    for (int i=0 ; i < c->used ; i++) {
+        free(c->entry[i].name);
+        free(c->entry[i].dst);          // NULL when the directory translated to itself
+    }
+    c->slots = 0;
+    c->used = 0;
+    free(c->entry);
+    c->entry = NULL;
+}
+
+static char *cdir_cache_try(const char *restrict path) {    // look up the cached translation of the directory name path
+    struct cdir_cache *c = &cdir_cache;
+    uint32_t hash = naive_hash(path);
+    // Three possible results: a translated name owned by the cache, NULL (path translates to itself), or (void *)-1 (unknown).
+    for (int i = c->used-1 ; i >= 0 ; i--) {                // linear search, most recently added entry first
+        if (c->entry[i].hash == hash && strcmp(c->entry[i].name, path) == 0)
+            return c->entry[i].dst;
+    }
+    return (void *)-1;   // not found
+}
+
+static void cdir_cache_add(const char *name, const char *dst) {    // remember that directory name canonicalizes to dst
+    struct cdir_cache *c = &cdir_cache;
+    if (c->slots == 0) {
+        c->slots = 200;                                             // first use: start with room for 200 entries
+        c->entry = zmalloc_nofail(200 * sizeof(*c->entry));
+    } else if (c->used >= c->slots) {
+        int new_slots = c->slots + c->slots;                        // array is full: double it
+        struct cdir_cache_entry *new_entry = zmalloc_nofail(new_slots * sizeof(*new_entry));
+        memcpy(new_entry, c->entry, c->slots * sizeof(*new_entry));
+        // the elements behind the copied ones need no clearing: zmalloc_nofail() returned zeroed memory
+        c->slots = new_slots;
+        free(c->entry);
+        c->entry = new_entry;
+    }
+    c->entry[c->used].name = strdup_nofail(name);                   // the cache keeps its own copies of both strings
+    c->entry[c->used].hash = naive_hash(name);
+    if ( strcmp(name, dst) != 0)
+        c->entry[c->used].dst = strdup_nofail(dst);
+    else
+        c->entry[c->used].dst = NULL;                               // NULL stands for "same as name" and saves the copy
+    c->used++;
+}
+
+/****************************************/
+
+static char *get_dirname(char *path,  char *outbuf, ssize_t outbuf_len) {   // copy the directory part of path into outbuf and return outbuf; path is left untouched
+    char *p =path + strlen(path);
+    while (p > path + 1 && p[-1] == '/')        // step back over trailing slashes
+        p--;
+    while (p > path + 1 && p[-1] != '/')        // step back over the last component
+        p--;
+    while (p > path + 1 && p[-1] == '/')        // step back over the slashes in front of it
+        p--;                                    // no loop goes below path + 1, so "/file" yields "/"
+    if (p - path + 1 > outbuf_len)              // needed: p - path characters plus the terminating '\0'
+        die ("get_dirname: output buffer to small\n");
+    memcpy(outbuf, path, p - path);
+    outbuf[p - path] = '\0';
+    return outbuf;
+}
+
+static char *get_basename(char *path, char *outbuf, ssize_t outbuf_len) {   // copy the last component of path into outbuf and return outbuf; path is left untouched
+    char *p = path + strlen(path);
+    int len = 0;                                // length of the component found so far
+    while (p > path + 1 && p[-1] == '/')        // step back over trailing slashes, they are not part of the name
+        p--;
+    while (p > path + 1 && p[-1] != '/') {      // step back to the start of the last component, counting its characters
+        p--;
+        len++;
+    }
+    if (len + 1 > outbuf_len)                   // needed: len characters plus the terminating '\0'
+        die ("get_basename: output buffer to small\n");
+    memcpy(outbuf, p, len);
+    outbuf[len] = '\0';                         // for "/" the result is the empty string
+    return outbuf;
+}
+
+static char *resolvedir(char *patharg, char *outbuf, size_t outbuf_len) {
+    // Canonicalize the absolute path patharg into outbuf: redundant "/" and "." are dropped,
+    // ".." is applied and symlinked directories are replaced by their targets. The last
+    // component is copied unchanged and never checked for being a symlink. Returns outbuf.
+    // Dies on relative paths, on results not fitting into outbuf and on symlink loops.
+    if (outbuf_len < 2)          // even "/" needs two bytes
+        die ("resolvedir: output buffer to small\n");
+
+    static char dirnamebuf[PATH_MAX];
+
+    char *path = patharg;        // path is the full absolute path we work on
+                                 // when we follow a symlink, this will be changed
+                                 // to point to a malloced() buffer
+    char *in = path;             // input pointer
+    char *out = outbuf;          // output pointer
+    int followed = 0;            // symlinks followed so far, limited to detect symlink loops
+
+    if (*in != '/')
+        die ("resolvedir: relative paths unsupported\n");
+
+    char *origdir = NULL;        // directory part of original patharg malloced()
+    {
+        // try the cdir cache with the full translation of the directory part
+        static char basenamebuf[PATH_MAX];
+        char *src = get_dirname(path, dirnamebuf, sizeof(dirnamebuf));
+        char *dst = cdir_cache_try(src);
+        if (dst == (void *)-1) {
+            // not found in cache. keep the directory part we parsed out of the string
+            // so that we can add the translation to the cache later
+            origdir = strdup_nofail(src);
+        } else {
+            if (dst == NULL)
+                dst = src;            // NULL = negative cached (resolved dir = original dir)
+            // join cached directory and file name; the root directory already ends in '/'
+            const char *sep = (dst[0] == '/' && dst[1] == '\0') ? "" : "/";
+            char *base = get_basename(path, basenamebuf, sizeof(basenamebuf));
+            int n = snprintf(outbuf, outbuf_len, "%s%s%s", dst, sep, base);
+            if (n < 0 || (size_t)n >= outbuf_len)
+                die ("resolvedir: output buffer to small\n");
+            return outbuf;
+        }
+    }
+
+    in++;               // skip '/'
+    *out++ = '/';
+
+    while(1) {
+        assert( out[-1] == '/' );
+        if (*in == '\0')
+            break;
+
+        char *start = in;
+        while (*in != '\0' && *in != '/')
+            in++;
+
+        if (in == start) {                                                      // <nothing>/ - ignore redundant '/'
+            ;
+        } else if (in == start+1 && start[0] == '.') {                          // ./ - ignore
+            ;
+        } else if (in == start+2 && start[0] == '.' && start[1] == '.') {       // ../
+            // up one level - rewind output, but stay at "/" when already at the root
+            if (out > outbuf+1)
+                out--;
+            while (out > outbuf+1 && out[-1] != '/')
+                out--;
+        } else {
+            // copy component name, it has to fit together with a terminating '\0'
+            if ((size_t)(in - start) >= outbuf_len - (size_t)(out - outbuf))
+                die("resolvedir: output buffer to small\n");
+            memcpy(out, start, in - start);
+            out += in - start;
+
+            // if this is the last component (the filename), do not check for symlinks
+            if (*in == '\0')
+                break;
+
+            // check for symlink
+            static char readlinkbuf[PATH_MAX];   // NOT zero-terminated
+            // make the output collected so far a zero-terminated string
+            *out = '\0';
+            int l = readlink_cache_readlink(outbuf, readlinkbuf, sizeof(readlinkbuf));
+            if (l == sizeof(readlinkbuf))
+                die("%s: symlink target name to long.\n", outbuf);
+            if (l < 0) {
+                // not a symlink; keep room for the terminating '\0' behind the separator
+                if (out+1 >= outbuf+outbuf_len)
+                    die("resolvedir: output buffer to small\n");
+                *out++ = '/';
+            } else {
+                // symlink
+                if (++followed > 40)   // the limit Linux path resolution uses before failing with ELOOP
+                    die("%s: too many levels of symbolic links\n", patharg);
+                int restlen = strlen(in);
+                char *new_path;
+                if (readlinkbuf[0] == '/') {
+                    // absolute symlink, clear output, leave "/"
+                    out = outbuf+1;
+                    // in = target + rest
+                    new_path = malloc_nofail(l + restlen + 1);
+                    memcpy(new_path, readlinkbuf, l);
+                    strcpy(&new_path[l], in);
+                } else {
+                    // relative symlink, remove last component (the symlink name) from output, leave "/"
+                    while ( out > outbuf+1 && out[-1] != '/')
+                        out--;
+                    // in = "/" + target + rest
+                    new_path = malloc_nofail(1 + l + restlen + 1);
+                    new_path[0] = '/';
+                    memcpy(&new_path[1], readlinkbuf, l);
+                    strcpy(&new_path[l+1], in);
+                }
+                if (path != patharg)
+                    free(path);
+                path = new_path;
+                in = path;
+            }
+        }
+        if (*in == '\0')
+            break;
+        in++;
+    }
+    *out = '\0';
+    if (path != patharg)
+        free(path);
+
+    // cache translation of original input directory to canonicalized output directory
+    char *translated_dir = get_dirname(outbuf, dirnamebuf, sizeof(dirnamebuf));
+    cdir_cache_add(origdir, translated_dir);
+    free(origdir);
+    return outbuf;
+}
+
+__attribute__((unused))                     // only reachable through resolvedir_selftest(), which itself has no caller in the code visible here
+static void _resolvedir_selftest(char *in, char *expect) {   // run resolvedir() on in and print a warning when the result differs from expect
+    char obuf[128];                         // deliberately small; resolvedir() is told the size
+    resolvedir(in, obuf, sizeof(obuf));
+    if (strcmp(obuf, expect) != 0)          // a mismatch is only reported on stdout, it does not abort
+        printf("WARNING: resolvedir_selftest: in '%s' expected '%s' got '%s'\n", in, expect, obuf); 
+}
+
+__attribute__((unused))                     // debugging aid, not called anywhere in the code visible here
+static void resolvedir_selftest() {         // exercise resolvedir() with a fixed list of inputs and expected outputs
+    _resolvedir_selftest("/", "/");
+    _resolvedir_selftest("/file", "/file");
+    _resolvedir_selftest("/dir/", "/dir");
+    _resolvedir_selftest("/dir/file", "/dir/file");
+    _resolvedir_selftest("/dir////file", "/dir/file");
+    _resolvedir_selftest("/dir1/../dir2/file", "/dir2/file");
+    _resolvedir_selftest("/dir1/../dir2/dir3///",  "/dir2/dir3");
+    // NOTE(review): the cases below depend on the host; presumably /usr/tmp and /lib64 are symlinks to /tmp and lib there - verify
+    _resolvedir_selftest("/usr/tmp/file", "/tmp/file");
+    _resolvedir_selftest("/usr/tmp/dir/file", "/tmp/dir/file");
+    _resolvedir_selftest("/lib64", "/lib64");               // last component: expected to stay unresolved
+    _resolvedir_selftest("/lib64/file", "/lib/file");
+    _resolvedir_selftest("/lib64/dir/file", "/lib/dir/file");
+}
+
+
+static char *skipword(char *c) {                            // return the start of the word following the one at c, or the end of the string
+    while (*c != '\0' && !isspace((unsigned char)*c) )      // skip the current word; the cast keeps bytes >= 0x80 valid for isspace()
+        c++;
+    while (*c != '\0' && isspace((unsigned char)*c) )       // skip the whitespace separating it from the next word
+        c++;
+    return c;
+}
+
+static char *skipwords(char *c, int n) {    // apply skipword() n times: return the start of the word n positions after the one at c
+    for (int i=0 ; i < n ; i++)
+        c = skipword(c);                    // stays at the terminating '\0' once the string is exhausted
+    return c;
+}
+
+static char *lbuf;          // line buffer handed to getline() in do_file(); reused across lines and files
+static size_t lbuf_len;     // allocated size of lbuf, maintained by getline()
+
+static void do_file(const char *restrict inventory) {
+    // Copy the index file "inventory" to stdout line by line, with the file name of each line replaced by resolvedir()'s result.
+    FILE *f = fopen(inventory, "r");
+    if (f == NULL)
+        die("%s: %m\n", inventory);
+
+    errno = 0;
+    while (1) {
+        errno = 0;                              // lets the check behind the loop tell a read error from end of file
+        if (getline(&lbuf, &lbuf_len, f) == -1)
+            break;
+        size_t len = strlen(lbuf);
+        if (len > 0 && lbuf[len-1] == '\n')
+            lbuf[len-1] = '\0';                 // strip the newline, printf() below adds it again
+        // the file name is everything behind the first seven whitespace separated words
+        char *p = skipwords(lbuf, 7);
+        if (*p == '\0') {                       // fewer than eight words on the line
+            die("%s: format error. Line: '%s'\n", inventory, lbuf);
+        }
+        if (p > lbuf+1)
+            p[-1] = '\0';                       // cut the line in front of the file name: lbuf now ends after the seventh word
+
+        static char resolvebuf[PATH_MAX];
+
+        char *resolved = resolvedir(p, resolvebuf, sizeof(resolvebuf));
+        printf("%s %s\n", lbuf, resolved);
+    }
+    if (errno)                                  // the loop was left because getline() returned -1 with errno set
+        die("%s: %m\n", inventory);
+
+    fclose(f);
+}
+
+int main(int argc, char **argv) {
+    // Filter every index file named on the command line, or standard input when none is given.
+    if (argc==1)
+        do_file("/proc/self/fd/0");              // standard input, re-opened by name
+    for (int i=1 ; i < argc ; i++)
+        do_file(argv[i]);
+    if (ferror(stdout) || fflush(stdout) != 0)   // a silently truncated index must not pass as success
+        die("error writing to stdout\n");
+#ifndef NDEBUG
+    free(lbuf);                                  // the process is about to exit anyway, so builds with NDEBUG skip the cleanup
+    cdir_cache_free();
+    readlink_cache_free();
+#endif
+    return 0;
+}