From 317a5c0c13406e3eeab6808ee634e3c56cb0f50b Mon Sep 17 00:00:00 2001 From: Donald Buczek Date: Thu, 9 May 2024 10:27:35 +0200 Subject: [PATCH 1/5] beeissue: Add generated script to .gitignore --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 695fe88..75c6064 100644 --- a/.gitignore +++ b/.gitignore @@ -16,6 +16,7 @@ /beeversion /beegetopt /beeflock +/beeissue.sh /beelib.config.sh /bee.1 /bee-check.1 From bd8d8cd66eac08d173567104c956158515c1c3ef Mon Sep 17 00:00:00 2001 From: Donald Buczek Date: Tue, 24 Oct 2023 15:49:36 +0200 Subject: [PATCH 2/5] Add tool beeindextr The bee inventory file `/var/cache/bee/bee-cache/INVENTORY` contains the filenames of the files installed from bee. One problem is that a file can have more than one name because of symlinks. For example, we currently have a symlink `/usr/doc -> share/doc` and the inventory currently contains this entry: cunit-2.1_3-1.x86_64 1619448799 0 0 0100644 683 7eb1686ef4ebac0f4743c407da5aab52 /usr/doc/CUnit/CUnit_doc.css So the file is not registered by its canonical name `/usr/share/doc/CUnit/CUnit_doc.css` but by the alias name `/usr/doc/CUnit/CUnit_doc.css`. This can happen if a package is installed "through" a symlink in the system. The problem with multiple paths to the same file is that if one version of a package installs a file through one path and another version of a package installs the same file through another path, then the file is lost after "bee update". This is because all files from the old package which are not registered by the new version of the same package (or any other installed package) are removed. While the removal works through a symlink, the protection by its single registered filename does not. To mitigate this problem, bee should protect the file itself during certain operations like `bee update`. It is not enough to protect the name variant used in the inventory. 
We want to achieve this by translating the directory names used in the inventory to their canonical form (if possible). This patch adds a filter tool for the bee index format, which translates the filenames into a canonical form. Don't just use realpath(3) or canonicalize_file_name(3) for that, because this would be much too slow. These functions do readlink() for every path component of the provided name. As we have to do this for every file of the inventory, the system call usage and file system access would explode. Processing the inventory file this way took more than two minutes. Instead, cache the results of readlink for a single invocation of the tool, so that the operation is done only once per file. Use another cache for complete translated path names to reduce the load on the readlink cache. This way, the same result can be achieved in 0.3 seconds. $ ls -ld /usr/doc lrwxrwxrwx 1 root root 9 Mar 7 2011 /usr/doc -> share/doc $ grep CUnit_doc.css /var/cache/bee/bee-cache/INVENTORY cunit-2.1_3-1.x86_64 1619448799 0 0 0100644 683 7eb1686ef4ebac0f4743c407da5aab52 /usr/doc/CUnit/CUnit_doc.css $ ./beeindextr /var/cache/bee/bee-cache/INVENTORY | grep CUnit_doc.css cunit-2.1_3-1.x86_64 1619448799 0 0 0100644 683 7eb1686ef4ebac0f4743c407da5aab52 /usr/share/doc/CUnit/CUnit_doc.css --- .gitignore | 1 + Makefile | 5 + src/beeindextr.c | 473 +++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 479 insertions(+) create mode 100644 src/beeindextr.c diff --git a/.gitignore b/.gitignore index 75c6064..a6d1e68 100644 --- a/.gitignore +++ b/.gitignore @@ -17,6 +17,7 @@ /beegetopt /beeflock /beeissue.sh +/beeindextr /beelib.config.sh /bee.1 /bee-check.1 diff --git a/Makefile b/Makefile index 5b12cf4..f1548e9 100644 --- a/Makefile +++ b/Makefile @@ -68,6 +68,7 @@ HELPER_BEE_SHELL+=bee-remove HELPER_BEE_SHELL+=bee-update HELPER_C+=bee-cache-inventory +HELPER_C+=beeindextr HELPER_SHELL+=compat-filesfile2contentfile HELPER_SHELL+=compat-fixmetadir @@ -134,6 +135,7 @@ 
BEESORT_OBJECTS=bee_tree.o bee_version_compare.o bee_version_output.o bee_versio BEEGETOPT_OBJECTS=bee_getopt.o beegetopt.o BEEFLOCK_OBJECTS=bee_getopt.o beeflock.o BEECACHEINVENTORY_OBJECTS=bee-cache-inventory.o bee_getopt.o +BEEICANONDIRS_OBJECTS=beeindextr.o bee_BUILDTYPES=$(addsuffix .sh,$(addprefix buildtypes/,$(BUILDTYPES))) @@ -166,6 +168,9 @@ beeflock: $(addprefix src/, ${BEEFLOCK_OBJECTS}) bee-cache-inventory: $(addprefix src/, ${BEECACHEINVENTORY_OBJECTS}) $(call quiet-command,${CC} ${LDFLAGS} -o $@ $^,"LD $@") +beeindextr: $(addprefix src/, ${BEEICANONDIRS_OBJECTS}) + $(call quiet-command,${CC} ${LDFLAGS} -o $@ $^,"LD $@") + %.o: %.c $(call quiet-command,${CC} ${CFLAGS} -o $@ -c $^,"CC $@") diff --git a/src/beeindextr.c b/src/beeindextr.c new file mode 100644 index 0000000..5927112 --- /dev/null +++ b/src/beeindextr.c @@ -0,0 +1,473 @@ +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +__attribute__((format (printf, 1, 2))) +static void die(const char *restrict fmt, ...) 
{ + va_list ap; + va_start(ap, fmt); + vfprintf(stderr, fmt, ap); + va_end(ap); + _exit(1); +} + +#define offsetof(type, member) __builtin_offsetof (type, member) + +#define container_of(ptr, type, member) \ + ((type *) ((char *) (ptr) - offsetof(type, member))) + +static void *malloc_nofail(size_t size) { + void *ret = malloc(size); + if (ret == NULL) + die("%m\n"); + return ret; +} + +static void *zmalloc_nofail(size_t size) { + void *ret = malloc_nofail(size); + memset(ret, 0, size); + return ret; +} + +static char *strdup_nofail(const char *s) { + char *dst = strdup(s); + if (dst == NULL) + die("%m\n"); + return dst; +} + +/*** Simple string to u32 hash implementation ***/ + +static uint32_t naive_hash(const char *s) { + uint32_t h = 0; + while (*s) { + uint32_t highbits = h & 0xf8000000; + h = h << 5; + h = h ^ (highbits >> 27); + h = h ^ *s++; + } + return h; +} + +/************ simple readlink cache *************/ + +struct readlink_cache_entry { + char *name; + uint32_t hash; + ssize_t result; + int saved_errno; // valid if result < 0 otherwise 0 + char *target; // not zero terminated , NULL when unused +}; + +static struct readlink_cache { + int slots; + int used; + struct readlink_cache_entry *entry; +} readlink_cache; + +__attribute__((unused)) +static void readlink_cache_free() { + struct readlink_cache *c = &readlink_cache; + for (int i=0 ; i < c->used ; i++) { + free(c->entry[i].name); + free(c->entry[i].target); + } + c->slots = 0; + c->used = 0; + free(c->entry); + c->entry = NULL; +} + +static void readlink_cache_add(const char *name, ssize_t result, int saved_errno, char *target) { + struct readlink_cache *c = &readlink_cache; + if (c->slots == 0) { + c->slots = 200; + c->entry = zmalloc_nofail(200 * sizeof(*c->entry)); + } else if (c->used >= c->slots) { + int new_slots = c->slots + c->slots; + struct readlink_cache_entry *new_entry = zmalloc_nofail(new_slots * sizeof(*new_entry)); + memcpy(new_entry, c->entry, c->slots * sizeof(*new_entry)); + 
memset(&new_entry[c->slots], 0, new_slots - c->slots); + c->slots = new_slots; + free(c->entry); + c->entry = new_entry; + } + c->entry[c->used].name = strdup_nofail(name); + c->entry[c->used].hash = naive_hash(name); + c->entry[c->used].result = result; + c->entry[c->used].saved_errno = saved_errno; + if (result > 0) { + c->entry[c->used].target = malloc_nofail(result); + memcpy(c->entry[c->used].target, target, result); + } + c->used++; +} + +static ssize_t readlink_cache_readlink(const char *restrict pathname, char *restrict buf, size_t bufsize) { + struct readlink_cache *c = &readlink_cache; + uint32_t hash = naive_hash(pathname); + + for (int i = c->used-1 ; i >= 0 ; i--) { + if (c->entry[i].hash == hash && strcmp(c->entry[i].name, pathname) == 0) { + ssize_t result = c->entry[i].result; + if (result < 0) { + errno = c->entry[i].saved_errno; + } else if (result > 0) { + if ((unsigned)result > bufsize) + result = bufsize; + memcpy(buf, c->entry[i].target, result); + } + return result; + } + } + ssize_t result = readlink(pathname, buf, bufsize); + int saved_errno = 0; + if (result < 0) + saved_errno = errno; + readlink_cache_add(pathname, result, saved_errno, buf); + return result; +} + +__attribute__((unused)) +static void readlink_cache_dump() { + printf("CACHE:\n"); + struct readlink_cache *c = &readlink_cache; + printf(" slots: %d\n", c->slots); + printf(" used: %d\n", c->used); + printf(" entry: %p\n", c->entry); + for (int i=0 ; i < c->slots ; i++) { + printf(" entry[%d].name: %s\n", i, c->entry[i].name); + printf(" entry[%d].hash: %08x\n", i, c->entry[i].hash); + printf(" entry[%d].result: %ld\n", i, c->entry[i].result); + printf(" entry[%d].saved_errno: %d\n", i, c->entry[i].saved_errno); + printf(" entry[%d].target: %.*s\n", i, (int)c->entry[i].result, c->entry[i].target); + } +} + +/************* canondir cache ***********/ + +struct cdir_cache_entry { + char *name; + uint32_t hash; + char *dst; // NULL for equal src and destination string +}; + 
+static struct cdir_cache { + int slots; + int used; + struct cdir_cache_entry *entry; +} cdir_cache; + +__attribute__((unused)) +static void cdir_cache_free() { + struct cdir_cache *c = &cdir_cache; + for (int i=0 ; i < c->used ; i++) { + free(c->entry[i].name); + free(c->entry[i].dst); + } + c->slots = 0; + c->used = 0; + free(c->entry); + c->entry = NULL; +} + +static char *cdir_cache_try(const char *restrict path) { + struct cdir_cache *c = &cdir_cache; + uint32_t hash = naive_hash(path); + + for (int i = c->used-1 ; i >= 0 ; i--) { + if (c->entry[i].hash == hash && strcmp(c->entry[i].name, path) == 0) + return c->entry[i].dst; + } + return (void *)-1; // not found +} + +static void cdir_cache_add(const char *name, const char *dst) { + struct cdir_cache *c = &cdir_cache; + if (c->slots == 0) { + c->slots = 200; + c->entry = zmalloc_nofail(200 * sizeof(*c->entry)); + } else if (c->used >= c->slots) { + int new_slots = c->slots + c->slots; + struct cdir_cache_entry *new_entry = zmalloc_nofail(new_slots * sizeof(*new_entry)); + memcpy(new_entry, c->entry, c->slots * sizeof(*new_entry)); + memset(&new_entry[c->slots], 0, new_slots - c->slots); + c->slots = new_slots; + free(c->entry); + c->entry = new_entry; + } + c->entry[c->used].name = strdup_nofail(name); + c->entry[c->used].hash = naive_hash(name); + if ( strcmp(name, dst) != 0) + c->entry[c->used].dst = strdup_nofail(dst); + else + c->entry[c->used].dst = NULL; + c->used++; +} + +/****************************************/ + +static char *get_dirname(char *path, char *outbuf, ssize_t outbuf_len) { + char *p =path + strlen(path); + while (p > path + 1 && p[-1] == '/') + p--; + while (p > path + 1 && p[-1] != '/') + p--; + while (p > path + 1 && p[-1] == '/') + p--; + if (p - path + 1 > outbuf_len) + die ("get_dirname: output buffer to small\n"); + memcpy(outbuf, path, p - path); + outbuf[p - path] = '\0'; + return outbuf; +} + +static char *get_basename(char *path, char *outbuf, ssize_t outbuf_len) { + char *p 
= path + strlen(path); + int len = 0; + while (p > path + 1 && p[-1] == '/') + p--; + while (p > path + 1 && p[-1] != '/') { + p--; + len++; + } + if (len + 1 > outbuf_len) + die ("get_basename: output buffer to small\n"); + memcpy(outbuf, p, len); + outbuf[len] = '\0'; + return outbuf; +} + +static char *resolvedir(char *patharg, char *outbuf, size_t outbuf_len) { + + if (outbuf_len < 1) + die ("resolvedir: output buffer to small\n"); + + static char dirnamebuf[PATH_MAX]; + + char *path = patharg; // path is the full absolute path we work on + // when we follow a symlink, this will be changed + // to point to a malloced() buffer + + char *in = path; // input pointer + char *out = outbuf; // output pointer + + if (*in != '/') + die ("resolvedir: relative paths unsupported\n"); + + char *origdir = NULL; // directory part of original patharg malloced() + + { + // try the cdir cache with the full translation of the directory part + static char basenamebuf[PATH_MAX]; + char *src = get_dirname(path, dirnamebuf, sizeof(dirnamebuf)); + char *dst = cdir_cache_try(src); + if (dst == (void *)-1) { + // not found in cache. keep the directory part we parsed out of the string + // so that we can add the translation to the cache later + origdir = strdup_nofail(src); + } else { + if (dst == NULL) + dst = src; // NULL = negative cached (resolved dir = original dir) + char *out = outbuf; + strcpy(outbuf, dst); + out += strlen(dst); + if (! (outbuf[0] == '/' && outbuf[1] == '\0')) + *out++ = '/'; + strcpy(out, get_basename(path, basenamebuf, sizeof(basenamebuf))); + return outbuf; + } + } + + in++; // skip '/' + *out++ = '/'; + + while(1) { + assert( out[-1] == '/' ); + + if (*in == '\0') + break; + + char *start = in; + + while (*in != '\0' && *in != '/') + in++; + + if (in == start) { // / - ignore redundant '/' + ; + } else if (in == start+1 && start[0] == '.') { // ./ - ignore + ; + } else if (in == start+2 && start[0] == '.' 
&& start[1] == '.') { // ../ + // up one level - rewind output + out--; + while (out > outbuf && out[-1] != '/') + out--; + } else { + // copy component name + if (outbuf+outbuf_len < out+(in-start)+1) + die("resolvedir: output buffer to small\n"); + memcpy(out, start, in - start); + out += in - start; + + // if this is the last component (the filename), do not check for symlinks + if (*in == '\0') + break; + + // check for symlink + + static char readlinkbuf[PATH_MAX]; // NOT zero-terminated + + // make the output collected so far a zero-terminated string + *out = '\0'; + int l = readlink_cache_readlink(outbuf, readlinkbuf, sizeof(readlinkbuf)); + if (l == sizeof(readlinkbuf)) + die("%s: symlink target name to long.\n", outbuf); + + if (l < 0) { + // not a symlink + *out++ = '/'; + } else { + // symlink + int restlen = strlen(in); + char *new_path; + + if (readlinkbuf[0] == '/') { + // absolute symlink, clear output, leave "/" + out = outbuf+1; + // in = target + rest + new_path = malloc(l + restlen + 1); + memcpy(new_path, readlinkbuf, l); + strcpy(&new_path[l], in); + } else { + // relative symlink, remove last component (the symlink name) from output, leave "/" + while ( out > outbuf+1 && out[-1] != '/') + out--; + // in = "/" + target + rest + new_path = malloc(1 + l + restlen + 1); + new_path[0] = '/'; + memcpy(&new_path[1], readlinkbuf, l); + strcpy(&new_path[l+1], in); + } + if (path != patharg) + free(path); + path = new_path; + in = path; + } + } + if (*in == '\0') + break; + in++; + } + *out = '\0'; + if (path != patharg) + free(path); + + // cache translation of original input directory to canonicalized output directory + char *translated_dir = get_dirname(outbuf, dirnamebuf, sizeof(dirnamebuf)); + cdir_cache_add(origdir, translated_dir); + free(origdir); + + return outbuf; +} + +__attribute__((unused)) +static void _resolvedir_selftest(char *in, char *expect) { + char obuf[128]; + resolvedir(in, obuf, sizeof(obuf)); + if (strcmp(obuf, expect) != 0) + 
printf("WARNING: resolvedir_selftest: in '%s' expected '%s' got '%s'\n", in, expect, obuf); +} + +__attribute__((unused)) +static void resolvedir_selftest() { + _resolvedir_selftest("/", "/"); + _resolvedir_selftest("/file", "/file"); + _resolvedir_selftest("/dir/", "/dir"); + _resolvedir_selftest("/dir/file", "/dir/file"); + _resolvedir_selftest("/dir////file", "/dir/file"); + _resolvedir_selftest("/dir1/../dir2/file", "/dir2/file"); + _resolvedir_selftest("/dir1/../dir2/dir3///", "/dir2/dir3"); + + _resolvedir_selftest("/usr/tmp/file", "/tmp/file"); + _resolvedir_selftest("/usr/tmp/dir/file", "/tmp/dir/file"); + _resolvedir_selftest("/lib64", "/lib64"); + _resolvedir_selftest("/lib64/file", "/lib/file"); + _resolvedir_selftest("/lib64/dir/file", "/lib/dir/file"); +} + + +static char *skipword(char *c) { + while (*c != '\0' && !isspace(*c) ) + c++; + while (*c != '\0' && isspace(*c) ) + c++; + return c; +} + +static char *skipwords(char *c, int n) { + for (int i=0 ; i < n ; i++) + c = skipword(c); + return c; +} + +static char *lbuf; +static size_t lbuf_len; + +static void do_file(const char *restrict inventory) { + + FILE *f = fopen(inventory, "r"); + if (f == NULL) + die("%s: %m\n", inventory); + + errno = 0; + while (1) { + errno = 0; + if (getline(&lbuf, &lbuf_len, f) == -1) + break; + size_t len = strlen(lbuf); + if (len > 0 && lbuf[len-1] == '\n') + lbuf[len-1] = '\0'; + + char *p = skipwords(lbuf, 7); + if (*p == '\0') { + die("%s: format error. 
Line: '%s'\n", inventory, lbuf); + } + if (p > lbuf+1) + p[-1] = '\0'; + + static char resolvebuf[PATH_MAX]; + + char *resolved = resolvedir(p, resolvebuf, sizeof(resolvebuf)); + printf("%s %s\n", lbuf, resolved); + } + if (errno) + die("%s: %m\n", inventory); + + fclose(f); +} + +int main(int argc, char **argv) { + if (argc==1) { + do_file("/proc/self/fd/0"); + } else { + for (int i=1 ; i < argc ; i++) { + do_file(argv[i]); + } + } +#ifndef NDEBUG + free(lbuf); + cdir_cache_free(); + readlink_cache_free(); +#endif + return 0; +} From 3cfe004980a361a4e470948d549441cbe959f27f Mon Sep 17 00:00:00 2001 From: Donald Buczek Date: Tue, 24 Oct 2023 22:30:52 +0200 Subject: [PATCH 3/5] bee-cache: Use beeindextr --- src/bee-cache.sh.in | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/bee-cache.sh.in b/src/bee-cache.sh.in index 243844d..fed928d 100644 --- a/src/bee-cache.sh.in +++ b/src/bee-cache.sh.in @@ -169,8 +169,7 @@ function print_missing_files() { } function tmp_merge_install_inventory_files() { - ${BEEFLOCK} --shared ${BEECACHE_INVENTORY} sort -m -u -r -k8 -k1 \ - ${BEECACHE_INVENTORY} "${@}" + ${BEEFLOCK} --shared ${BEECACHE_INVENTORY} ${BEE_LIBEXECDIR}/bee/beeindextr ${BEECACHE_INVENTORY} "${@}" | sort -u -r -k8 -k1 } function tmpinstall_to_filenames() { From 47b4ba888ee50f491306ab373f57d39afd8c6cc6 Mon Sep 17 00:00:00 2001 From: Donald Buczek Date: Thu, 9 May 2024 13:10:05 +0200 Subject: [PATCH 4/5] bee-query: Use canonical names in file search When "bee query PATTERN" is used to grep for installed files, use the inventory file and translate the filenames from the inventory to their canonical form, which is the real path, not the path used to install them, possibly following symlinks. The new code is a lot faster than the old one. The wall time for `fakeroot bee query otop` is reduced from 0m11.712s to 0m0.760s. 
--- src/bee-query.sh.in | 31 +++++++++++++++++++------------ 1 file changed, 19 insertions(+), 12 deletions(-) diff --git a/src/bee-query.sh.in b/src/bee-query.sh.in index ad4313f..30fe565 100644 --- a/src/bee-query.sh.in +++ b/src/bee-query.sh.in @@ -32,6 +32,9 @@ VERSION=${BEE_VERSION} : ${BEE_BINDIR:=@BINDIR@} : ${BEE_LIBEXECDIR:=@LIBEXECDIR@} +: ${BEEFLOCK=${BEE_BINDIR}/beeflock} +: ${BEECACHE_CACHEDIR=${BEE_CACHEDIR}/bee-cache} +: ${BEECACHE_INVENTORY=${BEECACHE_CACHEDIR}/INVENTORY} function bee-list() { ${BEE_LIBEXECDIR}/bee/bee.d/bee-list "${@}" @@ -81,18 +84,22 @@ function get_files() { } function get_pkgs() { - f=$1 - - for pkg in $(bee-list --installed) ; do - - if egrep -q "file=.*${f}" "${BEE_METADIR}/${pkg}/CONTENT" ; then - echo ${pkg} - while read line ; do - eval $(${BEESEP} "${line}") - echo " ${file}" - done < <(egrep "file=.*${f}" "${BEE_METADIR}/${pkg}/CONTENT") - fi - done + file_pattern="$1" + last_pkg='' + + ${BEEFLOCK} --shared ${BEECACHE_INVENTORY} \ + ${BEE_LIBEXECDIR}/bee/beeindextr ${BEECACHE_INVENTORY} | \ + grep -- "$file_pattern" | \ + sort | \ + while read -r pkg mtime uid gid mode size md5 filename; do + if [[ $filename =~ $file_pattern ]]; then + if [ "$pkg" != "$last_pkg" ]; then + echo $pkg + last_pkg="$pkg" + fi + printf " %s\n" "$filename" + fi + done } From a015a2313feaa0f34408912b25fdef10e7e04933 Mon Sep 17 00:00:00 2001 From: Donald Buczek Date: Thu, 9 May 2024 13:50:43 +0200 Subject: [PATCH 5/5] bee-cache: Remove unused "beecache grep" command --- src/bee-cache.sh.in | 9 --------- 1 file changed, 9 deletions(-) diff --git a/src/bee-cache.sh.in b/src/bee-cache.sh.in index fed928d..e5206dc 100644 --- a/src/bee-cache.sh.in +++ b/src/bee-cache.sh.in @@ -79,11 +79,6 @@ function cache_update_pkg() { return 0 } -function cache_grep() { - ${BEEFLOCK} --shared ${BEECACHE_INVENTORY} \ - grep "${@}" ${BEECACHE_INVENTORY} -} - function print_conflicts() { local pkg=${1} @@ -224,7 +219,6 @@ function usage() { -h, --help display this 
help Commands: - grep rebuild update print-uniq-files @@ -283,9 +277,6 @@ cache_verify tmpinstall_to_filenames case "${cmd}" in - grep) - cache_grep "${@}" | cut -d ' ' -f${FIELDS} - ;; rebuild) cache_rebuild ;;