From e793572683a804e67b8788498e1c2babf0a66a5d Mon Sep 17 00:00:00 2001 From: Peter Marquardt Date: Tue, 20 Feb 2024 15:16:04 +0100 Subject: [PATCH 1/4] do not clean final due to known unfixed bugs - https://github.com/jonaswinkler/paperless-ng/issues/1490 - https://github.com/paperless-ngx/paperless-ngx/discussions/3090 --- README.todo | 5 +++++ paperless.conf.build.sh | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/README.todo b/README.todo index e9f0130..485b9f7 100644 --- a/README.todo +++ b/README.todo @@ -1,6 +1,11 @@ dependency tree: dot REAME.dot +bug: + ☐ https://github.com/jonaswinkler/paperless-ng/issues/1490 + ☐ https://github.com/paperless-ngx/paperless-ngx/discussions/3090 + Calling OCRmyPDF with args: {'input_file': PosixPath('/scratch/local/plprj/2024-02-19/paperless-ngx34bhqotl/c913a207C78B05A-B130-45F8-9EC4-281AD914ABFA}.PDF'), 'output_file': PosixPath('/scratch/local/plprj/2024-02-19/paperless-0ee3414u/archive.pdf'), 'use_threads': True, 'jobs': 32, 'language': 'deu+eng', 'output_type': 'pdfa', 'progress_bar': False, 'color_conversion_strategy': 'RGB', 'force_ocr': True, 'clean_final': True, 'deskew': True, 'rotate_pages': True, 'rotate_pages_threshold': 12.0, 'sidecar': PosixPath('/scratch/local/plprj/2024-02-19/paperless-0ee3414u/sidecar.txt')} + bug: ☐ tag löschen "löschen|Yes?" diff --git a/paperless.conf.build.sh b/paperless.conf.build.sh index 3e8f123..87b4750 100755 --- a/paperless.conf.build.sh +++ b/paperless.conf.build.sh @@ -26,7 +26,7 @@ _sed=( -e '/#PAPERLESS_OCR_LANGUAGE=/ a PAPERLESS_OCR_LANGUAGE=deu+eng' -e '/#PAPERLESS_OCR_MODE=skip/ a PAPERLESS_OCR_MODE=force' -e "/#PAPERLESS_OCR_LANGUAGE=/ a PAPERLESS_NLTK_DIR=${PROJECT}/data/nltk" - -e "/#PAPERLESS_OCR_CLEAN=/ a PAPERLESS_OCR_CLEAN=clean-final" + -e "/#PAPERLESS_OCR_CLEAN=/ a PAPERLESS_OCR_CLEAN=clean" -e "/#PAPERLESS_CONSUMPTION_DIR=/ a PAPERLESS_CONSUMPTION_DIR=${PROJECT}/consume" -e "/#PAPERLESS_CONSUMPTION_DIR=/ a PAPERLESS_LOGGING_DIR=${PROJECT}/log" -e '/#PAPERLESS_CONSUMER_RECURSIVE=/ a PAPERLESS_CONSUMER_RECURSIVE=true' From 00cd1ac0bb5fb990c86042f9f9495bfc8e4741b9 Mon Sep 17 00:00:00 2001 From: Peter Marquardt Date: Wed, 21 Feb 2024 13:42:39 +0100 Subject: [PATCH 2/4] add liberation fonts - were missing on mariux and hardcoded in paperless-ngx --- .gitignore | 1 + build.profile | 1 + buildall.sh | 1 + liberation-fonts-ttf.build.sh | 34 ++++++++++++++++++++++++++++++++++ 4 files changed, 37 insertions(+) create mode 100755 liberation-fonts-ttf.build.sh diff --git a/.gitignore b/.gitignore index 3196ae3..eb96dc9 100644 --- a/.gitignore +++ b/.gitignore @@ -2,6 +2,7 @@ /ffmpeg-* /jbig2enc-* /jbig2dec-* +/liberation-fonts-* /leptonica-* /logs /nginx-* diff --git a/build.profile b/build.profile index 7233b69..e8ff908 100644 --- a/build.profile +++ b/build.profile @@ -23,5 +23,6 @@ BUILD_tesseract=tesseract-5.3.4 BUILD_nginx=nginx-1.25.3 BUILD_jbig2enc=jbig2enc-0.28-17-gea05019 BUILD_jbig2dec=jbig2dec-0.20 +BUILD_libfontttf=liberation-fonts-ttf-2.1.5 BUILD_paperless=paperless-ngx-2.5.3 diff --git a/buildall.sh b/buildall.sh index 38f09a7..709199b 100755 --- a/buildall.sh +++ b/buildall.sh @@ -70,6 +70,7 @@ ln -fs "${logfile}" "${LOGS}/build.log" [ -d "${BUILD_jbig2enc}" ] || ./jbig2enc.build.sh [ -d "${BUILD_jbig2dec}" ] || ./jbig2dec.build.sh [ -d "${BUILD_tesseract}" ] || ./tesseract.build.sh +[ -d "${BUILD_libfontttf}" ] || ./liberation-fonts-ttf.build.sh [ -d "${BUILD_nginx}" ] || ./nginx.build.sh ./nginx.conf.build.sh [ -d "$PROJECT/${BUILD_paperless}" ] || ./paperless-ngx.build.sh diff --git a/liberation-fonts-ttf.build.sh b/liberation-fonts-ttf.build.sh new file mode 100755 index 0000000..1286097 --- /dev/null +++ b/liberation-fonts-ttf.build.sh @@ -0,0 +1,34 @@ +#!/bin/bash +{ +set -x +set -e + +. build.profile + +function B_LIBFONTTTF { + +[ -d "${PREFIX}" ] || { echo "PREFIX unset or not a directory";exit; } + +# SRCURL="https://github.com/liberationfonts/liberation-fonts/files/7261482/liberation-fonts-ttf-2.1.5.tar.gz" +SRCURL="https://beehive.molgen.mpg.de/31b453e0b77bacde410a34a725b34f8a/liberation-fonts-ttf-2.1.5.tar.gz" + +PREFIX="${PREFIX}/${BUILD_libfontttf}" + +BUILD_PKG="${BUILD_libfontttf}" + +mkdir -p "${PREFIX}" + +test -e "${BUILD_PKG}.tar.gz" || wget -nv "${SRCURL}" -O "${BUILD_PKG}.tar.gz" +test -d "${BUILD_PKG}" || mkdir -pv "${BUILD_PKG}" && tar -xf "${BUILD_PKG}.tar.gz" --strip-components=1 -C "${BUILD_PKG}" + +cd "${BUILD_PKG}" + +for font in *.ttf; do + install -v -m 644 "${font}" "${PREFIX}/${font}" +done + +} + +B_LIBFONTTTF + +} From ef15c5c91fba67ed6d5564a0f05fae14fbca9805 Mon Sep 17 00:00:00 2001 From: Peter Marquardt Date: Wed, 21 Feb 2024 13:43:10 +0100 Subject: [PATCH 3/4] rewrite config-file generation --- paperless.conf.build.sh | 56 +++++++++++++++++++++++++---------------- 1 file changed, 34 insertions(+), 22 deletions(-) diff --git a/paperless.conf.build.sh b/paperless.conf.build.sh index 87b4750..b58e232 100755 --- a/paperless.conf.build.sh +++ b/paperless.conf.build.sh @@ -17,30 +17,42 @@ ln -vfs ${PROJECT}/conf/paperless.conf ${PREFIX} PAPERLESS_SECRET_KEY=${PAPERLESS_SECRET_KEY:-$(cat /dev/urandom | head -c 50 | openssl base64 |head -1)} # paperless.conf aus paperles.conf.example hart generieren -_sed=( - -e "/#PAPERLESS_URL=/ a PAPERLESS_URL=${PAPERLESS_URL}" - -e "/#PAPERLESS_CSRF_TRUSTED_ORIGINS=/ a PAPERLESS_CSRF_TRUSTED_ORIGINS=${PAPERLESS_CSRF_TRUSTED_ORIGINS:-${PAPERLESS_URL}}" - -e "/#PAPERLESS_REDIS=/ a PAPERLESS_REDIS=unix://${DEVSHM}/redis.sock" - -e "/#PAPERLESS_SECRET_KEY=change-me/ a PAPERLESS_SECRET_KEY=${PAPERLESS_SECRET_KEY}" -# -e '/#PAPERLESS_AUTO_LOGIN_USERNAME=/ a PAPERLESS_AUTO_LOGIN_USERNAME=paperless' - -e '/#PAPERLESS_OCR_LANGUAGE=/ a PAPERLESS_OCR_LANGUAGE=deu+eng' - -e '/#PAPERLESS_OCR_MODE=skip/ a PAPERLESS_OCR_MODE=force' - -e "/#PAPERLESS_OCR_LANGUAGE=/ a PAPERLESS_NLTK_DIR=${PROJECT}/data/nltk" - -e "/#PAPERLESS_OCR_CLEAN=/ a PAPERLESS_OCR_CLEAN=clean" - -e "/#PAPERLESS_CONSUMPTION_DIR=/ a PAPERLESS_CONSUMPTION_DIR=${PROJECT}/consume" - -e "/#PAPERLESS_CONSUMPTION_DIR=/ a PAPERLESS_LOGGING_DIR=${PROJECT}/log" - -e '/#PAPERLESS_CONSUMER_RECURSIVE=/ a PAPERLESS_CONSUMER_RECURSIVE=true' - -e '/#PAPERLESS_CONSUMER_SUBDIRS_AS_TAGS=/ a PAPERLESS_CONSUMER_SUBDIRS_AS_TAGS=true' - -e "/#PAPERLESS_DATA_DIR=/ a PAPERLESS_SCRATCH_DIR=${TMPDIR}" - -e "/#PAPERLESS_DATA_DIR=/ a PAPERLESS_DATA_DIR=${PROJECT}/data" - -e "/#PAPERLESS_STATICDIR=/ a PAPERLESS_STATICDIR=${PREFIX}/static" - -e "/#PAPERLESS_MEDIA_ROOT=/ a PAPERLESS_MEDIA_ROOT=${PROJECT}/media" - -e '/#PAPERLESS_TIME_ZONE=/ a PAPERLESS_TIME_ZONE=Europe/Berlin' - -e '/#PAPERLESS_CONSUMER_ENABLE_BARCODES=/ a PAPERLESS_CONSUMER_ENABLE_BARCODES=true' - -e '/#PAPERLESS_CONSUMER_ENABLE_BARCODES=/ a PAPERLESS_CONSUMER_ENABLE_ASN_BARCODE=true' +_conf=( +# PAPERLESS_AUTO_LOGIN_USERNAME=paperless +PAPERLESS_CONSUMER_ENABLE_ASN_BARCODE=true +PAPERLESS_CONSUMER_ENABLE_BARCODES=true +PAPERLESS_CONSUMER_RECURSIVE=true +PAPERLESS_CONSUMER_SUBDIRS_AS_TAGS=true + +PAPERLESS_CONSUMPTION_DIR=${PROJECT}/consume +PAPERLESS_DATA_DIR=${PROJECT}/data +PAPERLESS_LOGGING_DIR=${PROJECT}/log +PAPERLESS_MEDIA_ROOT=${PROJECT}/media +PAPERLESS_NLTK_DIR=${PROJECT}/data/nltk +PAPERLESS_SCRATCH_DIR=${TMPDIR} +PAPERLESS_STATICDIR=${PREFIX}/static + +PAPERLESS_URL=${PAPERLESS_URL} +PAPERLESS_CSRF_TRUSTED_ORIGINS=${PAPERLESS_CSRF_TRUSTED_ORIGINS:-${PAPERLESS_URL}} + +PAPERLESS_OCR_CLEAN=clean +PAPERLESS_OCR_LANGUAGE=deu+eng +PAPERLESS_OCR_MODE=force + +PAPERLESS_REDIS=unix://${DEVSHM}/redis.sock +PAPERLESS_SECRET_KEY=${PAPERLESS_SECRET_KEY} +PAPERLESS_THUMBNAIL_FONT_NAME=${PROJECT}/${BUILD_libfontttf}/LiberationMono-Regular.ttf +PAPERLESS_TIME_ZONE=Europe/Berlin ) ( + set -x echo "# generated by $0 in $PWD. do not edit." - sed "${_sed[@]}" ${BUILD_paperless}/paperless.conf.example + echo "# paperless.conf.example" + cat ${BUILD_paperless}/paperless.conf.example + echo "# generated by $0 in $PWD" + + for c in "${_conf[@]}"; do + echo "$c" + done ) | tee ${TO} From ff526744012129f766d7b9172a29d4d15ea336c9 Mon Sep 17 00:00:00 2001 From: Peter Marquardt Date: Wed, 21 Feb 2024 13:49:07 +0100 Subject: [PATCH 4/4] update todo --- README.todo | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/README.todo b/README.todo index 485b9f7..1af1118 100644 --- a/README.todo +++ b/README.todo @@ -1,20 +1,18 @@ dependency tree: dot REAME.dot + bug: ☐ https://github.com/jonaswinkler/paperless-ng/issues/1490 ☐ https://github.com/paperless-ngx/paperless-ngx/discussions/3090 Calling OCRmyPDF with args: {'input_file': PosixPath('/scratch/local/plprj/2024-02-19/paperless-ngx34bhqotl/c913a207C78B05A-B130-45F8-9EC4-281AD914ABFA}.PDF'), 'output_file': PosixPath('/scratch/local/plprj/2024-02-19/paperless-0ee3414u/archive.pdf'), 'use_threads': True, 'jobs': 32, 'language': 'deu+eng', 'output_type': 'pdfa', 'progress_bar': False, 'color_conversion_strategy': 'RGB', 'force_ocr': True, 'clean_final': True, 'deskew': True, 'rotate_pages': True, 'rotate_pages_threshold': 12.0, 'sidecar': PosixPath('/scratch/local/plprj/2024-02-19/paperless-0ee3414u/sidecar.txt')} +upgrade: + ssl-certs in conf umziehen bug: ☐ tag löschen "löschen|Yes?" -test: - ☐ zbar ohne imagemagick - - ☐ conf.build-scripte - sed '/kmous=/d;/XM=/d;$s/$/XM=,/' encrypted document: [2024-02-19 14:07:32,454] [DEBUG] [paperless.parsing.tesseract] Calling OCRmyPDF with args: {'input_file': PosixPath('/home/wwwutz/paperless/2.5.3/media/documents/originals/0000754.pdf'), 'output_file': PosixPath('/scratch/local/paperless/wwwutz/2.5.3/paperless-huijc19r/archive.pdf'), 'use_threads': True, 'jobs': 32, 'language': 'deu+eng', 'output_type': 'pdfa', 'progress_bar': False, 'color_conversion_strategy': 'RGB', 'force_ocr': True, 'clean_final': True, 'deskew': True, 'rotate_pages': True, 'rotate_pages_threshold': 12.0, 'sidecar': PosixPath('/scratch/local/paperless/wwwutz/2.5.3/paperless-huijc19r/sidecar.txt')} @@ -26,10 +24,11 @@ encrypted document: ___________________ Archive: + ✔ conf.build-scripte @done (24-02-21 13:48) @project(bug) + sed '/kmous=/d;/XM=/d;$s/$/XM=,/' + ✔ zbar ohne imagemagick @done (24-02-21 09:01) @project(test) ✔ ocrmypdf @done (24-02-15 17:01) ( . profile; ocrmypdf --force-ocr ../PDFofDeath/2024-01-29\ 5650025416_B.pdf xxx.pdf ) - --force-ocr was issued, causing transcoding. - The optional dependency 'jbig2' was not found, so some image optimizations could not be attempted. ✔ pngquant @done (24-02-15 13:48) git clone --recursive https://github.com/kornelski/pngquant.git The optional dependency 'pngquant' was not found, so some image optimizations could not be attempted. @@ -51,4 +50,6 @@ Archive: https://www.python.org/downloads/release/python-3118/ ✔ sqlite => 3.45.1 @done (24-02-12 13:18) https://sqlite.org/download.html + --force-ocr was issued, causing transcoding. + The optional dependency 'jbig2' was not found, so some image optimizations could not be attempted.