#!/bin/bash

# Script to control an ADF scanner
# - start scanning and create a single pdf file
# - with empty page and orientation detection
# - tested with Fujitsu SP-1120
#
# ... excessively borrowed from https://github.com/rocketraman/sane-scan-pdf
#
# Version: 0.2 (headless, fixed page order)
# Date: 2025-02-08
# License: GNU General Public License
# Modified by: Claude AI
# Original Author: Eric Scheibler
# E-Mail: email [at] eric-scheibler [dot] de
# URL: http://eric-scheibler.de/en/blog/2015/04/script-to-extract-text-from-images-and-scanned-pdf-files/
#
# Provenance: recovered from the patch "Add scan script"
# (commit 5a92d2ed1929184d96d00dec8a1e42ab4f0001db, moritzrfs,
# Sun, 8 Feb 2026 10:12:26 +0100), which adds this file as scan_0208.sh.
#
# Install:
#   sudo apt install imagemagick poppler-utils sane tesseract-ocr tesseract-ocr-deu tesseract-ocr-eng unpaper
#
# Usage:
#   scan_0208.sh [-o <file>] [-w] [-x <device>] [-m <mode>] [-r <dpi>]
#                [-l <lang>] [-v] [-h]
#
# Note: strict mode (set -e) is intentionally not enabled. The cleanup and
# orientation steps are best-effort and must not abort a whole scan job.

OUTPUT="scan.pdf"
HELP=0
VERBOSE=0

# scanner params
DEVICE=pfusp
RESOLUTION=400
MODE=Lineart

# ocr params
OCR_LANGUAGE=deu
OVERWRITE_OUTPUT_FILE=0

# runtime state
TMP_DIR=""
UNPAPER_ARGS=()


#####

#######################################
# Print an error message to stderr and abort the script.
# Arguments: $* - message text
# Returns:   never; exits with status 1
#######################################
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

#######################################
# Remove the temporary working directory. Installed as EXIT trap.
# Globals: TMP_DIR (read)
#######################################
cleanup() {
  # Guard against an empty value so this can never expand to "rm -rf".
  if [[ -n "$TMP_DIR" ]]; then
    rm -rf -- "$TMP_DIR"
  fi
}

#######################################
# Print the command-line help to stdout.
#######################################
usage() {
  cat <<EOF
${0##*/} [OPTIONS]... [OUTPUT]

OPTIONS
 -x, --device <name>
   Override scanner device name, defaulting to "pfusp"
 -m, --mode <mode>
   Mode e.g. Lineart (default), Halftone, Gray, Color, etc.
 -r, --resolution <dpi>
   Resolution e.g 400 (default)
 -l, --language <lang>
   which language to use for OCR (default: deu)

OUTPUT
 -o, --output <file>
   Output to named file default=scan.pdf
 -w, --overwrite-output-file
   Overwrite the output pdf file, if it already exists
 -v, --verbose
EOF
}

#######################################
# Parse the command line into the global settings.
# Globals:   HELP, VERBOSE, OUTPUT, DEVICE, MODE, RESOLUTION, OCR_LANGUAGE,
#            OVERWRITE_OUTPUT_FILE (written)
# Arguments: the script's positional parameters
# Returns:   0; exits with status 1 if an option lacks its argument
#######################################
parse_args() {
  while (( $# > 0 )); do
    case "$1" in
      -h|--help) HELP=1 ;;
      -v|--verbose) VERBOSE=1 ;;
      -w|--overwrite-output-file) OVERWRITE_OUTPUT_FILE=1 ;;
      -o|--output|-x|--device|-m|--mode|-r|--resolution|-l|--language)
        # All of these consume the following word as their value.
        (( $# >= 2 )) || die "Option $1 requires an argument. Aborting."
        case "$1" in
          -o|--output) OUTPUT=$2 ;;
          -x|--device) DEVICE=$2 ;;
          -m|--mode) MODE=$2 ;;
          -r|--resolution) RESOLUTION=$2 ;;
          -l|--language) OCR_LANGUAGE=$2 ;;
        esac
        shift
        ;;
      *)
        # Unknown arguments were always ignored; keep that, but say so.
        echo >&2 "Ignoring unknown argument: $1"
        ;;
    esac
    shift # next option
  done
}

#######################################
# Abort early if one of the given programs is not installed.
# Arguments: names of the required executables
#######################################
require_tools() {
  local tool
  for tool in "$@"; do
    command -v "$tool" > /dev/null \
      || die "Required tool '$tool' not found. Aborting."
  done
}

#######################################
# Decide whether a white-pixel percentage counts as an empty page.
# Arguments: $1 - percentage of white pixels (0..100, may be fractional)
# Returns:   0 if the value is >= 99.8, non-zero otherwise
#######################################
is_blank_percentage() {
  # awk does the floating point comparison; LC_ALL=C keeps "." as the
  # decimal separator regardless of the user's locale.
  LC_ALL=C awk -v p="$1" 'BEGIN { exit !((p + 0) >= 99.8) }'
}

#######################################
# Clean up, orient and OCR one scanned page.
# On success "<image_file>.pdf" exists; empty pages produce no PDF.
# Globals:   RESOLUTION, OCR_LANGUAGE, UNPAPER_ARGS (read); fd 3 (stderr sink)
# Arguments: $1 - path of the raw image written by scanadf
# Outputs:   progress messages on stdout, problems on stderr
# Returns:   0 if the page was handled, 1 if it had to be skipped
#######################################
process_page() {
  local image_file=$1
  local tiff_file="${image_file}.tiff"
  local page_name=${image_file##*/}
  local orientation_result percentage_white

  echo "Process $page_name"

  # unpaper (best-effort: on failure the unprocessed scan is used)
  unpaper "${UNPAPER_ARGS[@]}" --overwrite --dpi "$RESOLUTION" \
    "$image_file" "$image_file" 2>&3

  # convert to tiff
  if ! convert -density "${RESOLUTION}x${RESOLUTION}" -units PixelsPerInch \
    "$image_file" "$tiff_file"; then
    echo >&2 "Could not convert $page_name to TIFF, page skipped"
    echo ""
    return 1
  fi
  rm -- "$image_file"

  # orientation detection (an empty result means tesseract's --psm 0 run
  # failed or printed nothing)
  orientation_result=$(tesseract "$tiff_file" - --psm 0 2>&3) \
    || orientation_result=
  if [[ $orientation_result == *"Rotate: 180"* ]]; then
    echo "Image orientation is upside down, rotate"
    convert "$tiff_file" -rotate 180 "$tiff_file"
  fi

  # empty page detection: a page is dropped only if it is (almost) completely
  # white AND the orientation detection produced no result.
  percentage_white=$(convert "$tiff_file" -fuzz 0% -negate -threshold 0 \
    -negate -format "%[fx:100*mean]" info:) || percentage_white=0
  if is_blank_percentage "$percentage_white" \
    && [[ -z $orientation_result ]]; then
    echo "Empty page removed"
    rm -f -- "$tiff_file"
  else
    # tesseract appends ".pdf" to the output base name itself.
    if ! tesseract "$tiff_file" "$image_file" -l "$OCR_LANGUAGE" pdf 2>&3; then
      echo >&2 "OCR failed for $page_name, page skipped"
    fi
    rm -f -- "$tiff_file"
  fi

  echo ""
  return 0
}

#######################################
# Entry point: scan all pages from the ADF, OCR them and write one PDF.
# Globals:   all settings (read), TMP_DIR, UNPAPER_ARGS (written)
# Arguments: the script's positional parameters
# Returns:   0 on success or when nothing was scanned, 1 on error
#######################################
main() {
  local image_file i num_scans num_pdf_files
  local output_created=0
  local -a image_files=() pdf_files=() reversed_pdf_files=()

  parse_args "$@"

  if [[ $HELP == 1 ]]; then
    usage
    exit 0
  fi

  if [[ -z "$OUTPUT" ]]; then
    die "Output file must be specified. Aborting."
  fi

  # An existing output file is only replaced at the very end, once the new
  # PDF exists, so a failed scan never destroys the previous document.
  if [[ -f "$OUTPUT" && $OVERWRITE_OUTPUT_FILE == 0 ]]; then
    die "Output file $OUTPUT already exists. Aborting."
  fi

  require_tools scanadf convert tesseract pdfunite

  # fd 3 receives stderr of the helper tools: shown with --verbose, otherwise
  # discarded. This replaces building "2> /dev/null" strings for eval.
  if [[ $VERBOSE == 0 ]]; then
    UNPAPER_ARGS=(--quiet)
    exec 3> /dev/null
  else
    UNPAPER_ARGS=()
    exec 3>&2
  fi

  TMP_DIR=$(mktemp -d -p "" scan.XXXXXXXXXX) \
    || die "Could not create temporary directory. Aborting."
  trap cleanup EXIT

  echo >&2 "Scanning..."
  if ! scanadf --device-name "$DEVICE" --source Adf-duplex \
    --resolution "$RESOLUTION" --mode "$MODE" -o "$TMP_DIR/scan-%04d"; then
    die "Scanning failed. Aborting."
  fi
  echo ""

  # nullglob: no scans must yield an empty array, not the literal pattern.
  shopt -s nullglob
  image_files=("$TMP_DIR"/scan-[0-9]*)
  num_scans=${#image_files[@]}

  if (( num_scans > 0 )); then
    echo "Processing $num_scans pages"

    # Process images in normal order
    for image_file in "${image_files[@]}"; do
      process_page "$image_file"
    done

    pdf_files=("$TMP_DIR"/scan-[0-9]*.pdf)
    num_pdf_files=${#pdf_files[@]}

    if (( num_pdf_files > 0 )); then
      # Merge in REVERSE scan order. This is the "fixed page order" behaviour
      # of version 0.2; presumably the feeder delivers the stack last page
      # first on the tested device -- verify before using another scanner.
      for (( i = num_pdf_files - 1; i >= 0; i-- )); do
        reversed_pdf_files+=("${pdf_files[i]}")
      done

      if (( num_pdf_files == 1 )); then
        echo "Creating output PDF..."
        mv -f -- "${reversed_pdf_files[0]}" "$OUTPUT" \
          || die "Could not write $OUTPUT. Aborting."
      else
        echo "Concatenating $num_pdf_files PDFs in correct order..."
        # Build the result in the temp dir first, then move it into place.
        pdfunite "${reversed_pdf_files[@]}" "$TMP_DIR/combined.pdf" \
          || die "Could not concatenate the scanned pages. Aborting."
        mv -f -- "$TMP_DIR/combined.pdf" "$OUTPUT" \
          || die "Could not write $OUTPUT. Aborting."
      fi
      output_created=1
    fi
  fi

  if (( output_created )); then
    echo "Done. Output saved to: $OUTPUT"
  else
    echo "No scans found."
  fi
}

main "$@"