#!/bin/bash
# Script to control an ADF scanner
#   - start scanning and create a single pdf file
#   - with empty page and orientation detection
#   - tested with Fujitsu SP-1120
#
# ... excessively borrowed from https://github.com/rocketraman/sane-scan-pdf
#
# Version: 0.2 (headless, fixed page order)
# Date: 2025-02-08
# License: GNU General Public License
# Modified by: moritzrfs
# Original Author: Eric Scheibler
# E-Mail: email [at] eric-scheibler [dot] de
# URL: http://eric-scheibler.de/en/blog/2015/04/script-to-extract-text-from-images-and-scanned-pdf-files/
#
# Install:
#   sudo apt install imagemagick poppler-utils sane tesseract-ocr tesseract-ocr-deu tesseract-ocr-eng unpaper
#
# Exit status:
#   0  help shown, or the run finished (a JSON metadata file next to the
#      output reports "success" or "failed")
#   1  invalid arguments, existing output without -w, temp dir or scan failure

# ----- defaults (overridable through command-line options) -----
OUTPUT="scan_$(date +%Y-%m-%d-%H-%M-%S).pdf"
HELP=0
VERBOSE=0

# scanner params
DEVICE=pfusp
RESOLUTION=400
MODE=Lineart

# ocr params
OCR_LANGUAGE=deu

OVERWRITE_OUTPUT_FILE=0

# Set once the working directory has been created; empty until then.
TMP_DIR=

# Print a message to stderr and abort with exit status 1.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# Remove the temporary working directory (installed as EXIT trap).
cleanup() {
  if [[ -n "$TMP_DIR" ]]; then
    rm -rf -- "$TMP_DIR"
  fi
}

# Print the usage text to stdout.
usage() {
  printf '%s [OPTIONS]... [OUTPUT]\n' "${0##*/}"
  cat << 'EOF'

OPTIONS
 -h, --help
   Show this help and exit
 -x, --device
   Override scanner device name, defaulting to "pfusp"
 -m, --mode
   Mode e.g. Lineart (default), Halftone, Gray, Color, etc.
 -r, --resolution
   Resolution e.g 400 (default)
 -l, --language <lang>
   which language to use for OCR (default: deu)

OUTPUT
 -o, --output <outputfile>
   Output to named file (default: scan_<timestamp>.pdf)
 -w, --overwrite-output-file
   Overwrite the output pdf file, if it already exists
 -v, --verbose
EOF
}

# Abort unless the option in $1 is followed by a value.
# Arguments: $1 - option name, $2 - number of arguments left (option included)
require_value() {
  if (( $2 < 2 )); then
    die "Option $1 requires an argument. Aborting."
  fi
}

# Run a command; discard its stderr unless --verbose was given.
# Replaces the former `eval ... 2> /dev/null` string, so arguments are
# passed through verbatim and never re-parsed by the shell.
run_quiet() {
  if (( VERBOSE )); then
    "$@"
  else
    "$@" 2> /dev/null
  fi
}

# Escape backslashes and double quotes so $1 can sit inside a JSON string.
json_escape() {
  local text=$1
  text=${text//\\/\\\\}
  text=${text//\"/\\\"}
  printf '%s' "$text"
}

# Write the JSON metadata file next to the output PDF.
# Arguments: $1 - status string ("success" or "failed"), $2 - page count
# Globals:   OUTPUT, MODE, RESOLUTION (read)
write_metadata() {
  local status=$1
  local pages=$2
  local json_file="${OUTPUT%.pdf}.json"

  cat > "$json_file" << EOF
{
  "status": "$status",
  "pages": $pages,
  "mode": "$(json_escape "$MODE")",
  "quality": $RESOLUTION
}
EOF
  echo "Metadata saved to: $json_file"
}

# Clean up one scanned page and OCR it into "<image_file>.pdf".
# Blank pages (>= 99.8 % white and no orientation detected) produce no PDF.
# Arguments: $1 - path of the raw image written by scanadf
# Globals:   RESOLUTION, OCR_LANGUAGE, VERBOSE (read)
# Returns:   1 if the image could not be converted to TIFF, 0 otherwise
process_page() {
  local image_file=$1
  local tiff_file="${image_file}.tiff"
  local orientation_result percentage_white is_empty_page
  local -a unpaper_args=()

  echo "Process ${image_file##*/}"

  # unpaper (best effort: a failure here does not stop the page)
  if (( ! VERBOSE )); then
    unpaper_args+=(--quiet)
  fi
  run_quiet unpaper "${unpaper_args[@]}" --overwrite --dpi "$RESOLUTION" \
    "$image_file" "$image_file"

  # convert to tiff
  if ! convert -density "${RESOLUTION}x${RESOLUTION}" -units PixelsPerInch \
    "$image_file" "$tiff_file"; then
    echo >&2 "Could not convert ${image_file##*/} to TIFF, skipping page"
    rm -f -- "$image_file"
    echo ""
    return 1
  fi
  rm -f -- "$image_file"

  # orientation detection (tesseract OSD mode; empty result if it fails)
  orientation_result=$(run_quiet tesseract "$tiff_file" - --psm 0) \
    || orientation_result=
  if [[ "$orientation_result" == *"Rotate: 180"* ]]; then
    echo "Image orientation is upside down, rotate"
    convert "$tiff_file" -rotate 180 "$tiff_file"
  fi

  # empty page detection
  percentage_white=$(convert "$tiff_file" -fuzz 0% -negate -threshold 0 \
    -negate -format "%[fx:100*mean]" info:) || percentage_white=0
  # awk performs the floating-point comparison (no dependency on bc)
  is_empty_page=$(awk -v white="$percentage_white" \
    'BEGIN { if (white + 0 >= 99.8) print 1; else print 0 }')

  if [[ "$is_empty_page" == 1 && -z "$orientation_result" ]]; then
    echo "Empty page removed"
  else
    run_quiet tesseract "$tiff_file" "$image_file" -l "$OCR_LANGUAGE" pdf
  fi
  rm -f -- "$tiff_file"
  echo ""
  return 0
}

# ----- parse command-line options -----
while (( $# > 0 )); do
  case "$1" in
    -h|--help)
      HELP=1
      ;;
    -v|--verbose)
      VERBOSE=1
      ;;
    -o|--output)
      require_value "$1" "$#"
      shift
      OUTPUT=$1
      ;;
    -x|--device)
      require_value "$1" "$#"
      shift
      DEVICE=$1
      ;;
    -m|--mode)
      require_value "$1" "$#"
      shift
      MODE=$1
      ;;
    -r|--resolution)
      require_value "$1" "$#"
      shift
      RESOLUTION=$1
      ;;
    -l|--language)
      require_value "$1" "$#"
      shift
      OCR_LANGUAGE=$1
      ;;
    -w|--overwrite-output-file)
      OVERWRITE_OUTPUT_FILE=1
      ;;
    *)
      # Unknown arguments are still ignored, but no longer silently.
      echo >&2 "Ignoring unrecognized argument: $1"
      ;;
  esac
  shift # next option
done

if (( HELP )); then
  usage
  exit 0
fi

# The resolution is written unquoted into the JSON metadata and passed to
# several tools, so insist on a plain number.
if [[ ! "$RESOLUTION" =~ ^[0-9]+(\.[0-9]+)?$ ]]; then
  die "Resolution must be a number, got \"$RESOLUTION\". Aborting."
fi

if [[ -z "$OUTPUT" ]]; then
  die "Output file must be specified. Aborting."
fi

if [[ -f "$OUTPUT" ]]; then
  if (( OVERWRITE_OUTPUT_FILE == 0 )); then
    die "Output file $OUTPUT already exists. Aborting."
  fi
  rm -- "$OUTPUT"
fi

# ----- temporary working directory -----
# An empty -p argument makes GNU mktemp use $TMPDIR, or /tmp if unset.
TMP_DIR=$(mktemp -d -p "" scan.XXXXXXXXXX) \
  || die "Could not create temporary directory. Aborting."
trap cleanup EXIT

# ----- scan -----
echo >&2 "Scanning..."
if ! scanadf --device-name "$DEVICE" --source Adf-duplex \
  --resolution "$RESOLUTION" --mode "$MODE" -o "$TMP_DIR/scan-%04d"; then
  exit 1
fi
echo ""

# ----- process pages -----
# nullglob: a glob without matches expands to an empty array.
shopt -s extglob nullglob
image_files=("$TMP_DIR"/scan-[0-9]*)
num_scans=${#image_files[@]}
num_pdf_files=0

if (( num_scans > 0 )); then
  echo "Processing $num_scans pages"

  # Process images in normal order
  for image_file in "${image_files[@]}"; do
    process_page "$image_file"
  done

  # Collect PDF files in REVERSE order to fix the page sequence
  pdf_files=("$TMP_DIR"/scan-[0-9]*.pdf)
  num_pdf_files=${#pdf_files[@]}

  if (( num_pdf_files > 0 )); then
    # Reverse the array to fix page order
    reversed_pdf_files=()
    for (( i = num_pdf_files - 1; i >= 0; i-- )); do
      reversed_pdf_files+=("${pdf_files[i]}")
    done

    if (( num_pdf_files == 1 )); then
      echo "Creating output PDF..."
      mv -- "${reversed_pdf_files[0]}" "$OUTPUT"
    else
      echo "Concatenating $num_pdf_files PDFs in correct order..."
      pdfunite "${reversed_pdf_files[@]}" "$OUTPUT"
    fi
  fi
fi

# ----- report -----
if [[ -f "$OUTPUT" ]]; then
  echo "Done. Output saved to: $OUTPUT"
  # Create JSON metadata file
  write_metadata "success" "$num_pdf_files"
else
  echo "No scans found."
  # Create JSON metadata file for failed scan
  write_metadata "failed" 0
fi