#!/bin/bash
# Script to control an ADF scanner
# - start scanning and create a single pdf file
# - with empty page and orientation detection
# - tested with Fujitsu SP-1120
#
# ... excessively borrowed from https://github.com/rocketraman/sane-scan-pdf
#
# Version: 0.2 (headless, fixed page order)
# Date: 2025-02-08
# License: GNU General Public License
# Modified by: moritzrfs
# Original Author: Eric Scheibler
# E-Mail: email [at] eric-scheibler [dot] de
# URL: http://eric-scheibler.de/en/blog/2015/04/script-to-extract-text-from-images-and-scanned-pdf-files/
#
# Install:
# sudo apt install bc imagemagick poppler-utils sane tesseract-ocr tesseract-ocr-deu tesseract-ocr-eng unpaper

# Default settings. Most of them can be overridden on the command line.

# Default output: a timestamped PDF inside $SCAN_DIR, which is a path relative
# to the current working directory and is created on demand.
SCAN_DIR="scans"
if ! mkdir -p -- "$SCAN_DIR"; then
  echo >&2 "Cannot create scan directory $SCAN_DIR. Aborting."
  exit 1
fi
OUTPUT="$SCAN_DIR/scan_$(date +%Y-%m-%d-%H-%M-%S).pdf"
HELP=0
VERBOSE=0

# scanner params
DEVICE=pfusp
RESOLUTION=400
MODE=Lineart

# ocr params
OCR_LANGUAGE=deu
OVERWRITE_OUTPUT_FILE=0

#####

# Private working directory for the raw scans and all intermediate files.
# The empty -p argument makes mktemp use $TMPDIR, or /tmp when it is unset.
TMP_DIR=$(mktemp -d -p "" scan.XXXXXXXXXX) || {
  echo >&2 "Cannot create temporary directory. Aborting."
  exit 1
}

# Remove the working directory. Installed as EXIT trap below, so it runs on
# every exit path of the script.
cleanup() {
  # Never hand an empty path to rm -rf.
  if [[ -n "$TMP_DIR" ]]; then
    rm -rf -- "$TMP_DIR"
  fi
}
trap cleanup EXIT

# Parse command-line options
#
# parse_args [ARG]...
# Updates the global settings (HELP, VERBOSE, OUTPUT, DEVICE, MODE,
# RESOLUTION, OCR_LANGUAGE, OVERWRITE_OUTPUT_FILE) from the given arguments.
# Returns 1 when an option that takes a value is the last argument.
# Unknown arguments are reported on stderr and otherwise ignored.
parse_args() {
  while (( $# > 0 )); do
    case "$1" in
      -h|--help) HELP=1 ;;
      -v|--verbose) VERBOSE=1 ;;
      -w|--overwrite-output-file) OVERWRITE_OUTPUT_FILE=1 ;;
      -o|--output|-x|--device|-m|--mode|-r|--resolution|-l|--language)
        if (( $# < 2 )); then
          echo >&2 "Option $1 requires an argument. Aborting."
          return 1
        fi
        case "$1" in
          -o|--output) OUTPUT=$2 ;;
          -x|--device) DEVICE=$2 ;;
          -m|--mode) MODE=$2 ;;
          -r|--resolution) RESOLUTION=$2 ;;
          -l|--language) OCR_LANGUAGE=$2 ;;
        esac
        shift # consume the option's value
        ;;
      *)
        echo >&2 "Ignoring unknown argument: $1"
        ;;
    esac
    shift # next option
  done
}

parse_args "$@" || exit 1

# print_usage
# Writes the command-line help to stdout.
print_usage() {
  cat <<EOF
$(basename -- "$0") [OPTIONS]...

OPTIONS
  -h, --help
      Show this help and exit
  -x, --device <device>
      Override scanner device name, defaulting to "pfusp"
  -m, --mode <mode>
      Mode e.g. Lineart (default), Halftone, Gray, Color, etc.
  -r, --resolution <dpi>
      Resolution e.g 400 (default)
  -l, --language <lang>
      which language to use for OCR (default: deu)

OUTPUT
  -o, --output <outputfile>
      Output to named file (default: ${SCAN_DIR:-scans}/scan_<timestamp>.pdf)
  -w, --overwrite-output-file
      Overwrite the output pdf file, if it already exists
  -v, --verbose
      Do not suppress the messages of the image processing tools
EOF
}

if [[ "$HELP" == 1 ]]; then
  print_usage
  exit 0
fi

# Non-verbose mode: prepare a "quiet" option for unpaper and a stderr
# redirection. The redirection is stored as plain text, so it only takes
# effect where a command line containing it is passed through eval.
# Both variables are always defined; they stay empty in verbose mode.
quiet_param=""
suppress_error_messages=""
if [[ "$VERBOSE" == 0 ]]; then
  quiet_param="--quiet"
  suppress_error_messages="2> /dev/null"
fi

# Validate the output path before the scan starts, so that a bad path is
# reported immediately instead of after all pages have been processed.
if [[ -z "$OUTPUT" ]]; then
  echo >&2 "Output file must be specified. Aborting."
  exit 1
fi

output_dir=$(dirname -- "$OUTPUT")
if [[ ! -d "$output_dir" ]]; then
  echo >&2 "Output directory $output_dir does not exist. Aborting."
  exit 1
fi

if [[ -f "$OUTPUT" ]]; then
  if [[ "$OVERWRITE_OUTPUT_FILE" == 0 ]]; then
    echo >&2 "Output file $OUTPUT already exists. Aborting."
    exit 1
  fi
  # Overwriting was requested: remove the old file now, because the end of
  # the script treats an existing output file as proof of a successful run.
  if ! rm -- "$OUTPUT"; then
    echo >&2 "Cannot remove existing output file $OUTPUT. Aborting."
    exit 1
  fi
fi

# Scan all sheets from the document feeder (both sides) into
# $TMP_DIR/scan-0001, scan-0002, ... and abort if the scanner reports an error.
echo >&2 "Scanning..."
if ! scanadf --device-name "$DEVICE" --source Adf-duplex \
    --resolution "$RESOLUTION" --mode "$MODE" -o "$TMP_DIR/scan-%04d"; then
  echo >&2 "Scanning failed. Aborting."
  exit 1
fi
echo ""

# nullglob: a pattern without matches expands to nothing, so the arrays below
# are empty (instead of holding the literal pattern) when no file exists.
shopt -s extglob nullglob

# run_tool CMD [ARG]...
# Runs CMD with its arguments. Its stderr is discarded unless verbose output
# was requested (VERBOSE != 0). Returns the exit status of CMD.
run_tool() {
  if [[ "$VERBOSE" == 0 ]]; then
    "$@" 2> /dev/null
  else
    "$@"
  fi
}

# process_page IMAGE_FILE
# Turns one raw scan into "IMAGE_FILE.pdf" with an OCR text layer:
# unpaper -> TIFF conversion -> upside-down detection -> empty page check ->
# OCR. IMAGE_FILE itself and the intermediate TIFF are deleted. No PDF is
# produced for a page that is detected as empty or for which OCR fails.
process_page() {
  local image_file=$1
  local tiff_file="${image_file}.tiff"
  local orientation_result percentage_white is_empty_page
  local -a unpaper_opts=(--overwrite --dpi "$RESOLUTION")
  if [[ "$VERBOSE" == 0 ]]; then
    unpaper_opts=(--quiet "${unpaper_opts[@]}")
  fi

  echo "Process ${image_file##*/}"

  # unpaper (input and output are the same file, hence --overwrite)
  run_tool unpaper "${unpaper_opts[@]}" "$image_file" "$image_file"

  # convert to tiff
  convert -density "${RESOLUTION}x${RESOLUTION}" -units PixelsPerInch \
    "$image_file" "$tiff_file"
  rm -- "$image_file"

  # orientation detection; the result stays empty when tesseract fails
  orientation_result=$(run_tool tesseract "$tiff_file" - --psm 0) \
    || orientation_result=""
  if [[ "$orientation_result" == *"Rotate: 180"* ]]; then
    echo "Image orientation is upside down, rotate"
    convert -rotate 180 "$tiff_file" "$tiff_file"
  fi

  # empty page detection: a page is dropped only if it is at least 99.8%
  # white AND the orientation detection produced no output for it
  percentage_white=$(convert "$tiff_file" -fuzz 0% -negate -threshold 0 \
    -negate -format "%[fx:100*mean]" info:) || percentage_white=0
  is_empty_page=$(echo "$percentage_white >= 99.8" | bc -l)
  if [[ "$is_empty_page" == 1 && -z "$orientation_result" ]]; then
    echo "Empty page removed"
  elif ! run_tool tesseract "$tiff_file" "$image_file" -l "$OCR_LANGUAGE" pdf; then
    echo >&2 "OCR failed for ${image_file##*/}, page skipped."
  fi
  rm -f -- "$tiff_file"

  echo ""
}

# assemble_pdf
# Collects the per-page PDFs from $TMP_DIR and writes them to $OUTPUT in
# REVERSE file name order, which fixes the page sequence. Sets the global
# num_pdf_files to the number of pages found; does nothing else when it is 0.
assemble_pdf() {
  local -a pdf_files=("$TMP_DIR"/scan-[0-9]*.pdf)
  local -a reversed_pdf_files=()
  local i

  num_pdf_files=${#pdf_files[@]}
  if (( num_pdf_files == 0 )); then
    return 0
  fi

  # Reverse the array to fix page order
  for (( i = num_pdf_files - 1; i >= 0; i-- )); do
    reversed_pdf_files+=("${pdf_files[i]}")
  done

  if (( num_pdf_files == 1 )); then
    echo "Creating output PDF..."
    mv -- "${reversed_pdf_files[0]}" "$OUTPUT"
  else
    echo "Concatenating $num_pdf_files PDFs in correct order..."
    pdfunite "${reversed_pdf_files[@]}" "$OUTPUT"
  fi
}

num_pdf_files=0
image_files=("$TMP_DIR"/scan-[0-9]*)
num_scans=${#image_files[@]}

if (( num_scans > 0 )); then
  echo "Processing $num_scans pages"

  # Process images in normal order
  for image_file in "${image_files[@]}"; do
    process_page "$image_file"
  done

  assemble_pdf
fi

# write_metadata STATUS PAGES
# Writes a small JSON report next to the output file: the path is $OUTPUT
# with a trailing ".pdf" replaced by ".json" (stored in the global JSON_FILE).
# The report contains STATUS, PAGES, the scan mode and the resolution.
# Returns 1 if the file cannot be written.
write_metadata() {
  local status=$1
  local pages=$2
  local mode_json quality_json

  # MODE is user input: escape backslashes and double quotes so that the
  # result is still a valid JSON string.
  mode_json=${MODE//\\/\\\\}
  mode_json=${mode_json//\"/\\\"}

  # Emit the resolution as a JSON number only if it really is one; anything
  # else is written as an escaped JSON string to keep the file parseable.
  if [[ "$RESOLUTION" =~ ^(0|[1-9][0-9]*)$ ]]; then
    quality_json=$RESOLUTION
  else
    quality_json=${RESOLUTION//\\/\\\\}
    quality_json=${quality_json//\"/\\\"}
    quality_json="\"${quality_json}\""
  fi

  JSON_FILE="${OUTPUT%.pdf}.json"
  if ! printf '{\n  "status": "%s",\n  "pages": %s,\n  "mode": "%s",\n  "quality": %s\n}\n' \
      "$status" "$pages" "$mode_json" "$quality_json" > "$JSON_FILE"; then
    echo >&2 "Cannot write metadata file $JSON_FILE."
    return 1
  fi
  echo "Metadata saved to: $JSON_FILE"
}

# The existence of the output file decides between success and failure.
if [[ -f "$OUTPUT" ]]; then
  echo "Done. Output saved to: $OUTPUT"
  write_metadata "success" "${num_pdf_files:-0}"
else
  echo "No scans found."
  write_metadata "failed" 0
fi