#! /bin/sh

#================================================================
# estpdfhtml
# Strip a file of PDF and extract its text as HTML.
#
# Usage:
#   estpdfhtml [infile] [outfile]
#   estpdfhtml --help
#
# Arguments:
#   infile   PDF file to read; when omitted, the PDF is read from
#            standard input and spooled into a temporary file.
#   outfile  file the HTML is appended to; when omitted, the HTML
#            is written to standard output.
#
# Environment:
#   ESTORIG  when set, "$ESTORIG.alt" is used as the alternative
#            document instead of "<infile>.alt".
#
# External commands: pdftotext, iconv, awk, mktemp (stdin mode only).
#================================================================


# set variables
LANG=C ; export LANG
LC_ALL=C ; export LC_ALL
progname="estpdfhtml"
tmpfile=""
infile="$1"
outfile="$2"


# show help message
if [ "$1" = "--help" ]
then
  printf 'Strip a file of PDF and extract its text as HTML.\n'
  printf '\n'
  printf 'Usage:\n'
  printf ' %s [infile] [outfile]\n' "$progname"
  printf ' estindex register -xsuf .pdf application/pdf %s casket\n' "$progname"
  printf '\n'
  exit 0
fi


# function to remove the temporary file (no-op when none was created)
tmpclean(){
  if [ -n "$tmpfile" ]
  then
    rm -f "$tmpfile"
  fi
}


# function to write the result: to standard output when no output file
# was given, otherwise appended to the output file
output(){
  if [ -z "$outfile" ]
  then
    cat
  else
    cat >> "$outfile"
  fi
}


# set the exit trap
# The cleanup runs on every exit path; the listed signals (HUP, INT, QUIT,
# PIPE, TERM) terminate the script instead of letting it continue.
trap tmpclean 0
trap 'exit 1' 1 2 3 13 15


# check the input file existence
if [ -n "$infile" ] && [ ! -f "$infile" ]
then
  printf '%s: %s: no such file\n' "$progname" "$infile" 1>&2
  exit 1
fi


# create the temporary file
# mktemp gives an unpredictable name, which avoids symlink/clobber races
# on a world-writable temporary directory.
if [ -z "$infile" ]
then
  tmpfile=$(mktemp "${TMPDIR:-/tmp}/$progname.XXXXXX") || {
    printf '%s: cannot create a temporary file\n' "$progname" 1>&2
    exit 1
  }
  cat > "$tmpfile"
  infile="$tmpfile"
fi


# function to get the document or its alternative
# $1: path of the PDF file.  If "$ESTORIG.alt" (when ESTORIG is set) or
# "$1.alt" exists, it is emitted verbatim; otherwise pdftotext converts
# the PDF to raw-order UTF-8 text wrapped in HTML meta markup.
getdoc(){
  altfile="$1.alt"
  if [ -n "$ESTORIG" ]
  then
    altfile="$ESTORIG.alt"
  fi
  if [ -f "$altfile" ]
  then
    cat "$altfile"
  else
    pdftotext -raw -htmlmeta -enc UTF-8 -eol unix -q "$1" -
  fi
}


# output the result
# iconv -c drops byte sequences that are not valid UTF-8.  The awk program
# rewrites the <pre> body into paragraphs and joins the physical lines.
getdoc "$infile" |
iconv -f UTF-8 -t UTF-8 -c |
awk '
BEGIN {
  esc = 0    # depth of <pre> nesting; text is escaped while it is positive
  mul = 1    # 1 if the last character handled was above "~" (or at start)
  emp = 0    # number of consecutive empty lines seen so far
}
{
  if(esc < 1 && $0 ~ /^<title>/ && $0 ~ /<\/title>$/){
    # title line in the header: declare the charset and re-emit the title
    printf("<meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\">\n")
    gsub(/<[^>]*>/, "", $0)
    gsub(/&/, "\\&amp;", $0)
    gsub(/</, "\\&lt;", $0)
    gsub(/>/, "\\&gt;", $0)
    printf("<title>%s</title>\n", $0)
  } else if($0 == "<pre>"){
    # start of the text body: open a paragraph
    esc++
    printf("<p>")
    mul = 1
  } else if($0 == "</pre>"){
    # end of the text body: close the paragraph
    esc--
    printf("</p>\n")
  } else if($0 == "\f"){
    # a line holding only a form feed: separate with a horizontal rule
    printf("</p>\n<hr>\n<p>")
  } else {
    if(esc > 0){
      # body text: escape HTML metacharacters and trim surrounding spaces
      gsub(/&/, "\\&amp;", $0)
      gsub(/</, "\\&lt;", $0)
      gsub(/>/, "\\&gt;", $0)
      gsub(/^ */, "", $0)
      gsub(/ *$/, "", $0)
    }
    if(length($0) < 1){
      emp++
    } else if(match($0, /^</)){
      # NOTE(review): the pattern and body of this branch were unreadable
      # in the damaged source and are reconstructed here -- confirm against
      # the upstream script.  Body text has "<" escaped above, so a line
      # still starting with "<" is taken to be markup and passed through.
      printf("%s\n", $0)
    } else {
      # join this line to the previous output: insert a space after two or
      # more empty lines, or when both sides of the join are at or below "~"
      if(emp >= 2 || (mul == 0 && substr($0, 1, 1) <= "~")){
        printf(" ")
      }
      i = 0
      while(i < length($0)){
        c = substr($0, i + 1, 1)
        if(c == " "){
          # a space is kept only when the previous character was not
          # above "~"; otherwise it is dropped
          if(mul == 0){
            printf(" ")
          }
          mul = 0
        } else {
          printf("%c", c)
          mul = c > "~"
        }
        i++
      }
      emp = 0
    }
  }
}
' |
output


# exit normally (the exit trap removes the temporary file)
exit 0



# END OF FILE