#!/bin/sh -

#--
# External parser for HTDIG that parses Word files so they can
# be indexed.
#--
# Written by Richard W.M. Jones <rjones@imcl.com>. Distributed freely
# under the terms of the GNU General Public License (GPL).
# Modified by Andrew M. Bishop <amb@gedanken.demon.co.uk>
#--

#----------------------------------------------------------------------
# Configurable stuff here:

# Command used to turn a Word file into plain text. The default is
# ``catdoc'' by Victor Wagner <vitus@agropc.msk.su>; plain ``strings''
# is a possible alternative (uncomment the second line to use it).
CATDOC='/usr/local/bin/catdoc'
#CATDOC=strings

# End of configurable stuff.
#----------------------------------------------------------------------

# Arguments are:
#   $1 = input file
#   $2 = content type (ignored)
#   $3 = base URL
#   $4 = HTDIG config file (ignored)
# HTDIG expects us to print out:
#   w WORD LOCATION HEADING    Word at location 0-1000 under heading
#   u URL DESCRIPTION          URL with description
#   t TITLE                    Title of document
#   h HEAD                     Heading
#   a ANCHOR                   Anchor (ie. like <a name="">)
#   i IMAGE_URL                Image pointer

#----------------------------------------------------------------------

# Reformat stdin so that stdout carries exactly one word per line.
#
# Separators turned into newlines: space, tab (\011), backspace (\010)
# and the literal characters '[' and ']'. The brackets are ordinary
# members of the tr set, not grouping syntax, and are kept as
# separators on purpose. Backspace is kept because the historical set
# contained it.
# NOTE(review): \010 may have been a typo for tab; confirm before
# dropping it.
#
# The replacement uses the POSIX "[c*]" form so every member of the
# first set maps to a newline; a replacement string shorter than the
# source set has unspecified behaviour in POSIX tr.
#
# awk then discards the empty lines produced by runs of separators.

wordPerLine () {
    tr '[ \010\011]' '[\012*]' | awk 'NF == 1'
}

# Replace every byte that is not an ASCII letter or digit with a space.
#
# Reads stdin and writes stdout. Newlines and carriage returns are
# replaced like any other non-alphanumeric byte, so the output is a
# single run of space-separated words with no terminating newline.
#
# The tr set is written without surrounding brackets: tr takes a plain
# list of characters, so '[' and ']' would be literal set members and
# would be left in the output. LC_ALL=C pins the a-z, A-Z and 0-9
# ranges to their ASCII meaning whatever locale the caller runs in.

removeNonAlNum () {
    LC_ALL=C tr -c 'a-zA-Z0-9' ' '
}

#----------------------------------------------------------------------

# Parse the input file into a linear list of words, one per line.
#
# The list is staged in a temporary file because the total number of
# words must be known before the first word record can be printed.
# mktemp creates the file under an unpredictable name; a fixed
# /tmp/htparsedoc.<pid> name could be pre-created or symlinked by
# another local user.
tmpfile=$(mktemp "${TMPDIR:-/tmp}/htparsedoc.XXXXXX") || {
    echo "$0: cannot create temporary file" >&2
    exit 1
}

# Remove the temporary file however the script ends. The signal trap
# turns HUP/INT/TERM into a normal exit so that the EXIT trap runs.
trap 'rm -f -- "$tmpfile"' EXIT
trap 'exit 1' HUP INT TERM

# $CATDOC is left unquoted so that it may carry options; the input
# path is quoted so that names containing blanks or glob characters
# reach the converter as a single argument.
$CATDOC "$1" | removeNonAlNum | wordPerLine > "$tmpfile"

# Compute length of list (awk prints a bare integer, 0 for no input).
filelen=$(awk 'END { print NR }' "$tmpfile")

# We can't find the title from the document, so make one up.
# printf is used instead of echo so that the base URL is emitted
# verbatim even if it contains backslashes.
printf 't\tBinary Document %s\n' "$3"

# We can't make an excerpt so we make one up.
printf 'h\tNo excerpt available\n'

# Pass words to htdig. Each word's location is its position in the
# list scaled to 1000 (the last word gets exactly 1000). The count is
# handed to awk with -v rather than spliced into the program text.
# Skipping an empty list also avoids a division by zero.
if [ "$filelen" -gt 0 ]; then
    awk -v total="$filelen" \
        '{ printf("w\t%s\t%d\t-\t\n", $1, 1000 * NR / total) }' \
        "$tmpfile"
fi
