#! /bin/sh
#
#  Filename:  getmeta  (QSearch 2.1)
#
#  Other Files Used:  getmeta.subdirs
#
#  Author:  Br. David Carlson
#
#  Original Date of Creation:  January 24, 2001
#
#  Revision History:
#
#  Modified:  May 20, 2003 so that getmeta does not index files that would
#  give dead links in QSearch.  (In previous versions, this could happen if
#  read access was turned on for an HTML file but it was within some directory
#  with x access turned off for the web server.)  This new version of getmeta
#  also handles directory names containing spaces.
#
#  Modified:  January 25, 2003 to handle meta tag information that spans
#  more than one line.  Can also handle data that comes on the same line
#  as the meta tag but before that tag, as well as data that comes after the
#  closing > for the meta tag data but on the same line as the >.
#
#  Modified:  January 5, 2003 to expand the Dec 22 change so that multiple
#  spaces after a comma before a keyword get removed.  Also processes files
#  with both the .html and .htm extensions.
#
#  Modified:  December 22, 2002 to allow users to put a space after a comma
#  that comes before a keyword in the meta tag section of an HTML file.
#  This requires that the removespace executable be in the same directory
#  as this getmeta script.  This directory location must be placed in the
#  variable GETMETAHOME below.
#
#  Original Description:
#
#  This shell script extracts the information from the DESCRIPTION and
#  KEYWORDS meta tags of all html files in the TARGET directory and in the
#  subtrees rooted at any of the directories in SUBDIRS.  The information
#  is written to the file KEYWORDFILE.  This script should be run by root
#  (probably automatically as a crontab entry).
#
#  Warranty/Copyright/Usage Information:
#
#  This software is provided "as is" without warranty of any kind.  The user
#  assumes all of the risk in using the software.  Although the author retains
#  the copyright to the software, blanket permission is given to freely use,
#  copy, and modify the software as long as the files still contain the author's
#  copyright notice; and the software, or any modification of it, remains as
#  freeware or shareware.  Should you wish to use this code as part of a
#  commercial product, contact the author, carlsond@stvincent.edu, for permission.
#
#  Copyright 2001 - 2003, Br. David Carlson, St. Vincent Archabbey, Latrobe, PA


#  This function processes one file.  If it is an accessible HTML file with appropriate META
#  tag info, it is indexed by having its META tag data appended to the file given by $TMPKEYFILE.
#  A file without a KEYWORDS meta tag is not indexed at all.
#  Input parameters:
#     1  The full pathname for the file to be processed.
#  Globals read:
#     LINES        number of lines at the head of the file that are scanned for META tags
#     START        first character position of the pathname that gets recorded
#     GETMETAHOME  directory that holds the removespace executable
#     TMPKEYFILE   file to which the index line is appended
#  Format of the line written:  filename|description#keyword#keyword...#
ProcessFile()
   {
   local result
   local HEADDATA
   local METADATA
   local KEYS
   local DESCRIP
   local filename

   # Character 8 of the "ls -l" mode string is the read bit for "other" users.
   result=$(ls -l "$1" | cut -c8)
   if [ "$result" = "r" ]
   then
      # Join the head of the file into one line, then split at every < and > so that
      # each tag sits on a line of its own, even if it spanned several lines in the file.
      HEADDATA=$(head -n "$LINES" "$1" | tr '\n' ' ' | tr '<>' '\n')
      METADATA=$(printf '%s\n' "$HEADDATA" | grep -i '^meta ')
      KEYS=$(printf '%s\n' "$METADATA" | grep -i '="keywords"')
      if [ -n "$KEYS" ]
      then
         DESCRIP=$(printf '%s\n' "$METADATA" | grep -i '="description"')
         # The tag is expected to look like:  meta name="..." content="..."
         # so the content starts at the third =-separated field.  Use -f3- (not -f3)
         # so that content which itself contains an = sign is not cut short.
         KEYS=$(printf '%s\n' "$KEYS" | cut -d '=' -f3- | tr ',' '#' | tr -d '"' | tr -s ' ' | "$GETMETAHOME/removespace")
         DESCRIP=$(printf '%s\n' "$DESCRIP" | cut -d '=' -f3- | tr -d '"')
         filename=$(printf '%s\n' "$1" | cut -c "${START}-")
         # Note removal of carriage returns (octal 15) in case we have a DOS/Windows text file:
         printf '%s\n' "${filename}|${DESCRIP}#${KEYS}#" | tr -d '\015' >> "$TMPKEYFILE"
      fi
   fi
   }


#  Function to process a directory.  Every HTML file in it (extension .htm or .html in
#  any mix of upper and lower case) is handed to ProcessFile, provided the directory has
#  execute (search) permission for "other" users.  If requested, the same is then done
#  recursively for each subdirectory.  Hidden entries (names starting with a dot) are
#  skipped, and symbolic links to directories are not followed (this also prevents loops).
#  Input parameters:
#     1   the complete pathname of the directory to process
#     2   yes or no, indicating whether or not to try to process subdirectories
#  Returns nonzero, having indexed nothing, if the directory cannot be entered.
ProcessDir()
   {
   local ACCESS
   local OLDDIR
   local CURDIR
   local CHECK
   local ENTRY

   # The global call counter is still advanced once per call; nothing in this
   # function depends on its value.
   count=$((count + 1))

   OLDDIR=$(pwd)
   # Without this check a failed cd would leave us in the caller's directory,
   # and that directory would be indexed in place of the requested one.
   cd "$1" || return 1
   CURDIR=$(pwd)

   # Character 10 of the "ls -ld" mode string is the execute bit for "other" users.
   # It reads "t" when the directory is both searchable by others and has the sticky bit.
   ACCESS=$(ls -ld "$CURDIR" | cut -c10)
   case "$ACCESS" in
      x|t)
         CHECK="yes"
         ;;
      *)
         CHECK="no"
         ;;
   esac

   # Handle HTML files if called for:
   if [ "$CHECK" = "yes" ]
   then
      # Let the shell expand the names rather than parsing ls output.  If the
      # directory is empty the pattern stays as a literal * and matches nothing below.
      for ENTRY in *
      do
         case "$ENTRY" in
            *.[hH][tT][mM]|*.[hH][tT][mM][lL])
               # Skip directories (or other non-files) that merely have an HTML-like name.
               if [ -f "${CURDIR}/$ENTRY" ]
               then
                  ProcessFile "${CURDIR}/$ENTRY"
               fi
               ;;
         esac
      done

      # Handle subdirectories:
      if [ "$2" = "yes" ]
      then
         # The word list is expanded once, before the loop starts, so the directory
         # changes made by the recursive calls do not disturb it.
         for ENTRY in *
         do
            if [ -d "${CURDIR}/$ENTRY" ] && [ ! -L "${CURDIR}/$ENTRY" ]
            then
               ProcessDir "${CURDIR}/$ENTRY" "yes"
            fi
         done
      fi
   fi

   cd "$OLDDIR"
   }


# The script begins execution here:

#*******************  Change this section as needed: ***************************************

# number of lines at head of each HTML file to look at in finding META tag data:
# NOTE(review): bash also maintains a variable named LINES (the terminal height) and may
# reset it when its checkwinsize option is on and a terminal is attached -- consider
# renaming this variable together with its use in ProcessFile; TODO confirm on the
# shell that actually runs this script.
LINES=30

# target directory (where to begin the indexing of HTML files):
TARGET="/www"

# start recording filenames with column 5, 1 more than TARGET's length:
START=5

# path to the KEYFILE that holds the indexed data:
KEYFILE="/www/cgi-bin/keywordfile"

# This getmeta script and the removespace executable should both be in the location
# specified by the GETMETAHOME variable:
GETMETAHOME="/usr/local/bin"

# file containing list of subdirectories to be indexed:
# file is assumed to be within the directory given by GETMETAHOME
SUBDIRFILE="getmeta.subdirs"

# Location for temp file.  mktemp creates it with an unpredictable name, so that a
# file or symbolic link planted in /tmp beforehand cannot be appended to by this script:
TMPKEYFILE=$(mktemp /tmp/keyfile.XXXXXX) || exit 1

#*******************  End of section to be changed. ****************************************

# Remove the temp file however the script ends.
trap 'rm -f "$TMPKEYFILE"' EXIT
trap 'exit 1' HUP INT TERM

status=0
count=0
ProcessDir "$TARGET" "no"

# Loop through the subdirectories and call ProcessDir on each.

if [ -r "${GETMETAHOME}/$SUBDIRFILE" ]
then
   # The extra test after "read" picks up a last line that lacks a trailing newline.
   while read -r subdir || [ -n "$subdir" ]
   do
      # Skip blank lines; an empty name is not a directory to index.
      if [ -n "$subdir" ]
      then
         ProcessDir "$subdir" "yes"
      fi
   done < "${GETMETAHOME}/$SUBDIRFILE"
else
   echo "getmeta: cannot read ${GETMETAHOME}/$SUBDIRFILE" >&2
fi

if [ -s "$TMPKEYFILE" ]
then
   # cat with a redirect (rather than cp) so that a newly created KEYFILE gets the
   # normal umask-based permissions and not the private mode of the mktemp file.
   cat "$TMPKEYFILE" > "$KEYFILE" || status=1
else
   # Nothing was indexed: keep the existing KEYFILE rather than replacing it
   # with an empty one.
   echo "getmeta: no META tag data found; $KEYFILE left unchanged" >&2
fi

exit "$status"