#! /bin/sh
#
# Filename:  getmeta   (QSearch 2.1)
#
# Other Files Used:  getmeta.subdirs
#
# Author:  Br. David Carlson
#
# Original Date of Creation:  January 24, 2001
#
# Revision History:
#
# Modified:  May 20, 2003 so that getmeta does not index files that would
# give dead links in QSearch.  (In previous versions, this could happen if
# read access was turned on for an HTML file but it was within some directory
# with x access turned off for the web server.)  This new version of getmeta
# also handles directory names containing spaces.
#
# Modified:  January 25, 2003 to handle meta tag information that spans
# more than one line.  Can also handle data that comes on the same line
# as the meta tag but before that tag, as well as data that comes after the
# closing > for the meta tag data but on the same line as the >.
#
# Modified:  January 5, 2003 to expand the Dec 22 change so that multiple
# spaces after a comma before a keyword get removed.  Also processes files
# with both the .html and .htm extensions.
#
# Modified:  December 22, 2002 to allow users to put a space after a comma
# that comes before a keyword in the meta tag section of an HTML file.
# This requires that the removespace executable be in the same directory
# as this getmeta script.  This directory location must be placed in the
# variable GETMETAHOME below.
#
# Original Description:
#
# This shell script extracts the information from the DESCRIPTION and
# KEYWORDS meta tags of all html files in the TARGET directory and in the
# subtrees rooted at any of the directories in SUBDIRS.  The information
# is written to the file KEYWORDFILE.  This script should be run by root
# (probably automatically as a crontab entry).
#
# Warranty/Copyright/Usage Information:
#
# This software is provided "as is" without warranty of any kind.  The user
# assumes all of the risk in using the software.
# Although the author retains
# the copyright to the software, blanket permission is given to freely use,
# copy, and modify the software as long as the files still contain the author's
# copyright notice; and the software, or any modification of it, remains as
# freeware or shareware.  Should you wish to use this code as part of a
# commercial product, contact the author, carlsond@stvincent.edu, for permission.
#
# Copyright 2001 - 2003, Br. David Carlson, St. Vincent Archabbey, Latrobe, PA


# This function processes one file.  If it is an accessible HTML file with
# appropriate META tag info, it is indexed by having its META tag data
# appended to the file given by $TMPKEYFILE as one record of the form
#    <path>|<description>#<keyword>#<keyword>#...#
# Input parameter:
# 1  The full pathname for the file to be processed.
# Globals read:
#    LINES        number of lines at the head of the file to examine
#    START        first column of the pathname to record
#    GETMETAHOME  directory holding the removespace executable
#    TMPKEYFILE   file that receives the record
ProcessFile()
{
   local result
   local HEADDATA
   local METADATA
   local KEYS
   local DESCRIP
   local filename

   # Column 8 of the ls mode string is the read bit for "other" users.
   # -d keeps ls from listing the contents should $1 be a directory.
   result=$(ls -ld "$1" | cut -c8)
   if [ "$result" = "r" ]
   then
      # Join the head of the file into one line, then start a new line at
      # every < and >, so each tag sits on a line of its own even when it
      # spanned several lines in the file.
      HEADDATA=$(head -n "$LINES" "$1" | tr '\n' ' ' | tr '<>' '\n\n')
      METADATA=$(printf '%s\n' "$HEADDATA" | grep -i "^meta ")
      KEYS=$(printf '%s\n' "$METADATA" | grep -i '="keywords"')
      if [ -n "$KEYS" ]
      then
         DESCRIP=$(printf '%s\n' "$METADATA" | grep -i '="description"')
         # The content value starts at the third =-separated field.  Take
         # that field and everything after it (-f3-) so a value that itself
         # contains = is not cut short.
         KEYS=$(printf '%s\n' "$KEYS" | cut -d '=' -f3- | tr ',' '#' \
            | tr -d '"' | tr -s ' ' | "$GETMETAHOME/removespace")
         DESCRIP=$(printf '%s\n' "$DESCRIP" | cut -d '=' -f3- | tr -d '"')
         filename=$(printf '%s\n' "$1" | cut -c "${START}-")
         # Note removal of carriage returns (octal 15) in case we have a
         # DOS/Windows text file:
         printf '%s\n' "${filename}|${DESCRIP}#${KEYS}#" | tr -d '\015' \
            >> "$TMPKEYFILE"
      fi
   fi
}


# Function to process a directory.
# Input parameters:
# 1  the complete pathname of the directory to process
# 2  yes or no, indicating whether or not to try to process subdirectories
# Behaviour:
#    Calls ProcessFile on every *.htm / *.html file (any letter case) in the
#    directory, then, if parameter 2 is "yes", calls itself on every real
#    subdirectory (symbolic links to directories are not followed).  Nothing
#    is processed unless the directory has the execute bit set for "other"
#    users.  The caller's working directory is restored before returning.
# Returns:
#    1 if the directory cannot be entered (nothing is processed in that
#    case), or if the original working directory cannot be restored.
ProcessDir()
{
   local ACCESS
   local OLDDIR
   local CURDIR
   local ENTRY

   OLDDIR=$(pwd)
   # Stop here if the directory cannot be entered; carrying on would index
   # the caller's directory a second time and could recurse without end.
   if ! cd "$1" 2> /dev/null
   then
      echo "getmeta: cannot enter directory '$1'; skipping it" >&2
      return 1
   fi
   CURDIR=$(pwd)

   # Column 10 of the ls mode string is the execute bit for "other" users;
   # it shows as t when the sticky bit is set together with execute.
   ACCESS=$(ls -ld "$CURDIR" | cut -c10)
   case "$ACCESS" in
      x|t)
         # Handle HTML files.  The shell expands * itself, so names with
         # spaces are safe and no temporary listing file is needed.
         for ENTRY in *
         do
            case "$ENTRY" in
               *.[hH][tT][mM]|*.[hH][tT][mM][lL])
                  if [ -f "${CURDIR}/$ENTRY" ]
                  then
                     ProcessFile "${CURDIR}/$ENTRY"
                  fi
                  ;;
            esac
         done

         # Handle subdirectories if called for:
         if [ "$2" = "yes" ]
         then
            for ENTRY in *
            do
               if [ -d "${CURDIR}/$ENTRY" ] && [ ! -L "${CURDIR}/$ENTRY" ]
               then
                  ProcessDir "${CURDIR}/$ENTRY" "yes"
               fi
            done
         fi
         ;;
   esac

   cd "$OLDDIR" || return 1
}


# The script begins execution here:

#******************* Change this section as needed: ***************************************
# number of lines at head of each HTML file to look at in finding META tag data:
# NOTE(review): LINES is also a variable that bash itself may reset to the
# terminal height (see the checkwinsize shell option) -- confirm this cannot
# happen where the script is run.
LINES=30
# target directory (where to begin the indexing of HTML files):
TARGET="/www"
# start recording filenames with column 5, 1 more than TARGET's length:
START=5
# path to the KEYFILE that holds the indexed data:
KEYFILE="/www/cgi-bin/keywordfile"
# This getmeta script and the removespace executable should both be in the location
# specified by the GETMETAHOME variable:
GETMETAHOME="/usr/local/bin"
# file containing list of subdirectories to be indexed:
# file is assumed to be within the directory given by GETMETAHOME
SUBDIRFILE="getmeta.subdirs"
# Location for temp file.  It lives inside a freshly created private directory
# so that its name cannot be predicted (and pre-created) by another user.
TMPWORKDIR=$(mktemp -d "/tmp/getmeta.XXXXXX") || exit 1
TMPKEYFILE="$TMPWORKDIR/keyfile"
#******************* End of section to be changed. ****************************************

# Remove the private work directory however the script ends.
trap 'rm -rf -- "${TMPWORKDIR:?}"' EXIT

ProcessDir "$TARGET" "no"

# Loop through the subdirectories and call ProcessDir on each.
if [ -f "${GETMETAHOME}/$SUBDIRFILE" ]
then
   # read -r keeps backslashes in directory names intact; the extra test
   # picks up a final line that has no trailing newline.
   while read -r subdir || [ -n "$subdir" ]
   do
      # Skip blank lines rather than calling ProcessDir with an empty name.
      if [ -n "$subdir" ]
      then
         ProcessDir "$subdir" "yes"
      fi
   done < "${GETMETAHOME}/$SUBDIRFILE"
else
   echo "getmeta: subdirectory list '${GETMETAHOME}/$SUBDIRFILE' not found" >&2
fi

# Install the new index.  $TMPKEYFILE only exists if at least one file was
# indexed; when it is missing the current $KEYFILE is left as it is.
status=0
if [ -f "$TMPKEYFILE" ]
then
   cp "$TMPKEYFILE" "$KEYFILE" || status=1
   rm -f "$TMPKEYFILE"
else
   echo "getmeta: no META tag data was found; '$KEYFILE' left unchanged" >&2
fi

exit "$status"