#!/usr/bin/perl
#
#NAME
# abcbot - search the Web for ABC tunes
#
#SYNOPSIS
# abcbot  [options] [host...]
#
#NOTE
# Scan for "giveup" for the current default time to abandon a site.
#
#DESCRIPTION
# This program is a web explorer robot that looks for ABC music.
#
# This program works from a "hosts" database which is currently kept in  the
# "hst/" subdirectory, one file per host.  Each file contains a line per URL
# at that host, possibly followed by one or more  lines  giving  information
# that was extracted from that URL.
#
# Each pass of this program gets a list of URLs and/or host names.  URLs are
# read from standard input, and all we do with them is append  them  to  the
# host's   file.    Thus   http://foo.bar.com/abc/   results   in  the  file
# hst/foo.bar.com having a line added giving the URL.
#
# There are two distinct ways of running this program:  with  or  without  a
# host  list  on  the command line.  If there are no hosts listed, we run in
# "initialization" mode. The input stream should contain a list of URLs (and
# possibly other information such as hosts to avoid).  For each URL, we make
# sure that the host's file exists, and we append a  line  for  the  URL  at
# depth 1. The next pass should then find these URLs and scan them for links
# or tunes.
#
# Here's how I do the initialization:
#   abcbot +CURLs >& abcbot.out &
#
# If  called  with  hosts on the command line, we are in "update" mode.  The
# input may contain hosts to avoid, but URLs there will be ignored. Instead,
# we  do  a scan of each of the listed hosts and update its file in the host
# directory.  For each host, we move its file to backup (by appending '-' to
# its  name), and then we read this backup file and write a new file for the
# host.  For each URL in the host's file, we fetch  the  file,  and  extract
# hyperlinks and ABC tunes.
#
# Hyperlinks   are   accumulated,   and  at  the  end  we  will  repeat  the
# "initialization" and append the links to their hosts' files.  For each ABC
# tune  found, we add a line showing the "interesting" information about the
# tune.  What is considered interesting may change from time to time.
#
# Because of the difficulties in preventing  infinite  loops  with  URLs  we
# implement  two  ways  of  limiting  the URLs that may be followed: You can
# restrict the depth of recursion with the -D option, and you  can  restrict
# the hostname(s) with the +H option.
#
#ENVIRONMENT
# We read the following from the environment:
#
# V_abcbot=<l><file>
#   If defined, this defines our "verbose" level and output file.  The level
#   <l>  is  a number (which defaults to 1 or 2, depending on what I want at
#   the moment), the optional <file> (which defaults to STDERR) is where the
#   output is written.  Note that this variable's name consists of 'V_' plus
#   the program's name.  If you call this program by some  other  name,  you
#   should of course use 'V_' plus that name.
#
#INPUT
# We always read from stdin, so if you don't  want  to  provide  any  input,
# you'll  need to redirect our input to /dev/null.  The input is scanned for
# URLs, and they are added to our starting list (at depth 1).
#
# As a special aid in limiting searches, the  input  may  contain  lines  of
# these forms (with or without the colons):
#   done:   http://foo.bar.com/xyz
#   ignore  http://foo.bar.com/xyz
#   avoid:  http://foo.bar.com/xyz
# These are ways of telling abcbot to ignore certain URLs.  The  "done"  and
# "ignore"  commands  give  specific  URLs  that  are to be avoided; this is
# implemented by simply listing them as "already done".   With  the  "avoid"
# command,  we  extract  the  host  name, and URLs for that host will not be
# used.
#
#OUTPUT
#
#OPTIONS
# Options start with '-' or '+' plus a letter,  with  possibly  a  parameter
# (and no embedded spaces).  Some of the options take an initial '+' to mean
# "enable" and '-' to mean "disable".  For others, the '-'  or  '+'  is  not
# relevant.   If  '+'  is  shown in the list below, then it is significant.
# Capitalization of the option letters doesn't matter (but it may matter  in
# an argument string if there is one).
#
# -<n>
#   where <n> is an integer, means a timeout of <n> seconds.  The default is
#   currently:
#
	$ABCtmout = $ENV{'ABCtmout'} || 30;	# Per-URL timeout in seconds (see the -<N> option); was 600
#
# -d<depth>
#   This restricts the depth of directory  searches  to  <depth>.   This  is
#   mostly to avoid infinite loops.  The default is 3.  Experience has shown
#   that each depth level produces at least a factor of 10 increase  in  run
#   time,  so  you  should be careful with this.  It's much faster to have a
#   shallow depth and a long list of starting URLs.  One recommendation: use
#   the  previous  output  as  input,  so  all  the  successes  then will be
#   re-scanned (at depth 2) in the current run.
#
# +h<host>
#   Allow URLs for <host>.  Default: All hosts allowed.  If there is one  or
#   more +h options, then only these hosts are allowed.
#
# -s
# +s<n>
#   Skip over <n> URLs while searching.  This has the effect of not making a
#   lot  of requests in succession to a single server.  It is implemented by
#   moving n-1 URLs to the end of the URL list before each attempt to  fetch
#   a URL.
#
#LOCALHOST
# The following host names are rewritten:
#
	%hostsub = (	# Host-name rewrites applied when picking the next host (all disabled at present)
#		'ecf-guest.mit.edu' => 'localhost',
#		'trillian.mit.edu' => 'localhost',
#		'jc.tzo.net:1742' => 'localhost',
#		'lochaber.tullochgorm.com' => 'localhost',
	);
#
#EXAMPLES
#
#SIGNALS
# There are various ways that this  program  may  get  hung  up  because  of
# misbehavior  (or  behavior that may be valid but I don't understand it) on
# the part of web servers.  You can "kick" this program by sending it  these
# signals:
#
# CONT
#   Abandon the current URL by closing the connection.
# INT or HUP
#   Abandon the search and write the output files.
# QUIT
#   Abandon the current document and host, finish up, and exit.
# USR1
#   Decrease the verbose level ($V) by 1.  Also, we write a dump of the call
#   stack to the verbose log.
# USR2
#   Increase the verbose level ($V) by 1.  Also, we write a dump of the call
#   stack to the verbose log.
#   
#
#MISC
# Lines in a file starting with "%%noindex" tell abcbot to ignore tunes.  If
# a  "%%noindex"  line is inside a tune, only that one tune will be ignored.
# If a "%%noindex" line is found outside a tune, it means to ignore the rest
# of the file. [Added by JC 2007-10-08]
#
#BUGS
# This program is highly experimental, in alpha state, and all that.  Use it
# at your own risk.  (Not much risk, there, actually, but I thought I'd give
# the usual friendly warnings.) Just don't write the output  back  over  the
# input,  and check its output with a browser or two, and there shouldn't be
# many problems.
#
# Of course, there are constant problems with spelling variations. Musicians
# are  atrocious spellers.  This program doesn't even attempt to tackle this
# issue.
#
#AUTHOR:
# John Chambers <jc@trillian.mit.edu> http://trillian.mit.edu/~jc/

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# INITIALIZATION
	# Basic bootstrap: default verbose level, search path, program name, hostname.
	$V = 1;
	$ENV{'PATH'} = ".:sh:$ENV{'PATH'}";		# Look in . and sh/ first
	($P = $0) =~ s'.*/'' unless defined($P);	# This program's name
	($myhost = `hostname`) =~ s/\s+$//;			# Our hostname
	print "$0: P='$P'\n" if $V>0;	# BUG FIX: was $ENV{'P'}, an env var that is normally unset
	print "$0: PATH='" . $ENV{'PATH'} . "'\n" if $V>1;
	push @INC, '.', 'pm', 'sh', split(':',$ENV{'PATH'});	# So require can find our modules
	print "$0: INC='" . join(':',@INC) . "'\n" if $V>1;
#
	require "V.pm";				# Verbose/debug package: provides the V log filehandle and &Vopt
	&Vopt(($V_P = $ENV{"V_$P"}) || '1');		# Verbose level/log file from the V_<progname> env var
	print V "$P: V=$V V_P='$V_P'\n" if $V>0;
	require "DT.pm";			# Date/time routine(s)
	require "namesubs.pm";		# Name-munging routines
#
	use HTML::Entities;			# For handling HTML "entity" encodings
	binmode(STDOUT,':utf8');	# Convert output to UTF-8 encoding
#	use htmlsubs;				# HTML-text conversion (doesn't work on rucker)
	require "htmlsubs.pm";		# HTML-text conversion
	my $hs = new htmlsubs;		# We're Object-Oriented!
#
# We have converted to using curl to fetch files. It knows how to handle the
# https protocol, which a lot of major sites have transitioned to, and it's
# available in all unix/linux libraries now.
#
# We used to use webcat as a subprocess to fetch files from the web. This was
# done  so  that we could properly time out zombie connections to some of the
# broken web sites out there.  It  turns  out  that  you  can  only  abort  a
# connect() with sig('ALRM'), and if you attempt to close the socket after an
# alarm, you may die a horrible death. With that isolated in a subprocess, we
# can continue to run past such disasters and continue with the next URL.
#
# This program no longer uses the LWP::Simple modules.  I've found a  simpler
# approach.   But  you'll have to download these modules, and possibly change
# push to say where you put them:
#
	require "Backup.pm";		# File backup routine.
#	use Backup;		# File backup routine.
	require "abcCode.pm";	# Calculates tune codes.
#	use abcCode;	# Calculates tune codes.
	require "DT.pm";			# Date/Time routine.  (Also required above; require is idempotent, so this is harmless.)
#	use DT;			# Date/Time routine.
##	use HTTPcon;	# Makes HTTP connection to server.
##	use URLdata;	# Opens URL and returns file handle.
	require "URLhref.pm";	# Combines URL + HREF -> new URL.
#	use URLhref;	# Combines URL + HREF -> new URL.
	require "URLtrim.pm";   # Shrinks URLs.
#	use URLtrim;   # Shrinks URLs.
	require "HTMLdir.pm";	# HTML directory listing.
#	use HTMLdir;	# HTML directory listing.
#
# They'll have to be in your @INC path;  by  default  we  add  $HOME/sh  and
# $HOME/pl to @INC, so those are good places to put them.

$| = 1;			# Auto-flush stdout
$, = "\n";		# Output field separator: multi-argument print inserts a newline between items
$" = "\n\t";	# List separator: "@list" interpolation joins elements with newline+tab
$exitstat = 0;	# Anyone can set this to nonzero; used as our exit status at the end

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Some global vars for controlling actions:

$agentid       = "$P/abcbot+curl";	# User-agent string we identify ourselves with
$allowcgi      =    0;	# We usually don't look at cgi URLs
$articles      =  '-';	# Don't include initial articles (see the -art option)
$avoids        =    0;	# Count the avoids that we processed
$cachedir      = 'cache';	# Main name of cache directory
$caching       =    1;	# Are we cacheing files that contain abc?
$cfgfilesread  =    0;	# Number of config files we've read
$cfgfile   = "$P.cfg";	# The name of our global config file
$chkuplinks    =    1;	# Check for "parent/home/back" links
$crawldelay    =    1;	# Delay (sec) between HTTP requests to a host
$currhost      =   '';	# The host we're processing right now
%Disprefix     =   ();	# List of URL initial strings to reject
$hstdelay = 0;	#$spd * 15;	# Seconds to wait before rescanning a host (0 = none; disabled value was 15 days)
$QuotesOK      =    0;	# If true, quotes are accepted in URLs
%RejectPfx     =   ();	# List of URL initial strings for this host to reject
@Rewrite       =   ();	# Rewrite rules for URLs
$followUpLink  =    0;	# Whether to follow links that contain "/../"
$giveuptime    =  120;	# Abandon host if no tune found in this time (900 = 15 min)
$lasttunetime  = time;	# When we last found a tune on current host
#HDRkludge     =    1; 	# Try to ignore HDR files (disabled)
$HTTPcontime   =    0;	# When we last tried an HTTP connect
#listabchosts  =    1;	# Collect list of hosts with abc files (disabled)
$purgebad      =    1;	# Drop URIs that get 404 (Not found); see the +P option
$renameold     =    0;	# Rename (Backup) old files in cache
$saveunmatched =    0;	# If true, unmatched chunks will be preserved
$schedule      =    0;	# If >0, schedule a rerun after this many minutes (-S option)
$SCDkludge     =    0; 	# Try to ignore SCD dance-form titles
$testhost = 'www.thesession.org';	# Special tests for this host
#urlskip       =    1;	# Set > 1 to scatter URLs (disabled; see the +s option)
$Xdefault      =  '1';	# Default index for tunes without an X: header

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Initialize the module to calculate tune encodings:
#
$abcCode = abcCode->new;	# Tune-code calculator (arrow call; was indirect "new abcCode" syntax)

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Counter default values:

$doctunes   = 0;	# Number of X: lines discovered in current doc
$doclinks   = 0;	# Number of hyperlinks discovered in current doc
$doctitls   = 0;	# Number of T: lines discovered in current doc
$filemax    = 0;	# Is this used?  (NOTE(review): not referenced anywhere in this chunk)
$ignoretune = 0;	# If true, ignore the current tune (this comment and $ignorefile's were swapped)
$ignorefile = 0;	# If true, ignore all tunes in this file (see %%noindex in MISC above)
$inHTTPhdrs = 0;	# If true, we expect HTTP headers
$linkcnt    = 0;	# Total links  at this host
$scancnt    = 0;	# Total scans  of this host
$tunecnt    = 0;	# Total tunes  at this host
$titlcnt    = 0;	# Total titles at this host

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# These are passed to $getcmd.  There are a lot of problems with web  servers
# that require a specific HTTP version number.  If we get nothing from a site
# that should have ABC tunes, try setting the HTTPversion to '1.0' or '1.1'.

$HTTPdelay   = &env('HTTPdelay',      1);	# Delay between HTTP requests (sec); was 0
$HTTPtimeout = &env('HTTPtimeout',   30);	# Default HTTP timeout interval (sec)
$HTTPversion = &env('HTTPversion', '1.1') unless defined $HTTPversion;	# HTTP version to claim (see -V)
$TOopen      = $TOread = -1;				# Timeout intervals (-1 = unset)

print V "$P: hstdelay=$hstdelay.\n" if $V>0;
print V "$P: HTTPdelay=$HTTPdelay.\n" if $V>0;
print V "$P: crawldelay=$crawldelay.\n" if $V>0;
print V "$P: HTTPtimeout=$HTTPtimeout.\n" if $V>0;
print V "$P: HTTPversion=$HTTPversion.\n" if $V>0;

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Initialize the tune cache.

$cachetunes  = &env('ABCcache' , 1);	# Writing cache?  (ABCcache env var, default on)
#cachebase   = '.';						# Where to put the cache (disabled)
$cachetmp    = "cache$$.data";			# Per-process cache file name while reading
print V "$P: Caching tunes.\n" if $V>3 && $cachetunes;
$hs->setOption("+cache=$cachetmp",'+U')	# Tell htmlsubs to use our cache file
	if $cachetunes;
$hs->setTestHost($testhost)				# Tell htmlsubs about our test host,
	if $testhost;						# if we have a test host name

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Assorted limits and controls:

$abcdepth = 4;	# Depth limit for *.abc files (one deeper than $maxdepth)
$hstdepth =		# Default depth limit for current host (chained: also 3)
$maxdepth = 3;	# Default depth limit for directories [jc 20061212]; see the -D option
$maxurls  = 0;	# If >0, give up after this many URLs (see the -U option)
$urlcount = 0;	# Number of URLs processed so far
$depth    = 1;	# The current depth in directories

#maxlinedflt =
$maxlines  = 3000;	# Give up if no ABC in this many lines of text
$maxscans  =    1;	# Max number of scans before giving up on host
$showlinks =    0;	# Include links in host files (see the +L option)

$smryfile = "scandata";		# Where to write 1-line summary info

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# commands for fetching web files:

#dfltget = "webcat +H +I";		# Default command to get a file (old)
$dfltget = "curl -s -L -D -";	# Default fetch command: curl, silent, follow redirects, dump headers to stdout
print V "$P: dfltget=\"$dfltget\"\n" if $V>1;

# Before this point should be only simple assignments of initial values.
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Here are our global arrays and tables:

%BadHost  = ();	# Hosts which should not be accessed at all (see &avoid)
%BadPath  = ();	# Paths on this host to avoid
@BadPat   = ();	# Patterns for which URLs to avoid
@Base     = ();	# Base URLs that start acceptable URLs (see the B: argument)
@oldchunk = ();	# lines of old hst/* file entry
@newchunk = ();	# lines of new hst/* file entry
%Depth    = ();	# link depth for a URL
%DepthHost= ();	# link depth limit for a host
%ProtHost = ();	# protocols to use for a host
%Done     = (); # time that URL (full or short) was scanned
%h2d      = ();	# list of depths for the h2u URIs
%h2n      = ();	# number of URLs for each host
%h2u      = ();	# list of URIs for each host
%inithost = ();	# hostnames from the command line
%Opt      = ();	# Runtime options for host
%outlink  = ();	# URLs that have already been seen in this file
@tune     = ();	# lines of current ABC tune
#URLts    = ();	# timestamp when a URL was last read (disabled)

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# We ignore the following file types (if the value here is nonzero):

# MIME types we skip entirely (value nonzero = ignore); these formats are
# not worth scanning for ABC text.
%IgnoreType = map { ($_ => 1) } qw(
	application/pdf
	application/postscript
	application/rtf
	application/x-tar
);

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# We implement a special kludge to handle local URLs:

($thishost = `hostname`) =~ s/\s*$//;	# Our hostname, minus the trailing newline
print V "$P: $thishost is our host name.\n" if $V>2;
%local = (	# Map URL prefixes to local directories so we can read those files directly
	"http://localhost/~jc/"         => ($ENV{HOME} . "/public_html/"),
##	"http://$thishost/~jc/"         => ($ENV{HOME} . "/public_html/"),
#	"http://ecf-guest.mit.edu/~jc/" => ($ENV{HOME} . "/public_html/"),
#	"http://trillian.mit.edu/~jc/"  => ($ENV{HOME} . "/public_html/"),
);
@local = sort(keys(%local));	# The prefixes, sorted for deterministic scanning

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Map each quote character to its ASCII code point (0x22 = '"', 0x27 = "'").
%quotes = map { ($_ => ord $_) } ('"', "'");

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Here are the MIME types that we look for.  Since most file formats have more
# than one MIME type, we map the MIME type to a simple word.

$notearchives = 1;	# Whether to look for (take note of) zip files, etc.

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Stuff dealing with time:

$spm = 60;	# Seconds per minute
$mph = 60;	# Minutes per hour
$hpd = 24;	# Hours per day
$spd = $spm * $mph * $hpd;		# Seconds per day; should be 86400

$mintime = ($V>6) ? 15 :  0;	# Minimum wait time before rereading a URL (see -W)
$maxdays = ($V>6) ?  0 :  0;	# Maximum wait time in days (NOTE(review): both arms are 0, so always 0; see +W)
$maxtime = $maxdays * $spd;		# Maximum wait time before purging entries

$oScanY = $oScanM = 0;	# For remembering year and month of scan

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Here's where we initialize our signal handling.

# Handlers are given as sub names (resolved when the signal arrives); the
# subs themselves are defined later in the file.
$SIG{CONT} = 'sigCONT';	# CONT causes stack dump and abandons current URL
$SIG{HUP}  = 0;			# HUP  was 'sigINT' but is now disabled (NOTE(review): conventional values are 'IGNORE'/'DEFAULT'; 0 may not actually ignore -- confirm)
$SIG{INT}  = 'sigINT';	# INT  causes stack dump and terminate
$SIG{QUIT} = 'sigQUIT';	# QUIT causes stack dump, abandons everything and exits
$SIG{USR1} = 'sigUSR1';	# USR1 causes stack dump and decrements $V
$SIG{USR2} = 'sigUSR2';	# USR2 causes stack dump and increments $V

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Scan the command-line arguments, processing them as we go.  Input files are #
# read  and  used  to  build  tables.  Any URLs discovered are accumulated in #
# @URLs.  Options are processed as read, so they will only affect  things  to #
# their right, except for URLs, which we save for last.                       #
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
for $arg (@ARGV) {
	print V "$P: Arg \"$arg\"\n" if $V>2;
	if (($fl,$opt) = ($arg =~ m'^([-+])(.*)'i)) {	# Option: '-' or '+' plus the rest
		print V "$P: opt=\"$opt\"\n" if $V>2;
# - - - -
# +art  include articles
# -art  exclude articles
#  This option controls whether articles (the, a, an, le, la,  etc.)
#  are to be stripped from the start of titles. The default is -art,
#  which does nothing.
		if ($opt =~ m'^art'i) {
			$articles = $fl;
			print V "$P: " . ($articles eq '-' ? 'Ignoring' : 'Including') . " articles.\n" if $V>0;	# BUG FIX: log line was missing its newline
# - - - -
# -C<file>
#  Read a bot config file.  This is a config file giving starting URLs, plus
#  allowed and disallowed URLs and hostnames.
		} elsif ($opt =~ m'^C(.*)'i) {
			print V "$P: Reading config file '$1'.\n" if $V>0;
			&cfgbot($1);
			++$cfgfilesread;	# Note that we've read a config file
# - - - -
# -D<N>  recursion depth limit.
#  The "depth" of a URL is how many hops it is from our input URL list.  The
#  URLs  in  this  list are at depth D:1; the URLs they link to are at depth
#  D:2, and so on.  This depth is recorded in the hst/* files.  The default
#  is  -D3, which experience shows is a practical limit.  Note:  Files whose
#  names end with ".abc" are allowed to be one level deeper.
		} elsif ($opt =~ m'^D(\d*)$'i) {
			$maxdepth = $1;
			print V "$P: maxdepth='$maxdepth'\n" if $V>1;
			$abcdepth = $maxdepth + 1;	# *.abc files may go one level deeper
# - - - -
# +L  show links from a URL.
#  This is primarily a debugging hook. If enabled, the +L option means that
#  all  the  hyperlinks  will be listed after a URL, with a ">" flag to show
#  that they are hyperlinks.  This increases the size of  the  hst/*  files
#  significantly, so the default is -L.
		} elsif ($opt =~ m'^L$'i) {
			$showlinks  = ($fl eq '-') ? 0 : 1;	# BUG FIX: was "$1", but m'^L$' has no capture group, so $1 held a stale value from an earlier match
			print V "$P: showlinks='$showlinks'\n" if $V>1;
# - - - -
# -<N>
#  This  is  a  timeout  in seconds.  If we can't get a URL in this time, we
#  abandon it and go on to the next.  The default is $ABCtmout (30 seconds,
#  unless the ABCtmout environment variable overrides it).
		} elsif ($opt =~ s'^(\d+)$'') {
			$ABCtmout = $1;
			print V "$P: ABCtmout=$ABCtmout\n" if $V>1;
# - - - -
# +P  purge bad URLs.
		} elsif ($opt =~ m'^P$'i) {
			$purgebad  = ($fl eq '-') ? 0 : 1;
			print V "$P: purgebad='$purgebad'\n" if $V>1;
# - - - -
# -S<N>  schedule next host after <N> minutes.
#  When we finish, we will schedule another instance of this program for the
#  next  host  after  <N>  minutes.   The next host is the one in the hst/*
#  directory that is lexically next; if there is none,  we  pick  the  first
#  host.   The  default  if  <N>  is  missing is to call a routine to try to
#  determine the interval.
		} elsif ($opt =~ m'^S(\d*)$'i) {
			$schedule = $1 || &getschedule();
			print V "$P: schedule='$schedule'\n" if $V>1;
			# BUG FIX: removed "$abcdepth = $schedule + 1;" here.  It was a
			# copy-paste from the -D branch above, and tied the abc depth
			# limit to a time in minutes, which makes no sense.
# - - - -
# -T
# +T
# -T<N>
#  This sets the timeout for opening URLs to <N> seconds.  The default is 30
#  seconds.  Special cases: a bare -T means -T10 and a bare +T means +T60.
#  If <N> is given you may use '-' or '+' interchangeably.
		} elsif ($opt =~ m'^T(\d*)$'i) {
			$HTTPtimeout = $1 || (($fl eq '-') ? 10 : 60);
			print V "$P: HTTPtimeout='$HTTPtimeout'\n" if $V>1;
# - - - -
# -U<N>
#  This sets the max number of URLs that we attempt to process.  This is a
#  debug hook only.  We abandon our task, clean up, and exit when $urlcount
#  passes this number.  If $maxurls is zero, it means no limit.
		} elsif ($opt =~ m'^U(\d*)$'i) {
			$maxurls = (length($1)>0) ? int($1) : 10;
			print V "$P: maxurls=$maxurls.\n" if $V>0;
# - - - -
# -V<version>
#  This is the HTTP version number. The default is 1.1, but some web servers
#  are picky about this and require 1.1.  We try to discover this by looking
#  at the version returned in HTTP messages, but  it's  faster  if  you  can
#  specify  it on the command line.  We can also get this from the cfg/$host
#  file.
		} elsif ($opt =~ m'^V([0-9.]*)$'i) {
			print V "$P: HTTPversion='$HTTPversion' to be set ...\n" if $V>1;
			$HTTPversion = $1 ? $1 : '1.1';
			print V "$P: HTTPversion='$HTTPversion' from cmdline option.\n" if $V>1;
# - - - -
# -W<N>  min wait: wait at least <N> seconds before rereading a URL.
# +W<N>  max wait: reread a URL after <N> seconds.
#  These options control when we are allowed to reread a  URL.   -W  is  the
#  minimum  time;  i.e.,  we shouldn't reread a URL until this much time has
#  passed since we last read it.  +W gives the time after  which  a  URL  is
#  considered  obsolete and should be read again.  If <N> is null, we reread
#  everything.
		} elsif ($opt =~ m'^W(\d*)$'i) {
			if ($1 eq '') {
				$mintime =  $maxtime = $maxdays = 0;
				print V "$P: Rereading everything.\n" if $V>1;
			} elsif ($fl eq '-') {
				$mintime = $1;
				print V "$P: Min timeout is $mintime sec.\n" if $V>1;
			} else {
				$maxtime = ($maxdays = $1) * $spd;
				print V "$P: Max timeout is $maxtime sec.\n" if $V>1;
			}
			print V "$P: mintime=$mintime maxtime=$maxtime (maxdays=$maxdays)\n" if $V>1;
# - - - -
# The notation '<file' is treated as input redirection.  Doing  this  ourself
# can be useful at times.
#  NOTE(review): this branch tests $arg, but we only get here when $arg
#  starts with '-' or '+', so it can never match; it probably should test
#  $opt instead.  Left as-is pending confirmation.
		} elsif ($arg =~ m'^<(.*)') {
			unless (open(STDIN,$1)) {
				print V "$P: Can't read '$1' ($!)\n" if $V>0;
			}
# - - - -
# None of these patterns matched the option string.
		} else {
			print V "$P: Option \"$fl$opt\" not understood.\n" if $V>1;
		}
# - - - -
# B:<URL>
#  Base URL.  Accepted URLs must start with one of these, or will be ignored.
#  You can use more than one, and a URL will be accepted if it starts with
#  any of them.
	} elsif ($arg =~ /^B:*\s*(.*)/) {
		print V "$P: Base URL '$arg'\n" if $V>0;
		push @Base, $1;			# BUG FIX: was "push @Base, $arg", which kept the "B:" prefix, so no URL could ever match a stored base
# - - - -
# A full http(s) URL: remember its host and queue the URL at depth 1.
	} elsif ($arg =~ m"^(https*)://([^/]+)") {
		print V "$P: arg=\"$arg\" $1 hst\"$2\"\n" if $V>0;
		$inithost{lc($2)} ++;	# Remember lower-case host name
		print V "$P: URL=\"$arg\" hst\"$2\"\n" if $V>0;
		&URL($arg,1); 			# Add it as a level-1 URL to be examined
# - - - -
# Args that don't start with '-' or '+' are treated as host names.  Here,  we
# just accumulate them for later processing.
	} else {
		$arg =~ s"^.*/+""i;		# Strip off any leading directories
		$arg =~ s"-+$"";		# Trim "backup" host names (trailing '-')
		$inithost{lc($arg)} ++;	# Remember lower-case host name
	}
}
#$maxurls = (length($1)>0) ? int($1) : 10;	# (leftover debug line, kept disabled)
print V "$P: maxurls=$maxurls.\n" if $V>0;

unless ($cfgfilesread > 0) {	# No -C option was seen; fall back to the default config
	print V "$P: Reading our default config file '$cfgfile'.\n" if $V>1;
	&cfgbot($cfgfile);
	++$cfgfilesread;	# Note that we've read a config file
}

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
my %URLcode = (		# Characters that must be %XX-escaped when building URLs
	"\t" => '%09',	# HT Horizontal Tab
	"\n" => '%0A',	# LF Line Feed, newline
	"\r" => '%0D',	# CR Carriage Return
	" "  => '%20',	# SPACE
	'"'  => '%22',	# DOUBLE QUOTE
	"'"  => '%27',	# APOSTROPHE
	"%"  => '%25',	# PERCENT
	"&"  => '%26',	# AMPERSAND
	"+"  => '%2B',	# PLUS SIGN
	"<"  => '%3C',	# LESS THAN
	"="  => '%3D',	# EQUAL SIGN
	">"  => '%3E',	# GREATER THAN
#	"?"  => '%3F',	# QUESTION MARK (deliberately left unescaped)
);

if ($V>1) {
	# Dump the effective limit settings to the verbose log, one per line,
	# with the names right-aligned so the values line up.  (printf output
	# is not affected by the global $, separator set earlier.)
	my @settings = (
		['abcdepth', \$abcdepth],
		['articles', \$articles],
		['hstdepth', \$hstdepth],
		['maxdays',  \$maxdays],
		['maxdepth', \$maxdepth],
		['maxtime',  \$maxtime],
		['maxurls',  \$maxurls],
		['mintime',  \$mintime],
		['purgebad', \$purgebad],
	);
	for my $s (@settings) {
		printf V "%s:%11s=%s.\n", $P, $s->[0], ${$s->[1]};
	}
}

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Now all the hosts in our list should have their own files in hst/$host  and #
# these files should be filled with the URLs for the host.  Next, we expect a #
# list of host names on the command line.  We run through these hosts and run #
# thru each one's hst/$host file, and process each URL we find there.         #
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #

if (%inithost) {
	# "Update" mode: host names were given, so scan each listed host in turn.
	print V "$P: HOST processing ...\n" if $V>3;
	%h2d = %h2n = %h2u = ();	# Forget about input list of URLs
hostfile:
	for $h (sort keys %inithost) {
		print V "$P: HOST \"$h\" ...\n" if $V>3;
		if ($OLDopen) {close OLD; $OLDopen = 0}	# Close any leftover backup-file handle
		if ($HSTopen) {&CloseHST($HostT0{$h},time)}	# Close any leftover host-file handle
		last if ($endDoc || $finishup);	# A signal asked us to wrap up
		if ($BadHost{$h}) {
			print V "$P: Host \"$h\" is in BadHost list.\n" if $V>3;
			next hostfile;
		}
		$hstdepth = $maxdepth;		# Default depth limit
		$abcdepth = $hstdepth + 1;	# ABC files may be one level deeper
		if (-f ($hstfile = "hst/$hstname")) {	# FIXME(review): $hstname is never assigned in this file; this probably should be $h -- confirm
			local(undef,undef,undef,undef,undef,undef,undef,undef,undef,$mtime) = stat($hstfile);
			if (($t = time - $mtime) < $hstdelay) {	# Skip hosts rescanned too recently
				print V "$F: Host $hstname ignored, hst file is only $t sec old.\n" if $V>0;	# NOTE(review): $F is not set at this point
				next hostfile;
			}
		}
		&host($h);
		print V "Host \"$h\" done. " . `date -u` if $V>0;
	}
	print V "All hosts done.\n" . `date -u` if $V>0;
} elsif (%h2n) {
	# "Initialization" mode: no hosts listed, just file the input URLs away.
	print V "$P: There are no hosts to process, but we have a URL list.\n" if $V>3;
	&saveURLs;					# Save the info from the initial URLs
	%h2d = %h2n = %h2u = ();	# Forget about this set of URLs
} else {
	print V "$P: There are no hosts to process and no URL list.\n" if $V>3;
}

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #

if (%h2n) {
	print V "$P: There are more hosts and URLs to remember.\n" if $V>3;
	&saveURLs;			# Save the info from the initial URLs
}

# Should we schedule a new run of this robot (the -S option)?

if ($schedule > 0) {
	print V "$P: $esep\n" if $V>3;	# NOTE(review): $esep is never assigned in this file -- confirm
	print V "$P: Schedule another run after $schedule minutes ...\n" if $V>3;
	@hosts = grep(!/(-|\.LCK)\s*$/,glob("hst/*"));	# All host files, minus backups and locks
	$hosts = int(@hosts); print V "$P: We have $hosts hosts.\n"; # if $V>3;
	$nexthost = '';
host:
	foreach $host (@hosts) {
		print V "$P: host file \"$host\"\n" if $V>6;
		$host =~ s"^hst/([-.\w]+)\s*$"$1";	# Reduce the file path to a bare host name (aliases back into @hosts)
		if ($x = $hostsub{$host}) {$host = $x}	# Apply any LOCALHOST rewrite
		print V "$P: host \"$host\"\n" if $V>3;
		if ($host gt $currhost) {	# The first host lexically after the current one
			print V "$P: host \"$host\" > \"$currhost\"\n" if $V>5;
			$nexthost = $host;
			last host;
		}
	}
	$nexthost = $hosts[0] if !$nexthost;	# Wrap to the first host.  BUG FIX: was $host[0], indexing the never-populated @host array
	if ($nexthost) {
		# BUG FIX: the lines below used $host, but foreach restores $host to
		# its pre-loop value on exit, so the log path and at-command had no
		# host name.  Use $nexthost, which is what we actually computed.
		print V "$P: Next host is \"$nexthost\".\n" if $V>3;
		$log = "log/$nexthost";
		$ENV{"V_abcbot"} = "$V$log";
		# FIXME(review): this 'at' invocation looks malformed; at(1) takes a
		# time spec like "now + N minutes" and reads the command from stdin.
		$atcmd = "at now + $schedule $P +S $nexthost '<BadURLs'";
		print V "$P: atcmd=\"$atcmd\"\n" if $V>5;
		if (system $atcmd) {
			$exitstat = $!;	# NOTE(review): $? (child status) may be more apt than $! here
			print V "$P: FAILED \"$atcmd\" ($!)\n" if $V>1;
			print V "$P: Exit status was $?.\n" if $V>1;
		}
	}
}

if (-f $cachetmp) {unlink $cachetmp}	# Remove our per-process cache scratch file
print V "\n" if $V>1;
print V "$P: Exit with status $exitstat.\n" . `date -u` if $V>0;
exit $exitstat;
# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = #
# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = #

##	sub abc2html {
##	# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
##	# Convert the abc escape sequences to HTML.
##	# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
##		local($s) = @_;
##		$s =~ s#\\(o)#\&${1}slash;#ig;
##		$s =~ s#\\a(a)#\&${1}ring;#ig;
##		$s =~ s#\\"(\w)#\&${1}uml;#ig;
##		$s =~ s#\\'(\w)#\&${1}acute;#ig;
##		$s =~ s#\\`(\w)#\&${1}grave;#ig;
##		$s =~ s#\\,(\w)#\&${1}cedille;#ig;
##		$s =~ s#\\~(\w)#\&${1}tilde;#ig;
##		$s;
##	}

sub avoid {my $F='avoid';
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Record a config-file request to avoid a URL, host, or pattern.  The parsed #
# pieces are stashed into the global %BadHost, %BadPath and @BadPat  tables, #
# which are consulted later when deciding whether to follow a URL.   Called  #
# from the cfgbot() routine below, and from cfghost() in cfghost.pm.         #
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
	my ($spec) = @_;
	print V "$F: Avoid {$spec}\n" if $V>2;
	my ($proto, $h, $path) = ($spec =~ m"(https*|ftp)://([-_:.\w]+)(.*)"i);
	if (defined $h) {				# It parsed as a full URL
		print V "$F: Avoid prot='$proto'='$h' rest='$path'\n" if $V>2;
		if ($path eq '') {			# $proto://host -- a bare host
			print V "$F: Avoid host: '$h' rest=''\n" if $V>1;
			++$BadHost{$h};			# It's just a host "to be avoided"
			++$avoids;
		} elsif ($path eq '/') {	# $proto://host/ -- still just a host
			print V "$F: Avoid host/ '$h'\n" if $V>1;
			++$BadHost{$h};
			++$avoids;
		} elsif ($h eq $myhost) {	# A path on the host we're running on
			print V "$F: AVOID PATH: myhost=$myhost rest='$path'\n" if $V>1;
			++$BadPath{$path};
			++$avoids;
		} elsif ($h eq $currhost) {	# A path on the host being scanned
			print V "$F: AVOID PATH: currhost=$currhost rest='$path'\n" if $V>1;
			++$BadPath{$path};
			++$avoids;
		} else {					# A path on some other host: not ours to track
			print V "$F: Avoid '$spec' IGNORED (not on $h).\n" if $V>1;
		}
	} elsif (my ($pat) = ($spec =~ m"^(/[^/]+/)\s*$")) {	# A bare /pattern/
		push @BadPat, $pat;
		++$avoids;
		print V "$F: AVOID PAT: \"$pat\"\n" if $V>0;
	} else {
		print V "$F: Avoid \"$spec\" IGNORED (can't parse).\n" if $V>0;
	}
}

sub BASE {
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Save the contents of a <BASE ...> tag.  We may need this when we try to use #
# a relative URL later on in the file.  We save it in the global $base var    #
# and return it; undef is returned when no HREF attribute can be found  (or  #
# when attribute parsing loops too long on unparseable junk).                 #
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
	local($atts) = @_;
	local($att,$val,$lup);
	$atts .= ' ';		# Guarantee a terminator after the last value
	$lup = 0;			# Loop guard against junk that never matches HREF
	print V "BASE: atts=\"$atts\"\n" if $V>7;
	# Peel one attr="value" (or attr=value) pair off the front each time.
	# BUG FIX: the anchor was written "\s*^", which can never match when the
	# attribute string has leading whitespace (the usual case for the text
	# after "<BASE"); it must be "^\s*" so leading blanks are skipped.
	while ($atts =~ s/^\s*(\w+)="*([^"\s]+)["\s]+//) {
		$att = uc($1);
		$val = $2;
		print V "BASE: att=\"$att\" val=\"$val\"\n" if $V>6;
		print V "BASE: atts=\"$atts\"\n" if $V>7;
		if ($att eq 'HREF') {	# $att is already upper-cased above
			$base = $val;
			print V "BASE: base=\"$base\"\n" if $V>3;
			return $base;	# This is all we want
		}
		if (++$lup > 5) {
			print V "BASE: Looped $lup times; giving up.\n" if $V>3;
			return undef;
		}
	}
	print V "BASE: Quit with atts=\"$atts\"\n" if $atts && $V>3;
	return undef;
}

sub CheckEnd {
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Wrap up a robots.txt check: optionally log the message $msg (only when the #
# verbosity level exceeds $lvl), kill the child fetch process whose  pid  is #
# in the global $chkpid, close the pipe from its stdout, and hand back  $rc, #
# the return code the caller intends to propagate.                            #
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
	my($rc,$lvl,$msg) = @_;
	print V "$msg\n" if $V>$lvl;
	kill 9, $chkpid;	# Make sure the fetch subprocess is gone
	close CHK;			# Shut down the pipe from its stdout
	return $rc;
}

sub CheckHost {my $F='CheckHost'; local($h) = @_;
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Do miscellaneous validations on a host.  The return value is the number  of #
# problems  found.   0  means  there are no objections to the host; a nonzero #
# return means there is some problem and we should skip this host.            #
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Side effects: fetches and parses the host's robots.txt, filling @robotstxt, #
# %Disprefix (Disallow prefixes) and %Server (the Server: header), and leaves #
# a human-readable status in $hstmsg.  Return codes produced below:  0 = OK   #
# (including "no robots.txt"); 1 = blacklisted host; 3/4/6 = connect/timeout  #
# failures, reported via CheckEnd() which also kills the fetch subprocess.    #
	local($chk,$chktim,$n,$p,$t);
	local($httplvl,$errcode,$errmsg,$path);
	@robotstxt = ();	# Contents of robots.txt file, if any
	$hstmsg = '';
	# NB: "=" (not "==") is intentional: grab the blacklist entry while testing it.
	if ($n = $BadHost{$h}) {
		$hstmsg = "Host \"$h\" in blacklist";
		print V "$F: $hstmsg.\n" if $V>0;
		return 1;
	}
	$p = $ProtHost{$h} || 'http';	# Protocol for this host
	print V "$F: HTTPversion='$HTTPversion'\n" if $V>1;
	$getcmd = $Getcmd{"C:$h"} || "$dfltget -V$HTTPversion";
	print V "$F: getcmd=\"$getcmd\"\n" if $V>1;
#	$getcmd = $dfltget;		# This failed for docgrooms.com; I've reverted it to get the C: command from the cfg file.
	$chk = "$getcmd -T$ABCtmout '$p://$h/robots.txt'";	# Shell command that fetches robots.txt
	print V "$F: \"$chk\"\n" if $V>1;
	$w3timedout = 0;			# Global set true to flag timeout
	$chktim = time;				# When we started this check
	if ($HTTPtimeout > 0) {		# Timeout in effect?
	#	$savsig = $SIG{ALRM};	# Save old alarm routine
		$SIG{ALRM} = 'W3tmout';	# Establish alarm routine
		alarm $HTTPtimeout;		# Set alarm
		&DT();					# was dt()
		print V "$F: Set alarm after $HTTPtimeout sec at $now.\n" if $V>5;
		$CHKopen = 1;			# Triggers close on timeout
	}
	$HTTPcontime = time;		# Note time we last tried to connect
	unless ($chkpid = open(CHK,"$chk |")) {
		$hstmsg = "\"$chk\" failed ($!)";
		print V "$F: $hstmsg\n" if $V>0;
		$CHKopen = 0;
#		return 2;
	}
	# NOTE(review): when the open above fails we only log it and fall through
	# (the "return 2" is commented out); the read loop below then sees no
	# input and the host is reported OK.  Confirm that is intended.
	print V "$F: Process $chkpid \"$chk\" running.\n" if $V>6;
	print V "$F: Read '$p://$h/robots.txt'\n" if $V>2;
#	$inHTTPhdrs = 1;			# The +H option produces headers
	# NOTE(review): $inHTTPhdrs is no longer set here (assignment commented
	# out), so the header-parsing branch below runs only if a caller left
	# the global set -- confirm before relying on header handling.
	$CHKopen = 1;				# Triggers close on timeout
line:
	while ($line = <CHK>) {
		if ($w3timedout) {	# Alarm handler fired while we were reading
			&DT();				# was dt()
			$hstmsg = "#### TIMEOUT in for loop at $now ####";
			print V "$F: $hstmsg\n" if $V>0;
			$t = ($now = time) - $chktim;
			print V "$F: chktim=$chktim now=$now t=$t\n" if $V>3;
			return &CheckEnd(3,1,"$F: Can't connect to \"$h\" in $t sec. (return 3)");
		}
		$line =~ s/[\r\s]+$//;	# Strip CR and trailing whitespace
		print V "$F: +++ \"$line\"\n" if $V>7;
		if ($inHTTPhdrs) {
			print V "$F: HDR \"$line\"\n" if $V>3;
			$t = ($now = time) - $chktim;
			if (!$line) {	# A blank line ends the HTTP header section
				print V "$F: End of headers.\n" if $V>3;
				$inHTTPhdrs = 0;
			}
			# The fetch command reports failures as HTML comments; map these
			# to "can't connect" returns.
			if ($line =~ m"^<!--.*Can't connect to .*-->") {
				print V "$F: chktim=$chktim now=$now t=$t\n" if $V>3;
				return &CheckEnd(4,1,"$F: Can't connect to \"$h\" in $t sec. (return 4)");
			}
			if ($line =~ m"^<!--.*No route to .*--> ") {
				return &CheckEnd(4,1,"$F: Can't connect to \"$h\" in $t sec. (return 4)");
			}
			if ($line =~ m"^<!--.*ALARM after (\d+) sec .*-->") {
				return &CheckEnd(4,1,"$F: Can't connect to \"$h\" in $1 sec. (return 4)");
			}
			# Match the HTTP status line in both the nonstandard
			# "HTTP/x.y ERR nnn msg" form and the standard "HTTP/x.y nnn msg"
			# form; a 4xx/5xx code means there is no readable robots.txt.
			if ((($httplvl,$errcode,$errmsg) = ($line =~ m"^HTTP/([0-9.]+)\s+ERR\s+(\d+)\s+(.*)$"i))
			||	(($httplvl,$errcode,$errmsg) = ($line =~ m"^HTTP/([0-9.]+)\s+(\d+)\s+(.*)$"i))
			) {
				print V "$F: httplvl=$httplvl errcode=$errcode errmsg=\"$errmsg\"\n" if $V>3;
				if ($errcode >= 400) {
					$hstmsg = "No robots.txt file found";
					print V "$F: $hstmsg\n" if $V>3;
					return &CheckEnd(0,1,"$F: HTTP/$1 ERR $2 ($3)");
				}
				print V "$F: robots.txt file found.\n" if $V>0;
				print V "$F: HTTP/$1 ERR $2 ($3) -- accepted.\n" if $V>3;
				next line;
			}
			print V "$F: Not an ERR line.\n" if $V>7;
			if ($line =~ m"^HTTP/([0-9.]+)\s") {
				$getcmd = $Getcmd{"C:$h"} || $dfltget;
			#	$getcmd = $dfltget;
				print V "$F: getcmd='$getcmd'\n" if $V>1;
			} elsif ($line =~ m"^Server:\s*(.*)") {
				$Server{$h} = $1;	# Remember the server software for this host
				print V "$F: Server for \"$h\" is \"$Server{$h}\"\n" if $V>6;
			}
		} else {
			# Past the headers: this is robots.txt content proper.
			print V "$F: TXT \"$line\"\n" if $V>1;
			push @robotstxt, $line;
			if ($line =~ /^User-agent:\s*(.*)$/i) {
				$agentpat = $1;
				$agentmatch = 0;
				if (($agentpat eq '*') || ($agentpat =~ /abcbot/i)) {
					print V "$F: User-agent \"$agentpat\" matches us.\n" if $V>0;
					$agentmatch = 1;
				}
			} elsif ($line =~ /^Disallow:*\s*(.*)$/) {
				# NOTE(review): Disallow lines are recorded regardless of
				# $agentmatch, i.e. even when they belong to another robot's
				# User-agent section -- confirm that is intended.
				($path = $1) =~ s/[\r\s\n\t]+$//;
				if ($path) {
					$Disprefix{$path}++;
				#	$hstmsg = "Disallow \"$path\"";
					print V "$F: DISALLOW \"$path\"\n" if $V>0;
				}
			}
		}
	}
	print V "$F: Done with \"$p://$h/robots.txt\"\n" if $V>3;
	if ($w3timedout) {	# Timeout flagged after the read loop ended
		$t = time - $chktim;
		$hstmsg = "#### TIMEOUT (at for loop after $t sec) ####";
		print V "$F: $hstmsg\n" if $V>0;
		return &CheckEnd(6,1,"$F: Can't connect to \"$h\" in $t sec. (return 6)");
	}
	return &CheckEnd(0,0,"$F: $h OK");
}

sub cfgbot {my $F='cfgbot'; local($cfgfilnam) = @_;
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Read in a config file for this program.  It should contain a list  of  URLs #
# that need special treatment.                                                #
# Directives recognized: bare URLs and "scan:/search:" (queued at depth  1),  #
# "dead|done|gone|ignore:" (mark as already done), "delay:"  (request  spac-  #
# ing), "avoid:" (host, URL path, or /pattern/), "Disallow:",  "SCDkludge",   #
# and "V:" (per-site verbosity).  Returns 1 after reading the file, or 0  if  #
# it can't be opened.                                                         #
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
	local($arg,$cmd,$line,$path,$prot,$host,$rest,$why);
	print V "$F: Read config file '$cfgfilnam' ...\n" if $V>1;
	unless (open(CFG,$cfgfilnam)) {
		print V "$F: Can't read config file '$cfgfilnam' [$!]\n" if $V>0;
		return 0;
	}
	print V "$F: myhost \"$myhost\"\n" if $V>1;
	print V "$F: currhost \"$currhost\"\n" if $V>1;
	$prot = 'http';
	while ($line = <CFG>) {					# Input contains config directives
		$line =~ s/[\r\n\s]*$/ /;			# We want exactly one space at end of line
		print V "$F: line \"$line\"\n" if $V>2;
		if ($line =~ /^\s*#/) {				# Ignore comments
			print V "$F: Ignore: \"$line\"\n" if $V>2;
		} elsif ($line =~ /^\s*$/) {		# Ignore blank lines
		} elsif ($line =~ s"^\s*((https*|ftp)://\S+)\s"$1"i) {	# Bare URL means a link to scan
			$prot = $2;						# Protocol from this URL
			$rest = '';						# BUG FIX: was "$rest = $3" -- this regex has only two groups, so $3 was a stale capture
			print V "$F: URL: $1\n" if $V>3;
			print V "$F: prot='$prot' rest=\"$rest\"\n" if $V>3;
			&URL($1,1); # unless %inithost;	# Add it as a level-1 URL to be examined
		} elsif ($line =~ s"^(scan|search):*\s*(\S+)\s"$2"i) {
			print V "$F: SCAN $2\n" if $V>3;
			&URL($2,1); # unless %inithost;	# Add it as a level-1 URL to be examined
		} elsif ($line =~ s"^(dead|done|gone|ignore):*\s*(\S+)/*\s"$2"i) {
			$why = $1;						# Reason for failure
			$Depth{$2} = 1;					# Mark this one as "already done"
			$Done{$2}  = $now;				# Use current time as when we did it
			print V "$F: Ignore \"$2\" [$why] at time $now.\n" if $V>0;
		} elsif ($line =~ s"^(delay):*\s*(\S+)\s"$2"i) {
			print V "$F: crawldelay = HTTPdelay = $2 sec. [were '$crawldelay' '$HTTPdelay']\n" if $V>3;
			# NOTE(review): the message mentions $crawldelay but the code sets
			# $scandelay; confirm which global the scanner actually reads.
			$scandelay = $HTTPdelay = int($2);	# Min time between requests
		} elsif ($line =~ m"^avoid:\s*(/.+/)\s*$"i) {	# avoid: /pattern/
			# BUG FIX: this test must precede the generic avoid test below,
			# which also matches pattern lines and used to shadow this branch
			# (patterns were reported as "can't parse" and dropped).
			push @BadPat, $1;				# Mark this as a pattern for URLs to avoid
			++$avoids;
			print V "$F: AVOID PAT: \"$1\"\n" if $V>0;
		} elsif ($line =~ s"^avoid:*\s*(\S+)\s.*"$1"i) {	# Hosts or URLs to avoid
			print V "$F: Avoid \"$line\"\n" if $V>5;
			if (($prot,$host,$rest) = ($line =~ m"(https*|ftp)://([-_:.\w]+)(.*)"i)) {
				print V "$F: Avoid prot='$prot' host='$host' rest=\"$rest\"\n" if $V>1;	# BUG FIX: was "print" without the V handle
				if ($rest eq '') {				# http://host
					# BUG FIX: key by bare hostname; URL(), CheckHost() and
					# avoid() all look up $BadHost{$host}, so the old
					# "$prot://$host/" key never matched anything.
					$BadHost{$host}++;			# It's just a host "to be avoided"
					print V "$F: Avoid host: \"$host\"\n" if $V>1;
				} elsif ($rest eq '/') {		# http://host/
					$BadHost{$host}++;			# It's just a host "to be avoided"
					print V "$F: Avoid host/ \"$host\"\n" if $V>1;
				} elsif ($host eq $myhost) {	# http://host/path
					$BadPath{"$rest"}++;	# Host + path "to be avoided"
					print V "$F: AVOID PATH: $host \"$rest\"\n" if $V>1;
				} else {
					$BadPath{"$host$rest"}++;	# Host + path "to be avoided"
					print V "$F: Avoid path: \"$host$rest\"\n" if $V>1;
				}
			} elsif ($line =~ /^([-_:.\w]+)\s*$/) {
				$BadHost{$1} ++;			# Mark this host as "to be avoided"
				print V "$F: AVOID HOST: \"$1\"\n" if $V>1;
			} else {
				print V "$F: Avoid \"$line\" IGNORED (can't parse).\n" if $V>0;
			}
		} elsif ($line =~ /^Disallow:*\s*(.*)\s*$/i) {
			($path = $1) =~ s/[\r\s\n\t]+$//;	# Trim trailing whitespace from the prefix
			if ($path) {
				print V "$F: Disallow \"$path\"\n" if $V>1;
				$Disprefix{$path}++;
			}
		} elsif ($line =~ /^SCDkludge/) {	# Ignore T: lines that look like dance code
			$SCDkludge = 1;
			print V "$F: SCDkludge: '$SCDkludge'\n" if $V>1;
		} elsif ($line =~ /^(V):*\s*(\S+)[\r\s]*/) {	# BUG FIX: char class was [r\s], a typo for [\r\s]
			print V "$F: Verbosity $1: \"$2\"\n" if $V>1;
			$V = $2;			# Verbosity for this site
		} else {							# Otherwise it's a comment
			print V "$F: \"$line\" IGNORED (can't parse).\n" if $V>0;
		}
	}
	close CFG;
	return 1;
}

sub CloseHST {
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Finish writing all files related to the current host and close  the  files. #
# The params are the start and finish timestamps for this host.               #
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Appends trailer and summary lines to the host file (HST), adds a  one-line  #
# record to $smryfile, clears $HSTopen, and -- for hosts that never  yielded  #
# any ABC after $maxscans passes -- moves the host file into the nul/ dir.    #
	local($t0,$t1) = @_;
	local($hr,$mn,$sc,$tm,$ss,$mm,$hh,$DD,$MM,$CY);
	local($gcmd,$msg,$pfile,$srvr);
	++$scancnt;		# Count the current scan
	print HST "\n$now T X:$tunecnt T:$titlcnt F:$filecnt H:$currhost\n";
	print V   "\n$now T X:$tunecnt T:$titlcnt F:$filecnt H:$currhost\n" if $V>0;
	# NOTE(review): "unless (A>0 && B>0)" fires when EITHER max is zero, so a
	# host with tunes but no titles (or vice versa) is treated as ABC-free
	# here -- confirm that is intended.
	unless ($tunemax>0 && $titlmax>0) {
		print V "$P: No ABC ever found at host $currhost in pass $scancnt.\n" if $V>0;
		print HST "\n$now # No ABC found at $currhost in pass $scancnt\n";
		if ($scancnt > $maxscans) {
			# Give up on this host: archive its file under nul/.
			$pfile = "nul/$currhost";
			print V "$P: Move \"$hfile\" to \"$pfile\"\n" if $V>3;
			&Backup($pfile) if -e $pfile;
			unless (rename($hfile,$pfile)) {
				print V "$P: Can't rename  \"$hfile\" to \"$pfile\" ($!)\n" if $V>0;
			}
		}
	}
	# Elapsed scan time, broken down for an h:mm:ss display.
	$tm = $t1 - $t0;
	$sc = $tm % 60;
	$mn = int($tm/60) % 60;
	$hr = int($tm/3600);
	$msg = sprintf "$now Scanned $currhost at $cymd $hms in $tm sec (%d:%02d:%02d)",$hr,$mn,$sc;
	if ($scancnt < 3) {
		$msg .= " NEW SITE $filecnt files $tunecnt tunes $titlcnt titles.";
	} else {
		$msg .= "  $filecnt files $tunecnt tunes $titlcnt titles.";
	}
	print HST "\n$msg\n";
	print V     "$msg\n" if $V>0;
	close HST;
#
	if (open(LOG,">>$smryfile")) {
		local($ss,$mm,$hh,$DD,$MM,$CY) = gmtime($now = time); ++$MM; $CY += 1900;
		$summary = sprintf("%04d%02d%02d %02d:%02d %8d sec (%2d:%02d:%02d) %6d files %6d tunes %6d titles at $currhost\n"
			,$CY,$MM,$DD,$hh,$mm
			,$tm,$hr,$mn,$sc
			,$filecnt,$tunecnt,$titlcnt);
		# NOTE(review): printf treats $summary as the FORMAT string; a literal
		# '%' in $currhost would be misinterpreted.  "print LOG $summary"
		# would be safer.
		printf LOG $summary;
		close LOG;
	}
	$HSTopen = 0;
	$gcmd = $dfltget;	# For most hosts, the default GET command works.
	$srvr = $Server{$currhost};
	print V "$P: Command  for \"$currhost\" is \"$gcmd\"\n" if $V>3;
	print V "$P: Server   for \"$currhost\" is \"$srvr\"\n" if $V>3;
	print V "$P: maxdepth for \"$currhost\" is \"$hstdepth\"\n" if $hstdepth != $maxdepth && $V>3;
}

sub LoadLinks {my $F='LoadLinks';
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Read one or more saved link files and feed every parseable URL line  into  NewU()  #
# at depth 1.  Lines look like "TIMESTAMP FLAG D:DEPTH PATH" or the older            #
# "TIMESTAMP=TIME FLAG D:DEPTH PATH" form; comments and blank lines are skipped.     #
	my $linkfile;
	my($rec,$stamp,$flag,$lvl,$path);
	for $linkfile (@_) {
		print V "$F: Load \"$linkfile\"\n" if $V>1;
		unless (open(LINKFILE,$linkfile)) {
			print V "$F: Can't read \"$linkfile\" ($!)\n" if $V>0;
			next;
		}
		while ($rec = <LINKFILE>) {
			$rec =~ s/[\r\s]*$/ /;	# Normalize trailing whitespace to one blank
			next if $rec eq ' ';	# Skip blank lines
			print V "$F: LINKFILE line: $rec" if $V>1;
			if ($rec =~ /^\s*#/) {	# Comment lines get dropped
				print V "$F: Drop \"$rec" if $V>3;
			} elsif ((($stamp,$flag,$lvl,$path) = ($rec =~ m'^(\d+)\s+([-#\w]) D:(\d+) *(.*) $'))
			||   (($stamp,$dt,$flag,$lvl,$path) = ($rec =~ m'^(\d+)=(\d+) ([-#\w]) D:(\d+) *(.*) $'))) {
				print V "$F: fl='$flag' D:'$lvl' \"$path\"\n" if $V>1;
				print V "$F: upath=\"$path\"\n" if $V>2;
				&NewU($path,1,$stamp);	# Every host restarts at depth 1 (was the file's depth)
			} else {
				print V "$F: BAD \"$rec\"" if $V>3;
			}
		}
		close LINKFILE;
		print V "$F: Done \"$linkfile\"\n" if $V>2;
	}
}

sub Max {
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Return the largest numeric value in the argument list.  We only  use  this #
# with non-negative integers (times), so the minimum result is 0, and  undef #
# entries are ignored.                                                        #
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# BUG FIX: the old loop did "for (@_) { ... shift; }", which removed items
# from @_ while iterating over it and so silently skipped elements (for
# example Max(1,2,9) returned 2).  Iterate without mutating the list.
	my $n = shift || 0;
	for my $v (@_) {
		$n = $v if defined $v && $v > $n;
	}
	return $n;
}

sub TIMEOUT {
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Alarm handler used while reading from DOC.  On a timeout we raise the two  #
# global flags that make the reader close the DOC file and abandon the       #
# current document.                                                           #
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
	$w3timedout = 1;	# Tell the read loop the fetch timed out
	$endDoc = 1;		# ...and that this document is finished
	&main::V("TIMEOUT called with V=$V.\n") if $V>1;
}

sub NewU {my $F='NewU'; my($uri,$lvl,$stamp) = @_;
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Register a URI on the list of added/not-yet-processed URIs.  We track the  #
# minimum depth seen for each URI, and anything left on @Left at the end     #
# will be scanned.  The timestamp argument is accepted but currently unused  #
# (new URIs are forced to scan with a faked zero timestamp).                  #
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
	my $known;
	print V "$F: D:$lvl U:$uri\n" if $V>2;
	if ($Done{$uri}) {
		print V "$F: #### \"$uri\" already done at $Done{$uri} depth $Depth{$uri}.\n" if $V>2;
		return;
	}
	unless (defined($known = $Depth{$uri})) {
		# First sighting: record its depth and queue it for processing.
		$Depth{$uri} = $lvl;
		print V "newU: Depth{$uri}=$lvl\n" if $V>3;
		push @Left, $uri;
		print V "$F: New Depth{$uri}='$Depth{$uri}'\n" if $V>5;
		return;
	}
	# Duplicate: keep the shallowest depth we have seen for this URI.
	print V "$F: Dup Depth{$uri}='$Depth{$uri}' is '$known'\n" if $V>7;
	if ($Depth{$uri} > $lvl) {
		$Depth{$uri} = $lvl;
		print V "$F: Depth{$uri}=$lvl\n" if $V>5;
	}
}

sub cfgRewrite {my $F='cfgRewrite'; local($arg) = @_;
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
#   Replace /pat/rpl/
#   Rewrite /pat/rpl/
# This processes a config-file command to rewrite URLs.  The first character #
# of the arg is the delimiter; we extract the pattern and (possibly  empty)  #
# replacement and push a ready-to-eval "s<d>pat<d>rpl<d>" rule onto the      #
# global @Rewrite list, which doRewrite() later applies to URLs.             #
# Note that this is called from the cfgbot() routine below, and from the     #
# cfghost() routine in the cfghost.pm module.                                #
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
	local($d,$pat,$rpl,$rule);
	print V "$F: Rewrite \"$arg\".\n" if $V>1;
	unless ($d = substr($arg,0,1)) {	# Pick off initial delimiter ('0' would also be rejected here, but nobody uses a digit delimiter)
		print V "$F: arg \"$arg\" is null!\n" if $V>1;
		return;
	}
	print V "$F: Delimiter is '$d'\n" if $V>2;
	# BUG FIX: $d used to be interpolated into these regexes unescaped, so a
	# metacharacter delimiter such as '.' or '|' broke the parse; \Q...\E
	# quotes it (also inside the character classes).
	if ($arg =~ s/^\Q$d\E([^\Q$d\E]+)\Q$d\E/$d/) {		# Look for pattern
		$pat = $1;
		print V "$F: pat=$d$pat$d\n" if $V>2;
	} else {
		print V "$F: No second delimiter '$d' in '$arg'\n" if $V>1;
		return;
	}
	$rpl = '';		# Default replacement
	if ($arg =~ s/^\Q$d\E([^\Q$d\E]*)\Q$d\E$//) {	# Look for replacement, null is ok
		$rpl = $1;
		print V "$F: rpl=$d$rpl$d\n" if $V>1;
	} elsif ($arg) {
		print V "$F: Replacement '$arg' not understood.\n" if $V>1;
		return;
	}
	if ($arg) {		# We should have used up the arg at this point
		print V "$F: Junk '$arg' left after parse.\n" if $V>2;
		return;
	}
	$rule = "s$d$pat$d$rpl$d";
	print V "$F: Rewrite rule: $rule\n" if $V>2;
	push @Rewrite, $rule;
}

sub showRewrite {my $F='showRewrite';
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Dump the current list of URL-rewrite rules (@Rewrite) to the log, one per  #
# line, when the verbosity level is above 1.                                  #
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
	print V "$F: Rewrite Rules:\n" if $V>1;
	for my $rule (@Rewrite) {
		print V "$F: \t$rule\n" if $V>1;
	}
}

sub doRewrite {my $F='doRewrite';
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Apply every rule on the global @Rewrite list to a URL, in order, and hand  #
# back the (possibly unchanged) result.  Each rule is a complete s///        #
# expression that gets eval'ed against the URL.                               #
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
	my($url) = @_;
	my $orig = $url;	# Remember the input for the change report below
	print V "$F: Rewrite '$url' ...\n" if $V>2;
	for my $rw (@Rewrite) {
		unless (eval("\$url =~ $rw")) {
			print V "$F: ==== '$url'\n" if $V>3;
			next;
		}
		print V "$F: ===> '$url'\n" if $V>3;
		print V "$F: Rule: $rw\n" if $V>2;
	}
	if ($orig ne $url) {
		print V "$F: Rewrote '$orig'\n" if $V>1;
		print V "$F: ======> '$url'\n" if $V>1;
	}
	return $url;
}

sub URL {my $F=':URL'; local($uri,$depth) = @_;
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Register a new URL for later scanning.  We can do some weeding out here  if #
# we  so desire.  We return 0 if we reject the URL; 1 if we accept it, though #
# callers don't yet use this info. We implement a special ABC kludge here: If #
# the URL ends with .abc, we accept it even if it's beyond the maximum depth. #
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Accepted local URLs (host == $currhost) are queued via NewU(); non-local    #
# ones are accumulated in the %h2u/%h2d/%h2n per-host tables for later        #
# passes.  Rejection is driven by %BadHost/%BadPath/@BadPat, %Disprefix,      #
# the depth limits, and the series of pattern heuristics below.               #
	local($done,$h,$hh,$hst,$lh,$lu,$lx,$n,$p,$pfx,$len,$u,$ufull,$upath,$x);
	# NOTE(review): the checks below use $ufull and $lurl BEFORE either is
	# assigned ($ufull is first set at the Rewrite step further down, and
	# $lurl is never assigned in this routine), so they test undef values
	# and look like dead or misplaced code -- confirm before relying on them.
	print V "$F: ufull={" . &esc($ufull) . "}\n" if $V>2;
	print V "$F: \"$ufull\" d=$depth.\n" if $V>2;
	if ((@BadPat || @Base) && chkURL($lurl)) {
		print V "$F: \"$lurl\" REJECTED by chkURL()\n" if $V>0 && @BadPat;
		return 0;
	} else {
		print V "$F: \"$lurl\" accepted by chkURL()\n" if $V>2;
	}
	if ($Done{$ufull}) {
		print V "$F: #### \"$ufull\" already done at $Done{$ufull} depth $Depth{$ufull}.\n" if $V>2;
		return 0;
	}
	if (!$followUpLink && ($ufull =~ /\/\.\.\//)) {
		print V "$F: \"$ufull\" ignored (/../)\n" if $V>1;
		return 0;
	}
	$ufull = @Rewrite ? &doRewrite($uri) : $uri;	# Are there rewrite rules?
	# Reject any URL that starts with a robots.txt Disallow prefix, testing
	# both the original and (when different) the rewritten form.
	for $pfx (keys %Disprefix) {
		$len = length $pfx;
		print V "$F: Disprefix len=$len '$pfx'\n" if $V>3;
		if ($uri ne $ufull) {
			if (substr($uri,0,$len) eq $pfx) {	# Check the original URI
				# NOTE(review): "pfx" in the next message is missing its '$'
				# sigil, so the prefix itself is never interpolated.
				print V "$F: Disallow \"$uri\" [Disprefix \"pfx\"]\n" if $V>0;
				return 0;
			}
		}
		if (substr($ufull,0,$len) eq $pfx) {	# Check the rewritten URL
			print V "$F: Disallow \"$ufull\"\n" if $V>0;
			return 0;
		}
	}
	# Depth limits: .abc files may go beyond $hstdepth up to $abcdepth;
	# everything else past the host limit is dropped.
	if ($depth > $hstdepth) {
		if ($ufull =~ /\.abc$/i) {		# ABC files are special
			if ($depth > $abcdepth) {	# They may be one level deeper
				print V "$F: \"$ufull\" ignored (depth $depth > $abcdepth && .abc file)\n" if $V>1;
				return 0;
			}
			print V "$F: \"$ufull\" accepted (depth $depth && .abc file)\n" if $V>2;
		} elsif ($depth >= $hstdepth) {	# Reject non-ABC files at depth limit
			# NOTE(review): this condition is always true inside the outer
			# "$depth > $hstdepth" test, so every non-.abc URL past the
			# limit lands here.
			print V "$F: \"$ufull\" ignored (depth $depth >= $hstdepth)\n" if $V>1;
			return 0;
		}
	}
	print V ">->-> \"$ufull\" [$depth]\n" if ($depth>$hstdepth && $V>2);
	if ($QuotesOK) {
		$ufull =~ s/'/%27/g;			# Rewrite both kinds of quotes as HTTP hex
		$ufull =~ s/"/%22/g;
		print V "$F: \"$ufull\" quotes rewritten.\n" if $V>1;
	} else {
		if ($ufull =~ /['"]/) {		# Reject URLs that contain quotes
			print V "$F: \"$ufull\" ignored (contains a quote)\n" if $V>1;
			return 0;
		}
		print V "$F: \"$ufull\" accepted [no quotes]\n" if $V>2;
	}
	$ufull = &URLtrim($ufull);				# Shorten the URL if possible
	print V "----> $depth '$ufull'\n" if $V>2;
	if ($ufull !~ m'^(https*|file):'i) {	# Accept only these protocols
		print V "$F: \"$ufull\" ignored (https*|file rule)\n" if $V>1;
		return undef;
	}
	if ($ufull =~ m'\.(exe)\b/'i) {		# Don't try to fetch MS executables
		# NOTE(review): the trailing '/' in this pattern means it only matches
		# ".exe" followed by a slash (a directory component), not a URL that
		# ends in ".exe" -- confirm which was intended.
		print V "$F: \"$ufull\" ignored (.exe rule)\n" if $V>1;
		return undef;
	}
	if ($ufull =~ m'\b(bin|tmp)/'i) {		# Don't try to fetch from bin or tmp directories
		print V "$F: \"$ufull\" ignored (bin|tmp rule)\n" if $V>1;
		return undef;
	}
	print V "$F: allowcgi=$allowcgi.\n" if $V>1;
	unless ($allowcgi) {	# Check for banned CGI calls
		if (!$allowcgi && ($ufull =~ m'\bcgi\b'i)) {		# Ignore cgi scripts
			print V "$F: \"$ufull\" ignored (cgi rule)\n" if $V>1;
			return undef;
		}
		if ($ufull =~ m/<\w+.*>/) {		# Ignore URLs that look like HTML tags
			print V "$F: \"$ufull\" ignored (HTML rule)\n" if $V>1;
			return undef;
		}
		if (!$allowcgi && ($ufull =~ m/[\?;#"]/)) {		# Ignore URLs that look like CGI calls
			print V "$F: \"$ufull\" ignored (HTML/CGI rule)\n" if $V>1;
			return undef;
		}
	}
	if ($ufull =~ m'/\?\w=\w$') {			# Ignore apache listing URLs
		print V "$F: \"$ufull\" ignored (/?X=Y\$ rule)\n" if $V>1;
		return undef;
	}
	if ($ufull =~ m'jc/.*\.hdr$'i) {		# Ignore jc's HDR files
		print V "$F: \"$ufull\" ignored (hdr rule)\n" if $V>1;
		return undef;
	}
	print V "$F: ufull=\"$ufull\"\n" if $V>2;
	if (($p,$h,$upath) = ($ufull =~ m'^(https*|ftp)://([^/]+)(.*)$'i)) {	# FTP doesn't work yet
		print V "$F: upath=\"$upath\"\n" if $V>1;
		unless ($h =~ /^[-_:.\w]*$/) {
			print V "$0: Bogus host \"$h\" ignored.\n" if $V>2;
			return undef;
		}
		$hst = lc($h);
		if ($depth <= 1) {	# Explicitly-requested URLs are always allowed
			$hh = "http://$hst$upath";
			$AllowURL{$hh}++;	# Note that this one is explicitly allowed
			print V "$F: Allow \"$hh\"\n" if $V>2;
		}
		if (%BadHost && $BadHost{$h}) {
			print V "$F: \"$ufull\" ignored (bad host \"$h\")\n" if $V>1;
			return undef;
		}
		# NOTE(review): this block repeats the %BadHost test just above and
		# can never trigger; it is redundant.
		if (%BadHost) {
			if ($BadHost{$h}) {
				print V "$F: \"$ufull\" ignored (bad host \"$h\")\n" if $V>1;
				return undef;
			}
		}
		$lh = length($h);		# Length of hostname in URL
		$lu = length($upath);	# Length of pathname in URL
		if (%BadPath) {
			print V "$F: bad path check ...\n" if $V>2;
			for $x (keys %BadPath) {	# Run thru forbidden paths
				$lx = length($x);		# Length of this path
				print V "$F: lx=$lx lu=$lu '$x'\n" if $V>3;
				# NOTE(review): "$lu <= $lx" combined with the eq test means a
				# path is rejected only when it is no longer than (and equal
				# to) the forbidden path; a prefix test would need $lu >= $lx
				# -- confirm the intent.
				if (($lu <= $lx) && (substr($upath,0,$lx) eq $x)) {
					print V "$F: \"$ufull\" ignored (bad path \"$x\")\n" if $V>1;
					return undef;		# URI starts with forbidden path
				} else {
					print V "$F: '$x' !~ '$upath'\n" if $V>3;
				}
			}
		}
		if ((@BadPat || @Base) && chkURL($ufull)) {		# Do we have patterns to reject
			print V "$F: \"$ufull\" rejected by chkURL.\n" if $V>0;
			return undef;		# URI starts with forbidden path
		}
		print V "$F: hst='$hst' currhost='$currhost'\n" if $V>3;
		if ($hst eq $currhost) {
			# Local URL: queue its path for scanning in this pass.
			print V "$F: URI upath=\"$upath\" at depth $depth is local.\n" if $V>2;
			print V "$F: URI ufull=\"$ufull\" at depth $depth is local.\n" if $V>2;
			&NewU($upath,$depth,($now = time));
			return 1;
		} else {
			print V "$F: URI upath=\"$upath\" at depth $depth is non-local.\n" if $V>2;
			print V "$F: URI ufull=\"$ufull\" at depth $depth is non-local.\n" if $V>2;
			print V "$F: hst=\"$hst\" currhost=\"$currhost\"" if $V>2;
		}
		if ($h =~ m'(__|\.\.)') {
			print V "$F: \"$ufull\" ignored (host contains \"$1\")\n" if $V>1;
			return undef;
		}
		# Non-local URL: remember its path and depth under its host for the
		# later "initialization" pass.
		$n = $h2n{$hst} || 0;
		$h2u{$hst}->[$n] = $upath;	# Note URI, not the full URL
		$h2d{$hst}->[$n] = $depth;	# Note each URL's depth
		$h2n{$hst} ++;				# Count the URLs for each host
		print V "$F: host $hst URL $h2n{$hst} is \"$ufull\" [$doclinks links]\n" if $V>1;
		if ($showlinks) {
			unless ($outlink{"$currURL:$ufull"}) {
				&DT();				# was dt()
				push @newchunk, ("$now > D:$depth " . (($hst eq $currhost) ? $upath : $ufull));
#				if ($hst eq $currhost) {
#					push @newchunk, "$now > D:$depth $upath";
#				} else {
#					push @newchunk, "$now > D:$depth $ufull";
#				}
			}
			++ $outlink{"$currURL:$ufull"};	# Number of times we've encountered this URL
			print V "----> $depth '$ufull'\n" if $V>1;
		}
	}
	return 1;
}

sub URLenc {my $F='URLenc';
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# HTTP-encode the arguments: each URL special character is replaced  by  its #
# %XX form (looked up in the global %URLcode table) and the pieces are  then #
# concatenated into one result string.                                        #
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
	my($piece,$result);
	for my $s (@_) {
		$piece = $s;	# Work on a copy; @_ aliases the caller's data
		print V "$F: +++ \"$piece\"<br>\n" if $V>6;
		$piece =~ s/([\t\n\r "'%&+<=>])/$URLcode{$1}/eg;
		$result .= $piece;
	}
	return $result;
}

sub W3tmout {my $F='W3tmout';
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Alarm handler for reads from DOC.  If enough time has passed since the     #
# last successful read, flag a timeout so the current document is abandoned; #
# a recent read means the fetch is still making progress, so the alarm  is   #
# ignored.                                                                    #
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
	&DT();				# Refresh $now (was dt())
	$TOopen = $now - $TMopen;	# Seconds since the current file was opened
	$TOread = $now - $TMread;	# Seconds since we last got data from it
	print V "$F: Called with TOopen=$TOopen TOread=$TOread sec [HTTPtimeout=$HTTPtimeout]\n" if $V>2;
	if ($TOread >= $HTTPtimeout) {
		print V "$F: TIMEOUT after $TOopen/$TOread sec [HTTPtimeout=$HTTPtimeout]\n" if $V>0;
		$endDoc = 1;
		$w3timedout = 1;
	} else {
		print V "$F: Timeout ignored: only $TOread sec since last read [HTTPtimeout=$HTTPtimeout]\n" if $V>0;
	}
}

sub done {my $F='done';
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Record that a URI (path form) and its full URL were processed at a  given  #
# depth and time.  A missing/zero depth falls back to the global $depth and  #
# a missing/zero time to the current clock (which also refreshes $now).  If  #
# either form was already marked done more than a second before  the  given  #
# time, the earlier record is left alone.                                     #
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
	my($lvl,$when,$upath,$ufull) = @_;
	my $prior;
	$lvl  = $depth        unless defined($lvl)  && $lvl  > 0;
	$when = ($now = time) unless defined($when) && $when > 0;
	if (defined($prior = $Done{$upath}) && ($prior < $when-1)) {
		print V "$F: \"$upath\" already marked done at time $Done{$upath} depth $Depth{$upath}.\n"
			if $V>0;
		return;
	}
	if (defined($prior = $Done{$ufull}) && ($prior < $when-1)) {
		print V "$F: \"$ufull\" already marked done at time $Done{$ufull} depth $Depth{$ufull}.\n"
			if $V>0;
		return;
	}
	# Stamp both the full-URL form and the path form.
	$Depth{$ufull} = $lvl;
	$Done{$ufull}  = $when;
	print V "$F: URLfull \"$ufull\" marked done at time $when depth $lvl.\n" if $V>3;
	$Depth{$upath} = $lvl;
	$Done{$upath}  = $when;
	print V "$F: URLpath \"$upath\" marked done at time $when depth $lvl.\n" if $V>3;
}

sub chunk {my $F="chunk";
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# A "chunk" starts with a line giving a timestamp and URL, plus  some  little
# fields  giving  the  URL's  depth  and what we last did with it, optionally
# followed by data about that URL. The 1-char flags that follow the time are:
#  U new URL, not processed yet.
#  > link to another URL.
#  - URL not read for some reason.
#  # Error trying to read URL.
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
	local($depth,$newuline,$olduline,$links,$rest,$titls,$tunes,$upath,$ufull);
	local($d,$dt,$fl,$l,$oB,$oL,$oX,$oT,$ts,$x,$y);
	local($sec,$min,$hour,$mday,$mon,$year); # gmtime() return list
	print V "CHUNK:\n\t@oldchunk\n" if $V>7;
	$olduline = shift @oldchunk;
	print V "$F: \"$olduline\"\n" if $V>6;
	$dt = $oB = $oL = $oX = $oT = '';
	$docbytes =
	$matched  = 0;
	# We now attempt to match several different "U" lines that we have used in
	# various versions of this bot.
	if ($olduline =~ m'^(\d+) ([-#\w]) D:(\d+)( B:\d*)( L:\d*)( X:\w*)( T:\d*) (.*)$') {
		$ts    = $1;	# Time of last update
		$fl    = $2;	# Line-type flag, 'U' for URLs
		$depth = $3;	# Hyperlink depth
		$oB    = $4;	# Old byte count
		$oL    = $5;	# Old tune count
		$oX    = $6;	# Old tune count
		$oT    = $7;	# Old title count
		$upath   = $8;	# URL minus protocol and host
		$matched = 1;
	} elsif ($olduline =~ m'^(\d+) ([-#\w]) D:(\d+)( B:\d*)( X:\w*)( T:\d*) (.*)$') {
		$ts    = $1;	# Time of last update
		$fl    = $2;	# Line-type flag, 'U' for URLs
		$depth = $3;	# Hyperlink depth
		$oB    = $4;	# Old byte count
		$oX    = $5;	# Old tune count
		$oT    = $6;	# Old title count
		$upath   = $7;	# URL minus protocol and host
		$matched = 1;
	} elsif ($olduline =~ m'^(\d+) ([-#\w]) D:(\d+)( B:\d+) (.*)$') {
		$ts    = $1;
		$fl    = $2;
		$depth = $3;
		$oB    = $4;	# Old byte count
		$upath   = $5;
		$matched = 1;
	} elsif ($olduline =~ m'^(\d+) ([-#\w]) D:(\d+) (.*)$') {
		$ts    = $1;
		$fl    = $2;
		$depth = $3;
		$upath = $4;
		$matched = 1;
	} elsif ($olduline =~ m'^(\d+)=(\d+) ([-#\w]) D:(\d+) (.*)$') {
		$ts    = $1;
		$dt    = $2;
		$fl    = $3;
		$depth = $4;
		$upath   = $5;
		$matched = 1;
	} elsif ($olduline =~ m'^(\d+) T X:(\d+) T:(\d+) F:(\d+) H:(.*)$') {	# Scan results line
		$ts    = $1;	# Timestamp
		$oX    = $2;	# Old tune count
		$oT    = $3;	# Old title count
		$oF    = $4;	# Old file count
		$oH    = $5;	# Old host name
		++$scancnt;		# Count the scans
		print V "$F: Scan $scancnt matched.\n" if $V>1;
	} else {
		print V "$F: Line not matched.\n" if $V>6;
	}
	if ($matched) {
		$upath =~ s/[\r\s.]+$//;			# Trim trailing junk
		print V "$F: dt $fl D=$depth oB=$oB $oL=$oL oX=$oX oT=$oT upath=\"$upath\"\n" if $V>6;
		if ($upath =~ /\.(bak|jpe*g|ps|pdf|tif*|zip|midi*|cfm|dmg|old|flv|fmt|gif|png|ppt|sit|swf|g*z|au|mp\d*|wav|wmv|tar)$/i) {
			print V "----> '$upath' ignored ($1 suffix).\n" if $V>2;
			return;
		}
		$upath =~ s":80/"/";				# Drop default port
		$ufull = "http://$currhost$upath";	# Construct full URL
		print V "$F: upath=\"$upath\"\n" if $V>3;
		print V "$F: ufull=\"$ufull\"\n" if $V>3;
		if ($Done{$ufull}) {
			print V "$F: \"$ufull\" already done at $Done{$ufull} depth $Depth{$ufull}.\n" if $V>2;
			print V "$F: DROP \"$ufull\"\n" if $V>3;
			return;
		}
		if (defined($d = $Depth{$upath})) {
			print V "$F: \"$upath\" is at depth $d.\n" if $V>5;
			if ($d < $depth) {			# Adjust URI's depth
				print V "$F: \"$upath\" changed from depth $depth to $d.\n" if $V>2;
				$depth = $d;			# URI's Use min depth
			}
		}
		if ($x = defined($y = $Done{$ufull}) && $y && defined($x)) {	# Have we seen this URL already?
			print V "$F: #### \"$ufull\" already done at $Done{$ufull} depth $Depth{$ufull}.\n" if $V>2;
			@oldchunk = ();				# Suppress the chunk entirely
			return;
		} elsif ($fl eq '-') {			# Old comment lines
			$newuline = $olduline;
			@newchunk = @oldchunk;
			print V "$F: Mark \"$ufull\" done now (-).\n" if $V>5;
			&done($d,$ts,$upath,$ufull);	# Mark this URI/URL done
			print V "$F: $ts URL \"$ufull\" marked done now (-).\n" if $V>5;
		} elsif (@oldchunk && (($x = &DT() - $ts) < $mintime)) {				# was dt()
			print V "$F: $ts only $x < $mintime sec.\n" if $V>5;
			$newuline = $olduline;
			@newchunk = grep(!/- (too soon|rescan|obsolete) /,@oldchunk);
			unshift @newchunk, "$now - too soon ($x < $mintime)" if $V>3;
			print V "$F: Mark \"$ufull\" done now ($x < mintime=$mintime).\n" if $V>5;
			&done($d,$ts,$upath,$ufull);	# Mark this URI/URL done
			print V "$F: $ts URL \"$ufull\" marked done now ($x < mintime=$mintime)\n" if $V>5;
			for $l (@newchunk) {		# Look for previous link and tune counts
				if (($links,$tunes)  = ($l =~ /(\d+) links, (\d+) ABC tune/)) {
					$linkcnt += $links; $linkmax = $linkcnt if $linkmax < $linkcnt;
					$tunecnt += $tunes; $tunemax = $tunecnt if $tunemax < $tunecnt;
					$filecnt ++ if $tunecnt>0;
				} elsif (($links,$tunes,$titls) = ($l =~ /(\d+) links, (\d+) tunes, (\d+) titles/)) {
					$linkcnt += $links; $linkmax = $linkcnt if $linkmax < $linkcnt;
					$tunecnt += $tunes; $tunemax = $tunecnt if $tunemax < $tunecnt;
					$titlcnt += $titls; $titlmax = $titlcnt if $titlmax < $titlcnt;
					$filecnt ++ if $tunecnt>0 || $titlcnt>0;
				}
			}
		} elsif ($x > $maxtime) {
			print V "$F: $ts $x > maxtime=$maxtime.\n" if $V>5;
			&DT();				# was dt()
			$newuline = "$now U D:$depth$oB$oL$oX$oT $upath";
			push @newchunk, "$now - rescan ($x > $maxtime)" if $V>7;
			print V "$F: Scan \"$ufull\" now ($x > maxtime=$maxtime).\n" if $V>5;
			&scanURL($ufull,$upath);			# Rescan it
			print V "$F: Mark \"$ufull\" done now ($x > maxtime=$maxtime).\n" if $V>0;
			&done($depth,$now,$upath,$ufull);
			print V "$now URL \"$ufull\" marked done now ($x > maxtime=$maxtime)\n" if $V>5;
		} else {					# It's a URL that we should examine
			print V "$F: SCAN \"$ufull\"\n" if $V>6;
			&DT();				# was dt();
			$newuline = "$now U D:$depth$oB$oL$oX$oT $upath";
			print V "$F: Scan \"$ufull\" now (else).\n" if $V>5;
			&scanURL($ufull,$upath);			# Scan it
			print V "$F: Mark \"$ufull\" done now (after scanURL).\n" if $V>5;
			&done($depth,$now,$upath,$ufull);
		}
		if ($maxurls > 0 && $urlcount > $maxurls) {	# Debug hook: exit after $maxurls URLs
			print V "$F: $ts Abort after $urlcount URLs. (maxurls=$maxurls)\n" if $V>0;
			return;
		}
	} elsif ($olduline =~ / T L:(\d+) X:(\w+) T:(\d+) F:(\d+) H:(.*)/) {
		$linkmax = $1 if $linkmax < $1;
		$tunemax = $2 if $tunemax < $2;
		$titlmax = $3 if $titlmax < $3;
		++$scancnt;				# Count the number of times we've scanned this host
		$newuline = $olduline;
		@newchunk = @oldchunk;
	} elsif (($ts,$fl,$rest) = ($olduline =~ m'^(\d+) ([-+T]) (.*)$'i)) {
		print V "$F: ts=$ts $fl $rest\n" if $V>6;
		($sec,$min,$hour,$mday,$mon,$year) = gmtime($1);
		if ($oScanY == $year && $oScanM == $mon) {	# [jc] 20030331
			# We've seen a timestamp for this month; make trivial change to indicate it:
			$olduline = "";	# "$ts t $rest";	# Note 't' rather than 'T'
		}
		$oScanY = $year;	# Note year and month of scan
		$oScanM = $mon;
		$newuline = $olduline;
		@newchunk = @oldchunk;
	} elsif (($ts,$dt,$fl,$rest) = ($olduline =~ m'^(\d+)=(\d+) ([-+T]) (.*)$')) {
		print V "$F: dt $fl $rest\n" if $V>6;
		$newuline = $olduline;
		@newchunk = @oldchunk;
	} elsif (($ts,$dt) = ($olduline =~ m'^(\d+)=(\d+)$')) {
		print V "$F: dt timestamp.\n" if $V>6;
		$newuline = $olduline;
		@newchunk = @oldchunk;
	} else {
		print V "$F: Unmatched:", $olduline, @oldchunk, "\n" if $V>3;
		if ($saveunmatched) {
			$newuline =  $olduline;
			@newchunk =  @oldchunk;
			unshift @newchunk, "$now - Unmatched";
		} else {
			$newuline = '';
			@newchunk = ();
		}
	}
	if ($newuline || @newchunk) {
		print HST "\n";
		print HST "$newuline\n" if $newuline;
		print HST  @newchunk    if @newchunk;
		print HST "\n"          if @newchunk;
	}
	@oldchunk = ();
	@newchunk = ();
	print V "$F: done.\n" if $V>7;
}

sub env {
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Look up an environment variable.  If it is unset, install the supplied      #
# default into %ENV first, so later lookups (and any child processes) see a   #
# consistent value.  Either way, return the resulting value.  It's best if    #
# the value is a string.                                                      #
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
	local($name,$dflt) = @_;
	$ENV{$name} = $dflt unless defined $ENV{$name};
	return $ENV{$name};
}

sub errchunk {
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Append an error/status line to the chunk being built for the current URL.   #
# Refreshes the time globals via &DT(), records the line in @newchunk, and    #
# carries the old "U" line forward unchanged into $newuline so the chunk is   #
# still written out with its original URL line.                               #
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
	local($errline) = @_;
	&DT();						# Refresh $now and friends (was dt())
	push(@newchunk, $errline);	# Keep the error with this URL's chunk
	print V "$P: erruline=\"$errline\"\n" if $V>5;
	$newuline = $olduline;		# Preserve the existing URL line
	print V "$P: newuline=\"$newuline\"\n" if $V>5;
}

sub getschedule {
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Here we try to determine the schedule interval for repeated  runs  of  this #
# robot.  We first try to extract the interval from a file. If that fails, we #
# return a constant; this should only happen during debugging.  Note that  if #
# the  value  returned  is zero, no rescheduling is done.  This is useful for #
# stopping runaways. Note that the time interval is in minutes.               #
#                                                                             #
# Returns: the first integer found at the start of a line in $schedfile       #
# (default "abcbot.sch"), or 1 if the file can't be read or holds no number.  #
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
	local($l,$n);
	$schedfile = 'abcbot.sch' unless $schedfile;
	if (open(SCHED,$schedfile)) {
		# Take the first line that starts with digits as the interval.
		while ($l = <SCHED>) {if ($l =~ /^(\d+)/) {$n = $1; last}}
		close SCHED;		# Close even when no numeric line was found
		unless (defined $n) {	# File had no interval line; fall back
			print V "getschedule: No interval in \"$schedfile\"; using 1 min.\n" if $V>0;
			$n = 1;
		}
	} else {
		print V "getschedule: Can't read \"$schedfile\" ($!); using 1 min.\n" if $V>0;
		$n = 1;
	}
	print V "getschedule: Return $n min.\n" if $V>3;
	return $n;			# BUG FIX: $n was computed but never returned
}

sub host {my $F='host';
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Main routine to process one host name.  We lock the host, load its per-host
# config and disallow/allow lists, rotate hst/<h> aside to old/<h>, then
# re-read the old data chunk by chunk (via &chunk) while also scanning any
# newly discovered local URIs queued on @Left.  Finishes by closing the new
# hst/<h> file, backing up the add/new queue files, and releasing the lock.
# Uses/updates many globals: %Done, %Depth, @Left, the HST/OLD/LCK handles,
# and the per-host file names $cfile/$hfile/$afile/$nfile/$ofile/$lfile.
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
	local($h,$cmd) = @_;
	local($base,$flg,$i,$init,$suff,$t);
	$currhost = $h;
	$HostT0{$h} = $HostT0{$currhost} = time;	# Remember when we started this host
	print V "\n" if $V>2;
	print V "$hsep\n" if $V>2;
	print V "$P/$F: HOST \"$h\" P='$P' V=$V.\n" if $V>0;
	&hostunlock() if $lfile;	# Unlock previous host
#	%Disprefix = ();			# Forget any disallows for previous host
	%RejectPfx = ();			# Rejected initial strings for this host
	# NOTE(review): "for $init (%Disprefix)" iterates the hash in list context,
	# so it visits keys AND values; presumably "keys %Disprefix" was intended
	# -- confirm before changing.
	for $init (%Disprefix) {	# Find the disallowed initial strings for this host
		if ($init =~ /\b$currhost\b/) {
			$RejectPfx{$init} = 1;
			print V "$F: RejectPfx{$init}\n" if $V>1;
		}
	}
	%AllowURI = ();			# Find the allows for this host
#	print V "$F: #### AllowURL is empty!\n" unless %AllowURL;
	print V "$F: AllowURL contains URLs\n" if %AllowURL && $V>3;
	# Keep only the %AllowURL entries whose host part matches this host;
	# their URI parts become the %AllowURI whitelist used during scanning.
	for $allow (keys %AllowURL) {
		print V "$F: Allow \"$allow\"\n" if $V>6;
		if (($pp,$hh,$uu) = ($allow =~ m"^(\w+)://([-_:.\w]+)(/.*)")) {
			print V "$F: Allow h=\"$h\" vs hh=\"$hh\" (pp=\"$pp\"\n" if $V>6;
			if ($h eq lc($hh)) {
				$AllowURI{$uu}++;	# Allow this URI for this host
				print V "$F: Allow host \"$h\" uri \"$uu\"\n" if $V>5;
			} else {
				print V "$F: host \"$h\" uri \"$uu\" don't match\n" if $V>5;
			}
		}
	}
	# Per-host working files, all keyed by the host name:
	$cfile = "cfg/$h";		# Config info this host
	$hfile = "hst/$h";		# Current data for this host
	$afile = "add/$h";		# Additional URLs for host
	$nfile = "new/$h";		# Newly added URLs for host
	$ofile = "old/$h";		# Backup file name
	$lfile = "lck/$h";		# Lock file name
	if (-f $lfile) {		# Someone's working on it
		print V "$F: Lock file exists lfile=\"$lfile\" ($!)\n" if $V>0;
		return;
	}
	unless (open(LCK,">$lfile")) {
		print V "$F: Can't write lfile=\"$lfile\" ($!)\n" if $V>0;
		return;
	}
	&hostlock($lfile);
	$TMopen = 		# Time since we opened the current file
	$TMread = 		# Time since we last read from the file
	$URLtime =		# Time we started a new URL
		&DT();		# Make these default to right now.				# was dt()
	if (&CheckHost($h)) {	# DNS/robots/etc. checks; message left in $hstmsg
		print V "$F: Host \"$h\" failed checks.\n" if $V>0;
		print V "$F: $hstmsg\n" if $V>0;
		&Backup($afile) if -e $afile;
		return;
	}
	if (@robotstxt) {
		print V "$F: Got 'http://$h/robots.txt' file.\n" if $V>0;
	}
	# Optional per-host configuration (depth limits, CGI policy, rewrites,
	# parser options); loaded lazily from cfghost.pm only when cfg/<h> exists.
	if (-f $cfile) {
		# NOTE(review): this print goes to STDOUT rather than the V log;
		# probably "print V" was intended -- confirm before changing.
		print "$F: require cfghost.pm\n" if $V>0;
		$i = require "cfghost.pm";	# Routines to deal with cfg/* files
		print V "$F: require \"cfghost.pm\" returned $i.\n" if $V>1;
		&cfghost($cfile,$h);
		print V "$F: HTTPversion='$HTTPversion' [after cfghost()]\n" if $V>1;
		&showRewrite() if @Rewrite;
		$hstdepth = $DepthHost{$h} || $maxdepth;
		$abcdepth = $hstdepth + 1;	# ABC files may be one level deeper
		if ($flg = $Opt{'CGI'}) {
			$allowcgi = $flg eq '+';
			print V "$F: Set CGI option to $allowcgi.\n" if $V>0;
		}
		if ($flg = $Opt{'tagP'}) {
			print V "$F: Set tagP option to '$flg'\n" if $V>0;
			$hs->setOption("$flg" . 'tagP');
		}
		if ($flg = $Opt{'tagTD'}) {
			print V "$F: Set tagTD option to '$flg'\n" if $V>0;
			$hs->setOption("$flg" . 'tagTD');
		}
		print V "$F: Max depth for $h is $hstdepth/$abcdepth.\n" if $V>0;
	}
	# Rotate the host's data file: hst/<h> becomes old/<h>, which we read
	# below while appending a fresh hst/<h>.
	&Backup($ofile) if -e $ofile;
	if (rename($hfile,$ofile)) {
		print V "$F: Moved \"$hfile\" to \"$ofile\"\n" if $V>3;
	} else {
		print V "$F: Can't rename(\"$hfile\",\"$ofile\") ($!)\n" if $V>0;
	}
	if (open(OLD,$ofile)) {
		print V "$F: Reading \"$ofile\" ($!)\n" if $V>2;
		$OLDopen = 1;
	} else {
		print V "$F: Can't read \"$ofile\" ($!)\n" if $V>0;
		close OLD;		# Make sure it's not a relic open file
		$OLDopen = 0;
	}
	if (open(HST,">>$hfile")) {
		print V "$F: Writing \"$hfile\"\n" if $V>1;
		$HSTopen = 1;
	} else {
		print V "$F: Can't write hfile=\"$hfile\" ($!)\n" if $V>0;
		close HST;		# Make sure it's not a relic open file
		$HSTopen = 0;
		return;
	}
	$HSTopen = 1;	# (redundant; already set in the branch above)
	select HST; $| = 1; select V;	# Unbuffer HST; keep V as default output
	$linkcnt = $tunecnt = $titlcnt = $filecnt = 0;	# Per-host counters...
	$linkmax = $tunemax = $titlmax = $filemax = 0;	# ...and their maxima
	print V "$F: filecnt=$filecnt for new host $h.\n" if $V>5;
	&DT();					# was dt()
	print HST "$now + start $h\n";
#	$scancnt = 0;			# Number of times we've scanned this host
	@oldchunk = ();			# One URL and its info
	@newchunk = ();			# New info about this URL
#	%Depth = ();			# Min depth of URIs so far
#	%Done = ();				# List of URIs we've processed
#	@Left = ();				# List of URIs still to handle
#	print V "$F: Emptied \%Done and \@Left\n" if $V>6;
#
	# Queue any externally supplied URLs (add/<h>, new/<h>) before scanning:
	&LoadLinks($afile) if (-f $afile);
	&LoadLinks($nfile) if (-f $nfile);
	# Pass 1: re-process every chunk of the old data file.  Blank lines
	# separate chunks; &chunk() consumes @oldchunk and writes to HST.
hostline:
	while ($l = <OLD>) {
		print V "$F: Next OLD line.\n" if $V>5;
		next if ($l =~ / \+ (start|done)\b/);	# Lines to drop
		next if ($l =~ m"jc/.*\.hdr$"i);		# Ignore jc's HDR files
		last if ($endDoc || $finishup);
		$l =~ s"[\r\s]+$"";				# Trim away trailing white stuff
		print V "====| $l\n" if $V>7;
		if ($l) {
			push @oldchunk, $l;		# Accumulate lines of one "chunk"
		} else {
			&chunk() if @oldchunk;	# Process one "chunk" of the host's data
		}
		if ($maxurls>0 && $urlcount>$maxurls) {
			print V "$F: hostline: Abort after $urlcount URLs (maxurls=$maxurls)\n" if $V>0;
			last hostline;
		}
	}
	&chunk() if @oldchunk;		# Flush the final (unterminated) chunk
	# Run thru the added URIs here. Note that if an entry in %addURI has been
	# undef'd, its name might still be there, and only the value is undefined.
	print V "$F: " . int(@Left) . " URIs left.\n" if $V>5;
	# Pass 2: scan URIs discovered during pass 1 (hyperlinks queued on @Left).
URI:
	while (@Left) {				# Local URIs discovered in hyperlinks
#		print V "\n" if $V>3;
		print V "$F: There are " . int(@Left) . " local URIs left.\n" if $V>5;
		&DT();					# Note the clock time for each URL				# was dt()
		# Give up on the whole host when no tune has turned up for too long.
		# NOTE(review): the second message prints $t but labels it the
		# give-up time; $giveuptime was probably meant -- confirm.
		if (($t = $now - $lasttunetime) > $giveuptime) {
			print V "$F: ### It has been $t sec (" . &dhms($t) . ") since last tune found; giving up.\n" if $V>0;
			print V "$F: Our give-tup time is $t sec (" . &dhms($giveuptime) . ").\n" if $V>0;
			$finishup = $endDoc = $endHost = 1;
		}
		if ($maxurls>0 && $urlcount>$maxurls) {
			print V "$F: URI: ### Abort after $urlcount URLs\n" if $V>0;
			$finishup = $endDoc = $endHost = 1;
		}
		if ($endHost) {				# Abandon the current file and host.
			print V "$F: Abandon current host endHost=$endHost ...\n" if $V>0;
			$hs->DOCclose();
			&Backup($afile) if -e $afile;
			&Backup($nfile) if -e $nfile;
			last URI;				# Ignore further URIs for this host
		}
		if ($endDoc) {				# Abandon the current file only.
			print V "$F: Abandon current URL endHost=$endHost ...\n" if $V>0;
			$hs->DOCclose();
			next URI;				# Try another URI
		}
		$upath = shift @Left;		# Get one URI
		print V "$F: URI: \"$upath\" (" . int(@Left) . " left)\n" if $V>5;
		next URI unless $upath;	# Paranoia: Ignore nulls
		if ($Done{$upath}) {		# Have we done it already?
			print V "$F: #### \"$upath\" already done at $Done{$upath} depth $Depth{$upath}.\n" if $V>2;
			print V "$F: DROP \"$upath\"\n" if $V>3;
			next URI;
		}
		$purged = 0;				# Set true to skip this URL
		# Media/archive suffixes can't contain ABC text; optionally log archives:
		if (($base,$suff) = ($upath =~ /^(.*)\.(gif|ps|midi*|jpe*g|zip|g*z|au|mp\d*|wav)$/)) {
			print V "----> '$upath' dropped (suffix).\n" if $V>3;
			if ($notearchives) {
				if ($suff eq 'zip') {
					system "echo ZIP: http://$h/$upath >> ZIPfiles"
				} elsif ($suff =~ /g*z/) {
					system "echo GZIP http://$h/$upath >> ZIPfiles"
				}
			}
			next URI;		# Skip this file
		}
		unless (defined($dpth = $Depth{$upath})) {
			print V "$F: #### U:$upath depth unknown.\n" if $V>0;
			$dpth = 1;		# Make a guess
		}
		if ($dpth < 1) {
			print V "$F: #### U:$upath ignored at depth $dpth.\n" if $V>3;
			next URI;
		}
		print V "$F: Add D:$dpth U:$upath\n" if $V>5;
		# Feed the URI through &chunk() as a synthetic "0 U" line:
		@oldchunk = ("0 U D:$dpth $upath");
		&chunk();
	}
	# Wrap up: close the new data file, back up the queue files, drop the lock.
	print V "$F: No more new URIs for \"$h\".\n" if $V>3;
	&CloseHST($HostT0{$h},&DT()) if $HSTopen;				# was dt()
	&Backup($afile) if -e $afile;
	&Backup($nfile) if -e $nfile;
	&hostunlock if $lfile;
	if (-d "$cachedir/$h") {	# Relink the host's cached files
		$cmd = "nice relink +r '$cachedir/$h'";
		print V "$F: cmd=\"$cmd\"\n" if $V>3;
		system "$cmd &";
	}
}

sub hostlock {
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Record this process as the owner of the per-host lock file.  The LCK        #
# handle was opened by &host() before calling us; we write "<pid> <program>"  #
# into it and close it.  The $lfile argument passed by &host is ignored; the  #
# global $lfile is used, and only for logging here.                           #
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
	printf LCK "%d %s\n", $$, $P;	# Same bytes as: print LCK "$$ $P\n"
	close(LCK);
	print V "$P: LOCKed \"$lfile\"\n" if $V>3;
}

sub hostunlock {
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Release the per-host lock: close the LCK handle, remove the lock file, and  #
# clear $lfile so later "&hostunlock() if $lfile" guards know no lock is      #
# currently held.                                                             #
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
	close LCK;
	unlink($lfile);
	print V "$P: unLOCK \"$lfile\"\n" if $V>3;
	$lfile = '';		# Mark "no lock held"
}

sub href {my $F='href';
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Given an href, we decide here how to handle it. The caller must pass us the #
# URL  from  the href, and the string (item) between the '>' and the </a>, in #
# case we need to check what's there.  The main use we make of the item is to #
# check  for and reject "parent dir" references.  We also look at a few other #
# suffixes and decide whether we should load them and scan their contents. If #
# the  URL  is  accepted,  we  pass  it  to &URL() for later processing.  For #
# rejected URLs, we just return.                                              #
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
	local(
		$curr,		# Current URL
		$href,		# URL pointed to
		$item,		# HTML text associated with $href
		$incr)		# Level increment: 0 for frames, 1 for other URLs
			= @_;
	local($a,$l,$p,$s,$u);
	$incr = 1 unless defined $incr;		# Default: links go one level deeper
	print V "$F: curr=\"$curr\" href=\"$href\" item=\"$item\"\n" if $V>6 || ($href =~ /^\$/);
	print V "$F: currhost='$currhost' testhost='$testhost' url='$url'\n" if $V>2;
	print V "$F: href=\"$href\"\n" if $V>2;
	if (@Rewrite) {					# Are we rewriting URLs?
		$href = &doRewrite($href);	# Try to rewrite this one
		print V "$F: href=\"$href\"\n" if $V>2;
	}
	if ($currhost eq $testhost) {	# Special-case cleanup for the test host
		print V "$F: href=\"$href\"\n" if $V>2;
		if ($href =~ s"/(display|display.php)/(\d+).+"/display/$2") {
			print V "$F: href=\"$href\" <=========\n" if $V>2;
		}
	}
#	$href =~ s/\s*".*//;			# Discard anything after a double quote
	if ($allowcgi) {				# Are we rejecting CGI calls?
		print V "$F: CGI calls are allowed.\n" if $V>2;
	} else {
		if ($href =~ /^(\?)/i) {	# Check for CGI with URL variables
			print V "$F: Ignore href=\"$href\" [contains '?']\n" if $V>0;
			return;
		}
		if ($href =~ /\bcgi\b/i) {	# ($allowcgi is known false here)
			print V "$F: Ignore href=\"$href\" [contains 'cgi' allowcgi=$allowcgi]\n" if $V>0;
			return;
		}
	}
	if ($href =~ /\.(bak|log|tmp|out)\b/i) {	# Scratch/backup files are never ABC
		print V "$F: Ignore href=\"$href\" (bak|log|tmp|out)\n" if $V>5;
		return;
	}
	if ($base) {	# A <BASE> tag (global $base) overrides the document's URL
		print V "$F: base=\"$base\" replaces curr=\"$curr\"\n" if $V>6;
		$curr = $base;
	}
	print V "$F: href={" . &esc($href) . "}\n" if $V>2;
	if ($href =~ '/$') {	# If final '/', treat as directory
		print V "$F: Treat href=\"$href\" as directory.\n" if $V>7;
		if ($chkuplinks && ($item =~ /\b(Parent|Home|Back)\b/i)) {
			print V "$F: Ignore href=\"$href\" item=\"$item\"\n" if $V>1;
			return;
		} elsif ($href =~ m"^\w*://") {	# Full URL
			print V "$F: \"$href\" read at depth $depth.\n" if $V>6;
			&URL($href,$depth+$incr);
		} else {						# Relative URL
			$u = &URLhref($curr,$href);
			print V "$F: \"$u\" read at depth $depth.\n" if $V>6;
			&URL($u,$depth+$incr);
		}
		print V "$F: Done with directory.\n" if $V>6;
		return;
	}
	# No final '/' on HREF:
	print V "$F: Treat href=\"$href\" as non-directory.\n" if $V>6;
	if ($href =~ m'(.*)#(\w+)$') {	# Strip a #fragment before queueing
		print V "$F: Offset href=\"$1\" (#$2)\n" if $V>5;
		$u = &URLhref($curr,$1);
		print V "$F: \"$u\" URL at depth $depth.\n" if $V>6;
		&URL($u,$depth+$incr);
	} else {
		# BUG FIX: this print used $1/$2, which are stale captures left over
		# from the failed fragment match above, producing misleading logs.
		print V "$F: Simple href=\"$href\"\n" if $V>6;
		$u = &URLhref($curr,$href);
		print V "$F: \"$u\" URL at depth $depth.\n" if $V>6;
		&URL($u,$depth+$incr);
	}
}

sub mkdirs {my $F='mkdirs';
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Create a directory tree.  This is recursive.  We attempt to make  the  full #
# path.   If that fails, we trim off the last field, and call ourself to make #
# the parent directory. When that returns, we once again try to make the full #
# path.  We return >0 for success (number of directories created), 0  because #
# the directory exists already, and <0 for real failure.                      #
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
	local($p) = @_;
	local($d,$n,$x);
	return( 0) if -d $p;	# Does it already exist?
	return(-1) if -f $p;	# Is it a file?
	return(1) if mkdir($p,0775);	# Simple case: parent already exists
	$n = 0;				# That failed; we'll have to build the parents.
	print V "$F: Make dir \"$p\"\n" if $V>3;
	if (($d,$x) = ($p =~ m"^(.+)/([^/]+)/*$")) {
		if (-d $d) {	# Does parent directory exist?
			print V "$F: Dir \"$p\" exists.\n" if $V>1;
		} elsif (($n = &mkdirs($d)) >= 0) {	# <=== Recurse
			print V "$F: Made dir \"$p\"\n" if $V>1;
		} else {
			print V "$F: #### Can't make dir \"$d\" [$!]\n" if $V>0;
		}
	}
	if (mkdir($p,0775)) {
		$n = 0 if $n < 0;	# Paranoia: don't count a failed recursion
		++$n;		# Accumulate the number of directories created
		print V "$F: Made dir \"$p\"\n" if $V>1;
		return $n;
	} else {
		print V "$F: #### Cannot make  dir \"$p\" [$!]\n" if $V>0;
		# BUG FIX: this was "return -$n", which negated a negative (failure)
		# count from the recursion into a positive value -- e.g. a parent
		# that is a plain file made mkdirs() report success (+1).  Always
		# return a negative value on failure.
		return $n > 0 ? -$n : ($n < 0 ? $n : -1);
	}
}

sub saveURLs {my $F='saveURLs';
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Write the accumulated list of URLs to the appropriate new/* files. All the
# URLs have been split into host and URI portions, and we have the arrays:
#  $h2n{$host} is the number of URIs for $host
#  $h2u{$host}->[$n]  is the nth URI for $host
#  $h2d{$host}->[$n]  is the depth of each URI (1 for initial URIs)
# For each host $h, we open the new/$h file and append a "U" line  for  each
# URI.  These may be redundant, of course, and if so, will be dropped when we
# process that host again.
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
	local($d,$h,$nfile,$hst,$upath,$ufull,$urls,$x);
	unless (%h2n) {
		print V "$P: There are no URLs to save.\n" if $V>3;
		return;
	}
	print V "$P: Save URL list ...\n" if $V>3;
host:
	for $h (sort keys %h2n) {
		next unless $h;			# Skip any empty host key
		$hst = lc($h);			# Host names are caseless
		$urls = $h2n{$hst};
		print V "$P: Host \"$hst\" has $urls new URLs.\n" if $V>5;
		next if $urls < 1;
		$nfile = "new/$hst";
		unless (open(NEW,">>$nfile")) {
			print V "### Can't write nfile=\"$nfile\" ($!) (saveURLs)\n" if $V>0;
			next host;
		}
		print NEW "# $h\n";			# Make sure the file identifies the host
		for ($n = 0; $n < $urls; $n++) {
			$upath = $h2u{$hst}->[$n] || '/';
			$d   = $h2d{$hst}->[$n];
			# BUG FIX: this was joined with '//', producing "http//host/..."
			# which could never match the "http://..." keys kept in %Done
			# (see &chunk), so the already-done check below always missed.
			# NOTE(review): assumes %ProtHost stores a bare scheme name like
			# "http" -- confirm against wherever %ProtHost is filled in.
			$ufull = ($ProtHost{$hst} || 'http') . '://' . $hst . $upath;
			print V "$F: ufull=\"$ufull\"\n" if $V>1;
			print V "#---> uri $n depth $d host \"$hst\" is \"$upath\"\n" if $V>3;
			if ($x = $Done{$ufull}) {	# Already processed; don't re-queue
				print V "$F: #### \"$ufull\" already done at $Done{$ufull} depth $Depth{$ufull}.\n" if $V>2;
			} else {
				&DT();				# was dt()
				print NEW "\n0 U D:$d $upath\n";
			}
		}
		close NEW;
	}
	print V "$P: Initial URL list done.\n\n" if $V>3;
	close H;		# Relic handle; harmless if it was never opened
	if (defined($host) && $host) {
		$h2n{$host} = ();	# Forget the initial list
		$h2u{$host} = ();
		$h2d{$host} = ();
	}
}

sub dhms {my $F='dhms'; local($ptime) = @_;
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Format a second count as a compact "DdHhMmSs" string, e.g. 90061 becomes    #
# "1d1h1m1s".  Leading zero-valued units are stripped ("61" -> "1m1s"); a     #
# zero input yields "0s".                                                     #
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
	my($days,$hours,$mins,$secs,$text);
	$secs  = $ptime % 60;
	$mins  = int($ptime / 60)    % 60;
	$hours = int($ptime / 3600)  % 24;
	$days  = int($ptime / 86400);
	$text  = sprintf("%dd%dh%dm%ds", $days, $hours, $mins, $secs);
	$text  =~ s/^[0hmds]*//;	# Strip the leading all-zero units
	return $text || '0s';
}

sub dmpline {my $F="dmpline";
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Produce a symbolic dump of one or more strings on the V log, each escaped   #
# via &esc().  Lines are labelled HTTP or DATA depending on whether the       #
# global $inHTTPhdrs flag says we're still inside the HTTP header section.    #
# The first arg is an ID string shown at the start of each line; $doclines    #
# (global) gives the current line count within the document.                  #
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
	local($id) = shift;
	my $section = $inHTTPhdrs ? 'HTTP' : 'DATA';
	foreach my $str (@_) {
		print V "$id $section $doclines {" . &esc($str) . "}\n";
	}
}

sub scanHDRs {my $F="scanHDRs";
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# We assume we're in the HTTP header section of the input.  We gobble up  the #
# data,  and look for the few things that might be of interest to us.  Mostly #
# we just discard the lines.  When we hit a  blank  line,  we  turn  off  the #
# inHTTPhdrs flag and return.  At end-of-file we close the document,  clear   #
# inHTTPhdrs, and return whatever Content-Type (possibly '') we have seen.    #
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
	local($line,$lines,$type);
	$TMread =  time;	# Note time of last DOC read
	$type = '';			# Return value is Content-Type, if found, else null
	$lines = 0;			# Number of input lines read.
	# BUG FIX: the loop used to test truth ("while ($line = ...)"), which made
	# the in-loop EOF (undef) branch unreachable; at EOF we fell out of the
	# loop without calling closeDoc() or clearing $inHTTPhdrs.
	while (defined($line = $hs->DOCline())) {
		++$lines;
		&dmpline('+++',$line) if $V>2;
		$line =~ s/[\r\s]+$//;	# Trim white stuff and newline
		unless ($line) {		# Blank line is end of HTTP headers
			print V "$F: End of HTTP headers.\n" if $V>2;
			$inHTTPhdrs = 0;
			return $type;
		}
		# Look for interesting headers:
		if ($line =~ /^content-type:\s(.*)$/i) {
			$type = $1;
			print V "$F: Content type is \"$type\"\n" if $V>2;
		} else {
			print V "$F: Ignore \"$line\"\n" if $V>5;
		}
		$TMread =  time;		# Note time of last DOC read
	}
	# EOF arrived before the blank line that terminates the headers:
	print V "$F: EOF on DOC in HTTP headers after $lines lines.\n" if $V>0;
	closeDoc('EOF');
	$inHTTPhdrs = 0;
	return $type;
}

sub chkURL {my $F='chkURL'; local($url) = @_;
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Check a URL against @Base (accept-prefix list) and @BadPat (reject pattern  #
# list).  Return 1 to reject the URL, 0 to accept it.  When @Base is non-     #
# empty, the URL must start with at least one entry or it is rejected.        #
# NOTE(review): each @BadPat entry is applied via a string eval of            #
# "'$url' =~ m$pat", so entries must carry their own delimiters, and both     #
# the URL and the pattern are interpolated into code -- never feed untrusted  #
# data into @BadPat or these URLs.                                            #
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
	local($exp,$pat,$base,$bases);
	print V "$F: url=\"$url\"\n" if $V>2;
	$bases = 0;
	if (@Base) {
		foreach $base (@Base) {		# Count the accept-prefixes that match
			next unless $url =~ /^$base/;
			++$bases;
			print V "$F: Base '$base' matches '$url'\n" if $V>2;
		}
		unless ($bases) {
			print V "$F: '$url' REJECTED (no Base match)\n" if $V>0;
			return 1;
		}
	}
	foreach $pat (@BadPat) {		# Reject any URLs that match these
		print V "$F: pat=\"$pat\"\n" if $V>2;
		$exp = "'$url' =~ m$pat";
		print V "$F: exp=\"$exp\"\n" if $V>2;
		next unless eval($exp);
		print V "$F: '$url' REJECTED by pat=/$pat/\n" if $V>0;
		return 1;
	}
	print V "$F: '$url' ACCEPTED.\n" if $V>1;
	return 0;
}

sub scan {my $F="scan";
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Grovel through a file, looking for hyperlinks or pieces of abc code.  Check #
# out each of the files listed.  Directories cause recursive traversal. Files #
# with interesting suffixes are read. This routine is complicated by the need #
# to  decode  HTML  as well as plain text.  An extra complication is that the #
# HTTP headers may include various error indications, and we may not be  able #
# to get the file at all.                                                     #
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
	local($URL) = @_;
	local($endDoc,$DOCopen);	# Localize the DOC file
	local($allow,$getcmd,$init,$inABC,$inHdr,$H,$K,$pfx,$X,$ttl);
	local($udir,$ubas,$usuf);
	local($nhrefs,$possibletunes,$when,$xx);
	local($ct,$d,$dpth,$h,$i,$item,$l,$line,$lurl,$n,$p,$s,$surl,$t,$u);
	local($ss,$mm,$hh,$DD,$MM,$CY);
#	print V "\n" if $V>2;
	print V "$F: SCAN \"$URL\"\n" if $V>3;
	if ($t = $Done{$URL}) {
		$d = $Depth{$URL};
		print V "$F: #### \"$URL\" already done at $Done{$URL} depth $Depth{$URL}.\n" if $V>1;
		print V "$F: DROP \"$ufull\"\n" if $V>1;
		return 0;
	}
	$DOCopen = $inABC = $inHdr = 0;	# No longer reading from DOC
	$possibletunes = 0;			# Tunes found in file but maybe rejected
	$tunesinfile   = 0;			# Tunes in file accepted
	$lurl = &URLtrim($URL);		# Long URL may have final '/' and other junk
	if ($lurl =~ /^(.*)((%22%20|" )class=.*)$/) {
		printf V "$F/%d: #### lurl \"$lurl\" contains \"$2\"\n",__LINE__ if $V>1;
		printf V "$F/%d: ####  URL=\"$URL\"\n",__LINE__ if $V>1;
		$lurl = $1;
		printf V "$F/%d: #### lurl=\"$lurl\" <============",__LINE__ if $V>1;
	}
	print V "$F: lurl={" . &esc($lurl) . "}\n" if $V>2;
	if ($allowcgi) {		# Check for CGI
		print V "$F: CGI calls are allowed.\n" if $V>2;
	} else {
		if ($lurl =~ /(\bcgi\b|\bbin\b)/) {		# Once contained '|\?'
			print V "$F: \"$lurl\" rejected due to \"$1\"\n" if $V>0;
			return;
		}
	}
	if ($lurl =~ /(#)/) {		# Ignore internal anchors
		print V "$F: \"$lurl\" rejected due to \"$1\"\n" if $V>2;
		return;
	}
#	print V "\n" if $V>1;
	print V "$esep\n" if $V>2;
	if ($lurl =~ /^(.*)((%22%20|" )class=.*)$/) {	# DEBUG: Look for class= garbage at end of URL
		printf V "$F/%d: #### lurl \"$lurl\" contains \"$2\"\n",__LINE__ if $V>1;
		$lurl = $1;
		printf V "$F/%d: #### lurl=\"$lurl\" <============",__LINE__ if $V>1;
	}
	($ss,$mm,$hh,$DD,$MM,$CY) = gmtime($now = time); ++$MM; $CY += 1900;
	print V "\n====> $depth $CY/$MM/$DD $hh:$mm:$ss V=$V \"$lurl\"\n" if $V>0;	# <=========== Log new URL
	$URLtime = $now;
	if ($lurl =~ 'sid=') {
		print V "$F: \"$lurl\" should be checked ...\n" if $V>0;
	}
	if ((@BadPat || @Base) && chkURL($lurl)) {
		print V "$F: \"$lurl\" rejected by chkURL()\n" if $V>0 && @BadPat;
		return;
	} else {
		print V "$F: \"$lurl\" accepted by chkURL()\n" if $V>0;
	}
	($surl = $lurl) =~ s"/+$"";	# Short URL lacks final '/'
# Split the URL into directory/base/suffix:
	if (($udir,$ubas,$usuf) = ($surl =~ m"(.*/)([^/]*)\.(\w+)$")) {
		$usuf = lc($usuf);		# Suffix is caseless to us 
	} elsif (($udir,$ubas) = ($surl =~ m"(.*/)([^/]*)$")) {
		$usuf = '';				# No suffix is OK, too
	} else {
		$udir = $ubas = $usuf = ''
	}
	print V "udir=\"$udir\" ubas=\"$ubas\" usuf=\"$usuf\"\n" if $V>2;
	if ($surl eq '') {			# Shouldn't happen
		local($p,$c,$l) = caller;
		print V "$F: \"$URL\" (from $p/$c/$l)\n" if $V>0;
		return 0;
	}
	if (($dpth = $Depth{$surl}) && ($dpth <= $depth)) {
		print V "$F: \"$URL\" is marked as depth $dpth.\n" if $V>3;
		if ($Done{$URL}) {
			print V "$F: \"$URL\" already scanned at depth $dpth.\n" if $V>2;
			return 1;
		}
	}
#	$Depth{$surl} = $depth;		# Note that we've done this URL
#	print V "$F: \"$lurl\" marked as depth $depth.\n" if $V>3;
	for $init (keys %RejectPfx) {
		if (substr($upath,0,length($init)) eq $init) {
			print V "$F: REJECT \"$upath\" for currhost=\"$currhost\"\n" if $V>1;
			$purged = 1;		# Tell caller to drop this URL
			$Done{$upath} = $now;
			$Depth{$upath} = $depth;
			return 0;
		}
	}
#	if (%Disprefix) {
#		print V "$F: Checking disallows ...\n" if $V>3;
#disallow:
#		for $dispfx (keys %Disprefix) {
#			print V "$F: disallow: \"$dispfx\" \n" if $V>2;
#			if (substr($upath,0,length($dispfx)) eq $dispfx) {
#				print V "$F: Disallow: \"$dispfx\" \n" if $V>2;
#				print V "$F: Disallows \"$upath\" \n" if $V>2;
#				for $allow (keys %AllowURI) {
#					print V "$F: allow: \"$allow\"\n" if $V>2;
#					if (substr($upath,0,length($allow)) eq $allow) {
#						print V "$F: ALLOW: \"$upath\"\n" if $V>1;
#						print V "$F: PATTERN: \"$allow\"\n" if $V>1;
#						$purged = 0;
#						last disallow;
#					} else {
#						print V "$F: no match: \"$upath\"\n" if $V>1;
#					}
#				}
#				print V "$F: DISALLOW: \"$upath\"\n" if $V>1;
#				print V "$F: PATTERN: \"$dispfx\"\n" if $V>1;
#				$purged = 1;	# Tell caller to drop this URL
#				$Done{$upath} = $now;
#				$Depth{$upath} = $depth;
#				return 0;
#			}
#		}
#	}
	$currURL   = $lurl;	# Global copy of current URL
	$DOCopen   = &DT();	# Time DOC opened; zero if not open				# was dt()
	$inABC     = 0;		# Not in ABC yet
	$inHdr     = 0;		# Not in ABC header either ;-)
	$docbytes  = 0;		# Count the bytes in the current document
	$doclines  = 0;		# Count the lines in the current document
	$doclinks  = 0;		# Count the links in the current document
	$doctitls  = 0;		# Count the titles in the current document
	$doctunes  = 0;		# Count the tunes in the current document
	if ($lurl =~ /\.(doc|exe|tgz)\b/) {	# Reject several binary file formats
		print V "$F: \"$ufull\" ignored (.doc/.exe/.tgz rule)\n" if $V>1;
		$Done{$lurl} = $now;
		$Depth{$lurl} = $depth;
		$purged = 1;	# Tell caller to drop the URL
		$newuline = '';
		@newchunk = ();
		return 0;
	}
	if ($lurl =~ /^(.*)((%22%20|" )class=.*)$/) {	# DEBUG: Look for class= garbage at end of URL
		printf V "$F/%d: #### lurl \"$lurl\" contains \"$2\"\n",__LINE__ if $V>1;
		$lurl = $1;
		printf V "$F/%d: #### lurl=\"$lurl\" <============",__LINE__ if $V>1;
	}
	print V "$F: Opening '" . &esc($lurl) . "'\n" if $V>2;
	$TMopen = $TMread = &DT();				# was dt()
	for $pfx (keys %RejectPfx) {
		print V "$F: RejectPfx check \"$pfx\"\n" if $V>2;
		if (substr($lurl,0,length($pfx)) eq $pfx) {
			print V "$F: REJECT \"$lurl\" [matches $pfx]\n" if $V>1;
			$Done{$lurl} = $now;
			$Depth{$lurl} = $depth;
			$purged = 1;	# Tell caller to drop the URL
			$newuline = '';
			@newchunk = ();
			return 0;
		}
	}
	if ($lurl =~ /^(.*)((%22%20|" )class=.*)$/) {	# DEBUG: Look for class= garbage at end of URL
		printf V "$F/%d: #### lurl \"$lurl\" contains \"$2\"\n",__LINE__ if $V>1;
		$lurl = $1;
		printf V "$F/%d: #### lurl=\"$lurl\" <============",__LINE__ if $V>1;
	}
	$HTTPcontime = time;	# Note time we last tried to connect
	unless ($hs->DOCopen($docurl = $lurl)) {
		$DOCopen = 0;
		if ($V>5 || ($lurl =~ m"/$")) {
			&DT();
			push @newchunk, "$now # not accessible.";
			$currURL = '';
		} else {
			$newuline = '';
			@newchunk = ();
		}
		$TMopen = $TMread = 0;	# The time we opened the current file
		print V "$F: \"$lurl\" not accessible.\n" if $V>2;
		return 0;
	}
	$TMopen = time;		# Note time of last DOC open
	print V "$F: Reading \"$lurl\" V=$V ...\n" if $V>2;
	$fileXhdrs = $fileThdrs = $filePhdrs = $fileKhdrs = 0;
	$w3timedout  = 0;	# No timeout yet
	$ignoretune  = 0;	# If true, ignore all tunes in this file
	$ignorefile  = 0;	# If true, ignore the current tune
	$endDoc      = 0;	# If true, close the doc file and contine with next doc
	$hs->setOption("+HDRs");	# We want to see HTTP header
	$inHTTPhdrs = 1;	# Interpret first lines as HTTP headers
	print V "$F: Look for HTTP headers ...\n" if $V>2;
	if ($ct = &scanHDRs()) {		# Read the HTTP headers
		print V "$F: Content-Type '$ct' found in HTTP header.\n" if $V>2;
		if ($IgnoreType{$ct}) {
			print V "$F: Content-Type '$ct' ignored.\n" if $V>1;
			&closeDoc("Content-Type '$CT' ignored");
		} else {
			print V "$F: Content-Type '$ct' accepted.\n" if $V>1;
		}
	} else {
		print V "$F: Content-Type \"$ct\" not known.\n" if $V>1;
	}
# Each time around this loop, we try to  get  one  line  from  the  document,
# append  it  to the remains of the previous line, and decide what to do with
# it.  Most of the time, we will empty out $line, but in  HTML  docs  we  may
# return  here  with  a  partial  line  unprocessed.  We distinguish the HTTP
# headers from the doc's contents, and there are a number of  things  in  the
# headers that we look for.  [No HTML here any more.]
buffer:
	while (!$endDoc && !$ignorefile && !$HTTPalrm && !$finishup) {
		print V "$F: Call scanBuf()\n" if $V>5;
		$i = &scanBuf();
		print V "$F: scanBuf() returned $i.\n" if $V>2 || $i>2;
		if    ($i eq 0) {return 0}
		elsif ($i eq 1) {return 1}
		elsif ($i eq 2) {next buffer}
		elsif ($i eq 3) {last buffer}
		else  {print V "$F: ### scanBuf() returned $i ###\n" if $V>2}
	}
	if ($w3timedout) {
		print V "$F: Timed out in \"$lurl\"\n" if $V>0;
		&timedout();
	}
	if ($inABC) {
		print V "$F: EOF ends ABC tune at line $doclines.\n" if $V>1;
		&tune();		# Write the tune to the cache
		@tune = ();		# Initialize for the next tune
		$inABC = $inHdr = 0;	
		$X = $ttl = undef;
	}
	++$filecnt if ($doctunes>0 || $doctitls>0);
		# N.B.: We will only index the tune if there's a title. Since $doctunes
		# may count tunes for which there's no title, we may be looking at a tune
		# that will not be indexed.  Not that there's a problem with that.
	if ($V>7) {
		print V "$F: filecnt=$filecnt because doctunes=$doctunes doctitls=$doctitls.\n";
		$xx = ($doctunes > 0) ? " in ABC file $filecnt" : '';
		push @newchunk, "$now $doclinks links, $doctunes tunes, $doctitls titles$xx."
			if $doctunes || $doctitls || $doclinks;
	}
	$linkcnt += $doclinks;			# Total links  at this host
	$tunecnt += $doctunes;			# Total tunes  at this host
	$titlcnt += $doctitls;			# Total titles at this host
	print V "$F: doclinks=$doclinks linkcnt=$linkcnt doctunes=$doctunes tunecnt=$tunecnt doctitls=$doctitls titlcnt=$titlcnt\n" if $V>3;
	$linkmax = $linkcnt if $linkmax < $linkcnt;
	$tunemax = $tunecnt if $tunemax < $tunecnt;
	$titlmax = $titlcnt if $titlmax < $titlcnt;
#	$t = &DT() - Max($HTTPcontime,$TMread);	# Seconds since last read	# was dt()
	$t = &DT() - $URLtime;			# Seconds since last read			# was dt()
	print V "$F: t=$t crawldelay=$crawldelay.\n" if $V>2;
	$d = $crawldelay - $t;			# How long to delay between GETs
	if ($d > 0) {					# Do we have a min delay between GETs?
		print V "$F: Delay $d sec at $cymd $hms [t=$t < crawldelay=$crawldelay]\n" if $V>2;
		sleep($d);					# If so, wait out the rest of the delay
	}
	if ($endDoc || $HTTPalrm) {	# Some disaster detected
		$exitreason = "timeout alarm after $t sec";
		print V "Close \"$URL\" ($exitreason)\n" if $V>3;
		$endDoc = $HTTPalrm = 0;
	} elsif ($finishup) {
		$exitreason = "Told to finishup";
		print V "$F: $exitreason ...\n" if $V>0;
	} else {
		$exitreason = 'EOF';
		print V "$F: EOF on DOC file at $cymdhms.\n" if $V>6;
	}
	&closeDoc($exitreason) if $DOCopen;
	if ($tunesinfile>0 && $V>1) {		# ABC line count
		($ss,$mm,$hh,$DD,$MM,$CY) = gmtime($now = time); ++$MM; $CY += 1900;
		$s = ($tunesinfile > 1) ? 'tunes' : 'tune';
		$when = "$CY/$MM/$DD $hh:$mm:$ss";
		print V "      \"$lurl\" ==== $tunesinfile abc $s ==== X:$fileXhdrs T:$fileThdrs P:$filePhdrs K:$fileKhdrs $when\n";
		print V "$now T X:$tunecnt T:$titlcnt F:$filecnt H:$currhost at $when\n" if $V>0;
	 
	}
	print V "$F: Xhdrs=$fileXhdrs Thdrs=$fileThdrs Phdrs=$filePhdrs Khdrs=$fileKhdrs\n" if $V>2;
	print V "$F: D:$depth B:$docbytes L:$doclinks X:$doctunes T:$doctitls $upath\n" if $V>1;
	alarm 0; $SIG{ALRM} = 0;
	print V "$F: Set alarm 0.\n" if $V>6;
	print V "$F: DONE \"$URL\"\n" if $V>1;
	return 1;
}

sub closeDoc {my $F='closeDoc';
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Shut down the current document via the doc handle $hs, recording why, and   #
# then log a summary of how many X:/T:/P:/K: ABC header lines the document    #
# contained (only counts above 2 are worth reporting).                        #
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
	local($why) = @_ ? @_ : 'no reason';	# Reason for closing; default when none given
	$hs->endDoc($why);			# Tell the doc handler we're finished with this doc
	print V "$hsep\n" if $V>2;	# Visual separator in the verbose log
	# Report each per-file ABC header count under the same verbosity rules
	# the old one-line-per-counter code used.
	for my $hdr (['X',$fileXhdrs], ['T',$fileThdrs], ['P',$filePhdrs], ['K',$fileKhdrs]) {
		my ($tag,$cnt) = @$hdr;
		print V "$F: DOC had $cnt $tag: lines.\n" if $V>2 && $cnt>2;
	}
}

sub scanBuf {my $F='scanBuf';
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# This is the code that handles once through the buffer: loop in scan().   It
# was  moved  here  during an attempt to profile the code, and might be moved
# back eventually.  Or not, if this division of labor turns out to be useful.
# Our return values tell the caller to do these things:
#   0: return 0
#   1: return 1
#   2: next buffer
#   3: last buffer
# The job of this routine is to grovel through the current input buffer, do a
# rough parse, and hand identifiable chunks off to various other routines for
# further processing.
# Globals read/written here: $hs (doc handle), $endDoc, $ignorefile,
# $w3timedout, $inHTTPhdrs, $inABC, $inHdr, @tune, $X, and the per-doc
# counters ($doclines, $docbytes, $doclinks, $doctunes, $fileXhdrs, ...).
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
	local($data,$line,$l1,$l2,$val);
	print V "$F: Called.\n" if $V>5;
	if ($endDoc || $ignorefile) {
		print V "$F: Ignoring rest of document.\n" if $V>1 && $ignorefile;
		print V "$F: Closing document.\n"  if $V>1 && $endDoc;
		return 3;			# Abandon this doc
	}
	print V "$F: Reading DOC V=$V ...\n" if $V>5;
	if ($w3timedout) {			# Should we give up on this doc?
		print V "$F: TIMEOUT with V=$V\n" if $V>1;
		unless ($doclinks || $doctunes || $doctitls) {
			print V "$F: TIMEOUT and nothing interesting found.\n" if $V>0;
			return 3;			# Abandon this doc
		}
		print V "$F: Timeout ignored because $doclinks links, $doctunes tunes, $doctitls titles.\n" if $V>3;
	}
	print V "$F: No timeout.\n" if $V>5;
	print V "$F: DOCline call ...\n" if $V>5;
	$TMread =  time;			# Note time of last DOC read
	$data = $hs->DOCline();
	if (!defined($data)) {		# Read one line from document
		print V "$F: EOF\n" if $V>1;
		return 3;
	}
#	++$doclines;				# Count the lines in the document (done in docLine now)
#	if ($V>1) {					# Count the lines with X:, T:, P: or K: headers
		++$fileXhdrs if ($data =~ /\bX:/);
		++$fileThdrs if ($data =~ /\bT:/);
		++$filePhdrs if ($data =~ /\bP:/);
		++$fileKhdrs if ($data =~ /\bK:/);
#	}
	&dmpline('===',$data) if $V>2;
	print V "BUF: \"$data\"\n" if $V>5;
	$TMread = time;				# Note last time we got data from file
	print V "$F: Set opent1=$TMread after TIMEOUT\n" if $V>1 && $w3timedout;
	$docbytes += length($data)	# Count the bytes in the document
		unless $inHTTPhdrs;
	print V "$F: Doc has $doclines lines $docbytes bytes so far.\n" if $V>5;
	print V "$F: maxlines=$maxlines usuf='$usuf'\n" if $V>5;
	if ($doclines > $maxlines && $usuf ne 'abc') {	# Should we continue?
		unless ($inABC || $doctunes) {
			# For files not of abc type, we require a recognized tune
			# with maxlines, or we reject it and drop the connection.
			# This does a lot to prevent grovelling through huge files
			# of irrelevant stuff.
			# (Fixed: this used to interpolate the undefined $inabc.)
			print V "$F: Abort after $doclines lines; maxlines=$maxlines, usuf='$usuf' inABC=$inABC doctunes=$doctunes.\n" if $V>2;
			&closeDoc("No ABC in initial $maxlines lines");
			return 1;
		}
	}
	$line = $data;				# Add to leftover from last line
	$line =~ s/[\s\r\n]+$//s;	# Discard trailing white stuff
	print V "$F: line={" . &esc($line) . "}\n" if $V>3;
	print V "LINE \"$line\"\n" if $V>3;
	if (!$line && $inHTTPhdrs) {	# Null line -
		$inHTTPhdrs = 0;			# Terminates headers
		print V "$F: Null line terminates HTTP header.\n" if $V>3;
		return 2;				# Otherwise ignore it
	}
	print V "$F: \"$line\"\n" if $V>5;
# We're not in the HTTP headers.  So this must be in the data.  We try to cut
# out  the  ABC  tunes and store them in @tune, calling tune() at every blank
# line that ends a tune.
	while ($line =~ s"<URL:([^>]+?)>""s) {
		print V "$F: <URL:$1>\n" if $V>2;
		$doclinks ++;			# Count the links from the current document
		&href($lurl,$1,'',1);	# Handle this href later
		&dmpline('+++',$line) if $V>2;
	}
# Next we look for possible multiple ABC headers on one line. If we find
# them, we insert newlines and put all but the first back into the
# htmlsubs module's text buffer.
	$line =~ s/^\s+(\w:)/$1/s;			# Wipe out initial white stuff
	if ($inHdr && ($line =~ s/\s+([A-Z]:)([^|])/\n$1$2/gis)) {	# Insert newlines before internal headers
		# (Fixed: this used to print the unset $lines instead of $doclines.)
		print V "$F: Inserted NL at \"$1$2\" in line $doclines.\n" if $V>2;
		&dmpline('---',$line) if $V>2;
		if (($l1,$l2) = ($line =~ /^([^\n]*)\n(.*)/s)) {
			&dmpline('--1',$l1) if $V>2;
			&dmpline('--2',$l2) if $V>2;
			$hs->putBack("$l2\n");
			$line = $l1;
			&dmpline('===',$line) if $V>2;
		} else {
			print V "$F: #### Oops! Split line doesn't match properly!!!\n" if $V>0;
		}
	}
	if ($inABC) {		# If we've already seen an X: line ...
		$line =~ s/^\s+(\w:)/$1/;	# Strip out initial white stuff from ABC headers
		if (!$line) {			# Blank line ends tune
			print V "$F: Blank line ends ABC tune.\n" if $V>2;
			$val = &tune();		# Process the tune
			print V "$F: Tune " . ($val ? 'accepted' : 'rejected') . " by &tune().\n" if $V>1;
			@tune = ();			# Reset for possible next tune
			$X = undef;
			$inABC = 0;
		} elsif ($line =~ /^X:\s*(\d+)([^\s]*)/) {	# X: line ends tune
			print V "$F: X:$1$2 line ends previous ABC tune.\n" if $V>2;
			$val = &tune();		# Process the previous tune
			print V "$F: Previous tune " . ($val ? 'accepted' : 'rejected') . ".\n" if $V>1;
			$X = "$1$2";
			print V "$F: X:$X may be new ABC tune (X:$X)\n" if $V>2;
			$inABC = $inHdr = 1;	# We're now in the header section of an ABC tune
			@tune = ($line);	# Reset for possible next tune
		} elsif ($line =~ /^K:\s*(.+)/) {	# K: line ends tune header
			# (Fixed: this used to print the unset global $K instead of $1.)
			print V "$F: K:$1 \n" if $V>2;
			$inHdr = 0;			# Note that we've passed the tune's header
			push @tune, $line;	# Add the line to the tune
		} elsif ($line =~ /^T:\s*(.+)/) {	# Tune line found inside ABC tune
			print V "$F: New title \"$1\" found.\n" if $V>2;
			if ($inHdr) {		# New title inside tune's header
				print V "$F: T: line inside header is alternate title.\n" if $V>2;
			} else {			# T: line found outside tune header
				print V "$F: T: line outside header ends ABC tune.\n" if $V>1;
				$val = &tune();			# Process the tune
				print V "$F: Tune " . ($val ? 'accepted' : 'rejected') . " by tune().\n" if $V>1;
				$X = int($X) + 1;		# Bump the tune index number
				@tune = ("X: $X");		# Use this for the next tune
				$inABC = $inHdr = 1;	# We're in header of new tune
			}
			push @tune, $line;	# Add title line to the tune
		} elsif ($line) {
			push @tune, $line;	# Just add the line to the tune
		}
		$line = '';
		print V "$F: Return 2 (Line added to tune)\n" if $V>3;
		return 2;
	}
	print V "$F: Not in ABC tune.\n" if $V>5;
	# We're not inside an ABC tune.  The main complications here are ABC %% directives.
	if ($line =~ /^%%(.*)/) {
		$directive = $1;
		print V "$F: DIR %%$directive\n" if $V>2;
		if ($directive =~ /^noindex\b/) {	# %%noindex says ignore ABC tune(s)
			$exitreason = "%%NOINDEX inABC=$inABC inHdr=$inHdr";
			print V "$F: DIR $exitreason\n" if $V>0;	# Should be 0
			$ignorefile = 1;
			$endDoc = 1;	# This may be redundant
			print V "$F: Return 3 ($exitreason)\n" if $V>0;
			return 3;		# Close down this file 
		}
	}
	$nhrefs = 0;
	$line =~ s/^\s+//;			# Strip initial white stuff
	print V "$F: line={" . &esc($line) . "}\n" if $V>3;
#	$line =~ s#^</*[bi]>##si;		# Strip out <B> and <I> tags
#	$line =~ s#^</*pre>#\n\n#si;	# Replace <PRE> tags with double newlines
	if ($line =~ m"^X:\s*(\d+)([^\s]*)") {	# X: line starts a (new) ABC tune
		$X = "$1$2";			# Note that we keep any alpha junk after the index number
		print V "$F: may be ABC tune (X:$X)\n" if $V>2;
		$inABC = $inHdr = 1;
		@tune = ($line);
		++$possibletunes;		# Count the (possible) ABC tunes
		$line = '';
		print V "$F: Return 2 (X line may be ABC)\n" if $V>5;
		return 2;
	}
	if (($H,$ttl) = ($line =~ /^([TPN]):\s*(.*)/)) {	# T: or P: line required
		unless (defined $X) {	# Missing X: line?
			$X = $Xdefault;		# Default tune index
			++$possibletunes;	# Count the (possible) ABC tunes
			@tune = ("X: '$X'");	# X: is first line
			$inHdr = 1;			# Note that we're in the header section
		}
		print V "$F: looks like ABC tune ($H:$ttl)\n" if $V>2;
		$line = "$H:__" unless $ttl;    # Missing title?
		$inABC = 1;
		push @tune, $line;
		$line = '';
		print V "$F: Return 2 ($H line possible ABC)\n" if $V>5;
		return 2;
	}
	print V "$F: Not an X or T or P line.\n" if $V>5;
	if ($line) {			# Not X: or T: so just add it to the tune
		push @tune, $line;
	}
	$line = '';
	print V "$F: Return 2 (at end)\n" if $V>5;
	return 2;
}

sub timedout {my $F='timedout';
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# This is called while abandoning a URL, to convert the current "chunk" of  a #
# hst/* file to show the timeout.                                             #
# Globals: $TMopen/$TMread are the times the doc was opened / last read;      #
# @newchunk accumulates the hst/* lines being built for the current URL, and  #
# @oldchunk holds the lines read from the previous pass's file.               #
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
	$now = time;
	$TOopen = $now - $TMopen;	# Time since we opened the current file
	$TOread = $now - $TMread;	# Time since we last read from the file
	print V "$F: TIMEOUT after $TOopen/$TOread at $cymdhms in line $doclines.\n" if $V>0;
#	if ($doclines < 1) {
#		print V "$F: Timed out instantly!!!\n" if $V>0;
#		++ $V;		# Try to find out what's happening
#		print V "$F: Increased V to $V.\n" if $V>0;
#	}
	push @newchunk, "$now URL $lurl";	# Record the URL we gave up on ...
	push @newchunk, "$now # TIMEOUT after $TOopen/$TOread sec.";	# ... and why
	&closeDoc("Timed out");
	if (@newchunk) {
		print V "$F: Timed out with partial chunk built.\n" if $V>5;
	} else {
		# Unreachable as written (the two pushes above guarantee @newchunk
		# is non-empty); kept as a safety net to preserve old data if the
		# pushes are ever made conditional again.
		print V "$F: Timed out with nothing, preserving old data\n" if $V>5;
		push @newchunk, @oldchunk if @oldchunk;
	}
}

sub scanURL {my $F='scanURL';
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Scan one URL for tunes.  Actually, all we do here is pass off most  of  the #
# work  to  scan(), and then examine the results to see if we want to include #
# this URL in the output.                                                     #
# Args: $ufull is the full URL, $upath the host-relative path.  On exit,      #
# $newuline holds the hst/* "U" line for this URL (empty if it's purged).     #
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
	local($ufull,$upath) = @_;
	local($ft,$pt);
	print V "$F: ufull=\"$ufull\"\n" if $V>2;
	print V "$F: upath=\"$upath\"\n" if $V>2;
	# Strip stray '" class=...' debris (raw or %-encoded quote forms) that
	# some pages leave glued onto the end of extracted hrefs.
	if ($upath =~ /^(.*)((%22%20|" )class=.*)$/) {
		print V "$F: #### upath \"$upath\" contains \"$2\"\n" if $V>1;
		$upath = $1;
		print V "$F: #### upath=\"$upath\" <==========\n" if $V>1;
	}
	if ($ufull =~ /^(.*)((%22%20|" )class=.*)$/) {
		print V "$F: #### ufull \"$ufull\" contains \"$2\"\n" if $V>1;
		$ufull = $1;
		print V "$F: #### ufull=\"$ufull\" <==========\n" if $V>1;
	}
	$urlcount ++;		# Count the URLs that we process
	$purged = 0;		# Mark it as tentatively OK to process
	if (@Rewrite) {		# Are there rewrite rules?
		$ufull = &doRewrite($ufull); print V "$F: ufull=\"$ufull\"\n" if $V>2;
		$upath = &doRewrite($upath); print V "$F: upath=\"$upath\"\n" if $V>2;
	}
	print V "$F: currhost='$currhost' testhost='$testhost' ufull='$ufull'\n" if $V>2;
	if (($ft = $Done{$ufull}) || ($pt = $Done{$upath})) {
		print V "$F: #### \"$ufull\" already done at $ft depth $Depth{$ufull}.\n" if $ft && $V>2;
		print V "$F: #### \"$upath\" already done at $pt depth $Depth{$upath}.\n" if $pt && $V>2;
		return;
	}
	if ($ufull =~ /\b$testhost\b/) {
		print V "$F: ufull=\"$ufull\" contains '$testhost'\n" if $V>2;
		# Canonicalize .../display/NNN<junk> to .../display/NNN ...
		if ($ufull =~ s"/(display|display.php)/(\d+)\D.*"/display/$2") {
			print V "$F: ufull=\"$ufull\" <=========\n" if $V>2;
		}
		# ... and re-check Done, since the canonical form may differ.
		if ($Done{$ufull}) {
			print V "$F: #### \"$ufull\" already done at $Done{$ufull} depth $Depth{$ufull}.\n" if $V>2;
			return;
		}
	}
	&scan($ufull);
	# scan() sets $purged and the per-doc counters; build the output line.
	if ($purged) {
		print V "$F: PURGED purgebad=$purgebad.\n" if $V>2;
		$newuline = "$now # D:$depth B:$docbytes L:$doclinks X:$doctunes T:$doctitls $upath";
	} elsif ($depth<2 || $doctunes || $doctitls || !$purgebad) {	# used to include "$doclinks ||"
		print V "$F: GOOD\n" if $V>2;
		$newuline = "$now U D:$depth B:$docbytes L:$doclinks X:$doctunes T:$doctitls $upath";
	} else {			# No links, tunes or titles
		print V "$F: Purge because links=tunes=titles=0 purged=$purged.\n" if $V>2;
		$newuline = '';	# This won't be output
	}
	print V "$F: newuline=\"$newuline\"\n" if $V>2;
	print V "$F: DONE \"$ufull\"\n" if $V>3;
	&done($depth,$now,$upath,$ufull);
}

sub showcalls {
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Dump the function-call stack.  This happens in response to some interrupts, #
# depending on how things are configured at the moment. We can also call this #
# from the debugger.                                                          #
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
	local($l,$package, $filename, $line, $subroutine);
	# NOTE(review): this guard only allows the dump when $V < 1, i.e. when
	# verbosity is OFF -- the opposite sense of every other $V test in this
	# file.  Confirm whether "return if $V<1" was intended before changing.
	return unless $V<1;
	print V "$P: Call stack:\n";
	$l = 0;
	# Walk outward through the caller frames until caller() runs out.
	while (($package, $filename, $line, $subroutine) = caller($l)) {
		printf V "\tLevel %3d line %5d $filename\tin $subroutine\n",$l,$line;
		++$l;
	}
}

sub sigCONT {
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# A CONT signal tells us to abandon the current URL  and  continue  with  the #
# next.   This  is useful when we are hung on a connection, though this isn't #
# much of a problem now that we run $getcmd as a subprocess.  Just  kill  the #
# $getcmd process, and we'll continue.                                        #
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
	print V "sigCONT: CONT signal.\n" if $V>0;
	print V "sigCONT: close DOC ...\n" if $V>6;
	if ($DOCopen) {			# Only close if a document is currently open
		&closeDoc('sigCONT');
		print V "sigCONT: closed DOC.\n" if $V>3;
	}
	&showcalls();			# Dump the call stack (subject to showcalls' $V gate)
}

sub sigQUIT {
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# A QUIT signal tells us to abandon the current URL and host, clean  up,  and #
# exit.                                                                       #
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
	print V "sigQUIT: QUIT signal.\n" if $V>0;
	&showcalls();			# Dump the call stack (subject to showcalls' $V gate)
	if ($DOCopen) {			# Only close if a document is currently open
		print V "sigQUIT: close DOC ...\n" if $V>6;
		&closeDoc('sigQUIT');
		print V "sigQUIT: closed DOC.\n" if $V>3;
	}
	# Abandon the current doc, the current host, and the whole run.
	$finishup = $endDoc = $endHost = 1;
}

sub sigINT {
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# An INT signal gets us here, where we do a stack dump and set  the  finishup #
# flag to trigger abandonment of all further URLS.                            #
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
	print V "sigINT: INT signal.\n" if $V>0;
	&showcalls();			# Dump the call stack (subject to showcalls' $V gate)
	&hostunlock if $lfile;	# Release the host lock if we're holding one ($lfile set)
	# Abandon the current doc, the current host, and the whole run.
	$finishup = $endDoc = $endHost = 1;
#	exit 1;
}

sub sigUSR1 {
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# A USR1 signal DECREASES the verbosity level $V by one, then produces a
# stack dump (subject to showcalls' own $V gate), and we continue.
# (USR2 is the matching increment.)
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
	--$V;
	print V "sigUSR1: USR1 signal V=$V.\n" if $V>0;
	&showcalls();
}

sub sigUSR2 {
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# A USR2 signal INCREASES the verbosity level $V by one, then produces a
# stack dump (subject to showcalls' own $V gate), and we continue.
# (USR1 is the matching decrement.)
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
	++$V;
	print V "sigUSR2: USR2 signal V=$V.\n" if $V>0;
	&showcalls();
}

sub cacheTune {my $F='cacheTune';
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# This is called to stuff one tune into the cache. If it's the first tune, we #
# have extra work: We must first create a directory derived from the URL.  We #
# massage the URL into a pathname, converting "http://" to "$cachedir/",  and #
# adding a final ':' to avoid conflicts with previous naming schemes. We also #
# HTTP-encode a few characters. If we end up creating the directory, the tune #
# is written to a file in the directory with a name based on the X: index and #
# the canonicalized T:  title.   Temporary  kludge:  I'm  experimenting  with #
# several  different ways to canonicalize tune titles.  For now, this routine #
# calls Cnames() rather than the older Cname() function, getting back a  list #
# of canonical names.  We create a single multiply-linked cache file based on #
# all these names. Eventually I'll decide which works best, and eliminate the #
# others.  We return 1 if we succeed, 0 if we fail for some reason.           #
# Globals: @tune holds the current tune's lines; $docurl is its source URL.   #
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
	local($path) = $hs->URLenc($docurl); 
	local($abcname,$abcfile,@cname,$cnames,$file,$i,$line,$lines,$name,$ndx,$size);
	$path =~ s":*//+"/"g;	# Reduce the :// and multiple slashes
	$path =~ s"[/\s]*$":";	# End with ":"
	# Strip stray '" class=...' debris (raw or %-encoded) glued to the URL.
	if ($path =~ /^(.*)((%22%20|" )class=.*)$/) {
		print V "$F: #### path \"$path\" contains \"$2\"\n" if $V>1;
		print V "$F:  docurl=\"$docurl\"\n" if $V>2;
		$path = $1;
		print V "$F: #### path=\"$path\" <==========\n" if $V>3;
	}
	unless (-d $path) {
		if (&mkdirs($path) < 0) {		# Create the directory path
			print V "$F: Failed to make dir \"$path\"\n" if $V>0;
		} else {
			print V "$F: Made dir \"$path\"\n" if $V>3;
		}
	}
	unless (-d $path) {		# mkdirs() may have failed; bail out if so
		print V "$F: Cache dir \"$path\" doesn't exist; can't cache tune.\n" if $V>0;
		return 0;
	}
	if (-f $cachetmp) {		# Was the source file cached?
		$size = -s $cachetmp;
		print V "$F: Link $cachetmp -> $path/%%src ($size bytes)\n" if $V>2;
		$file = "$path/%%src";
		print V "$F: file={" . &esc($file) . "}\n" if $V>2;
		if (&cmptunefile($file)) {	# Nonzero: file missing or differs
			print V "$F: Tune in \"$file\" has changed.\n" if $V>0;
			&bpln($cachetmp,$file,$F,__LINE__);
		} else {
			print V "$F: Tune in \"$file\" has not changed.\n" if $V>2;
		}
		if (unlink($cachetmp)) {	# Unlink the tmp cache file to avoid re-use
			print V "$F: Unlinked \"$cachetmp\"\n" if $V>2;
		} else {
			print V "$F: #### Can't unlink \"$cachetmp\" [$!]\n" if $V>0;
		}
	}
	$cnames = $ndx = 0;
line:
	for $line (@tune) {		# Scan the tune for X: and T: line
		if ($line =~ /^X:\s*(\d+[:._\w]*)/) {
			$ndx = $1;
			print V "$F: X:$ndx\n" if $V>2;
		#	$name = "%$ndx";
		#	print V "$F: name={" . &esc($name) . "}\n" if $V>2;
			next line
		}
		if ($line =~ /^([TP]):\s*(.*)$/) {	# Accept T: or P: as title
			print V "$F: X:'$X' $1:$2\n" if $V>2;
			@cname = &Cnames($2);
			print V "$F: cname=[" . join(',',@cname) . "]\n" if $V>2;
			$cnames = @cname;				# Number of canonical names
			for ($i=0; $i<$cnames; $i++) {	# For each canonical name
			#	$cname[$i] = $ndx . ':' . $cname[$i];	# Prefix the X: index
				print V "$F: cname[$i]={" . &esc($cname[$i]) . "}\n" if $V>2;
			}
			last line;		# Only the first one for now
		}
	}
	# Write the current file to this host's cache:
	$lines = 0;				# Number of lines written to cache file
	$abcname = $abcfile = '';
name:
	for ($i=0; $i<$cnames; $i++) {	# Run thru the canonical names
		$name = $cname[$i];
		if ($i > 0 && $name eq $cname[0]) {	# Duplicate of the first name
			print V "$F: $i name={" . &esc($name) . "} already done.\n" if $V>2;
			next name;
		}
		while (length($name) < 2) {	# Pad very short names to 2+ chars
			print V "$F: $i Short name '$name'\n" if $V>2;
			$name .= '_';
		}
		print V "$F: $i Cache to name '$name'\n" if $V>2;
		$file = "$path/%$ndx:$name";
		print V "$F: $i file={" . &esc($file) . "}\n" if $V>2;
		$abcname = "$path/%$ndx:$name.abc";			# First file labelled with .abc suffix
		print V "$F: $i Path name \"$abcname\"\n" if $V>2;
		if ($i == 1) {	# NOTE(review): the loop starts at $i==0, so this fires on the
						# SECOND canonical name, and a single-name tune is never
						# written.  The surrounding comments ("first time only",
						# "First file labelled...") suggest $i==0 was intended --
						# confirm against the experiment described in the header
						# before changing.
			$abcfile = $abcname;	# Remember this file's pathname
			print V "$F: $i Tune file \"$abcfile\"\n" if $V>2;
			if (cmptunefile($abcname) != 0) {		# Is there already a file with the same data?
				Backup($abcfile) if -e $abcfile;	# Back up previous version, if it exists
				unless (open(CFILE,">$abcfile")) {	# Create single-tune file
					print V "$F: #### Can't write \"$abcfile\" [$!]\n" if $V>0;
					return 0;
				}
				for $line (@tune) {
					print CFILE "$line\n";
					++$lines;
				}
				print V "$F: $i Wrote $lines lines to \"$abcfile\"\n" if $V>2;
				close CFILE;
			} else {
				print V "$F: File \"$abcname\" contains the tune already.\n" if $V>2;
			}
		} elsif ($renameold && -e $abcname) {			# For the other names, we now rename them to backup
			print V "$F: $i Backup '$abcname'\n" if $V>2;
			&Backup($abcname);
		} else {			# Name doesn't match a file
			print V "$F: $i No file '$abcname'\n" if $V>2;
		}
	}
	close CFILE;	# Paranoia
	return 1;
}

sub bpln {my $F='bpln';
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
#     &bpln(file1,file2,subname,__LINE__);                                    #
# Back up file2 if it already exists, then hard-link file1 to file2.  The     #
# optional third/fourth args name the caller (used in the success message)    #
# and its line number.  Returns 1 on success, 0 on failure, as link() does.   #
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
	local($src,$dst,$who,$where) = @_;
	local($caller,$line);
	$caller = $who   || $F;		# Log success under the caller's name when given
	$line   = $where || '???';	# Caller's line number (recorded, not printed)
	&Backup($dst) if -e $dst;	# Preserve any existing target file first
	unless (link($src,$dst)) {
		print V "$F: #### Can't link '$src' -> '$dst' [$!]\n" if $V>0;
		return 0;
	}
	print V "$caller: Linked '$src' -> '$dst'\n" if $V>0;
	return 1;
}

sub cmptunefile {my $F='cmptunefile'; local($path) = @_;
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Compare the current tune (global @tune) with the content of file $path.     #
# Returns:                                                                    #
#   -1  the file doesn't exist or can't be read (so can't match the tune);    #
#   -2  the file exists but has a different number of lines than @tune;       #
#   >0  same length but different; the value is the 1-based line number of    #
#       the first difference;                                                 #
#    0  the file exists and matches @tune exactly.                            #
# Callers currently only use the zero/nonzero distinction to decide whether   #
# to (over)write the file.                                                    #
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
	local($l,$line,$flines,$tlines);	# (dropped unused $cmd,$text,$stat)
	local(@data);		# File content, one trimmed line per element
	print V `/bin/ls -l "$path"` if $V>1;	# Debug: show the candidate file
	if (open(my $fh, '<', $path)) {		# 3-arg open with a lexical handle
		print V "Reading data ...\n" if $V>1;
		while (defined($line = <$fh>)) {
			$line =~ s/[\r\s]+$//;	# Trim CR/NL/trailing blanks, matching @tune's form
			push @data, $line;
		}
		close $fh;
	} else {
		print V "$F: Can't read file \"$path\" [$!]\n" if $V>2;
		return -1;		# No such file (or unreadable)
	}
	$tlines = scalar(@tune);	# Lines of data in the tune.
	print V "$F: There are $tlines lines in the tune.\n" if $V>2;
	$flines = scalar(@data);	# Lines of data in the file.
	print V "$F: There are $flines lines in the file.\n" if $V>2;
	if ($tlines == $flines) {
		print V "$F: A same-size tune is in \"$path\"\n" if $V>2;
		for ($l = 0; $l < $tlines; $l++) {
			if ($tune[$l] eq $data[$l]) {
				print V "$F: Lines $l are the same.\n" if $V>3;
			} else {
				print V "$F: Lines $l differ in \"$path\"\n" if $V>1;
				return $l+1;	# 1-based line number of the first difference
			}
		}
		print V "$F: The same tune is in \"$path\"\n" if $V>2;
		return 0;	# They're the same
	}
	print V "$F: A different-size tune is in \"$path\" [$flines, not $tlines lines]\n" if $V>1;
	return -2;	# Sizes differ
}

sub tune {my $F='tune';
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# The @tune array contains what looks like an ABC tune.  Extract the critical #
# data from it, and if it passes as a tune, write the data to the output. The #
# return value is 0 if we reject it as a tune; >0 if we accept it.            #
#                                                                             #
# Reads globals:  @tune $V $inABC $inHdr $ignoretune $ignorefile $cachetmp    #
#                 $SCDkludge $abcCode $L $now $ufull                          #
# Writes globals: %Hdrs @newchunk $doctunes $doctitls $tunesinfile            #
#                 $lasttunetime $ignoretune $ignorefile                       #
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
	local($H,$l,$K,$k,$M,$m,$lines,@N,$P,@P,$t,$T,@T,$Tsrc,$X);
	local($Achords,$Clefs,$Tcnt,$Vcnt,%Voices);
	local($comment,$directive);	# [fix] dropped redundant second local($lines); it's localized above
	local($GBcode,$JCcode,$UDcode);
	$lines = int(@tune);
	print V "$F: Tune has $lines lines.\n" if $V>2;
#	if ($cachetmp) {
#		print V "$F: Caching due to $lines-line tune ...\n" if $V>1;
#		&cacheTune();
#	}
	if ($V>2) {			# Dump the tune to the verbose output
		print V "%%%%%%\n";
		for $l (@tune) {print V "%% $l\n"}
		print V "%%%%%%\n";
	}
	if ($ignoretune) {		# Set by a %%noindex directive seen inside a tune
		print V "$F: #### Tune dropped because \$ignoretune=$ignoretune\n" if $V>1;	# [fix] was "$ignoretune=$ignoretune", which printed "1=1"
		$ignoretune = 0;	# Don't ignore the next one.
		return 0;
	}
	if ($ignorefile) {		# Set by a %%noindex directive seen outside any tune
		print V "$F: #### Tune dropped because \$ignorefile=$ignorefile\n" if $V>1;	# [fix] as above
		return 0;
	}
	# Header state.  undef (not zero) so X:0 is a legal index and so we can
	# tell "no K: seen yet" apart from "K: with empty value".
	$X = $K = $m = undef;	# 20151215 changed from zero to fix handling of X:0
	$Tcnt = $Vcnt = 0;
	$T = $P = $M = '';
	@N = ();
	@P = ();
	@T = ();
	$Achords = $Clefs = $Vcnt = 0;
	%Hdrs = ();				# NOTE: %Hdrs is a file global, not local()ized here
	%Voices = ();
line:
	for $l (@tune) {
		print V "l=\"$l\"\n" if $V>7;	# [fix] was "$s", a variable never assigned in this sub
		# [fix] removed "++$lines" here: $lines already holds int(@tune), so
		# incrementing per line made the summary below report twice the count.
		if ($l =~ /^\s*%(.*)/) {			# Comments
			$comment = $1;
			print V "$F: COM %$comment\n" if $V>2;
			if ($comment =~ /^%(.*)/) {		# ABC directive
				$directive = $1;
				print V "$F: DIR %%$directive\n" if $V>2;
				if ($directive =~ /^noindex\b/) {	# %%noindex says ignore ABC tune(s)
					print V "$F: DIR NOINDEX inABC=$inABC\n" if $V>2;
					if ($inABC) {		# If we're in a tune, ignore this tune
						print V "$F: DIR NOINDEX inside ABC tune.\n" if $V>2;
						print V "$F: Ignore rest of tune.\n" if $V>2;
						$ignoretune = 1;
						return 0;
					} else {			# If we're outside a tune, ignore the rest of the file
						print V "$F: DIR NOINDEX outside ABC tune.\n" if $V>2;
						print V "$F: Ignore rest of file.\n" if $V>2;
						$ignorefile = 1;
						return 0;
					}
				} else {
					print V "$F: Unknown %%$directive ignored.\n" if $V>1;
					next line;
				}
			}
			print V "$F: COM %$comment ignored.\n" if $V>3;
			next line;
		}
		if ($l =~ m"^X:\s*(\d+)([^\s]*)") {	# X: index header
			$X = "$1$2";				# Keep any alpha junk after the index number
			print V "$F: X:'$X'\n" if $V>2;
		} elsif ($l =~ /^N:\s*(.*)/) {	# N: notes header
			unless (defined($K)) {				# Use only within headers (before K:)
				push @N, $1;
				print V "$F: N:'$1'\n" if $V>2;
			}
		} elsif ($l =~ /^P:\s*(.*)/) {	# P: parts header
			$P = $1 unless $P;			# Note first P: text
			unless (defined($K)) {				# Use only within headers
				push @P, $1;
				print V "$F: P:'$1'\n" if $V>2;
			}
			if (defined($X) && !$T) {	# Tune with P: line and no T: line?
				$T = $P;				# Use the first P: line as the title
				print V "$F: P:\"$T\" used as title.\n" if $V>1;
			}
		} elsif ($l =~ /^T:\s*(.*)/) {	# T: title header
			++$Tcnt;					# Count the T: lines
			$T = $1 unless $T;			# Note first non-blank title
			$t = $1 || '__';			# Use '__' for missing title
			$t =~ s/<[^>]+>//g;			# Delete HTML tags (Is this still needed?)
			$t =~ s/\s+/ /g;			# Reduce white space
			if ($SCDkludge && $T && ($t =~ m'[\d_]+x[\d_]+[A-Z][\d_]*')) {
				print V "$F: Drop SCD title \"$t\"\n" if $V>1;
				next line;
			}
			if ($t =~ /^[-_\s]+$/) {		# Ignore titles of only spacing chars
				print V "$F: Drop _ title \"$t\"\n" if $V>1;	# [fix] was "title\ \"$t\"n" -- stray backslash and misplaced \n
				next line;
			}
			if ($t) {					# It's probably a "real" title
				push @T, $t;			# Accumulate titles
				print V "$F: T:'$t'\n" if $V>1;
			}
		} elsif ($l =~ /^M:\s*(.*)/) {	# M: meter
			next line unless defined($X) || $T;	# Ignore M lines outside tune
			unless ($M) {				# Use only the first meter
				$M = $1;
				$M =~ s/<[^>]+>//g;		# Delete HTML
				$M =~ s/%.*//;			# Strip off comment
				$M =~ s/\s+//;			# Strip out white space
				print V "$F: M:'$M'\n" if $V>2;
			}
		} elsif ($l =~ /^K:\s*([^\r\s]*)\s*(.*)/) {	# K: key ends headers
			next line unless defined($X) || $T;	# Ignore K lines outside tune
			next line if defined($K) && ($K ne '');	# Is the key already defined and non-null?
			$K = $1;			# It's the first key sig
			$k = $2;			# Any excess stuff
			if (($k =~ /\b(treble|alto|tenor|bass)\b/i) || ($k =~ /\b(clef=[GCF])\b/)) {
				$K .= " $1";	# Include clef with key
				$Clefs ++;		# Count all clefs
			}
			print V "$F: K:'$K'\n" if $V>2;
		} elsif ($l =~ /^V:\s*(\w+)/) {	# V: Voice line
			$Hdrs{V}++;					# Note that voices are used
			$Voices{$1}++;				# Note the different voices
			if ($l =~ /\bclef=(treble|alto|bass|G|C|F)\b/) {
				$Clefs ++;				# Count all clefs
			}
			$Vcnt = int(keys %Voices);	# Count the distinct voices
		} elsif ($l =~ /^([A-Za-z]):/) {	# Other headers
			$Hdrs{$1} ++;
		} elsif ($l =~ /^\d:/) {			# Why do we see this?
			print V "drop: $l.\n" if $V>2;
		} elsif (defined($K)) {				# Collecting music for abcCode()
			if ($l =~ /"[A-G][b#]*[m7]*"/) {	# Look for chords
				$Achords ++;
				$Hdrs{'"'} ++;			# Add as a kind of "Header"
			}
			unless ($l =~ /^\w:/) {		# Ignore things like w: words
				$m .= $l;				# Accumulate music as one string
			}
		}
	}
	print V "$F: Tune has $lines lines, with $Tcnt titles $Vcnt voices X:'$X' K:\"$K\"\n" if $V>1;
	if (defined($X) && $T && $Tcnt && defined($K)) {
		print V "$F: TUNE accepted unconditionally ($Tcnt titles X='$X' T=\"$T\" K=\"$K\")\n" if $V>1;
	} elsif (($Tcnt || $P) && $K) {
		print V "$F: TUNE accepted ($Tcnt titles P=\"$P\" K=\"$K\")\n" if $V>1;
	} elsif (($Tcnt || $P) && $K) {
		# FIXME(review): this condition is identical to the branch above, so
		# the "dubious" case can never be reached.  The intended test is
		# unclear (perhaps it involved $Vcnt) -- left as-is to preserve behavior.
		print V "$F: TUNE dubious ($Tcnt titles P=\"$P\" K=\"$K\" and $Vcnt voices)\n" if $V>0;
	} elsif (defined($X) && $K) {
		print V "$F: TUNE untitled ($Tcnt titles P=\"$P\" K=\"$K\" and $Vcnt voices)\n" if $V>0;
	} else {
		print V "$F: TUNE rejected ($Tcnt titles X=$X T=\"$T\" P=\"$P\" K=\"$K\")\n" if $V>0;
		return 0;
	}
	$Vcnt = int(keys %Voices);			# Number of distinct voices found
	print V "$F: $Vcnt voices found in X:'$X'\n" if $Vcnt>0 && $V>2;
	print V "$F: inABC=$inABC inHdr=$inHdr K=$K X=$X m=$m\n" if $V>5;
	# Final acceptance: either we're inside a tune with a key, or we saw a
	# complete X:/K:/music triple.
	unless (($inABC && defined($K)) || (defined($X) && defined($K) && defined($m))) {
		print V "$F: No X line found.\n" if (!defined($X) && $V>1);
		print V "$F: No K line found.\n" if (!defined($K) && $V>1);
		print V "$F: No music found.\n"  if (!defined($m) && $V>1);
		print V "$F: Tune X:$X T:$T K:$K failed tests.\n" if $V>1;
		return $doctunes;	# NOTE(review): returns the running count, not 0 -- presumably deliberate
	}
	++$tunesinfile;			# Count the tune as accepted
	$lasttunetime = time;	# Note time each tune is recognized
	if ($cachetmp) {		# Are we caching tunes?
		print V "$F: Caching $lines-line tune ...\n" if $V>2;
		&cacheTune();
	}
	# NOTE(review): $L here is a file global (not localized in this sub) --
	# presumably the default note length; confirm against the rest of the file.
	print V "$F: Call abcCode(\"$K\",\"$L\",\"$M\",\"$m\")\n" if $V>7;
	($GBcode,$JCcode,$UDcode) = $abcCode->abcCode($K,$L,$M,$m);
	print V "$F: GBcode=\"$GBcode\" JCcode=\"$JCcode\" UDcode=\"$UDcode\"\n" if $V>6;
	&DT();				# was dt()
	# Pick the title source: T: lines, else first P: line, else first N: line,
	# else a '__' placeholder.  $Tsrc records which one we used.
	if (@T) {			# Did we find any T: lines?
		for $t (@T) {
			print V "$F: Title from T:$t\n" if $V>2;
		}
		$Tsrc = 'T';
	} elsif (@P) {
		push @T, $P[0];		# If no title lines, use first P: line
		$Tsrc = 'P';
		print V "$F: Title from P:$P[0]\n" if $V>2;
	} elsif (@N) {
		push @T, $N[0];		# If still no title, use first N: line
		print V "$F: Title from N:$N[0]\n" if $V>2;
		$Tsrc = 'N';
	} else {
		print V "$F: No title found in tune X:$X [ufull=$ufull]\n" if $V>2;
		push @T, '__';		# Dummy title
		$Tsrc = '_';
	}
	$H = join('',sort(keys(%Hdrs)));	# Summarize which header letters appeared
	$doctitls += int(@T);
	# Emit one output line per title found, so multi-titled tunes are
	# findable under each of their titles.
	for $T (@T) {
		$l = "$now X:$X M:$M K:$K";
		if ($Clefs  > 0) {$l .= " C:$Clefs"}
		if ($Vcnt > 0) {$l .= " V:$Vcnt"}
		$l .= " H:$H" if $H;
		$l .= " C1=$JCcode C2=$UDcode T:$T";
		print V "$F: $Tsrc $l\n" if $V>1;
		push @newchunk, $l;
	}
	++$doctunes;
	print V "$F: $doctunes in this file.\n" if $V>2;
	$ignoretune = 0;	# If true, ignore all tunes in this file
	print V "$F: Return doctunes=$doctunes.\n" if $V>2;
	return $doctunes;
}
