#!/usr/bin/perl
#
# NAME
#   TuneBot - search the Web for ABC tunes
#
# SYNOPSIS
#   TuneBot  [options] ["<oldlist"] [URL|dir]...
#
# DESCRIPTION
#   This program is a web explorer.  It accepts a  list  of  starting
#   URLs,  either  on  the  command line or in an input document, and
#   fetches each URL in turn. Each is scanned for ABC tunes and URLs.
#   Any  URLs  discovered  are remembered for later use, and each ABC
#   tune produces a single line of output giving info about it.
#
#   We implement shell-like redirection ourselves, because the oldlist
#   and  newlist  files  may be URLs, but not many people have shells
#   that know how to redirect over the net.
#
#   The  URLs  may  be  directories  or  .abc  files.   If  they  are
#   directories, we read  the  directory  listing  and  (recursively)
#   extract a list of all the .abc files that they contain.
#
#   Because  of  the  difficulties  in preventing infinite loops with
#   URLs we implement two ways of  limiting  the  URLs  that  may  be
#   followed:   You  can  restrict the depth of recursion with the -D
#   option, and you can restrict the hostname(s) with the +H option.
#
# REQUIRES
#   This program no longer uses the LWP::Simple modules. I've found a
#   simpler approach.  But you'll have to download these modules, and
#   possibly change push to say where you put them:
#
	push(@INC,".","$ENV{HOME}/sh","$ENV{HOME}/pl");
	use abcCode;			# Calculates tune codes.
	require "HTTPcon.pm";	# Makes HTTP connection to server.
	require "URLdata.pm";	# Opens URL and returns file handle.
	require "URLhref.pm";	# Combines URL + HREF -> new URL.
	require "URLtrim.pm";	# Shrinks URLs.
	require "HTMLdir.pm";	# HTML directory listing.
#
#   They'll have to be in your @INC path; by default we add  $HOME/sh
#   and $HOME/pl to @INC, so those are good places to put them.
#
#   Oh, and one more thing: We use w3cat to fetch files from the web.
#   You should find it in the same directory.  This was done so  that
#   we  could  properly  time  out  zombie connections to some of the
#   broken web sites out there.  It turns out that you can only abort
#   a  connect()  with  sig('ALRM'),  and if you attempt to close the
#   socket, you die a horrible death. With that isolated in the w3cat
#   subprocess, we can continue to run past such disasters.
#
# ENVIRONMENT
#   We read the following from the environment:
#
#   V_TuneBot=<l><file>
#     If defined, this defines our "verbose" level and  output  file.
#     The  level  <l> is a number (which defaults to 0), the optional
#     <file> (which defaults  to  STDERR)  is  where  the  output  is
#     written.   Note that this variable's name consists of 'V_' plus
#     the program's name.  If you call this  program  by  some  other
#     name, you should of course use 'V_' plus that name.
#
# INPUT
#   We always read from stdin, so if you don't want to  provide  any
#   input, you'll need to redirect our input to /dev/null. The input
#   is scanned for URLs, and they are added to our starting list (at
#   depth 1).
#
#   As a special aid in limiting  searches,  the  input  may  contain
#   lines of these forms (with or without the colons):
#     done:   http://foo.bar.com/xyz
#     ignore  http://foo.bar.com/xyz
#     avoid:  http://foo.bar.com/xyz
#   These  are ways of telling TuneBot to ignore certain URLs.  The
#   "done" and "ignore" commands give specific URLs that are to be
#   avoided; this is implemented by simply listing them as "already
#   done".  With the "avoid" command, we extract the host name, and
#   URLs for that host will not be used.
#
# OUTPUT
#   The output is one line per ABC title discovered.  The  format  is
#   rather  simple  HTML, designed to be surrounded by <pre>...</pre>
#   to generate a simple table.  The fields are currently somewhat in
#   flux as I experiment ...
#
#   Note that the output is unsorted, and is in the  order  that  the
#   tunes  were discovered.  We've found that this is the best way to
#   do it, with subsequent sorting done by a separate process. One of
#   the reasons is that we've had a lot of problems with this program
#   either hanging or bombing while trying to connect.  There's a lot
#   of very flakey web software around. By producing our output as we
#   go, you at least get access to what was found. Also, this program
#   can take many hours to run, but our (partial) output is available
#   for use at any time.
#
# OPTIONS
#   Options start with '-' or '+' plus  a  letter,  with  possibly  a
#   parameter  (and no embedded spaces).  Some of the options take an
#   initial '+' to mean "enable" and  '-'  to  mean  "disable".   For
#   others,  the  '-' or '+' is not relevant.  If '+' is shown in the
#   list below, then it is significant. Capitalization of the option
#   letters  doesn't  matter (but it may matter in an argument string
#   if there is one).
#
#   -<n>
#     where <n> is an integer, means a timeout of <n>  seconds.   The
#     default is currently:
#
	$ABCtmout = 30;
#
#   -d<depth>
#     This restricts the depth of directory searches to <depth>. This
#     is mostly to avoid infinite loops. The default is 3. Experience
#     has shown that each depth level produces at least a  factor  of
#     10  increase  in  run time, so you should be careful with this.
#     It's much faster to have a shallow depth and  a  long  list  of
#     starting URLs.  One recommendation:  use the previous output as
#     input, so all the successes then will be re-scanned  (at  depth
#     2) in the current run.
#
#   +h<host>
#     Allow URLs for <host>. Default: All hosts allowed.  If there is
#     one or more +h options, then only these hosts are allowed.
#
#   -s
#   +s<n>
#     Skip over <n> URLs while searching.  This has the effect of not
#     making  a lot of requests in succession of a single server.  It
#     is implemented by moving n-1 URLs to the end of  the  URL  list
#     before each attempt to fetch a URL.
#
# EXAMPLES
#
# SIGNALS
#   There are various ways that this program may get hung up  because
#   of  misbehavior  (or  behavior  that  may  be  valid  but I don't
#   understand it) on the part of web servers.  You can  "kick"  this
#   program by sending it these signals:
#
#   CONT
#     Abandon the current URL by closing the connection.
#   INT or HUP
#     Abandon the search and write the output files.
#   USR1
#     Write a dump of the call stack to the verbose log.  Useful  for
#     diagnosing  hangups.   If  this program doesn't do anything for
#     more than about 2 minutes, you might send it a USR1 signal,  to
#     see  what  it  was  trying  to  do.   And since this causes the
#     "interrupted system call" error, it tends to  also  get  things
#     moving again.
#
# BUGS
#   This program is highly experimental,  in  alpha  state,  and  all
#   that.  Use it at your own risk.  (Not much risk, there, actually,
#   but I thought I'd give the usual friendly warnings.)  Just  don't
#   write the output back over the input, and check its output with a
#   browser or two, and there shouldn't be many problems.
#
#   Of course, there  are  constant  problems  with  slight  spelling
#   variations.   This  program  doesn't  even attempt to tackle this
#   issue.
#
# AUTHOR:
#   John Chambers <jc@trillian.mit.edu> http://trillian.mit.edu/~jc/music/

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Assorted initializations:
$| = 1;						# Autoflush STDOUT.
($me = $0) =~ s'.*/'';		# Our basename; used in messages and env-var names.
($myhost = `hostname`) =~ s/\s+$//;	# Local hostname, for URLs with no host part.

$abcCode = abcCode->new;	# Tune-code calculator.  (Was indirect-object "new abcCode" syntax.)

# Counters:
$abcfiles  = 0;	# Number of files that contain abc.
$abctunes  = 0;	# Number of X: lines discovered.
$abctlines = 0;	# Number of T: lines discovered.
$abctitles = 0;	# Number of distinct T: titles discovered.
$loadcount = 0;	# Number of files loaded.
$SCDkludge = 1; # Try to ignore SCD dance-form titles.

# Options:
$chkparentdirs = 0;	# Check for "parent dir" lines.
$listabchosts  = 1;	# Collect list of hosts with abc files.

# Alignment of various output fields (default = 'L')
$align{X}   = 'R';	# Right-align X fields.

# Initial widths of various output fields:
%max = (
	'K',    12,	# Pad K fields to this many bytes.
	'X',     4,	# Pad X fields to this many bytes.
	'orig', 40,	# Pad origin fields to this many bytes.
	'type',  8,	# Pad type fields to this many bytes.
);

# Recursion control:
$maxdepth = 3;	# Default depth limit for directories.
$abcdepth = 4;	# Depth limit for *.abc files.
$depth    = 1;	# The current depth in directories.

# Before this point should be only simple assignments of initial values.
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Set the verbosity from various environment variables. The value may
# be a verbose level (1 digit), plus an optional output file name.
# The verbose log is written to filehandle V, which the whole program
# prints to; the default destination is the real STDERR stream.

$Vopt = $ENV{"D_$me"} || $ENV{"T_$me"} || $ENV{"V_$me"} || '1';
if ($Vopt =~ /^(\d)(.+)/) {
	$V = $1;
	$Vfil = $2;
	if (!open(V,">>$Vfil")) {
		my $err = $!;		# Save the failure reason before the dup below can overwrite $!.
		open(V,">&STDERR");	# Dup the real STDERR.  (Was ">>STDERR", which appends to a file literally named "STDERR".)
		print V "$me: Can't write \"$Vfil\" ($err)\n" if $V>0;	# Now reaches an open handle; it was printed before the fallback open.
	}
} else {
	$V = $Vopt;
	open(V,">&STDERR");	# Dup the real STDERR (was ">>STDERR": a file named "STDERR").
}
select V; $| = 1; select STDOUT; $| = 1;	# Autoflush both the log and stdout.
print V "$me started with V=$V ", `date` if $V>1;

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# @URLs is the work queue of URLs to process.  You can prime this
# list with hard-coded URLs if you wish (useful for testing), or
# let it be filled from stdin and the command line below.
@URLs = (
#	'./',		# Useful for testing.
#	'http://trillian.mit.edu/~jc/music/'	# My own music archive.
);
$URL = '';		# Current URL; also localized (dynamically scoped) in scan().

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Read in a list of URLs that need special treatment:
# Read URLs and control directives from stdin, one per line (see INPUT
# above).  Recognized directives: scan/search, done/ignore, avoid.
print V "$me: Read STDIN ...\n" if $V>3;
while (defined($line = <STDIN>)) {	# Line-by-line; for(<STDIN>) slurped the entire input into memory.
	$line =~ s/\s*$/ /;				# Exactly one space at end of line.
	print V "$me: line \"$line\"\n" if $V>3;
	if ($line =~ s"^\s*(http://\S+)\s"$1"i) {
		print V "$me: URL: $line\n" if $V>3;
		&URL($line,1);				# Add it as a level-1 URL to be examined.
	} elsif ($line =~ s"^(scan|search):*\s*(\S+)\s"$2"i) {
		print V "$me: SCAN $line\n" if $V>3;
		&URL($line,1);				# Add it as a level-1 URL to be examined.
	} elsif ($line =~ s"^(done|ignore):*\s*(\S+)/*\s"$2"i) {
		print V "$me: DONE $line\n" if $V>3;
		$Depth{$line} = 1;			# Mark this one as "already done".
	} elsif ($line =~ s"^(avoid):*\s*(\S+)/*\s"$2"i) {
		print V "$me: Avoid $line\n" if $V>2;
		if (($host,$rest) = ($line =~ m"http://([-\w.:]+)(.*)"i)) {
			$BadHost{$host} = 1;	# Mark this host as "to be avoided".
			print V "$me: Avoid \"$host\"\n" if $V>2;
		} elsif ($line =~ /^[-\w.:]+$/) {	# A bare hostname.
			$BadHost{$line} = 1;	# Mark this one as "to be avoided".
			print V "$me: Avoid \"$line\"\n" if $V>2;
		} else {
			print V "$me: AVOID $line IGNORED (can't parse).\n" if $V>2;
		}
	}
	# Anything else is a comment and is ignored.
}

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Here are some patterns to suppress URLs (if enabled):
#%ignore = (
#	'/peacebook/' => 1,	# Interesting, but HUGE!
#	'^http://lscftp.kgn.ibm.com/' => 1,	# No response.
#);

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Map file-name suffixes to content types.  Extendable from the command
# line via ".suf=type" arguments:
%suf = (
	'abc'  => 'abc',
	'html' => 'html',
);

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Global patterns:

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Signal dispositions (see SIGNALS above): CONT abandons the current
# URL, INT/HUP finish up and write the output files, USR1 dumps the
# call stack to the verbose log.
@SIG{qw(CONT INT HUP USR1)} = qw(sigcont sigdone sigdone showcalls);

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Scan the command-line arguments, processing them as we  go.   Input #
# files  are  read and used to build tables.  Any URLs discovered are #
# accumulated in @URLs.  Options are processed as read, so they  will #
# only  affect  things to their right, except for URLs, which we save #
# for last.                                                           #
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
for $a (@ARGV) {
	print V "$me: Arg \"$a\"\n" if $V>3;
	if (($fl,$opt) = ($a =~ m'^([-+])(.*)'i)) {
		if ($opt =~ m'^art'i) {				# -art/+art: ignore/include articles in titles.
			$articles = $fl;
			print V "$me: " . ($articles eq '-' ? 'Ignoring' : 'Including') . " articles." if $V>1;
		} elsif ($opt =~ m'^D(\d*)$'i) {	# -d<depth>: directory recursion limit.
			$maxdepth  = ($fl eq '-') ? 0 : $1;
			print V "$me: maxdepth='$maxdepth'\n" if $V>1;
			$abcdepth = $maxdepth + 1;		# *.abc files may sit one level deeper.
		} elsif ($opt =~ m'^P$'i) {			# +p: promote parents of pages with abc.
			$Pflag  = ($fl eq '-') ? 0 : 1;	# Was "$1", but m'^P$' has no capture group, so $1 was stale.
			print V "$me: Pflag='$Pflag'\n" if $V>1;
		} elsif ($opt =~ m'^H(.*)$'i) {		# +h<host>: restrict the crawl to this host.
			$host{$h = $1} = ($fl eq '-') ? 0 : 1;
			print V "$me: host{$h}=$host{$h}\n" if $V>2;
		} elsif ($opt =~ m'^S(\d*)$'i) {	# -s/+s<n>: scatter requests across servers.
			$urlskip = ($fl eq '-') ? 1 : ($1 || 1);
			print V "$me: urlskip=$urlskip\n" if $V>2;	# Was a copy-pasted host{...} message.
		} elsif ($opt =~ s'^(\d+)$'') {		# -<n>: fetch timeout in seconds.
			$ABCtmout = $1;
			print V "$me: ABCtmout=$ABCtmout\n" if $V>2;
		}
	} elsif ($a =~ m'^\.(\w+)=(\w+)$') {	# ".suf=type": add a suffix-to-type mapping.
		$suf{$1} = $2;
	} else {								# Anything else is a starting URL.
		&URL($a,1);
	}
}
print V "$me:   articles=$articles.\n";
print V "$me:   maxdepth=$maxdepth.\n";
print V "$me:   abcdepth=$abcdepth.\n";
print V "$me: max{list} =$max{list}.\n";
print V "$me: max{K}    =$max{K}.\n";
print V "$me: max{X}    =$max{X}.\n";
print V "$me: max{table}=$max{table}.\n";

# We use our starting time CCYYMMDD as part of the name of our output
# files, to make it clear just when the search ran.

($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst) = gmtime(time);
$cymd = sprintf("%04d%02d%02d",1900+$year,1+$mon,$mday);	# gmtime gives years-since-1900 and 0-based months.

# O is the main (HTML) output file and S the summary file.  Both are
# bareword package filehandles because the rest of the program prints
# to them; each is set autoflushing so partial output survives a crash
# or a kill (see OUTPUT above).  If either open fails we log it and
# carry on; subsequent prints to the unopened handle fail silently.
$ofile = "$me.$cymd";
if (open(O,">$ofile")) {
	print V "$me: Writing to \"$ofile\"\n" if $V>1;
	select O; $| = 1; select STDOUT;	# Autoflush O, then restore the default output handle.
	link($ofile,"$ofile.html");	# Hard-link a .html alias; failure (e.g. the link already exists) is silently ignored.
} else {
	print V "$me: Can't write \"$ofile\" [$!]\n" if $V>0;
}

$sfile = "Summary.$cymd";
if (open(S,">$sfile")) {
	print V "$me: Writing to \"$sfile\"\n" if $V>1;
	select S; $| = 1; select STDOUT;	# Autoflush S as well.
} else {
	print V "$me: Can't write \"$sfile\" [$!]\n" if $V>0;
}

print O "<html><head><title>ABC tunes</title></head><body><pre>\n";

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Run through the list of URLs, reading each one.  If  the  data  looks #
# like a directory, we read it recursively. If the data is a .abc file, #
# we read it, extract the title(s), and add it  to  our  %U  table  for #
# later use.                                                            #
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
$urlskip ||= 1;			# Default: process every URL.  (Unconditionally assigning 1 here clobbered the -s/+s<n> option parsed above.)
while (@URLs) {
	last if $finishup;		# set by INT and HUP signals.
	$u = shift @URLs;		# Next URL, please.
	if ($urlcount++ % $urlskip) {	# Scattering: defer all but every $urlskip'th URL.
		print V "     (\"$u\") [$depth]\n" if $V>3;
		push @URLs, $u;		# Throw it back to the end of the queue.
	} else {
		$urlsdone++;		# Count the URLs that we process.
		$depth = $Udepth{$u} || 1;
		print V "Next: \"$u\" [$depth]\n" if $V>3;
		&scan($u);
	}
}
print V "\n$me search done ", `date` if $V>1;

if ($V>0) {			# Summary statistics, written to the Summary.<date> file.
	print S "\n";
	print S "$loadcount files loaded.\n";
	print S "$abcfiles files contain abc.\n";
	print S "$abctunes ABC tunes found.\n";
	print S "$abctlines T: lines found.\n";
	print S "$abctitles distinct ABC titles found.\n";
	if ($loadcount > 0) {
		$i = $loadtime / $loadcount;
		print S "Loads: $loadcount took $loadtime sec ($i sec/file)\n";
	} else {
		print S "No successful loads.\n";
	}
	if ($fails) {
		$i = $failtime / $fails;
		print S "Fails: $fails took $failtime sec ($i sec/file)\n";
	} else {
		print S "No failed loads.\n";
	}
	# URLs (and hostnames) may contain '%' (percent-encoding), so pass
	# them as %s arguments rather than interpolating them into the
	# printf format string.
	print S "\nABC files linked to from:\n";
	for $x (sort keys %ABClink) {
		printf S "%8d < %s\n", $ABClink{$x}, $x;
	}
	print S "\nABC files found at:\n";
	for $x (sort keys %Tabcs) {
		printf S "%8d + %s\n", $Tabcs{$x}, $x;
	}
	print S "\nSites visited:\n";
	for $h (sort keys %Sites) {	# Was %Site, which is never populated; Open() records each visited site in %Sites.
		printf S "\t%8d : %s\n", $SiteN{$h}, $h;
	}
}

print O "</pre></body></html>\n";	# Close the HTML document opened above (the </html> was missing).
print V "$me exit ", `date` if $V>1;
exit 0;

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Finish off one tune: count it, compute its tune codes, and write one
# output line per accepted title.  NOTE dynamic scoping: this is called
# from abc(), and reads abc()'s local()s $K, $L, $M, $T, @ti and @tn
# directly; our own local()s $GBcode/$JCcode are in turn read by
# OStune(), which we call below.  Renaming any of these would silently
# break that contract.
#   $name - canonical name of the last-accepted title ('' if none).
#   $n    - instance count of that title (0 if no titles were seen).
#   $t    - number of titles accepted for this tune.
#   @_    - remaining args: the tune's music lines.
sub TuneEnd {
	local($name) = shift;
	local($n)    = shift;
	local($t)    = shift;
	local($GBcode,$JCcode,$lines);	# $GBcode/$JCcode are read dynamically by OStune().
	local($id) = "$me/TuneEnd";
	$abctunes++ if $n >0;	# Count the tunes.
	$lines = @_;			# Count the lines in this tune.
	print V "$id: Tune has $lines lines; K=\"$K\" L=\"$L\" M=\"$M\" .\n" if $V>3;
	if (@_) {
		($GBcode,$JCcode) = $abcCode->abcCode($K,$L,$M,@_);
		print V "$id: GBcode=\"$GBcode\" JCcode=\"$JCcode\" T=\"$T\"\n" if $V>3;
	}
	if ($name) {
		for ($i=0; $i<=$t; $i++) {	# Titles live at 1..$t; the empty slot 0 is skipped by the $tn[$i] guard.
			print V "$id: Tune $i: ti=$ti[$i] tn=\"$tn[$i]\"\n" if $V>3;
			OStune($ti[$i],$tn[$i]) if $tn[$i];
		}
	}
}

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Given a tune name and  index,  this  routine  writes  its  info  in #
# "short"  html  form  to  file  O,  which must be open.  We use this #
# routine to produce the intermediate "trace" file  as  we  load  the #
# info. We also call this from abclist to write to the current output #
# html file at the end.                                               #
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# NOTE dynamic scoping: $GBcode and $JCcode printed below are local()s
# of our caller TuneEnd(); they are visible here only through Perl's
# dynamic scoping of local().  &pad is defined elsewhere in this file;
# presumably it pads its first arg to the width in $max{<field>} --
# verify against its definition.
sub OStune {
	local($i,$name) = @_;
	local($nuna,$r,$key,$seq,$titl,$tabc,);
	$nuna = "$i:$name";						# "<instance>:<canonical name>" key into the title tables.
	$titl = &abc2html($Title{$nuna});		# Title with abc escapes converted to HTML entities.
	$tabc = &findURL('abc',$i,$name,$titl);	# Best-known URL for the abc source.
	$seq  = &pad($TX{$nuna}, ' ', 'X');		# The tune's X: index, padded.
	$key  = &pad($TK{$nuna}, ' ', 'K');		# The tune's K: key signature, padded.
	print V "$me: No title for \"$nuna\"\n" if ($V>0 && !$titl);
	$r = 0;									# Count of links emitted (informational only).
	print O "<tt><!-- $name $i -->";
	if ($tabc) {print O "<A HREF=\"$tabc\">abc</A> "; $r++} else {print O '___ '}
	print O "$seq ";
	print O "$GBcode ";
	print O "$JCcode ";
	print O "$key ";
	print O '</tt>';
	print O "$titl\n";
}

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Parse a chunk of abc code. The arg is the X: or T:  line that triggered the #
# call.   We  process  lines  from  <DOC>  until the first blank line or EOF, #
# whichever comes first. We return the number of lines we ate. This tells the #
# caller how much of the input we actually used up. Note that, for files with #
# multiple abc tunes, this routine must be called repeatedly, once  for  each #
# new tune.  It's the caller's job to skip over junk between tunes.           #
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #

# Parse one abc tune (see the banner above).  NOTE dynamic scoping: the
# local()s $K, $L, $M, $T, @ti and @tn declared here are read directly
# by TuneEnd() (and through it by OStune()) via Perl's dynamic scoping
# of local().  Additional lines are pulled from the current document
# with &docline (defined elsewhere in this file).  Returns the number
# of input lines consumed, counting the arg line as the first.
sub abc {
	local($line,$prnt) = @_;
	local($n,$name,$nuna,@ti,@tn,$K,$L,$M,$O,$T,$TT,@tune);
	local($t) = 0;	# Count the titles.
	local($X) = 1;	# In case X: line is missing.
	local($l) = 1;	# Count the arg as first line.
	local($n) = 0;	# Tune instance count.  (Also declared in the list above; the re-local is harmless.)
	local($id) = "$me/abc";
line:
	for (;$line;$line = &docline) {	# The first time we process the arg.
		$line =~ s/\s+$//;	# Discard CRs and LFs.
		if (!$line) {		# This is abc's end-of-tune.
			print V "$id: Blank line $l is end of \"$nuna\" ($n tunes found).\n" if $V>3;
			TuneEnd($name,$n,$t,@tune);
			return $l;
		}
		++ $l;
		print V "$id: ABC \"$line\"\n" if $V>4;
		if ($line =~ /^X:\s*(\d+)/) {
			$X = $1;
		} elsif ($line =~ /^%/) {				# Ignore comments and directives.
			next line;
		} elsif (($line =~ /^T:\s*(.*)$/)		# Title line.
		|| (!$T && ($line =~ /^P:\s*(.*)$/))	# P: line if no titles seen.
		) {
			$T = $1;					# One title found.
			if ($SCDkludge && ($T =~ /^[Xx_\d]+[HJMRSW]\d*$/)) {	# Special Scottish dance-form kludge.
				print V "$id: Ignore \"$T\" (SCD kludge)\n" if $V>3;
				$T = $TT;				# Remember previous title, if any.
				next line;				# Otherwise ignore numeric titles.
			}
			next line if ($T =~ /^[-\d_]+$/);	# Ignore if just these.
			print V "      \"$URL\" T: $T\n" if $V>2;
			if (!($name = &canon($T))) {	# Canonicalize the tune's name.
				print V "$id: Reject title \"$T\"\n" if $V>1;
				next line
			}
			++$t;						# Count the titles.
			$TT = $T;					# Remember last-accepted title.
			$n = ++$Tcount{$name};		# Number of instances of this tune.
			print V "$id: Instance $n of \"$name\"\n" if $V>4;
			$abctitles++ if $n == 1;	# Count the distinct titles.
			$abctlines++;				# Count the T: lines.
			$nuna = "$n:$name";			# Num+name for this tune.
			$TX{$nuna} = $TX{$t} = $X;	# Remember the tune's X: index.
			if ($prnt) {
				++$ABClink{$prnt};	# Count the ABC tunes linked to by the parent.
				if ($Pflag && $Udepth{$prnt} > 1) {
					print V "depth \"$prnt\" was $Udepth{$prnt} now 1.\n" if $V>1;
					$Udepth{$prnt} = 1;	# Promote parent.
				}
			} else {
				print V "$id: \"$URL\" has no parent.\n" if $V>3;
			}
			print V "$id: TX{$nuna}=$TX{$nuna}\n" if $V>4;
			$ti[$t] = $n; $tn[$t] = $name;	# Note $n:$name pairs in this tune.
			print V "$id: ti[$t]=$ti[$t] tn[$t]=$tn[$t] TX{$nuna}=$TX{$nuna}\n" if $V>3;
			$U{"abc:$name"} .= "$URL ";	# Add to list of possible URLs.
			$U{"abc:$nuna"} .= "$URL ";	# Add to list of specific URLs.
			&ABCcount($URL) if $listabchosts;
			$abcfiles++ 				# Number of ABC files found.
				if !($Turls{$URL}++);	# List of URLs that we found abc in.
			$SiteN{$currsite}++;		# Count of ABC tunes per site.
			$Title{$nuna} = $T;			# Remember this instance's title.
		} elsif ($line =~ /^O:\s*(.*)/) {	# Origin line.
			$O = $1;
			for ($i=1; $i<=$t; $i++) {		# Search thru tunes in this file.
				if ($TX{$i} == $X) {		# If this is the current tune,
					$nuna = "$ti[$i]:$tn[$i]";
					$TX{$nuna} = $X if $X;	# Remember X and O fields.
					print V "$id: TX{$nuna}=$TX{$nuna}\n" if $V>5;
				}
			}
		} elsif (!$K && ($line =~ /^K:\s*([\^=_\s\w]+)/)) {	# First K: line only.
			$K = $1;					# One key signature found.
			print V "$id: K='$K' t=$t\n" if $V>3;
			for ($i=1; $i<=$t; $i++) {	# It's the key for all the titles.
				print V "$id: TX{$i}='$TX{$i}'\n" if $V>5;
				if ($TX{$i} == $X) {	# Only for the current tune.
					$nuna = "$ti[$i]:$tn[$i]";
					$TX{$nuna} = $X if $X;
					if (length($X) > $max{X}) {
						print V "$id: New max{X} length($X) > $max{X}\n" if $V>3;
						$max{X} = &Max($max{X},length($X));
					}
					$max{X} = &Max($max{X},length($X));	# NOTE(review): duplicates the assignment just above; harmless but redundant.
					print V "$id: TX{$nuna}=$TX{$nuna}\n" if $V>5;
					$TK{$nuna} = $K;
					print V "$id: TK{$nuna}=$TK{$nuna}\n" if $V>3;
				} else {
					print V "$id: TX{$i} is '$TX{$i}' not '$X'\n" if $V>5;
				}
			}
		} elsif ($line =~ /^L:\s*(\d+)\/(\d+)/) {	# Default note length; only the numerator is kept.
			$L = $1;
			print V "$id: L=\"$L\"\n" if $V>3;
		} elsif ($line =~ /^M:\s*(\d+)\/(\d+)/) {	# Meter; only the numerator is kept.
			$M = $1;
			unless ($L) {($L = $M) =~ s/^\d+/1/}	# NOTE(review): $M holds only the numerator here, so this always yields L='1'; looks like it expected the full "n/m" -- verify.
			print V "$id: M=\"$M\" L=\"$L\"\n" if $V>3;
		} elsif ($line =~ /^M:\s*[Cc]/) {			# M:C / M:C| = common time.
			$M = '4/4';
			$L = '1/4' unless $L;
			print V "$id: M=\"$M\" L=\"$L\"\n" if $V>3;
		} elsif ($line =~ /^([A-Za-z]):\s*(.*)/) {	# Any other header field.
			print V "$id: $1: line ignored.\n" if $V>3;
		} else {									# Not a header line: music.
			print V "$id: Music: \"$line\"\n" if $V>3;
			push @tune, $line;
		}
	}
	print V "$id: EOF at line $l is end of \"$nuna\".\n" if $V>3;
	TuneEnd($name,$n,$t,@tune);
	$l;
}

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Given a URL that contains an ABC tune, we strip  away  fields  from #
# its  URL  one  at  a  time,  and  increment  the ABC count for each #
# resulting (partial) URL.  The end result is a count of how many ABC #
# titles are found under each web directory.                          #
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Credit one ABC hit to every ancestor directory of the given URL.
# E.g. http://a.b/x/y/t.abc bumps the counts for http://a.b/x/y/,
# http://a.b/x/ and http://a.b/, building the per-directory totals
# reported in the summary.
sub ABCcount {
	my ($url) = @_;
	while ($url =~ m'^([a-z]+)://(.+)/([^/]*)$'i) {
		my $dir = "$1://$2/";
		++$Tabcs{$dir};
		printf(V "%6d $dir\n", $Tabcs{$dir}) if $V>3;
		$url = "$1://$2";	# Peel off the last path component and repeat.
	}
}

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Given a tune name and index, this routine writes its info in "long" #
# html  form  to  file O, which must be open.  We use this routine to #
# produce the intermediate "trace" file as we load the info.          #
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #

# Write one "long"-form HTML output line for tune instance $i of tune
# $name to filehandle O (which must be open).  &pad is defined
# elsewhere in this file; presumably it pads its first arg to the
# width recorded in $max{<field>} -- verify against its definition.
sub OLtune {
	local($i,$name) = @_;
	local($nuna,$r,$seq,$ndx,$tabc,$titl,$tkey,$turl);
	local($id) = "$me/OLtune";
	$nuna = "$i:$name";							# "<instance>:<canonical name>" key.
	$ndx  = $TX{$nuna};							# The tune's X: index.
	$tkey = &pad($TK{$nuna}, '.', 'key');		print V "$id: tkey=\"$tkey\"\n" if $V>3;
	$titl = &abc2html($Title{$nuna});			# Title with abc escapes converted to HTML.
	$turl = &findURL('abc',$i,$name,$titl);		# Best-known URL for the abc source.
#	$turl =~ s"/www.irishfest.com/oneills/dev/"/www.irishfest.com/oneills/___/";
	print V "$id: turl=\"$turl\"\n" if $V>4;
	$tabc = &TuneLink('abc',$ndx,$name,$nuna,$turl);	# CGI anchor (or a run of dashes).
	$seq  = &pad($ndx, '_', 'X');
	print V "$id: No title for \"$nuna\"\n" if ($V>0 && !$titl);
	$r = 0;										# Count of links emitted (informational only).
	print O "<tt><!-- $name $i -->";
	if ($turl) {print O "<A HREF=\"$turl\">File</A> "; $r++} else {print O '____ '}
	if ($tkey) {print O "$tkey "} else {print O '______ '}
	if ($tabc) {print O $tabc} else {print O '___ '}
	print O "$titl\n";
}

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Build the HTML anchor that links tune $nuna (X: index $ndx) to the
# tune-fetching CGI for the given $url.  With no URL, return a run of
# dashes the same width as the type code, so columns still line up.
sub TuneLink {
	my ($typ,$ndx,$name,$nuna,$url) = @_;
	my $TYP = uc($typ);
	my $id  = "$me/TuneLink";
	my $lnk;
	print V "$id: typ=$typ ndx=$ndx name=$name nuna=$nuna url=\"$url\"\n" if $V>4;
	unless ($url) {
		$lnk = ('-' x length($typ)) . ' ';	# Placeholder of matching width.
		print V "$id: NUL \"$lnk\"\n" if $V>4;
		return $lnk;
	}
	$lnk = "<a href=\"/cgi/abc/Tune?F=$TYP&X=$ndx&U=$url\">$TYP</a> ";
	print V "$id: URL \"$lnk\"\n" if $V>4;
	return $lnk;
}

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Convert the abc escape sequences to HTML.
# Translate the abc backslash escape sequences for accented letters
# into their HTML character entities (e.g. \'e -> &eacute;, \"o ->
# &ouml;, \aa -> &aring;).  Text with no escapes passes through
# unchanged.
sub abc2html {
	my ($str) = @_;
	$str =~ s#\\(o)#\&${1}slash;#ig;	# \o  -> slashed o
	$str =~ s#\\a(a)#\&${1}ring;#ig;	# \aa -> a-ring
	$str =~ s#\\"(\w)#\&${1}uml;#ig;	# \"x -> umlaut
	$str =~ s#\\'(\w)#\&${1}acute;#ig;	# \'x -> acute accent
	$str =~ s#\\`(\w)#\&${1}grave;#ig;	# \`x -> grave accent
	$str =~ s#\\,(\w)#\&${1}cedille;#ig;	# \,x -> cedilla
	$str =~ s#\\~(\w)#\&${1}tilde;#ig;	# \~x -> tilde
	return $str;
}

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Canonicalize a name.  We upper-case everything, and strip  out  all #
# funny  chars.   If  $articles  is  enabled,  we  look  for articles #
# initially and after a comma, and delete them.                       #
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Canonicalize a tune name: upper-case it, strip all non-word chars,
# de-htmlize entities, and (when -art was given) delete leading and
# post-comma articles (English "the/a/an", French "le/la/les/l'").
# Returns '' when the name contains no letters at all.
sub canon {
	local($name) = @_;
	local($lcs,$ucs);
	($ucs = $name) =~ s/[^A-Z]+//g;	# Extract upper-case letters.
	($lcs = $name) =~ s/[^a-z]+//g;	# Extract lower-case letters.
	print V "canon: name=\"$name\" lcs=\"$lcs\" ucs=\"$ucs\"\n" if  $V>5;
	if ($ucs && $lcs) {			# Both cases used.
		$name =~ s/^[^A-Z]+//;	# Strip stuff before first upper-case letter.
		print V "canon: Mixed-case \"$name\"\n" if $V>5;
	} elsif (!$ucs && $lcs) {	# All lower case
		print V "canon: Lower-case \"$name\" curious\n" if $V>5;
	} elsif ($ucs && !$lcs) {	# All upper case
		print V "canon: Upper-case \"$name\" suspect.\n" if $V>5;
	} else {					# No letters at all.
		print V "canon: Name \"$name\" with no letters rejected.\n" if $V>5;
		return '';
	}
	if ($articles eq '-') {
		$name =~ s/^the\s+//i;
		$name =~ s/^an?\s+//i;
		$name =~ s/^l[ae]?s?\s+//i;
		# The post-comma patterns said ",s*" (a literal run of 's');
		# per the header comment they mean "after a comma", i.e. ",\s*".
		$name =~ s/,\s*the\s+//i;
		$name =~ s/,\s*an?\s+//i;
		$name =~ s/,\s*l[ae]?s?\s+//i;
	}
	$name = uc($name);	# Upper-case everything.
	$name =~ s"&(\w)\w*;"$1"g;	# De-htmlize the name (e.g. &eacute; -> E).
#	$name =~ s/,.*//;	# Discard everything after a comma.
	$name =~ s"\W+""g;	# Delete non-alpha chars.
#	$Tname{$name} = 1;	# Note that we've seen the name.
	return $name;
}

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# This looks in several arrays for the "right" URL to produce  for  a #
# specific tune, given the name, number, and type codes.  If we can't #
# find a usable URL, we return a null string.                         #
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Look in the %U table for the "right" URL for a tune of type $typ,
# trying the instance-specific key "<typ>:<num>:<nam>" first, then the
# generic "<typ>:<nam>" (each value is a space-separated URL list).
# As a last resort, a title of the form "Words: Rest" is retried with
# the canonicalized remainder.  Returns '' if nothing usable is found.
sub findURL {
	local($typ,$num,$nam,$ttl) = @_;
	local($u,$lis,@lis);	# Localized (they leaked as globals before), so the recursion below can't clobber a caller's loop state.
	for $lis ($U{"$typ:$num:$nam"}, $U{"$typ:$nam"}) {
		@lis = split ' ', $lis;
		for $u (@lis) {
			return $u if $u;
		}
	}
	if ($ttl =~ /^(\w*):\s*(.*)/) {		# "Prefix: Rest" title -- retry on the rest.
		return $u if ($u = &findURL($typ,$num,&canon($2),$2));
	}
	return '';
}

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Given a URL or local file name, this routine attempts  to  open  it #
# and  return  with  F containing the file handle.  If we succeed, we #
# return 1; a return of 0 means that we can't read the object.        #
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Given a URL or local file name, try to open it and return with *F
# holding the open handle.  Returns 1 on success, undef (or an empty
# return) on failure.  Side effects: records the site in %Sites/%SiteN,
# enforces the +h host restrictions, and updates the load/fail
# counters and timing totals used by the summary.
sub Open {
	local(*F,$name) = @_;
	local($h,$stat,$p,$path,$t,$t0);
	my $id = "$me/Open";
	if ($V>3) {
		local($p,$c,$l) = caller;
		print V "$id: \"$name\" (from $p/$c/$l)\n";
	} elsif ($V>1) {
		print V "$id: \"$name\" [$depth]\n";	# The old "if $V>3" guard here could never be true inside this elsif branch.
	}
	($path = $name) =~ s/^[<\s]+//;		# Strip any "<" redirection prefix and leading blanks.
	if (($p,$h) = ($path =~ m'^([a-z]+)://([-a-z0-9:._]+?)/'i)) {	# Hostname?
		print V "$id: URL contains host \"$h\"\n" if $V>6;
		$currsite = "$p://$h";
		$Sites{$currsite} = 1;			# Record the site for the final summary.
		$SiteN{$currsite} = 0;
	} else {
		$h = $myhost;
		print V "$id: URL contains no host, using \"$h\"\n" if $V>6;
	}
	if (%host) {			# Are there host restrictions (+h options)?
		print V "$id: Testing  host \"$h\" ...\n" if $V>6;
		if ($host{$h}) {
			print V "$id: Accepted host \"$h\"\n" if $V>6;
		} else {
			print V "$id: Rejected host \"$h\"\n" if $V>3;
			return;
		}
	} else {
		print V "$id: Accepted host \"$h\" (no host restrictions)\n" if $V>6;
	}
	if (-d $path) {				# A local directory is special.
		require "HTMLdir.pm";	# Dir-to-HTML module.
		if (HTMLdir(*F,$path)) {
			$loadcount ++;
			return 1;
		}
		print V "$id: Can't read local dir \"$path\" ($!)\n" if $V>0;
		close F;
		return undef;
	}
	if (open(*F,$name))	{		# Is it a local file?  (2-arg open kept on purpose: "<file" forms still work.)
		print V "$id: Opened local file \"$name\"\n" if $V>3;
		$loadcount ++;
		return 1;
	}				# Is it a URL?
	if ($name !~ m"^(http|file|ftp):"i) {
		local($p,$c,$l) = caller;
		print V "$id: \"$name\" ignored (from $p/$c/$l)\n";
		return undef;
	}
	$t0 = time;
	if ($directopen) {
		$stat = &URLdata(*F,$path);	# Try a web connection to the URL.
		$tt = time - $t0;
		print V "$id: \"$path\" returned in $tt sec. ($!)\n" if $V>4;
	} else {
		# NOTE(review): $path is interpolated into a shell pipeline; a
		# URL containing shell metacharacters could inject commands here.
		$cmd = "w3cat -T$ABCtmout +TH $path |";
		print V "$id: \"$cmd\"\n" if $V>3;
		if ($stat = open(F,$cmd)) {
			print V "$id: \"$cmd\" running.\n" if $V>3;
			$URLhdr = 1;
		} else {
			print V "$id: \"$cmd\" failed ($!).\n" if $V>1;
		}
		$tt = time - $t0;	# This branch never set $tt before, so the stats below used a stale value from an earlier call.
	}
	if (!$stat) {
		print V "$id: \"$path\" failed in $tt sec. ($!)\n" if $V>0;
		$fails ++;
		$failtime += $tt;
		close F;
		return undef;
	}
	$loadcount ++;
	$loadtime += $tt;
	print V "$id: \"$path\" $tt sec.\n" if (($V>1) && ($tt>0));
	return 1;
}

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Register a file under its name and type.  For example, if  we  note #
# the file 'http://foo.bar/qux.gif', we call                          #
#   &File('gif','http://foo.bar/qux','http://foo.bar/qux.gif')        #
# This will leave behind global information:                          #
#   $U{'gif:qux'] = 'http://foo.bar/qux.gif'                          #
# This tells us how to find a gif file for the name 'qux'.            #
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Register a file under its name and type.  E.g. noting the file
# 'http://foo.bar/qux.gif' as &File('gif','http://foo.bar/qux',$url)
# appends that URL to $U{'gif:QUX'} (the name is canonicalized), which
# is how findURL() later locates a gif for the tune named 'QUX'.
sub File {
	my ($ext,$pth,$url) = @_;
	print V "File: \"$url\"\n" if $V>1;
	(my $base = $pth) =~ s'.*/'';		# Basename of the path.
	$U{"$ext:" . &canon($base)} .= "$url ";
}

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Given an href, we decide here how to handle it.   The  caller  must #
# pass  us  the  URL from the href, and the string (item) between the #
# '>' and the </a>, in case we need to check what's there.  The  main #
# use  we  make  of  the item is to check for and reject "parent dir" #
# references. We also look at a few other suffixes and decide whether #
# we  should  load  them  and  scan  their  contents.   If the URL is #
# accepted, we pass it to &URL() for later processing.  For  rejected #
# URLs, we just return.                                               #
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #

sub href {
	# Decide how to handle one href discovered in the current document.
	# Accepted URLs are queued via &URL() for later scanning; rejected
	# ones just return.
	local(
		$prnt,		# Parent URL to current, if any.
		$curr,		# Current URL.
		$href,		# URL pointed to by $curr.
		$item)		# HTML text associated with $href.
			= @_;
	local($a,$l,$p,$s,$u);	# ($a,$p,$s are only used by the disabled suffix code below.)
	# The /^\$/ test forces a diagnostic for hrefs starting with a literal
	# '$' -- presumably unexpanded template variables; TODO confirm.
	print V "href: path=\"$curr\" href=\"$href\" item=\"$item\"\n" if $V>3 || ($href =~ /^\$/);
	return if $href =~ /^(mailto|file|ftp):/;	# Protocols we never follow.
	if ($href =~ /^cgi\b/i) {	# CGI links: dynamic content, skip.
		print V "href: Ignore href=\"$href\" (/cgi)\n" if $V>4;
		return;
	}
	if ($href =~ '/$') {	# If final '/', treat as directory.
		print V "href: Treat href=\"$href\" as directory.\n" if $V>4;
		if ($chkparentdirs && ($item =~ /Parent Dir/i)) {	# "Parent Directory" links would loop upward.
			print V "href: Ignore href=\"$href\" item=\"$item\"\n" if $V>4;
			return;
		} elsif ($href =~ /^\w*:/) {	# Full URL
			print V "href: \"$href\" read at depth $depth.\n" if $V>3;
			&URL($href,$depth+1,$prnt);
		} else {						# Relative URL.
			$u = &URLhref($curr,$href);	# Resolve relative to current URL.
			print V "href: \"$u\" read at depth $depth.\n" if $V>3;
			&URL($u,$depth+1,$prnt);
		}
		return;
	}
	# No final '/' on HREF:
	print V "href: Treat href=\"$href\" as non-directory.\n" if $V>4;
	if ($href =~ m'#') {	# Fragment reference within a document: skip.
		print V "href: \"$curr\" href=\"$href\" ignored (#).\n" if $V>3;
#	} elsif ($href =~ m'(.*)\.abc$'i) {
#		$u = &URLhref($curr,$href);
#		print V "href: \"$u\" abc at depth $depth.\n" if $V>3;
#		$a = &URL($u,$depth+1,$prnt);
#	} elsif (($p,$s) = ($href =~ m'(.*)\.(\w+)$'i)) {
#		print V "href: \"$href\" suffix \"$s\"\n" if $V>3;
#		if ($suf{$s} eq 'abc') {		# Is this a possible abc file?
#			$u = &URLhref($curr,$href);
#			print V "href: \"$u\" at depth $depth.\n" if $V>3;
#			&URL($u,$depth+1,$prnt);
#		} else {
#			print V "href: \"$href\" suffix \"$s\" unknown.\n" if $V>3;
#		}
	} else {	# Anything else: resolve it and queue it for scanning.
		$u = &URLhref($curr,$href);
		print V "href: \"$u\" URL at depth $depth.\n" if $V>3;
		&URL($u,$depth+1,$prnt);
	}
}

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #

# Return the numerically largest of the arguments.
# BUG FIX: the original shifted @_ inside "for (@_)", shrinking the list
# while foreach iterated it, so elements were skipped (e.g. Max(1,2,5)
# returned 2).  The stray shift has been removed.
sub Max {local($n) = shift; for (@_) {$n = $_ if $_ > $n} return $n}

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
sub outfooter {
	# Emit the saved @footer lines to the output file O, preceded by a
	# <hr> if the header pass didn't already emit one.  Non-HTML lines
	# are entity-encoded and bare e-mail addresses turned into mailto
	# links.
	local($nfoot);
	$nfoot = scalar(@footer);
	print V "outfooter: Produce $nfoot footer lines.\n" if $V>5;
	print O "<hr>\n" unless $gothr;
	# NB: the loop variable aliases each element, so the encoding below
	# rewrites @footer in place (as the original code did).
	for $fl (@footer) {
		++$inHTML if !$inHTML && $fl =~ /<\w*>/;
		unless ($inHTML) {
			$fl = HTML::Entities::encode_entities($fl);
			$fl =~ s#\&lt;(\w+@[-\w.]+)\&gt;#<a href="mailto:$1">$&</a>#;
		}
		print V "outfooter: \"$fl\"\n" if $V>5;
		print O "$fl\n";
	}
}

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
sub outheader {
	# Emit the document <TITLE> and the saved @header lines to output
	# file O.  Sets $gothr if a <hr> was seen, else appends one, then
	# emits the optional $outhdr string.
	local($nhead,$sfx);
	$nhead = scalar(@header);
	print V "outheader: Produce $nhead header lines.\n" if $V>5;
	if ($doctitle) {
		print O "<TITLE>$doctitle</TITLE>\n";
	} else {
		$sfx = "for $outlttr" if $outlttr;	# Empty when no output letter.
		print O "<TITLE> Tune list $sfx </TITLE>\n";
	}
	# NB: the loop variable aliases each element, so entity-encoding
	# rewrites @header in place (matching the original behaviour).
	for $hl (@header) {
		++$inHTML if !$inHTML && $hl =~ /<\w*>/;
		$hl = HTML::Entities::encode_entities($hl) unless $inHTML;
		print V "outheader: \"$hl\"\n" if $V>5;
		print O "$hl\n";
		++$gothr if $hl =~ /<hr>/i;
	}
	print O "<hr>\n" unless $gothr;
	print O "$outhdr\n" if $outhdr;
}

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# This pads a field to $max{$fld} chars. The pad arg is the pad char;
# if null, we don't do any padding.
sub pad {
	# Pad $str to the width of field $fld ($lim{$fld} or $max{$fld}),
	# using $pad as the pad character; right-align if $align{$fld} is 'R',
	# else left-align and clip to the field width.
	# BUG FIX: the whitespace normalization below was applied to $fld
	# (the field NAME) instead of $str (the value being padded), which
	# left the value untouched and could corrupt the $align{$fld} lookup.
	# It now normalizes $str, before the padding count is computed.
	local($str,$pad,$fld) = @_;
	local($l,$m,$n,$p,$v);
	$str =~ s/^\s+//;			# Trim initial white stuff.
	$str =~ s/\s+$//;			# Trim trailing white stuff.
	$str =~ s/\s+/ /g;			# Convert internal white stuff to single space.
	$str =~ s/\s/_/g if $pad eq '_';	# Convert internal spaces to underscores.
	$m = $max{$fld} || 1;		# Max actual field length.
	$l = ($lim{$fld} || $m);	# Limit to field length.
	$n = ($m < $l) ? $m : $l;	# Min of actual and limit (diagnostics only).
	$p = $l - length($str);		# Padding needed (after normalization).
	print V "pad: str='$str' pad='$pad' fld='$fld' l=$l m=$m n=$n p=$p.\n" if $V>5;
	if ($align{$fld} eq 'R') {	# Right-aligned field?
		$v = ($pad x $p) . $str;
	} else {					# Left-aligned: pad, then clip to $l.
		$v = substr($str .  ($pad x $p), 0, $l);
	}
	print V "pad: m=$m l=$l n=$n p=$p for fld=$fld\t\"$str\" => \"$v\"\n" if $V>5;
	return $v;
}

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
sub closeDOC {
	# Close the current document handle and clear the open flag.
	$DOCopen = 0;		# Mark it closed (independent of the close itself).
	close(DOC);
}

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Here's an alarm handler for reads from DOC. When a timeout happens,
# we close the DOC file and return, which should cause abandonment of
# the current document.
sub HTTPtmout {
	# SIGALRM handler for reads from DOC: flag the scan loop to abandon
	# the current document and close the handle, logging how long the
	# connection and the close took.
	local($elapsed) = time - $HTTPcontime;
	print V "ALARM (HTTPtmout) after $elapsed sec.\n" if $V>1;
	$closeDOC = 1;		# Tell the scan loop to give up on this document.
	print V "HTTPtmout: Close \"$URL\" (tmout) ...\n" if $V>1;
	$HTTPclosetime = time;
	&closeDOC if $DOCopen;	# Close now if the handle is still open.
	$HTTPclosedtime = time - $HTTPclosetime;
	print V "HTTPtmout: Closed \"$URL\" in $HTTPclosedtime sec.\n" if $V>1;
}

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Grovel through a file, looking for  hyperlinks  or  pieces  of  abc #
# code.   check  out  each  of  the  files listed.  Directories cause #
# recursive traversal.  Files with interesting suffixes are read.     #
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
sub scan {
	# Fetch one URL and grovel through it line by line, harvesting
	# hyperlinks (handed to &href) and ABC tune headers (handed to &abc).
	# Returns 1 on success or benign skip, 0 on failure.
	local($URL) = @_;
	local(*DOC,$closeDOC,$DOCopen,$DOCdtype,$DOCstype);	# Localize the DOC file.
	local($flag,$nhrefs,$isdir,$inHTML,$prnt,$tunes);
	local($b,$d1,$d2,$dpth,$h,$href,$item,$l,$line,$lurl,$n,$p,$s,$surl,$t,$ttl,$u);
	local($id) = "scan";
	$DOCopen = 0;
	$tunes = 0;
	$lurl = &URLtrim($URL);	# Long URL may have final '/'.
	($surl = $lurl) =~ s"/+$"";	# Short URL lacks final '/'.
	$prnt = $Parent{$lurl} || $Parent{$surl};	# Look up parent under either form.
#	print V "$id: \"$lurl\" <- \"$prnt\" (lurl)\n" if $V>5 && $Parent{$lurl};
#	print V "$id: \"$surl\" <- \"$prnt\" (surl)\n" if $V>5 && $Parent{$surl};
	print V "\n====> \"$lurl\" [$depth]	<- \"$prnt\"\n" if $V>1;
	if ($surl eq '') {		# Shouldn't happen.
		local($p,$c,$l) = caller;
		print V "scan: \"$URL\" (from $p/$c/$l)\n";
		return 0;
	}
	# Skip a URL already scanned at this depth or shallower.
	if (($dpth = $Depth{$surl}) && ($dpth <= $depth)) {
		print V "scan: \"$URL\" already scanned at depth $dpth.\n" if $V>2;
		return 1;
	}
	$Depth{$surl} = $depth;	# Note that we've done this URL.
	$inHTML = 0;			# Not (yet) known to be HTML format.
	print V "scan: \"$lurl\"\n" if $V>3;
	if (!&Open(*DOC,$lurl)) {
		$DOCopen = 0;
		print V "scan: \"$lurl\" not accessible.\n" if $V>3;
		return 0;
	}
	$DOCopen = 1;
	print V "scan: \"$lurl\" ...\n" if $V>3;
buffer:
	# Main read loop: runs until EOF, a timeout alarm, or a forced close.
	while (!$closeDOC && !$HTTPalrm) {
		if (!($b = &docline)) {	# Read one line from document.
			print V "scan: EOF\n" if $V>3;
			last buffer;
		}
#		print V "scan: \"$line$b\"\n" if $V>2 && $line;
		$line .= $b;		# Add to leftover from last line.
		$line =~ s/\s+$//;	# Discard trailing white stuff.
		if (!$line) {		# Null line -
			$URLhdr = 0;	# Terminates headers.
			next buffer;	# Otherwise ignore it.
		}
		print V "scan: \"$line\"\n" if $V>3;
		if ($URLhdr) {		# Still inside the HTTP response headers.
			print V "HEAD: \"$line\"\n" if $V>3;
			# Only text/* and application/* documents are worth scanning.
			if (($DOCdtype,$DOCstype) = ($line =~ m"Content-Type:\s*(.*)/(.*)\s*$")) {
				$dt = lc($DOCdtype);
				if ($dt ne 'text' && $dt ne 'application') {
					print V "scan: Non-text type \"$DOCdtype/$DOCstype\" ignored.\n" if $V>2;
					&closeDOC;
					return 1;
				}
				if ($DOCstype =~ m"html"i) {
					$inHTML = 1;
				} else {
					$inHTML = 0;
				}
			}
			$line = '';
			next buffer;
		}
		if (!$ttl && !$inHTML) {	# Code to save the first <title>.
			print V "scan: Check for <TITLE>\n" if $V>5;
			if (($ttl) = ($line =~ m"<TITLE>(.*)</TITLE>"i)) {
				print V "scan: Found <TITLE> ...\n" if $V>4;
				++$inHTML;
				# "Index of ..." titles mark server directory listings.
				if ($ttl =~ m"Index of "i) {
					print V "scan: Found Index of ...\n" if $V>4;
					$ttl =~ s"/*$"/";
					$isdir = 1;
					$lurl =~ s"/*$"/";	# Make sure long url has final '/'
				}	# We have removed the title from the line.
			}
		}
		$nhrefs = 0;
		# Extract every complete <a href=...>...</a> anchor on the line.
		$Apat = '<a\s+.*?href="*([^"]+?)"*>(.*?)</a>';
		while (($href,$item) = ($line =~ m/$Apat/i)) {
			print V "scan: Matched href=\"$href\" item=\"$item\"\n" if $V>3;
			if (!$href) {		# Is this a perl bug?
				print V "scan: Matched \"$line\" with null href!!!\n" if $V>3;
				$line = '';		# Abandon the line.
				next buffer;
			}
			$line =~ s/$Apat//i;
			$inHTML = 1;		# Note it's HTML.
			$nhrefs ++;
			&href($lurl,$lurl,$href,$item);	# Handle this href later.  NOTE(review): $lurl is passed as both parent and current -- confirm intended.
		}
		if ($line =~ m/<a\s/i) {	# Unclosed anchor?  Wait for more input.
			print V "scan: Unclosed <A in \"$line\"\n" if $V>3;
			next buffer;
		}
		if ($nhrefs) {
			print V "scan: Drop \"$line\" after $nhrefs hrefs removed.\n" if $V>3;
			$line = '';
			next buffer;
		}
		print V "scan: No hrefs.\n" if $V>5;
		if ($line =~ /^<a\s/i) {
			print V "scan: HTML anchor not terminated.\n" if $V>5;
			$inHTML = 1;		# Note it's HTML.
			next buffer;		# Append another line.
		}
		# ABC tune header: X: (index) or T: (title) line.
		if ($line =~ /^([XT]):/) {
			$flag = $1;
			++$tunes;			# Count the (possible) ABC tunes.
			print V "scan: \"$line\" may be ABC.\n" if $V>3;
			$d1 = $Udepth{$prnt};
			$n = &abc($line,$prnt);
			print V "scan: $n lines of ABC found in tune $tunes.\n" if $V>3;
			$d2 = $Udepth{$prnt};
			print V "scan: Parent \"$prnt\" promoted from level $d1 to $d2.\n" if $V>2 && $d1!=$d2;
			$line = '';
			next buffer;
		}
		print V "scan: No X or T line.\n" if $V>5;
		if ($line =~ s"^(<.*>)\s*"") {	# Strip a leading HTML tag and retry.
			$inHTML = 1;
			print V "scan: HTML tag \"$1\" deleted.\n" if $V>5;
		}
		next buffer if !$line;
		if (-d $line) {		# The whole line names a local directory.
			print V "scan: \"$line\" is local directory.\n" if $V>3;
			$line =~ s"/*$"/";			# Make sure it has final '/'.
			&URL($line,$depth+1,$URL);	# Add to list of URLs to process.
			next buffer;
		}
		if (-f $line) {		# The whole line names a local file.
			print V "scan: \"$line\" is local file.\n" if $V>3;
			&URL($line,$depth+1,$URL);
			next buffer;
		}
		print V "scan: Can't parse \"$line\"\n" if $V>4;
		if ($inHTML) {		# Drop leading non-tag text; keep any tag remnant.
			$line =~ s"^([^<]+)"";
			print V "scan: Drop \"$1\" from HTML.\n" if $V>3;
		} else {
			print V "scan: Drop \"$line\"\n" if $V>3;
			$line = '';
		}
		print V "scan: \"$line\"\n" if $V>6;
	}
	if ($closeDOC || $HTTPalrm) {	# Some disaster detected.
		$t = time - Max($HTTPcontime,$HTTPreadtime);
		print V "Close \"$URL\" (timeout alarm after $t sec.)\n" if $V>1;
		$closeDOC = $HTTPalrm = 0;
	} else {
		print V "scan: EOF on DOC file.\n" if $V>3;
	}
	&closeDOC if $DOCopen;
	if ($tunes>0 && $V>1) {	# ABC line count.
		($ss,$mm,$hh,$DD,$MM,$YY) = gmtime(time); ++$MM; $YY += 1900;
		$s = ($tunes > 1) ? 'tunes' : 'tune';
		print V "      \"$lurl\" ==== $tunes abc $s ==== $YY/$MM/$DD $hh:$mm:$ss\n";
	}
	alarm 0; $SIG{ALRM} = 0;	# Cancel any pending read timeout.
	print V "scan: Set alarm 0.\n" if $V>3;
	return 1;
}

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Return one line of input from the current document.  Here is  where
# we  try  to  deal with incoming HTML, by splitting it into lines on
# any strings of \r or \n, and stripping out tags.

sub docline {
	# Return one line of input from the current document (handle DOC),
	# buffering partial chunks in the global $DOCline.  In HTML mode we
	# split on runs of \r/\n and strip common layout tags; otherwise we
	# hand back the chunk as-is.  Returns '' at EOF.
	local($line);
	if (!$DOCline) {	# Buffer empty: refill.  NOTE(review): a literal "0" chunk also looks empty here -- confirm harmless.
		$DOCline = <DOC>;	# Next chunk of input.
		print V ":DOC: \"$DOCline\"\n" if $V>4;
		return '' if !$DOCline;
	}
	if ($inHTML) {			# HTML: return one "line".
		if ($DOCline) {		# Any input left?
			if ($DOCline =~ s"^(.*?)[\r\n]+"") {
				$line = $1;
				print V "line: $line\n" if $V>4;
				# Strip layout-only tags (breaks, rules, images, list markup).
				$line =~ s"</*(bl|br|hr|img|li|dl|p|pre|ul)\b.*?>""ig;
			} else {		# No line terminator: take the whole buffer.
				$line = $DOCline;
				$DOCline = '';
			}
		}
	} else {				# Not HTML.
		$line = $DOCline;
		$DOCline = '';
	}
	$line =~ s/\s*$/\n/;	# Normalize trailing whitespace to one newline.
	print V "LINE: $line" if $V>3;
	return $line;
}

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
sub sigcont {
	# SIGCONT handler: close the document handle if it is open, then
	# dump the call stack for diagnosis.
	local($wasopen) = $DOCopen;	# &closeDOC clears $DOCopen, so remember it.
	print V "$me/sigcont: close DOC ...\n" if $V>3;
	if ($wasopen) {
		&closeDOC;
		print V "$me/sigcont: closed DOC.\n" if $V>3;
	}
	&showcalls();
}

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
sub sigdone {
	# Termination handler: flag shutdown, show where we were, and exit.
	$finishup = 1;		# Mostly moot, since we exit just below.
	&showcalls();
	exit(1);
}

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
sub showcalls {
	# Dump the current Perl call stack to the V log, one frame per line.
	local($lvl,$pkg,$file,$lineno,$subname);
	print V "$me: Call stack:\n" if $V>0;
	for ($lvl = 0; (($pkg,$file,$lineno,$subname) = caller($lvl)); ++$lvl) {
		printf V "\tLevel %3d line %5d $file\tin $subname\n",$lvl,$lineno if $V>0;
	}
}

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Register a new URL for later scanning.  We can do some weeding  out #
# here  if  we  so desire.  We return 0 if we reject the URL; 1 if we #
# accept it, though callers don't yet use this info.  We implement  a #
# special  ABC  kludge here:  If the URL ends with .abc, we accept it #
# even if it's beyond the maximum depth.                              #
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #

sub URL {
	# Register a new URL for later scanning; returns 1 if accepted, 0 or
	# undef if rejected.  ABC kludge (see block comment above): a URL
	# ending in .abc is accepted even beyond $maxdepth, as long as it is
	# not beyond $abcdepth as well.
	local($url,		# New URL.
		$dpth,		# Depth in search.
		$prnt)		# Parent URL.
			= @_;
	local($F) = ':URL';
	print V "$F: \"$url\" d=$dpth.\n" if $V>4;
	if ($dpth > $maxdepth) {
		# BUG FIX: the original rejected the URL in BOTH branches here,
		# so the documented .abc kludge never fired (and the ">->->"
		# trace below was unreachable).  Now a .abc URL within $abcdepth
		# falls through and is accepted.
		if ($url =~ /\.abc$/i) {
			if ($dpth > $abcdepth) {
				print V "$F: \"$url\" ignored (depth $dpth > $abcdepth && .abc file)\n" if $V>3;
				return 0;
			}
		} else {
			print V "$F: \"$url\" ignored (depth $dpth >= $maxdepth)\n" if $V>3;
			return 0;
		}
	}
	print V ">->-> \"$url\" [$depth]\n" if $dpth>$maxdepth && $V>2;
	$url = &URLtrim($url);	# Shorten the URL if possible.
	print V "$F: u=\"$url\" after URLtrim\n" if $V>4;
	if ($prnt) {
		unless ($Parent{$url}) {
			# BUG FIX: was "$PRNT" (an all-caps typo), which stored undef.
			$Parent{$url} = $prnt;
			print V "$F: \"$url\" <- \"$prnt\"\n" if $V>3;
		} else {
			print V "$F: \"$url\" parent already known.\n" if $V>3;
		}
	} else {
		print V "$F: \"$url\" parent unknown.\n" if $V>3;
	}
#	$url =~ s"/www.irishfest.com/oneills/___/"/www.irishfest.com/oneills/dev/";
	print V "----> \"$url\" [$dpth] <- \"$prnt\"\n" if $V>1;
	if ($url !~ /^(http|file):/i) {	# Accept only these protocols.
		print V "$F: \"$url\" ignored (http|file rule)\n" if $V>2;
		return undef;
	}
	if ($url =~ m"\bbin/"i) {		# Don't try to fetch from bin/
		print V "$F: \"$url\" ignored (bin rule)\n" if $V>2;
		return undef;
	}
	if ($url =~ m"\bcgi\b"i) {		# Don't try to fetch from cgi
		print V "$F: \"$url\" ignored (cgi rule)\n" if $V>2;
		return undef;
	}
	if (($url =~ m"^http://([-\w.:]+)/"i) && $BadHost{$1}) {
		print V "$F: \"$url\" ignored (bad host \"$1\")\n" if $V>2;
		return undef;
	}
	if ($Udepth{$url} < 1) {	# First sighting: queue it.
		push @URLs, $url;
		$Udepth{$url} = $dpth;
		print V "$F: \"$url\" new at depth $dpth.\n" if $V>3;
		$Parent{$url} = $prnt if $prnt;
	} else {
		if ($Udepth{$url} > $dpth) {	# Eventually this shouldn't happen.
			print V "$F: \"$url\" new depth $dpth (was $Udepth{$url}).\n" if $V>1;
			$Udepth{$url} = $dpth;
		} else {
			print V "$F: \"$url\" old at depth $Udepth{$url}.\n" if $V>3;
		}
	}
	return 1;
}

