#!/usr/bin/perl
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
#       THIS IS A HIGHLY EXPERIMENTAL PROGRAM. USE IT AT YOUR OWN RISK!       #
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
#
#NAME
#  ABCsearch - Ask a search site for pointers to ABC tunes
#
#SYNOPSIS
#  ABCsearch &
#
#REQUIRES
	use Carp;
#	require "URLdata.pm";	# Fetch one URL
#	require "HTTPcon.pm";	# Make HTTP connections
#	require "HTMLdir.pm";	# Handling local directories
	require "Vopt.pm";		# Verbosity routines
#
#DESCRIPTION
#  This is a meta-meta-search program.
#
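#  It was originally set up to talk to google.com; the engines  actually
#  queried are whichever entries are enabled in %seURL below (currently only
#  yahoo).  This may change at my whim.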
#
#  The basic idea is to pass the search engine  a  number  of  keywords,  and
#  grovel through the replies to try to find sites with ABC music files. This
#  is done by looking for URLs, and adding them to the ABC search bot's lists
#  of per-host starting URLs.
#
#  So far, the best search keys for ABC music seem to be  "ABC",  "notation",
#  "music" and "tunes".  This may also change.
#
#OPTIONS
#
#EXAMPLES
#
#  Here are the URLs returned by google for a search:
#    http://www.google.com/search?q=ABC+tunes&btnG=Google+Search
#    http://www.google.com/search?q=ABC+tunes&hl=en&start=10&sa=N
#    http://www.google.com/search?q=ABC+tunes&hl=en&start=20&sa=N
#
#FILES
#  add/$h
#    This is where the search bot keeps its list of likely URLs for host $h.
#
#  new/$h
#    Some experimental code also saves per-host URLs here, and we  might  use
#    these  files  rather  than add/$h at times.  The best scheme isn't quite
#    clear yet.
#
#BUGS
#
#SEE ALSO
#
#AUTHOR
#  John Chambers <jc@trillian.mit.edu>
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #

# Script setup: unbuffered output, program name, verbosity, limits and the
# tables that drive the search.  Everything here is a package global because
# oneSE() and oneSearch() read these variables directly.

$| = 1;
$exitstat = 0;
($P = $0) =~ s".*/"";	# Program name without its directory part
&Vopt($ENV{"V_$P"} || $ENV{"D_$P"} || 2);	# Verbose level

$matchlimit = 1000;	# How many matches to process per SE
#matchlimit =    3;	# For debugging
$trylimit   =  100;	# Max number of times to ask each host

@keys = (		# Search key words
	'ABC',
	'notation',
	'tunes',
	'music',
	'collection',
);
%seURL = (		# URLs for search engines
	'yahoo'     => 'http://search.yahoo.com/search',
#	'google'    => 'http://www.google.com/search',
#	'teoma'     => 'http://s.teoma.com/search',
#	'altavista' => 'http://www.altavista.com/web/results',
#	'walshaw'   => 'http://abcnotation.org.uk/tunes.html',
);
@URLs = (		# URLs to search
);
# These are regular expressions, so every literal dot must be escaped;
# an unescaped "." would match any character in the host name.
@IgnoreHosts = (	# Ignore hosts that match these patterns
	'search\.yahoo\.com$',
	'\.altavista\.com$',
	'\.google\.com$',
	'\.teoma\.com$',
	'\.versiontracker\.com$',	# Loooooong delays
);
@IgnoreURL = (	# Don't do these URLs (again)
);

# Query each configured search engine in name order.  oneSE() reads the
# current engine name from the global $se.
for $se (sort keys %seURL) {
	print V "$P: Search Engine \"$se\"\n" if $V>1;
	$mcount = oneSE() || 0;	# Show 0 rather than an empty string
	print V "$P: Search Engine \"$se\" $mcount matches.\n" if $V>1;
}

print V "$P: Exit with status $exitstat.\n" if $V>1;
exit $exitstat;
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# oneSE()
#   Query the search engine named by the global $se, one result page at  a
#   time, handing each page URL to oneSearch().
#
#   Reads the globals $se, @keys, %seURL, $trylimit and $matchlimit.  Sets
#   the global $pat to the regular expression oneSearch() uses to pull host
#   and path out of the result page for this engine.
#
#   The loop stops when any of these is reached:
#     - $trylimit pages have been requested,
#     - $matchlimit matches have been counted in $seMatch{$se},
#     - the result offset $ndx reaches $matchlimit,
#     - the engine name is not one we know how to query.
#
#   Returns the number of matches recorded in $seMatch{$se} (0 if none).
sub oneSE {
	local($F) = 'oneSE';
	local($h,$kwds,$ndx,$url);
	$kwds = join('+',@keys);	# All SEs so far take '+'-joined keywords
	$matches = $ndx = $tries = 0;	# Number of matches so far
	while ($tries < $trylimit && $matches < $matchlimit && $ndx < $matchlimit) {
		if ($se eq 'google') {
			$num = 50;	# Number of matches per page
			if ($ndx) {	# Queries after the first
				$url = "$seURL{$se}?q=$kwds&start=$ndx&num=$num&sa=N";
			} else {	# First query
				$url = "$seURL{$se}?q=$kwds&btnG=Google+Search&num=$num";
			}
			$ndx += $num;
			$pat = 'http://([-.:\w]+)(/[^">]+)>';
		} elsif ($se eq 'altavista') {
			$num = 10;	# Number of matches per page
			if ($ndx) {	# Queries after the first
				$url = "$seURL{$se}?itag=ody&kgs=1&kls=0&q=$kwds&stq=$ndx";
			} else {	# First query
				$url = "$seURL{$se}?itag=ody&kgs=1&kls=0&q=$kwds";
			}
			$ndx += $num;
			$pat = '<span class=ngrn>([-.:\w]+)(/[^>]+)\s*</span>';
		} elsif ($se eq 'yahoo') {
			$num = 10;		# Can yahoo do more than 10 at a time?
			$url = "$seURL{$se}?q=$kwds&c=web&cs=iso-8859-1&o=$ndx";
			$ndx += $num;
			$pat = '<a\s+class="yschttl\s+spt"\s+href="http://([-.:\w]+)(/[^"]+).';
			#       <a class="yschttl spt" href="http://abcnotation.com/tunes"
		} elsif ($se eq 'teoma') {
			$num = $matchlimit;		# Teoma can do any number at a time
			$url = "$seURL{$se}?q=$kwds&nw=False&u=$num";
			$ndx += $num;
			$pat = ';pg=1&amp;u=http://([-.:\w]+)(/[^"]+)" id=';
		} elsif ($se eq 'walshaw') {
			$num = 1;		# Chris Walshaw's site list isn't a search
			$url = $seURL{$se};
			# It is a single static page with no paging parameter, so
			# push $ndx to the limit to fetch it exactly once instead
			# of re-reading the same page $trylimit times.
			$ndx = $matchlimit;
			$pat = '<meta.*URL=([^"]+)">';
		} else {
			print STDERR "$0/oneSE: Unknown SE \"$se\"\n";
			# There is no URL or pattern to use, and $ndx would never
			# advance, so give up rather than spin until $trylimit.
			last;
		}
		oneSearch($se,$url);
		# oneSearch() accumulates its count in $seMatch{$se}; copy it
		# back so the $matchlimit test in the loop guard can take effect.
		$matches = $seMatch{$se} || 0;
		$tries ++;
	}
	return $seMatch{$se} || 0;
}

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# oneSearch($Search, $url)
#   Fetch one search-result page and record every new URL found in it.
#
#   $Search  name of the search engine; used as the key into %seMatch.
#   $url     URL of the result page to fetch.
#
#   The page is read from the output of the external command
#   "webcat +TH $url" (presumably it writes the document to stdout -- the
#   meaning of +TH is not visible from this file).  Each line is scanned
#   repeatedly with the global pattern $pat, set by oneSE(); capture 1 is
#   taken as the host and capture 2 as the path.  Hosts matching any entry
#   of @IgnoreHosts are skipped.  Each host+path not already in the global
#   %got is appended to the file add/<host>.
#
#   Adds the number of pattern matches (including ignored ones) to
#   $seMatch{$Search}.  Returns 0 if there is nothing to fetch or the fetch
#   command cannot be started, otherwise 1.
sub oneSearch {
	my($Search,$url) = @_;
	local($F) = 'oneSearch';
	my($doc,$n);
	# Without a URL there is nothing to fetch, and an empty pattern would
	# make s/// silently reuse the last successful regex.
	unless (defined($url) && length($url) && defined($pat) && length($pat)) {
		print STDERR "$0/$F: No URL or match pattern for \"$Search\"\n";
		return 0;
	}
	print "cmd={webcat +TH '$url'}\n" if $V>2;
	# List-form pipe open: the URL is passed as a single argument and is
	# never interpreted by a shell.
	unless (open($doc, '-|', 'webcat', '+TH', $url)) {
		print V "$F: Can't get \"$url\" [$!]\n" if $V>0;
		return 0;
	}
	print "\nGET: $url\n" if $V>1;
	my $re = qr/$pat/;	# Compile the per-engine pattern once
	$n = 0;
line:
	while (defined(my $line = <$doc>)) {	# One line at a time, no slurp
		if ($line =~ m'"yschttl spt"') {
			print "$line\n" if $V>3;
		}
	url:
		# Each successful substitution deletes the matched text, so the
		# loop ends once the line has no more matches.
		while ($line =~ s/$re//) {
			++$n;		# Count the matches.
			my $h = $1;
			my $u = defined($2) ? $2 : '';	# Some patterns have one capture
			$u =~ s/[\r\s"]+$//;
			$u =~ s/%7E/~/gi;	# Decode every escaped tilde
			for my $ip (@IgnoreHosts) {
				if ($h =~ m"$ip") {
					print STDERR "$0: Ignored host \"$h\"\n" if $V>2;
					next url;
				}
			}
			print "Matched h='$h' u='$u'\n" if $V>1;
			# The host becomes part of a file name below, so allow only
			# plain host-name characters.
			unless ($h =~ /^[-_.:\w]*$/) {
				print STDERR "$0: Bogus host \"$h\" ignored.\n";
				next;
			}
			unless ($got{"$h$u"}++) {
				my $afile = "add/$h";
				my $add;
				unless (open($add, '>>', $afile)) {
					print STDERR "$0: Can't write \"$afile\" ($!)\n";
					next line;
				}
				print $add "\n0 U D:1 $u\n";
				print "URL: http://$h$u\n" if $V>2;
				# Buffered write errors only show up at close time.
				unless (close($add)) {
					print STDERR "$0: Can't close \"$afile\" ($!)\n";
				}
			}
		}
	}
	# Closing a pipe waits for the child and reports its exit status.
	unless (close($doc)) {
		print V "$F: webcat failed for \"$url\" [status $?]\n" if $V>0;
	}
	$seMatch{$Search} += $n;
	print "$0/oneSearch: $n matches, $seMatch{$Search} total.\n";
	return 1;
}

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #