#!/usr/bin/perl
#!/usr/bin/perl

# NAME
#   TuneURLs - list the URLs of web directories holding N or more ABC tunes

# SYNOPSIS
#   TuneURLs [N] BotRun.out

# DESCRIPTION
#   This is a postprocessor filter for the verbose output of the TuneBot
#   script (usually run from the BotRun script).  We extract information
#   from the log about the web directories that have N or more ABC tunes
#   total in their files, and output the URLs.

#   The default value of N is 100.

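#   At present, once a URL is produced, a parent directory is suppressed
#   only if its total equals the total on the most recently printed line;
#   a parent with a different total is still listed.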

# FILES
#  The input file is the  output  of  the  BotRun  script,  which  at
#  verbose levels 2 and higher includes lines that look like:
#       52 http://trillian.mit.edu/~jc/music/abc/Scotland/march/
#      592 http://trillian.mit.edu/~jc/music/abc/Scotland/
#     3024 http://trillian.mit.edu/~jc/music/abc/
#     3024 http://trillian.mit.edu/~jc/music/
#     3024 http://trillian.mit.edu/~jc/
#     3024 http://trillian.mit.edu/
#  For the default N=100, the output from this would be:
#      592 http://trillian.mit.edu/~jc/music/abc/Scotland/
#     3024 http://trillian.mit.edu/~jc/music/abc/

# AUTHOR
#  John Chambers <jc@trillian.mit.edu>

# Program name: $0 with any leading directory path removed.  It is used
# in diagnostics and in the name of the verbosity environment variable.
$me = $0;
$me =~ s{.*/}{};

# Verbosity level, taken from the environment variable V_<program name>;
# it falls back to 1 when that variable is unset, empty or zero.
$V = $ENV{'V_' . $me} || 1;

# Minimum tune count a directory needs to be reported, unless a numeric
# command-line argument replaces it.
$N = 100;

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Comparison routine for sort(), working on the package variables $a  #
# and $b.  When one URL is a proper initial substring of the other,   #
# the longer URL is ordered first, so a directory always follows the  #
# things below it.  Any other pair is ordered by plain string         #
# comparison.  Returns -1, 0 or 1 in the usual sort() convention.     #
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
sub urlsort {
	my $len_a = length($a);
	my $len_b = length($b);
	if ($len_a < $len_b) {
		# $a is a proper prefix of $b: the longer $b goes first.
		return 1 if substr($b, 0, $len_a) eq $a;
	} elsif ($len_b < $len_a) {
		# $b is a proper prefix of $a: the longer $a goes first.
		return -1 if substr($a, 0, $len_b) eq $b;
	}
	return $a cmp $b;
}

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Process the command line in order.  An all-digit argument replaces the
# threshold $N (so it only affects files named after it); any other
# argument names a log file to scan, with "-" meaning standard input.
# Every "count URL" line whose count is at least $N is recorded in
# %Tabcs, keyed by URL.  An unreadable file is reported on STDERR and
# skipped.
foreach my $arg (@ARGV) {
	if ($arg =~ /^\d+$/) {
		$N = $arg;
		next;
	}
	my $in;
	if ($arg eq '-') {
		# Two-argument open() treated "-" as STDIN; keep that working.
		$in = \*STDIN;
	} elsif (!open($in, '<', $arg)) {
		# Three-argument open: the name is only ever read as a file,
		# never interpreted as a mode or a command to run.
		print STDERR "$me: Can't read \"$arg\" ($!)\n";
		next;
	}
	# Read one line at a time instead of slurping the whole log.
	while (my $line = <$in>) {
		# Directory totals look like "  592 http://host/path/".
		# https is accepted as well as http.
		my ($count, $url) = ($line =~ m'^\s*(\d+)\s+(https?://.*)$') or next;
		next if $count < $N;
		$Tabcs{$url} = $count;
		print "<$Tabcs{$url}\t$url\n" if $V>2;
	}
	close($in) unless $arg eq '-';
}

# Print the selected URLs.  urlsort puts every URL ahead of the
# directories that contain it, so children are seen before parents.
# Printing a URL marks it and each of its ancestor directories in
# %Tdone.  A marked URL is skipped only when its count equals the count
# on the most recently printed line; a marked URL with a different
# count is still printed.
my $last_count;		# count on the most recently printed line
for my $url (sort(urlsort keys(%Tabcs))) {
	my $count = $Tabcs{$url};
	print ">$count\t$url\n" if $V>2;
	# $last_count is only consulted once something has been marked,
	# which cannot happen before the first line has been printed.
	if ($Tdone{$url} && $count == $last_count) {
		print "------ $url\n" if $V>1;
		next;
	}
	$last_count = $count;
	# The URL is passed as an argument, never as part of the format:
	# percent-escapes such as %7E or %2F would otherwise be consumed
	# as printf conversions and corrupt the output.
	printf "%6d\t%s\n", $count, $url;
	++$Tdone{$url};
	print "Done$Tdone{$url}\t$url\n" if $V>2;
	# Strip one trailing path component at a time, marking each
	# ancestor.  The pattern stops matching at "scheme://host/".
	my $dir = $url;
	while ($dir =~ m'^([a-z]+://.+)/[^/]*/$'i) {
		$dir = "$1/";
		++$Tdone{$dir};
		print "Done$Tdone{$dir}\t$dir\n" if $V>2;
	}
}

exit 0;
