#!/usr/bin/perl
#
#NAME
#  Hosts2html - convert the hst/* files to alphabetized HTML files
#
#SYNOPSIS
#  Hosts2html &
#
#REQUIRES
#  These should be in the same directory; if not, you'll have to  set
#  @INC to say where to find them.
#
	require "Vopt.pm";
	require "Backup.pm";
	require "CTitle.pm";
#
#DESCRIPTION
#  The hst/ directory is scanned for host files, and  for  each,  the
#  information  about  ABC tunes is extracted and written to a set of
#  HTML files.  We create the output files in the  ndx/  directory,
#  which must exist.
#
#  This is a post-processor for the abcbot program, which creates the
#  per-host  ABC info files in the hst/ directory.  The data there is
#  in a random order, and not HTML.  The ndx/* files that  we  create
#  may  be  used  directly to locate tunes by name.  They may also be
#  searched via the findtune.html web page, which provides lookup via
#  perl patterns.
#
#OPTIONS
#   None, so far.
#
#ENVIRONMENT
#   We use the Vopt.pm module to set up  verbose-mode  output.   This
#   means  that  we  use the environment variable V_Hosts2html, whose
#   value should consist of a numeric verbose level  followed  by  a
#   file name.  The default output is STDERR. You can set the verbose
#   level as follows:
#
#   setenv V_Hosts2html 3/tmp/Hosts2html.out	# csh or tcsh users.
#   export V_Hosts2html=3/tmp/Hosts2html.out	# ksh or bash users.
#
#BUGS
#  We don't attempt to create any needed directories.
#
#SEE ALSO
#
#AUTHOR
#  John Chambers <jc@trillian.mit.edu>

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Global setup.  Everything here is a package global that  the  rest
# of the script (and possibly the required modules) reads.

$| = 1;					# Autoflush the currently selected output handle
$exitstat = 0;			# Exit status reported at the end of the run
($P = $0) =~ s".*/"";	# Program name with any leading directory removed

# Verbose setup.  Vopt() comes from Vopt.pm, which is not visible here;
# the rest of this file assumes it sets $V (the verbose level) and opens
# the V filehandle.  The first non-empty of V_<prog>, D_<prog>, T_<prog>
# is passed to it, defaulting to level '1'.
&Vopt($ENV{"V_$P"} || $ENV{"D_$P"} || $ENV{"T_$P"} || '1');
$Hlimit = $ENV{"H_$P"} || 0;	# Host limit, for debugging.
# Report the host limit that was actually read ($Hlimit); the old message
# printed $H, which has no value at this point.
print V "$P: Started with H=$Hlimit V=$V.\n" if $V>1;

$articles = '-';	# Suppress articles in titles (not read in this file; presumably used by CTitle.pm)
$Xmax  =  6;		# Max width of index
$Kmax  = 12;		# Max width of key
$Hmax  =  6;		# Max width of header list
$Mmax  =  5;		# Max width of meter
$C1max = 15;		# Max width of Code 1
$C2max = 15;		# Max width of Code 2
%reduce = (			# URL reductions: full-URL prefix => replacement prefix.
	'http://localhost/~jc/'        => '/~jc/',
	'http://dmz.atsbank.com/~jc/'  => '/~jc/',
	'http://trillian.mit.edu/~jc/' => '/~jc/',
);
@suppress = (		# Path patterns; not referenced elsewhere in this file.
	'jc/.*/Scotland/.*\.hdr$'
);

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Extract the list of hosts from the hst/ directory.  We  ignore  any
# file with '-' or '.LCK' on the end of its name.

# Collect the candidate host files, dropping names that end in '-' or
# '.LCK' (optionally followed by white space), then note how many remain.
@hosts = grep { $_ !~ /(-|\.LCK)\s*$/ } glob("hst/*");
$hosts = scalar(@hosts);
if ($V > 2) {
	print V "$P: $esep\n";
}

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Now run through the host list, and for each host,  read  its  file,
# leaving behind information about ABC titles in the %T table.

# Input phase.  For every host file we read its lines and  fill  the
# global %T table with one entry per tune title:
#   key = "CanonicalTitle:ByteCount:URL:DuplicateCounter"
#   val = "X:C1:C2:M:K:H:C:V:Flag:Title"
# Two kinds of input line matter:
#   - a "chunk" line (timestamp, flag, D:n, path) which sets the path
#     that the following tune lines belong to;
#   - a tune line (timestamp, X:n, assorted fields, title last).
print V "$P: We have $hosts hosts.\n" if $V>1;
host:
	foreach $hstfil (@hosts) {
		print V "$P: host file \"$hstfil\"\n" if $V>2;
		if (($host) = ($hstfil =~ m".*/([-_:.\w]+)\s*$")) {
			print V "$P: $hsep\n" if $V>2;
			print V "$P: HOST \"$host\"\n" if $V>1;
			++$Hcount;
			if ($Hlimit > 0 && $Hcount > $Hlimit) {
				print V "$P: Past host limit of $Hlimit; quitting.\n" if $V>1;
				last host;
			}
			# Three-arg open: the name comes from glob() and must be taken
			# literally, never interpreted as a mode or a pipe.
			unless (open(H,'<',$hstfil)) {
				print V "$P: Can't read \"$hstfil\" [$!]\n" if $V>0;
				next host;
			}
			# Don't let the path/size of the previous host's last chunk
			# leak into tune lines that precede this file's first chunk.
			$path  = '';
			$bytes = 0;
			# Read one line at a time rather than slurping the whole file.
line:		while (defined($line = <H>)) {
				print V "$P: Line $line" if $V>5;
				$line =~ s/[\r\n]+$//;
				if ($line eq '') {
					print V "$P: End of chunk.\n" if $V>3;
					# Do we want to forget anything here?
					next line;
				}
				print V "$P: line $line\n" if $V>5;
##				# Assorted line formats that have been used:
##				if	(($line =~ m'^(\d+)(\s+)([-#\w])\s+D:(\d+)\s+B:(\d*)\s+L:([\d/.]*)\s+X:([\d/.]*)\s+T:(\d*)\s+(.*)$')) {
##				#	$dt  = $1;
##				#	$sp  = $2;
##				#	$tp  = $3;
##				#	$Dn  = $4;
##				#	$Bn  = $5;
##				#	$Ln  = $6;
##				#	$Xn  = $7;
##				#	$Tn  = $8;
##					$path = $9;
##					$bytes = sprintf "%06d",$5;
##					print V "$P: Chunk ts=$1 dt=$2 fl='$3' D:$4 \"$path\"\n" if $V>2;
##					next line;
##				}
##				if	(($line =~ m'^(\d+)(\s+)([-#\w])\s+D:(\d+)\s+B:(\d*)\s+X:([\d/.]*)\s+T:(\d*)\s+(.*)$')) {
##				#	$dt  = $1;
##				#	$sp  = $2;
##				#	$tp  = $3;
##				#	$Dn  = $4;
##				#	$Bn  = $5;
##				#	$Xn  = $6;
##				#	$Tn  = $7;
##					$path = $8;
##					$bytes = sprintf "%06d",$5;
##					print V "$P: Chunk ts=$1 dt=$2 fl='$3' D:$4 \"$path\"\n" if $V>2;
##					next line;
##				}
##				if	(($line =~ m'^(\d+)(\s+)([-#\w])\s+D:(\d+)\s+B:(\d+)\s+(.*)$')) {
##				#	$dt  = $1;
##				#	$sp  = $2;
##				#	$tp  = $3;
##				#	$Dn  = $4;
##				#	$Bn  = $5;
##					$path = $6;
##					$bytes = sprintf "%06d",$5;
##					print V "$P: Chunk ts=$1 dt=$2 fl='$3' D:$4 \"$path\"\n" if $V>2;
##					next line;
##				}
				# General code to split the line apart one field at a time:
				# First, pick off the initial timestamp, which may have two forms,
				# depending on whether abcbot was run with the debugging turned on
				if	(($line =~ m'^(\d+)(\s+)([-#\w])\s+D:(\d+)\s+(.*)$')
				||	 ($line =~ m'^(\d+)=(\d+)\s+([-#\w])\s+D:(\d+)\s+(.*)$')) {
					$path = $5;
					$bytes = 0;		# No byte count is extracted from this line format.
					print V "$P: Chunk ts=$1 dt=$2 fl='$3' D:$4 \"$path\"\n" if $V>2;
					next line;
				}
				print V "$P: ---- $line\n" if $V>5;
				# Legacy per-line scalars; the parsed fields are kept in %V below.
				$X = $H = $K = $M = $C1 = $C2 = $T = '';
## 975765312 X:33 M:9/8 K:F H:"BCLOQZ C1=067663066067664 C2=ddddududddduddu T:Some Title
				if ($line =~ s"^(\d+)\s+X:([\d/.]+)\s+"") {
					%V = ();			# Values of various index fields (distinct from scalar $V)
					$V{X} = $2;
					# Break up the line one field at a time.  Compare against
					# the empty string so a leftover "0" is still examined.
					while ($line ne '') {
						if ($line =~ s"^([TPN]):(.*)$"") {	# The title comes last
							$V{':'} = $1;					# [TPN] flag for title
							$V{CT} = &CTitle($V{T} = $2);	# Canonical and original title
							print V "$P: 1 V{:}=$V{':'} V{CT}=$V{CT} V{T}=\"$V{T}\"\n" if $V>2;
						} elsif ($line =~ s"^\s*(\w+)[:=]([^:=]*?)\s+(\w+)[:=]"$3:") {
							# name:value (value free of ':' and '=') followed by
							# another field, whose name is put back as "name:".
							$V{$1} = $2;
							print V "$P: 2 V{$1}='$V{$1}'\n" if $V>2;
						} elsif ($line =~ s"^\s*(\w+)[:=](.*?)\s+(\w+)[:=]"$3:") {
							# Same, but tolerating ':' or '=' inside the value.
							$V{$1} = $2;
							print V "$P: 3 V{$1}='$V{$1}'\n" if $V>2;
						} else {
							print V "$P: ### Can't parse \"$line\"\n" if $V>0;
							$line =~ s"^\s*\S+\s*"";	# Pop off one field
						}
					}
					print V "$P: CT=$V{CT} B=$bytes\n" if $V>3;
					unless ($ct = $V{CT}) {			# Canonical title.
						print V "$P: Null CTitle for \"$V{T}\"\n" if $V>2;
						next line;
					}
					$url = "http://$host$path";			# Generate HREF URL
					$cn  = ++$CTN{$ct};					# Counter to distinguish duplicates
					$bc  = sprintf "%06d",$bytes;		# Byte count, fixed length for sorting
					$key = "${ct}:${bc}:${url}:${cn}";	# Sort key for entry
					$val = "$V{X}:$V{C1}:$V{C2}:$V{M}:$V{K}:$V{H}:$V{C}:$V{V}:$V{':'}:$V{T}";	# Fields we want now.
					print V "$P: key '$key' ==> '$val'\n" if $V>2;
					$T{$key} = $val;	# TITLE:URL -> X:C1:C2:M:K:H:C:V:Flag:Title
					next line;
				}
			}
			close(H);	# Release the handle before moving to the next host.
		}
	}
print V "$P: $esep\n" if $V>2;

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# All the data has been read into the %T array.  We now  run  through  it  in #
# lexical order, and output the data.                                         #
#                                                                             #
# At  present,  we  use  the first two chars of the (upper-case) TTL field to #
# decide which output file to produce.  Every time these two chars change, we #
# create a new output file.                                                   #
#                                                                             #
# Note that the sort key starts with the canonicalized title, followed by the #
# 6-digit file size.  This causes the entries for a single title to be sorted #
# with  smaller  files  first.   This is intentional, because I've found that #
# users tend to fetch the first version of a title first. This way, they will #
# ask for the smaller files first, minimizing the network load.               #
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #

# Output phase.  Walk %T in sorted key order and write each entry to
# ndx/<CC>.html, where <CC> is the first two characters of the canonical
# title (padded with '_').  A new file is started whenever <CC> changes.
print V "$P: Input phase done; processing data.\n" if $V>1;
$CX    = '';	# Two-char prefix of the output file currently selected
$Oopen = 0;		# True only while O is open on an ndx/ file
for $key (sort keys %T) {
	$val = $T{$key};
	print V "$P: key '$key' val '$val'\n" if $V>2;
	if (($TTL,$SZ,$URL,$CN) = ($key =~ m"^(\w+):(\d*):(.*):(\d*)$")) {
		print V "$P: T='$TTL' U='$URL' SZ=$SZ N=$CN.\n" if $V>2;
		$CC = substr($TTL . '__',0,2);
		# The value has ten ':'-separated fields, so ten captures are needed:
		# nine without ':' and the title last (it may itself contain ':').
		# The voice field goes to $Vc, NOT $V, which is the verbose level.
		if (($X,$C1,$C2,$M,$K,$H,$C,$Vc,$f,$Title) = ($val =~ m"^([\d/.]+):([^:]*):([^:]*):([^:]*):([^:]*):([^:]*):([^:]*):([^:]*):([^:]*):(.*)$")) {
			print V "$P: === X=$X C1='$C1' C2='$C2' M='$M' H='$H' K='$K' T='$f:$Title'\n" if $V>2;
			if ($CC ne $CX) {
				print V "$P: $hsep\n" if $V>2;
				print V "$P: Closing '$CX' and opening '$CC'.\n" if $V>2;
				# Finish the previous file, if there is one.
				if ($Oopen) {
					print O "</pre></body>\n";
					close(O) || ($exitstat = 1);
				}
				if ($Oopen = open(O,'>',"ndx/$CC.html")) {
					print O "<html><head><title>ABC tunes starting with $CC</title></head><body><pre>\n";
					&line('','','ABC','X','Meter','Key','Headers','Code 1','Code 2','Title');
					&line('','','---','-','-----','---','-------','------','------','-----');
					print V "$CC.html\n" if $V>1;
				} else {
					# Remember the failure in the exit status and write nothing
					# for this prefix instead of printing to a closed handle.
					print V "$0: Can't write '$CC.html' [$!]\n";
					$exitstat = 1;
				}
				$CX = $CC;
				%dup = ();	# Record of duplicate lines.
			}
			&line("$TTL"   ,$URL,'ABC',$X,$M,$K,$H,$C1,$C2,$f,$Title,$SZ) if $Oopen;
		} else {
			print V "$P: Can't parse \"$val\"\n" if $V>1;
		}
	}
}
print V "$P: $esep\n" if $V>2;
if ($Oopen) {
	print O "</pre></body>\n";
	close(O) || ($exitstat = 1);
}

exit $exitstat;
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #

sub line { my $F='line';
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Write one line to the current output file (filehandle O).  The  first  time #
# this  is  called  for  a new file, the args will be constants to produce the #
# column titles, with no URL.  The rest of the calls will be with variable    #
# args to generate one tune reference.                                        #
#                                                                             #
# Reads the globals $Xmax, $Kmax, $Hmax, $Mmax, $C1max, $C2max  (column       #
# widths), %reduce (URL prefix rewrites) and %dup (lines already written to   #
# the current file; an identical line is written only once).                  #
#                                                                             #
# NOTE(review): $Title and $URL are written into the HTML unescaped, and they #
# originate from the scanned host files.  Escaping was left alone here        #
# because other tools may match on the raw text of these lines - confirm.     #
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
	my($TTL,	# Canonical title, all upper case
		$URL,	# Full URL, or URI for local files
		$Fmt,	# Format (always 'ABC' now)
		$X,		# Index number
		$M,		# Meter
		$K,		# Key
		$H,		# Header lines
		$C1,	# JC code (GB code difference)
		$C2,	# USD code
		$f,		# Header flag ([TPN]) for title
		$Title,	# Title in original form
		$Size	# File size, in bytes.
	) = @_;
	my($BC,$XX,$l);
	# Pad (and, except for $H, truncate) each column to its fixed width.
	$C1 = substr(($C1 . (' ' x $C1max)),0,$C1max);
	$C2 = substr(($C2 . (' ' x $C2max)),0,$C2max);
	$K  = substr($K . (' ' x $Kmax), 0, $Kmax);
	$H  = substr($H . (' ' x $Hmax), 0, $Hmax) if length($H) < $Hmax;	# Long header lists are kept whole
	$M  = substr($M . (' ' x $Mmax), 0, $Mmax);
	$XX = substr((' ' x $Xmax) . $X, -$Xmax, $Xmax);	# Right-justified index
	$BC = sprintf "%08d",$Size;
	$f  = 'T' unless $f;	# Title [TPN] flag
	$Title = '' unless defined $Title;	# Column-title calls pass no title
	print V "$F:	f='$f' T=\"$Title\"\n" if $V>2;
	# Shorten URLs that start with a known prefix.  The prefix is matched
	# literally (\Q..\E) and only at the start of the URL.
	for my $p (sort keys %reduce) {
		my $r = $reduce{$p};
		print V "$P: p='$p' r='$r'\n" if $V>4;
		(my $u = $URL) =~ s"^\Q$p\E"$r";
		if ($u ne $URL) {
			print V "$P: '$URL' => '$u'\n" if $V>4;
			$URL = $u;
		}
	}
	if ($URL) {
		$l = "<TT><!--$BC $TTL--><A HREF='$URL'>$Fmt</A> $XX C1:$C1 C2:$C2 M:$M K:$K H:$H </TT>$f:$Title";
	} else {
		# Column-title row: no link.  Close the <TT> element as the tune rows do.
		$l = "<TT><!--$TTL--><!-- -->$Fmt $XX C1:$C1 C2:$C2 M:$M K:$K H:$H </TT>$f:$Title";
	}
	print O "$l\n" unless $dup{$l};
	$dup{$l} = 1;
}
