#!/usr/bin/perl -Tw

# NAME
#   w3cat - catenate web pages to standard output

# SYNOPSIS
#   w3cat [ URL | path ]...

# REQUIRES
#   perl5 and the following modules, which  should  be  found  in
#   the  same directory where you found this program.  Change @INC
#   to include whatever directory you put them into.

	($home) = (($ENV{HOME} || '.') =~ m"^([\w./-]+)");	# Untaint $HOME for the -T flag.
	push @INC, "$home/sh", 'sh', '.';
	require "Vopt.pm";		# Verbose output.
	require "HTTPcon.pm";	# Makes HTTP connection, sends GET.
	require "HTMLdir.pm";	# Produces HTML listing of directory.
	require "URLopen.pm";	# Parses URL and returns file handle.

# DESCRIPTION
#   This is a web version of the Unix cat(1) command.

#   Given a list of URLs, this program reads them one at a time,  and
#   writes  their  contents,  catenated  into  one  long  string,  to
#   standard output.  Local file names may be used instead  of  URLs.
#   Directories are output in a simplified HTML format.

#   If you want to learn how to do this stuff,  you  can  study  this
#   program.   It  is  useful  as  a starting point for writing other
#   simple web clients.  It's not nearly as difficult as people would
#   like  you  to believe.  But the socket stuff uses several magical
#   incantations that "you just have to know"; see the required  perl
#   module files for this socket magic.
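
#   For example (hostnames and file names here are hypothetical):
#     w3cat http://www.example.com/index.html
#     w3cat http://www.example.com/ /etc/motd > both.html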

# OPTIONS
#   The default setup is to deliver only the data (contents) of a URL
#   and discard the header and tracing information.  Here is the list
#   of our current options.  Option letters may be combined into  a
#   single string, as usual, except that options which take  an  arg
#   (I, O, P and V) must come last in the string.  Options may be in
#   any order, and apply to all subsequent URLs unless canceled by a
#   later option.
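#   For example (URLs hypothetical):
#     w3cat +HT http://www.example.com/			# headers and tracing on
#     w3cat -D +H http://www.example.com/		# headers only, no data
#     w3cat '+RIMyAgent/1.0' http://www.example.com/	# arg option (I) last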

#   +D
#     Output the data [default].

#   -D
#     Don't output the data.

#   +H
#     Include the HTTP header info in the output.

#   -H
#     Don't include the HTTP header info in the output [default].

#   -I
#     Don't send agent identification [default].
#   +I"agentid"
#     Send the quoted string as the agent identification.   Some  web
#     sites  won't talk to you unless you pretend to be an acceptable
#     browser.  If there is no string, we send the string:

	$dflagentid = "Mozilla/4.5 [en] (compatible; I; Linux 2.1)";
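
#     For example (agent string hypothetical; quote it if it has spaces):
#       w3cat '+IMyAgent/1.0' http://www.example.com/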

#   +M
#     Macintosh kludge:  convert \r and \r\n line ends to \n.

#   -O<file>
#     Write the data to <file> instead of stdout.  (+O<file> does
#     the same thing.)

#   +P<proxy>
#     Proxy gateway.  If you are hidden behind a proxy, put the proxy
#     hostname (and optionally :port) in a +P option, and we will try
#     to indirect through the proxy server.

#   -P<noproxy>
#     Proxy exception. The <noproxy> string should be a perl pattern.
#     If a URL matches this pattern, the proxy gateway isn't used.
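
#     For example (proxy host and pattern hypothetical):
#       w3cat +Pproxy.example.com:8080 '-P^http://localhost' http://www.example.com/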

#   -R
#     Ignore redirects [default].
#   +R
#     Follow HTTP "Location:" redirects.

#   +T
#     Enable WWW tracing. This sets a global flag that causes various
#     routines to produce lines of the form:
#       <!--name:  ...  some message ...  -->
#     These look like both HTTP header lines and HTML comments.  Some
#     WWW tools (such as the "H" html viewer) can show these to  tell
#     you which stage of a GET operation we have reached.

#   -T
#     Disable WWW tracing [default].

#   -T<n>
#     Timeout of <n> seconds. The default is no timeout, meaning that
#     the  underlying  system's connect() will determine the timeout,
#     if any.

#   -V<version>
#     The HTTP version to use.  The default is -V1.0.  For -V1.1, an
#     extra Host: header is sent, since HTTP/1.1 requires it.
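#     With -V1.1 the request sent looks roughly like this (path and
#     host hypothetical; see HTTPcon.pm for the exact lines we send):
#       GET /index.html HTTP/1.1
#       Host: www.example.com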

#   +X
#     Exit abruptly on timeout.  The default is -X,  which  means  to
#     just  abandon  the URL.  On some systems, there is a bug in the
#     connect() system call that can result in hanging  indefinitely;
#     this option is a last-resort "solution" that "works" if you are
#     only trying to get one URL.

# ENVIRONMENT
#   We use the following from the environment:

#   W3PROXY
#     The name (or  address)  and  an  optional  :port  for  a  proxy
#     gateway.  URLs that don't match the W3NOPROXY pattern  will  be
#     fetched indirectly via the proxy's web server.  If not defined,
#     we will attempt direct TCP connections for all URLs.

#   W3NOPROXY
#     A pattern which is applied to URLs, and if they match, no proxy
#     is used.  That is, any URL that matches W3NOPROXY is considered
#     local, and we will access it directly.  If not defined, we will
#     use W3PROXY for all URLs (if it exists).
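
#   For example (proxy host and pattern hypothetical):
#     setenv W3PROXY proxy.example.com:8080	# csh or tcsh users.
#     export W3PROXY=proxy.example.com:8080	# ksh or bash users.
#     setenv W3NOPROXY '^http://[-\w.]*\.example\.com/'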

# LIMITATIONS
#   So far only the http:// protocol is implemented; ftp://,  file://
#   and  others  may  appear  if  I need them.  If you feel like
#   adding FTP code, please send me a copy.

#   HTTP "redirection" (the "Location:" HTTP header)  is  implemented
#   now  via  the  +R option.  By default, it is disabled and must be
#   handled by the caller, if desired.  This mainly means that if you
#   omit the final '/' on a directory name, we will fail. This is not
#   considered a bug, so it'll probably never be fixed.

# DEBUGGING
#   You can use "perl -dw", of course.  Or you can do the following:
#     setenv V_w3cat 5/tmp/w3cat.out	# csh or tcsh users.
#     export V_w3cat=5/tmp/w3cat.out	# ksh or bash users.
#   This will turn on the "print V" lines for $V in the range 0-5, and
#   write the verbose output to /tmp/w3cat.out.

# BUGS
#   Despite many attempts to detect failure, we still don't optimally
#   handle all the myriad things that can go wrong. In particular, on
#   some systems, the connect() system call can hang indefinitely and
#   can't  be  killed  by  an ALARM.  There does not appear to be any
#   known solution to this problem.  (No amount of clever  code  will
#   help if your process doesn't get any cpu time.)

# AUTHOR
#   <a href="mailto:jc@trillian.mit.edu">John Chambers</a>
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #

$| = 1;					# Don't buffer output.
$exitstat = 0;			# Set this to get a failure exit status.
($me = $0) =~ s"^.*/"";
&Vopt($ENV{"V_$me"} || $ENV{"D_$me"} || $ENV{"T_$me"} || '1');
print V "$me: Started ", scalar localtime, "\n" if $V>1;	# localtime is taint-safe; backticks aren't under -T.

#$bufsiz =    10;		# Small for testing.
$bufsiz = 10000;		# Large for routine use.

$W3hdrs = 0;			# Whether to output header lines.
$W3data = 1;			# Whether to output data.
$HTTPtimeout = 0;		# Read timeout; 0 means none (see -T<n>).
$kludge1_1_404 = 0;		# Set when a 4xx reply forces an HTTP/1.1 retry.

arg:
for $u (@ARGV) {
	$moved = 0;
	$URLerr = "Don't know why";				# Set by URLopen on failure.
	if (($pfx,$opt) = ($u =~ /^([-+])(.*)/)) {
		while ($opt) {						# Each time MUST remove at least one char.
			if ($opt =~ s/^D//i) {			# +D or -D (whether to produce data)
				$W3data = ($pfx eq '+') ? 1 : 0;
				print V ($W3data ? "Do" : "Don't"), " produce data.\n" if $V>1;
			} elsif ($opt =~ s/^H//i) {		# +H or -H (whether to produce headers)
				$W3hdrs = ($pfx eq '+') ? 1 : 0;
				print V ($W3hdrs ? "Do" : "Don't"), " produce headers.\n" if $V>1;
			} elsif ($opt =~ s/^I//i) {		# +I<agentid> or -I
				if ($pfx eq '-') {
					$W3agentid = '';
					print V "$me: No agent identification.\n" if $V>1;
				} else {
					$W3agentid = $opt || $dflagentid;	# Rest of string is id.
					print V "$me: Agent \"$W3agentid\"\n" if $V>1;
					$opt = '';
				}
			} elsif ($opt =~ s/^M//i) {      # +M or -M (Macintosh kludge)
				$MACfl = ($pfx eq '+') ? 1 : 0;
				print V "$me: Mac kludge " . ($MACfl?'on':'off') . ".\n" if $V>1;
			} elsif ($opt =~ s/^O//i) {		# +O or -O (output file)
				$outfile = $opt;			# Rest of string is file name.
				$outopen = 0;				# Force (re)open for the new file.
				$opt = '';
				print V "$me: Output to \"$outfile\"\n" if $V>1;
			} elsif ($opt =~ s/^P//i) {		# +P<proxy> or -P<noproxy>
				if ($pfx eq '-') {
					$W3nopxy = $opt;		# Rest of string is pattern.
					print V "$me: Proxy exceptions are /$W3nopxy/\n" if $V>1;
				} else {
					$W3proxy = $opt;		# Rest of string is proxy host.
					print V "$me: Proxy server is $W3proxy.\n" if $V>1;
				}
				$opt = '';
			} elsif ($opt =~ s/^R//i) {		# +R or -R (whether to follow 302 redirects)
				$W302 = ($pfx eq '+') ? 1 : 0;
				print V ($W302 ? "Do" : "Don't"), " follow 302 redirects.\n" if $V>1;
			} elsif ($opt =~ s/^T//i) {		# +T or -T (WWW tracing) or -T<n> (timeout)
				if ($pfx eq '+') {			# +T enables tracing.
					$W3trace = 1;
					print V "Do produce WWW tracing.\n" if $V>1;
				} elsif ($opt =~ s/^(\d+)//) {	# -T\d+ is timeout.
					$HTTPtimeout = $1;
					print V "HTTPtimeout = $HTTPtimeout sec.\n" if $V>1;
				} else {					# -T disables tracing.
					$W3trace = 0;
					print V "Don't produce WWW tracing.\n" if $V>1;
				}
			} elsif ($opt =~ s/^V//i) {		# +V<version>
				$HTTPvopt = $HTTPversion = $opt;
				$opt = '';
				print V "$me: HTTP version '$HTTPvopt'\n" if $V>1;
			} elsif ($opt =~ s/^X//i) {		# +X
				$HTTPtimexit = ($pfx eq '+') ? 1 : 0;
				print V ($HTTPtimexit ? "Do" : "Don't"), " exit on timeout.\n" if $V>1;
			} else {
				print V "$me: unknown option \"$opt\" ignored.\n";
				$opt =~ s/.//;				# Discard this option char.
			}
		}
	} elsif (&URLopen(*U,$u)) {
		print V "$me: Opened \"$u\"\n" if $V>1;
		if ($HTTPtimeout > 0) {
			alarm $HTTPtimeout;
			$savsig = $SIG{ALRM};
			$SIG{ALRM} = 'READalarm';
			print V "$me: Set alarm after $HTTPtimeout sec.\n" if $V>2;
		}
		$statmax = 0;	# Max status code seen.
		$staterr = '';
		# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
		# Here's where we read the data from one URL and write it  to #
		# standard output.  If you want to do something else with the #
		# data, you should rewrite this loop:                         #
		# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
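		# For instance, a minimal sketch (hypothetical, not used here) that
		# collects the body into a string instead of printing it:
		#   my $body = '';
		#   $body .= $b while read(U, $b, $bufsiz);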
		if ($outfile && !$outopen) {	# Do we need to open the output?
			print V "$me: Open \"$outfile\"\n" if $V>1;
			if (open(O,">$outfile")) {	# Try to open it for writing.
				print V "$me: Writing \"$outfile\"\n" if $V>1;
			} else {
				print V "$me: Can't write \"$outfile\" [$!]\n" if $V>0;
				$outfile = '';
			}
			$outopen = 1;
		}
		print V "$me: Headers (URLhdr=$URLhdr) ...\n" if $V>1;
hdr:	while ($URLhdr && ($b = <U>)) {	# Read this URL's headers.
			$b =~ s/\s+$//;				# Discard trailing white stuff.
			if ($W3hdrs) {if ($outfile) {print O "$b\n"} else {print "$b\n"}}
			if ($b) {
				if (($httpvrs,$statcode,$statmsg) = ($b =~ /^HTTP\/([\d.]+)\s+(\d+)\s+(.*)/)) {
					print "<!--$me: httpvrs=$httpvrs statcode=$statcode -->\n" if $V>1;
					if ($statcode > $statmax) {$statmax = $statcode; $staterr = $statmsg}
					if ($W302 && ($statcode == 301 || $statcode == 302)) {	# Follow moved URLs.
						print "<!--$me: MOVED -->\n" if $V>1;
						$moved = 1;
					}
					if ($httpvrs eq '1.1' && $statcode >= 400) {
						if ($HTTPversion ne '1.1') {
							print "<!--$me: HTTP/$HTTPversion => HTTP/1.1 404 kludge -->\n" if $W3trace;
							$kludge1_1_404 = 1;
							$HTTPversion = '1.1';
							redo arg;
						} else {
							print "<!--$me: HTTP/1.1 $statcode $statmsg (probably real) -->\n" if $W3trace;
						}
					}
				} elsif ($moved && ($b =~ /^Location:\s*(.*)$/)) {
					print "<!--$me: New URL \"$1\" -->\n" if $V>1;
					$u = $1;
					redo arg;
				}
				if ($statcode >= 400) {
					print "<!--$me: Fail with HTTP err $statcode ($statmsg) \"$u\" -->\n" if $W3trace;
				}
			} else {
				$URLhdr = 0;		# Blank line ends headers.
			}
		}
		if ($statmax >= 400) {	# HTTP error; record it and go on to the next URL.
			$exitstat = 1;
			close(U);
			next arg;
		}
		$/ = undef;				# Slurp mode; the read() loop below doesn't use $/.
		if ($HTTPtimeout > 0) {alarm $HTTPtimeout}
		print V "$me: Headers done.\n" if $V>1;
		if (!$W3data) {			# Data not wanted?
			close(U);			# Close this connection.
			next arg;			# Go on to next URL.
		}
#		if ($W3hdrs) {
#			print V "$me: Writing NL\n" if $V>1;
#			if ($outfile) {print O "\n"} else {print "\n"}
#		}
data:	while ($n = read(U,$b,$bufsiz)) {
			print V "$me: Got $n bytes: \"$b\"\n" if $V>5;
			if (!$W3hdrs && $URLhdr) {	# Suppressing header lines.
				print V "$me: HDR check in \"$b\"\n" if $V>1;
				if ($b =~ s/^.*?\r\n\r\n//s) {	# Non-greedy: strip through the FIRST blank line only.
					$URLhdr = 0;		# Found the \r\n\r\n separator.
				} else {
					next;				# No separator yet; discard this buffer and read on.
				}
			}
			if ($MACfl) {$b =~ s"\r\n?"\n"g}
			if ($outfile) {print O $b} else {print $b}
			if ($HTTPtimeout) {alarm $HTTPtimeout}
		}
		unless (defined $n) {
			print V "\n$me: Error reading \"$u\" (Reason: $!)\n" if $V>0;
			$exitstat = 1;			# $! has the reason; $? is unrelated here.
		}
	} else {
		print V "$me: Can't open \"$u\" ($URLerr)\n" if $V>0;
		$exitstat = 1;
	}
}

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
sub READalarm {					# SIGALRM handler for the -T<n> read timeout.
	my $t = time - $HTTPopentime;	# $HTTPopentime is set in HTTPcon.pm.
	print "<!--$me: ALARM after $t sec -->\n" if $W3trace;
	exit 1;						# Give up; see BUGS about unkillable connect().
}

exit $exitstat;
