#!/usr/bin/perl -Tw #NAME # httpcat - catenate web pages to standard output #SYNOPSIS # httpcat [ URL | path ]... #REQUIRES # perl5 and the following modules, which should be found in the same # directory where you found this program. Change @INC to include whatever # directory you put them into. push @INC,"$ENV{HOME}/sh",'sh','.'; require "Vopt.pm"; #Verbose output. require "HTTPcon.pm"; #Makes HTTP connection, sends GET. require "HTMLdir.pm"; #Produces HTML listing of directory. require "URLopen.pm"; #Parses URL and returns file handle. #DESCRIPTION # This is a web version of the Unix cat(1) command. # Given a list of URLs, this program reads them one at a time, and writes # their contents, catenated into one long string, to standard output. Local # file names may be used instead of URLs. Directories are output in a # simplified HTML format. # If you want to learn how to do this stuff, you can study this program. It # is useful as a starting point for writing other simple web clients. It's # not nearly as difficult as people would like you to believe. But the # socket stuff uses several magical incantations that "you just have to # know"; see the required perl module files for this socket magic. #OPTIONS # The default setup is to deliver only the data (contents) of a URL and # discard the header and tracing information. Here is the list of our # current options. The options may be combined into a single string, as # usual, with the qualification that options which have an arg (O and P) # must be the last in the string. Options may be in any order, and apply to # all subsequent URLs unless canceled by another option. # +B # Binary data; don't alter white space. # -B # Non-binary; reduce trailing white space to single \n char. # +D # Output the data [default]. # -D # Don't output the data. # -E # Don't extract text; deliver the data as-is. (default) # +E # Extract the text. This is a simple conversion from HTML to plain text. # Assorted line-end tags are converted to \n or \n\n as appropriate. # Hyperlinks are converted to , which is recognized by a lot of # non-HTML software, including ABCbot. # +H # Include the HTTP header info in the output. # -H # Don't include the HTTP header info in the output [default]. # -I # Don't send agent identification (default). # +I"agentid" # Send the quoted string as the agent identification. Some web sites won't # talk to you unless you pretend to be an acceptable browser. If there is # no string, we send the following string, which seems to convince most # servers that we're either netscape or IE (which pretends to be # netscape), plus a bogus sytem name to confuse those servers that won't # serve to linux or BSD users. Feel free to use your own AgentID string, # but you should know that there is server software out there that will # refuse service to any client that doesn't call itself "Mozilla". $dflagentid = "Mozilla/4.5 [en] (compatible; I; $me)"; # $dflagentid = "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.0)"; # +M # Macintosh kludge: convert \r to \n (if not followed by \n). This may be # obsolescent, now that we have code to automatically split the input # apart on \r, \n or \r\n. # -O # Write the data to . Default is stdout. # +P # Proxy gateway. If you are hidden behind a proxy, put the proxy hostname # (and optionally :port) in a -P option, and we will try to indirect # through the proxy server. # -P # Proxy exception. The string should be a perl pattern. If a URL # matches this pattern, the proxy gateway isn't used. # -R # Ignore redirects (default). # +R # Follow HTTP "Location:" redirects. # +T # Enable WWW tracing. This sets a global flag that causes various routines # to produce lines of the form: # # These look like both HTTP header lines and HTML comments. Some WWW tools # (such as the "H" html viewer) can show these to tell you which stage of # a GET operation we have reached. # -T # Disable WWW tracing [default]. # -T # Timeout of seconds. The default is no timeout, meaning that the # underlying system's connect() will determine the timeout, if any. # +T # Retry times on dropped connections. This is to handle a failure mode # seen on some servers: The server accepts the TCP connection, and drops # the connection after receiving the GET request, without even sending an # error code. We retry such connections $tries time, with a default of 10, # with 1-second delays between tries. This option selects a different # limit to the number of retries. (See home1.swipnet.se for examples.) # -V # The HTTP version to use. The default is -V1.0. For -V1.1, an extra Host: # header is sent, since some servers require it. # +X # Exit abruptly on timeout. The default is -X, which means to just abandon # the URL. On some systems, there is a bug in the connect() system call # that can result in hanging indefinitely; this option is a last-resort # "solution" that "works" if you are only trying to get one URL. #ENVIRONMENT # We use the following from the environment: # W3PROXY # The name (or address) and an optional :port for a proxy gateway. URLs # that don't match the W3NOPROXY will be fetched indirectly via the # proxy's web server. If not defined, we will attempt direct TCP # connections for all URLs. # W3NOPROXY # A pattern which is applied to URLs, and if they match, no proxy is used. # That is, any URL that matches W3NOPROXY is considered local, and we will # access it directly. If not defined, we will use W3PROXY for all URLs (if # it exists). #LIMITATIONS # So far only the http:// protocol is implemented; ftp://, file:// and # others may appear if I need them. If someone feels like adding FTP code, # you might send me a copy. # HTTP "redirection" (the "Location:" HTTP header) is implemented now via # the +R option. By default, it is disabled and must be handled by the # caller, if desired. This mainly means that if you omit the final '/' on a # directory name, we will fail. This is not considered a bug, so it'll # probably never be fixed. #DEBUGGING # You can use "perl -dw", of course. Or you can do the following: # setenv V_httpcat 5/tmp/httpcat.out # csh or tcsh users. # export V_httpcat=5/tmp/httpcat.out # ksh or bash users. # This will turn on the "print V" lines for $V in the range 0-5, and write # the verbose output to /tmp/httpcat.out. #BUGS # Despite many attempts to detect failure, we still don't optimally handle # all the myriad things that can go wrong. In particular, on some systems, # the connect() system call can hang indefinitely and can't be killed by an # ALARM. There does not appear to be any known solution to this problem. (No # amount of clever code will help if your process doesn't get any cpu time.) #AUTHOR # John Chambers # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # $| = 1; # Don't buffer output. $exitstat = 0; # Set this to get a failure exit status. ($me = $0) =~ s"^.*/""; &Vopt($ENV{"V_$me"} || $ENV{"D_$me"} || $ENV{"T_$me"} || '1'); print V "$me: Started ", `date` if $V>1; $binary = 1; # Binary copy is default #bufsiz = 10; # Small for testing $bufsiz = 10000; # Large for routine use $W302 = 1; # Follow 302 redirects $W3hdrs = 0; # Whether to output header lines $W3data = 1; # Whether to output data $kludge1_1_404 = 0; # HTTP/1.1 GET required # That's all the config stuff you should have to worry about. # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # # Here's our table for converting HTML's symbolic entities to characters. The # value is the decimal code for the character in 8859-1. How to deal with # other character sets is still not very clearly defined. %htmlsym = ( 'lt' => 60, # Less than 'gt', => 62, # Greater than 'quot' => 34, # (Double) quote 'nbsp' => 160, # no-break space = non-breaking space, U+00A0 ISOnum 'iexcl' => 161, # cent sign, U+00A2 ISOnum 'pound' => 163, # currency sign, U+00A4 ISOnum 'yen' => 165, # broken bar = broken vertical bar, 'sect' => 167, # diaeresis = spacing diaeresis, 'copy' => 169, # feminine ordinal indicator, U+00AA ISOnum 'laquo' => 171, # left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum 'not' => 172, # soft hyphen = discretionary hyphen, 'reg' => 174, # registered sign = registered trade mark sign, U+00AE ISOnum 'macr' => 175, # macron = spacing macron = overline = APL overbar, U+00AF ISOdia 'deg' => 176, # plus-minus sign = plus-or-minus sign, 'sup2' => 178, # superscript two = superscript digit two = squared, U+00B2 ISOnum 'sup3' => 179, # superscript three = superscript digit three = cubed, U+00B3 ISOnum 'acute' => 180, # acute accent = spacing acute, U+00B4 ISOdia 'micro' => 181, # pilcrow sign = paragraph sign, 'middot' => 183, # middle dot = Georgian comma = Greek middle dot, U+00B7 ISOnum 'cedil' => 184, # superscript one = superscript digit one, 'ordm' => 186, # masculine ordinal indicator, U+00BA ISOnum 'raquo' => 187, # right-pointing double angle quotation mark = right pointing guillemet, U+00BB ISOnum 'frac14' => 188, # vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum 'frac12' => 189, # vulgar fraction one half = fraction one half, U+00BD ISOnum 'frac34' => 190, # vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum 'iquest' => 191, # inverted question mark = turned question mark, U+00BF ISOnum 'Agrave' => 192, # latin capital letter A with grave = latin capital letter A grave, 'Aacute' => 193, # latin capital letter A with acute, U+00C1 ISOlat1 'Acirc' => 194, # latin capital letter A with circumflex, U+00C2 ISOlat1 'Atilde' => 195, # latin capital letter A with tilde, U+00C3 ISOlat1 'Auml' => 196, # latin capital letter A with diaeresis, U+00C4 ISOlat1 'Aring' => 197, # latin capital letter A with ring above = latin capital letter A ring, 'AElig' => 198, # latin capital letter AE = latin capital ligature AE, 'Ccedil' => 199, # latin capital letter C with cedilla, U+00C7 ISOlat1 'Egrave' => 200, # latin capital letter E with grave, U+00C8 ISOlat1 'Eacute' => 201, # latin capital letter E with acute, U+00C9 ISOlat1 'Ecirc' => 202, # latin capital letter E with circumflex, U+00CA ISOlat1 'Euml' => 203, # latin capital letter E with diaeresis, U+00CB ISOlat1 'Igrave' => 204, # latin capital letter I with grave, U+00CC ISOlat1 'Iacute' => 205, # latin capital letter I with acute, U+00CD ISOlat1 'Icirc' => 206, # latin capital letter I with circumflex, U+00CE ISOlat1 'Iuml' => 207, # latin capital letter I with diaeresis, U+00CF ISOlat1 'ETH' => 208, # latin capital letter N with tilde, 'Ograve' => 210, # latin capital letter O with grave, U+00D2 ISOlat1 'Oacute' => 211, # latin capital letter O with acute, U+00D3 ISOlat1 'Ocirc' => 212, # latin capital letter O with circumflex, U+00D4 ISOlat1 'Otilde' => 213, # latin capital letter O with tilde, U+00D5 ISOlat1 'Ouml' => 214, # latin capital letter O with diaeresis, U+00D6 ISOlat1 'times' => 215, # latin capital letter O with stroke 'Ugrave' => 217, # latin capital letter U with grave, U+00D9 ISOlat1 'Uacute' => 218, # latin capital letter U with acute, U+00DA ISOlat1 'Ucirc' => 219, # latin capital letter U with circumflex, U+00DB ISOlat1 'Uuml' => 220, # latin capital letter U with diaeresis, U+00DC ISOlat1 'Yacute' => 221, # latin capital letter Y with acute, U+00DD ISOlat1 'THORN' => 222, # latin capital letter THORN, U+00DE ISOlat1 'szlig' => 223, # latin small letter sharp s = ess-zed, U+00DF ISOlat1 'agrave' => 224, # latin small letter a with grave = latin small letter a grave, 'aacute' => 225, # latin small letter a with acute, U+00E1 ISOlat1 'acirc' => 226, # latin small letter a with circumflex, U+00E2 ISOlat1 'atilde' => 227, # latin small letter a with tilde, U+00E3 ISOlat1 'auml' => 228, # latin small letter a with diaeresis, U+00E4 ISOlat1 'aring' => 229, # latin small letter a with ring above = latin small letter a ring, 'aelig' => 230, # latin small letter ae = latin small ligature ae, U+00E6 ISOlat1 'ccedil' => 231, # latin small letter c with cedilla, U+00E7 ISOlat1 'egrave' => 232, # latin small letter e with grave, U+00E8 ISOlat1 'eacute' => 233, # latin small letter e with acute, U+00E9 ISOlat1 'ecirc' => 234, # latin small letter e with circumflex, U+00EA ISOlat1 'euml' => 235, # latin small letter e with diaeresis, U+00EB ISOlat1 'igrave' => 236, # latin small letter i with grave, U+00EC ISOlat1 'iacute' => 237, # latin small letter i with acute, U+00ED ISOlat1 'icirc' => 238, # latin small letter i with circumflex, U+00EE ISOlat1 'iuml' => 239, # latin small letter i with diaeresis, U+00EF ISOlat1 'eth' => 240, # latin small letter n with tilde, 'ograve' => 242, # latin small letter o with grave, U+00F2 ISOlat1 'oacute' => 243, # latin small letter o with acute, U+00F3 ISOlat1 'ocirc' => 244, # latin small letter o with circumflex, U+00F4 ISOlat1 'otilde' => 245, # latin small letter o with tilde, U+00F5 ISOlat1 'ouml' => 246, # latin small letter o with diaeresis, U+00F6 ISOlat1 'divide' => 247, # latin small letter o with stroke, 'ugrave' => 249, # latin small letter u with grave, U+00F9 ISOlat1 'uacute' => 250, # latin small letter u with acute, U+00FA ISOlat1 'ucirc' => 251, # latin small letter u with circumflex, U+00FB ISOlat1 'uuml' => 252, # latin small letter u with diaeresis, U+00FC ISOlat1 'yacute' => 253, # latin small letter y with acute, U+00FD ISOlat1 'thorn' => 254, # latin small letter thorn, U+00FE ISOlat1 'yuml' => 255, # latin small letter y with diaeresis, U+00FF ISOlat1 ); # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # arg: for $u (@ARGV) { $moved = 0; $URLerr = "Don't know why"; # Set by URLopen when failures. if (($pfx,$opt) = ($u =~ /^([-+])(.*)/)) { while ($opt) { # Each time MUST remove at least one char. if ($opt =~ s/^B//i) { # +B or -B (binary/nonbinary mode) $binary = ($pfx eq '+') ? 1 : 0; print V ($binary ? "Binary" : "Text"), " data.\n" if $V>1; } elsif ($opt =~ s/^D//i) { # +D or -D (whether to produce data) $W3data = ($pfx eq '+') ? 1 : 0; print V ($W3data ? "Do" : "Don't"), " produce data.\n" if $V>1; } elsif ($opt =~ s/^E//i) { # +E or -E (whether to extract text) $Etext = ($pfx eq '+') ? 1 : 0; $binary = 0; print V ($Etext ? "Do" : "Don't"), " extract text.\n" if $V>1; } elsif ($opt =~ s/^H//i) { # +H or -H (whether to produce headers) $W3hdrs = ($pfx eq '+') ? 1 : 0; print V ($W3hdrs ? "Do" : "Don't"), " produce headers.\n" if $V>1; } elsif ($opt =~ s/^I//i) { # +I or -P if ($pfx eq '-') { $W3agentid = ''; print V "$me: No agent identification." if $V>1; } else { $W3agentid = $opt || $dflagentid; # Rest of string is id. print V "$me: Agent \"$W3agentid\"\n" if $V>1; $opt = ''; } } elsif ($opt =~ s/^M//i) { # +M or -M (Macintosh kludge) $MACfl = ($pfx eq '+') ? 1 : 0; $binary = 0; print V "$me: Mac kludge " . ($MACfl?'on':'off') . ".\n" if $V>1; } elsif ($opt =~ s/^O//i) { # +O or -O (output file) $outfile = $opt; # Rest of string is file name. $opt = ''; print V "$me: Output to \"$outfile\"\n" if $V>1; } elsif ($opt =~ s/^P//i) { # +P or -P if ($pfx eq '-') { $W3nopxy = $opt; # Rest of string is pattern. print V "$me: Proxy exceptions are /$W3nopxy/\n" if $V>1; } else { $W3proxy = $opt; # Rest of string is proxy host. print V "$me: Proxy server is $W3nopxy.\n" if $V>1; } $opt = ''; } elsif ($opt =~ s/^R//i) { # +R or -R (whether to follow 302 redirects) $W302 = ($pfx eq '+') ? 1 : 0; print V ($W302 ? "Do" : "Don't"), " follow 302 redirects.\n" if $V>1; } elsif ($opt =~ s/^T//i) { # +T or -T (WWW tracing) or -T (timeout) if ($opt =~ s/^(\d+)//) { # T with number: if ($pfx eq '+') { $tries = $1; # +T\d+ is limit to tries. print V "tries = $tries.\n" if $V>1; } else { $HTTPtimeout = $1; # -T\d+ is timeout. print V "HTTPtimeout = $HTTPtimeout sec.\n" if $V>1; } } else { # T without number: if ($pfx eq '+') { $W3trace = 1; # +T enables tracing. print V "Do produce WWW tracing.\n" if $V>1; } else { $W3trace = 0; # -T disables tracing. print V "Don't produce WWW tracing.\n" if $V>1; } } } elsif ($opt =~ s/^V//i) { # +V $HTTPvopt = $HTTPversion = $opt; $opt = ''; print V "$me: HTTP version '$HTTPvopt'\n" if $V>1; } elsif ($opt =~ s/^X//i) { # +X $HTTPtimexit = ($pfx eq '+') ? 1 : 0; print V ($W302 ? "Do" : "Don't"), " exit on timeout.\n" if $V>1; } else { print V "$me: unknown option \"$opt\" ignored.\n"; $opt =~ s/.//; # Discard this option char. } } next arg; } $try = $hdrlines = 0; # Count the opens and header lines. $tries = 10 unless $tries; try: while ($try++ < $tries && $hdrlines == 0) { print V "$me: Try $try\n" if $V>0 && $try>1; sleep 1 if $try>1; # Don't hit the server too hard. unless (&URLopen(*U,$u)) { print V "$me: Can't open \"$u\" ($URLerr)\n" if $V>1; $exitstat = 1; next arg; } print V "$me: Opened \"$u\"\n" if $V>1; if ($HTTPtimeout > 0) { alarm $HTTPtimeout; $savsig = $SIG{ALRM}; $SIG{ALRM} = 'READalarm'; print V "$me: Set alarm after $HTTPtimeout sec.\n" if $V>2; } $inTEXT = $URLhdr ? 1 : 0; $inHTML = # Is it HTML? $inPRE = # Within a
...
section? $statmax = 0; # Max status code seen. $Ubuf = # Input buffer. $staterr = ''; # Last error message. # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # # Here's where we read the data from one URL and write it to # # standard output. If you want to do something else with the # # data, you should rewrite this loop: # # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # if ($outfile && !$outopen) { # Do we need to open the output? print V "$me: Open \"$outfile\"\n" if $V>1; if (open(O,">$outfile")) { # Try to open it for writing. print V "$me: Writing \"$outfile\" [$!]\n" if $V>1; } else { print V "$me: Can't write \"$outfile\" [$!]\n" if $V>0; $outfile = ''; } $outopen = 1; } print V "$me: Headers (URLhdr=$URLhdr) ...\n" if $V>1; # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # # In all cases, we must first read through the HTTP headers. We look # # for a few of them, and set global variables to match what we see. # # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # hdr: while ($URLhdr && ($b = &Uline())) { $inTEXT = 1; $b =~ s/\s+$//; # Discard trailing white stuff if ($W3hdrs) {if ($outfile) {print O "$b\n"} else {print "$b\n"}} if (length($b) > 0) { # Anything left in the buffer? ++$hdrlines; if (($httpvrs,$statcode,$statmsg) = ($b =~ /^HTTP\/([\d.]+)\s+(\d+)\s+(.*)/)) { print V "\n" if $V>1; if ($statcode > $statmax) {$statmax = $statcode; $staterr = $statmsg} if ($W302 && $statcode == 302) { print V "\n" if $V>1; $moved = 1; } if ($httpvrs eq '1.1' && $statcode >= 400) { if ($HTTPversion ne '1.1') { print V "\n" if $W3trace; $kludge1_1_404 = 1; $HTTPversion = '1.1'; redo arg; } else { print V "\n" if $W3trace; } } } elsif ($Etext && ($b =~ m"^Content-type:\s*(.*)/(.*)$"i)) { $doctype = lc($1); $subtype = lc($2); print V "\n" if $V>1; if ($doctype eq 'text') { $inTEXT = 1;; $inHTML = ($subtype eq 'html') ? 1 : 0; } else { $inTEXT = $inHTML = 0; } print V "\n" if $V>1; } elsif ($moved && ($b =~ /^Location:\s*(.*)$/)) { print V "\n" if $V>1; $u = $1; redo arg; } if ($statcode >= 400) { print V "\n" if $W3trace; } } else { $URLhdr = 0; # Blank line ends headers } } } # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # # We should now be positioned just after the double CRLF that ends # # HTTP header lines. The rest should be the contents of the file. # # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # if ($statmax >= 400) { # Did we get a fatal error code? exit 1; } $/ = undef if $binary; if ($HTTPtimeout > 0) {alarm $HTTPtimeout} print V "$me: Headers done.\n" if $V>1; if (!$W3data) { # Data not wanted? print V "$me: Data not wanted, quitting.\n" if $V>1; close(U); # Close this connection. next arg; # Go on to next URL. } # if ($W3hdrs) { # print V "$me: Writing NL\n" if $V>1; # if ($outfile) {print O "\n"} else {print "\n"} # } # Now we copy the data, doing any needed processing to each line. data: while ($b = &Uline()) { print V "$me: Got: \"$b\"\n" if $V>5; if (!$W3hdrs && $URLhdr) { # Suppressing header lines. print V "$me: HDR check in \"$b\"\n" if $V>1; if ($b =~ s/^.*\r\n\r\n//s) { $URLhdr = 0; # Found \n\n separator. } else { next; # No separator, discard it all. } } if ($MACfl) {$b =~ s"\r\n?"\n"g} if ($outfile) {print O $b} else {print $b} if ($HTTPtimeout) {alarm $HTTPtimeout} } unless (defined $b) { print V "\n$me: Error reading \"$u\" (Reason: $!)\n" if $V>0; $exitstat = $?; } } exit $exitstat; # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # sub READalarm { my $t = time - $HTTPopentime; print V "\n" if $W3trace; exit -1; } # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # # Convert an HTML tag to its plain-text equivalent. sub Htag { local($tag) = @_; local($a,$t,$u); print V "Htag: tag: $tag\n" if $V>2; if ($tag =~ s"^(/^\w*)\s*(.*)$"$2") { $t = $1; $tag = $2; } else { $t = $tag; } while ($tag) { print V "Htag: Parse \"$tag\"\n" if $V>2; if ($tag =~ s/^(\w+)\s*=\s*"([^>"]*)"\s*//) { print V "Htag: atr1 $1=\"$2\"\n" if $V>2; return "\n" if ($1 eq 'href' || $1 eq 'src'); } elsif ($tag =~ s/^(\w+)\s*=\s*(\w+)"*\s*//) { print V "Htag: atr2 $1=\"$2\"\n" if $V>2; return "\n" if ($1 eq 'href' || $1 eq 'src'); } elsif ($tag =~ s"^(\S+)\s*"") { print V "Htag: Drop \"$1\"\n" if $V>2; } else { $tag =~ s"^(.)""; print V "Htag: Chop \"$1\"\n" if $V>2; } print V "Htag: Left: \"$tag\"\n" if $V>2; } $t = lc($t); # $t =~ s/"\s+/"/g; if ($t eq 'br') {return "\n"} if ($t =~ m"^/*p\b") {return "\n\n"} if ($t =~ m"^/*t[dr]\b") {return "\n"} if ($t =~ m"^/*H\d+\b") {return "\n\n"} if ($t =~ m"^/*[bod]l\b") {return "\n"} if ($t =~ m"^li\b") {return "\n"} if ($t =~ m"^hr\b") {return "\n"} if ($t =~ m"^d[td]\b") {return "\n"} if ($t =~ m"^/*center\b") {return "\n"} if ($t =~ m"^/*blockquote\b") {return "\n"} if ($t eq 'pre') {$inPRE++; return "\n"} if ($t eq '/pre') {$inPRE--; return "\n"} if ($t =~ '^textarea\b') {$inPRE++; return "\n"} if ($t =~ '^/textarea\b') {$inPRE--; return "\n\n"} if ($t =~ m"^/*head\b") {return "\n\n"} if ($t =~ m"^/*body\b") {return "\n\n"} return ''; # Discard all others. } # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # # Return one "line" from the URL file U. Depending on global flags, we may or # # may not trim away trailing white space, rewrite HTML tags, etc. Note that # # while in the HTTP headers ($URLhdr true), we treat the data as text and # # return it line at a time, regardless of settings. When we reach the data, # # we then start looking at how we are to process the data. # # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # sub Uline { local($b,$c,$h,$C,$s,$n); local($F) = 'Uline'; loop: while (1) { if ($binary && !$URLhdr) { $h .= $Ubuf; # Anything left over from last time? $Ubuf = ''; # Use it all up print V "$F HTML \"$h\"\n" if $V>6; return $h if length($h)>0; } elsif ($Etext && $inHTML && !$URLhdr) { print V "$F HTML \"$Ubuf\"\n" if $V>5; if ($h =~ /Notation /) { print V "$F ########\n" if $V>1; } if ($Ubuf =~ s/^([\s\r]+)//) { # White space => space $h .= ($inPRE > 0) ? $1 : ' '; print V "$F HTML \"$h\"\n" if $V>5; } elsif ($Ubuf =~ s/^\&//) { # HTML entity if ($Ubuf =~ s/^(\w+);//) { # HTML symbolic entity $h .= &Hsym($1); } elsif ($Ubuf =~ s/^#(\d+);//) { # One HTML numeric entity $h .= chr($1); } else { $h .= '&'; # False alarm } } elsif ($Ubuf =~ s/^]+)>\s*//) { print V "$F HTML tag \"$1\"\n" if $V>1; $c = &Htag($1); if ($c eq "\n" || $c eq "\n\n") { $C = ($c eq "\n") ? '\n' : ($c eq "\n\n") ? '\n\n' : '###'; print V "$F HTML \"$h\" + $C\n" if $V>2; return $h . $c; } else { $h .= $c; } } else { print V "$F Incomplete tag.\n" if $V>5; } } elsif ($Ubuf =~ s/^([^\s\r<&]+)//) { # Up to white space or < or & $h .= $1; print V "$F HTML \"$h\"\n" if $V>5; } else { $h .= $Ubuf; $Ubuf = ''; print V "$F HTML \"$h\"\n" if $V>5; } print V "$F h=\"$h\"\n" if $V>3; print V "$F HTML Need more ...\n" if $V>5; next loop if $Ubuf; } elsif ($Ubuf =~ s/^([^\r\n]*)([\r\n])//) { $s = $1; $c = $2; $C = ($c eq "\n") ? '\n' : ($c eq "\r") ? '\r' : $c; print V "$F Line $s$C\n" if $V>3; if (($c eq "\r") && $inTEXT) { # DOS/Macintosh kludge print V "$F TEXT map $C => \\n\n" if $V>4; unless ($Ubuf) {if (($n = read(U,$b,$bufsiz)) > 0) {$Ubuf .= $b}} $Ubuf =~ s/^\n//; # remove \r\n $c = "\n"; } return "$s$c"; } $n = read(U,$b,$bufsiz); # Read in the next chunk of data if ($n <= 0) { print V "$F Read failed; n=$n ($!)\n" if $V>1; $staterr = "$!"; # Remember the error message $b = $h . $Ubuf; # Whatever is left $Ubuf = ($n < 0) ? undef : ''; return $b; # Return remaining data } $Ubuf .= $b; print V "$F Ubuf=\"$Ubuf\"\n" if $V>6; print V "$F ================================\n" if $V>6; } } # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # # Here's the routine to convert &foo; to the corresponding 8859-1 character. # # If it's not found, we just return the "&foo;" encoding. Eventually we will # # have to deal with the Unicodization of the unix world. # # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # sub Hsym { local($s) = @_; local($c) = $htmlsym{$s}; return(chr($c)) if defined $c; return("&$s;"); }