#! /usr/local/bin/perl # # @(#)url_get.pl 1.9 14 Apr 1995 # @(#)url_get.pl 1.9 /home/zippy/lib/perl/url_get/SCCS/s.url_get.pl # # url_get.pl --- get a document given a WWW URL # # Modified by Jack Lund 7/19/94 to add functionality and deal with HTTP # 1.0 headers # # Hacked by Stephane Bortzmeyer to add "ftp" URLs. # 22 Jan 1994 # # Jack Lund 9/3/93 zippy@ccwf.cc.utexas.edu # # from hget by: # Oscar Nierstrasz 26/8/93 oscar@cui.unige.ch # # Syntax: # # &url_get($url, [$file]) # # $url - URL of document you want # # $file - optional file you want it put into. Specify "&STDOUT" if you # want it to go to stdout; Leave this off if you want url_get to # return the document as one (possibly VERY LARGE) string ######################################################################## $home = $ENV{"HOME"}; require "/data1/WWW/Tropical/Bin/URL.pl"; require "/data1/WWW/Tropical/Bin/ftplib.pl"; sub url_get { local($url, $file) = @_; local($loseheader) = ($opt_h ? 0 : 1); local($debug) = ($opt_d ? 1 : 0); local($binary) = ($opt_b ? 1 : 0); ($protocol, $host, $port, $rest1, $rest2, $rest3) = &url'parse_url($url); # Convert any characters in the string specified in hex by "%xx" to # the correct character. Note we do this *after* parsing the URL! $rest1 =~ s/%(\w\w)/sprintf("%c", hex($1))/ge; if ($protocol eq "http") { return &url_get'http_get($host,$port,$rest1,$loseheader,$debug,$file); } if ($protocol eq "gopher") { # Convert from hex. See above. $rest2 =~ s/%(\w\w)/sprintf("%c", hex($1))/ge if ($rest2); $rest3 =~ s/%(\w\w)/sprintf("%c", hex($1))/ge if ($rest3); return &url_get'gopher_get($host, $port, $rest1, $rest2, $rest3, $file); } if ($protocol eq "file" || $protocol eq "ftp") { return &url_get'file_get($host, $port, $rest1, $file, $binary, $debug); } if ($protocol eq "news") { return &url_get'news_get($host, $port, $rest1, $file); } die "Protocol $protocol not supported!\n"; } package url_get; # Everything after this is "private" if ($] >= 5.0) { eval 'use Socket'; } else { eval 'require "sys/socket.ph"'; } 1; sub http_get { local($host,$port,$request,$loseheader,$debug,$file) = @_; local($output) = ""; # Status code translation table. Key is HTTP status code (from # http://info.cern.ch/hypertext/WWW/Protocols/HTTP/HTRESP.html), # and value is status returned by url_get. %exit_status = (400,1,401,2,402,3,403,4,404,5,500,6,501,7,502,8,503,9); if ($file) { open(OUT, ">$file") || die "Error opening output file $file: $!\n"; } $ret = &url_get'open($host, $port); if (!defined($ret)) { if ($! && $! != "") { die "Error opening port $port on $host: $!\n"; } else { die "Host not found: $host\n"; } } print CMD "GET $request HTTP/1.0\r\n\r\n"; $_ = ; if (m#^HTTP/1.0 (\d\d\d) (.+)$#) { $status = $1; $reason = $2; if (! $debug && $status > 299) { warn "Error returned from server: $status $reason\n"; return $exit_status{$status}; } if ($debug) { warn "$_"; while () { last if (/^\s*$/); warn "$_"; } } elsif ($loseheader) { while () { last if (/^\s*$/); if (! /^[a-zA-Z\-]+: /) { warn "Bad MIME header line: $_"; } } } else { if ($file) { print OUT $_; } else { $output .= $_; } } } else { if ($file) { print OUT $_; } else { $output .= $_; } } while () { if ($file) { print OUT $_; } else { $output .= $_; } } close(CMD); close(OUT) if ($file); return($output) unless ($file); } sub gopher_get { local($host,$port,$gtype,$selector,$search,$file) = @_; local($bintypes) = "59sgI"; # Binary gopher types local($goodtypes) = "01579sghI"; # types we can handle local($output) = ""; if ($file) { open(OUT, ">$file") || die "Error opening output file $file: $!\n"; } $request = ($search ? "$selector\t$search\t\$" : $selector); &url_get'open($host, $port) || die "Error opening port $port on $host: $!\n"; print CMD "$request\n"; if (index($goodtypes, $gtype) == -1) { die "Can't retrieve gopher type $gtype\n"; } # If this is a binary document, retreive it using sysreads rather # than if (index($bintypes, $gtype) > -1) { $done = 0; $rmask = ""; vec($rmask,fileno(CMD),1) = 1; do { ($nfound, $rmask) = select($rmask, undef, undef, $timeout); if ($nfound) { $nread = sysread(CMD, $thisbuf, 1024); if ($nread > 0) { $output .= $thisbuf; if ($file) { syswrite(OUT, $thisbuf, $nread) || die "Syswrite: $!\n"; } else { $output .= $thisbuf; } } else { $done++; } } else { warn "Timeout\n"; $done++; } } until $done; } # This is an ASCII document, and we can get it line-by-line using else { while () { last if (/^\.\r\n$/); chop; chop; if ($file) { print OUT "$_\n"; } else { $output .= "$_\n"; } } } close(CMD); close(OUT) if ($file); return($output) unless ($file); } sub file_get { local($host, $port, $path, $file, $bin_xfer, $debug) = @_; local($error); local($output) = ""; if ($host eq "localhost" && !defined($port)) { open(IN, $path) || die "$path: $!\n"; $binary = ((-B $path) ? 1 : 0); warn "binary = $binary\n"; if ($file) { open(OUT, ">$file") || die "Error opening output file $file: $!\n"; } if ($binary) { $done = 0; $rmask = ""; vec($rmask,fileno(CMD),1) = 1; do { ($nfound, $rmask) = select($rmask, undef, undef, $timeout); if ($nfound) { $nread = sysread(CMD, $thisbuf, 1024); if ($nread > 0) { if ($file) { syswrite(OUT, $thisbuf, $nread) || die "Syswrite: $!\n"; } else { $output .= $thisbuf; } } else { $done++; } } else { warn "Timeout\n"; $done++; } } until $done; } else { while () { if ($file) { print OUT "$_"; } else { $output .= "$_"; } } } close(IN); close(OUT) if ($file); } else { &ftp'open($host) || die "Unable to open ftp connection to $host: $ftp'ftp_error\n"; warn "$ftp'ftp_matched" if ($debug); if ($bin_xfer && ! &ftp'type("I")) { $error=&ftp'error; die "$error\n"; } warn "$ftp'ftp_matched" if ($debug && $bin_xfer); if ($file) { &ftp'get($path, $file) || die "Unable to get file $path from $host: $ftp'ftp_error\n"; } else { $output = &ftp'gets($path) || die "Unable to get file $path from $host: $ftp'ftp_error\n"; } warn "$ftp'ftp_matched" if ($debug); &ftp'close; } return($output) unless ($file); } sub news_get { local($host, $port, $article) = @_; local($output) = ""; if ($file) { open(OUT, ">$file") || die "Error opening output file $file: $!\n"; } &url_get'open($host, $port) || die "Error opening port $port on $host: $!\n"; if ($article =~ /^[^<].+@.+[^>]$/) { $request = "article <$article>"; } elsif ($article =~ /^<.+@.+>$/) { $request = "article $article"; } elsif ($article =~ /^\*$/) { die "Only support URLs of the form: news:article\n"; } elsif ($article) { die "Only support URLs of the form: news:article\n"; } else { die "Bad url\n"; } # Read NNTP Connect message $string = ; $string =~ /^(\d*) (.*)$/; die "NNTP Error: $2\n" unless ($1 eq '200'); # Send request print CMD "$request\r\n"; # Read reply message $string = ; $string =~ /^(\d*) (.*)$/; die "NNTP Error: $2\n" unless ($1 eq '220'); # Get article while () { last if (/^\.\r\n$/); chop; chop; if ($file) { print OUT "$_\n"; } else { $output .= "$_\n"; } } print CMD "quit\n"; close(CMD); close(OUT) if ($file); return($output) unless ($file); } sub open { local($Host, $Port) = @_; local($destaddr, $destproc); # Set the socket parameters. Note that we set the defaults to be the # BSD values if we can't get them from the required files. Also note # that, in the 4.0 version, the routines are in package ftp, since # it does the "require sys/socket.ph" first. if ($] < 5.0) { ($Inet = &ftp'AF_INET) || ($Inet=2); ($Stream = &ftp'SOCK_STREAM) || ($Stream=1); } else { ($Inet = &AF_INET) || ($Inet=2); ($Stream = &SOCK_STREAM) || ($Stream=1); } if ($Host =~ /^(\d+)+\.(\d+)\.(\d+)\.(\d+)$/) { $destaddr = pack('C4', $1, $2, $3, $4); } else { local(@temp) = gethostbyname($Host); unless (@temp) { $Error = "Can't get IP address of $Host"; return undef; } $destaddr = $temp[4]; } $Proto = (getprotobyname("tcp"))[2]; $Sockaddr = 'S n a4 x8'; $destproc = pack($Sockaddr, $Inet, $Port, $destaddr); if (socket(CMD, $Inet, $Stream, $Proto)) { if (connect(CMD, $destproc)) { ### This info will be used by future data connections ### $Cmdaddr = (unpack ($Sockaddr, getsockname(CMD)))[2]; $Cmdname = pack($Sockaddr, $Inet, 0, $Cmdaddr); select((select(CMD), $| = 1)[$[]); return 1; } } close(CMD); return undef; }