#!/usr/local/bin/perl -w # # nobanner # # A trivial web proxy written in perl. # # This proxy is passed a filename which contains a bunch # of perl regular expressions which allow for substitutions # and deletions of text in html files passing through the # proxy. This is an expensive way to do anything; far more # expensive then just rewriting URLs. However, this allows # more of the offending garbage to be removed. # # Written by: # Dave Regan # regan@peak.org # 24 August 1997 # http://mordred.ao.com/nobanner/ # # No copyright claimed. Do what you want to with this program. # # # Things to do: # ------------- # # The log file potentially has many different writers. This # should really be done through syslog. However, there may be # installations where that really isn't appropriate either. # # There are a number of CGI and web modules available for making # this work easier and more reliable. I did this as a training # excercise, and the standard routines were straightforward, so # I just did the work that way. # # This doesn't support a bidirectional dialog. I don't think # that is important though I really need to read the appropriate RFC. # # This doesn't proxy any of the secure server stuff. That bypasses # this code entirely, which is probably just as well. # # This probably won't handle multipart form-data correctly. # # It would be interesting to use the shared memory routines # for keeping a pool of counters for each of the rules fired. # Use this to replace the file oriented counting. # ### ### Configuration ### use Getopt::Long; use Socket; $Version = 'nobanner v0.09 regan@ao.com'; $Home = "http://mordred.ao.com/nobanner/"; # The source # Option table %Args = ( "port" => 5244, # Port to answer on "log" => "/dev/null", # Where to log data to "logdir" => "", # Where to log rule info "rules" => "./nobanner.rules", # Name of file with rules ); # Network constants $AF_INET = 2; $SOCK_STREAM = 1; $Sockaddr = "S n a4 x8"; ### ### Main program ### # Various initialization # print "$Version\n"; $StartTime = time(); GetOptions(\%Args, "port=i", "rules=s", "log=s", "logdir=s", "proxy=s"); InitializePublishedPort(); mkdir($Args{'logdir'}, 0755) if ($Args{'logdir'} ne ""); ReadRules(); $SIG{"CHLD"} = sub { wait }; $SIG{"HUP"} = "IGNORE"; # Servers started by hand. print "Cannot open $Args{'log'}. We will run, but expect errors.\n" if (!open(LOG, ">>$Args{'log'}")); print LOG "Operation started at ", scalar(localtime($StartTime)), "\n"; select(LOG); $| = 1; select(STDOUT); $Nrequests = 0; $SIG{'INT'} = ReadRules(); # Main loop for ($count = 0; ; $count++) { $addr = accept(NS, S); if (!$addr) { print LOG "Accept failed. This shouldn't happen: $!\n"; sleep(4); next; } select(NS); $| = 1; select(STDOUT); $Nrequests++; if (($child = fork()) == 0) { ProcessRequest($addr); close LOG; exit 0; } elsif ($child == -1) { print LOG "Cannot fork. Looks grim.\n"; Error(undef, "Proxy server cannot fork. Try back later"); } my($af, $port, $inetaddr); ($af, $port, $inetaddr) = unpack($Sockaddr, $addr); $References{$inetaddr} = 0 if (!defined($References{$inetaddr})); $References{$inetaddr}++; # my($af, $port, $inetaddr, $name, $aliases, $addrtype, $length, $addrs); # ($af, $port, $inetaddr) = unpack($Sockaddr, $addr); # print "Inetaddr is $inetaddr\n"; # ($name, $aliases, $addrtype, $length, @addrs) = # gethostbyaddr($inetaddr, $AF_INET); #print "Name is $name\n", $name; close NS; # Every now and then spin around and pick up zombies. # This isn't the right way to do things. if ($count > 100) { $count = 0; while (wait() != -1) { # Reap all of the zombies } } # If the Rules file has changed contents, read in the new table. ReadRules() if ($RulesReadAt != (stat($Args{'rules'}))[9]); } ### ### Error ### ### Print an error message to all concerned, and potentially exit. ### sub Error { my($error, $msg) = @_; $msg .= $error if (defined($error)); print LOG "$msg\n"; print NS << "EOF"; HTTP/1.0 404 File Not Found Content-type: text/html 404 Proxy error

Proxy error

The proxy server you are using is having trouble and reports:
	$msg
Sometime this means that there is a configuration error, often it means that the next host processing this request had problems. EOF die $msg if (defined($error)); } ### ### GiveStatusMsg ### ### Print a status message indicating whatever there is to tell. ### sub GiveStatusMsg { my($addr, @addr, $count, $key, $name, $start); $start = scalar(localtime($StartTime)); print NS << "EOF"; HTTP/1.0 200 OK Content-type: text/html nobanner status page

nobanner status page

The nobanner proxy server you are using reports the following information.


The arguments that the program was started with is:


The sites which have referenced this proxy server are:
Site namecount EOF for $addr (sort keys(%References)) { $name = gethostbyaddr($addr, $AF_INET); if (!defined($name)) { @addr = unpack("C4", $addr); $name = "$addr[0].$addr[1].$addr[2].$addr[3]"; } print NS "
$name$References{$addr}\n"; } $timestamp = scalar(localtime($RulesReadAt)); print NS << "EOF";


The rules (last changed at $timestamp) are:
matchespatternreplacement EOF for ($count = 0; $count <= $#Pattern; $count++) { print NS "
"; print NS (stat("$Args{'logdir'}/$count"))[7] if ($Args{'logdir'} ne ""); print NS "", HTMLsafe($Pattern[$count]), "", HTMLsafe($Replace[$count]), "\n"; } print NS << "EOF";

For more information on this proxy server, see: $Home. EOF } ### ### HTMLsafe ### ### Make a string safe to print in HTML land. ### sub HTMLsafe { my($text) = @_; $text =~ s/\&/&/g; $text =~ s//>/g; return $text; } ### ### InitializePublishedPort ### ### Make our presence felt. ### ### When we specify our address we risk picking the wrong interface. ### Far better to choose 0 and have one picked for us. ### sub InitializePublishedPort { my($aliases, $len, $name, $server, $this, $type); # If we are going to talk to a proxy server, find out about it now. if (defined($Args{'proxy'})) { ($ProxyHost = $Args{'proxy'}) =~ s/:.*//; ($ProxyPort = $Args{'proxy'}) =~ s/.*://; } # We will need our address later # chomp($server = `hostname`); # ($name, $aliases, $type, $len, $ServerAddress) = gethostbyname($server); # $ServerInfo = pack($Sockaddr, $AF_INET, 0, $ServerAddress); # $ServerInfo = pack($Sockaddr, $AF_INET, 0, "\0\0\0\0"); #@data = unpack('C16', $ServerInfo); #print "Connection control block is @data\n"; # Make our connection out ($name, $aliases, $TcpProto) = getprotobyname('tcp'); $this = pack($Sockaddr, $AF_INET, $Args{'port'}, "\0\0\0\0"); # Unbuffered I/O select(NS); $| = 1; select(STDOUT); # The rest of the magic socket(S, $AF_INET, $SOCK_STREAM, $TcpProto) || die "socket: $!"; setsockopt(S, SOL_SOCKET, SO_REUSEADDR, pack("l", 1)) || die "setsockopt: $!"; bind(S, $this) || die "bind: $!"; listen(S, 5) || die "connect: $!"; select(S); $| = 1; select(STDOUT); } ### ### ProcessRequest ### ### Process a request from a client. ### ### Read the header, which is everything up to two blank lines. ### Then take that header and echo it through to the real server. ### ### Remember that each request is handled by a separate process, ### which makes it hard for state information to be passed between ### childred. ### sub ProcessRequest { my($addr) = @_; my($af, $port, $inetaddr, @inetaddr); my($count, $data, $get, $header, $hits, $host, $ishtml); my($name, $aliases, $type, $len, $length, $realaddr, $realinfo); my($c2); # print "child got request\n"; ($af, $port, $inetaddr) = unpack($Sockaddr, $addr); @inetaddr = unpack('C4', $inetaddr); print LOG scalar(localtime), " connect: $af, $port, @inetaddr\n"; $header = ""; $host = ""; $get = ""; $length = -1; while () { $header .= "$_"; $host = $_ if (/^Host: /); $get = $_ if (/^GET/); $length = $1 if (/Content-Length:\s*(\d+)/); print LOG "client: $_"; last if (/^[\r\n]+$/m); } # # Read the data portion. # # This is real ugly. I'd have liked to read until EOF, but it appears # that the channel isn't closed down until far later. Thus we need # to know how much data there is to read. GET has no data. POST has # at least one line. This is the normal case. However, it is possible # for a POST to have *lots* of data for file transfers. This means # having to decode the multipart headers if the first line indicates # this mode. # $data = ""; if ($get eq "") { if ($length != -1) { # Handle the case where the length is specified. my($readdata); print LOG "Read $length bytes\n"; read(NS, $readdata, $length); $data .= $readdata; print LOG "client data: $readdata\n"; } else { while () { print LOG "client data: $_\n"; $data .= $_; last; # This needs to be more sophisticated. } } } print LOG $get if ($get ne ""); if ($get =~ m#/nobanner_status.html#) { GiveStatusMsg(); return; } # # Parse the hostname into a hostname and port number. # Make a connection to the real server. # if (defined($ProxyHost)) { $host = $ProxyHost; $port = $ProxyPort; } else { if ($host eq "") { Error(undef, "There was no Host specified in the header."); return; } chomp $host; $host =~ s/[\r\n]+//m; $host =~ s/Host: //; $port = 80; $port = $1 if ($host =~ /:(\d+)/); $host =~ s/:.*//; } if ($host =~ /(\d+)\.(\d+)\.(\d+)\.(\d+)/) { $ip[0] = $1; $ip[1] = $2; $ip[2] = $3; $ip[3] = $4; $realaddr = pack('C4', @ip); } else { ($name, $aliases, $type, $len, $realaddr) = gethostbyname($host); } if (!defined($realaddr)) { Error(undef, "No DNS entry"); exit 0; } $realinfo = pack($Sockaddr, $AF_INET, $port, $realaddr); #@inetaddr = unpack('C4', $realaddr); #print "Connect to host \"$host\" port \"$port\" at addr @inetaddr\n"; #@inetaddr = unpack('C4', $ServerAddress); #print "Connect from addr @inetaddr\n"; socket(S2, $AF_INET, $SOCK_STREAM, $TcpProto) || Error($!, "Socket: "); setsockopt(S2, SOL_SOCKET, SO_REUSEADDR, pack("l", 1)) || Error($!, "Setsockopt: "); # bind(S2, $ServerInfo) || Error($!, "bind: "); # bind(S2, INADDR_ANY) || Error($!, "bind: "); connect(S2, $realinfo) || Error($!, "connect: "); select(S2); $| = 1; select(STDOUT); # # Write out what we saw as the header information # If we are writing to another proxy server, send everything # unchanged. However, if we are talking to the ultimate # site, remove the http://site.top.level as well as the # Proxy-Connection line. # if (!defined($ProxyHost)) { # print "Convert $header\n"; $header =~ s/Proxy-Connection: .*[\r\n]*//gmi; $header =~ s#((GET|POST)\s+)http://[^/]*#$1#; # print "to $header\n"; } # print LOG "Msg to client is $header$data\n"; print S2 "$header$data"; # # Get the header information from the server. # $header = ""; for ($ishtml = 0; ; ) { $ishtml = 1 if (/Content-Type: text\/html/i); $header .= $_; # print NS $_; print LOG "server: $_"; last if (/^[\r\n]+$/m); } # # Transmit the rest of the information back. # If the header indicates that the type is text/html, then # we will parse the HTML, otherwise it goes back unchanged. # Read the *entire* HTML document into memory. This is sort # of wasteful, but is easy to do. # $hits = 0; if ($ishtml) { $data = ""; while () { $data .= $_; } # print "Process the body $data\n"; # print "Process $#Pattern rules\n"; for ($count = 0; $count <= $#Pattern; $count++) { ### print "/$Pattern[$count]/$Replace[$count]/ ", $data =~ /$Pattern[$count]/, "\n"; $c2 = ($data =~ s/$Pattern[$count]/$Replace[$count]/isgm); if ($c2 > 0) { $hits += $c2; if ($Args{'logdir'} ne "") { if (open(COUNTER, ">>$Args{'logdir'}/$count")) { flock(COUNTER, 2); seek(COUNTER, 0, 2); print COUNTER " "; close COUNTER; } } } } $header =~ s/Content-Length: \d+[\s\r]*\n?//gm; print LOG "server data:\n\n$header$data\n\n"; print LOG "There were $hits alterations\n"; print NS "$header\n"; print NS $data; } else { print NS $header; while () { print NS $_; } } close NS; close S2; } ### ### ReadRules ### ### Read a set of rules. ### ### This is a series of lines which are either comments ### (lines starting with # or blank), or rules. The rules ### are of the form: ### /pattern/replacement/ ### The first character is the delimiter. Unfortunately, you ### cannot escape whatever is chosen for a delimiter. ### No replacement portion is needed. ### Each rule is assumed to run in a "sgm" environment. ### sub ReadRules { my($delim, @pieces); if (!open(RULE, "<$Args{'rules'}")) { print "Cannot open $Args{'rules'}\n"; return; } undef @Pattern; undef @Replace; $RulesReadAt = (stat($Args{'rules'}))[9]; while () { chomp; next if (/^\s*$/ || /^\s*#/); $delim = substr($_, 0, 1); @pieces = split($delim, $_); $pieces[1] = "" if (!defined($pieces[1])); $pieces[2] = "" if (!defined($pieces[2])); push(@Pattern, $pieces[1]); push(@Replace, $pieces[2]); } close RULE; }