#!perl =head1 NAME Htmlcln - Sanitizes HTML and related files before publication =head1 SYNOPSIS perl htmlcln.pl [-t [html|js|css]] [-DVAR[=value] ...] [-v] -o I I =head1 DESCRIPTION The Htmlcln program preprocesses text files such as HTML, JS, or CSS files and cleans them up. =over 8 Comments are removed. Blank lines are removed. Sections marked "debug" are removed in the retail build. =back =head1 OPTIONS =over 8 =item B<-t> [html|js|css] Htmlcln normally tries to guess what kind of file it is processing from the filename extension. You can explicitly override the guess with the B<-t> command line switch. =item B<-D>VAR[=value] ... Command line definitions are supported in the same manner as the C compiler. The only command-line variable we pay attention to is the -DDBG flag, which indicates that this is a debug build. =item B<-o> I Specifies the name of the output file. =item I Specifies the name of the source file. =back =cut use strict qw(vars refs subs); ############################################################################## # # Element - A class that parses HTML elements # # Instance variables: # # raw = the raw text # attr = hash of attributes # tag = name of tag, including slash # # If the value of an attribute hash is undef, it means that the # attribute is present but with no value. # package Element; # # Constructor: $elem = new Element(""); # sub new { my ($class, $raw) = @_; my $attr = { }; my $self = { raw => $raw, attr => $attr }; my $tag; if ($raw =~ s/^<([^\s>]*)//) { $self->{tag} = uc $1; if ($self->{tag} =~ /^[A-Z]/) { $raw =~ s/>$//; for (;;) { if ($raw =~ s/^\s*([-A-Za-z]+)="([^"]*)"// || $raw =~ s/^\s*([-A-Za-z]+)='([^']*)'// || $raw =~ s/^\s*([-A-Za-z]+)=(\S*)//) { $attr->{uc $1} = $2; } elsif ($raw =~ s/^\s*([A-Za-z]+)//) { $attr->{uc $1} = undef; } else { last; } } } } else { warn "Can't parse \"$raw\""; } bless $self, $class; } # # Element::Tag # # Returns the tag. # sub Tag { my $self = shift; $self->{tag}; } # # Element::Attr # # Returns the value of the attribute. # sub Attr { my ($self, $attr) = @_; $self->{attr}{uc $attr}; } # # Element::Exists # # Returns the presence of the attribute. # sub Exists { my ($self, $attr) = @_; exists $self->{attr}{uc $attr}; } ############################################################################## # # Filter base class # # Basic stuff to save people some hassle. # # Per perl tradition, an object is a ref to an anonymous hash where the # state is kept. # # Instance variables: # # sink = reference to filter sink # package Filter; sub new { my($class) = @_; bless { }, $class; } sub SetSink { my ($self, $sink) = @_; $self->{sink} = $sink; } sub Add { my $self = shift; $self->{sink}->Add(@_); } sub Flush { } sub Close { my $self = shift; $self->{sink}->Close(@_); } sub SinkAdd { my $self = shift; $self->{sink}->Add(@_); } ############################################################################## # # TokenFilter filter package # # Does not modify the stream, but merely chops them into tokens, as # recognized by NextToken and processed by EachToken. # # Instance data: # # buf = unprocessed text # package TokenFilter; @TokenFilter::ISA = qw(Filter); # # Append the incoming text to the buffer, then suck out entire tokens. # sub Add { my($self, $text) = @_; my $tok; $self->{buf} .= $text; while ($self->{buf} ne '' && defined($tok = $self->NextToken)) { $self->EachToken($tok); } } sub Flush { my $self = shift; $self->EachToken($self->{buf}); } # # By default, we just sink tokens to the next layer. # sub EachToken { my($self, $tok) = @_; $self->SinkAdd($tok); } ############################################################################## # # LineFilter filter package # # Tokenizer that recognizes lines. # # Instance data: # # buf = unprocessed text # package LineFilter; @LineFilter::ISA = qw(TokenFilter); # # Recognize lines. # sub NextToken { my($self) = shift; if ($self->{buf} =~ s/([^\n]*\n)//) { $1; } else { undef; } } ############################################################################## # # WhitespaceFilter filter package # # Removes blank lines and removes leading and trailing whitespace. # # Someday: Collapse multiple whitespace outside of quotation marks. # package WhitespaceFilter; @WhitespaceFilter::ISA = qw(LineFilter); sub EachToken { my($self, $line) = @_; $line =~ s/^[ \t]+//; $line =~ s/[ \t]+$//; $self->SinkAdd($line) unless $line =~ /^$/; } ############################################################################## # # OutFile filter package # # Writes its output to a file. # # Instance data: # # fh = name of file handle # # package OutFile; @OutFile::ISA = qw(Filter); no strict 'refs'; # Our filename globs aren't very strict # # Custom method: SetOutput. Opens an output file. # my $seq = 0; sub SetOutput { my($self, $file) = @_; $self->{fh} = "OutFile" . $seq++; open($self->{fh}, ">$file") || die "Unable to open $file for writing ($!)\n"; } sub Add { my $self = shift; print { $self->{fh} } @_; } sub Close { my $self = shift; close($self->{fh}); } ############################################################################## # # DebugFilter filter package # # Filters out ;debug and ;begin_debug blocks if building retail. # # Instance data: # # skip = nonzero if we are inside an ignored ;begin_debug block # buf = unprocessed text # package DebugFilter; @DebugFilter::ISA = qw(LineFilter); no strict 'refs'; # Our filename globs aren't very strict # # See if the line contains a debug marker. # If applicable, send the line down the chain. # sub EachToken { my($self, $line) = @_; # ;begin_debug means start skipping if retail if ($line =~ s/;begin_debug//) { $self->{skip} = $::RetailVersion; } # If we were skipping, then ;end_debug ends skipping and we should eat it if ($line =~ s/;end_debug// && $self->{skip}) { $self->{skip} = 0; } elsif ($line =~ s/;debug// && $::RetailVersion) { # A one-shot debug line in retail - skip it } elsif (!$self->{skip}) { $self->SinkAdd($line); # send it down the chain } } ############################################################################## # # CPP filter package # # The CPP filter performs the following operations: # # Removes C and C++-style comments. # # Filters whitespace. # # Instance data: # # buf = unprocessed text # wsf = child WhitespaceFilter # script = current script sink # ultSink = the ultimate sink package CPP; @CPP::ISA = qw(TokenFilter); sub new { my($class) = shift; my $self = new Filter; $self->{wsf} = new WhitespaceFilter; # sink into a whitespace filter $self->{sink} = $self->{wsf}; # initially use this script bless $self, $class; } # # Recognize tokens, which are lines or /* ... */ comments. # sub NextToken { my($self) = shift; if ($self->{buf} =~ s/^([^\/]+)//) { # eat up to a slash $1; } elsif ($self->{buf} =~ s/^\/\/.*?\n//) { # eat // to end of line "\n"; } elsif ($self->{buf} =~ s/^\/\*[^\0]*?\*\///) { # eat /* .. */ ''; } elsif ($self->{buf} =~ s/^(\/)(?=[^\/\*])//) { # eat / not followed by / or * $1; } else { # incomplete fragment - stop undef; } } # # SetSink # # The sink we get is really the whitespace filter's sink, and we sink # into the whitespace filter. # sub SetSink { my ($self, $sink) = @_; $self->{wsf}->SetSink($sink); } ############################################################################## # # # JS - comments are // or /* ... */, invoked via . # sub NextScriptToken { my($self) = shift; if ($self->{buf} =~ s,^(),,i) { $1; } elsif ($self->{buf} =~ s,^(.*?),,i) { $1; } else { my $tok = $self->{buf}; $self->{buf} = ''; $tok; } } # # NextToken # # Returns either an HTML token or a script token. # sub NextToken { my($self) = shift; if (defined $self->{script}) { $self->NextScriptToken(); } else { $self->NextHTMLToken(); } } # # _Redirect - Private method that redirects parsing to a script language. # # $self->_Redirect($scr, $end); # # $scr = script object to hook in # $end = sub that recognizes the end of the script # # sub _Redirect { my ($self, $scr, $end) = @_; $self->{script} = $self->{sink} = $scr; $scr->SetSink($self->{ultSink}); $self->{endScript} = $end; } sub EachToken { my($self, $tok) = @_; if ($tok =~ /^,i }); } elsif ($tok =~ /,i }); } elsif (defined($self->{endScript}) && &{$self->{endScript}}($tok)) { delete $self->{endScript}; $self->{script}->Flush(); delete $self->{script}; $self->{sink} = $self->{wsf}; } $self->SinkAdd($tok); } ############################################################################## # # Main package # package main; # # Set up some defaults. # my $force_type = undef; # do not force file type $::RetailVersion = 1; # not the debugging version my $outfile = undef; # output file not known yet my %VAR = (); # No variables defined yet my $verbose = undef; # not verbose mode ############################################################################## # # CreateTypeFilter - Create a filter for the specified type. # my $types = { html => sub { new HTML }, # HTML htm => sub { new HTML }, htx => sub { new HTML }, js => sub { new JS }, # Javascript jsx => sub { new JS }, css => sub { new CSS }, # Cascading style sheet csx => sub { new CSS }, }; sub CreateTypeFilter { my $sub = $types->{lc shift}; &$sub; } ############################################################################## # # Command line parsing # sub Usage { die "Usage: htmlcln [-t [html|js|css]] [-DVAR[=value]...] [-v] -o outfile infile\n"; } # # AddDefine - Handle a -D command line option. # sub AddDefine { my $line = shift; if ($line =~ /=/) { $VAR{$`} = $'; } else { $VAR{$line} = 1; } } sub ParseCommandLine { # # Scream through the command line arguments. # while ($#ARGV >= 0 && $ARGV[0] =~ /^-(.)(.*)/) { # $1 - command # $2 - optional argument my($cmd, $val) = ($1, $2); shift(@ARGV); if ($cmd eq 't') { $val = shift(@ARGV) if $val eq ''; $force_type = $val; } elsif ($cmd eq 'D') { AddDefine($val); } elsif ($cmd eq 'o') { $val = shift(@ARGV) if $val eq ''; $outfile = $val; } elsif ($cmd eq 'v') { $verbose = 1; } else { Usage(); } } # # What's left should be a filename, and there should be an output file. # my $infile = shift(@ARGV); Usage() unless defined $infile && defined $outfile && $#ARGV == -1; # # If the filetype is not being overridden, then take it from the filename. # if (!defined $force_type) { ($force_type) = $infile =~ /\.(.*)/; } # # Include debug goo only if building DBG=1 and FULL_DEBUG is set in the # environment. # $::RetailVersion = 0 if defined($VAR{"DBG"}) && defined($ENV{"FULL_DEBUG"}); $infile; } ############################################################################## # # File processing # sub ProcessFile { my $infile = shift; # # Create the final sink. # my $sink = new OutFile; $sink->SetOutput($outfile); # # Set up the default filter based on the file type. # my $Type = CreateTypeFilter($force_type); $Type->SetSink($sink); # # Create the DebugFilter which sits at the top of the chain. # my $Filter = new DebugFilter; $Filter->SetSink($Type); # # All the plumbing is ready - start pumping data. # open(I, $infile) || die "Cannot open $infile for reading ($!)\n"; while () { $Filter->Add($_); } $Filter->Flush(); $Filter->Close(); } ############################################################################## # # Main program # { my $infile = ParseCommandLine(); ProcessFile($infile); }