WindowsXP-SP1/shell/lib/htmlcln.pl

728 lines
16 KiB
Perl

#!perl
=head1 NAME
Htmlcln - Sanitizes HTML and related files before publication
=head1 SYNOPSIS
perl htmlcln.pl [-t [html|js|css]] [-DVAR[=value] ...] [-v]
-o I<outfile> I<infile>
=head1 DESCRIPTION
The Htmlcln program preprocesses text files such as HTML, JS, or CSS
files and cleans them up.
=over 8
Comments are removed.
Blank lines are removed.
Sections marked "debug" are removed in the retail build.
=back
=head1 OPTIONS
=over 8
=item B<-t> [html|js|css]
Htmlcln normally tries to guess what kind of file it is processing from
the filename extension. You can explicitly override the guess with the
B<-t> command line switch.
=item B<-D>VAR[=value] ...
Command line definitions are supported in the same manner as the C compiler.
The only command-line variable we pay attention to is the -DDBG flag, which
indicates that this is a debug build.
=item B<-o> I<outfile>
Specifies the name of the output file.
=item I<srcfile>
Specifies the name of the source file.
=back
=cut
use strict qw(vars refs subs);
##############################################################################
#
# Element - A class that parses HTML elements
#
# Instance variables:
#
# raw = the raw text
# attr = hash of attributes
# tag = name of tag, including slash
#
# If the value of an attribute hash is undef, it means that the
# attribute is present but with no value.
#
package Element;
#
# Constructor: $elem = new Element("<TABLE BORDER>");
#
sub new {
my ($class, $raw) = @_;
my $attr = { };
my $self = { raw => $raw, attr => $attr };
my $tag;
if ($raw =~ s/^<([^\s>]*)//) {
$self->{tag} = uc $1;
if ($self->{tag} =~ /^[A-Z]/) {
$raw =~ s/>$//;
for (;;) {
if ($raw =~ s/^\s*([-A-Za-z]+)="([^"]*)"// ||
$raw =~ s/^\s*([-A-Za-z]+)='([^']*)'// ||
$raw =~ s/^\s*([-A-Za-z]+)=(\S*)//) {
$attr->{uc $1} = $2;
} elsif ($raw =~ s/^\s*([A-Za-z]+)//) {
$attr->{uc $1} = undef;
} else {
last;
}
}
}
} else {
warn "Can't parse \"$raw\"";
}
bless $self, $class;
}
#
# Element::Tag
#
# Returns the tag.
#
sub Tag {
my $self = shift;
$self->{tag};
}
#
# Element::Attr
#
# Returns the value of the attribute.
#
sub Attr {
my ($self, $attr) = @_;
$self->{attr}{uc $attr};
}
#
# Element::Exists
#
# Returns the presence of the attribute.
#
sub Exists {
my ($self, $attr) = @_;
exists $self->{attr}{uc $attr};
}
##############################################################################
#
# Filter base class
#
# Basic stuff to save people some hassle.
#
# Per perl tradition, an object is a ref to an anonymous hash where the
# state is kept.
#
# Instance variables:
#
# sink = reference to filter sink
#
package Filter;
sub new {
my($class) = @_;
bless { }, $class;
}
sub SetSink {
my ($self, $sink) = @_;
$self->{sink} = $sink;
}
sub Add {
my $self = shift;
$self->{sink}->Add(@_);
}
sub Flush { }
sub Close {
my $self = shift;
$self->{sink}->Close(@_);
}
sub SinkAdd {
my $self = shift;
$self->{sink}->Add(@_);
}
##############################################################################
#
# TokenFilter filter package
#
# Does not modify the stream, but merely chops them into tokens, as
# recognized by NextToken and processed by EachToken.
#
# Instance data:
#
# buf = unprocessed text
#
package TokenFilter;
@TokenFilter::ISA = qw(Filter);
#
# Append the incoming text to the buffer, then suck out entire tokens.
#
sub Add {
my($self, $text) = @_;
my $tok;
$self->{buf} .= $text;
while ($self->{buf} ne '' && defined($tok = $self->NextToken))
{
$self->EachToken($tok);
}
}
sub Flush {
my $self = shift;
$self->EachToken($self->{buf});
}
#
# By default, we just sink tokens to the next layer.
#
sub EachToken {
my($self, $tok) = @_;
$self->SinkAdd($tok);
}
##############################################################################
#
# LineFilter filter package
#
# Tokenizer that recognizes lines.
#
# Instance data:
#
# buf = unprocessed text
#
package LineFilter;
@LineFilter::ISA = qw(TokenFilter);
#
# Recognize lines.
#
sub NextToken {
my($self) = shift;
if ($self->{buf} =~ s/([^\n]*\n)//) {
$1;
} else {
undef;
}
}
##############################################################################
#
# WhitespaceFilter filter package
#
# Removes blank lines and removes leading and trailing whitespace.
#
# Someday: Collapse multiple whitespace outside of quotation marks.
#
package WhitespaceFilter;
@WhitespaceFilter::ISA = qw(LineFilter);
sub EachToken {
my($self, $line) = @_;
$line =~ s/^[ \t]+//;
$line =~ s/[ \t]+$//;
$self->SinkAdd($line) unless $line =~ /^$/;
}
##############################################################################
#
# OutFile filter package
#
# Writes its output to a file.
#
# Instance data:
#
# fh = name of file handle
#
#
package OutFile;
@OutFile::ISA = qw(Filter);
no strict 'refs'; # Our filename globs aren't very strict
#
# Custom method: SetOutput. Opens an output file.
#
my $seq = 0;
sub SetOutput {
my($self, $file) = @_;
$self->{fh} = "OutFile" . $seq++;
open($self->{fh}, ">$file") || die "Unable to open $file for writing ($!)\n";
}
sub Add {
my $self = shift;
print { $self->{fh} } @_;
}
sub Close {
my $self = shift;
close($self->{fh});
}
##############################################################################
#
# DebugFilter filter package
#
# Filters out ;debug and ;begin_debug blocks if building retail.
#
# Instance data:
#
# skip = nonzero if we are inside an ignored ;begin_debug block
# buf = unprocessed text
#
package DebugFilter;
@DebugFilter::ISA = qw(LineFilter);
no strict 'refs'; # Our filename globs aren't very strict
#
# See if the line contains a debug marker.
# If applicable, send the line down the chain.
#
sub EachToken {
my($self, $line) = @_;
# ;begin_debug means start skipping if retail
if ($line =~ s/;begin_debug//) {
$self->{skip} = $::RetailVersion;
}
# If we were skipping, then ;end_debug ends skipping and we should eat it
if ($line =~ s/;end_debug// && $self->{skip}) {
$self->{skip} = 0;
} elsif ($line =~ s/;debug// && $::RetailVersion) {
# A one-shot debug line in retail - skip it
} elsif (!$self->{skip}) {
$self->SinkAdd($line); # send it down the chain
}
}
##############################################################################
#
# CPP filter package
#
# The CPP filter performs the following operations:
#
# Removes C and C++-style comments.
#
# Filters whitespace.
#
# Instance data:
#
# buf = unprocessed text
# wsf = child WhitespaceFilter
# script = current script sink
# ultSink = the ultimate sink
package CPP;
@CPP::ISA = qw(TokenFilter);
sub new {
my($class) = shift;
my $self = new Filter;
$self->{wsf} = new WhitespaceFilter; # sink into a whitespace filter
$self->{sink} = $self->{wsf}; # initially use this script
bless $self, $class;
}
#
# Recognize tokens, which are lines or /* ... */ comments.
#
sub NextToken {
my($self) = shift;
if ($self->{buf} =~ s/^([^\/]+)//) { # eat up to a slash
$1;
} elsif ($self->{buf} =~ s/^\/\/.*?\n//) { # eat // to end of line
"\n";
} elsif ($self->{buf} =~ s/^\/\*[^\0]*?\*\///) { # eat /* .. */
'';
} elsif ($self->{buf} =~ s/^(\/)(?=[^\/\*])//) { # eat / not followed by / or *
$1;
} else { # incomplete fragment - stop
undef;
}
}
#
# SetSink
#
# The sink we get is really the whitespace filter's sink, and we sink
# into the whitespace filter.
#
sub SetSink {
my ($self, $sink) = @_;
$self->{wsf}->SetSink($sink);
}
##############################################################################
#
#
# JS - comments are // or /* ... */, invoked via <SCRIPT>...
# CSS - comments are /* ... */, invoked via <STYLE TYPE="text/css">
#
# They are both just CPP thingies. Both should someday remove whitespace
package JS;
@JS::ISA = qw(CPP);
package CSS;
@CSS::ISA = qw(CPP);
##############################################################################
#
# HTML filter package
#
# The HTML filter performs the following operations:
#
# Send the final output through a whitespace filter.
#
# Remove comments.
#
# Someday it will also...
#
# Recognize embedded stylesheets and scripts and generate a subfilter
# to handle them.
#
# Compress spaces outside quotation marks.
#
# Instance data:
#
# buf = unprocessed text
# wsf = child WhitespaceFilter
# script = current script sink
# endScript = sub that recognizes end of script
# ultSink = the ultimate sink
package HTML;
@HTML::ISA = qw(TokenFilter);
sub new {
my($class) = shift;
my $self = new Filter;
$self->{wsf} = new WhitespaceFilter;
$self->{sink} = $self->{wsf}; # initially use this script
bless $self, $class;
}
#
# SetSink
#
# The sink we get is really the whitespace filter's sink, and we sink
# into the whitespace filter.
#
sub SetSink {
my ($self, $sink) = @_;
$self->{ultSink} = $sink;
$self->{wsf}->SetSink($sink);
}
#
# NextHTMLToken
#
# An HTML token is one of the following:
#
# - A hunk of boring text.
# - A comment (thrown away).
# - A matched <...> thingie.
sub NextHTMLToken {
my($self) = shift;
#
# Any string of non "<" counts as a boring text token.
#
# Be careful not to mistake <!DOCTYPE...> as a comment.
#
if ($self->{buf} =~ s/^([^<]+)//) {
$1;
} elsif ($self->{buf} =~ s/^(<!--[^\0]*?-->)//) { # Eat full comments
'';
} elsif ($self->{buf} =~ s/^(<![^-][^>]*>)//) { # <!DOCTYPE ...>
$1;
} elsif ($self->{buf} =~ s/^(<[^!][^>]*>)//) { # <something else>
$1;
} else { # incomplete fragment - stop
undef;
}
}
#
# NextScriptToken
#
# A script token is anything that isn't the word </SCRIPT>.
#
sub NextScriptToken
{
my($self) = shift;
if ($self->{buf} =~ s,^(</SCRIPT>),,i) {
$1;
} elsif ($self->{buf} =~ s,^(.*?)</SCRIPT>,,i) {
$1;
} else {
my $tok = $self->{buf};
$self->{buf} = '';
$tok;
}
}
#
# NextToken
#
# Returns either an HTML token or a script token.
#
sub NextToken {
my($self) = shift;
if (defined $self->{script}) {
$self->NextScriptToken();
} else {
$self->NextHTMLToken();
}
}
#
# _Redirect - Private method that redirects parsing to a script language.
#
# $self->_Redirect($scr, $end);
#
# $scr = script object to hook in
# $end = sub that recognizes the end of the script
#
#
sub _Redirect {
my ($self, $scr, $end) = @_;
$self->{script} = $self->{sink} = $scr;
$scr->SetSink($self->{ultSink});
$self->{endScript} = $end;
}
sub EachToken {
my($self, $tok) = @_;
if ($tok =~ /^<SCRIPT/i) {
$self->{inScript} = 1; # BUGBUG create a script sink
my $elem = new Element($tok);
my $lang = lc $elem->Attr("LANGUAGE");
my $scr;
# No language implies JScript
if (!defined($lang) || $lang eq 'jscript' || $lang eq 'javascript') {
$scr = new CPP;
} else {
warn "Unknown script language [$lang]";
# Just use the whitespace filter as the unknown script filter
$scr = new WhitespaceFilter;
}
$self->_Redirect($scr, sub { m,^</SCRIPT>,i });
} elsif ($tok =~ /<STYLE/i) {
$self->_Redirect(new CSS, sub { m,^</STYLE>,i });
} elsif (defined($self->{endScript}) && &{$self->{endScript}}($tok)) {
delete $self->{endScript};
$self->{script}->Flush();
delete $self->{script};
$self->{sink} = $self->{wsf};
}
$self->SinkAdd($tok);
}
##############################################################################
#
# Main package
#
package main;
#
# Set up some defaults.
#
my $force_type = undef; # do not force file type
$::RetailVersion = 1; # not the debugging version
my $outfile = undef; # output file not known yet
my %VAR = (); # No variables defined yet
my $verbose = undef; # not verbose mode
##############################################################################
#
# CreateTypeFilter - Create a filter for the specified type.
#
my $types = {
html => sub { new HTML }, # HTML
htm => sub { new HTML },
htx => sub { new HTML },
js => sub { new JS }, # Javascript
jsx => sub { new JS },
css => sub { new CSS }, # Cascading style sheet
csx => sub { new CSS },
};
sub CreateTypeFilter {
my $sub = $types->{lc shift};
&$sub;
}
##############################################################################
#
# Command line parsing
#
sub Usage {
die "Usage: htmlcln [-t [html|js|css]] [-DVAR[=value]...] [-v] -o outfile infile\n";
}
#
# AddDefine - Handle a -D command line option.
#
sub AddDefine {
my $line = shift;
if ($line =~ /=/) {
$VAR{$`} = $';
} else {
$VAR{$line} = 1;
}
}
sub ParseCommandLine {
#
# Scream through the command line arguments.
#
while ($#ARGV >= 0 && $ARGV[0] =~ /^-(.)(.*)/) {
# $1 - command
# $2 - optional argument
my($cmd, $val) = ($1, $2);
shift(@ARGV);
if ($cmd eq 't') {
$val = shift(@ARGV) if $val eq '';
$force_type = $val;
} elsif ($cmd eq 'D') {
AddDefine($val);
} elsif ($cmd eq 'o') {
$val = shift(@ARGV) if $val eq '';
$outfile = $val;
} elsif ($cmd eq 'v') {
$verbose = 1;
} else {
Usage();
}
}
#
# What's left should be a filename, and there should be an output file.
#
my $infile = shift(@ARGV);
Usage() unless defined $infile && defined $outfile && $#ARGV == -1;
#
# If the filetype is not being overridden, then take it from the filename.
#
if (!defined $force_type) {
($force_type) = $infile =~ /\.(.*)/;
}
#
# Include debug goo only if building DBG=1 and FULL_DEBUG is set in the
# environment.
#
$::RetailVersion = 0 if defined($VAR{"DBG"}) && defined($ENV{"FULL_DEBUG"});
$infile;
}
##############################################################################
#
# File processing
#
sub ProcessFile {
my $infile = shift;
#
# Create the final sink.
#
my $sink = new OutFile;
$sink->SetOutput($outfile);
#
# Set up the default filter based on the file type.
#
my $Type = CreateTypeFilter($force_type);
$Type->SetSink($sink);
#
# Create the DebugFilter which sits at the top of the chain.
#
my $Filter = new DebugFilter;
$Filter->SetSink($Type);
#
# All the plumbing is ready - start pumping data.
#
open(I, $infile) || die "Cannot open $infile for reading ($!)\n";
while (<I>) {
$Filter->Add($_);
}
$Filter->Flush();
$Filter->Close();
}
##############################################################################
#
# Main program
#
{
my $infile = ParseCommandLine();
ProcessFile($infile);
}