HTML::Scrubber, HTML::StripScripts::Parser and HTML::Filter::Callbacks
#!/usr/bin/perl
use strict;
use warnings;
use utf8;
use autodie;
use 5.010000;
use Benchmark ':all';
use HTML::StripScripts;
use HTML::StripScripts::Parser;
use HTML::Scrubber;
use HTML::Filter::Callbacks;
# Report the interpreter version and the version of every module involved,
# so the benchmark figures can be tied to a specific environment.
say "Perl $]";

my @benchmarked_modules = qw(
    HTML::StripScripts
    HTML::StripScripts::Parser
    HTML::Filter::Callbacks
    HTML::Scrubber
    HTML::Parser
);

for my $module (@benchmarked_modules) {
    # Load the module by name at run time; abort with the loader's own
    # message if that fails.
    eval "use $module;1;" or die $@;
    printf "%s %s\n", $module, $module->VERSION;
}

# Blank line separating the version report from the benchmark table.
say '';
# Slurp the HTML fixture that every filter is benchmarked against.
# autodie (enabled at the top of the file) makes open() throw if src.html
# is missing or unreadable, so no explicit "or die" is needed here.
my $src = do {
    # :encoding(UTF-8) validates the bytes while decoding; the bare :utf8
    # layer only marks the data as UTF-8 and lets malformed sequences through.
    open my $fh, '<:encoding(UTF-8)', 'src.html';
    local $/;    # undef record separator: read the whole file in one go
    <$fh>;
};
# HTML::StripScripts::Parser contender: document context, href attributes
# enabled, and explicit rules for <br> and <a>.
my $stripper = HTML::StripScripts::Parser->new(
    {
        Context   => 'Document',
        AllowHref => 1,
        Rules     => {
            br => 1,
            a  => {
                # Anchored so the value must *start* with http:// or https://,
                # the same rule the scrubber and the callback filter in this
                # script apply. Unanchored, the pattern would match the scheme
                # anywhere inside the attribute value.
                href => qr{^https?://},
                '*'  => 0,    # rule for every other attribute on <a>
            },
        },
    }
);
# HTML::Scrubber contender: allow the listed tags, then attach an href
# pattern rule to <a>.
my @scrubber_allowed_tags = qw(br a html body);
my $scrubber = HTML::Scrubber->new(allow => \@scrubber_allowed_tags);
$scrubber->rules(a => { href => qr{^https?://} });
# HTML::Filter::Callbacks contender, configured through per-tag callbacks.
my $filter = HTML::Filter::Callbacks->new();

# Used for both the start and the end of <script>.
my $drop_tag_and_text = sub {
    my ($tag) = @_;
    $tag->remove_text_and_tag;
};

# Removes every attribute whose name matches /^on/.
my $drop_on_attrs = sub {
    my ($tag) = @_;
    $tag->remove_attr(qr/^on/);
};

# <a>: drop on* attributes, then drop href unless it starts with http(s)://.
my $clean_anchor = sub {
    my ($tag) = @_;
    $tag->remove_attr(qr/^on/);
    my $href = $tag->attr('href') || '';
    $tag->remove_attr('href') if $href !~ m{^https?://};
};

$filter->add_callbacks(
    script => { start => $drop_tag_and_text, end => $drop_tag_and_text },
    a      => { start => $clean_anchor },
    '*'    => { start => $drop_on_attrs },
);
# Run each contender over the same document and print the comparison chart.
# A negative count asks Benchmark for at least that many CPU seconds per
# contender rather than a fixed number of iterations.
my %contenders = (
    stripper => sub { $stripper->filter_html($src) },
    scrubber => sub { $scrubber->scrub($src) },
    filter   => sub { $filter->process($src) },
);
cmpthese(-1, \%contenders);
Perl 5.014001
HTML::StripScripts 1.05
HTML::StripScripts::Parser 1.03
HTML::Filter::Callbacks 0.07
HTML::Scrubber 0.09
HTML::Parser 3.68
Rate stripper filter scrubber
stripper 1675/s -- -21% -67%
filter 2124/s 27% -- -58%
scrubber 5068/s 203% 139% --
Discussion
- All of these modules use HTML::Parser internally.
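- HTML::Scrubber is the fastest of the three: roughly 2.4x the throughput of HTML::Filter::Callbacks and 3x that of HTML::StripScripts::Parser in this run.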
- The source code of HTML::Scrubber is very short and readable.
- HTML::StripScripts is pluggable, but that pluggability seems superfluous to me.
I recommend using HTML::Scrubber. If you want full control over HTML rewriting, use HTML::Filter::Callbacks.
Published: 2011-07-06(Wed) 06:31