HTML::Scrubber, HTML::StripScripts::Parser and HTML::Filter::Callbacks
#!/usr/bin/perl
use strict;
use warnings;
use utf8;
use autodie;
use 5.010000;
use Benchmark ':all';
use HTML::StripScripts;
use HTML::StripScripts::Parser;
use HTML::Scrubber;
use HTML::Filter::Callbacks;

say "Perl $]";
for (qw(
    HTML::StripScripts
    HTML::StripScripts::Parser
    HTML::Filter::Callbacks
    HTML::Scrubber
    HTML::Parser
)) {
    eval "use $_;1;" or die $@;
    printf "%s %s\n", $_, $_->VERSION;
}
say '';

my $src = do {
    open my $fh, '<:utf8', 'src.html';
    local $/;
    <$fh>;
};

my $stripper = HTML::StripScripts::Parser->new(
    {
        Context   => 'Document',
        AllowHref => 1,
        Rules     => {
            br => 1,
            a  => {
                href => qr{(?:https?://)},
                '*'  => 0,
            },
        },
    }
);

my $scrubber = HTML::Scrubber->new(allow => [qw(br a html body)]);
$scrubber->rules(
    'a' => {
        href => qr{^https?://},
    }
);

my $filter = HTML::Filter::Callbacks->new();
$filter->add_callbacks(
    script => {
        start => sub { shift->remove_text_and_tag },
        end   => sub { shift->remove_text_and_tag },
    },
    'a' => {
        start => sub {
            $_[0]->remove_attr(qr/^on/);
            $_[0]->remove_attr('href')
                if ( $_[0]->attr('href') || '' ) !~ m{^https?://};
        },
    },
    '*' => {
        start => sub { shift->remove_attr(qr/^on/) },
    },
);

cmpthese(
    -1 => {
        stripper => sub { $stripper->filter_html($src) },
        scrubber => sub { $scrubber->scrub($src) },
        filter   => sub { $filter->process($src) },
    },
);
Perl 5.014001
HTML::StripScripts 1.05
HTML::StripScripts::Parser 1.03
HTML::Filter::Callbacks 0.07
HTML::Scrubber 0.09
HTML::Parser 3.68

           Rate stripper   filter scrubber
stripper 1675/s       --     -21%     -67%
filter   2124/s      27%       --     -58%
scrubber 5068/s     203%     139%       --
Discussion
- All of these modules use HTML::Parser internally.
- HTML::Scrubber is the fastest of the three in this benchmark.
- The source code of HTML::Scrubber is short and easy to read.
- HTML::StripScripts is pluggable, but that flexibility seems superfluous to me for this kind of task.
I recommend HTML::Scrubber. If you need full control over HTML rewriting, use HTML::Filter::Callbacks.
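For reference, here is a minimal sketch of the recommended approach. The allowed tags, rules, and sample markup are illustrative assumptions, not taken from the benchmark script above.

#!/usr/bin/perl
use strict;
use warnings;
use HTML::Scrubber;

# Allow only <a> and <br>, and keep href attributes only when they
# look like http(s) links (same idea as the rule in the benchmark).
my $scrubber = HTML::Scrubber->new( allow => [qw(a br)] );
$scrubber->rules( a => { href => qr{^https?://} } );

# Hypothetical untrusted input.
my $dirty = q{<b>hi</b> <a href="javascript:alert(1)" onclick="x()">link</a><br>};

# Prints the scrubbed markup: the javascript: href and the onclick
# attribute are dropped because they do not match the rules; tags
# outside the allow list are stripped according to HTML::Scrubber's defaults.
print $scrubber->scrub($dirty), "\n";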