tokuhirom's Blog

HTML::Scrubber, HTML::StripScripts::Parser and HTML::Filter::Callbacks

#!/usr/bin/perl
use strict;
use warnings;
use utf8;
use autodie;
use 5.010000;

use Benchmark ':all';

use HTML::StripScripts;
use HTML::StripScripts::Parser;
use HTML::Scrubber;
use HTML::Filter::Callbacks;

# Report the interpreter and module versions so the timings printed by the
# benchmark can be tied to a concrete environment.
say "Perl $]";
for my $module (qw(
    HTML::StripScripts
    HTML::StripScripts::Parser
    HTML::Filter::Callbacks
    HTML::Scrubber
    HTML::Parser
)) {
    # Load by file path rather than string-eval'ing "use $module": require
    # already dies with a descriptive message when the module is missing,
    # and no code has to be compiled from an interpolated string.
    ( my $file = "$module.pm" ) =~ s{::}{/}g;
    require $file;
    printf "%s %s\n", $module, $module->VERSION;
}
say '';

# Slurp the benchmark input document into $src as decoded text.
my $src = do {
    # No explicit "or die": the file-level "use autodie" makes open throw on
    # failure.  :encoding(UTF-8) validates the bytes while decoding, whereas
    # the bare :utf8 layer accepts malformed input without checking it.
    open my $fh, '<:encoding(UTF-8)', 'src.html';
    local $/;    # undef input record separator => read the whole file at once
    <$fh>;
};

# Candidate 1: HTML::StripScripts::Parser, configured with rules for <br>
# and <a> only.
my $stripper = HTML::StripScripts::Parser->new(
    {
        Context   => 'Document',
        AllowHref => 1,
        Rules     => {
            br => 1,
            a  => {
                # Anchored at the start of the value so the URL itself must
                # begin with http:// or https://.  An unanchored pattern also
                # matches values that merely contain "http://" somewhere
                # (e.g. inside a javascript: URL).  This now agrees with the
                # href checks used for the other two candidates.
                href => qr{^https?://},
                # NOTE(review): presumably rejects every other attribute on
                # <a> -- confirm against the HTML::StripScripts Rules docs.
                '*'  => 0,
            },
        },
    }
);

# Candidate 2: HTML::Scrubber.  Four tags are put on the allow list, then a
# per-tag rule is registered for <a> that constrains its href attribute to
# the given pattern.
my $scrubber = HTML::Scrubber->new(
    allow => [ 'br', 'a', 'html', 'body' ],
);
$scrubber->rules( a => { href => qr{^https?://} } );

# Candidate 3: HTML::Filter::Callbacks.  The filtering policy is expressed as
# per-tag callbacks; each callback receives the tag object as its first
# argument.
my $filter = HTML::Filter::Callbacks->new();
$filter->add_callbacks(
    # Called for both the opening and the closing <script> tag.
    script => {
        start => sub { $_[0]->remove_text_and_tag },
        end   => sub { $_[0]->remove_text_and_tag },
    },

    # <a>: strip attributes whose name starts with "on", and strip href
    # unless its value starts with http:// or https://.
    'a' => {
        start => sub {
            my ($tag) = @_;
            $tag->remove_attr(qr/^on/);
            $tag->remove_attr('href')
                if ( $tag->attr('href') || '' ) !~ m{^https?://};
        },
    },

    # Wildcard entry: strip attributes whose name starts with "on".
    '*' => {
        start => sub { $_[0]->remove_attr(qr/^on/) },
    },
);

# Run every candidate over the same input and print the comparison table.
# Each entry maps the label shown in the table to the code being timed.
my %candidate_for = (
    stripper => sub { $stripper->filter_html($src) },
    scrubber => sub { $scrubber->scrub($src) },
    filter   => sub { $filter->process($src) },
);

# A negative count asks Benchmark to run each candidate for at least that
# many CPU seconds instead of a fixed number of iterations.
cmpthese( -1, \%candidate_for );
Perl 5.014001
HTML::StripScripts 1.05
HTML::StripScripts::Parser 1.03
HTML::Filter::Callbacks 0.07
HTML::Scrubber 0.09
HTML::Parser 3.68

           Rate stripper   filter scrubber
stripper 1675/s       --     -21%     -67%
filter   2124/s      27%       --     -58%
scrubber 5068/s     203%     139%       --

Discussion

  • All of these modules use HTML::Parser internally.
  • HTML::Scrubber is fast.
  • The source code of HTML::Scrubber is very short and readable.
  • HTML::StripScripts is pluggable, but that seems superfluous to me.

I recommend using HTML::Scrubber. If you want full control over HTML rewriting, use HTML::Filter::Callbacks.