tokuhirom's Blog

かわいい分類

http://d.hatena.ne.jp/echizen_tm/20110721/1311253494

自分で手をうごかしてみないとよくわからないタイプな僕は、簡単分類ツール futaba の動作がなんかむずかしかったので、ソースを1ファイルにまとめてみて、実際の挙動を理解することにつとめたのであった。

この手のツールは、なんかこうデータの入出力の部分がやたらおおくて煩雑なので、オンメモリで適当にやる版をかくことで理解の助けになる気がした。

#!/usr/bin/env perl
use strict;
use warnings;
use utf8;
use 5.010000;
use autodie;
use Data::Dumper;

# Toy training corpus, kept fully in memory: each key is
# "<class>:<document-name>" and each value maps a feature (here R/G/B
# channels) to a raw score for that document.
my %data = (
  # feature_key       { class_key => score, ... }
    'warm:red'     => { R => 255, G => 0,   B => 0 },
    'cold:green'   => { R => 0,   G => 255, B => 0 },
    'cold:blue'    => { R => 0,   G => 0,   B => 255 },
    'cold:cyan'    => { R => 0,   G => 255, B => 255 },
    'warm:magenta' => { R => 255, G => 0,   B => 255 },
    'warm:yellow'  => { R => 255, G => 255, B => 0 },
);

# Plain call syntax: '&main;' would bypass prototypes and silently pass
# the current @_ to main(); 'main()' has no such surprises.
main(); exit;

# Entry point: build the tf-idf table from %data, dump it for inspection,
# then run every training document back through the classifier and print
# its per-class ranking.
sub main {
    local $Data::Dumper::Terse = 1;    # drop the '$VAR1 =' prefix

    my $tfidf = make_tfidf(%data);
    warn Dumper($tfidf);

    say("\n\n== Result ==");

    # Compact one-line dumps for the result listing.
    local $Data::Dumper::Indent = 0;
    foreach my $key (keys %data) {
        my $ranking = predict_query($key, $data{$key}, $tfidf);
        print Dumper([ $key, $ranking ]), "\n";
    }
}

# NOTE(review): unimplemented stub — never called anywhere in this file.
# As written it implicitly returns the value of the list assignment
# (i.e. ($queries, $tfidf)); callers should not rely on that.
sub predict {
    my ($queries, $tfidf) = @_;
}

# Score a single query against the tf-idf table.
#
#   $query - label used only by the caller for display; not used in scoring
#   $words - hashref: feature word => occurrence count in the query
#   $tfidf - hashref: feature word => { class => normalized tf-idf }
#
# Returns an arrayref of [score, class] pairs, best score first.
sub predict_query {
    my ($query, $words, $tfidf) = @_;

    # Per-class accumulator: sum of (tf-idf of word in class) * (count in query).
    my %score_of;
    for my $word (keys %$words) {
        my $count = $words->{$word};
        # (The original futaba pulls this lookup out of a CSA index.)
        for my $class (keys %{ $tfidf->{$word} }) {
            $score_of{$class} += $tfidf->{$word}{$class} * $count;
        }
    }

    # Rank: pair up [score, class], sort ascending by score, then flip
    # so the highest-scoring class comes first.
    my @ranked = map { [ $score_of{$_}, $_ ] } keys %score_of;
    @ranked = reverse sort { $a->[0] <=> $b->[0] } @ranked;
    return \@ranked;
}

# Ported from futaba_make_tfidf.pl.
#
# Build a normalized tf-idf table from the training data.
#
#   @_      - flat list: "class:doc" key => { word => raw score } hashref
#   returns - hashref: word => { class => tf-idf, L2-normalized per class
#                                and truncated to 4 decimal places }
sub make_tfidf {
    my %data = @_;

    # Pass 1: collect term frequency per class, document frequency, and N.
    my %tf;    # $tf{$word}{$class} = summed raw score of $word within $class
    my %df;    # $df{$word}         = number of documents containing $word
    my $N = 1; # document counter; starts at 1, so N ends up docs + 1,
               # keeping idf > 0 even for words present in every document.
               # NOTE(review): assumed to be intentional smoothing — confirm.
    for my $key (keys %data) {
        (my $class_key = $key) =~ s/:.+//;    # 'warm:red' -> 'warm'

        for my $word (keys %{$data{$key}}) {
            my $score = $data{$key}->{$word};
            $tf{$word}->{$class_key} += $score;
            $df{$word}++;
        }
        $N++;
    }

    # Pass 2: tf-idf = tf * log(N/df), rewriting %tf in place while
    # accumulating the squared L2 norm of each class vector.
    my $log_N = log($N);
    my %class_norm;
    for my $feature_key (keys %tf) {
        my $fr = $tf{$feature_key};
        my $log_df = $log_N - log($df{$feature_key});    # == log(N/df)
        for my $class_key (keys %$fr) {
            my $tfidf = $fr->{$class_key} * $log_df;
            $class_norm{$class_key} += ($tfidf * $tfidf);
            $fr->{$class_key} = $tfidf;
        }
    }

    # Pass 3: L2-normalize each score by its class norm and truncate to
    # 4 decimal places.  (The unused '$sep' leftover from the original
    # print-based version has been dropped.)
    my %result;
    foreach my $feature_key ( keys %tf ) {
        my $fr = $tf{$feature_key};
        my %map;
        foreach my $class_key ( keys %$fr ) {
            my $normalized_tfidf = $fr->{$class_key} / sqrt( $class_norm{$class_key} );
            $normalized_tfidf = int( $normalized_tfidf * 10000 ) / 10000;
            $map{$class_key} = $normalized_tfidf;
        }
        $result{$feature_key} = \%map;
    }
    return \%result;
}