MacBookで開発しているときに,文字コードで手こずったので,ちょっとメモ.あとで振り返る用.
ソースコード.
#!/usr/bin/perl
#
# Shows how the same visible character compares under `cmp` depending on
# whether it is precomposed, decomposed, or still an undecoded byte string,
# and how the comparison changes once the strings are passed through NFC.
use strict;
use warnings;
use utf8;
use Encode;
use Unicode::Normalize;

# Strict UTF-8 output layer; ':utf8' would only flag the data without
# validating it.
binmode STDOUT, ':encoding(UTF-8)'
    or die "cannot set UTF-8 layer on STDOUT: $!";

# The precomposed character "ば" (U+3070).
my $BA0 = "ば";

# "は" (U+306F) followed by the combining voiced sound mark (U+3099).
# The author associates this decomposed form with utf-8-mac (NFD-like?).
my $BA1 = "\x{306f}\x{3099}";

# The same two code points as raw UTF-8 bytes (octal escapes); something
# like this showed up in the apache log.
my $BA2 = "\343\201\257\343\202\231";

# Byte string decoded into characters. 'UTF-8' is Encode's strict decoder,
# whereas 'utf8' is the lax Perl-internal variant.
my $BA2de = decode('UTF-8', $BA2);

# Print both operands, labelled with their variable names, and the result of
# comparing them with `cmp`.
#   $lhs_name, $rhs_name - variable names to show (without the sigil)
#   $lhs, $rhs           - the strings to print and compare
sub _report_cmp {
    my ($lhs_name, $lhs, $rhs_name, $rhs) = @_;
    printf "%s(\$%s) \t cmp %s(\$%s) \t => %d\n",
        $lhs, $lhs_name, $rhs, $rhs_name, $lhs cmp $rhs;
    return;
}

# Comparisons without any normalization. $BA2 is deliberately printed and
# compared undecoded to show what raw bytes look like next to decoded text.
_report_cmp('BA0', $BA0, 'BA1',   $BA1);
_report_cmp('BA0', $BA0, 'BA2',   $BA2);
_report_cmp('BA1', $BA1, 'BA2',   $BA2);
_report_cmp('BA1', $BA1, 'BA2de', $BA2de);
print "\n";

# The same strings after NFC normalization.
my $BA0nfc = NFC($BA0);
my $BA1nfc = NFC($BA1);
my $BA2nfc = NFC($BA2de);

_report_cmp('BA0nfc', $BA0nfc, 'BA1',    $BA1);
_report_cmp('BA0nfc', $BA0nfc, 'BA2',    $BA2);
_report_cmp('BA0nfc', $BA0nfc, 'BA1nfc', $BA1nfc);
_report_cmp('BA0nfc', $BA0nfc, 'BA2nfc', $BA2nfc);
__END__
出力結果.
/Users/iwata/tmp % ./utf8mac.pl ば($BA0) cmp ば(ÿ$BA1) => 1 ば($BA0) cmp ã¯ã($BA2) => 1 ば(ÿ$BA1) cmp ã¯ã($BA2) => 1 ば(ÿ$BA1) cmp ば(ÿ$BA2de) => 0 ば($BA0nfc) cmp ば(ÿ$BA1) => 1 ば($BA0nfc) cmp ã¯ã($BA2) => 1 ば($BA0nfc) cmp ば($BA1nfc) => 0 ば($BA0nfc) cmp ば($BA2nfc) => 0 /Users/iwata/tmp %
