080601

MacBookで開発しているときに,文字コードで手こずったので,ちょっとメモ.あとで振り返る用.

ソースコード.

#!/usr/bin/perl
use strict;
use warnings;
use utf8;
 
use Encode;
use Unicode::Normalize;
 
# Emit UTF-8 on STDOUT via the strict, checked encoding layer
# (rather than the unchecked ':utf8' layer).
binmode STDOUT, ':encoding(UTF-8)';

# Print one comparison row: each value followed by its "($name)" label,
# then the result of the string comparison (cmp) of the two values.
# The labels are left-justified to fixed widths so the columns line up.
sub show_cmp {
    my ($lhs_name, $lhs, $rhs_name, $rhs) = @_;
    printf "%s%-10s\t cmp %s%-11s\t => %d\n",
        $lhs, "(\$$lhs_name)", $rhs, "(\$$rhs_name)", $lhs cmp $rhs;
    return;
}

# The character "ba" written directly in the source (decoded by `use utf8`).
my $BA0 = "ば";

# "ha" (U+306F) followed by the combining voiced sound mark (U+3099):
# the decomposed spelling seen with utf-8-mac (presumably NFD).
my $BA1 = "\x{306f}\x{3099}";

# Raw UTF-8 bytes (octal escapes) of the same two code points; something
# like this showed up in the Apache log.  This is a byte string, not a
# character string, so printing it through the UTF-8 layer is garbled.
my $BA2 = "\343\201\257\343\202\231";

# Decode the bytes into characters.  Strict 'UTF-8' is used instead of the
# lax 'utf8' because the bytes originate from external (log) data.
my $BA2de = decode('UTF-8', $BA2);

# Round 1: compare the raw forms with one another.
show_cmp('BA0', $BA0, 'BA1',   $BA1);
show_cmp('BA0', $BA0, 'BA2',   $BA2);
show_cmp('BA1', $BA1, 'BA2',   $BA2);
show_cmp('BA1', $BA1, 'BA2de', $BA2de);
print "\n";

# Normalize every character string to NFC (composed form).
my $BA0nfc = NFC($BA0);
my $BA1nfc = NFC($BA1);
my $BA2nfc = NFC($BA2de);

# Round 2: compare the NFC form of the literal against the raw forms and
# against the NFC-normalized forms.
show_cmp('BA0nfc', $BA0nfc, 'BA1',    $BA1);
show_cmp('BA0nfc', $BA0nfc, 'BA2',    $BA2);
show_cmp('BA0nfc', $BA0nfc, 'BA1nfc', $BA1nfc);
show_cmp('BA0nfc', $BA0nfc, 'BA2nfc', $BA2nfc);
 
__END__

出力結果.

/Users/iwata/tmp % ./utf8mac.pl
ば($BA0)         cmp ば(ÿ$BA1)           => 1
ば($BA0)         cmp ã¯ã($BA2)           => 1
ば(ÿ$BA1)        cmp ã¯ã($BA2)           => 1
ば(ÿ$BA1)        cmp ば(ÿ$BA2de)         => 0
 
ば($BA0nfc)      cmp ば(ÿ$BA1)           => 1
ば($BA0nfc)      cmp ã¯ã($BA2)           => 1
ば($BA0nfc)      cmp ば($BA1nfc)         => 0
ば($BA0nfc)      cmp ば($BA2nfc)         => 0
/Users/iwata/tmp %

こちらもあわせてどうぞ

コメントをどうぞ