What if sv_utf8_upgrade() used heuristic encoding?
Heuristic: decode as native unless the string is well-formed UTF-X:
# Heuristically upgrade the caller's string in place.
#
# Works directly on $_[0], so the variable passed by the caller is modified.
# utf8::decode() is tried first; only when it reports failure is the string
# passed to utf8::upgrade() instead.
#
# Always returns false (!!0).
sub heuristic_utf8_upgrade {
    if ( !utf8::decode( $_[0] ) ) {
        utf8::upgrade( $_[0] );
    }
    return !!0;
}
Here is some code to play with:
#!/usr/bin/perl
use strict;
use warnings;
# encoding::heuristic: an object meant to be stored in ${^ENCODING}.
# It provides the decode() and cat_decode() methods; decode() applies the
# "UTF-8 if it decodes, native otherwise" heuristic, while cat_decode() is
# forwarded to a regular Encode object.
{
    package encoding::heuristic;

    # Encode object used as the delegate for cat_decode().
    our $Encoding;

    BEGIN {
        require Encode;
        $Encoding = Encode::find_encoding('utf8');
    }

    # Store a blessed scalar reference of this package in ${^ENCODING}.
    # Arguments are ignored.
    sub import {
        my $placeholder;
        ${^ENCODING} = bless \$placeholder, __PACKAGE__;
    }

    # decode($self, $octets)
    # Rewrites $_[1] in place: utf8::decode() is attempted first and
    # utf8::upgrade() is used when that fails. Returns $_[1] as an lvalue.
    sub decode : lvalue {
        # ${^ENCODING} is cleared for the duration of the call --
        # presumably so the upgrade below cannot re-enter this hook.
        local ${^ENCODING};
        if ( !utf8::decode( $_[1] ) ) {
            utf8::upgrade( $_[1] );
        }
        $_[1];
    }

    # cat_decode($self, ...)
    # Drops the invocant and hands the remaining arguments, still aliased
    # to the caller's variables, to $Encoding->cat_decode().
    sub cat_decode {
        shift @_;
        return $Encoding->cat_decode(@_);
    }
}
# Run the import at compile time so that ${^ENCODING} is already set while
# the statements that follow are being compiled.
BEGIN { encoding::heuristic->import() }
use Test::More qw[no_plan];
# Checks for the heuristic hook installed through ${^ENCODING} above.
# NOTE(review): the expected values assume that hook is active; without it
# several of these comparisons would presumably fail -- confirm on the perl
# version in use.

# Concatenation: a character string (U+263A), the three octets E2 98 BA and
# the single octet C4 are joined; the result is expected to be
# U+263A U+263A U+00C4.
{
my $str = "\x{263A}" # unicode string
. "\xE2\x98\xBA" # UTF-8 encoded U+263A
. "\xC4" # Latin-1 encoded U+00C4
;
cmp_ok($str, "eq", "\x{263A}\x{263A}\x{c4}", "No mojibake when concatenating");
}
# utf8::upgrade() on the octets E2 98 BA is expected to yield U+263A.
{
my $str = "\xE2\x98\xBA";
utf8::upgrade($str);
cmp_ok($str, "eq", "\x{263A}", "No mojibake when upgrading UTF-8 octets");
}
# utf8::upgrade() on the single octet C4 is expected to still equal U+00C4.
{
my $str = "\xC4";
utf8::upgrade($str);
cmp_ok($str, "eq", "\x{c4}", "Upgrading native still works");
}
# utf8::encode() on the octets E2 98 BA is expected to leave them unchanged.
{
my $str = "\xE2\x98\xBA";
utf8::encode($str);
cmp_ok($str, "eq", "\xE2\x98\xBA", "Encoding UTF-8 octets just work");
}
# utf8::encode() on the single octet C4 is expected to produce C3 84.
{
my $str = "\xC4";
utf8::encode($str);
cmp_ok($str, "eq", "\xC3\x84", "So does native");
}
# A character string holding U+263A is expected to compare equal to the
# octets E2 98 BA.
cmp_ok("\x{263A}", "eq", "\xE2\x98\xBA", "Equality of unicode and UTF-8 octets");
The above code is also available as a gist: https://gist.github.com/c9884c0817463fa34284
--
chansen
Leave a comment