What if sv_utf8_upgrade() used heuristic encoding?

Heuristic: decode as native unless the string is well-formed UTF-X:

# Upgrade the caller's scalar in place using a heuristic: first try to
# interpret its octets as UTF-8 (utf8::decode); if that fails, fall back to
# a plain native upgrade (utf8::upgrade).
#
# Argument: $_[0] - the scalar to upgrade; modified in place through the
#                   @_ alias, so no copy is made.
# Returns:  a true value. One of the two strategies is always applied, so
#           the operation as a whole is reported as successful.
sub heuristic_utf8_upgrade {
    utf8::upgrade($_[0])
      unless utf8::decode($_[0]);
    return !!1;
}

Here is some code to play with:

#!/usr/bin/perl
use strict;
use warnings;

# Experimental package that installs itself in ${^ENCODING} and supplies the
# decode/cat_decode methods defined below, using a "UTF-8 if well-formed,
# otherwise native" heuristic.
# NOTE(review): ${^ENCODING} is deprecated/removed in newer perls -- confirm
# the perl version this is expected to run on.
{
package encoding::heuristic;

# Encode object used by cat_decode(); assigned in the BEGIN block below.
our $Encoding;

BEGIN {
require Encode;
$Encoding = Encode::find_encoding('utf8');
}

# Install the hook: store a reference to an anonymous scalar, blessed into
# this package, in ${^ENCODING}. Arguments are ignored.
sub import {
${^ENCODING} = bless \my $x, __PACKAGE__;
}

# Heuristically decode $_[1] in place ($_[0] is presumably the invocant --
# verify against how perl calls the ${^ENCODING} object).
# The string is first tried as UTF-8 via utf8::decode; if that returns
# false it is upgraded as native via utf8::upgrade.
# Declared :lvalue, with $_[1] as the final expression, so the sub yields
# the caller's own (now modified) scalar.
sub decode : lvalue {
# Clear ${^ENCODING} for the duration of this call -- presumably so the
# utf8:: calls below cannot re-enter this hook (TODO confirm).
local ${^ENCODING};
utf8::upgrade($_[1])
unless utf8::decode($_[1]);
$_[1];
}

# Discard the invocant and forward the remaining arguments unchanged to the
# cat_decode method of the Encode object stored in $Encoding.
sub cat_decode {
shift;
return $Encoding->cat_decode(@_);
}
}

# Call encoding::heuristic::import() at compile time (as a plain function,
# with no arguments) so that ${^ENCODING} is set before the code that
# follows is compiled.
BEGIN {
encoding::heuristic::import();
}

use Test::More qw[no_plan];

# Concatenate a character string, UTF-8 octets and a Latin-1 octet, and
# expect the result to compare equal to the three intended characters.
{
my $str = "\x{263A}" # unicode string
. "\xE2\x98\xBA" # UTF-8 encoded U+263A
. "\xC4" # Latin-1 encoded U+00C4
;
cmp_ok($str, "eq", "\x{263A}\x{263A}\x{c4}", "No mojibake when concatenating");
}

# utf8::upgrade on the UTF-8 octets of U+263A is expected to yield the
# single character U+263A.
{
my $str = "\xE2\x98\xBA";
utf8::upgrade($str);
cmp_ok($str, "eq", "\x{263A}", "No mojibake when upgrading UTF-8 octets");
}

# utf8::upgrade on a lone \xC4 octet is expected to yield the character
# U+00C4.
{
my $str = "\xC4";
utf8::upgrade($str);
cmp_ok($str, "eq", "\x{c4}", "Upgrading native still works");
}

# utf8::encode on octets that already form UTF-8 is expected to leave them
# comparing equal to the same three octets.
{
my $str = "\xE2\x98\xBA";
utf8::encode($str);
cmp_ok($str, "eq", "\xE2\x98\xBA", "Encoding UTF-8 octets just work");
}

# utf8::encode on a lone \xC4 octet is expected to produce the two-octet
# sequence \xC3\x84.
{
my $str = "\xC4";
utf8::encode($str);
cmp_ok($str, "eq", "\xC3\x84", "So does native");
}

# A character string and the octets of its UTF-8 encoding are expected to
# compare equal under "eq".
cmp_ok("\x{263A}", "eq", "\xE2\x98\xBA", "Equality of unicode and UTF-8 octets");

The above code is also available as a gist: https://gist.github.com/c9884c0817463fa34284

--
chansen

Leave a comment

About Christian Hansen

user-pic I blog about Perl.