What if sv_utf8_upgrade() used heuristic encoding?

By Christian Hansen on October 3, 2010 12:59 PM

Heuristic: decode as native unless the string is well-formed UTF-X:

# Upgrade the caller's string in place using a heuristic:
#   * if its octets are well-formed UTF-X, decode them as such;
#   * otherwise fall back to the normal native-encoding upgrade.
#
# Argument: $_[0] - the string to upgrade. It is modified in place through
#                   the @_ alias, the same way utf8::upgrade() works.
# Returns:  true. One of the two strategies is always applied, so the
#           upgrade is always reported as done.
sub heuristic_utf8_upgrade {
    # utf8::decode() returns false when the octets are not valid UTF-X;
    # only then is the string treated as native and upgraded.
    utf8::upgrade($_[0])
      unless utf8::decode($_[0]);
    return !!1;
}

Here is some code to play with:

#!/usr/bin/perl
use strict;
use warnings;

# encoding::heuristic - an object installed in ${^ENCODING} whose decode()
# treats octets as UTF-X when they are well-formed and as native otherwise.
# NOTE(review): perlvar documents ${^ENCODING} as deprecated and later
# unsupported in newer perls - confirm the perl version this is meant for.
{

    package encoding::heuristic;

    # Encode object that cat_decode() delegates to; set at compile time below.
    our $Encoding;

    BEGIN {

        require Encode;

        $Encoding = Encode::find_encoding('utf8');

    }

    # Installs a blessed scalar reference of this package in ${^ENCODING}.
    # Any arguments are ignored.
    sub import {

        ${^ENCODING} = bless \my $x, __PACKAGE__;

    }

    # decode($self, $octets): modifies $octets in place through the $_[1]
    # alias and returns that same alias as an lvalue.
    sub decode : lvalue {

        # Clear the hook for the duration of this call; presumably so the
        # utf8::upgrade() below does not re-enter decode() - TODO confirm.
        local ${^ENCODING};

        # Try UTF-X first; fall back to a plain upgrade when decoding fails.
        utf8::upgrade($_[1])

          unless utf8::decode($_[1]);

        # Last expression is the lvalue result: the caller's own variable.
        $_[1];

    }

    # cat_decode($self, ...): drops the invocant and forwards the remaining
    # arguments to $Encoding->cat_decode. @_ is passed uncopied, so the
    # elements remain aliases to the caller's variables.
    sub cat_decode {

        shift;

        return $Encoding->cat_decode(@_);

    }

}

# Install the heuristic object in ${^ENCODING} at compile time, before the
# statements that follow are compiled. import() is called as a plain
# function (no invocant); it does not look at its arguments.
BEGIN {

    encoding::heuristic::import();

}

use Test::More qw[no_plan];

# Test 1: concatenating a character string with UTF-8 octets and a Latin-1
# octet is expected to give the three characters U+263A U+263A U+00C4.
{

    my $str = "\x{263A}"        # unicode string

            . "\xE2\x98\xBA"    # UTF-8 encoded U+263A

            . "\xC4"            # Latin-1 encoded U+00C4

            ;

    cmp_ok($str, "eq", "\x{263A}\x{263A}\x{c4}", "No mojibake when concatenating");

}

# Test 2: explicitly upgrading UTF-8 octets is expected to yield U+263A.
{

    my $str = "\xE2\x98\xBA";

    utf8::upgrade($str);

    cmp_ok($str, "eq", "\x{263A}", "No mojibake when upgrading UTF-8 octets");

}

# Test 3: upgrading a single non-UTF-8 octet is expected to keep U+00C4.
{

    my $str = "\xC4";

    utf8::upgrade($str);

    cmp_ok($str, "eq", "\x{c4}", "Upgrading native still works");

}

# Test 4: utf8::encode() on octets that are already UTF-8 is expected to
# leave them unchanged.
{

    my $str = "\xE2\x98\xBA";

    utf8::encode($str);

    cmp_ok($str, "eq", "\xE2\x98\xBA", "Encoding UTF-8 octets just work");

}

# Test 5: utf8::encode() on a native octet is expected to produce the
# two-octet UTF-8 form of U+00C4.
{

    my $str = "\xC4";

    utf8::encode($str);

    cmp_ok($str, "eq", "\xC3\x84", "So does native");

}

# Test 6: a character string and its UTF-8 octets are expected to compare equal.
cmp_ok("\x{263A}", "eq", "\xE2\x98\xBA", "Equality of unicode and UTF-8 octets");

The above code is also available as a gist, https://gist.github.com/c9884c0817463fa34284

--
chansen

0 comments

Tagged as:

unicode, UTF-8

Name

Email Address

URL

Remember personal info?

Comments (You may use HTML tags for style)

About Christian Hansen

I blog about Perl.

More info »

Christian Hansen

What if sv_utf8_upgrade() used heuristic encoding?

Tagged as:

Leave a comment

About Christian Hansen

Search this blog