diff options
Diffstat (limited to 'gnu/usr.bin/perl/cpan/Encode/Encode.pm')
-rw-r--r-- | gnu/usr.bin/perl/cpan/Encode/Encode.pm | 372 |
1 files changed, 198 insertions, 174 deletions
diff --git a/gnu/usr.bin/perl/cpan/Encode/Encode.pm b/gnu/usr.bin/perl/cpan/Encode/Encode.pm index dce6c5415e9..f90f929f073 100644 --- a/gnu/usr.bin/perl/cpan/Encode/Encode.pm +++ b/gnu/usr.bin/perl/cpan/Encode/Encode.pm @@ -1,21 +1,26 @@ # -# $Id: Encode.pm,v 2.80 2016/01/25 14:54:01 dankogai Exp $ +# $Id: Encode.pm,v 2.97 2018/02/21 12:14:24 dankogai Exp $ # package Encode; use strict; use warnings; -our $VERSION = sprintf "%d.%02d_01", q$Revision: 2.80 $ =~ /(\d+)/g; use constant DEBUG => !!$ENV{PERL_ENCODE_DEBUG}; -use XSLoader (); -XSLoader::load( __PACKAGE__, $VERSION ); +our $VERSION; +BEGIN { + $VERSION = sprintf "%d.%02d", q$Revision: 2.97 $ =~ /(\d+)/g; + require XSLoader; + XSLoader::load( __PACKAGE__, $VERSION ); +} use Exporter 5.57 'import'; +our @CARP_NOT = qw(Encode::Encoder); + # Public, encouraged API is exported by default our @EXPORT = qw( decode decode_utf8 encode encode_utf8 str2bytes bytes2str - encodings find_encoding clone_encoding + encodings find_encoding find_mime_encoding clone_encoding ); our @FB_FLAGS = qw( DIE_ON_ERR WARN_ON_ERR RETURN_ON_ERR LEAVE_SRC @@ -44,7 +49,10 @@ our %EXPORT_TAGS = ( our $ON_EBCDIC = ( ord("A") == 193 ); -use Encode::Alias; +use Encode::Alias (); +use Encode::MIME::Name; + +use Storable; # Make a %Encoding package variable to allow a certain amount of cheating our %Encoding; @@ -96,12 +104,17 @@ sub define_encoding { my $alias = shift; define_alias( $alias, $obj ); } + my $class = ref($obj); + push @Encode::CARP_NOT, $class unless grep { $_ eq $class } @Encode::CARP_NOT; + push @Encode::Encoding::CARP_NOT, $class unless grep { $_ eq $class } @Encode::Encoding::CARP_NOT; return $obj; } sub getEncoding { my ( $class, $name, $skip_external ) = @_; + defined($name) or return; + $name =~ s/\s+//g; # https://rt.cpan.org/Ticket/Display.html?id=65796 ref($name) && $name->can('renew') and return $name; @@ -125,11 +138,26 @@ sub getEncoding { return; } +# HACK: These two functions must be defined in Encode and because of +# cyclic dependency between Encode and Encode::Alias, Exporter does not work +sub find_alias { + goto &Encode::Alias::find_alias; +} +sub define_alias { + goto &Encode::Alias::define_alias; +} + sub find_encoding($;$) { my ( $name, $skip_external ) = @_; return __PACKAGE__->getEncoding( $name, $skip_external ); } +sub find_mime_encoding($;$) { + my ( $mime_name, $skip_external ) = @_; + my $name = Encode::MIME::Name::get_encode_name( $mime_name ); + return find_encoding( $name, $skip_external ); +} + sub resolve_alias($) { my $obj = find_encoding(shift); defined $obj and return $obj->name; @@ -139,8 +167,6 @@ sub resolve_alias($) { sub clone_encoding($) { my $obj = find_encoding(shift); ref $obj or return; - eval { require Storable }; - $@ and return; return Storable::dclone($obj); } @@ -172,7 +198,7 @@ sub encode($$;$) { else { $octets = $enc->encode( $string, $check ); } - $_[1] = $string if $check and !ref $check and !( $check & LEAVE_SRC() ); + $_[1] = $string if $check and !ref $check and !( $check & LEAVE_SRC ); return $octets; } *str2bytes = \&encode; @@ -201,7 +227,7 @@ sub decode($$;$) { else { $string = $enc->decode( $octets, $check ); } - $_[1] = $octets if $check and !ref $check and !( $check & LEAVE_SRC() ); + $_[1] = $octets if $check and !ref $check and !( $check & LEAVE_SRC ); return $string; } *bytes2str = \&decode; @@ -220,14 +246,41 @@ sub from_to($$$;$) { require Carp; Carp::croak("Unknown encoding '$to'"); } - my $uni = $f->decode($string); - $_[0] = $string = $t->encode( $uni, $check ); + + # For Unicode, warnings need to be caught and re-issued at this level + # so that callers can disable utf8 warnings lexically. + my $uni; + if ( ref($f) eq 'Encode::Unicode' ) { + my $warn = ''; + { + local $SIG{__WARN__} = sub { $warn = shift }; + $uni = $f->decode($string); + } + warnings::warnif('utf8', $warn) if length $warn; + } + else { + $uni = $f->decode($string); + } + + if ( ref($t) eq 'Encode::Unicode' ) { + my $warn = ''; + { + local $SIG{__WARN__} = sub { $warn = shift }; + $_[0] = $string = $t->encode( $uni, $check ); + } + warnings::warnif('utf8', $warn) if length $warn; + } + else { + $_[0] = $string = $t->encode( $uni, $check ); + } + return undef if ( $check && length($uni) ); return defined( $_[0] ) ? length($string) : undef; } sub encode_utf8($) { my ($str) = @_; + return undef unless defined $str; utf8::encode($str); return $str; } @@ -241,133 +294,87 @@ sub decode_utf8($;$) { $check ||= 0; $utf8enc ||= find_encoding('utf8'); my $string = $utf8enc->decode( $octets, $check ); - $_[0] = $octets if $check and !ref $check and !( $check & LEAVE_SRC() ); + $_[0] = $octets if $check and !ref $check and !( $check & LEAVE_SRC ); return $string; } -# sub decode_utf8($;$) { -# my ( $str, $check ) = @_; -# return $str if is_utf8($str); -# if ($check) { -# return decode( "utf8", $str, $check ); -# } -# else { -# return decode( "utf8", $str ); -# return $str; -# } -# } - -predefine_encodings(1); - -# -# This is to restore %Encoding if really needed; -# - -sub predefine_encodings { - require Encode::Encoding; - no warnings 'redefine'; - my $use_xs = shift; - if ($ON_EBCDIC) { - - # was in Encode::UTF_EBCDIC - package Encode::UTF_EBCDIC; - push @Encode::UTF_EBCDIC::ISA, 'Encode::Encoding'; - *decode = sub { - my ( undef, $str, $chk ) = @_; - my $res = ''; - for ( my $i = 0 ; $i < length($str) ; $i++ ) { - $res .= - chr( - utf8::unicode_to_native( ord( substr( $str, $i, 1 ) ) ) - ); - } - $_[1] = '' if $chk; - return $res; - }; - *encode = sub { - my ( undef, $str, $chk ) = @_; - my $res = ''; - for ( my $i = 0 ; $i < length($str) ; $i++ ) { - $res .= - chr( - utf8::native_to_unicode( ord( substr( $str, $i, 1 ) ) ) - ); - } - $_[1] = '' if $chk; - return $res; - }; - $Encode::Encoding{Unicode} = - bless { Name => "UTF_EBCDIC" } => "Encode::UTF_EBCDIC"; +onBOOT; + +if ($ON_EBCDIC) { + package Encode::UTF_EBCDIC; + use parent 'Encode::Encoding'; + my $obj = bless { Name => "UTF_EBCDIC" } => "Encode::UTF_EBCDIC"; + Encode::define_encoding($obj, 'Unicode'); + sub decode { + my ( undef, $str, $chk ) = @_; + my $res = ''; + for ( my $i = 0 ; $i < length($str) ; $i++ ) { + $res .= + chr( + utf8::unicode_to_native( ord( substr( $str, $i, 1 ) ) ) + ); + } + $_[1] = '' if $chk; + return $res; } - else { - - package Encode::Internal; - push @Encode::Internal::ISA, 'Encode::Encoding'; - *decode = sub { - my ( undef, $str, $chk ) = @_; - utf8::upgrade($str); - $_[1] = '' if $chk; - return $str; - }; - *encode = \&decode; - $Encode::Encoding{Unicode} = - bless { Name => "Internal" } => "Encode::Internal"; + sub encode { + my ( undef, $str, $chk ) = @_; + my $res = ''; + for ( my $i = 0 ; $i < length($str) ; $i++ ) { + $res .= + chr( + utf8::native_to_unicode( ord( substr( $str, $i, 1 ) ) ) + ); + } + $_[1] = '' if $chk; + return $res; } - { - # https://rt.cpan.org/Public/Bug/Display.html?id=103253 - package Encode::XS; - push @Encode::XS::ISA, 'Encode::Encoding'; +} else { + package Encode::Internal; + use parent 'Encode::Encoding'; + my $obj = bless { Name => "Internal" } => "Encode::Internal"; + Encode::define_encoding($obj, 'Unicode'); + sub decode { + my ( undef, $str, $chk ) = @_; + utf8::upgrade($str); + $_[1] = '' if $chk; + return $str; } - { + *encode = \&decode; +} - # was in Encode::utf8 - package Encode::utf8; - push @Encode::utf8::ISA, 'Encode::Encoding'; +{ + # https://rt.cpan.org/Public/Bug/Display.html?id=103253 + package Encode::XS; + use parent 'Encode::Encoding'; +} - # - if ($use_xs) { - Encode::DEBUG and warn __PACKAGE__, " XS on"; - *decode = \&decode_xs; - *encode = \&encode_xs; - } - else { - Encode::DEBUG and warn __PACKAGE__, " XS off"; - *decode = sub { - my ( undef, $octets, $chk ) = @_; - my $str = Encode::decode_utf8($octets); - if ( defined $str ) { - $_[1] = '' if $chk; - return $str; - } - return undef; - }; - *encode = sub { - my ( undef, $string, $chk ) = @_; - my $octets = Encode::encode_utf8($string); - $_[1] = '' if $chk; - return $octets; - }; +{ + package Encode::utf8; + use parent 'Encode::Encoding'; + my %obj = ( + 'utf8' => { Name => 'utf8' }, + 'utf-8-strict' => { Name => 'utf-8-strict', strict_utf8 => 1 } + ); + for ( keys %obj ) { + bless $obj{$_} => __PACKAGE__; + Encode::define_encoding( $obj{$_} => $_ ); + } + sub cat_decode { + # ($obj, $dst, $src, $pos, $trm, $chk) + # currently ignores $chk + my ( undef, undef, undef, $pos, $trm ) = @_; + my ( $rdst, $rsrc, $rpos ) = \@_[ 1, 2, 3 ]; + use bytes; + if ( ( my $npos = index( $$rsrc, $trm, $pos ) ) >= 0 ) { + $$rdst .= + substr( $$rsrc, $pos, $npos - $pos + length($trm) ); + $$rpos = $npos + length($trm); + return 1; } - *cat_decode = sub { # ($obj, $dst, $src, $pos, $trm, $chk) - # currently ignores $chk - my ( undef, undef, undef, $pos, $trm ) = @_; - my ( $rdst, $rsrc, $rpos ) = \@_[ 1, 2, 3 ]; - use bytes; - if ( ( my $npos = index( $$rsrc, $trm, $pos ) ) >= 0 ) { - $$rdst .= - substr( $$rsrc, $pos, $npos - $pos + length($trm) ); - $$rpos = $npos + length($trm); - return 1; - } - $$rdst .= substr( $$rsrc, $pos ); - $$rpos = length($$rsrc); - return ''; - }; - $Encode::Encoding{utf8} = - bless { Name => "utf8" } => "Encode::utf8"; - $Encode::Encoding{"utf-8-strict"} = - bless { Name => "utf-8-strict", strict_utf8 => 1 } - => "Encode::utf8"; + $$rdst .= substr( $$rsrc, $pos ); + $$rpos = length($$rsrc); + return ''; } } @@ -470,19 +477,25 @@ I<ENCODING> and returns a sequence of octets. I<ENCODING> can be either a canonical name or an alias. For encoding names and aliases, see L</"Defining Aliases">. For CHECK, see L</"Handling Malformed Data">. +B<CAVEAT>: the input scalar I<STRING> might be modified in-place depending +on what is set in CHECK. See L</LEAVE_SRC> if you want your inputs to be +left unchanged. + For example, to convert a string from Perl's internal format into ISO-8859-1, also known as Latin1: $octets = encode("iso-8859-1", $string); -B<CAVEAT>: When you run C<$octets = encode("utf8", $string)>, then +B<CAVEAT>: When you run C<$octets = encode("UTF-8", $string)>, then $octets I<might not be equal to> $string. Though both contain the same data, the UTF8 flag for $octets is I<always> off. When you encode anything, the UTF8 flag on the result is always off, even when it -contains a completely valid utf8 string. See L</"The UTF8 flag"> below. +contains a completely valid UTF-8 string. See L</"The UTF8 flag"> below. If the $string is C<undef>, then C<undef> is returned. +C<str2bytes> may be used as an alias for C<encode>. + =head3 decode $string = decode(ENCODING, OCTETS[, CHECK]) @@ -494,18 +507,24 @@ I<ENCODING> can be either a canonical name or an alias. For encoding names and aliases, see L</"Defining Aliases">; for I<CHECK>, see L</"Handling Malformed Data">. +B<CAVEAT>: the input scalar I<OCTETS> might be modified in-place depending +on what is set in CHECK. See L</LEAVE_SRC> if you want your inputs to be +left unchanged. + For example, to convert ISO-8859-1 data into a string in Perl's internal format: $string = decode("iso-8859-1", $octets); -B<CAVEAT>: When you run C<$string = decode("utf8", $octets)>, then $string +B<CAVEAT>: When you run C<$string = decode("UTF-8", $octets)>, then $string I<might not be equal to> $octets. Though both contain the same data, the UTF8 flag for $string is on. See L</"The UTF8 flag"> below. If the $string is C<undef>, then C<undef> is returned. +C<bytes2str> may be used as an alias for C<decode>. + =head3 find_encoding [$obj =] find_encoding(ENCODING) @@ -514,11 +533,11 @@ Returns the I<encoding object> corresponding to I<ENCODING>. Returns C<undef> if no matching I<ENCODING> is find. The returned object is what does the actual encoding or decoding. - $utf8 = decode($name, $bytes); + $string = decode($name, $bytes); is in fact - $utf8 = do { + $string = do { $obj = find_encoding($name); croak qq(encoding "$name" not found) unless ref $obj; $obj->decode($bytes); @@ -530,8 +549,8 @@ You can therefore save time by reusing this object as follows; my $enc = find_encoding("iso-8859-1"); while(<>) { - my $utf8 = $enc->decode($_); - ... # now do something with $utf8; + my $string = $enc->decode($_); + ... # now do something with $string; } Besides L</decode> and L</encode>, other methods are @@ -542,6 +561,20 @@ name of the encoding object. See L<Encode::Encoding> for details. +=head3 find_mime_encoding + + [$obj =] find_mime_encoding(MIME_ENCODING) + +Returns the I<encoding object> corresponding to I<MIME_ENCODING>. Acts +same as C<find_encoding()> but C<mime_name()> of returned object must +match to I<MIME_ENCODING>. So as opposite of C<find_encoding()> +canonical names and aliases are not used when searching for object. + + find_mime_encoding("utf8"); # returns undef because "utf8" is not valid I<MIME_ENCODING> + find_mime_encoding("utf-8"); # returns encode object "utf-8-strict" + find_mime_encoding("UTF-8"); # same as "utf-8" because I<MIME_ENCODING> is case insensitive + find_mime_encoding("utf-8-strict"); returns undef because "utf-8-strict" is not valid I<MIME_ENCODING> + =head3 from_to [$length =] from_to($octets, FROM_ENC, TO_ENC [, CHECK]) @@ -565,13 +598,13 @@ and C<undef> on error. B<CAVEAT>: The following operations may look the same, but are not: - from_to($data, "iso-8859-1", "utf8"); #1 + from_to($data, "iso-8859-1", "UTF-8"); #1 $data = decode("iso-8859-1", $data); #2 Both #1 and #2 make $data consist of a completely valid UTF-8 string, but only #2 turns the UTF8 flag on. #1 is equivalent to: - $data = encode("utf8", decode("iso-8859-1", $data)); + $data = encode("UTF-8", decode("iso-8859-1", $data)); See L</"The UTF8 flag"> below. @@ -596,7 +629,11 @@ followed by C<encode> as follows: Equivalent to C<$octets = encode("utf8", $string)>. The characters in $string are encoded in Perl's internal format, and the result is returned as a sequence of octets. Because all possible characters in Perl have a -(loose, not strict) UTF-8 representation, this function cannot fail. +(loose, not strict) utf8 representation, this function cannot fail. + +B<WARNING>: do not use this function for data exchange as it can produce +not strict utf8 $octets! For strictly valid UTF-8 output use +C<$octets = encode("UTF-8", $string)>. =head3 decode_utf8 @@ -604,11 +641,19 @@ as a sequence of octets. Because all possible characters in Perl have a Equivalent to C<$string = decode("utf8", $octets [, CHECK])>. The sequence of octets represented by $octets is decoded -from UTF-8 into a sequence of logical characters. -Because not all sequences of octets are valid UTF-8, +from (loose, not strict) utf8 into a sequence of logical characters. +Because not all sequences of octets are valid not strict utf8, it is quite possible for this function to fail. For CHECK, see L</"Handling Malformed Data">. +B<WARNING>: do not use this function for data exchange as it can produce +$string with not strict utf8 representation! For strictly valid UTF-8 +$string representation use C<$string = decode("UTF-8", $octets [, CHECK])>. + +B<CAVEAT>: the input I<$octets> might be modified in-place depending on +what is set in CHECK. See L</LEAVE_SRC> if you want your inputs to be +left unchanged. + =head2 Listing available encodings use Encode; @@ -840,15 +885,14 @@ octets that represent the fallback character. For instance: Acts like C<FB_PERLQQ> but U+I<XXXX> is used instead of C<\x{I<XXXX>}>. -Even the fallback for C<decode> must return octets, which are -then decoded with the character encoding that C<decode> accepts. So for +Fallback for C<decode> must return decoded string (sequence of characters) +and takes a list of ordinal values as its arguments. So for example if you wish to decode octets as UTF-8, and use ISO-8859-15 as a fallback for bytes that are not valid UTF-8, you could write $str = decode 'UTF-8', $octets, sub { - my $tmp = chr shift; - from_to $tmp, 'ISO-8859-15', 'UTF-8'; - return $tmp; + my $tmp = join '', map chr, @_; + return decode 'ISO-8859-15', $tmp; }; =head1 Defining Encodings @@ -905,38 +949,11 @@ different kinds of strings and string-operations in Perl: one a byte-oriented mode for when the internal UTF8 flag is off, and the other a character-oriented mode for when the internal UTF8 flag is on. -Here is how C<Encode> handles the UTF8 flag. - -=over 2 - -=item * - -When you I<encode>, the resulting UTF8 flag is always B<off>. - -=item * - -When you I<decode>, the resulting UTF8 flag is B<on>--I<unless> you can -unambiguously represent data. Here is what we mean by "unambiguously". -After C<$utf8 = decode("foo", $octet)>, - - When $octet is... The UTF8 flag in $utf8 is - --------------------------------------------- - In ASCII only (or EBCDIC only) OFF - In ISO-8859-1 ON - In any other Encoding ON - --------------------------------------------- - -As you see, there is one exception: in ASCII. That way you can assume -Goal #1. And with C<Encode>, Goal #2 is assumed but you still have to be -careful in the cases mentioned in the B<CAVEAT> paragraphs above. - This UTF8 flag is not visible in Perl scripts, exactly for the same reason you cannot (or rather, you I<don't have to>) see whether a scalar contains a string, an integer, or a floating-point number. But you can still peek and poke these if you will. See the next section. -=back - =head2 Messing with Perl's Internals The following API uses parts of Perl's internals in the current @@ -951,6 +968,13 @@ release. If I<CHECK> is true, also checks whether I<STRING> contains well-formed UTF-8. Returns true if successful, false otherwise. +Typically only necessary for debugging and testing. Don't use this flag as +a marker to distinguish character and binary data, that should be decided +for each variable when you write your code. + +B<CAVEAT>: If I<STRING> has UTF8 flag set, it does B<NOT> mean that +I<STRING> is UTF-8 encoded and vice-versa. + As of Perl 5.8.1, L<utf8> also has the C<utf8::is_utf8> function. =head3 _utf8_on |