diff options
Diffstat (limited to 'gnu/usr.bin/perl/lib/Unicode')
-rw-r--r-- | gnu/usr.bin/perl/lib/Unicode/UCD.pm | 405 | ||||
-rw-r--r-- | gnu/usr.bin/perl/lib/Unicode/UCD.t | 69 |
2 files changed, 356 insertions, 118 deletions
diff --git a/gnu/usr.bin/perl/lib/Unicode/UCD.pm b/gnu/usr.bin/perl/lib/Unicode/UCD.pm index 724fb62785b..9c3dd7c7105 100644 --- a/gnu/usr.bin/perl/lib/Unicode/UCD.pm +++ b/gnu/usr.bin/perl/lib/Unicode/UCD.pm @@ -4,11 +4,8 @@ use strict; use warnings; no warnings 'surrogate'; # surrogates can be inputs to this use charnames (); -use Unicode::Normalize qw(getCombinClass NFD); -our $VERSION = '0.43'; - -use Storable qw(dclone); +our $VERSION = '0.51'; require Exporter; @@ -20,7 +17,7 @@ our @EXPORT_OK = qw(charinfo charinrange general_categories bidi_types compexcl - casefold casespec + casefold all_casefolds casespec namedseq num prop_aliases @@ -44,6 +41,9 @@ Unicode::UCD - Unicode character database use Unicode::UCD 'casefold'; my $casefold = casefold(0xFB00); + use Unicode::UCD 'all_casefolds'; + my $all_casefolds_ref = all_casefolds(); + use Unicode::UCD 'casespec'; my $casespec = casespec(0xFB00); @@ -104,8 +104,16 @@ a decimal or a hexadecimal scalar designating a Unicode code point, or C<U+> followed by hexadecimals designating a Unicode code point. In other words, if you want a code point to be interpreted as a hexadecimal number, you must prefix it with either C<0x> or C<U+>, because a string like e.g. C<123> will be -interpreted as a decimal code point. Note that the largest code point in -Unicode is U+10FFFF. +interpreted as a decimal code point. + +Examples: + + 223 # Decimal 223 + 0223 # Hexadecimal 223 (= 547 decimal) + 0xDF # Hexadecimal DF (= 223 decimal + U+DF # Hexadecimal DF + +Note that the largest code point in Unicode is U+10FFFF. =cut @@ -114,6 +122,7 @@ my $VERSIONFH; my $CASEFOLDFH; my $CASESPECFH; my $NAMEDSEQFH; +my $v_unicode_version; # v-string. sub openunicode { my ($rfh, @path) = @_; @@ -132,6 +141,35 @@ sub openunicode { return $f; } +sub _dclone ($) { # Use Storable::dclone if available; otherwise emulate it. + + use if defined &DynaLoader::boot_DynaLoader, Storable => qw(dclone); + + return dclone(shift) if defined &dclone; + + my $arg = shift; + my $type = ref $arg; + return $arg unless $type; # No deep cloning needed for scalars + + if ($type eq 'ARRAY') { + my @return; + foreach my $element (@$arg) { + push @return, &_dclone($element); + } + return \@return; + } + elsif ($type eq 'HASH') { + my %return; + foreach my $key (keys %$arg) { + $return{$key} = &_dclone($arg->{$key}); + } + return \%return; + } + else { + croak "_dclone can't handle " . $type; + } +} + =head2 B<charinfo()> use Unicode::UCD 'charinfo'; @@ -303,6 +341,7 @@ my %SIMPLE_LOWER; my %SIMPLE_TITLE; my %SIMPLE_UPPER; my %UNICODE_1_NAMES; +my %ISO_COMMENT; sub charinfo { @@ -315,6 +354,9 @@ sub charinfo { use feature 'unicode_strings'; + # Will fail if called under minitest + use if defined &DynaLoader::boot_DynaLoader, "Unicode::Normalize" => qw(getCombinClass NFD); + my $arg = shift; my $code = _getcode($arg); croak __PACKAGE__, "::charinfo: unknown code '$arg'" unless defined $code; @@ -353,7 +395,8 @@ sub charinfo { # Having no decomposition implies an empty field; otherwise, all but # "Canonical" imply a compatible decomposition, and the type is prefixed # to that, as it is in UnicodeData.txt - if ($char =~ /\p{Block=Hangul_Syllables}/) { + UnicodeVersion() unless defined $v_unicode_version; + if ($v_unicode_version ge v2.0.0 && $char =~ /\p{Block=Hangul_Syllables}/) { # The code points of the decomposition are output in standard Unicode # hex format, separated by blanks. $prop{'decomposition'} = join " ", map { sprintf("%04X", $_)} @@ -400,9 +443,16 @@ sub charinfo { %UNICODE_1_NAMES =_read_table("To/Na1.pl", "use_hash") unless %UNICODE_1_NAMES; $prop{'unicode10'} = $UNICODE_1_NAMES{$code} // ""; - # This is true starting in 6.0, but, num() also requires 6.0, so - # don't need to test for version again here. - $prop{'comment'} = ""; + UnicodeVersion() unless defined $v_unicode_version; + if ($v_unicode_version ge v6.0.0) { + $prop{'comment'} = ""; + } + else { + %ISO_COMMENT = _read_table("To/Isc.pl", "use_hash") unless %ISO_COMMENT; + $prop{'comment'} = (defined $ISO_COMMENT{$code}) + ? $ISO_COMMENT{$code} + : ""; + } %SIMPLE_UPPER = _read_table("To/Uc.pl", "use_hash") unless %SIMPLE_UPPER; $prop{'upper'} = (defined $SIMPLE_UPPER{$code}) @@ -536,7 +586,8 @@ With a L</code point argument> charblock() returns the I<block> the code point belongs to, e.g. C<Basic Latin>. The old-style block name is returned (see L</Old-style versus new-style block names>). If the code point is unassigned, this returns the block it would belong to if -it were assigned. +it were assigned. (If the Unicode version being used is so early as to not +have blocks, all code points are considered to be in C<No_Block>.) See also L</Blocks versus Scripts>. @@ -562,7 +613,13 @@ sub _charblocks { # Can't read from the mktables table because it loses the hyphens in the # original. unless (@BLOCKS) { - if (openunicode(\$BLOCKSFH, "Blocks.txt")) { + UnicodeVersion() unless defined $v_unicode_version; + if ($v_unicode_version lt v2.0.0) { + my $subrange = [ 0, 0x10FFFF, 'No_Block' ]; + push @BLOCKS, $subrange; + push @{$BLOCKS{'No_Block'}}, $subrange; + } + elsif (openunicode(\$BLOCKSFH, "Blocks.txt")) { local $_; local $/ = "\n"; while (<$BLOCKSFH>) { @@ -591,7 +648,7 @@ sub charblock { return 'No_Block'; } elsif (exists $BLOCKS{$arg}) { - return dclone $BLOCKS{$arg}; + return _dclone $BLOCKS{$arg}; } } @@ -607,7 +664,8 @@ sub charblock { With a L</code point argument> charscript() returns the I<script> the code point belongs to, e.g. C<Latin>, C<Greek>, C<Han>. -If the code point is unassigned, it returns C<"Unknown">. +If the code point is unassigned or the Unicode version being used is so early +that it doesn't have scripts, this function returns C<"Unknown">. If supplied with an argument that can't be a code point, charscript() tries to do the opposite and interpret the argument as a script name. The @@ -624,7 +682,15 @@ my @SCRIPTS; my %SCRIPTS; sub _charscripts { - @SCRIPTS =_read_table("To/Sc.pl") unless @SCRIPTS; + unless (@SCRIPTS) { + UnicodeVersion() unless defined $v_unicode_version; + if ($v_unicode_version lt v3.1.0) { + push @SCRIPTS, [ 0, 0x10FFFF, 'Unknown' ]; + } + else { + @SCRIPTS =_read_table("To/Sc.pl"); + } + } foreach my $entry (@SCRIPTS) { $entry->[2] =~ s/(_\w)/\L$1/g; # Preserve old-style casing push @{$SCRIPTS{$entry->[2]}}, $entry; @@ -643,7 +709,7 @@ sub charscript { return $result if defined $result; return $utf8::SwashInfo{'ToSc'}{'missing'}; } elsif (exists $SCRIPTS{$arg}) { - return dclone $SCRIPTS{$arg}; + return _dclone $SCRIPTS{$arg}; } return; @@ -670,7 +736,7 @@ See also L</Blocks versus Scripts>. sub charblocks { _charblocks() unless %BLOCKS; - return dclone \%BLOCKS; + return _dclone \%BLOCKS; } =head2 B<charscripts()> @@ -692,7 +758,7 @@ See also L</Blocks versus Scripts>. sub charscripts { _charscripts() unless %SCRIPTS; - return dclone \%SCRIPTS; + return _dclone \%SCRIPTS; } =head2 B<charinrange()> @@ -752,7 +818,7 @@ my %GENERAL_CATEGORIES = ); sub general_categories { - return dclone \%GENERAL_CATEGORIES; + return _dclone \%GENERAL_CATEGORIES; } =head2 B<general_categories()> @@ -820,7 +886,7 @@ the bidi type name. =cut sub bidi_types { - return dclone \%BIDI_TYPES; + return _dclone \%BIDI_TYPES; } =head2 B<compexcl()> @@ -829,7 +895,9 @@ sub bidi_types { my $compexcl = compexcl(0x09dc); -This routine is included for backwards compatibility, but as of Perl 5.12, for +This routine returns C<undef> if the Unicode version being used is so early +that it doesn't have this property. It is included for backwards +compatibility, but as of Perl 5.12 and more modern Unicode versions, for most purposes it is probably more convenient to use one of the following instead: @@ -864,6 +932,9 @@ sub compexcl { croak __PACKAGE__, "::compexcl: unknown code '$arg'" unless defined $code; + UnicodeVersion() unless defined $v_unicode_version; + return if $v_unicode_version lt v3.0.0; + no warnings "non_unicode"; # So works on non-Unicode code points return chr($code) =~ /\p{Composition_Exclusion}/; } @@ -943,12 +1014,12 @@ dotless lowercase i: =over -=item B<*> If you use this C<I> mapping +=item Z<>B<*> If you use this C<I> mapping the result is case-insensitive, but dotless and dotted I's are not distinguished -=item B<*> If you exclude this C<I> mapping +=item Z<>B<*> If you exclude this C<I> mapping the result is not fully case-insensitive, but dotless and dotted I's are distinguished @@ -997,54 +1068,88 @@ L<http://www.unicode.org/unicode/reports/tr21> my %CASEFOLD; sub _casefold { - unless (%CASEFOLD) { - if (openunicode(\$CASEFOLDFH, "CaseFolding.txt")) { - local $_; - local $/ = "\n"; - while (<$CASEFOLDFH>) { - if (/^([0-9A-F]+); ([CFIST]); ([0-9A-F]+(?: [0-9A-F]+)*);/) { - my $code = hex($1); - $CASEFOLD{$code}{'code'} = $1; - $CASEFOLD{$code}{'turkic'} = "" unless - defined $CASEFOLD{$code}{'turkic'}; - if ($2 eq 'C' || $2 eq 'I') { # 'I' is only on 3.1 and - # earlier Unicodes - # Both entries there (I - # only checked 3.1) are - # the same as C, and - # there are no other - # entries for those - # codepoints, so treat - # as if C, but override - # the turkic one for - # 'I'. - $CASEFOLD{$code}{'status'} = $2; - $CASEFOLD{$code}{'full'} = $CASEFOLD{$code}{'simple'} = - $CASEFOLD{$code}{'mapping'} = $3; - $CASEFOLD{$code}{'turkic'} = $3 if $2 eq 'I'; - } elsif ($2 eq 'F') { - $CASEFOLD{$code}{'full'} = $3; - unless (defined $CASEFOLD{$code}{'simple'}) { - $CASEFOLD{$code}{'simple'} = ""; - $CASEFOLD{$code}{'mapping'} = $3; - $CASEFOLD{$code}{'status'} = $2; - } - } elsif ($2 eq 'S') { + unless (%CASEFOLD) { # Populate the hash + my ($full_invlist_ref, $full_invmap_ref, undef, $default) + = prop_invmap('Case_Folding'); + + # Use the recipe given in the prop_invmap() pod to convert the + # inversion map into the hash. + for my $i (0 .. @$full_invlist_ref - 1 - 1) { + next if $full_invmap_ref->[$i] == $default; + my $adjust = -1; + for my $j ($full_invlist_ref->[$i] .. $full_invlist_ref->[$i+1] -1) { + $adjust++; + if (! ref $full_invmap_ref->[$i]) { + + # This is a single character mapping + $CASEFOLD{$j}{'status'} = 'C'; + $CASEFOLD{$j}{'simple'} + = $CASEFOLD{$j}{'full'} + = $CASEFOLD{$j}{'mapping'} + = sprintf("%04X", $full_invmap_ref->[$i] + $adjust); + $CASEFOLD{$j}{'code'} = sprintf("%04X", $j); + $CASEFOLD{$j}{'turkic'} = ""; + } + else { # prop_invmap ensures that $adjust is 0 for a ref + $CASEFOLD{$j}{'status'} = 'F'; + $CASEFOLD{$j}{'full'} + = $CASEFOLD{$j}{'mapping'} + = join " ", map { sprintf "%04X", $_ } + @{$full_invmap_ref->[$i]}; + $CASEFOLD{$j}{'simple'} = ""; + $CASEFOLD{$j}{'code'} = sprintf("%04X", $j); + $CASEFOLD{$j}{'turkic'} = ""; + } + } + } + # We have filled in the full mappings above, assuming there were no + # simple ones for the ones with multi-character maps. Now, we find + # and fix the cases where that assumption was false. + (my ($simple_invlist_ref, $simple_invmap_ref, undef), $default) + = prop_invmap('Simple_Case_Folding'); + for my $i (0 .. @$simple_invlist_ref - 1 - 1) { + next if $simple_invmap_ref->[$i] == $default; + my $adjust = -1; + for my $j ($simple_invlist_ref->[$i] + .. $simple_invlist_ref->[$i+1] -1) + { + $adjust++; + next if $CASEFOLD{$j}{'status'} eq 'C'; + $CASEFOLD{$j}{'status'} = 'S'; + $CASEFOLD{$j}{'simple'} + = $CASEFOLD{$j}{'mapping'} + = sprintf("%04X", $simple_invmap_ref->[$i] + $adjust); + $CASEFOLD{$j}{'code'} = sprintf("%04X", $j); + $CASEFOLD{$j}{'turkic'} = ""; + } + } - # There can't be a simple without a full, and simple - # overrides all but full + # We hard-code in the turkish rules + UnicodeVersion() unless defined $v_unicode_version; + if ($v_unicode_version ge v3.2.0) { - $CASEFOLD{$code}{'simple'} = $3; - $CASEFOLD{$code}{'mapping'} = $3; - $CASEFOLD{$code}{'status'} = $2; - } elsif ($2 eq 'T') { - $CASEFOLD{$code}{'turkic'} = $3; - } # else can't happen because only [CIFST] are possible - } - } - close($CASEFOLDFH); - } + # These two code points should already have regular entries, so + # just fill in the turkish fields + $CASEFOLD{ord('I')}{'turkic'} = '0131'; + $CASEFOLD{0x130}{'turkic'} = sprintf "%04X", ord('i'); + } + elsif ($v_unicode_version ge v3.1.0) { + + # These two code points don't have entries otherwise. + $CASEFOLD{0x130}{'code'} = '0130'; + $CASEFOLD{0x131}{'code'} = '0131'; + $CASEFOLD{0x130}{'status'} = $CASEFOLD{0x131}{'status'} = 'I'; + $CASEFOLD{0x130}{'turkic'} + = $CASEFOLD{0x130}{'mapping'} + = $CASEFOLD{0x130}{'full'} + = $CASEFOLD{0x130}{'simple'} + = $CASEFOLD{0x131}{'turkic'} + = $CASEFOLD{0x131}{'mapping'} + = $CASEFOLD{0x131}{'full'} + = $CASEFOLD{0x131}{'simple'} + = sprintf "%04X", ord('i'); + } } } @@ -1059,6 +1164,55 @@ sub casefold { return $CASEFOLD{$code}; } +=head2 B<all_casefolds()> + + + use Unicode::UCD 'all_casefolds'; + + my $all_folds_ref = all_casefolds(); + foreach my $char_with_casefold (sort { $a <=> $b } + keys %$all_folds_ref) + { + printf "%04X:", $char_with_casefold; + my $casefold = $all_folds_ref->{$char_with_casefold}; + + # Get folds for $char_with_casefold + + my @full_fold_hex = split / /, $casefold->{'full'}; + my $full_fold_string = + join "", map {chr(hex($_))} @full_fold_hex; + print " full=", join " ", @full_fold_hex; + my @turkic_fold_hex = + split / /, ($casefold->{'turkic'} ne "") + ? $casefold->{'turkic'} + : $casefold->{'full'}; + my $turkic_fold_string = + join "", map {chr(hex($_))} @turkic_fold_hex; + print "; turkic=", join " ", @turkic_fold_hex; + if (defined $casefold && $casefold->{'simple'} ne "") { + my $simple_fold_hex = $casefold->{'simple'}; + my $simple_fold_string = chr(hex($simple_fold_hex)); + print "; simple=$simple_fold_hex"; + } + print "\n"; + } + +This returns all the case foldings in the current version of Unicode in the +form of a reference to a hash. Each key to the hash is the decimal +representation of a Unicode character that has a casefold to other than +itself. The casefold of a semi-colon is itself, so it isn't in the hash; +likewise for a lowercase "a", but there is an entry for a capital "A". The +hash value for each key is another hash, identical to what is returned by +L</casefold()> if called with that code point as its argument. So the value +C<< all_casefolds()->{ord("A")}' >> is equivalent to C<casefold(ord("A"))>; + +=cut + +sub all_casefolds () { + _casefold() unless %CASEFOLD; + return _dclone \%CASEFOLD; +} + =head2 B<casespec()> use Unicode::UCD 'casespec'; @@ -1161,15 +1315,25 @@ my %CASESPEC; sub _casespec { unless (%CASESPEC) { - if (openunicode(\$CASESPECFH, "SpecialCasing.txt")) { + UnicodeVersion() unless defined $v_unicode_version; + if ($v_unicode_version lt v2.1.8) { + %CASESPEC = {}; + } + elsif (openunicode(\$CASESPECFH, "SpecialCasing.txt")) { local $_; local $/ = "\n"; while (<$CASESPECFH>) { if (/^([0-9A-F]+); ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; (\w+(?: \w+)*)?/) { + my ($hexcode, $lower, $title, $upper, $condition) = ($1, $2, $3, $4, $5); my $code = hex($hexcode); - if (exists $CASESPEC{$code}) { + + # In 2.1.8, there were duplicate entries; ignore all but + # the first one -- there were no conditions in the file + # anyway. + if (exists $CASESPEC{$code} && $v_unicode_version ne v2.1.8) + { if (exists $CASESPEC{$code}->{code}) { my ($oldlower, $oldtitle, @@ -1222,7 +1386,7 @@ sub casespec { _casespec() unless %CASESPEC; - return ref $CASESPEC{$code} ? dclone $CASESPEC{$code} : $CASESPEC{$code}; + return ref $CASESPEC{$code} ? _dclone $CASESPEC{$code} : $CASESPEC{$code}; } =head2 B<namedseq()> @@ -1307,16 +1471,6 @@ sub namedseq { my %NUMERIC; sub _numeric { - - # Unicode 6.0 instituted the rule that only digits in a consecutive - # block of 10 would be considered decimal digits. Before that, the only - # problematic code point that I'm (khw) aware of is U+019DA, NEW TAI LUE - # THAM DIGIT ONE, which is an alternate form of U+019D1, NEW TAI LUE DIGIT - # ONE. The code could be modified to handle that, but not bothering, as - # in TUS 6.0, U+19DA was changed to Nt=Di. - if ((pack "C*", split /\./, UnicodeVersion()) lt 6.0.0) { - croak __PACKAGE__, "::num requires Unicode 6.0 or greater" - } my @numbers = _read_table("To/Nv.pl"); foreach my $entry (@numbers) { my ($start, $end, $value) = @$entry; @@ -1427,14 +1581,43 @@ sub num { return if $string =~ /\D/; my $first_ord = ord(substr($string, 0, 1)); my $value = $NUMERIC{$first_ord}; + + # To be a valid decimal number, it should be in a block of 10 consecutive + # characters, whose values are 0, 1, 2, ... 9. Therefore this digit's + # value is its offset in that block from the character that means zero. my $zero_ord = $first_ord - $value; + # Unicode 6.0 instituted the rule that only digits in a consecutive + # block of 10 would be considered decimal digits. If this is an earlier + # release, we verify that this first character is a member of such a + # block. That is, that the block of characters surrounding this one + # consists of all \d characters whose numeric values are the expected + # ones. + UnicodeVersion() unless defined $v_unicode_version; + if ($v_unicode_version lt v6.0.0) { + for my $i (0 .. 9) { + my $ord = $zero_ord + $i; + return unless chr($ord) =~ /\d/; + my $numeric = $NUMERIC{$ord}; + return unless defined $numeric; + return unless $numeric == $i; + } + } + for my $i (1 .. $length -1) { + + # Here we know either by verifying, or by fact of the first character + # being a \d in Unicode 6.0 or later, that any character between the + # character that means 0, and 9 positions above it must be \d, and + # must have its value correspond to its offset from the zero. Any + # characters outside these 10 do not form a legal number for this + # function. my $ord = ord(substr($string, $i, 1)); my $digit = $ord - $zero_ord; return unless $digit >= 0 && $digit <= 9; $value = $value * 10 + $digit; } + return $value; } @@ -1676,7 +1859,7 @@ sub prop_aliases ($) { # The full name is in element 1. return $list_ref->[1] unless wantarray; - return @{dclone $list_ref}; + return @{_dclone $list_ref}; } =pod @@ -1815,7 +1998,7 @@ sub prop_value_aliases ($$) { # The full name is in element 1. return $list_ref->[1] unless wantarray; - return @{dclone $list_ref}; + return @{_dclone $list_ref}; } return $list_ref->[0] unless wantarray; @@ -1842,7 +2025,8 @@ by the input parameter string: prints: 0, 1114112 -An empty list is returned if the input is unknown; the number of elements in +If the input is unknown C<undef> is returned in scalar context; an empty-list +in list context. If the input is known, the number of elements in the list is returned if called in scalar context. L<perluniprops|perluniprops/Properties accessible through \p{} and \P{}> gives @@ -1952,8 +2136,12 @@ properties, and will return C<undef> if called with one of those. our %loose_defaults; our $MAX_UNICODE_CODEPOINT; -sub prop_invlist ($) { +sub prop_invlist ($;$) { my $prop = $_[0]; + + # Undocumented way to get at Perl internal properties + my $internal_ok = defined $_[1] && $_[1] eq '_perl_core_internal_ok'; + return if ! defined $prop; require "utf8_heavy.pl"; @@ -1970,7 +2158,7 @@ sub prop_invlist ($) { || ref $swash eq "" || $swash->{'BITS'} != 1 || $swash->{'USER_DEFINED'} - || $prop =~ /^\s*_/; + || (! $internal_ok && $prop =~ /^\s*_/); if ($swash->{'EXTRAS'}) { carp __PACKAGE__, "::prop_invlist: swash returned for $prop unexpectedly has EXTRAS magic"; @@ -2075,7 +2263,8 @@ sub prop_invlist ($) { sub _search_invlist { # Find the range in the inversion list which contains a code point; that - # is, find i such that l[i] <= code_point < l[i+1] + # is, find i such that l[i] <= code_point < l[i+1]. Returns undef if no + # such i. # If this is ever made public, could use to speed up .t specials. Would # need to use code point argument, as in other functions in this pm @@ -2085,7 +2274,10 @@ sub _search_invlist { # Verify non-neg numeric XXX my $max_element = @$list_ref - 1; - return if ! $max_element < 0; # Undef if list is empty. + + # Return undef if list is empty or requested item is before the first element. + return if $max_element < 0; + return if $code_point < $list_ref->[0]; # Short cut something at the far-end of the table. This also allows us to # refer to element [$i+1] without fear of being out-of-bounds in the loop @@ -2431,7 +2623,7 @@ means that all the elements of the map array are either rational numbers or the string C<"NaN">, meaning "Not a Number". A rational number is either an integer, or two integers separated by a solidus (C<"/">). The second integer represents the denominator of the division implied by the solidus, and is -actually always positive, so it is guaranteed not to be 0 and to not to be +actually always positive, so it is guaranteed not to be 0 and to not be signed. When the element is a plain integer (without the solidus), it may need to be adjusted to get the correct value by adding the offset, just as other C<"a"> properties. No adjustment is needed for @@ -2443,7 +2635,7 @@ can use something like this: my ($invlist_ref, $invmap_ref, $format) = prop_invmap($property); if ($format && $format eq "ar") { - map { $_ = eval $_ } @$invmap_ref; + map { $_ = eval $_ if $_ ne 'NaN' } @$map_ref; } Here's some entries from the output of the property "Nv", which has format @@ -2665,7 +2857,7 @@ RETRY: $prop = "age"; goto RETRY; } - elsif ($second_try =~ / ^ s ( cf | [ltu] c ) $ /x) { + elsif ($second_try =~ / ^ s ( cf | fc | [ltu] c ) $ /x) { # These properties use just the LIST part of the full mapping, # which includes the simple maps that are otherwise overridden by @@ -2674,7 +2866,11 @@ RETRY: $overrides = -1; # The full name is the simple name stripped of its initial 's' - $prop = $second_try =~ s/^s//r; + $prop = $1; + + # .. except for this case + $prop = 'cf' if $prop eq 'fc'; + goto RETRY; } elsif ($second_try eq "blk") { @@ -2733,7 +2929,7 @@ RETRY: my ($hex_code_point, $name) = split "\t", $line; # Weeds out all comments, blank lines, and named sequences - next if $hex_code_point =~ /\P{ASCII_HEX_DIGIT}/; + next if $hex_code_point =~ /[^[:xdigit:]]/a; my $code_point = hex $hex_code_point; @@ -2791,8 +2987,11 @@ RETRY: $decomps{'LIST'} = ""; # This property has one special range not in the file: for the - # hangul syllables - my $done_hangul = 0; # Have we done the hangul range. + # hangul syllables. But not in Unicode version 1. + UnicodeVersion() unless defined $v_unicode_version; + my $done_hangul = ($v_unicode_version lt v2.0.0) + ? 1 + : 0; # Have we done the hangul range ? foreach my $line (split "\n", $original) { my ($hex_lower, $hex_upper, $type_and_map) = split "\t", $line; my $code_point = hex $hex_lower; @@ -2822,6 +3021,12 @@ RETRY: : "<hangul syllable>"; } + if ($value =~ / / && $hex_upper ne "" && $hex_upper ne $hex_lower) { + $line = sprintf("%04X\t%s\t%s", hex($hex_lower) + 1, $hex_upper, $value); + $hex_upper = ""; + $redo = 1; + } + # And append this to our constructed LIST. $decomps{'LIST'} .= "$hex_lower\t$hex_upper\t$value\n"; @@ -2863,8 +3068,8 @@ RETRY: } else { - # These should all single-element ranges. - croak __PACKAGE__, "::prop_invmap: Not expecting a mapping with multiple code points in a multi-element range, $ranges[$i]" if $hex_end ne ""; + # These should all be single-element ranges. + croak __PACKAGE__, "::prop_invmap: Not expecting a mapping with multiple code points in a multi-element range, $ranges[$i]" if $hex_end ne "" && $hex_end ne $hex_begin; # Convert them to decimal, as that's what's expected. $list .= "$hex_begin\t\t" @@ -3334,6 +3539,7 @@ sub UnicodeVersion { croak __PACKAGE__, "::VERSION: strange version '$UNICODEVERSION'" unless $UNICODEVERSION =~ /^\d+(?:\.\d+)+$/; } + $v_unicode_version = pack "C*", split /\./, $UNICODEVERSION; return $UNICODEVERSION; } @@ -3342,7 +3548,8 @@ sub UnicodeVersion { The difference between a block and a script is that scripts are closer to the linguistic notion of a set of code points required to present languages, while block is more of an artifact of the Unicode code point -numbering and separation into blocks of (mostly) 256 code points. +numbering and separation into blocks of consecutive code points (so far the +size of a block is some multiple of 16, like 128 or 256). For example the Latin B<script> is spread over several B<blocks>, such as C<Basic Latin>, C<Latin 1 Supplement>, C<Latin Extended-A>, and diff --git a/gnu/usr.bin/perl/lib/Unicode/UCD.t b/gnu/usr.bin/perl/lib/Unicode/UCD.t index 2e5a741f0f9..e070defbeae 100644 --- a/gnu/usr.bin/perl/lib/Unicode/UCD.t +++ b/gnu/usr.bin/perl/lib/Unicode/UCD.t @@ -19,7 +19,8 @@ use Test::More; use Unicode::UCD 'charinfo'; -$/ = 7; +my $input_record_separator = 7; # Make sure Unicode::UCD isn't affected by +$/ = $input_record_separator; # setting this. my $charinfo; @@ -342,7 +343,7 @@ is($bt->{AL}, 'Right-to-Left Arabic', 'AL is Right-to-Left Arabic'); # If this fails, then maybe one should look at the Unicode changes to see # what else might need to be updated. -is(Unicode::UCD::UnicodeVersion, '6.1.0', 'UnicodeVersion'); +is(Unicode::UCD::UnicodeVersion, '6.2.0', 'UnicodeVersion'); use Unicode::UCD qw(compexcl); @@ -373,9 +374,9 @@ is($casefold->{full}, '0073 0073', 'casefold 0xDF full'); is($casefold->{simple}, "", 'casefold 0xDF simple'); is($casefold->{turkic}, "", 'casefold 0xDF turkic'); -# Do different tests depending on if version <= 3.1, or not. -(my $version = Unicode::UCD::UnicodeVersion) =~ /^(\d+)\.(\d+)/; -if (defined $1 && ($1 <= 2 || $1 == 3 && defined $2 && $2 <= 1)) { +# Do different tests depending on if version < 3.2, or not. +my $v_unicode_version = pack "C*", split /\./, Unicode::UCD::UnicodeVersion(); +if ($v_unicode_version lt v3.2.0) { $casefold = casefold(0x130); is($casefold->{code}, '0130', 'casefold 0x130 code'); @@ -469,11 +470,13 @@ is(Unicode::UCD::_getcode('U+123x'), undef, "_getcode(x123)"); { my $r1 = charscript('Latin'); - my $n1 = @$r1; - is($n1, 30, "number of ranges in Latin script (Unicode 6.1.0)"); - shift @$r1 while @$r1; - my $r2 = charscript('Latin'); - is(@$r2, $n1, "modifying results should not mess up internal caches"); + if (ok(defined $r1, "Found Latin script")) { + my $n1 = @$r1; + is($n1, 30, "number of ranges in Latin script (Unicode 6.1.0)"); + shift @$r1 while @$r1; + my $r2 = charscript('Latin'); + is(@$r2, $n1, "modifying results should not mess up internal caches"); + } } { @@ -550,14 +553,18 @@ is_deeply(\@list, ], "prop_aliases('perldecimaldigit') returns Perl_Decimal_Digit as both short and full names"); # Get the official Unicode property name synonyms and test them. + +SKIP: { +skip "PropertyAliases.txt is not in this Unicode version", 1 if $v_unicode_version lt v3.2.0; open my $props, "<", "../lib/unicore/PropertyAliases.txt" or die "Can't open Unicode PropertyAliases.txt"; -$/ = "\n"; +local $/ = "\n"; while (<$props>) { s/\s*#.*//; # Remove comments next if /^\s* $/x; # Ignore empty and comment lines chomp; + local $/ = $input_record_separator; my $count = 0; # 0th field in line is short name; 1th is long name my $short_name; my $full_name; @@ -615,6 +622,7 @@ while (<$props>) { $count++; } } +} # End of SKIP block # Now test anything we can find that wasn't covered by the tests of the # official properties. We have no way of knowing if mktables omitted a Perl @@ -701,15 +709,20 @@ is(prop_value_aliases("gc", "isC"), undef, "prop_value_aliases('gc', 'isC') retu # correct. my %pva_tested; # List of things already tested. + +SKIP: { +skip "PropValueAliases.txt is not in this Unicode version", 1 if $v_unicode_version lt v3.2.0; open my $propvalues, "<", "../lib/unicore/PropValueAliases.txt" or die "Can't open Unicode PropValueAliases.txt"; +local $/ = "\n"; while (<$propvalues>) { s/\s*#.*//; # Remove comments next if /^\s* $/x; # Ignore empty and comment lines chomp; + local $/ = $input_record_separator; # Fix typo in official input file - s/CCC133/CCC132/g if $version eq "6.1.0"; + s/CCC133/CCC132/g if $v_unicode_version eq v6.1.0; my @fields = split /\s*;\s*/; # Fields are separated by semi-colons my $prop = shift @fields; # 0th field is the property, @@ -801,6 +814,7 @@ while (<$propvalues>) { $count++; } } +} # End of SKIP block # And test as best we can, the non-official pva's that mktables generates. foreach my $hash (\%utf8::loose_to_file_of, \%utf8::stricter_to_file_of) { @@ -953,6 +967,7 @@ sub fail_with_diff ($$$$) { require File::Temp; my $off = File::Temp->new(); + local $/ = "\n"; chomp $official; print $off $official, "\n"; close $off || die "Can't close official"; @@ -1037,7 +1052,9 @@ foreach my $set_of_tables (\%utf8::stricter_to_file_of, \%utf8::loose_to_file_of # Get rid of any trailing space and comments in the file. $official =~ s/\s*(#.*)?$//mg; + local $/ = "\n"; chomp $official; + $/ = $input_record_separator; # If we are to test against an inverted file, it is easier to invert # our array than the file. @@ -1091,7 +1108,9 @@ foreach my $set_of_tables (\%utf8::stricter_to_file_of, \%utf8::loose_to_file_of if ($i == @tested - 1 && $tested[$i] <= 0x10FFFF) { $tested .= sprintf("%04X\t10FFFF\n", $tested[$i]); } + local $/ = "\n"; chomp $tested; + $/ = $input_record_separator; if ($tested ne $official) { fail_with_diff($mod_table, $official, $tested, "prop_invlist"); next; @@ -1407,11 +1426,15 @@ foreach my $prop (keys %props) { } } } + local $/ = "\n"; chomp $official; + $/ = $input_record_separator; - # If there are any special elements, get a reference to them. + # Get the format for the file, and if there are any special elements, + # get a reference to them. my $swash_name = $utf8::file_to_swash_name{$base_file}; my $specials_ref; + my $file_format; if ($swash_name) { $specials_ref = $utf8::SwashInfo{$swash_name}{'specials_name'}; if ($specials_ref) { @@ -1420,6 +1443,8 @@ foreach my $prop (keys %props) { no strict 'refs'; $specials_ref = \%{$specials_ref}; } + + $file_format = $utf8::SwashInfo{$swash_name}{'format'}; } # Certain of the proxy properties have to be adjusted to match the @@ -1512,15 +1537,14 @@ foreach my $prop (keys %props) { # specials are superfluous. undef $specials_ref; } - elsif ($name eq 'bmg') { + elsif ($format !~ /^a/ && defined $file_format && $file_format eq 'x') { - # For this property, the file is output using hex notation for the - # map, with all ranges equal to length 1. Convert from hex to - # decimal. + # For these properties the file is output using hex notation for the + # map. Convert from hex to decimal. my @lines = split "\n", $official; foreach my $line (@lines) { - my ($code_point, $map) = split "\t\t", $line; - $line = $code_point . "\t\t" . hex $map; + my ($lower, $upper, $map) = split "\t", $line; + $line = "$lower\t$upper\t" . hex $map; } $official = join "\n", @lines; } @@ -1731,7 +1755,9 @@ foreach my $prop (keys %props) { # Here are done with generating what the file should look like + local $/ = "\n"; chomp $tested_map; + $/ = $input_record_separator; # And compare. if ($tested_map ne $official) { @@ -1801,7 +1827,9 @@ foreach my $prop (keys %props) { $official =~ s/$hex_code_point \t $alias \n //x; } } + local $/ = "\n"; chomp $official; + $/ = $input_record_separator; # Here have adjusted the file. We also have to adjust the returned # inversion map by checking and deleting all the lines in it that @@ -1889,7 +1917,9 @@ foreach my $prop (keys %props) { # Finished creating the string from the inversion map. Can compare # with what the file is. + local $/ = "\n"; chomp $tested_map; + $/ = $input_record_separator; if ($tested_map ne $official) { fail_with_diff($mod_prop, $official, $tested_map, "prop_invmap"); next PROPERTY; @@ -1996,4 +2026,5 @@ foreach my $prop (keys %props) { pass("prop_invmap('$mod_prop')"); } +ok($/ eq $input_record_separator, "The record separator didn't get overridden"); done_testing(); |