diff options
author | 2019-02-13 21:15:00 +0000 | |
---|---|---|
committer | 2019-02-13 21:15:00 +0000 | |
commit | 9f11ffb7133c203312a01e4b986886bc88c7d74b (patch) | |
tree | 6618511204c614b20256e4ef9dea39a7b311d638 /gnu/usr.bin/perl/lib/Unicode | |
parent | Import perl-5.28.1 (diff) | |
download | wireguard-openbsd-9f11ffb7133c203312a01e4b986886bc88c7d74b.tar.xz wireguard-openbsd-9f11ffb7133c203312a01e4b986886bc88c7d74b.zip |
Fix merge issues, remove excess files - match perl-5.28.1 dist
looking good sthen@, Great! bluhm@
Diffstat (limited to 'gnu/usr.bin/perl/lib/Unicode')
-rw-r--r-- | gnu/usr.bin/perl/lib/Unicode/UCD.pm | 201 | ||||
-rw-r--r-- | gnu/usr.bin/perl/lib/Unicode/UCD.t | 47 |
2 files changed, 169 insertions, 79 deletions
diff --git a/gnu/usr.bin/perl/lib/Unicode/UCD.pm b/gnu/usr.bin/perl/lib/Unicode/UCD.pm index 13c2c785981..6733e119ea4 100644 --- a/gnu/usr.bin/perl/lib/Unicode/UCD.pm +++ b/gnu/usr.bin/perl/lib/Unicode/UCD.pm @@ -5,7 +5,7 @@ use warnings; no warnings 'surrogate'; # surrogates can be inputs to this use charnames (); -our $VERSION = '0.64'; +our $VERSION = '0.70'; require Exporter; @@ -98,6 +98,9 @@ Unicode::UCD - Unicode character database use Unicode::UCD 'search_invlist'; my $index = search_invlist(\@invlist, $code_point); + # The following function should be used only internally in + # implementations of the Unicode Normalization Algorithm, and there + # are better choices than it. use Unicode::UCD 'compexcl'; my $compexcl = compexcl($codepoint); @@ -128,7 +131,8 @@ Examples: 223 # Decimal 223 in native character set 0223 # Hexadecimal 223, native (= 547 decimal) - 0xDF # Hexadecimal DF, native (= 223 decimal + 0xDF # Hexadecimal DF, native (= 223 decimal) + '0xDF' # String form of hexadecimal (= 223 decimal) 'U+DF' # Hexadecimal DF, in Unicode's character set (= LATIN SMALL LETTER SHARP S) @@ -136,28 +140,18 @@ Note that the largest code point in Unicode is U+10FFFF. =cut -my $BLOCKSFH; -my $VERSIONFH; -my $CASEFOLDFH; -my $CASESPECFH; -my $NAMEDSEQFH; my $v_unicode_version; # v-string. sub openunicode { - my ($rfh, @path) = @_; - my $f; - unless (defined $$rfh) { - for my $d (@INC) { - use File::Spec; - $f = File::Spec->catfile($d, "unicore", @path); - last if open($$rfh, $f); - undef $f; - } - croak __PACKAGE__, ": failed to find ", - File::Spec->catfile(@path), " in @INC" - unless defined $f; + my (@path) = @_; + my $rfh; + for my $d (@INC) { + use File::Spec; + my $f = File::Spec->catfile($d, "unicore", @path); + return $rfh if open($rfh, '<', $f); } - return $f; + croak __PACKAGE__, ": failed to find ", + File::Spec->catfile("unicore", @path), " in @INC"; } sub _dclone ($) { # Use Storable::dclone if available; otherwise emulate it. @@ -334,7 +328,8 @@ See L</Blocks versus Scripts>. the script I<code> belongs to. The L</prop_value_aliases()> function can be used to get all the synonyms -of the script name. +of the script name. Note that this is the older "Script" property value, and +not the improved "Script_Extensions" value. See L</Blocks versus Scripts>. @@ -694,14 +689,14 @@ that are internal-only. =cut -sub charprop ($$) { - my ($input_cp, $prop) = @_; +sub charprop ($$;$) { + my ($input_cp, $prop, $internal_ok) = @_; my $cp = _getcode($input_cp); croak __PACKAGE__, "::charprop: unknown code point '$input_cp'" unless defined $cp; my ($list_ref, $map_ref, $format, $default) - = prop_invmap($prop); + = prop_invmap($prop, $internal_ok); return undef unless defined $list_ref; my $i = search_invlist($list_ref, $cp); @@ -875,10 +870,11 @@ sub _charblocks { push @BLOCKS, $subrange; push @{$BLOCKS{'No_Block'}}, $subrange; } - elsif (openunicode(\$BLOCKSFH, "Blocks.txt")) { + else { + my $blocksfh = openunicode("Blocks.txt"); local $_; local $/ = "\n"; - while (<$BLOCKSFH>) { + while (<$blocksfh>) { # Old versions used a different syntax to mark the range. $_ =~ s/;\s+/../ if $v_unicode_version lt v3.1.0; @@ -890,7 +886,6 @@ sub _charblocks { push @{$BLOCKS{$3}}, $subrange; } } - close($BLOCKSFH); if (! IS_ASCII_PLATFORM) { # The first two blocks, through 0xFF, are wrong on EBCDIC # platforms. @@ -962,6 +957,10 @@ that it doesn't have scripts, this function returns C<"Unknown">. The L</prop_value_aliases()> function can be used to get all the synonyms of the script name. +Note that the Script_Extensions property is an improved version of the Script +property, and you should probably be using that instead, with the +L</charprop()> function. + If supplied with an argument that can't be a code point, charscript() tries to do the opposite and interpret the argument as a script name. The return value is a I<range set>: an anonymous array of arrays that contain @@ -1052,7 +1051,9 @@ names as the keys, and the code point ranges (see L</charscript()>) as the values. L<prop_invmap("script")|/prop_invmap()> can be used to get this same data in a -different type of data structure. +different type of data structure. Since the Script_Extensions property is an +improved version of the Script property, you should instead use +L<prop_invmap("scx")|/prop_invmap()>. L<C<prop_values("Script")>|/prop_values()> can be used to get all the known script names as a list, without the code point ranges. @@ -1199,6 +1200,12 @@ sub bidi_types { =head2 B<compexcl()> +WARNING: Unicode discourages the use of this function or any of the +alternative mechanisms listed in this section (the documentation of +C<compexcl()>), except internally in implementations of the Unicode +Normalization Algorithm. You should be using L<Unicode::Normalize> directly +instead of these. Using these will likely lead to half-baked results. + use Unicode::UCD 'compexcl'; my $compexcl = compexcl(0x09dc); @@ -1631,13 +1638,11 @@ my %CASESPEC; sub _casespec { unless (%CASESPEC) { UnicodeVersion() unless defined $v_unicode_version; - if ($v_unicode_version lt v2.1.8) { - %CASESPEC = {}; - } - elsif (openunicode(\$CASESPECFH, "SpecialCasing.txt")) { + if ($v_unicode_version ge v2.1.8) { + my $casespecfh = openunicode("SpecialCasing.txt"); local $_; local $/ = "\n"; - while (<$CASESPECFH>) { + while (<$casespecfh>) { if (/^([0-9A-F]+); ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; (\w+(?: \w+)*)?/) { my ($hexcode, $lower, $title, $upper, $condition) = @@ -1702,7 +1707,6 @@ sub _casespec { } } } - close($CASESPECFH); } } } @@ -1752,19 +1756,17 @@ my %NAMEDSEQ; sub _namedseq { unless (%NAMEDSEQ) { - if (openunicode(\$NAMEDSEQFH, "Name.pl")) { - local $_; - local $/ = "\n"; - while (<$NAMEDSEQFH>) { - if (/^ [0-9A-F]+ \ /x) { - chomp; - my ($sequence, $name) = split /\t/; - my @s = map { chr(hex($_)) } split(' ', $sequence); - $NAMEDSEQ{$name} = join("", @s); - } - } - close($NAMEDSEQFH); - } + my $namedseqfh = openunicode("Name.pl"); + local $_; + local $/ = "\n"; + while (<$namedseqfh>) { + if (/^ [0-9A-F]+ \ /x) { + chomp; + my ($sequence, $name) = split /\t/; + my @s = map { chr(hex($_)) } split(' ', $sequence); + $NAMEDSEQ{$name} = join("", @s); + } + } } } @@ -1848,14 +1850,18 @@ sub _numeric { my $val = num("123"); my $one_quarter = num("\N{VULGAR FRACTION 1/4}"); + my $val = num("12a", \$valid_length); # $valid_length contains 2 C<num()> returns the numeric value of the input Unicode string; or C<undef> if it doesn't think the entire string has a completely valid, safe numeric value. +If called with an optional second parameter, a reference to a scalar, C<num()> +will set the scalar to the length of any valid initial substring; or to 0 if none. If the string is just one character in length, the Unicode numeric value -is returned if it has one, or C<undef> otherwise. Note that this need -not be a whole number. C<num("\N{TIBETAN DIGIT HALF ZERO}")>, for -example returns -0.5. +is returned if it has one, or C<undef> otherwise. If the optional scalar ref +is passed, it would be set to 1 if the return is valid; or 0 if the return is +C<undef>. Note that the numeric value returned need not be a whole number. +C<num("\N{TIBETAN DIGIT HALF ZERO}")>, for example returns -0.5. =cut @@ -1877,7 +1883,9 @@ is returned. A further restriction is that the digits all have to be of the same form. A half-width digit mixed with a full-width one will return C<undef>. The Arabic script has two sets of digits; C<num> will return C<undef> unless all the digits in the string come from the same -set. +set. In all cases, the optional scalar ref parameter is set to how +long any valid initial substring of digits is; hence it will be set to the +entire string length if the main return value is not C<undef>. C<num> errs on the side of safety, and there may be valid strings of decimal digits that it doesn't recognize. Note that Unicode defines @@ -1901,16 +1909,30 @@ change these into digits, and then call C<num> on the result. # consider those, and return the <decomposition> type in the second # array element. -sub num { - my $string = $_[0]; +sub num ($;$) { + my ($string, $retlen_ref) = @_; + + use feature 'unicode_strings'; _numeric unless %NUMERIC; + $$retlen_ref = 0 if $retlen_ref; # Assume will fail + + my $length = length $string; + return if $length == 0; - my $length = length($string); - return $NUMERIC{ord($string)} if $length == 1; - return if $string =~ /\D/; my $first_ord = ord(substr($string, 0, 1)); + return if ! exists $NUMERIC{$first_ord} + || ! defined $NUMERIC{$first_ord}; + + # Here, we know the first character is numeric my $value = $NUMERIC{$first_ord}; + $$retlen_ref = 1 if $retlen_ref; # Assume only this one is numeric + + return $value if $length == 1; + + # Here, the input is longer than a single character. To be valid, it must + # be entirely decimal digits, which means it must start with one. + return if $string =~ / ^ \D /x; # To be a valid decimal number, it should be in a block of 10 consecutive # characters, whose values are 0, 1, 2, ... 9. Therefore this digit's @@ -1922,7 +1944,8 @@ sub num { # release, we verify that this first character is a member of such a # block. That is, that the block of characters surrounding this one # consists of all \d characters whose numeric values are the expected - # ones. + # ones. If not, then this single character is numeric, but the string as + # a whole is not considered to be. UnicodeVersion() unless defined $v_unicode_version; if ($v_unicode_version lt v6.0.0) { for my $i (0 .. 9) { @@ -1944,10 +1967,14 @@ sub num { # function. my $ord = ord(substr($string, $i, 1)); my $digit = $ord - $zero_ord; - return unless $digit >= 0 && $digit <= 9; + if ($digit < 0 || $digit > 9) { + $$retlen_ref = $i if $retlen_ref; + return; + } $value = $value * 10 + $digit; } + $$retlen_ref = $length if $retlen_ref; return $value; } @@ -2427,8 +2454,8 @@ sub prop_value_aliases ($$) { return ( $list_ref->[0], $list_ref->[0] ); } -# All 1 bits is the largest possible UV. -$Unicode::UCD::MAX_CP = ~0; +# All 1 bits but the top one is the largest possible IV. +$Unicode::UCD::MAX_CP = (~0) >> 1; =pod @@ -2458,7 +2485,7 @@ resolving the input property's name as is done for regular expressions. These are also specified in L<perluniprops|perluniprops/Properties accessible through \p{} and \P{}>. Examples of using the "property=value" form are: - say join ", ", prop_invlist("Script=Shavian"); + say join ", ", prop_invlist("Script_Extensions=Shavian"); prints: 66640, 66688 @@ -2520,11 +2547,7 @@ code points that have the property-value: for (my $i = 0; $i < @invlist; $i += 2) { my $upper = ($i + 1) < @invlist ? $invlist[$i+1] - 1 # In range - : $Unicode::UCD::MAX_CP; # To infinity. You may want - # to stop much much earlier; - # going this high may expose - # perl deficiencies with very - # large numbers. + : $Unicode::UCD::MAX_CP; # To infinity. for my $j ($invlist[$i] .. $upper) { push @full_list, $j; } @@ -3043,6 +3066,8 @@ L<Unicode::Normalize::NFD()|Unicode::Normalize>. Note that the mapping is the one that is specified in the Unicode data files, and to get the final decomposition, it may need to be applied recursively. +Unicode in fact discourages use of this property except internally in +implementations of the Unicode Normalization Algorithm. The fourth (index [3]) element (C<$default>) in the list returned for this format is 0. @@ -3136,11 +3161,48 @@ return C<undef> if called with one of those. The returned values for the Perl extension properties, such as C<Any> and C<Greek> are somewhat misleading. The values are either C<"Y"> or C<"N>". All Unicode properties are bipartite, so you can actually use the C<"Y"> or -C<"N>" in a Perl regular rexpression for these, like C<qr/\p{ID_Start=Y/}> or +C<"N>" in a Perl regular expression for these, like C<qr/\p{ID_Start=Y/}> or C<qr/\p{Upper=N/}>. But the Perl extensions aren't specified this way, only like C</qr/\p{Any}>, I<etc>. You can't actually use the C<"Y"> and C<"N>" in them. +=head3 Getting every available name + +Instead of reading the Unicode Database directly from files, as you were able +to do for a long time, you are encouraged to use the supplied functions. So, +instead of reading C<Name.pl> - which may disappear without notice in the +future - directly, as with + + my (%name, %cp); + for (split m/\s*\n/ => do "unicore/Name.pl") { + my ($cp, $name) = split m/\t/ => $_; + $cp{$name} = $cp; + $name{$cp} = $name unless $cp =~ m/ /; + } + +You ought to use L</prop_invmap()> like this: + + my (%name, %cp, %cps, $n); + # All codepoints + foreach my $cat (qw( Name Name_Alias )) { + my ($codepoints, $names, $format, $default) = prop_invmap($cat); + # $format => "n", $default => "" + foreach my $i (0 .. @$codepoints - 2) { + my ($cp, $n) = ($codepoints->[$i], $names->[$i]); + # If $n is a ref, the same codepoint has multiple names + foreach my $name (ref $n ? @$n : $n) { + $name{$cp} //= $name; + $cp{$name} //= $cp; + } + } + } + # Named sequences + { my %ns = namedseq(); + foreach my $name (sort { $ns{$a} cmp $ns{$b} } keys %ns) { + $cp{$name} //= [ map { ord } split "" => $ns{$name} ]; + } + } + =cut # User-defined properties could be handled with some changes to utf8_heavy.pl; @@ -4044,10 +4106,9 @@ my $UNICODEVERSION; sub UnicodeVersion { unless (defined $UNICODEVERSION) { - openunicode(\$VERSIONFH, "version"); + my $versionfh = openunicode("version"); local $/ = "\n"; - chomp($UNICODEVERSION = <$VERSIONFH>); - close($VERSIONFH); + chomp($UNICODEVERSION = <$versionfh>); croak __PACKAGE__, "::VERSION: strange version '$UNICODEVERSION'" unless $UNICODEVERSION =~ /^\d+(?:\.\d+)+$/; } diff --git a/gnu/usr.bin/perl/lib/Unicode/UCD.t b/gnu/usr.bin/perl/lib/Unicode/UCD.t index 83320d34a01..0538bda3055 100644 --- a/gnu/usr.bin/perl/lib/Unicode/UCD.t +++ b/gnu/usr.bin/perl/lib/Unicode/UCD.t @@ -19,7 +19,7 @@ use Test::More; use Unicode::UCD qw(charinfo charprop charprops_all); -my $expected_version = '8.0.0'; +my $expected_version = '10.0.0'; my $current_version = Unicode::UCD::UnicodeVersion; my $v_unicode_version = pack "C*", split /\./, $current_version; my $unknown_script = ($v_unicode_version lt v5.0.0) @@ -819,10 +819,19 @@ use charnames (); # Don't use \N{} on things not in original Unicode # version; else will get a compilation error when this .t # is run on an older version. +my $ret_len; is(num("0"), 0, 'Verify num("0") == 0'); -is(num("98765"), 98765, 'Verify num("98765") == 98765'); -ok(! defined num("98765\N{FULLWIDTH DIGIT FOUR}"), +is(num("0", \$ret_len), 0, 'Verify num("0", \$ret_len) == 0'); +is($ret_len, 1, "... and the returned length is 1"); +ok(! defined num("", \$ret_len), 'Verify num("", \$ret_len) isnt defined'); +is($ret_len, 0, "... and the returned length is 0"); +ok(! defined num("A", \$ret_len), 'Verify num("A") isnt defined'); +is($ret_len, 0, "... and the returned length is 0"); +is(num("98765", \$ret_len), 98765, 'Verify num("98765") == 98765'); +is($ret_len, 5, "... and the returned length is 5"); +ok(! defined num("98765\N{FULLWIDTH DIGIT FOUR}", \$ret_len), 'Verify num("98765\N{FULLWIDTH DIGIT FOUR}") isnt defined'); +is($ret_len, 5, "... but the returned length is 5"); my $tai_lue_2; if ($v_unicode_version ge v4.1.0) { my $tai_lue_1 = charnames::string_vianame("NEW TAI LUE DIGIT ONE"); @@ -834,8 +843,13 @@ if ($v_unicode_version ge v4.1.0) { } if ($v_unicode_version ge v5.2.0) { ok(! defined num($tai_lue_2 - . charnames::string_vianame("NEW TAI LUE THAM DIGIT ONE")), + . charnames::string_vianame("NEW TAI LUE THAM DIGIT ONE"), \$ret_len), 'Verify num("\N{NEW TAI LUE DIGIT TWO}\N{NEW TAI LUE THAM DIGIT ONE}") isnt defined'); + is($ret_len, 1, "... but the returned length is 1"); + ok(! defined num(charnames::string_vianame("NEW TAI LUE THAM DIGIT ONE") + . $tai_lue_2, \$ret_len), + 'Verify num("\N{NEW TAI LUE THAM DIGIT ONE}\N{NEW TAI LUE DIGIT TWO}") isnt defined'); + is($ret_len, 1, "... but the returned length is 1"); } if ($v_unicode_version ge v5.1.0) { my $cham_0 = charnames::string_vianame("CHAM DIGIT ZERO"); @@ -843,8 +857,10 @@ if ($v_unicode_version ge v5.1.0) { 'Verify num("\N{CHAM DIGIT ZERO}\N{CHAM DIGIT THREE}") == 3'); if ($v_unicode_version ge v5.2.0) { ok(! defined num( $cham_0 - . charnames::string_vianame("JAVANESE DIGIT NINE")), + . charnames::string_vianame("JAVANESE DIGIT NINE"), + \$ret_len), 'Verify num("\N{CHAM DIGIT ZERO}\N{JAVANESE DIGIT NINE}") isnt defined'); + is($ret_len, 1, "... but the returned length is 1"); } } is(num("\N{SUPERSCRIPT TWO}"), 2, 'Verify num("\N{SUPERSCRIPT TWO} == 2'); @@ -1415,9 +1431,14 @@ sub fail_with_diff ($$$$) { # For use below to output better messages my ($prop, $official, $constructed, $tested_function_name) = @_; - is($constructed, $official, "$tested_function_name('$prop')"); - diag("Comment out lines " . (__LINE__ - 1) . " through " . (__LINE__ + 1) . " in '$0' on Un*x-like systems to see just the differences. Uses the 'diff' first in your \$PATH"); - return; + if (! $ENV{PERL_DIFF_TOOL}) { + + is($constructed, $official, "$tested_function_name('$prop')"); + + diag("Set environment variable PERL_DIFF_TOOL=diff_tool to see just " + . "the differences."); + return; + } fail("$tested_function_name('$prop')"); @@ -1434,7 +1455,7 @@ sub fail_with_diff ($$$$) { close $gend || die "Can't close gend"; my $diff = File::Temp->new(); - system("diff $off $gend > $diff"); + system("$ENV{PERL_DIFF_TOOL} $off $gend > $diff"); open my $fh, "<", $diff || die "Can't open $diff"; my @diffs = <$fh>; @@ -2695,4 +2716,12 @@ if (! ok(@warnings == 0, "No warnings were generated")) { diag(join "\n", "The warnings are:", @warnings); } +# And make sure that the max code point returned actually fits in an IV, which +# currently range iterators are. +my $count = 0; +for my $i ($Unicode::UCD::MAX_CP - 1 .. $Unicode::UCD::MAX_CP) { + $count++; +} +is($count, 2, "MAX_CP isn't too large"); + done_testing(); |