Fix merge issues, remove excess files - match perl-5.28.1 dist

looking good sthen@, Great! bluhm@
author: afresh1 <afresh1@openbsd.org> 2019-02-13 21:15:00 +0000
committer: afresh1 <afresh1@openbsd.org> 2019-02-13 21:15:00 +0000
commit: 9f11ffb7133c203312a01e4b986886bc88c7d74b (patch)
tree: 6618511204c614b20256e4ef9dea39a7b311d638 /gnu/usr.bin/perl/lib/Unicode
parent: Import perl-5.28.1 (diff)
download: wireguard-openbsd-9f11ffb7133c203312a01e4b986886bc88c7d74b.tar.xz
wireguard-openbsd-9f11ffb7133c203312a01e4b986886bc88c7d74b.zip
2 files changed, 169 insertions, 79 deletions
diff --git a/gnu/usr.bin/perl/lib/Unicode/UCD.pm b/gnu/usr.bin/perl/lib/Unicode/UCD.pm
index 13c2c785981..6733e119ea4 100644
--- a/gnu/usr.bin/perl/lib/Unicode/UCD.pm
+++ b/gnu/usr.bin/perl/lib/Unicode/UCD.pm
@@ -5,7 +5,7 @@ use warnings;
 no warnings 'surrogate';    # surrogates can be inputs to this
 use charnames ();
 
-our $VERSION = '0.64';
+our $VERSION = '0.70';
 
 require Exporter;
 
@@ -98,6 +98,9 @@ Unicode::UCD - Unicode character database
     use Unicode::UCD 'search_invlist';
     my $index = search_invlist(\@invlist, $code_point);
 
+    # The following function should be used only internally in
+    # implementations of the Unicode Normalization Algorithm, and there
+    # are better choices than it.
     use Unicode::UCD 'compexcl';
     my $compexcl = compexcl($codepoint);
 
@@ -128,7 +131,8 @@ Examples:
 
     223     # Decimal 223 in native character set
     0223    # Hexadecimal 223, native (= 547 decimal)
-    0xDF    # Hexadecimal DF, native (= 223 decimal
+    0xDF    # Hexadecimal DF, native (= 223 decimal)
+    '0xDF'  # String form of hexadecimal (= 223 decimal)
     'U+DF'  # Hexadecimal DF, in Unicode's character set
                               (= LATIN SMALL LETTER SHARP S)
 
@@ -136,28 +140,18 @@ Note that the largest code point in Unicode is U+10FFFF.
 
 =cut
 
-my $BLOCKSFH;
-my $VERSIONFH;
-my $CASEFOLDFH;
-my $CASESPECFH;
-my $NAMEDSEQFH;
 my $v_unicode_version;  # v-string.
 
 sub openunicode {
-    my ($rfh, @path) = @_;
-    my $f;
-    unless (defined $$rfh) {
-	for my $d (@INC) {
-	    use File::Spec;
-	    $f = File::Spec->catfile($d, "unicore", @path);
-	    last if open($$rfh, $f);
-	    undef $f;
-	}
-	croak __PACKAGE__, ": failed to find ",
-              File::Spec->catfile(@path), " in @INC"
-	    unless defined $f;
+    my (@path) = @_;
+    my $rfh;
+    for my $d (@INC) {
+        use File::Spec;
+        my $f = File::Spec->catfile($d, "unicore", @path);
+        return $rfh if open($rfh, '<', $f);
     }
-    return $f;
+    croak __PACKAGE__, ": failed to find ",
+        File::Spec->catfile("unicore", @path), " in @INC";
 }
 
 sub _dclone ($) {   # Use Storable::dclone if available; otherwise emulate it.
@@ -334,7 +328,8 @@ See L</Blocks versus Scripts>.
 
 the script I<code> belongs to.
 The L</prop_value_aliases()> function can be used to get all the synonyms
-of the script name.
+of the script name.  Note that this is the older "Script" property value, and
+not the improved "Script_Extensions" value.
 
 See L</Blocks versus Scripts>.
 
@@ -694,14 +689,14 @@ that are internal-only.
 
 =cut
 
-sub charprop ($$) {
-    my ($input_cp, $prop) = @_;
+sub charprop ($$;$) {
+    my ($input_cp, $prop, $internal_ok) = @_;
 
     my $cp = _getcode($input_cp);
     croak __PACKAGE__, "::charprop: unknown code point '$input_cp'" unless defined $cp;
 
     my ($list_ref, $map_ref, $format, $default)
-                                      = prop_invmap($prop);
+                                      = prop_invmap($prop, $internal_ok);
     return undef unless defined $list_ref;
 
     my $i = search_invlist($list_ref, $cp);
@@ -875,10 +870,11 @@ sub _charblocks {
             push @BLOCKS, $subrange;
             push @{$BLOCKS{'No_Block'}}, $subrange;
         }
-        elsif (openunicode(\$BLOCKSFH, "Blocks.txt")) {
+        else {
+            my $blocksfh = openunicode("Blocks.txt");
 	    local $_;
 	    local $/ = "\n";
-	    while (<$BLOCKSFH>) {
+	    while (<$blocksfh>) {
 
                 # Old versions used a different syntax to mark the range.
                 $_ =~ s/;\s+/../ if $v_unicode_version lt v3.1.0;
@@ -890,7 +886,6 @@ sub _charblocks {
 		    push @{$BLOCKS{$3}}, $subrange;
 		}
 	    }
-	    close($BLOCKSFH);
             if (! IS_ASCII_PLATFORM) {
                 # The first two blocks, through 0xFF, are wrong on EBCDIC
                 # platforms.
@@ -962,6 +957,10 @@ that it doesn't have scripts, this function returns C<"Unknown">.
 The L</prop_value_aliases()> function can be used to get all the synonyms
 of the script name.
 
+Note that the Script_Extensions property is an improved version of the Script
+property, and you should probably be using that instead, with the
+L</charprop()> function.
+
 If supplied with an argument that can't be a code point, charscript() tries
 to do the opposite and interpret the argument as a script name. The
 return value is a I<range set>: an anonymous array of arrays that contain
@@ -1052,7 +1051,9 @@ names as the keys, and the code point ranges (see L</charscript()>) as
 the values.
 
 L<prop_invmap("script")|/prop_invmap()> can be used to get this same data in a
-different type of data structure.
+different type of data structure.  Since the Script_Extensions property is an
+improved version of the Script property, you should instead use
+L<prop_invmap("scx")|/prop_invmap()>.
 
 L<C<prop_values("Script")>|/prop_values()> can be used to get all
 the known script names as a list, without the code point ranges.
@@ -1199,6 +1200,12 @@ sub bidi_types {
 
 =head2 B<compexcl()>
 
+WARNING: Unicode discourages the use of this function or any of the
+alternative mechanisms listed in this section (the documentation of
+C<compexcl()>), except internally in implementations of the Unicode
+Normalization Algorithm.  You should be using L<Unicode::Normalize> directly
+instead of these.  Using these will likely lead to half-baked results.
+
     use Unicode::UCD 'compexcl';
 
     my $compexcl = compexcl(0x09dc);
@@ -1631,13 +1638,11 @@ my %CASESPEC;
 sub _casespec {
     unless (%CASESPEC) {
         UnicodeVersion() unless defined $v_unicode_version;
-        if ($v_unicode_version lt v2.1.8) {
-            %CASESPEC = {};
-        }
-	elsif (openunicode(\$CASESPECFH, "SpecialCasing.txt")) {
+        if ($v_unicode_version ge v2.1.8) {
+            my $casespecfh = openunicode("SpecialCasing.txt");
 	    local $_;
 	    local $/ = "\n";
-	    while (<$CASESPECFH>) {
+	    while (<$casespecfh>) {
 		if (/^([0-9A-F]+); ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; (\w+(?: \w+)*)?/) {
 
 		    my ($hexcode, $lower, $title, $upper, $condition) =
@@ -1702,7 +1707,6 @@ sub _casespec {
 		    }
 		}
 	    }
-	    close($CASESPECFH);
 	}
     }
 }
@@ -1752,19 +1756,17 @@ my %NAMEDSEQ;
 
 sub _namedseq {
     unless (%NAMEDSEQ) {
-	if (openunicode(\$NAMEDSEQFH, "Name.pl")) {
-	    local $_;
-	    local $/ = "\n";
-	    while (<$NAMEDSEQFH>) {
-		if (/^ [0-9A-F]+ \  /x) {
-                    chomp;
-                    my ($sequence, $name) = split /\t/;
-		    my @s = map { chr(hex($_)) } split(' ', $sequence);
-		    $NAMEDSEQ{$name} = join("", @s);
-		}
-	    }
-	    close($NAMEDSEQFH);
-	}
+        my $namedseqfh = openunicode("Name.pl");
+        local $_;
+        local $/ = "\n";
+        while (<$namedseqfh>) {
+            if (/^ [0-9A-F]+ \  /x) {
+                chomp;
+                my ($sequence, $name) = split /\t/;
+                my @s = map { chr(hex($_)) } split(' ', $sequence);
+                $NAMEDSEQ{$name} = join("", @s);
+            }
+        }
     }
 }
 
@@ -1848,14 +1850,18 @@ sub _numeric {
 
     my $val = num("123");
     my $one_quarter = num("\N{VULGAR FRACTION 1/4}");
+    my $val = num("12a", \$valid_length);  # $valid_length contains 2
 
 C<num()> returns the numeric value of the input Unicode string; or C<undef> if it
 doesn't think the entire string has a completely valid, safe numeric value.
+If called with an optional second parameter, a reference to a scalar, C<num()>
+will set the scalar to the length of any valid initial substring; or to 0 if none.
 
 If the string is just one character in length, the Unicode numeric value
-is returned if it has one, or C<undef> otherwise.  Note that this need
-not be a whole number.  C<num("\N{TIBETAN DIGIT HALF ZERO}")>, for
-example returns -0.5.
+is returned if it has one, or C<undef> otherwise.  If the optional scalar ref
+is passed, it would be set to 1 if the return is valid; or 0 if the return is
+C<undef>.  Note that the numeric value returned need not be a whole number.
+C<num("\N{TIBETAN DIGIT HALF ZERO}")>, for example returns -0.5.
 
 =cut
 
@@ -1877,7 +1883,9 @@ is returned.  A further restriction is that the digits all have to be of
 the same form.  A half-width digit mixed with a full-width one will
 return C<undef>.  The Arabic script has two sets of digits;  C<num> will
 return C<undef> unless all the digits in the string come from the same
-set.
+set.  In all cases, the optional scalar ref parameter is set to how
+long any valid initial substring of digits is; hence it will be set to the
+entire string length if the main return value is not C<undef>.
 
 C<num> errs on the side of safety, and there may be valid strings of
 decimal digits that it doesn't recognize.  Note that Unicode defines
@@ -1901,16 +1909,30 @@ change these into digits, and then call C<num> on the result.
 # consider those, and return the <decomposition> type in the second
 # array element.
 
-sub num {
-    my $string = $_[0];
+sub num ($;$) {
+    my ($string, $retlen_ref) = @_;
+
+    use feature 'unicode_strings';
 
     _numeric unless %NUMERIC;
+    $$retlen_ref = 0 if $retlen_ref;    # Assume will fail
+
+    my $length = length $string;
+    return if $length == 0;
 
-    my $length = length($string);
-    return $NUMERIC{ord($string)} if $length == 1;
-    return if $string =~ /\D/;
     my $first_ord = ord(substr($string, 0, 1));
+    return if ! exists  $NUMERIC{$first_ord}
+           || ! defined $NUMERIC{$first_ord};
+
+    # Here, we know the first character is numeric
     my $value = $NUMERIC{$first_ord};
+    $$retlen_ref = 1 if $retlen_ref;    # Assume only this one is numeric
+
+    return $value if $length == 1;
+
+    # Here, the input is longer than a single character.  To be valid, it must
+    # be entirely decimal digits, which means it must start with one.
+    return if $string =~ / ^ \D /x;
 
     # To be a valid decimal number, it should be in a block of 10 consecutive
     # characters, whose values are 0, 1, 2, ... 9.  Therefore this digit's
@@ -1922,7 +1944,8 @@ sub num {
     # release, we verify that this first character is a member of such a
     # block.  That is, that the block of characters surrounding this one
     # consists of all \d characters whose numeric values are the expected
-    # ones.
+    # ones.  If not, then this single character is numeric, but the string as
+    # a whole is not considered to be.
     UnicodeVersion() unless defined $v_unicode_version;
     if ($v_unicode_version lt v6.0.0) {
         for my $i (0 .. 9) {
@@ -1944,10 +1967,14 @@ sub num {
         # function.
         my $ord = ord(substr($string, $i, 1));
         my $digit = $ord - $zero_ord;
-        return unless $digit >= 0 && $digit <= 9;
+        if ($digit < 0 || $digit > 9) {
+            $$retlen_ref = $i if $retlen_ref;
+            return;
+        }
         $value = $value * 10 + $digit;
     }
 
+    $$retlen_ref = $length if $retlen_ref;
     return $value;
 }
 
@@ -2427,8 +2454,8 @@ sub prop_value_aliases ($$) {
     return ( $list_ref->[0], $list_ref->[0] );
 }
 
-# All 1 bits is the largest possible UV.
-$Unicode::UCD::MAX_CP = ~0;
+# All 1 bits but the top one is the largest possible IV.
+$Unicode::UCD::MAX_CP = (~0) >> 1;
 
 =pod
 
@@ -2458,7 +2485,7 @@ resolving the input property's name as is done for regular expressions.  These
 are also specified in L<perluniprops|perluniprops/Properties accessible
 through \p{} and \P{}>.  Examples of using the "property=value" form are:
 
- say join ", ", prop_invlist("Script=Shavian");
+ say join ", ", prop_invlist("Script_Extensions=Shavian");
 
  prints:
  66640, 66688
@@ -2520,11 +2547,7 @@ code points that have the property-value:
  for (my $i = 0; $i < @invlist; $i += 2) {
     my $upper = ($i + 1) < @invlist
                 ? $invlist[$i+1] - 1      # In range
-                : $Unicode::UCD::MAX_CP;  # To infinity.  You may want
-                                          # to stop much much earlier;
-                                          # going this high may expose
-                                          # perl deficiencies with very
-                                          # large numbers.
+                : $Unicode::UCD::MAX_CP;  # To infinity.
     for my $j ($invlist[$i] .. $upper) {
         push @full_list, $j;
     }
@@ -3043,6 +3066,8 @@ L<Unicode::Normalize::NFD()|Unicode::Normalize>.
 
 Note that the mapping is the one that is specified in the Unicode data files,
 and to get the final decomposition, it may need to be applied recursively.
+Unicode in fact discourages use of this property except internally in
+implementations of the Unicode Normalization Algorithm.
 
 The fourth (index [3]) element (C<$default>) in the list returned for this
 format is 0.
@@ -3136,11 +3161,48 @@ return C<undef> if called with one of those.
 The returned values for the Perl extension properties, such as C<Any> and
 C<Greek> are somewhat misleading.  The values are either C<"Y"> or C<"N>".
 All Unicode properties are bipartite, so you can actually use the C<"Y"> or
-C<"N>" in a Perl regular rexpression for these, like C<qr/\p{ID_Start=Y/}> or
+C<"N>" in a Perl regular expression for these, like C<qr/\p{ID_Start=Y/}> or
 C<qr/\p{Upper=N/}>.  But the Perl extensions aren't specified this way, only
 like C</qr/\p{Any}>, I<etc>.  You can't actually use the C<"Y"> and C<"N>" in
 them.
 
+=head3 Getting every available name
+
+Instead of reading the Unicode Database directly from files, as you were able
+to do for a long time, you are encouraged to use the supplied functions. So,
+instead of reading C<Name.pl> - which may disappear without notice in the
+future - directly, as with
+
+  my (%name, %cp);
+  for (split m/\s*\n/ => do "unicore/Name.pl") {
+      my ($cp, $name) = split m/\t/ => $_;
+      $cp{$name} = $cp;
+      $name{$cp} = $name unless $cp =~ m/ /;
+  }
+
+You ought to use L</prop_invmap()> like this:
+
+  my (%name, %cp, %cps, $n);
+  # All codepoints
+  foreach my $cat (qw( Name Name_Alias )) {
+      my ($codepoints, $names, $format, $default) = prop_invmap($cat);
+      # $format => "n", $default => ""
+      foreach my $i (0 .. @$codepoints - 2) {
+          my ($cp, $n) = ($codepoints->[$i], $names->[$i]);
+          # If $n is a ref, the same codepoint has multiple names
+          foreach my $name (ref $n ? @$n : $n) {
+              $name{$cp} //= $name;
+              $cp{$name} //= $cp;
+          }
+      }
+  }
+  # Named sequences
+  {   my %ns = namedseq();
+      foreach my $name (sort { $ns{$a} cmp $ns{$b} } keys %ns) {
+          $cp{$name} //= [ map { ord } split "" => $ns{$name} ];
+      }
+  }
+
 =cut
 
 # User-defined properties could be handled with some changes to utf8_heavy.pl;
@@ -4044,10 +4106,9 @@ my $UNICODEVERSION;
 
 sub UnicodeVersion {
     unless (defined $UNICODEVERSION) {
-	openunicode(\$VERSIONFH, "version");
+	my $versionfh = openunicode("version");
 	local $/ = "\n";
-	chomp($UNICODEVERSION = <$VERSIONFH>);
-	close($VERSIONFH);
+	chomp($UNICODEVERSION = <$versionfh>);
 	croak __PACKAGE__, "::VERSION: strange version '$UNICODEVERSION'"
 	    unless $UNICODEVERSION =~ /^\d+(?:\.\d+)+$/;
     }
diff --git a/gnu/usr.bin/perl/lib/Unicode/UCD.t b/gnu/usr.bin/perl/lib/Unicode/UCD.t
index 83320d34a01..0538bda3055 100644
--- a/gnu/usr.bin/perl/lib/Unicode/UCD.t
+++ b/gnu/usr.bin/perl/lib/Unicode/UCD.t
@@ -19,7 +19,7 @@ use Test::More;
 
 use Unicode::UCD qw(charinfo charprop charprops_all);
 
-my $expected_version = '8.0.0';
+my $expected_version = '10.0.0';
 my $current_version = Unicode::UCD::UnicodeVersion;
 my $v_unicode_version = pack "C*", split /\./, $current_version;
 my $unknown_script = ($v_unicode_version lt v5.0.0)
@@ -819,10 +819,19 @@ use charnames ();   # Don't use \N{} on things not in original Unicode
                     # version; else will get a compilation error when this .t
                     # is run on an older version.
 
+my $ret_len;
 is(num("0"), 0, 'Verify num("0") == 0');
-is(num("98765"), 98765, 'Verify num("98765") == 98765');
-ok(! defined num("98765\N{FULLWIDTH DIGIT FOUR}"),
+is(num("0", \$ret_len), 0, 'Verify num("0", \$ret_len) == 0');
+is($ret_len, 1, "... and the returned length is 1");
+ok(! defined num("", \$ret_len), 'Verify num("", \$ret_len) isnt defined');
+is($ret_len, 0, "... and the returned length is 0");
+ok(! defined num("A", \$ret_len), 'Verify num("A") isnt defined');
+is($ret_len, 0, "... and the returned length is 0");
+is(num("98765", \$ret_len), 98765, 'Verify num("98765") == 98765');
+is($ret_len, 5, "... and the returned length is 5");
+ok(! defined num("98765\N{FULLWIDTH DIGIT FOUR}", \$ret_len),
    'Verify num("98765\N{FULLWIDTH DIGIT FOUR}") isnt defined');
+is($ret_len, 5, "... but the returned length is 5");
 my $tai_lue_2;
 if ($v_unicode_version ge v4.1.0) {
     my $tai_lue_1 = charnames::string_vianame("NEW TAI LUE DIGIT ONE");
@@ -834,8 +843,13 @@ if ($v_unicode_version ge v4.1.0) {
 }
 if ($v_unicode_version ge v5.2.0) {
     ok(! defined num($tai_lue_2
-         . charnames::string_vianame("NEW TAI LUE THAM DIGIT ONE")),
+         . charnames::string_vianame("NEW TAI LUE THAM DIGIT ONE"), \$ret_len),
          'Verify num("\N{NEW TAI LUE DIGIT TWO}\N{NEW TAI LUE THAM DIGIT ONE}") isnt defined');
+    is($ret_len, 1, "... but the returned length is 1");
+    ok(! defined num(charnames::string_vianame("NEW TAI LUE THAM DIGIT ONE")
+                     .  $tai_lue_2, \$ret_len),
+         'Verify num("\N{NEW TAI LUE THAM DIGIT ONE}\N{NEW TAI LUE DIGIT TWO}") isnt defined');
+    is($ret_len, 1, "... but the returned length is 1");
 }
 if ($v_unicode_version ge v5.1.0) {
     my $cham_0 = charnames::string_vianame("CHAM DIGIT ZERO");
@@ -843,8 +857,10 @@ if ($v_unicode_version ge v5.1.0) {
        'Verify num("\N{CHAM DIGIT ZERO}\N{CHAM DIGIT THREE}") == 3');
     if ($v_unicode_version ge v5.2.0) {
         ok(! defined num(  $cham_0
-                         . charnames::string_vianame("JAVANESE DIGIT NINE")),
+                         . charnames::string_vianame("JAVANESE DIGIT NINE"),
+                         \$ret_len),
         'Verify num("\N{CHAM DIGIT ZERO}\N{JAVANESE DIGIT NINE}") isnt defined');
+    is($ret_len, 1, "... but the returned length is 1");
     }
 }
 is(num("\N{SUPERSCRIPT TWO}"), 2, 'Verify num("\N{SUPERSCRIPT TWO} == 2');
@@ -1415,9 +1431,14 @@ sub fail_with_diff ($$$$) {
     # For use below to output better messages
     my ($prop, $official, $constructed, $tested_function_name) = @_;
 
-    is($constructed, $official, "$tested_function_name('$prop')");
-    diag("Comment out lines " . (__LINE__ - 1) . " through " . (__LINE__ + 1) . " in '$0' on Un*x-like systems to see just the differences.  Uses the 'diff' first in your \$PATH");
-    return;
+    if (! $ENV{PERL_DIFF_TOOL}) {
+
+        is($constructed, $official, "$tested_function_name('$prop')");
+
+        diag("Set environment variable PERL_DIFF_TOOL=diff_tool to see just "
+           . "the differences.");
+        return;
+    }
 
     fail("$tested_function_name('$prop')");
 
@@ -1434,7 +1455,7 @@ sub fail_with_diff ($$$$) {
     close $gend || die "Can't close gend";
 
     my $diff = File::Temp->new();
-    system("diff $off $gend > $diff");
+    system("$ENV{PERL_DIFF_TOOL} $off $gend > $diff");
 
     open my $fh, "<", $diff || die "Can't open $diff";
     my @diffs = <$fh>;
@@ -2695,4 +2716,12 @@ if (! ok(@warnings == 0, "No warnings were generated")) {
     diag(join "\n", "The warnings are:", @warnings);
 }
 
+# And make sure that the max code point returned actually fits in an IV, which
+# currently range iterators are.
+my $count = 0;
+for my $i ($Unicode::UCD::MAX_CP - 1 .. $Unicode::UCD::MAX_CP) {
+    $count++;
+}
+is($count, 2, "MAX_CP isn't too large");
+
 done_testing();
author	afresh1 <afresh1@openbsd.org>	2019-02-13 21:15:00 +0000
committer	afresh1 <afresh1@openbsd.org>	2019-02-13 21:15:00 +0000
commit	9f11ffb7133c203312a01e4b986886bc88c7d74b (patch)
tree	6618511204c614b20256e4ef9dea39a7b311d638 /gnu/usr.bin/perl/lib/Unicode
parent	Import perl-5.28.1 (diff)
download	wireguard-openbsd-9f11ffb7133c203312a01e4b986886bc88c7d74b.tar.xz wireguard-openbsd-9f11ffb7133c203312a01e4b986886bc88c7d74b.zip