summaryrefslogtreecommitdiffstats
path: root/gnu/usr.bin/perl/lib/Unicode
diff options
context:
space:
mode:
authorafresh1 <afresh1@openbsd.org>2014-11-17 20:52:31 +0000
committerafresh1 <afresh1@openbsd.org>2014-11-17 20:52:31 +0000
commit6fb12b7054efc6b436584db6cef9c2f85c0d7e27 (patch)
treeaa09a524574ec7ae2f521a24573deeecb78ff66a /gnu/usr.bin/perl/lib/Unicode
parentAdd the Cammelia cipher to libcrypto. (diff)
downloadwireguard-openbsd-6fb12b7054efc6b436584db6cef9c2f85c0d7e27.tar.xz
wireguard-openbsd-6fb12b7054efc6b436584db6cef9c2f85c0d7e27.zip
Import perl-5.20.1
Diffstat (limited to 'gnu/usr.bin/perl/lib/Unicode')
-rw-r--r--gnu/usr.bin/perl/lib/Unicode/UCD.pm874
-rw-r--r--gnu/usr.bin/perl/lib/Unicode/UCD.t539
2 files changed, 854 insertions, 559 deletions
diff --git a/gnu/usr.bin/perl/lib/Unicode/UCD.pm b/gnu/usr.bin/perl/lib/Unicode/UCD.pm
index 9c3dd7c7105..11a8ec26807 100644
--- a/gnu/usr.bin/perl/lib/Unicode/UCD.pm
+++ b/gnu/usr.bin/perl/lib/Unicode/UCD.pm
@@ -5,7 +5,7 @@ use warnings;
no warnings 'surrogate'; # surrogates can be inputs to this
use charnames ();
-our $VERSION = '0.51';
+our $VERSION = '0.58';
require Exporter;
@@ -24,11 +24,14 @@ our @EXPORT_OK = qw(charinfo
prop_value_aliases
prop_invlist
prop_invmap
+ search_invlist
MAX_CP
);
use Carp;
+sub IS_ASCII_PLATFORM { ord("A") == 65 }
+
=head1 NAME
Unicode::UCD - Unicode character database
@@ -80,6 +83,9 @@ Unicode::UCD - Unicode character database
my ($list_ref, $map_ref, $format, $missing)
= prop_invmap("General Category");
+ use Unicode::UCD 'search_invlist';
+ my $index = search_invlist(\@invlist, $code_point);
+
use Unicode::UCD 'compexcl';
my $compexcl = compexcl($codepoint);
@@ -100,18 +106,18 @@ Character Database.
=head2 code point argument
Some of the functions are called with a I<code point argument>, which is either
-a decimal or a hexadecimal scalar designating a Unicode code point, or C<U+>
-followed by hexadecimals designating a Unicode code point. In other words, if
-you want a code point to be interpreted as a hexadecimal number, you must
-prefix it with either C<0x> or C<U+>, because a string like e.g. C<123> will be
-interpreted as a decimal code point.
+a decimal or a hexadecimal scalar designating a code point in the platform's
+native character set (extended to Unicode), or C<U+> followed by hexadecimals
+designating a Unicode code point. A leading 0 will force a hexadecimal
+interpretation, as will a hexadecimal digit that isn't a decimal digit.
Examples:
- 223 # Decimal 223
- 0223 # Hexadecimal 223 (= 547 decimal)
- 0xDF # Hexadecimal DF (= 223 decimal
- U+DF # Hexadecimal DF
+ 223 # Decimal 223 in native character set
+ 0223 # Hexadecimal 223, native (= 547 decimal)
+ 0xDF # Hexadecimal DF, native (= 223 decimal
+ U+DF # Hexadecimal DF, in Unicode's character set
+ (= LATIN SMALL LETTER SHARP S)
Note that the largest code point in Unicode is U+10FFFF.
@@ -193,7 +199,8 @@ The keys in the hash with the meanings of their values are:
=item B<code>
-the input L</code point argument> expressed in hexadecimal, with leading zeros
+the input native L</code point argument> expressed in hexadecimal, with
+leading zeros
added if necessary to make it contain at least four hexdigits
=item B<name>
@@ -237,7 +244,7 @@ of the bidi type name.
is empty if I<code> has no decomposition; or is one or more codes
(separated by spaces) that, taken in order, represent a decomposition for
I<code>. Each has at least four hexdigits.
-The codes may be preceded by a word enclosed in angle brackets then a space,
+The codes may be preceded by a word enclosed in angle brackets, then a space,
like C<E<lt>compatE<gt> >, giving the type of decomposition
This decomposition may be an intermediate one whose components are also
@@ -245,7 +252,7 @@ decomposable. Use L<Unicode::Normalize> to get the final decomposition.
=item B<decimal>
-if I<code> is a decimal digit this is its integer numeric value
+if I<code> represents a decimal digit this is its integer numeric value
=item B<digit>
@@ -322,8 +329,16 @@ sub _getcode {
if ($arg =~ /^[1-9]\d*$/) {
return $arg;
- } elsif ($arg =~ /^(?:[Uu]\+|0[xX])?([[:xdigit:]]+)$/) {
- return hex($1);
+ }
+ elsif ($arg =~ /^(?:0[xX])?([[:xdigit:]]+)$/) {
+ return CORE::hex($1);
+ }
+ elsif ($arg =~ /^[Uu]\+([[:xdigit:]]+)$/) { # Is of form U+0000, means
+ # wants the Unicode code
+ # point, not the native one
+ my $decimal = CORE::hex($1);
+ return $decimal if IS_ASCII_PLATFORM;
+ return utf8::unicode_to_native($decimal);
}
return;
@@ -533,7 +548,7 @@ sub _read_table ($;$) {
my $property = $table =~ s/\.pl//r;
$property = $utf8::file_to_swash_name{$property};
my $to_adjust = defined $property
- && $utf8::SwashInfo{$property}{'format'} eq 'a';
+ && $utf8::SwashInfo{$property}{'format'} =~ / ^ a /x;
for (split /^/m, $list) {
my ($start, $end, $value) = / ^ (.+?) \t (.*?) \t (.+?)
@@ -541,6 +556,8 @@ sub _read_table ($;$) {
$ /x;
my $decimal_start = hex $start;
my $decimal_end = ($end eq "") ? $decimal_start : hex $end;
+ $value = hex $value if $to_adjust
+ && $utf8::SwashInfo{$property}{'format'} eq 'ax';
if ($return_hash) {
foreach my $i ($decimal_start .. $decimal_end) {
$return{$i} = ($to_adjust)
@@ -582,7 +599,7 @@ sub charinrange {
my $range = charblock('Armenian');
-With a L</code point argument> charblock() returns the I<block> the code point
+With a L</code point argument> C<charblock()> returns the I<block> the code point
belongs to, e.g. C<Basic Latin>. The old-style block name is returned (see
L</Old-style versus new-style block names>).
If the code point is unassigned, this returns the block it would belong to if
@@ -591,17 +608,20 @@ have blocks, all code points are considered to be in C<No_Block>.)
See also L</Blocks versus Scripts>.
-If supplied with an argument that can't be a code point, charblock() tries to
-do the opposite and interpret the argument as an old-style block name. The
-return value
-is a I<range set> with one range: an anonymous list with a single element that
-consists of another anonymous list whose first element is the first code point
-in the block, and whose second (and final) element is the final code point in
-the block. (The extra list consisting of just one element is so that the same
-program logic can be used to handle both this return, and the return from
-L</charscript()> which can have multiple ranges.) You can test whether a code
-point is in a range using the L</charinrange()> function. If the argument is
-not a known block, C<undef> is returned.
+If supplied with an argument that can't be a code point, C<charblock()> tries to
+do the opposite and interpret the argument as an old-style block name. On an
+ASCII platform, the return value is a I<range set> with one range: an
+anonymous list with a single element that consists of another anonymous list
+whose first element is the first code point in the block, and whose second
+element is the final code point in the block. On an EBCDIC
+platform, the first two Unicode blocks are not contiguous. Their range sets
+are lists containing I<start-of-range>, I<end-of-range> code point pairs. You
+can test whether a code point is in a range set using the L</charinrange()>
+function. (To be precise, each I<range set> contains a third array element,
+after the range boundary ones: the old_style block name.)
+
+If the argument to C<charblock()> is not a known block, C<undef> is
+returned.
=cut
@@ -631,6 +651,36 @@ sub _charblocks {
}
}
close($BLOCKSFH);
+ if (! IS_ASCII_PLATFORM) {
+ # The first two blocks, through 0xFF, are wrong on EBCDIC
+ # platforms.
+
+ my @new_blocks = _read_table("To/Blk.pl");
+
+ # Get rid of the first two ranges in the Unicode version, and
+ # replace them with the ones computed by mktables.
+ shift @BLOCKS;
+ shift @BLOCKS;
+ delete $BLOCKS{'Basic Latin'};
+ delete $BLOCKS{'Latin-1 Supplement'};
+
+ # But there are multiple entries in the computed versions, and
+ # we change their names to (which we know) to be the old-style
+ # ones.
+ for my $i (0.. @new_blocks - 1) {
+ if ($new_blocks[$i][2] =~ s/Basic_Latin/Basic Latin/
+ or $new_blocks[$i][2] =~
+ s/Latin_1_Supplement/Latin-1 Supplement/)
+ {
+ push @{$BLOCKS{$new_blocks[$i][2]}}, $new_blocks[$i];
+ }
+ else {
+ splice @new_blocks, $i;
+ last;
+ }
+ }
+ unshift @BLOCKS, @new_blocks;
+ }
}
}
}
@@ -662,8 +712,8 @@ sub charblock {
my $range = charscript('Thai');
-With a L</code point argument> charscript() returns the I<script> the
-code point belongs to, e.g. C<Latin>, C<Greek>, C<Han>.
+With a L</code point argument>, C<charscript()> returns the I<script> the
+code point belongs to, e.g., C<Latin>, C<Greek>, C<Han>.
If the code point is unassigned or the Unicode version being used is so early
that it doesn't have scripts, this function returns C<"Unknown">.
@@ -671,8 +721,11 @@ If supplied with an argument that can't be a code point, charscript() tries
to do the opposite and interpret the argument as a script name. The
return value is a I<range set>: an anonymous list of lists that contain
I<start-of-range>, I<end-of-range> code point pairs. You can test whether a
-code point is in a range set using the L</charinrange()> function. If the
-argument is not a known script, C<undef> is returned.
+code point is in a range set using the L</charinrange()> function.
+(To be precise, each I<range set> contains a third array element,
+after the range boundary ones: the script name.)
+
+If the C<charscript()> argument is not a known script, C<undef> is returned.
See also L</Blocks versus Scripts>.
@@ -721,7 +774,7 @@ sub charscript {
my $charblocks = charblocks();
-charblocks() returns a reference to a hash with the known block names
+C<charblocks()> returns a reference to a hash with the known block names
as the keys, and the code point ranges (see L</charblock()>) as the values.
The names are in the old-style (see L</Old-style versus new-style block
@@ -745,7 +798,7 @@ sub charblocks {
my $charscripts = charscripts();
-charscripts() returns a reference to a hash with the known script
+C<charscripts()> returns a reference to a hash with the known script
names as the keys, and the code point ranges (see L</charscript()>) as
the values.
@@ -766,7 +819,7 @@ sub charscripts {
In addition to using the C<\p{Blk=...}> and C<\P{Blk=...}> constructs, you
can also test whether a code point is in the I<range> as returned by
L</charblock()> and L</charscript()> or as the values of the hash returned
-by L</charblocks()> and L</charscripts()> by using charinrange():
+by L</charblocks()> and L</charscripts()> by using C<charinrange()>:
use Unicode::UCD qw(charscript charinrange);
@@ -896,7 +949,9 @@ sub bidi_types {
my $compexcl = compexcl(0x09dc);
This routine returns C<undef> if the Unicode version being used is so early
-that it doesn't have this property. It is included for backwards
+that it doesn't have this property.
+
+C<compexcl()> is included for backwards
compatibility, but as of Perl 5.12 and more modern Unicode versions, for
most purposes it is probably more convenient to use one of the following
instead:
@@ -974,7 +1029,8 @@ with the following fields is returned:
=item B<code>
-the input L</code point argument> expressed in hexadecimal, with leading zeros
+the input native L</code point argument> expressed in hexadecimal, with
+leading zeros
added if necessary to make it contain at least four hexdigits
=item B<full>
@@ -1239,7 +1295,8 @@ The keys in the bottom layer hash with the meanings of their values are:
=item B<code>
-the input L</code point argument> expressed in hexadecimal, with leading zeros
+the input native L</code point argument> expressed in hexadecimal, with
+leading zeros
added if necessary to make it contain at least four hexdigits
=item B<lower>
@@ -1327,6 +1384,20 @@ sub _casespec {
my ($hexcode, $lower, $title, $upper, $condition) =
($1, $2, $3, $4, $5);
+ if (! IS_ASCII_PLATFORM) { # Remap entry to native
+ foreach my $var_ref (\$hexcode,
+ \$lower,
+ \$title,
+ \$upper)
+ {
+ next unless defined $$var_ref;
+ $$var_ref = join " ",
+ map { sprintf("%04X",
+ utf8::unicode_to_native(hex $_)) }
+ split " ", $$var_ref;
+ }
+ }
+
my $code = hex($hexcode);
# In 2.1.8, there were duplicate entries; ignore all but
@@ -1400,10 +1471,11 @@ sub casespec {
If used with a single argument in a scalar context, returns the string
consisting of the code points of the named sequence, or C<undef> if no
named sequence by that name exists. If used with a single argument in
-a list context, it returns the list of the ordinals of the code points. If used
-with no
-arguments in a list context, returns a hash with the names of the
-named sequences as the keys and the named sequences as strings as
+a list context, it returns the list of the ordinals of the code points.
+
+If used with no
+arguments in a list context, it returns a hash with the names of all the
+named sequences as the keys and their sequences as strings as
the values. Otherwise, it returns C<undef> or an empty list depending
on the context.
@@ -1519,7 +1591,7 @@ sub _numeric {
my $val = num("123");
my $one_quarter = num("\N{VULGAR FRACTION 1/4}");
-C<num> returns the numeric value of the input Unicode string; or C<undef> if it
+C<num()> returns the numeric value of the input Unicode string; or C<undef> if it
doesn't think the entire string has a completely valid, safe numeric value.
If the string is just one character in length, the Unicode numeric value
@@ -1687,7 +1759,7 @@ Those discouraged forms are accepted as input to C<prop_aliases>, but are not
returned in the lists. C<prop_aliases('isL&')> and C<prop_aliases('isL_')>,
which are old synonyms for C<"Is_LC"> and should not be used in new code, are
examples of this. These both return C<(Is_LC, Cased_Letter)>. Thus this
-function allows you to take a discourarged form, and find its acceptable
+function allows you to take a discouraged form, and find its acceptable
alternatives. The same goes with single-form Block property equivalences.
Only the forms that begin with C<"In_"> are not discouraged; if you pass
C<prop_aliases> a discouraged form, you will get back the equivalent ones that
@@ -1782,12 +1854,21 @@ sub prop_aliases ($) {
# there, the input is unknown.
return;
}
- else {
+ elsif ($loose =~ / [:=] /x) {
# Here we found the name but not its aliases, so it has to
- # exist. This means it must be one of the Perl single-form
- # extensions. First see if it is for a property-value
- # combination in one of the following properties.
+ # exist. Exclude property-value combinations. (This shows up
+ # for something like ccc=vr which matches loosely, but is a
+ # synonym for ccc=9 which matches only strictly.
+ return;
+ }
+ else {
+
+ # Here it has to exist, and isn't a property-value
+ # combination. This means it must be one of the Perl
+ # single-form extensions. First see if it is for a
+ # property-value combination in one of the following
+ # properties.
my @list;
foreach my $property ("gc", "script") {
@list = prop_value_aliases($property, $loose);
@@ -2076,21 +2157,10 @@ too high for some operations to work; you may wish to use a smaller number for
your purposes.)
Note that the inversion lists returned by this function can possibly include
-non-Unicode code points, that is anything above 0x10FFFF. This is in
-contrast to Perl regular expression matches on those code points, in which a
-non-Unicode code point always fails to match. For example, both of these have
-the same result:
-
- chr(0x110000) =~ \p{ASCII_Hex_Digit=True} # Fails.
- chr(0x110000) =~ \p{ASCII_Hex_Digit=False} # Fails!
-
-And both raise a warning that a Unicode property is being used on a
-non-Unicode code point. It is arguable as to which is the correct thing to do
-here. This function has chosen the way opposite to the Perl regular
-expression behavior. This allows you to easily flip to to the Perl regular
-expression way (for you to go in the other direction would be far harder).
-Simply add 0x110000 at the end of the non-empty returned list if it isn't
-already that value; and pop that value if it is; like:
+non-Unicode code points, that is anything above 0x10FFFF. Unicode properties
+are not defined on such code points. You might wish to change the output to
+not include these. Simply add 0x110000 at the end of the non-empty returned
+list if it isn't already that value; and pop that value if it is; like:
my @list = prop_invlist("foo");
if (@list) {
@@ -2124,6 +2194,9 @@ code points that have the property-value:
C<prop_invlist> does not know about any user-defined nor Perl internal-only
properties, and will return C<undef> if called with one of those.
+The L</search_invlist()> function is provided for finding a code point within
+an inversion list.
+
=cut
# User-defined properties could be handled with some changes to utf8_heavy.pl;
@@ -2171,44 +2244,53 @@ sub prop_invlist ($;$) {
my @invlist;
- # The input lines look like:
- # 0041\t005A # [26]
- # 005F
+ if ($swash->{'LIST'} =~ /^V/) {
- # Split into lines, stripped of trailing comments
- foreach my $range (split "\n",
- $swash->{'LIST'} =~ s/ \s* (?: \# .* )? $ //xmgr)
- {
- # And find the beginning and end of the range on the line
- my ($hex_begin, $hex_end) = split "\t", $range;
- my $begin = hex $hex_begin;
-
- # If the new range merely extends the old, we remove the marker
- # created the last time through the loop for the old's end, which
- # causes the new one's end to be used instead.
- if (@invlist && $begin == $invlist[-1]) {
- pop @invlist;
- }
- else {
- # Add the beginning of the range
- push @invlist, $begin;
- }
+ # A 'V' as the first character marks the input as already an inversion
+ # list, in which case, all we need to do is put the remaining lines
+ # into our array.
+ @invlist = split "\n", $swash->{'LIST'} =~ s/ \s* (?: \# .* )? $ //xmgr;
+ shift @invlist;
+ }
+ else {
+ # The input lines look like:
+ # 0041\t005A # [26]
+ # 005F
- if (defined $hex_end) { # The next item starts with the code point 1
- # beyond the end of the range.
- push @invlist, hex($hex_end) + 1;
- }
- else { # No end of range, is a single code point.
- push @invlist, $begin + 1;
+ # Split into lines, stripped of trailing comments
+ foreach my $range (split "\n",
+ $swash->{'LIST'} =~ s/ \s* (?: \# .* )? $ //xmgr)
+ {
+ # And find the beginning and end of the range on the line
+ my ($hex_begin, $hex_end) = split "\t", $range;
+ my $begin = hex $hex_begin;
+
+ # If the new range merely extends the old, we remove the marker
+ # created the last time through the loop for the old's end, which
+ # causes the new one's end to be used instead.
+ if (@invlist && $begin == $invlist[-1]) {
+ pop @invlist;
+ }
+ else {
+ # Add the beginning of the range
+ push @invlist, $begin;
+ }
+
+ if (defined $hex_end) { # The next item starts with the code point 1
+ # beyond the end of the range.
+ no warnings 'portable';
+ my $end = hex $hex_end;
+ last if $end == $Unicode::UCD::MAX_CP;
+ push @invlist, $end + 1;
+ }
+ else { # No end of range, is a single code point.
+ push @invlist, $begin + 1;
+ }
}
}
- require "unicore/UCD.pl";
- my $FIRST_NON_UNICODE = $MAX_UNICODE_CODEPOINT + 1;
-
# Could need to be inverted: add or subtract a 0 at the beginning of the
- # list. And to keep it from matching non-Unicode, add or subtract the
- # first non-unicode code point.
+ # list.
if ($swash->{'INVERT_IT'}) {
if (@invlist && $invlist[0] == 0) {
shift @invlist;
@@ -2216,116 +2298,17 @@ sub prop_invlist ($;$) {
else {
unshift @invlist, 0;
}
- if (@invlist && $invlist[-1] == $FIRST_NON_UNICODE) {
- pop @invlist;
- }
- else {
- push @invlist, $FIRST_NON_UNICODE;
- }
- }
-
- # Here, the list is set up to include only Unicode code points. But, if
- # the table is the default one for the property, it should contain all
- # non-Unicode code points. First calculate the loose name for the
- # property. This is done even for strict-name properties, as the data
- # structure that mktables generates for us is set up so that we don't have
- # to worry about that. The property-value needs to be split if compound,
- # as the loose rules need to be independently calculated on each part. We
- # know that it is syntactically valid, or SWASHNEW would have failed.
-
- $prop = lc $prop;
- my ($prop_only, $table) = split /\s*[:=]\s*/, $prop;
- if ($table) {
-
- # May have optional prefixed 'is'
- $prop = utf8::_loose_name($prop_only) =~ s/^is//r;
- $prop = $utf8::loose_property_name_of{$prop};
- $prop .= "=" . utf8::_loose_name($table);
- }
- else {
- $prop = utf8::_loose_name($prop);
- }
- if (exists $loose_defaults{$prop}) {
-
- # Here, is the default table. If a range ended with 10ffff, instead
- # continue that range to infinity, by popping the 110000; otherwise,
- # add the range from 11000 to infinity
- if (! @invlist || $invlist[-1] != $FIRST_NON_UNICODE) {
- push @invlist, $FIRST_NON_UNICODE;
- }
- else {
- pop @invlist;
- }
}
return @invlist;
}
-sub _search_invlist {
- # Find the range in the inversion list which contains a code point; that
- # is, find i such that l[i] <= code_point < l[i+1]. Returns undef if no
- # such i.
-
- # If this is ever made public, could use to speed up .t specials. Would
- # need to use code point argument, as in other functions in this pm
-
- my $list_ref = shift;
- my $code_point = shift;
- # Verify non-neg numeric XXX
-
- my $max_element = @$list_ref - 1;
-
- # Return undef if list is empty or requested item is before the first element.
- return if $max_element < 0;
- return if $code_point < $list_ref->[0];
-
- # Short cut something at the far-end of the table. This also allows us to
- # refer to element [$i+1] without fear of being out-of-bounds in the loop
- # below.
- return $max_element if $code_point >= $list_ref->[$max_element];
-
- use integer; # want integer division
-
- my $i = $max_element / 2;
-
- my $lower = 0;
- my $upper = $max_element;
- while (1) {
-
- if ($code_point >= $list_ref->[$i]) {
-
- # Here we have met the lower constraint. We can quit if we
- # also meet the upper one.
- last if $code_point < $list_ref->[$i+1];
-
- $lower = $i; # Still too low.
-
- }
- else {
-
- # Here, $code_point < $list_ref[$i], so look lower down.
- $upper = $i;
- }
-
- # Split search domain in half to try again.
- my $temp = ($upper + $lower) / 2;
-
- # No point in continuing unless $i changes for next time
- # in the loop.
- return $i if $temp == $i;
- $i = $temp;
- } # End of while loop
-
- # Here we have found the offset
- return $i;
-}
-
=pod
=head2 B<prop_invmap()>
use Unicode::UCD 'prop_invmap';
- my ($list_ref, $map_ref, $format, $missing)
+ my ($list_ref, $map_ref, $format, $default)
= prop_invmap("General Category");
C<prop_invmap> is used to get the complete mapping definition for a property,
@@ -2343,17 +2326,20 @@ or even better, C<"Gc=LC">).
Many Unicode properties have more than one name (or alias). C<prop_invmap>
understands all of these, including Perl extensions to them. Ambiguities are
resolved as described above for L</prop_aliases()>. The Perl internal
-property "Perl_Decimal_Digit, described below, is also accepted. C<undef> is
-returned if the property name is unknown.
+property "Perl_Decimal_Digit, described below, is also accepted. An empty
+list is returned if the property name is unknown.
See L<perluniprops/Properties accessible through Unicode::UCD> for the
properties acceptable as inputs to this function.
It is a fatal error to call this function except in list context.
-In addition to the the two arrays that form the inversion map, C<prop_invmap>
+In addition to the two arrays that form the inversion map, C<prop_invmap>
returns two other values; one is a scalar that gives some details as to the
-format of the entries of the map array; the other is used for specialized
-purposes, described at the end of this section.
+format of the entries of the map array; the other is a default value, useful
+in maps whose format name begins with the letter C<"a">, as described
+L<below in its subsection|/a>; and for specialized purposes, such as
+converting to another data structure, described at the end of this main
+section.
This means that C<prop_invmap> returns a 4 element list. For example,
@@ -2413,7 +2399,8 @@ that, instead of treating these as unassigned Unicode code points, the value
for this range should be C<undef>. If you wish, you can change the returned
arrays accordingly.
-The maps are almost always simple scalars that should be interpreted as-is.
+The maps for almost all properties are simple scalars that should be
+interpreted as-is.
These values are those given in the Unicode-supplied data files, which may be
inconsistent as to capitalization and as to which synonym for a property-value
is given. The results may be normalized by using the L</prop_value_aliases()>
@@ -2508,7 +2495,7 @@ is like C<"s"> in that all the map array elements are scalars, but here they are
restricted to all being integers, and some have to be adjusted (hence the name
C<"a">) to get the correct result. For example, in:
- my ($uppers_ranges_ref, $uppers_maps_ref, $format)
+ my ($uppers_ranges_ref, $uppers_maps_ref, $format, $default)
= prop_invmap("Simple_Uppercase_Mapping");
the returned arrays look like this:
@@ -2521,30 +2508,32 @@ the returned arrays look like this:
182 0
...
+and C<$default> is 0.
+
Let's start with the second line. It says that the uppercase of code point 97
is 65; or C<uc("a")> == "A". But the line is for the entire range of code
-points 97 through 122. To get the mapping for any code point in a range, you
-take the offset it has from the beginning code point of the range, and add
+points 97 through 122. To get the mapping for any code point in this range,
+you take the offset it has from the beginning code point of the range, and add
that to the mapping for that first code point. So, the mapping for 122 ("z")
is derived by taking the offset of 122 from 97 (=25) and adding that to 65,
yielding 90 ("z"). Likewise for everything in between.
-The first line works the same way. The first map in a range is always the
-correct value for its code point (because the adjustment is 0). Thus the
-C<uc(chr(0))> is just itself. Also, C<uc(chr(1))> is also itself, as the
-adjustment is 0+1-0 .. C<uc(chr(96))> is 96.
-
Requiring this simple adjustment allows the returned arrays to be
significantly smaller than otherwise, up to a factor of 10, speeding up
searching through them.
+Ranges that map to C<$default>, C<"0">, behave somewhat differently. For
+these, each code point maps to itself. So, in the first line in the example,
+S<C<ord(uc(chr(0)))>> is 0, S<C<ord(uc(chr(1)))>> is 1, ..
+S<C<ord(uc(chr(96)))>> is 96.
+
=item B<C<al>>
means that some of the map array elements have the form given by C<"a">, and
the rest are ordered lists of code points.
For example, in:
- my ($uppers_ranges_ref, $uppers_maps_ref, $format)
+ my ($uppers_ranges_ref, $uppers_maps_ref, $format, $default)
= prop_invmap("Uppercase_Mapping");
the returned arrays look like this:
@@ -2571,6 +2560,9 @@ CAPITAL LETTER N).
No adjustments are needed to entries that are references to arrays; each such
entry will have exactly one element in its range, so the offset is always 0.
+The fourth (index [3]) element (C<$default>) in the list returned for this
+format is 0.
+
=item B<C<ae>>
This is like C<"a">, but some elements are the empty string, and should not be
@@ -2600,6 +2592,9 @@ represents 0+1-0 = 1; ... code point 0x39, (DIGIT NINE), represents 0+9-0 = 9;
(ARABIC-INDIC DIGIT ZERO), represents 0; ... 0x07C1 (NKO DIGIT ONE),
represents 0+1-0 = 1 ...
+The fourth (index [3]) element (C<$default>) in the list returned for this
+format is the empty string.
+
=item B<C<ale>>
is a combination of the C<"al"> type and the C<"ae"> type. Some of
@@ -2617,6 +2612,9 @@ An example slice is:
0x00B0 0
...
+The fourth (index [3]) element (C<$default>) in the list returned for this
+format is 0.
+
=item B<C<ar>>
means that all the elements of the map array are either rational numbers or
@@ -2656,6 +2654,9 @@ C<"ar">.
0x660 0 ARABIC-INDIC DIGIT ZERO .. NINE
0x66A "NaN"
+The fourth (index [3]) element (C<$default>) in the list returned for this
+format is C<"NaN">.
+
=item B<C<n>>
means the Name property. All the elements of the map array are simple
@@ -2693,7 +2694,7 @@ properties, except that one of the scalar elements is of the form:
This signifies that this entry should be replaced by the decompositions for
all the code points whose decomposition is algorithmically calculated. (All
-of them are currently in one range and no others outisde the range are likely
+of them are currently in one range and no others outside the range are likely
to ever be added to Unicode; the C<"n"> format
has this same entry.) These can be generated via the function
L<Unicode::Normalize::NFD()|Unicode::Normalize>.
@@ -2701,6 +2702,9 @@ L<Unicode::Normalize::NFD()|Unicode::Normalize>.
Note that the mapping is the one that is specified in the Unicode data files,
and to get the final decomposition, it may need to be applied recursively.
+The fourth (index [3]) element (C<$default>) in the list returned for this
+format is 0.
+
=back
Note that a format begins with the letter "a" if and only the property it is
@@ -2713,29 +2717,31 @@ which is an integer. That is, it must match the regular expression:
Further, the first element in a range never needs adjustment, as the
adjustment would be just adding 0.
-A binary search can be used to quickly find a code point in the inversion
-list, and hence its corresponding mapping.
+A binary search such as that provided by L</search_invlist()>, can be used to
+quickly find a code point in the inversion list, and hence its corresponding
+mapping.
-The final element (index [3], assigned to C<$default> in the "block" example) in
-the four element list returned by this function may be useful for applications
+The final, fourth element (index [3], assigned to C<$default> in the "block"
+example) in the four element list returned by this function is used with the
+C<"a"> format types; it may also be useful for applications
that wish to convert the returned inversion map data structure into some
other, such as a hash. It gives the mapping that most code points map to
under the property. If you establish the convention that any code point not
explicitly listed in your data structure maps to this value, you can
potentially make your data structure much smaller. As you construct your data
structure from the one returned by this function, simply ignore those ranges
-that map to this value, generally called the "default" value. For example, to
+that map to this value. For example, to
convert to the data structure searchable by L</charinrange()>, you can follow
this recipe for properties that don't require adjustments:
- my ($list_ref, $map_ref, $format, $missing) = prop_invmap($property);
+ my ($list_ref, $map_ref, $format, $default) = prop_invmap($property);
my @range_list;
# Look at each element in the list, but the -2 is needed because we
# look at $i+1 in the loop, and the final element is guaranteed to map
- # to $missing by prop_invmap(), so we would skip it anyway.
+ # to $default by prop_invmap(), so we would skip it anyway.
for my $i (0 .. @$list_ref - 2) {
- next if $map_ref->[$i] eq $missing;
+ next if $map_ref->[$i] eq $default;
push @range_list, [ $list_ref->[$i],
$list_ref->[$i+1],
$map_ref->[$i]
@@ -2745,13 +2751,13 @@ this recipe for properties that don't require adjustments:
print charinrange(\@range_list, $code_point), "\n";
With this, C<charinrange()> will return C<undef> if its input code point maps
-to C<$missing>. You can avoid this by omitting the C<next> statement, and adding
+to C<$default>. You can avoid this by omitting the C<next> statement, and adding
a line after the loop to handle the final element of the inversion map.
Similarly, this recipe can be used for properties that do require adjustments:
for my $i (0 .. @$list_ref - 2) {
- next if $map_ref->[$i] eq $missing;
+ next if $map_ref->[$i] eq $default;
# prop_invmap() guarantees that if the mapping is to an array, the
# range has just one element, so no need to worry about adjustments.
@@ -2817,7 +2823,7 @@ sub prop_invmap ($) {
# The swash has two components we look at, the base list, and a hash,
# named 'SPECIALS', containing any additional members whose mappings don't
- # fit into the the base list scheme of things. These generally 'override'
+ # fit into the base list scheme of things. These generally 'override'
# any value in the base list for the same code point.
my $overrides;
@@ -2934,10 +2940,8 @@ RETRY:
my $code_point = hex $hex_code_point;
# The name of all controls is the default: the empty string.
- # The set of controls is immutable, so these hard-coded
- # constants work.
- next if $code_point <= 0x9F
- && ($code_point <= 0x1F || $code_point >= 0x7F);
+ # The set of controls is immutable
+ next if chr($code_point) =~ /[[:cntrl:]]/u;
# If this is a name_alias, it isn't a name
next if grep { $_ eq $name } @{$aliases{$code_point}};
@@ -3161,7 +3165,7 @@ RETRY:
$list .= "$hex_begin\t$hex_end\t$decimal_map\n";
} else {
- # Here, no combining done. Just appen the initial
+ # Here, no combining done. Just append the initial
# (and current) values.
$list .= "$hex_begin\t\t$decimal_map\n";
}
@@ -3198,159 +3202,182 @@ RETRY:
my $requires_adjustment = $format =~ /^a/;
- # The LIST input lines look like:
- # ...
- # 0374\t\tCommon
- # 0375\t0377\tGreek # [3]
- # 037A\t037D\tGreek # [4]
- # 037E\t\tCommon
- # 0384\t\tGreek
- # ...
- #
- # Convert them to like
- # 0374 => Common
- # 0375 => Greek
- # 0378 => $missing
- # 037A => Greek
- # 037E => Common
- # 037F => $missing
- # 0384 => Greek
- #
- # For binary properties, the final non-comment column is absent, and
- # assumed to be 'Y'.
-
- foreach my $range (split "\n", $swash->{'LIST'}) {
- $range =~ s/ \s* (?: \# .* )? $ //xg; # rmv trailing space, comments
-
- # Find the beginning and end of the range on the line
- my ($hex_begin, $hex_end, $map) = split "\t", $range;
- my $begin = hex $hex_begin;
- my $end = (defined $hex_end && $hex_end ne "")
- ? hex $hex_end
- : $begin;
-
- # Each time through the loop (after the first):
- # $invlist[-2] contains the beginning of the previous range processed
- # $invlist[-1] contains the end+1 of the previous range processed
- # $invmap[-2] contains the value of the previous range processed
- # $invmap[-1] contains the default value for missing ranges ($missing)
- #
- # Thus, things are set up for the typical case of a new non-adjacent
- # range of non-missings to be added. But, if the new range is
- # adjacent, it needs to replace the [-1] element; and if the new
- # range is a multiple value of the previous one, it needs to be added
- # to the [-2] map element.
-
- # The first time through, everything will be empty. If the property
- # doesn't have a range that begins at 0, add one that maps to $missing
- if (! @invlist) {
- if ($begin != 0) {
- push @invlist, 0;
- push @invmap, $missing;
- }
+ if ($swash->{'LIST'} =~ /^V/) {
+ @invlist = split "\n", $swash->{'LIST'} =~ s/ \s* (?: \# .* )? $ //xmgr;
+ shift @invlist;
+ foreach my $i (0 .. @invlist - 1) {
+ $invmap[$i] = ($i % 2 == 0) ? 'Y' : 'N'
}
- elsif (@invlist > 1 && $invlist[-2] == $begin) {
-
- # Here we handle the case where the input has multiple entries for
- # each code point. mktables should have made sure that each such
- # range contains only one code point. At this point, $invlist[-1]
- # is the $missing that was added at the end of the last loop
- # iteration, and [-2] is the last real input code point, and that
- # code point is the same as the one we are adding now, making the
- # new one a multiple entry. Add it to the existing entry, either
- # by pushing it to the existing list of multiple entries, or
- # converting the single current entry into a list with both on it.
- # This is all we need do for this iteration.
-
- if ($end != $begin) {
- croak __PACKAGE__, ":prop_invmap: Multiple maps per code point in '$prop' require single-element ranges: begin=$begin, end=$end, map=$map";
- }
- if (! ref $invmap[-2]) {
- $invmap[-2] = [ $invmap[-2], $map ];
- }
- else {
- push @{$invmap[-2]}, $map;
- }
- $has_multiples = 1;
- next;
+
+ # The map includes lines for all code points; add one for the range
+ # from 0 to the first Y.
+ if ($invlist[0] != 0) {
+ unshift @invlist, 0;
+ unshift @invmap, 'N';
}
- elsif ($invlist[-1] == $begin) {
-
- # If the input isn't in the most compact form, so that there are
- # two adjacent ranges that map to the same thing, they should be
- # combined (EXCEPT where the arrays require adjustments, in which
- # case everything is already set up correctly). This happens in
- # our constructed dt mapping, as Element [-2] is the map for the
- # latest range so far processed. Just set the beginning point of
- # the map to $missing (in invlist[-1]) to 1 beyond where this
- # range ends. For example, in
- # 12\t13\tXYZ
- # 14\t17\tXYZ
- # we have set it up so that it looks like
- # 12 => XYZ
- # 14 => $missing
+ }
+ else {
+ # The LIST input lines look like:
+ # ...
+ # 0374\t\tCommon
+ # 0375\t0377\tGreek # [3]
+ # 037A\t037D\tGreek # [4]
+ # 037E\t\tCommon
+ # 0384\t\tGreek
+ # ...
+ #
+ # Convert them to like
+ # 0374 => Common
+ # 0375 => Greek
+ # 0378 => $missing
+ # 037A => Greek
+ # 037E => Common
+ # 037F => $missing
+ # 0384 => Greek
+ #
+ # For binary properties, the final non-comment column is absent, and
+ # assumed to be 'Y'.
+
+ foreach my $range (split "\n", $swash->{'LIST'}) {
+ $range =~ s/ \s* (?: \# .* )? $ //xg; # rmv trailing space, comments
+
+ # Find the beginning and end of the range on the line
+ my ($hex_begin, $hex_end, $map) = split "\t", $range;
+ my $begin = hex $hex_begin;
+ no warnings 'portable';
+ my $end = (defined $hex_end && $hex_end ne "")
+ ? hex $hex_end
+ : $begin;
+
+ # Each time through the loop (after the first):
+ # $invlist[-2] contains the beginning of the previous range processed
+ # $invlist[-1] contains the end+1 of the previous range processed
+ # $invmap[-2] contains the value of the previous range processed
+ # $invmap[-1] contains the default value for missing ranges
+ # ($missing)
#
- # We now see that it should be
- # 12 => XYZ
- # 18 => $missing
- if (! $requires_adjustment && @invlist > 1 && ( (defined $map)
- ? $invmap[-2] eq $map
- : $invmap[-2] eq 'Y'))
- {
- $invlist[-1] = $end + 1;
+ # Thus, things are set up for the typical case of a new
+ # non-adjacent range of non-missings to be added. But, if the new
+ # range is adjacent, it needs to replace the [-1] element; and if
+ # the new range is a multiple value of the previous one, it needs
+ # to be added to the [-2] map element.
+
+ # The first time through, everything will be empty. If the
+ # property doesn't have a range that begins at 0, add one that
+ # maps to $missing
+ if (! @invlist) {
+ if ($begin != 0) {
+ push @invlist, 0;
+ push @invmap, $missing;
+ }
+ }
+ elsif (@invlist > 1 && $invlist[-2] == $begin) {
+
+ # Here we handle the case where the input has multiple entries
+ # for each code point. mktables should have made sure that
+ # each such range contains only one code point. At this
+ # point, $invlist[-1] is the $missing that was added at the
+ # end of the last loop iteration, and [-2] is the last real
+ # input code point, and that code point is the same as the one
+ # we are adding now, making the new one a multiple entry. Add
+ # it to the existing entry, either by pushing it to the
+ # existing list of multiple entries, or converting the single
+ # current entry into a list with both on it. This is all we
+ # need do for this iteration.
+
+ if ($end != $begin) {
+ croak __PACKAGE__, ":prop_invmap: Multiple maps per code point in '$prop' require single-element ranges: begin=$begin, end=$end, map=$map";
+ }
+ if (! ref $invmap[-2]) {
+ $invmap[-2] = [ $invmap[-2], $map ];
+ }
+ else {
+ push @{$invmap[-2]}, $map;
+ }
+ $has_multiples = 1;
next;
}
+ elsif ($invlist[-1] == $begin) {
+
+ # If the input isn't in the most compact form, so that there
+ # are two adjacent ranges that map to the same thing, they
+ # should be combined (EXCEPT where the arrays require
+ # adjustments, in which case everything is already set up
+ # correctly). This happens in our constructed dt mapping, as
+ # Element [-2] is the map for the latest range so far
+ # processed. Just set the beginning point of the map to
+ # $missing (in invlist[-1]) to 1 beyond where this range ends.
+ # For example, in
+ # 12\t13\tXYZ
+ # 14\t17\tXYZ
+ # we have set it up so that it looks like
+ # 12 => XYZ
+ # 14 => $missing
+ #
+ # We now see that it should be
+ # 12 => XYZ
+ # 18 => $missing
+ if (! $requires_adjustment && @invlist > 1 && ( (defined $map)
+ ? $invmap[-2] eq $map
+ : $invmap[-2] eq 'Y'))
+ {
+ $invlist[-1] = $end + 1;
+ next;
+ }
- # Here, the range started in the previous iteration that maps to
- # $missing starts at the same code point as this range. That
- # means there is no gap to fill that that range was intended for,
- # so we just pop it off the parallel arrays.
- pop @invlist;
- pop @invmap;
- }
-
- # Add the range beginning, and the range's map.
- push @invlist, $begin;
- if ($returned_prop eq 'ToDm') {
-
- # The decomposition maps are either a line like <hangul syllable>
- # which are to be taken as is; or a sequence of code points in hex
- # and separated by blanks. Convert them to decimal, and if there
- # is more than one, use an anonymous array as the map.
- if ($map =~ /^ < /x) {
- push @invmap, $map;
+ # Here, the range started in the previous iteration that maps
+ # to $missing starts at the same code point as this range.
+ # That means there is no gap to fill that that range was
+ # intended for, so we just pop it off the parallel arrays.
+ pop @invlist;
+ pop @invmap;
}
- else {
- my @map = split " ", $map;
- if (@map == 1) {
- push @invmap, $map[0];
+
+ # Add the range beginning, and the range's map.
+ push @invlist, $begin;
+ if ($returned_prop eq 'ToDm') {
+
+ # The decomposition maps are either a line like <hangul
+ # syllable> which are to be taken as is; or a sequence of code
+ # points in hex and separated by blanks. Convert them to
+ # decimal, and if there is more than one, use an anonymous
+ # array as the map.
+ if ($map =~ /^ < /x) {
+ push @invmap, $map;
}
else {
- push @invmap, \@map;
+ my @map = split " ", $map;
+ if (@map == 1) {
+ push @invmap, $map[0];
+ }
+ else {
+ push @invmap, \@map;
+ }
}
}
- }
- else {
+ else {
- # Otherwise, convert hex formatted list entries to decimal; add a
- # 'Y' map for the missing value in binary properties, or
- # otherwise, use the input map unchanged.
- $map = ($format eq 'x')
- ? hex $map
- : $format eq 'b'
- ? 'Y'
- : $map;
- push @invmap, $map;
- }
+ # Otherwise, convert hex formatted list entries to decimal;
+ # add a 'Y' map for the missing value in binary properties, or
+ # otherwise, use the input map unchanged.
+ $map = ($format eq 'x' || $format eq 'ax')
+ ? hex $map
+ : $format eq 'b'
+ ? 'Y'
+ : $map;
+ push @invmap, $map;
+ }
- # We just started a range. It ends with $end. The gap between it and
- # the next element in the list must be filled with a range that maps
- # to the default value. If there is no gap, the next iteration will
- # pop this, unless there is no next iteration, and we have filled all
- # of the Unicode code space, so check for that and skip.
- if ($end < $MAX_UNICODE_CODEPOINT) {
- push @invlist, $end + 1;
- push @invmap, $missing;
+ # We just started a range. It ends with $end. The gap between it
+ # and the next element in the list must be filled with a range
+ # that maps to the default value. If there is no gap, the next
+ # iteration will pop this, unless there is no next iteration, and
+ # we have filled all of the Unicode code space, so check for that
+ # and skip.
+ if ($end < $Unicode::UCD::MAX_CP) {
+ push @invlist, $end + 1;
+ push @invmap, $missing;
+ }
}
}
@@ -3361,10 +3388,15 @@ RETRY:
push @invmap, $missing;
}
- # And add in standard element that all non-Unicode code points map to:
- # $missing
- push @invlist, $MAX_UNICODE_CODEPOINT + 1;
- push @invmap, $missing;
+ # The final element is always for just the above-Unicode code points. If
+ # not already there, add it. It merely splits the current final range
+ # that extends to infinity into two elements, each with the same map.
+ # (This is to conform with the API that says the final element is for
+ # $MAX_UNICODE_CODEPOINT + 1 .. INFINITY.)
+ if ($invlist[-1] != $MAX_UNICODE_CODEPOINT + 1) {
+ push @invmap, $invmap[-1];
+ push @invlist, $MAX_UNICODE_CODEPOINT + 1;
+ }
# The second component of the map are those values that require
# non-standard specification, stored in SPECIALS. These override any
@@ -3411,7 +3443,7 @@ RETRY:
}
# Find the range that the override applies to.
- my $i = _search_invlist(\@invlist, $cp);
+ my $i = search_invlist(\@invlist, $cp);
if ($cp < $invlist[$i] || $cp >= $invlist[$i + 1]) {
croak __PACKAGE__, "::prop_invmap: wrong_range, cp=$cp; i=$i, current=$invlist[$i]; next=$invlist[$i + 1]"
}
@@ -3520,6 +3552,100 @@ RETRY:
return (\@invlist, \@invmap, $format, $missing);
}
+sub search_invlist {
+
+=pod
+
+=head2 B<search_invlist()>
+
+ use Unicode::UCD qw(prop_invmap prop_invlist);
+ use Unicode::UCD 'search_invlist';
+
+ my @invlist = prop_invlist($property_name);
+ print $code_point, ((search_invlist(\@invlist, $code_point) // -1) % 2)
+ ? " isn't"
+ : " is",
+ " in $property_name\n";
+
+ my ($blocks_ranges_ref, $blocks_map_ref) = prop_invmap("Block");
+ my $index = search_invlist($blocks_ranges_ref, $code_point);
+ print "$code_point is in block ", $blocks_map_ref->[$index], "\n";
+
+C<search_invlist> is used to search an inversion list returned by
+C<prop_invlist> or C<prop_invmap> for a particular L</code point argument>.
+C<undef> is returned if the code point is not found in the inversion list
+(this happens only when it is not a legal L<code point argument>, or is less
+than the list's first element). A warning is raised in the first instance.
+
+Otherwise, it returns the index into the list of the range that contains the
+code point.; that is, find C<i> such that
+
+ list[i]<= code_point < list[i+1].
+
+As explained in L</prop_invlist()>, whether a code point is in the list or not
+depends on if the index is even (in) or odd (not in). And as explained in
+L</prop_invmap()>, the index is used with the returned parallel array to find
+the mapping.
+
+=cut
+
+
+ my $list_ref = shift;
+ my $input_code_point = shift;
+ my $code_point = _getcode($input_code_point);
+
+ if (! defined $code_point) {
+ carp __PACKAGE__, "::search_invlist: unknown code '$input_code_point'";
+ return;
+ }
+
+ my $max_element = @$list_ref - 1;
+
+ # Return undef if list is empty or requested item is before the first element.
+ return if $max_element < 0;
+ return if $code_point < $list_ref->[0];
+
+ # Short cut something at the far-end of the table. This also allows us to
+ # refer to element [$i+1] without fear of being out-of-bounds in the loop
+ # below.
+ return $max_element if $code_point >= $list_ref->[$max_element];
+
+ use integer; # want integer division
+
+ my $i = $max_element / 2;
+
+ my $lower = 0;
+ my $upper = $max_element;
+ while (1) {
+
+ if ($code_point >= $list_ref->[$i]) {
+
+ # Here we have met the lower constraint. We can quit if we
+ # also meet the upper one.
+ last if $code_point < $list_ref->[$i+1];
+
+ $lower = $i; # Still too low.
+
+ }
+ else {
+
+ # Here, $code_point < $list_ref[$i], so look lower down.
+ $upper = $i;
+ }
+
+ # Split search domain in half to try again.
+ my $temp = ($upper + $lower) / 2;
+
+ # No point in continuing unless $i changes for next time
+ # in the loop.
+ return $i if $temp == $i;
+ $i = $temp;
+ } # End of while loop
+
+ # Here we have found the offset
+ return $i;
+}
+
=head2 Unicode::UCD::UnicodeVersion
This returns the version of the Unicode Character Database, in other words, the
@@ -3604,10 +3730,6 @@ for its block using C<charblock>).
Note that starting in Unicode 6.1, many of the block names have shorter
synonyms. These are always given in the new style.
-=head1 BUGS
-
-Does not yet support EBCDIC platforms.
-
=head1 AUTHOR
Jarkko Hietaniemi. Now maintained by perl5 porters.
diff --git a/gnu/usr.bin/perl/lib/Unicode/UCD.t b/gnu/usr.bin/perl/lib/Unicode/UCD.t
index e070defbeae..2abb05acb23 100644
--- a/gnu/usr.bin/perl/lib/Unicode/UCD.t
+++ b/gnu/usr.bin/perl/lib/Unicode/UCD.t
@@ -13,6 +13,9 @@ BEGIN {
}
}
+my @warnings;
+local $SIG{__WARN__} = sub { push @warnings, @_ };
+
use strict;
use Unicode::UCD;
use Test::More;
@@ -343,7 +346,7 @@ is($bt->{AL}, 'Right-to-Left Arabic', 'AL is Right-to-Left Arabic');
# If this fails, then maybe one should look at the Unicode changes to see
# what else might need to be updated.
-is(Unicode::UCD::UnicodeVersion, '6.2.0', 'UnicodeVersion');
+is(Unicode::UCD::UnicodeVersion, '6.3.0', 'UnicodeVersion');
use Unicode::UCD qw(compexcl);
@@ -534,6 +537,8 @@ is(prop_aliases("isgc"), undef,
"prop_aliases('isgc') returns <undef> since is not covered Perl extension");
is(prop_aliases("Is_Is_Any"), undef,
"prop_aliases('Is_Is_Any') returns <undef> since two is's");
+is(prop_aliases("ccc=vr"), undef,
+ "prop_aliases('ccc=vr') doesn't generate a warning");
require 'utf8_heavy.pl';
require "unicore/Heavy.pl";
@@ -628,7 +633,7 @@ while (<$props>) {
# official properties. We have no way of knowing if mktables omitted a Perl
# extension or not, but we do the best we can from its generated lists
-foreach my $alias (keys %utf8::loose_to_file_of) {
+foreach my $alias (sort keys %utf8::loose_to_file_of) {
next if $alias =~ /=/;
my $lc_name = lc $alias;
my $loose = &utf8::_loose_name($lc_name);
@@ -818,7 +823,7 @@ while (<$propvalues>) {
# And test as best we can, the non-official pva's that mktables generates.
foreach my $hash (\%utf8::loose_to_file_of, \%utf8::stricter_to_file_of) {
- foreach my $test (keys %$hash) {
+ foreach my $test (sort keys %$hash) {
next if exists $pva_tested{$test}; # Skip if already tested
my ($prop, $value) = split "=", $test;
@@ -992,7 +997,7 @@ my %tested_invlist;
# strict
foreach my $set_of_tables (\%utf8::stricter_to_file_of, \%utf8::loose_to_file_of)
{
- foreach my $table (keys %$set_of_tables) {
+ foreach my $table (sort keys %$set_of_tables) {
my $mod_table;
my ($prop_only, $value) = split "=", $table;
@@ -1019,7 +1024,7 @@ foreach my $set_of_tables (\%utf8::stricter_to_file_of, \%utf8::loose_to_file_of
}
else { # Single-form.
- # Like above, use looose if required, and insert underscores
+ # Like above, use loose if required, and insert underscores
# between digits if strict.
if ($set_of_tables == \%utf8::loose_to_file_of) {
$mod_table = "$extra_chars$table";
@@ -1046,9 +1051,20 @@ foreach my $set_of_tables (\%utf8::stricter_to_file_of, \%utf8::loose_to_file_of
}
$tested_invlist{$file} = dclone \@tested;
- # A leading '!' in the file name means that it is to be inverted.
- my $invert = $file =~ s/^!//;
- my $official = do "unicore/lib/$file.pl";
+ # A '!' in the file name means that it is to be inverted.
+ my $invert = $file =~ s/!//;
+ my $official;
+
+ # If the file's directory is '#', it is a special case where the
+ # contents are in-lined with semi-colons meaning new-lines, instead of
+ # it being an actual file to read. The file is an index in to the
+ # array of the definitions
+ if ($file =~ s!^#/!!) {
+ $official = $utf8::inline_definitions[$file];
+ }
+ else {
+ $official = do "unicore/lib/$file.pl";
+ }
# Get rid of any trailing space and comments in the file.
$official =~ s/\s*(#.*)?$//mg;
@@ -1058,56 +1074,28 @@ foreach my $set_of_tables (\%utf8::stricter_to_file_of, \%utf8::loose_to_file_of
# If we are to test against an inverted file, it is easier to invert
# our array than the file.
- # The file only is valid for Unicode code points, while the inversion
- # list is valid for all possible code points. Therefore, we must test
- # just the Unicode part against the file. Later we will test for
- # the non-Unicode part.
-
- my $before_invert; # Saves the pre-inverted table.
if ($invert) {
- $before_invert = dclone \@tested;
if (@tested && $tested[0] == 0) {
shift @tested;
} else {
unshift @tested, 0;
}
- if (@tested && $tested[-1] == 0x110000) {
- pop @tested;
- }
- else {
- push @tested, 0x110000;
- }
}
# Now construct a string from the list that should match the file.
- # The file gives ranges of code points with starting and ending values
- # in hex, like this:
- # 0041\t005A
- # 0061\t007A
- # 00AA
- # Our list has even numbered elements start ranges that are in the
- # list, and odd ones that aren't in the list. Therefore the odd
- # numbered ones are one beyond the end of the previous range, but
- # otherwise don't get reflected in the file.
- my $tested = "";
- my $i = 0;
- for (; $i < @tested - 1; $i += 2) {
- my $start = $tested[$i];
- my $end = $tested[$i+1] - 1;
- if ($start == $end) {
- $tested .= sprintf("%04X\n", $start);
- }
- else {
- $tested .= sprintf "%04X\t%04X\n", $start, $end;
- }
- }
-
- # As mentioned earlier, the disk files only go up through Unicode,
- # whereas the prop_invlist() ones go as high as necessary. The
- # comparison is only valid through max Unicode.
- if ($i == @tested - 1 && $tested[$i] <= 0x10FFFF) {
- $tested .= sprintf("%04X\t10FFFF\n", $tested[$i]);
- }
+ # The file is inversion list format code points, like this:
+ # V1216
+ # 65 # [26]
+ # 91
+ # 192 # [23]
+ # ...
+ # The V indicates it's an inversion list, and is followed immediately
+ # by the number of elements (lines) that follow giving its contents.
+ # The list has even numbered elements (0th, 2nd, ...) start ranges
+ # that are in the list, and odd ones that aren't in the list.
+ # Therefore the odd numbered ones are one beyond the end of the
+ # previous range, but otherwise don't get reflected in the file.
+ my $tested = join "\n", ("V" . scalar @tested), @tested;
local $/ = "\n";
chomp $tested;
$/ = $input_record_separator;
@@ -1116,50 +1104,6 @@ foreach my $set_of_tables (\%utf8::stricter_to_file_of, \%utf8::loose_to_file_of
next;
}
- # Here, it matched the table. Now need to check for if it is correct
- # for beyond Unicode. First, calculate if is the default table or
- # not. This is the same algorithm as used internally in
- # prop_invlist(), so if it is wrong there, this test won't catch it.
- my $prop = lc $table;
- ($prop_only, $table) = split /\s*[:=]\s*/, $prop;
- if (defined $table) {
-
- # May have optional prefixed 'is'
- $prop = &utf8::_loose_name($prop_only) =~ s/^is//r;
- $prop = $utf8::loose_property_name_of{$prop};
- $prop .= "=" . &utf8::_loose_name($table);
- }
- else {
- $prop = &utf8::_loose_name($prop);
- }
- my $is_default = exists $Unicode::UCD::loose_defaults{$prop};
-
- @tested = @$before_invert if $invert; # Use the original
- if (@tested % 2 == 0) {
-
- # If there are an even number of elements, the final one starts a
- # range (going to infinity) of code points that are not in the
- # list.
- if ($is_default) {
- fail("prop_invlist('$mod_table')");
- diag("default table doesn't goto infinity");
- use Data::Dumper;
- diag Dumper \@tested;
- next;
- }
- }
- else {
- # An odd number of elements means the final one starts a range
- # (going to infinity of code points that are in the list.
- if (! $is_default) {
- fail("prop_invlist('$mod_table')");
- diag("non-default table needs to stop in the Unicode range");
- use Data::Dumper;
- diag Dumper \@tested;
- next;
- }
- }
-
pass("prop_invlist('$mod_table')");
}
}
@@ -1183,6 +1127,49 @@ is(@list, 0, "prop_invmap('Perl_Charnames') returns <undef> since internal-Perl-
@list = prop_invmap("Is_Is_Any");
is(@list, 0, "prop_invmap('Is_Is_Any') returns <undef> since two is's");
+# The files for these properties are not used by Perl, but are retained for
+# backwards compatibility with applications that read them directly, with
+# comments in them that their use is deprecated. Until such time as we remove
+# them completely, we test that they exist, are correct, and that their
+# formats haven't changed. This hash contains the info needed to test them as
+# if they were regular properties. 'replaced_by' gives the equivalent
+# property now used by Perl.
+my %legacy_props = (
+ Legacy_Case_Folding => { replaced_by => 'cf',
+ file => 'To/Fold',
+ swash_name => 'ToFold'
+ },
+ Legacy_Lowercase_Mapping => { replaced_by => 'lc',
+ file => 'To/Lower',
+ swash_name => 'ToLower'
+ },
+ Legacy_Titlecase_Mapping => { replaced_by => 'tc',
+ file => 'To/Title',
+ swash_name => 'ToTitle'
+ },
+ Legacy_Uppercase_Mapping => { replaced_by => 'uc',
+ file => 'To/Upper',
+ swash_name => 'ToUpper'
+ },
+ Legacy_Perl_Decimal_Digit => { replaced_by => 'Perl_Decimal_Digit',
+ file => 'To/Digit',
+ swash_name => 'ToDigit'
+ },
+ );
+
+foreach my $legacy_prop (keys %legacy_props) {
+ @list = prop_invmap($legacy_prop);
+ is(@list, 0, "'$legacy_prop' is unknown to prop_invmap");
+}
+
+# The files for these properties shouldn't have their formats changed in case
+# applications use them (though such use is deprecated).
+my @legacy_file_format = (keys %legacy_props,
+ qw( Bidi_Mirroring_Glyph
+ NFKC_Casefold
+ )
+ );
+
# The set of properties to test on has already been compiled into %props by
# the prop_aliases() tests.
@@ -1202,19 +1189,54 @@ my %tested_invmaps;
# lists returned by prop_invlist(), which has already been tested.
PROPERTY:
-foreach my $prop (keys %props) {
+foreach my $prop (sort(keys %props), sort keys %legacy_props) {
+ my $is_legacy = 0;
my $loose_prop = &utf8::_loose_name(lc $prop);
my $suppressed = grep { $_ eq $loose_prop }
@Unicode::UCD::suppressed_properties;
+ my $actual_lookup_prop;
+ my $display_prop; # The property name that is displayed, as opposed
+ # to the one that is actually used.
+
# Find the short and full names that this property goes by
my ($name, $full_name) = prop_aliases($prop);
if (! $name) {
- if (! $suppressed) {
- fail("prop_invmap('$prop')");
- diag("is unknown to prop_aliases(), and we need it in order to test prop_invmap");
+
+ # Here, Perl doesn't know about this property. It could be a
+ # suppressed one, or a legacy one.
+ if (grep { $prop eq $_ } keys %legacy_props) {
+
+ # For legacy properties, we look up the modern equivalent
+ # property instead; later massaging the results to look like the
+ # known format of the legacy property. We add info about the
+ # legacy property to the data structures for the rest of the
+ # properties; this is to avoid more special cases for the legacies
+ # in the code below
+ $full_name = $name = $prop;
+ $actual_lookup_prop = $legacy_props{$prop}->{'replaced_by'};
+ my $base_file = $legacy_props{$prop}->{'file'};
+
+ # This legacy property is otherwise unknown to Perl; so shouldn't
+ # have any information about it already.
+ ok(! exists $utf8::loose_property_to_file_of{$loose_prop},
+ "There isn't a hash entry for file lookup of $prop");
+ $utf8::loose_property_to_file_of{$loose_prop} = $base_file;
+
+ ok(! exists $utf8::file_to_swash_name{$loose_prop},
+ "There isn't a hash entry for swash lookup of $prop");
+ $utf8::file_to_swash_name{$base_file}
+ = $legacy_props{$prop}->{'swash_name'};
+ $display_prop = $prop;
+ $is_legacy = 1;
}
+ else {
+ if (! $suppressed) {
+ fail("prop_invmap('$prop')");
+ diag("is unknown to prop_aliases(), and we need it in order to test prop_invmap");
+ }
next PROPERTY;
+ }
}
# Normalize the short name, as it is stored in the hashes under the
@@ -1223,15 +1245,59 @@ foreach my $prop (keys %props) {
# Add in the characters that are supposed to be ignored to test loose
# matching, which the tested function applies to all properties
- my $mod_prop = "$extra_chars$prop";
+ $display_prop = "$extra_chars$prop" unless $display_prop;
+ $actual_lookup_prop = $display_prop unless $actual_lookup_prop;
- my ($invlist_ref, $invmap_ref, $format, $missing) = prop_invmap($mod_prop);
+ my ($invlist_ref, $invmap_ref, $format, $missing) = prop_invmap($actual_lookup_prop);
my $return_ref = [ $invlist_ref, $invmap_ref, $format, $missing ];
+
+ # The legacy property files all are expanded out so that each range is 1
+ # element long. That isn't true of the modern equivalent we use to check
+ # those files for correctness against. So take the output of the proxy
+ # and expand it to match the legacy file.
+ if ($is_legacy) {
+ my @expanded_list;
+ my @expanded_map;
+ for my $i (0 .. @$invlist_ref - 1 - 1) {
+ if (ref $invmap_ref->[$i] || $invmap_ref->[$i] eq $missing) {
+
+ # No adjustments should be done for the default mapping and
+ # the multi-char ones.
+ push @expanded_list, $invlist_ref->[$i];
+ push @expanded_map, $invmap_ref->[$i];
+ }
+ else {
+
+ # Expand the range into separate elements for each item.
+ my $offset = 0;
+ for my $j ($invlist_ref->[$i] .. $invlist_ref->[$i+1] -1) {
+ push @expanded_list, $j;
+ push @expanded_map, $invmap_ref->[$i] + $offset;
+
+ # The 'ae' format is for Legacy_Perl_Decimal_Digit; the
+ # other 4 are kept with leading zeros in the file, so
+ # convert to that.
+ $expanded_map[-1] = sprintf("%04X", $expanded_map[-1])
+ if $format ne 'ae';
+ $offset++;
+ }
+ }
+ }
+
+ # Final element is taken as is. The map should always be to the
+ # default value, so don't do a sprintf like we did above.
+ push @expanded_list, $invlist_ref->[-1];
+ push @expanded_map, $invmap_ref->[-1];
+
+ $invlist_ref = \@expanded_list;
+ $invmap_ref = \@expanded_map;
+ }
+
# If have already tested this property under a different name, merely
# compare the return from now with the saved one from before.
if (exists $tested_invmaps{$name}) {
- is_deeply($return_ref, $tested_invmaps{$name}, "prop_invmap('$mod_prop') gave same results as its synonym, '$name'");
+ is_deeply($return_ref, $tested_invmaps{$name}, "prop_invmap('$display_prop') gave same results as its synonym, '$name'");
next PROPERTY;
}
$tested_invmaps{$name} = dclone $return_ref;
@@ -1240,20 +1306,20 @@ foreach my $prop (keys %props) {
# not generated.
if ($suppressed) {
if (defined $format) {
- fail("prop_invmap('$mod_prop')");
+ fail("prop_invmap('$display_prop')");
diag("did not return undef for suppressed property $prop");
}
next PROPERTY;
}
elsif (!defined $format) {
- fail("prop_invmap('$mod_prop')");
+ fail("prop_invmap('$display_prop')");
diag("'$prop' is unknown to prop_invmap()");
next PROPERTY;
}
# The two parallel arrays must have the same number of elements.
if (@$invlist_ref != @$invmap_ref) {
- fail("prop_invmap('$mod_prop')");
+ fail("prop_invmap('$display_prop')");
diag("invlist has "
. scalar @$invlist_ref
. " while invmap has "
@@ -1265,19 +1331,47 @@ foreach my $prop (keys %props) {
# The last element must be for the above-Unicode code points, and must be
# for the default value.
if ($invlist_ref->[-1] != 0x110000) {
- fail("prop_invmap('$mod_prop')");
+ fail("prop_invmap('$display_prop')");
diag("The last inversion list element is not 0x110000");
next PROPERTY;
}
- if ($invmap_ref->[-1] ne $missing) {
- fail("prop_invmap('$mod_prop')");
+
+ my $upper_limit_subtract;
+
+ # prop_invmap() adds an extra element not present in the disk files for
+ # the above-Unicode code points. For almost all properties, that will be
+ # to $missing. In that case we don't look further at it when comparing
+ # with the disk files.
+ if ($invmap_ref->[-1] eq $missing) {
+ $upper_limit_subtract = 1;
+ }
+ elsif ($invmap_ref->[-1] eq 'Y' && ! grep { $_ !~ /[YN]/ } @$invmap_ref) {
+
+ # But that's not true for a few binary properties like 'Unassigned'
+ # that are Perl extensions (in this case for Gc=Unassigned) which
+ # match above-Unicode code points (hence the 'Y' in the test above).
+ # For properties where it isn't $missing, we're going to want to look
+ # at the whole thing when comparing with the disk file.
+ $upper_limit_subtract = 0;
+
+ # In those properties like 'Unassigned, the final element should be
+ # just a repetition of the next-to-last element, and won't be in the
+ # disk file, so remove it for the comparison. Otherwise, we will
+ # compare the whole of the array with the whole of the disk file.
+ if ($invlist_ref->[-2] <= 0x10FFFF && $invmap_ref->[-2] eq 'Y') {
+ pop @$invlist_ref;
+ pop @$invmap_ref;
+ }
+ }
+ else {
+ fail("prop_invmap('$display_prop')");
diag("The last inversion list element is '$invmap_ref->[-1]', and should be '$missing'");
next PROPERTY;
}
if ($name eq 'bmg') { # This one has an atypical $missing
if ($missing ne "") {
- fail("prop_invmap('$mod_prop')");
+ fail("prop_invmap('$display_prop')");
diag("The missings should be \"\"; got '$missing'");
next PROPERTY;
}
@@ -1285,19 +1379,19 @@ foreach my $prop (keys %props) {
elsif ($format =~ /^ a (?!r) /x) {
if ($full_name eq 'Perl_Decimal_Digit') {
if ($missing ne "") {
- fail("prop_invmap('$mod_prop')");
+ fail("prop_invmap('$display_prop')");
diag("The missings should be \"\"; got '$missing'");
next PROPERTY;
}
}
- elsif ($missing ne "0") {
- fail("prop_invmap('$mod_prop')");
+ elsif ($missing ne "0" && ! grep { $prop eq $_ } keys %legacy_props) {
+ fail("prop_invmap('$display_prop')");
diag("The missings should be '0'; got '$missing'");
next PROPERTY;
}
}
elsif ($missing =~ /[<>]/) {
- fail("prop_invmap('$mod_prop')");
+ fail("prop_invmap('$display_prop')");
diag("The missings should NOT be something with <...>'");
next PROPERTY;
@@ -1322,14 +1416,14 @@ foreach my $prop (keys %props) {
$proxy_prop = lc $1 . "c";
}
if ($format ne "a") {
- fail("prop_invmap('$mod_prop')");
+ fail("prop_invmap('$display_prop')");
diag("The format should be 'a'; got '$format'");
next PROPERTY;
}
}
if ($format !~ / ^ (?: a [der]? | ale? | n | sl? ) $ /x) {
- fail("prop_invmap('$mod_prop')");
+ fail("prop_invmap('$display_prop')");
diag("Unknown format '$format'");
next PROPERTY;
}
@@ -1371,7 +1465,7 @@ foreach my $prop (keys %props) {
{
# Translate the charblocks() data structure to what the file
# would like.
- $official .= sprintf"%04X\t%04X\t%s\n",
+ $official .= sprintf"%X\t%X\t%s\n",
$range->[0][0],
$range->[0][1],
$range->[0][2];
@@ -1392,13 +1486,19 @@ foreach my $prop (keys %props) {
# property comes along without these characteristics
if (!defined $base_file) {
$base_file = $utf8::loose_to_file_of{$proxy_prop};
- $is_binary = ($base_file =~ s/^!//) ? -1 : 1;
- $base_file = "lib/$base_file";
+ $is_binary = ($base_file =~ s/!//) ? -1 : 1;
+ $base_file = "lib/$base_file" unless $base_file =~ m!^#/!;
}
- # Read in the file
- $file = "unicore/$base_file.pl";
- $official = do $file;
+ # Read in the file. If the file's directory is '#', it is a
+ # special case where the contents are in-lined with semi-colons
+ # meaning new-lines, instead of it being an actual file to read.
+ if ($base_file =~ s!^#/!!) {
+ $official = $utf8::inline_definitions[$base_file];
+ }
+ else {
+ $official = do "unicore/$base_file.pl";
+ }
# Get rid of any trailing space and comments in the file.
$official =~ s/\s*(#.*)?$//mg;
@@ -1420,7 +1520,7 @@ foreach my $prop (keys %props) {
# easier below.
if ($end ne "") {
for my $i (hex($start) + 1 .. hex $end) {
- $official .= sprintf "%04X\t\t%s\n", $i, $value;
+ $official .= sprintf "%X\t\t%s\n", $i, $value;
}
}
}
@@ -1434,7 +1534,7 @@ foreach my $prop (keys %props) {
# get a reference to them.
my $swash_name = $utf8::file_to_swash_name{$base_file};
my $specials_ref;
- my $file_format;
+ my $file_format; # The 'format' given inside the file
if ($swash_name) {
$specials_ref = $utf8::SwashInfo{$swash_name}{'specials_name'};
if ($specials_ref) {
@@ -1447,9 +1547,24 @@ foreach my $prop (keys %props) {
$file_format = $utf8::SwashInfo{$swash_name}{'format'};
}
+ # Leading zeros used to be used with the values in the files that give,
+ # ranges, but these have been mostly stripped off, except for some
+ # files whose formats should not change in any way.
+ my $file_range_format = (grep { $full_name eq $_ } @legacy_file_format)
+ ? "%04X"
+ : "%X";
+ # Currently this property still has leading zeroes in the mapped-to
+ # values, but otherwise, those values follow the same rules as the
+ # ranges.
+ my $file_map_format = ($full_name eq 'Decomposition_Mapping')
+ ? "%04X"
+ : $file_range_format;
+
# Certain of the proxy properties have to be adjusted to match the
# real ones.
- if ($full_name =~ /^(Case_Folding|(Lower|Title|Upper)case_Mapping)/) {
+ if ($full_name
+ =~ /^(Legacy_)?(Case_Folding|(Lower|Title|Upper)case_Mapping)/)
+ {
# Here we have either
# 1) Case_Folding; or
@@ -1465,7 +1580,7 @@ foreach my $prop (keys %props) {
my ($start, $end, $value) = / ^ (.+?) \t (.*?) \t (.+?)
\s* ( \# .* )? $ /x;
$end = $start if $end eq "";
- push @list, [ hex $start, hex $end, $value ];
+ push @list, [ hex $start, hex $end, hex $value ];
}
# For these mappings, the file contains all the simple mappings,
@@ -1496,8 +1611,8 @@ foreach my $prop (keys %props) {
# element of the range...
if ($cp == $list[$i][0]) {
- # ... and there are other elements in the range, just shorten
- # the range to exclude this code point.
+ # ... and there are other elements in the range, just
+ # shorten the range to exclude this code point.
if ($list[$i][1] > $list[$i][0]) {
$list[$i][0]++;
}
@@ -1523,14 +1638,20 @@ foreach my $prop (keys %props) {
for my $element (@list) {
$official .= "\n" if $official;
if ($element->[1] == $element->[0]) {
- $official .= sprintf "%04X\t\t%s", $element->[0], $element->[2];
+ $official
+ .= sprintf "$file_range_format\t\t$file_map_format",
+ $element->[0], $element->[2];
}
else {
- $official .= sprintf "%04X\t%04X\t%s", $element->[0], $element->[1], $element->[2];
+ $official .= sprintf "$file_range_format\t$file_range_format\t$file_map_format",
+ $element->[0],
+ $element->[1],
+ $element->[2];
}
}
}
- elsif ($full_name =~ /Simple_(Case_Folding|(Lower|Title|Upper)case_Mapping)/)
+ elsif ($full_name
+ =~ / ^ Simple_(Case_Folding|(Lower|Title|Upper)case_Mapping) $ /x)
{
# These properties have everything in the regular array, and the
@@ -1555,6 +1676,11 @@ foreach my $prop (keys %props) {
# appends the next line to the running string.
my $tested_map = "";
+ # For use with files for binary properties only, which are stored in
+ # inversion list format. This counts the number of data lines in the
+ # file.
+ my $binary_count = 0;
+
# Create a copy of the file's specials hash. (It has been undef'd if
# we know it isn't relevant to this property, so if it exists, it's an
# error or is relevant). As we go along, we delete from that copy.
@@ -1562,9 +1688,10 @@ foreach my $prop (keys %props) {
# it's an error
my %specials = %$specials_ref if $specials_ref;
- # The extra -1 is because the final element has been tested above to
- # be for anything above Unicode. The file doesn't go that high.
- for (my $i = 0; $i < @$invlist_ref - 1; $i++) {
+ # The extra -$upper_limit_subtract is because the final element may
+ # have been tested above to be for anything above Unicode, in which
+ # case the file may not go that high.
+ for (my $i = 0; $i < @$invlist_ref - $upper_limit_subtract; $i++) {
# If the map element is a reference, have to stringify it (but
# don't do so if the format doesn't allow references, so that an
@@ -1576,9 +1703,10 @@ foreach my $prop (keys %props) {
if ($format eq 'sl') {
# At the time of this writing, there are two types of 'sl'
- # format One, in Name_Alias, has multiple separate entries
- # for each code point; the other, in Script_Extension, is space
- # separated. Assume the latter for non-Name_Alias.
+ # format One, in Name_Alias, has multiple separate
+ # entries for each code point; the other, in
+ # Script_Extension, is space separated. Assume the latter
+ # for non-Name_Alias.
if ($full_name ne 'Name_Alias') {
$invmap_ref->[$i] = join " ", @{$invmap_ref->[$i]};
}
@@ -1595,10 +1723,11 @@ foreach my $prop (keys %props) {
# other property; thus the special handling of the
# first line.
if (ref $invmap_ref->[$i]) {
- my $hex_cp = sprintf("%04X", $invlist_ref->[$i]);
+ my $hex_cp = sprintf("%X", $invlist_ref->[$i]);
my $concatenated = $invmap_ref->[$i][0];
for (my $j = 1; $j < @{$invmap_ref->[$i]}; $j++) {
- $concatenated .= "\n$hex_cp\t\t" . $invmap_ref->[$i][$j];
+ $concatenated .= "\n$hex_cp\t\t"
+ . $invmap_ref->[$i][$j];
}
$invmap_ref->[$i] = $concatenated;
}
@@ -1606,18 +1735,20 @@ foreach my $prop (keys %props) {
}
elsif ($format =~ / ^ al e? $/x) {
- # For a al property, the stringified result should be in
+ # For an al property, the stringified result should be in
# the specials hash. The key is the packed code point,
# and the value is the packed map.
my $value;
- if (! defined ($value = delete $specials{pack("C0U", $invlist_ref->[$i]) })) {
- fail("prop_invmap('$mod_prop')");
+ if (! defined ($value = delete $specials{pack("C0U",
+ $invlist_ref->[$i]) }))
+ {
+ fail("prop_invmap('$display_prop')");
diag(sprintf "There was no specials element for %04X", $invlist_ref->[$i]);
next PROPERTY;
}
my $packed = pack "U*", @{$invmap_ref->[$i]};
if ($value ne $packed) {
- fail("prop_invmap('$mod_prop')");
+ fail("prop_invmap('$display_prop')");
diag(sprintf "For %04X, expected the mapping to be '$packed', but got '$value'");
next PROPERTY;
}
@@ -1628,7 +1759,7 @@ foreach my $prop (keys %props) {
if (($i > 0 && $invlist_ref->[$i] <= $invlist_ref->[$i-1])
|| $invlist_ref->[$i] >= $invlist_ref->[$i+1])
{
- fail("prop_invmap('$mod_prop')");
+ fail("prop_invmap('$display_prop')");
diag(sprintf "Range beginning at %04X is out-of-order.", $invlist_ref->[$i]);
next PROPERTY;
}
@@ -1638,13 +1769,18 @@ foreach my $prop (keys %props) {
# The decomposition mapping file has the code points as
# a string of space-separated hex constants.
- $invmap_ref->[$i] = join " ", map { sprintf "%04X", $_ } @{$invmap_ref->[$i]};
+ $invmap_ref->[$i] = join " ", map { sprintf "%04X", $_ }
+ @{$invmap_ref->[$i]};
}
else {
- fail("prop_invmap('$mod_prop')");
+ fail("prop_invmap('$display_prop')");
diag("Can't handle format '$format'");
next PROPERTY;
}
+ } # Otherwise, the map is to a simple scalar
+ elsif (defined $file_format && $file_format eq 'ax') {
+ # These maps are in hex
+ $invmap_ref->[$i] = sprintf("%X", $invmap_ref->[$i]);
}
elsif ($format eq 'ad' || $format eq 'ale') {
@@ -1659,7 +1795,8 @@ foreach my $prop (keys %props) {
&& $invmap_ref->[$i] != 0)
{
my $next = $invmap_ref->[$i] + 1;
- $invmap_ref->[$i] = sprintf("%04X", $invmap_ref->[$i]);
+ $invmap_ref->[$i] = sprintf($file_map_format,
+ $invmap_ref->[$i]);
# If there are other elements in this range they need to
# be adjusted; they must individually be re-mapped. Do
@@ -1679,13 +1816,15 @@ foreach my $prop (keys %props) {
# should be in the specials hash, with the key the packed
# code point, and the map just empty.
my $value;
- if (! defined ($value = delete $specials{pack("C0U", $invlist_ref->[$i]) })) {
- fail("prop_invmap('$mod_prop')");
+ if (! defined ($value = delete $specials{pack("C0U",
+ $invlist_ref->[$i]) }))
+ {
+ fail("prop_invmap('$display_prop')");
diag(sprintf "There was no specials element for %04X", $invlist_ref->[$i]);
next PROPERTY;
}
if ($value ne "") {
- fail("prop_invmap('$mod_prop')");
+ fail("prop_invmap('$display_prop')");
diag(sprintf "For %04X, expected the mapping to be \"\", but got '$value'", $invlist_ref->[$i]);
next PROPERTY;
}
@@ -1696,7 +1835,7 @@ foreach my $prop (keys %props) {
if (($i > 0 && $invlist_ref->[$i] <= $invlist_ref->[$i-1])
|| $invlist_ref->[$i] >= $invlist_ref->[$i+1])
{
- fail("prop_invmap('$mod_prop')");
+ fail("prop_invmap('$display_prop')");
diag(sprintf "Range beginning at %04X is out-of-order.", $invlist_ref->[$i]);
next PROPERTY;
}
@@ -1714,7 +1853,7 @@ foreach my $prop (keys %props) {
if (($i > 0 && $invlist_ref->[$i] <= $invlist_ref->[$i-1])
|| $invlist_ref->[$i] >= $invlist_ref->[$i+1])
{
- fail("prop_invmap('$mod_prop')");
+ fail("prop_invmap('$display_prop')");
diag(sprintf "Range beginning at %04X is out-of-order.", $invlist_ref->[$i]);
next PROPERTY;
}
@@ -1730,7 +1869,7 @@ foreach my $prop (keys %props) {
if (($i > 0 && $invlist_ref->[$i] <= $invlist_ref->[$i-1])
|| $invlist_ref->[$i] >= $invlist_ref->[$i+1])
{
- fail("prop_invmap('$mod_prop')");
+ fail("prop_invmap('$display_prop')");
diag(sprintf "Range beginning at %04X is out-of-order.", $invlist_ref->[$i]);
next PROPERTY;
}
@@ -1740,19 +1879,41 @@ foreach my $prop (keys %props) {
# Finally have figured out what the map column in the file should
# be. Append the line to the running string.
my $start = $invlist_ref->[$i];
- my $end = $invlist_ref->[$i+1] - 1;
- $end = ($start == $end) ? "" : sprintf("%04X", $end);
- if ($invmap_ref->[$i] ne "") {
- $tested_map .= sprintf "%04X\t%s\t%s\n", $start, $end, $invmap_ref->[$i];
- }
- elsif ($end ne "") {
- $tested_map .= sprintf "%04X\t%s\n", $start, $end;
+ my $end = (defined $invlist_ref->[$i+1])
+ ? $invlist_ref->[$i+1] - 1
+ : $Unicode::UCD::MAX_CP;
+ if ($is_binary) {
+
+ # Files for binary properties are in inversion list format,
+ # without ranges.
+ $tested_map .= "$start\n";
+ $binary_count++;
+
+ # If the final value is infinity, no line for it exists.
+ if ($end < $Unicode::UCD::MAX_CP) {
+ $tested_map .= ($end + 1) . "\n";
+ $binary_count++;
+ }
}
else {
- $tested_map .= sprintf "%04X\n", $start;
+ $end = ($start == $end) ? "" : sprintf($file_range_format, $end);
+ if ($invmap_ref->[$i] ne "") {
+ $tested_map .= sprintf "$file_range_format\t%s\t%s\n",
+ $start, $end, $invmap_ref->[$i];
+ }
+ elsif ($end ne "") {
+ $tested_map .= sprintf "$file_range_format\t%s\n",
+ $start, $end;
+ }
+ else {
+ $tested_map .= sprintf "$file_range_format\n", $start;
+ }
}
} # End of looping over all elements.
+ # Binary property files begin with a line count line.
+ $tested_map = "V$binary_count\n$tested_map" if $binary_count;
+
# Here are done with generating what the file should look like
local $/ = "\n";
@@ -1761,13 +1922,13 @@ foreach my $prop (keys %props) {
# And compare.
if ($tested_map ne $official) {
- fail_with_diff($mod_prop, $official, $tested_map, "prop_invmap");
+ fail_with_diff($display_prop, $official, $tested_map, "prop_invmap");
next PROPERTY;
}
# There shouldn't be any specials unaccounted for.
if (keys %specials) {
- fail("prop_invmap('$mod_prop')");
+ fail("prop_invmap('$display_prop')");
diag("Unexpected specials: " . join ", ", keys %specials);
next PROPERTY;
}
@@ -1781,7 +1942,7 @@ foreach my $prop (keys %props) {
# but the Name in order to do the comparison.
if ($missing ne "") {
- fail("prop_invmap('$mod_prop')");
+ fail("prop_invmap('$display_prop')");
diag("The missings should be \"\"; got \"missing\"");
next PROPERTY;
}
@@ -1839,14 +2000,14 @@ foreach my $prop (keys %props) {
my @code_point_in_names =
@Unicode::UCD::code_points_ending_in_code_point;
- for my $i (0 .. @$invlist_ref - 1 - 1) {
+ for my $i (0 .. @$invlist_ref - 1 - $upper_limit_subtract) {
my $start = $invlist_ref->[$i];
my $end = $invlist_ref->[$i+1] - 1;
if ($invmap_ref->[$i] eq $missing) {
if (($i > 0 && $invlist_ref->[$i] <= $invlist_ref->[$i-1])
|| $invlist_ref->[$i] >= $invlist_ref->[$i+1])
{
- fail("prop_invmap('$mod_prop')");
+ fail("prop_invmap('$display_prop')");
diag(sprintf "Range beginning at %04X is out-of-order.", $invlist_ref->[$i]);
next PROPERTY;
}
@@ -1858,29 +2019,29 @@ foreach my $prop (keys %props) {
if (($i > 0 && $invlist_ref->[$i] <= $invlist_ref->[$i-1])
|| $invlist_ref->[$i] >= $invlist_ref->[$i+1])
{
- fail("prop_invmap('$mod_prop')");
+ fail("prop_invmap('$display_prop')");
diag(sprintf "Range beginning at %04X is out-of-order.", $invlist_ref->[$i]);
next PROPERTY;
}
if ($type eq "<hangul syllable>") {
if ($name ne "") {
- fail("prop_invmap('$mod_prop')");
+ fail("prop_invmap('$display_prop')");
diag("Unexpected text in $invmap_ref->[$i]");
next PROPERTY;
}
if ($start != 0xAC00) {
- fail("prop_invmap('$mod_prop')");
+ fail("prop_invmap('$display_prop')");
diag(sprintf("<hangul syllables> should begin at 0xAC00, got %04X", $start));
next PROPERTY;
}
if ($end != $start + 11172 - 1) {
- fail("prop_invmap('$mod_prop')");
+ fail("prop_invmap('$display_prop')");
diag(sprintf("<hangul syllables> should end at %04X, got %04X", $start + 11172 -1, $end));
next PROPERTY;
}
}
elsif ($type ne "<code point>") {
- fail("prop_invmap('$mod_prop')");
+ fail("prop_invmap('$display_prop')");
diag("Unexpected text '$type' in $invmap_ref->[$i]");
next PROPERTY;
}
@@ -1900,7 +2061,7 @@ foreach my $prop (keys %props) {
last;
}
else {
- fail("prop_invmap('$mod_prop')");
+ fail("prop_invmap('$display_prop')");
diag("Unexpected code-point-in-name line '$invmap_ref->[$i]'");
next PROPERTY;
}
@@ -1921,11 +2082,11 @@ foreach my $prop (keys %props) {
chomp $tested_map;
$/ = $input_record_separator;
if ($tested_map ne $official) {
- fail_with_diff($mod_prop, $official, $tested_map, "prop_invmap");
+ fail_with_diff($display_prop, $official, $tested_map, "prop_invmap");
next PROPERTY;
}
if (@code_point_in_names) {
- fail("prop_invmap('$mod_prop')");
+ fail("prop_invmap('$display_prop')");
use Data::Dumper;
diag("Missing code-point-in-name line(s)" . Dumper \@code_point_in_names);
next PROPERTY;
@@ -1945,10 +2106,7 @@ foreach my $prop (keys %props) {
my %maps;
my $previous_map;
- # (The extra -1 is to not look at the final element in the loop, which
- # we know is the one that starts just beyond Unicode and goes to
- # infinity.)
- for my $i (0 .. @$invlist_ref - 1 - 1) {
+ for my $i (0 .. @$invlist_ref - 1 - $upper_limit_subtract) {
my $range_start = $invlist_ref->[$i];
# Because we are sorting into buckets, things could be
@@ -1958,7 +2116,7 @@ foreach my $prop (keys %props) {
if (($i > 0 && $range_start <= $invlist_ref->[$i-1])
|| $range_start >= $invlist_ref->[$i+1])
{
- fail("prop_invmap('$mod_prop')");
+ fail("prop_invmap('$display_prop')");
diag(sprintf "Range beginning at %04X is out-of-order.", $invlist_ref->[$i]);
next PROPERTY;
}
@@ -1993,24 +2151,24 @@ foreach my $prop (keys %props) {
# through each and verify that matches what prop_invlist() returns.
# We could use is_deeply() for the comparison, but would get multiple
# messages for each $prop.
- foreach my $map (keys %maps) {
+ foreach my $map (sort keys %maps) {
my @off_invlist = prop_invlist("$prop = $map");
my $min = (@off_invlist >= @{$maps{$map}})
? @off_invlist
: @{$maps{$map}};
for my $i (0 .. $min- 1) {
if ($i > @off_invlist - 1) {
- fail("prop_invmap('$mod_prop')");
+ fail("prop_invmap('$display_prop')");
diag("There is no element [$i] for $prop=$map from prop_invlist(), while [$i] in the implicit one constructed from prop_invmap() is '$maps{$map}[$i]'");
next PROPERTY;
}
elsif ($i > @{$maps{$map}} - 1) {
- fail("prop_invmap('$mod_prop')");
+ fail("prop_invmap('$display_prop')");
diag("There is no element [$i] from the implicit $prop=$map constructed from prop_invmap(), while [$i] in the one from prop_invlist() is '$off_invlist[$i]'");
next PROPERTY;
}
elsif ($maps{$map}[$i] ne $off_invlist[$i]) {
- fail("prop_invmap('$mod_prop')");
+ fail("prop_invmap('$display_prop')");
diag("Element [$i] of the implicit $prop=$map constructed from prop_invmap() is '$maps{$map}[$i]', and the one from prop_invlist() is '$off_invlist[$i]'");
next PROPERTY;
}
@@ -2019,12 +2177,27 @@ foreach my $prop (keys %props) {
}
else { # Don't know this property nor format.
- fail("prop_invmap('$mod_prop')");
- diag("Unknown format '$format'");
+ fail("prop_invmap('$display_prop')");
+ diag("Unknown property '$display_prop' or format '$format'");
+ next PROPERTY;
}
- pass("prop_invmap('$mod_prop')");
+ pass("prop_invmap('$display_prop')");
}
+# A few tests of search_invlist
+use Unicode::UCD qw(search_invlist);
+
+my ($scripts_ranges_ref, $scripts_map_ref) = prop_invmap("Script");
+my $index = search_invlist($scripts_ranges_ref, 0x390);
+is($scripts_map_ref->[$index], "Greek", "U+0390 is Greek");
+my @alpha_invlist = prop_invlist("Alpha");
+is(search_invlist(\@alpha_invlist, ord("\t")), undef, "search_invlist returns undef for code points before first one on the list");
+
ok($/ eq $input_record_separator, "The record separator didn't get overridden");
+
+if (! ok(@warnings == 0, "No warnings were generated")) {
+ diag(join "\n", "The warnings are:", @warnings);
+}
+
done_testing();