diff options
author | 2017-02-05 00:31:51 +0000 | |
---|---|---|
committer | 2017-02-05 00:31:51 +0000 | |
commit | b8851fcc53cbe24fd20b090f26dd149e353f6174 (patch) | |
tree | 4b7c1695865f00ab7a0da30b5632d514848ea3a2 /gnu/usr.bin/perl/lib/Unicode | |
parent | Add option PCIVERBOSE. (diff) | |
download | wireguard-openbsd-b8851fcc53cbe24fd20b090f26dd149e353f6174.tar.xz wireguard-openbsd-b8851fcc53cbe24fd20b090f26dd149e353f6174.zip |
Fix merge issues, remove excess files - match perl-5.24.1 dist
Diffstat (limited to 'gnu/usr.bin/perl/lib/Unicode')
-rw-r--r-- | gnu/usr.bin/perl/lib/Unicode/UCD.pm | 540 | ||||
-rw-r--r-- | gnu/usr.bin/perl/lib/Unicode/UCD.t | 1385 |
2 files changed, 1408 insertions, 517 deletions
diff --git a/gnu/usr.bin/perl/lib/Unicode/UCD.pm b/gnu/usr.bin/perl/lib/Unicode/UCD.pm index 11a8ec26807..13c2c785981 100644 --- a/gnu/usr.bin/perl/lib/Unicode/UCD.pm +++ b/gnu/usr.bin/perl/lib/Unicode/UCD.pm @@ -5,7 +5,7 @@ use warnings; no warnings 'surrogate'; # surrogates can be inputs to this use charnames (); -our $VERSION = '0.58'; +our $VERSION = '0.64'; require Exporter; @@ -15,6 +15,8 @@ our @EXPORT_OK = qw(charinfo charblock charscript charblocks charscripts charinrange + charprop + charprops_all general_categories bidi_types compexcl casefold all_casefolds casespec @@ -22,6 +24,7 @@ our @EXPORT_OK = qw(charinfo num prop_aliases prop_value_aliases + prop_values prop_invlist prop_invmap search_invlist @@ -41,14 +44,20 @@ Unicode::UCD - Unicode character database use Unicode::UCD 'charinfo'; my $charinfo = charinfo($codepoint); + use Unicode::UCD 'charprop'; + my $value = charprop($codepoint, $property); + + use Unicode::UCD 'charprops_all'; + my $all_values_hash_ref = charprops_all($codepoint); + use Unicode::UCD 'casefold'; - my $casefold = casefold(0xFB00); + my $casefold = casefold($codepoint); use Unicode::UCD 'all_casefolds'; my $all_casefolds_ref = all_casefolds(); use Unicode::UCD 'casespec'; - my $casespec = casespec(0xFB00); + my $casespec = casespec($codepoint); use Unicode::UCD 'charblock'; my $charblock = charblock($codepoint); @@ -76,6 +85,9 @@ Unicode::UCD - Unicode character database use Unicode::UCD 'prop_value_aliases'; my @gc_punct_names = prop_value_aliases("Gc", "Punct"); + use Unicode::UCD 'prop_values'; + my @all_EA_short_names = prop_values("East_Asian_Width"); + use Unicode::UCD 'prop_invlist'; my @puncts = prop_invlist("gc=punctuation"); @@ -107,7 +119,8 @@ Character Database. Some of the functions are called with a I<code point argument>, which is either a decimal or a hexadecimal scalar designating a code point in the platform's -native character set (extended to Unicode), or C<U+> followed by hexadecimals +native character set (extended to Unicode), or a string containing C<U+> +followed by hexadecimals designating a Unicode code point. A leading 0 will force a hexadecimal interpretation, as will a hexadecimal digit that isn't a decimal digit. @@ -116,7 +129,7 @@ Examples: 223 # Decimal 223 in native character set 0223 # Hexadecimal 223, native (= 547 decimal) 0xDF # Hexadecimal DF, native (= 223 decimal - U+DF # Hexadecimal DF, in Unicode's character set + 'U+DF' # Hexadecimal DF, in Unicode's character set (= LATIN SMALL LETTER SHARP S) Note that the largest code point in Unicode is U+10FFFF. @@ -193,6 +206,10 @@ C<undef> is returned. Fields that aren't applicable to the particular code point argument exist in the returned hash, and are empty. +For results that are less "raw" than this function returns, or to get the values for +any property, not just the few covered by this function, use the +L</charprop()> function. + The keys in the hash with the meanings of their values are: =over @@ -248,7 +265,8 @@ The codes may be preceded by a word enclosed in angle brackets, then a space, like C<E<lt>compatE<gt> >, giving the type of decomposition This decomposition may be an intermediate one whose components are also -decomposable. Use L<Unicode::Normalize> to get the final decomposition. +decomposable. Use L<Unicode::Normalize> to get the final decomposition in one +step. =item B<decimal> @@ -279,47 +297,55 @@ As of Unicode 6.0, this is always empty. =item B<upper> -is empty if there is no single code point uppercase mapping for I<code> -(its uppercase mapping is itself); -otherwise it is that mapping expressed as at least four hexdigits. -(L</casespec()> should be used in addition to B<charinfo()> -for case mappings when the calling program can cope with multiple code point -mappings.) +is, if non-empty, the uppercase mapping for I<code> expressed as at least four +hexdigits. This indicates that the full uppercase mapping is a single +character, and is identical to the simple (single-character only) mapping. +When this field is empty, it means that the simple uppercase mapping is +I<code> itself; you'll need some other means, (like L</charprop()> or +L</casespec()> to get the full mapping. =item B<lower> -is empty if there is no single code point lowercase mapping for I<code> -(its lowercase mapping is itself); -otherwise it is that mapping expressed as at least four hexdigits. -(L</casespec()> should be used in addition to B<charinfo()> -for case mappings when the calling program can cope with multiple code point -mappings.) +is, if non-empty, the lowercase mapping for I<code> expressed as at least four +hexdigits. This indicates that the full lowercase mapping is a single +character, and is identical to the simple (single-character only) mapping. +When this field is empty, it means that the simple lowercase mapping is +I<code> itself; you'll need some other means, (like L</charprop()> or +L</casespec()> to get the full mapping. =item B<title> -is empty if there is no single code point titlecase mapping for I<code> -(its titlecase mapping is itself); -otherwise it is that mapping expressed as at least four hexdigits. -(L</casespec()> should be used in addition to B<charinfo()> -for case mappings when the calling program can cope with multiple code point -mappings.) +is, if non-empty, the titlecase mapping for I<code> expressed as at least four +hexdigits. This indicates that the full titlecase mapping is a single +character, and is identical to the simple (single-character only) mapping. +When this field is empty, it means that the simple titlecase mapping is +I<code> itself; you'll need some other means, (like L</charprop()> or +L</casespec()> to get the full mapping. =item B<block> the block I<code> belongs to (used in C<\p{Blk=...}>). -See L</Blocks versus Scripts>. +The L</prop_value_aliases()> function can be used to get all the synonyms +of the block name. +See L</Blocks versus Scripts>. =item B<script> the script I<code> belongs to. +The L</prop_value_aliases()> function can be used to get all the synonyms +of the script name. + See L</Blocks versus Scripts>. =back Note that you cannot do (de)composition and casing based solely on the -I<decomposition>, I<combining>, I<lower>, I<upper>, and I<title> fields; -you will need also the L</compexcl()>, and L</casespec()> functions. +I<decomposition>, I<combining>, I<lower>, I<upper>, and I<title> fields; you +will need also the L</casespec()> function and the C<Composition_Exclusion> +property. (Or you could just use the L<lc()|perlfunc/lc>, +L<uc()|perlfunc/uc>, and L<ucfirst()|perlfunc/ucfirst> functions, and the +L<Unicode::Normalize> module.) =cut @@ -358,6 +384,9 @@ my %SIMPLE_UPPER; my %UNICODE_1_NAMES; my %ISO_COMMENT; +# Eval'd so can run on versions earlier than the property is available in +my $Hangul_Syllables_re = eval 'qr/\p{Block=Hangul_Syllables}/'; + sub charinfo { # This function has traditionally mimicked what is in UnicodeData.txt, @@ -385,8 +414,9 @@ sub charinfo { @CATEGORIES =_read_table("To/Gc.pl") unless @CATEGORIES; $prop{'category'} = _search(\@CATEGORIES, 0, $#CATEGORIES, $code) // $utf8::SwashInfo{'ToGc'}{'missing'}; - - return if $prop{'category'} eq 'Cn'; # Unassigned code points are undef + # Return undef if category value is 'Unassigned' or one of its synonyms + return if grep { lc $_ eq 'unassigned' } + prop_value_aliases('Gc', $prop{'category'}); $prop{'code'} = sprintf "%04X", $code; $prop{'name'} = ($char =~ /\p{Cntrl}/) ? '<control>' @@ -411,7 +441,7 @@ sub charinfo { # "Canonical" imply a compatible decomposition, and the type is prefixed # to that, as it is in UnicodeData.txt UnicodeVersion() unless defined $v_unicode_version; - if ($v_unicode_version ge v2.0.0 && $char =~ /\p{Block=Hangul_Syllables}/) { + if ($v_unicode_version ge v2.0.0 && $char =~ $Hangul_Syllables_re) { # The code points of the decomposition are output in standard Unicode # hex format, separated by blanks. $prop{'decomposition'} = join " ", map { sprintf("%04X", $_)} @@ -588,6 +618,209 @@ sub charinrange { _search($range, 0, $#$range, $code); } +=head2 B<charprop()> + + use Unicode::UCD 'charprop'; + + print charprop(0x41, "Gc"), "\n"; + print charprop(0x61, "General_Category"), "\n"; + + prints + Lu + Ll + +This returns the value of the Unicode property given by the second parameter +for the L</code point argument> given by the first. + +The passed-in property may be specified as any of the synonyms returned by +L</prop_aliases()>. + +The return value is always a scalar, either a string or a number. For +properties where there are synonyms for the values, the synonym returned by +this function is the longest, most descriptive form, the one returned by +L</prop_value_aliases()> when called in a scalar context. Of course, you can +call L</prop_value_aliases()> on the result to get other synonyms. + +The return values are more "cooked" than the L</charinfo()> ones. For +example, the C<"uc"> property value is the actual string containing the full +uppercase mapping of the input code point. You have to go to extra trouble +with C<charinfo> to get this value from its C<upper> hash element when the +full mapping differs from the simple one. + +Special note should be made of the return values for a few properties: + +=over + +=item Block + +The value returned is the new-style (see L</Old-style versus new-style block +names>). + +=item Decomposition_Mapping + +Like L</charinfo()>, the result may be an intermediate decomposition whose +components are also decomposable. Use L<Unicode::Normalize> to get the final +decomposition in one step. + +Unlike L</charinfo()>, this does not include the decomposition type. Use the +C<Decomposition_Type> property to get that. + +=item Name_Alias + +If the input code point's name has more than one synonym, they are returned +joined into a single comma-separated string. + +=item Numeric_Value + +If the result is a fraction, it is converted into a floating point number to +the accuracy of your platform. + +=item Script_Extensions + +If the result is multiple script names, they are returned joined into a single +comma-separated string. + +=back + +When called with a property that is a Perl extension that isn't expressible in +a compound form, this function currently returns C<undef>, as the only two +possible values are I<true> or I<false> (1 or 0 I suppose). This behavior may +change in the future, so don't write code that relies on it. C<Present_In> is +a Perl extension that is expressible in a bipartite or compound form (for +example, C<\p{Present_In=4.0}>), so C<charprop> accepts it. But C<Any> is a +Perl extension that isn't expressible that way, so C<charprop> returns +C<undef> for it. Also C<charprop> returns C<undef> for all Perl extensions +that are internal-only. + +=cut + +sub charprop ($$) { + my ($input_cp, $prop) = @_; + + my $cp = _getcode($input_cp); + croak __PACKAGE__, "::charprop: unknown code point '$input_cp'" unless defined $cp; + + my ($list_ref, $map_ref, $format, $default) + = prop_invmap($prop); + return undef unless defined $list_ref; + + my $i = search_invlist($list_ref, $cp); + croak __PACKAGE__, "::charprop: prop_invmap return is invalid for charprop('$input_cp', '$prop)" unless defined $i; + + # $i is the index into both the inversion list and map of $cp. + my $map = $map_ref->[$i]; + + # Convert enumeration values to their most complete form. + if (! ref $map) { + my $long_form = prop_value_aliases($prop, $map); + $map = $long_form if defined $long_form; + } + + if ($format =~ / ^ s /x) { # Scalars + return join ",", @$map if ref $map; # Convert to scalar with comma + # separated array elements + + # Resolve ambiguity as to whether an all digit value is a code point + # that should be converted to a character, or whether it is really + # just a number. To do this, look at the default. If it is a + # non-empty number, we can safely assume the result is also a number. + if ($map =~ / ^ \d+ $ /ax && $default !~ / ^ \d+ $ /ax) { + $map = chr $map; + } + elsif ($map =~ / ^ (?: Y | N ) $ /x) { + + # prop_invmap() returns these values for properties that are Perl + # extensions. But this is misleading. For now, return undef for + # these, as currently documented. + undef $map unless + exists $Unicode::UCD::prop_aliases{utf8::_loose_name(lc $prop)}; + } + return $map; + } + elsif ($format eq 'ar') { # numbers, including rationals + my $offset = $cp - $list_ref->[$i]; + return $map if $map =~ /nan/i; + return $map + $offset if $offset != 0; # If needs adjustment + return eval $map; # Convert e.g., 1/2 to 0.5 + } + elsif ($format =~ /^a/) { # Some entries need adjusting + + # Linearize sequences into a string. + return join "", map { chr $_ } @$map if ref $map; # XXX && $format =~ /^ a [dl] /x; + + return "" if $map eq "" && $format =~ /^a.*e/; + + # These are all character mappings. Return the chr if no adjustment + # is needed + return chr $cp if $map eq "0"; + + # Convert special entry. + if ($map eq '<hangul syllable>' && $format eq 'ad') { + use Unicode::Normalize qw(NFD); + return NFD(chr $cp); + } + + # The rest need adjustment from the first entry in the inversion list + # corresponding to this map. + my $offset = $cp - $list_ref->[$i]; + return chr($map + $cp - $list_ref->[$i]); + } + elsif ($format eq 'n') { # The name property + + # There are two special cases, handled here. + if ($map =~ / ( .+ ) <code\ point> $ /x) { + $map = sprintf("$1%04X", $cp); + } + elsif ($map eq '<hangul syllable>') { + $map = charnames::viacode($cp); + } + return $map; + } + else { + croak __PACKAGE__, "::charprop: Internal error: unknown format '$format'. Please perlbug this"; + } +} + +=head2 B<charprops_all()> + + use Unicode::UCD 'charprops_all'; + + my $%properties_of_A_hash_ref = charprops_all("U+41"); + +This returns a reference to a hash whose keys are all the distinct Unicode (no +Perl extension) properties, and whose values are the respective values for +those properties for the input L</code point argument>. + +Each key is the property name in its longest, most descriptive form. The +values are what L</charprop()> would return. + +This function is expensive in time and memory. + +=cut + +sub charprops_all($) { + my $input_cp = shift; + + my $cp = _getcode($input_cp); + croak __PACKAGE__, "::charprops_all: unknown code point '$input_cp'" unless defined $cp; + + my %return; + + require "unicore/UCD.pl"; + + foreach my $prop (keys %Unicode::UCD::prop_aliases) { + + # Don't return a Perl extension. (This is the only one that + # %prop_aliases has in it.) + next if $prop eq 'perldecimaldigit'; + + # Use long name for $prop in the hash + $return{scalar prop_aliases($prop)} = charprop($cp, $prop); + } + + return \%return; +} + =head2 B<charblock()> use Unicode::UCD 'charblock'; @@ -602,6 +835,9 @@ sub charinrange { With a L</code point argument> C<charblock()> returns the I<block> the code point belongs to, e.g. C<Basic Latin>. The old-style block name is returned (see L</Old-style versus new-style block names>). +The L</prop_value_aliases()> function can be used to get all the synonyms +of the block name. + If the code point is unassigned, this returns the block it would belong to if it were assigned. (If the Unicode version being used is so early as to not have blocks, all code points are considered to be in C<No_Block>.) @@ -611,7 +847,7 @@ See also L</Blocks versus Scripts>. If supplied with an argument that can't be a code point, C<charblock()> tries to do the opposite and interpret the argument as an old-style block name. On an ASCII platform, the return value is a I<range set> with one range: an -anonymous list with a single element that consists of another anonymous list +anonymous array with a single element that consists of another anonymous array whose first element is the first code point in the block, and whose second element is the final code point in the block. On an EBCDIC platform, the first two Unicode blocks are not contiguous. Their range sets @@ -643,6 +879,10 @@ sub _charblocks { local $_; local $/ = "\n"; while (<$BLOCKSFH>) { + + # Old versions used a different syntax to mark the range. + $_ =~ s/;\s+/../ if $v_unicode_version lt v3.1.0; + if (/^([0-9A-F]+)\.\.([0-9A-F]+);\s+(.+)/) { my ($lo, $hi) = (hex($1), hex($2)); my $subrange = [ $lo, $hi, $3 ]; @@ -700,6 +940,9 @@ sub charblock { elsif (exists $BLOCKS{$arg}) { return _dclone $BLOCKS{$arg}; } + + carp __PACKAGE__, "::charblock: unknown code '$arg'"; + return; } =head2 B<charscript()> @@ -716,10 +959,12 @@ With a L</code point argument>, C<charscript()> returns the I<script> the code point belongs to, e.g., C<Latin>, C<Greek>, C<Han>. If the code point is unassigned or the Unicode version being used is so early that it doesn't have scripts, this function returns C<"Unknown">. +The L</prop_value_aliases()> function can be used to get all the synonyms +of the script name. If supplied with an argument that can't be a code point, charscript() tries to do the opposite and interpret the argument as a script name. The -return value is a I<range set>: an anonymous list of lists that contain +return value is a I<range set>: an anonymous array of arrays that contain I<start-of-range>, I<end-of-range> code point pairs. You can test whether a code point is in a range set using the L</charinrange()> function. (To be precise, each I<range set> contains a third array element, @@ -765,6 +1010,7 @@ sub charscript { return _dclone $SCRIPTS{$arg}; } + carp __PACKAGE__, "::charscript: unknown code '$arg'"; return; } @@ -783,6 +1029,9 @@ names>). L<prop_invmap("block")|/prop_invmap()> can be used to get this same data in a different type of data structure. +L<prop_values("Block")|/prop_values()> can be used to get all +the known new-style block names as a list, without the code point ranges. + See also L</Blocks versus Scripts>. =cut @@ -805,6 +1054,9 @@ the values. L<prop_invmap("script")|/prop_invmap()> can be used to get this same data in a different type of data structure. +L<C<prop_values("Script")>|/prop_values()> can be used to get all +the known script names as a list, without the code point ranges. + See also L</Blocks versus Scripts>. =cut @@ -888,8 +1140,9 @@ from the long names to the short names. The general category is the one returned from L</charinfo()> under the C<category> key. -The L</prop_value_aliases()> function can be used to get all the synonyms of -the category name. +The L</prop_values()> and L</prop_value_aliases()> functions can be used as an +alternative to this function; the first returning a simple list of the short +category names; and the second gets all the synonyms of a given category name. =cut @@ -933,8 +1186,10 @@ the Unicode TR9 is recommended reading: L<http://www.unicode.org/reports/tr9/> (as of Unicode 5.0.0) -The L</prop_value_aliases()> function can be used to get all the synonyms of -the bidi type name. +The L</prop_values()> and L</prop_value_aliases()> functions can be used as an +alternative to this function; the first returning a simple list of the short +bidi type names; and the second gets all the synonyms of a given bidi type +name. =cut @@ -981,6 +1236,9 @@ The routine returns B<false> otherwise. =cut +# Eval'd so can run on versions earlier than the property is available in +my $Composition_Exclusion_re = eval 'qr/\p{Composition_Exclusion}/'; + sub compexcl { my $arg = shift; my $code = _getcode($arg); @@ -991,7 +1249,7 @@ sub compexcl { return if $v_unicode_version lt v3.0.0; no warnings "non_unicode"; # So works on non-Unicode code points - return chr($code) =~ /\p{Composition_Exclusion}/; + return chr($code) =~ $Composition_Exclusion_re } =head2 B<casefold()> @@ -1945,6 +2203,79 @@ sub prop_aliases ($) { =pod +=head2 B<prop_values()> + + use Unicode::UCD 'prop_values'; + + print "AHex values are: ", join(", ", prop_values("AHex")), + "\n"; + prints: + AHex values are: N, Y + +Some Unicode properties have a restricted set of legal values. For example, +all binary properties are restricted to just C<true> or C<false>; and there +are only a few dozen possible General Categories. Use C<prop_values> +to find out if a given property is one such, and if so, to get a list of the +values: + + print join ", ", prop_values("NFC_Quick_Check"); + prints: + M, N, Y + +If the property doesn't have such a restricted set, C<undef> is returned. + +There are usually several synonyms for each possible value. Use +L</prop_value_aliases()> to access those. + +Case, white space, hyphens, and underscores are ignored in the input property +name (except for the trailing underscore in the old-form grandfathered-in +general category property value C<"L_">, which is better written as C<"LC">). + +If the property name is unknown, C<undef> is returned. Note that Perl typically +recognizes property names in regular expressions with an optional C<"Is_>" +(with or without the underscore) prefixed to them, such as C<\p{isgc=punct}>. +This function does not recognize those in the property parameter, returning +C<undef>. + +For the block property, new-style block names are returned (see +L</Old-style versus new-style block names>). + +C<prop_values> does not know about any user-defined properties, and +will return C<undef> if called with one of those. + +=cut + +# These are created by mktables for this module and stored in unicore/UCD.pl +# where their structures are described. +our %loose_to_standard_value; +our %prop_value_aliases; + +sub prop_values ($) { + my $prop = shift; + return undef unless defined $prop; + + require "unicore/UCD.pl"; + require "utf8_heavy.pl"; + + # Find the property name synonym that's used as the key in other hashes, + # which is element 0 in the returned list. + ($prop) = prop_aliases($prop); + return undef if ! $prop; + $prop = utf8::_loose_name(lc $prop); + + # Here is a legal property. + return undef unless exists $prop_value_aliases{$prop}; + my @return; + foreach my $value_key (sort { lc $a cmp lc $b } + keys %{$prop_value_aliases{$prop}}) + { + push @return, $prop_value_aliases{$prop}{$value_key}[0]; + } + return @return; +} + +=pod + =head2 B<prop_value_aliases()> use Unicode::UCD 'prop_value_aliases'; @@ -1958,7 +2289,7 @@ sub prop_aliases ($) { print "The short name is $short_name\n"; print "The other aliases are: ", join(", ", @other_names), "\n"; - prints: + prints: The full name is Punctuation The short name is P The other aliases are: Punct @@ -1967,18 +2298,20 @@ Some Unicode properties have a restricted set of legal values. For example, all binary properties are restricted to just C<true> or C<false>; and there are only a few dozen possible General Categories. -For such properties, there are usually several synonyms for each possible -value. For example, in binary properties, I<truth> can be represented by any of -the strings "Y", "Yes", "T", or "True"; and the General Category -"Punctuation" by that string, or "Punct", or simply "P". +You can use L</prop_values()> to find out if a given property is one which has +a restricted set of values, and if so, what those values are. But usually +each value actually has several synonyms. For example, in Unicode binary +properties, I<truth> can be represented by any of the strings "Y", "Yes", "T", +or "True"; and the General Category "Punctuation" by that string, or "Punct", +or simply "P". Like property names, there is typically at least a short name for each such -property-value, and a long name. If you know any name of the property-value, -you can use C<prop_value_aliases>() to get the long name (when called in -scalar context), or a list of all the names, with the short name in the 0th -element, the long name in the next element, and any other synonyms in the -remaining elements, in no particular order, except that any all-numeric -synonyms will be last. +property-value, and a long name. If you know any name of the property-value +(which you can get by L</prop_values()>, you can use C<prop_value_aliases>() +to get the long name (when called in scalar context), or a list of all the +names, with the short name in the 0th element, the long name in the next +element, and any other synonyms in the remaining elements, in no particular +order, except that any all-numeric synonyms will be last. The long name is returned in a form nicely capitalized, suitable for printing. @@ -1994,7 +2327,7 @@ C<undef>. If called with a property that doesn't have synonyms for its values, it returns the input value, possibly normalized with capitalization and -underscores. +underscores, but not necessarily checking that the input value is valid. For the block property, new-style block names are returned (see L</Old-style versus new-style block names>). @@ -2007,11 +2340,6 @@ will return C<undef> if called with one of those. =cut -# These are created by mktables for this routine and stored in unicore/UCD.pl -# where their structures are described. -our %loose_to_standard_value; -our %prop_value_aliases; - sub prop_value_aliases ($$) { my ($prop, $value) = @_; return unless defined $prop && defined $value; @@ -2031,7 +2359,18 @@ sub prop_value_aliases ($$) { # anything, like most (if not all) string properties. These don't have # synonyms anyway. Simply return the input. For example, there is no # synonym for ('Uppercase_Mapping', A'). - return $value if ! exists $prop_value_aliases{$prop}; + if (! exists $prop_value_aliases{$prop}) { + + # Here, we have a legal property, but an unknown value. Since the + # property is legal, if it isn't in the prop_aliases hash, it must be + # a Perl-extension All perl extensions are binary, hence are + # enumerateds, which means that we know that the input unknown value + # is illegal. + return if ! exists $Unicode::UCD::prop_aliases{$prop}; + + # Otherwise, we assume it's valid, as documented. + return $value; + } # The value name may be loosely or strictly matched; we don't know yet. # But both types use lower-case. @@ -2212,7 +2551,8 @@ our $MAX_UNICODE_CODEPOINT; sub prop_invlist ($;$) { my $prop = $_[0]; - # Undocumented way to get at Perl internal properties + # Undocumented way to get at Perl internal properties; it may be changed + # or removed without notice at any time. my $internal_ok = defined $_[1] && $_[1] eq '_perl_core_internal_ok'; return if ! defined $prop; @@ -2325,9 +2665,11 @@ or even better, C<"Gc=LC">). Many Unicode properties have more than one name (or alias). C<prop_invmap> understands all of these, including Perl extensions to them. Ambiguities are -resolved as described above for L</prop_aliases()>. The Perl internal -property "Perl_Decimal_Digit, described below, is also accepted. An empty -list is returned if the property name is unknown. +resolved as described above for L</prop_aliases()> (except if a property has +both a complete mapping, and a binary C<Y>/C<N> mapping, then specifying the +property name prefixed by C<"is"> causes the binary one to be returned). The +Perl internal property "Perl_Decimal_Digit, described below, is also accepted. +An empty list is returned if the property name is unknown. See L<perluniprops/Properties accessible through Unicode::UCD> for the properties acceptable as inputs to this function. @@ -2791,6 +3133,14 @@ Use L</casefold()> for these. C<prop_invmap> does not know about any user-defined properties, and will return C<undef> if called with one of those. +The returned values for the Perl extension properties, such as C<Any> and +C<Greek> are somewhat misleading. The values are either C<"Y"> or C<"N>". +All Unicode properties are bipartite, so you can actually use the C<"Y"> or +C<"N>" in a Perl regular rexpression for these, like C<qr/\p{ID_Start=Y/}> or +C<qr/\p{Upper=N/}>. But the Perl extensions aren't specified this way, only +like C</qr/\p{Any}>, I<etc>. You can't actually use the C<"Y"> and C<"N>" in +them. + =cut # User-defined properties could be handled with some changes to utf8_heavy.pl; @@ -2808,15 +3158,21 @@ our @algorithmic_named_code_points; our $HANGUL_BEGIN; our $HANGUL_COUNT; -sub prop_invmap ($) { +sub prop_invmap ($;$) { croak __PACKAGE__, "::prop_invmap: must be called in list context" unless wantarray; my $prop = $_[0]; return unless defined $prop; + # Undocumented way to get at Perl internal properties; it may be changed + # or removed without notice at any time. It currently also changes the + # output to use the format specified in the file rather than the one we + # normally compute and return + my $internal_ok = defined $_[1] && $_[1] eq '_perl_core_internal_ok'; + # Fail internal properties - return if $prop =~ /^_/; + return if $prop =~ /^_/ && ! $internal_ok; # The values returned by this function. my (@invlist, @invmap, $format, $missing); @@ -2912,8 +3268,8 @@ RETRY: # we need to also read in that table. Create a hash with the keys # being the code points, and the values being a list of the # aliases for the code point key. - my ($aliases_code_points, $aliases_maps, undef, undef) = - &prop_invmap('Name_Alias'); + my ($aliases_code_points, $aliases_maps, undef, undef) + = &prop_invmap("_Perl_Name_Alias", '_perl_core_internal_ok'); my %aliases; for (my $i = 0; $i < @$aliases_code_points; $i++) { my $code_point = $aliases_code_points->[$i]; @@ -3204,7 +3560,19 @@ RETRY: if ($swash->{'LIST'} =~ /^V/) { @invlist = split "\n", $swash->{'LIST'} =~ s/ \s* (?: \# .* )? $ //xmgr; - shift @invlist; + + shift @invlist; # Get rid of 'V'; + + # Could need to be inverted: add or subtract a 0 at the beginning of + # the list. + if ($swash->{'INVERT_IT'}) { + if (@invlist && $invlist[0] == 0) { + shift @invlist; + } + else { + unshift @invlist, 0; + } + } foreach my $i (0 .. @invlist - 1) { $invmap[$i] = ($i % 2 == 0) ? 'Y' : 'N' } @@ -3217,6 +3585,10 @@ RETRY: } } else { + if ($swash->{'INVERT_IT'}) { + croak __PACKAGE__, ":prop_invmap: Don't know how to deal with inverted"; + } + # The LIST input lines look like: # ... # 0374\t\tCommon @@ -3425,8 +3797,15 @@ RETRY: # If the overrides came from SPECIALS, the code point keys are # packed UTF-8. if ($overrides == $swash->{'SPECIALS'}) { - $cp = unpack("C0U", $cp_maybe_utf8); - @map = unpack "U0U*", $swash->{'SPECIALS'}{$cp_maybe_utf8}; + $cp = $cp_maybe_utf8; + if (! utf8::decode($cp)) { + croak __PACKAGE__, "::prop_invmap: Malformed UTF-8: ", + map { sprintf("\\x{%02X}", unpack("C", $_)) } + split "", $cp; + } + + $cp = unpack("W", $cp); + @map = unpack "W*", $swash->{'SPECIALS'}{$cp_maybe_utf8}; # The empty string will show up unpacked as an empty # array. @@ -3525,7 +3904,7 @@ RETRY: map { $_ = [ split " ", $_ ] if $_ =~ / / } @invmap; $format = 'sl'; } - elsif ($returned_prop eq 'ToNameAlias') { + elsif ($returned_prop =~ / To ( _Perl )? NameAlias/x) { # This property currently doesn't have any lists, but theoretically # could @@ -3540,7 +3919,14 @@ RETRY: # to indicate that need to add code point to it. $format = 'ar'; } - elsif ($format ne 'n' && $format ne 'a') { + elsif ($format eq 'ax') { + + # Normally 'ax' properties have overrides, and will have been handled + # above, but if not, they still need adjustment, and the hex values + # have already been converted to decimal + $format = 'a'; + } + elsif ($format ne 'n' && $format !~ / ^ a /x) { # All others are simple scalars $format = 's'; @@ -3672,7 +4058,7 @@ sub UnicodeVersion { =head2 B<Blocks versus Scripts> The difference between a block and a script is that scripts are closer -to the linguistic notion of a set of code points required to present +to the linguistic notion of a set of code points required to represent languages, while block is more of an artifact of the Unicode code point numbering and separation into blocks of consecutive code points (so far the size of a block is some multiple of 16, like 128 or 256). @@ -3682,7 +4068,7 @@ as C<Basic Latin>, C<Latin 1 Supplement>, C<Latin Extended-A>, and C<Latin Extended-B>. On the other hand, the Latin script does not contain all the characters of the C<Basic Latin> block (also known as ASCII): it includes only the letters, and not, for example, the digits -or the punctuation. +nor the punctuation. For blocks see L<http://www.unicode.org/Public/UNIDATA/Blocks.txt> @@ -3711,8 +4097,9 @@ The newer style replaces these with underscores, like this: This newer style is consistent with the values of other Unicode properties. To preserve backward compatibility, all the functions in Unicode::UCD that -return block names (except one) return the old-style ones. That one function, -L</prop_value_aliases()> can be used to convert from old-style to new-style: +return block names (except as noted) return the old-style ones. +L</prop_value_aliases()> returns the new-style and can be used to convert from +old-style to new-style: my $new_style = prop_values_aliases("block", $old_style); @@ -3730,6 +4117,15 @@ for its block using C<charblock>). Note that starting in Unicode 6.1, many of the block names have shorter synonyms. These are always given in the new style. +=head2 Use with older Unicode versions + +The functions in this module work as well as can be expected when +used on earlier Unicode versions. But, obviously, they use the available data +from that Unicode version. For example, if the Unicode version predates the +definition of the script property (Unicode 3.1), then any function that deals +with scripts is going to return C<undef> for the script portion of the return +value. + =head1 AUTHOR Jarkko Hietaniemi. Now maintained by perl5 porters. diff --git a/gnu/usr.bin/perl/lib/Unicode/UCD.t b/gnu/usr.bin/perl/lib/Unicode/UCD.t index 2abb05acb23..83320d34a01 100644 --- a/gnu/usr.bin/perl/lib/Unicode/UCD.t +++ b/gnu/usr.bin/perl/lib/Unicode/UCD.t @@ -1,9 +1,7 @@ #!perl -w BEGIN { - if (ord("A") != 65) { - print "1..0 # Skip: EBCDIC\n"; - exit 0; - } + $::IS_ASCII = (ord("A") == 65) ? 1 : 0; + $::IS_EBCDIC = (ord("A") == 193) ? 1 : 0; chdir 't' if -d 't'; @INC = '../lib'; require Config; import Config; @@ -17,277 +15,537 @@ my @warnings; local $SIG{__WARN__} = sub { push @warnings, @_ }; use strict; -use Unicode::UCD; use Test::More; -use Unicode::UCD 'charinfo'; +use Unicode::UCD qw(charinfo charprop charprops_all); +my $expected_version = '8.0.0'; +my $current_version = Unicode::UCD::UnicodeVersion; +my $v_unicode_version = pack "C*", split /\./, $current_version; +my $unknown_script = ($v_unicode_version lt v5.0.0) + ? 'Common' + : 'Unknown'; my $input_record_separator = 7; # Make sure Unicode::UCD isn't affected by $/ = $input_record_separator; # setting this. my $charinfo; is(charinfo(0x110000), undef, "Verify charinfo() of non-unicode is undef"); +if ($v_unicode_version ge v3.2.0) { + is(lc charprop(0x110000, 'age'), lc "Unassigned", "Verify charprop(age) of non-unicode is Unassigned"); + is(charprop(0x110000, 'in'), "Unassigned", "Verify charprop(in), a bipartite Perl extension, works"); +} +is(charprop(0x110000, 'Any'), undef, "Verify charprop of non-bipartite Perl extension returns undef"); -$charinfo = charinfo(0); # Null is often problematic, so test it. - -is($charinfo->{code}, '0000', '<control>'); -is($charinfo->{name}, '<control>'); -is($charinfo->{category}, 'Cc'); -is($charinfo->{combining}, '0'); -is($charinfo->{bidi}, 'BN'); -is($charinfo->{decomposition}, ''); -is($charinfo->{decimal}, ''); -is($charinfo->{digit}, ''); -is($charinfo->{numeric}, ''); -is($charinfo->{mirrored}, 'N'); -is($charinfo->{unicode10}, 'NULL'); -is($charinfo->{comment}, ''); -is($charinfo->{upper}, ''); -is($charinfo->{lower}, ''); -is($charinfo->{title}, ''); -is($charinfo->{block}, 'Basic Latin'); -is($charinfo->{script}, 'Common'); - -$charinfo = charinfo(0x41); - -is($charinfo->{code}, '0041', 'LATIN CAPITAL LETTER A'); -is($charinfo->{name}, 'LATIN CAPITAL LETTER A'); -is($charinfo->{category}, 'Lu'); -is($charinfo->{combining}, '0'); -is($charinfo->{bidi}, 'L'); -is($charinfo->{decomposition}, ''); -is($charinfo->{decimal}, ''); -is($charinfo->{digit}, ''); -is($charinfo->{numeric}, ''); -is($charinfo->{mirrored}, 'N'); -is($charinfo->{unicode10}, ''); -is($charinfo->{comment}, ''); -is($charinfo->{upper}, ''); -is($charinfo->{lower}, '0061'); -is($charinfo->{title}, ''); -is($charinfo->{block}, 'Basic Latin'); -is($charinfo->{script}, 'Latin'); - -$charinfo = charinfo(0x100); - -is($charinfo->{code}, '0100', 'LATIN CAPITAL LETTER A WITH MACRON'); -is($charinfo->{name}, 'LATIN CAPITAL LETTER A WITH MACRON'); -is($charinfo->{category}, 'Lu'); -is($charinfo->{combining}, '0'); -is($charinfo->{bidi}, 'L'); -is($charinfo->{decomposition}, '0041 0304'); -is($charinfo->{decimal}, ''); -is($charinfo->{digit}, ''); -is($charinfo->{numeric}, ''); -is($charinfo->{mirrored}, 'N'); -is($charinfo->{unicode10}, 'LATIN CAPITAL LETTER A MACRON'); -is($charinfo->{comment}, ''); -is($charinfo->{upper}, ''); -is($charinfo->{lower}, '0101'); -is($charinfo->{title}, ''); -is($charinfo->{block}, 'Latin Extended-A'); -is($charinfo->{script}, 'Latin'); +my $cp = 0; +$charinfo = charinfo($cp); # Null is often problematic, so test it. -# 0x0590 is in the Hebrew block but unused. +is($charinfo->{code}, "0000", + "Next tests are for charinfo and charprop; first NULL"); +is($charinfo->{name}, "<control>"); +is(charprop($cp, "name"), ""); -$charinfo = charinfo(0x590); - -is($charinfo->{code}, undef, '0x0590 - unused Hebrew'); -is($charinfo->{name}, undef); -is($charinfo->{category}, undef); -is($charinfo->{combining}, undef); -is($charinfo->{bidi}, undef); -is($charinfo->{decomposition}, undef); -is($charinfo->{decimal}, undef); -is($charinfo->{digit}, undef); -is($charinfo->{numeric}, undef); -is($charinfo->{mirrored}, undef); -is($charinfo->{unicode10}, undef); -is($charinfo->{comment}, undef); -is($charinfo->{upper}, undef); -is($charinfo->{lower}, undef); -is($charinfo->{title}, undef); -is($charinfo->{block}, undef); -is($charinfo->{script}, undef); +if ($v_unicode_version ge v6.1.0) { + # This gets a sl-type property returning a flattened list + is(charprop($cp, "name_alias"), "NULL: control,NUL: abbreviation"); +} +is($charinfo->{category}, "Cc"); +is(charprop($cp, "category"), "Control"); +is($charinfo->{combining}, "0"); +is(charprop($cp, "ccc"), "Not_Reordered"); +is($charinfo->{bidi}, "BN"); +is(charprop($cp, "bc"), "Boundary_Neutral"); +is($charinfo->{decomposition}, ""); +is(charprop($cp, "dm"), "\0"); +is($charinfo->{decimal}, ""); +is($charinfo->{digit}, ""); +is($charinfo->{numeric}, ""); +is(charprop($cp, "nv"), "NaN"); +is($charinfo->{mirrored}, "N"); +is(charprop($cp, "bidim"), "No"); +is($charinfo->{unicode10}, "NULL"); +is(charprop($cp, "na1"), "NULL"); +is($charinfo->{comment}, ""); +is(charprop($cp, "isc"), ""); +is($charinfo->{upper}, ""); +is(charprop($cp, "uc"), "\0"); +is($charinfo->{lower}, ""); +is(charprop($cp, "lc"), "\0"); +is($charinfo->{title}, ""); +is(charprop($cp, "tc"), "\0"); +is($charinfo->{block}, "Basic Latin"); +is(charprop($cp, "block"), "Basic_Latin"); +is($charinfo->{script}, "Common") if $v_unicode_version gt v3.0.1; +is(charprop($cp, "script"), "Common") if $v_unicode_version gt v3.0.1; + +$cp = utf8::unicode_to_native(0x41); +my $A_code = sprintf("%04X", ord("A")); +my $a_code = sprintf("%04X", ord("a")); +$charinfo = charinfo($cp); + +is($charinfo->{code}, $A_code, "LATIN CAPITAL LETTER A"); +is($charinfo->{name}, "LATIN CAPITAL LETTER A"); +is(charprop($cp, 'name'), "LATIN CAPITAL LETTER A"); +is($charinfo->{category}, "Lu"); +is(charprop($cp, 'gc'), "Uppercase_Letter"); +is($charinfo->{combining}, "0"); +is(charprop($cp, 'ccc'), "Not_Reordered"); +is($charinfo->{bidi}, "L"); +is(charprop($cp, 'bc'), "Left_To_Right"); +is($charinfo->{decomposition}, ""); +is(charprop($cp, 'dm'), "A"); +is($charinfo->{decimal}, ""); +is($charinfo->{digit}, ""); +is($charinfo->{numeric}, ""); +is(charprop($cp, 'nv'), "NaN"); +is($charinfo->{mirrored}, "N"); +is(charprop($cp, 'bidim'), "No"); +is($charinfo->{unicode10}, ""); +is(charprop($cp, 'na1'), ""); +is($charinfo->{comment}, ""); +is(charprop($cp, 'isc'), ""); +is($charinfo->{upper}, ""); +is(charprop($cp, 'uc'), "A"); +is($charinfo->{lower}, $a_code); +is(charprop($cp, 'lc'), "a"); +is($charinfo->{title}, ""); +is(charprop($cp, 'tc'), "A"); +is($charinfo->{block}, "Basic Latin"); +is(charprop($cp, 'block'), "Basic_Latin"); +is($charinfo->{script}, "Latin") if $v_unicode_version gt v3.0.1; +is(charprop($cp, 'script'), "Latin") if $v_unicode_version gt v3.0.1; + +$cp = 0x100; +$charinfo = charinfo($cp); + +is($charinfo->{code}, "0100", "LATIN CAPITAL LETTER A WITH MACRON"); +is($charinfo->{name}, "LATIN CAPITAL LETTER A WITH MACRON"); +is(charprop($cp, 'name'), "LATIN CAPITAL LETTER A WITH MACRON"); +is($charinfo->{category}, "Lu"); +is(charprop($cp, 'gc'), "Uppercase_Letter"); +is($charinfo->{combining}, "0"); +is(charprop($cp, 'ccc'), "Not_Reordered"); +is($charinfo->{bidi}, "L"); +is(charprop($cp, 'bc'), "Left_To_Right"); +is($charinfo->{decomposition}, "$A_code 0304"); +is(charprop($cp, 'dm'), "A\x{0304}"); +is($charinfo->{decimal}, ""); +is($charinfo->{digit}, ""); +is($charinfo->{numeric}, ""); +is(charprop($cp, 'nv'), "NaN"); +is($charinfo->{mirrored}, "N"); +is(charprop($cp, 'bidim'), "No"); +is($charinfo->{unicode10}, "LATIN CAPITAL LETTER A MACRON"); +is(charprop($cp, 'na1'), "LATIN CAPITAL LETTER A MACRON"); +is($charinfo->{comment}, ""); +is(charprop($cp, 'isc'), ""); +is($charinfo->{upper}, ""); +is(charprop($cp, 'uc'), "\x{100}"); +is($charinfo->{lower}, "0101"); +is(charprop($cp, 'lc'), "\x{101}"); +is($charinfo->{title}, ""); +is(charprop($cp, 'tc'), "\x{100}"); +is($charinfo->{block}, "Latin Extended-A"); +is(charprop($cp, 'block'), "Latin_Extended_A"); +is($charinfo->{script}, "Latin") if $v_unicode_version gt v3.0.1; +is(charprop($cp, 'script'), "Latin") if $v_unicode_version gt v3.0.1; + +$cp = 0x590; # 0x0590 is in the Hebrew block but unused. +$charinfo = charinfo($cp); + +is($charinfo->{code}, undef, "0x0590 - unused Hebrew"); +is($charinfo->{name}, undef); +is(charprop($cp, 'name'), ""); +is($charinfo->{category}, undef); +is(charprop($cp, 'gc'), "Unassigned"); +is($charinfo->{combining}, undef); +is(charprop($cp, 'ccc'), "Not_Reordered"); +is($charinfo->{bidi}, undef); +if ($v_unicode_version gt v3.2.0) { + is(charprop($cp, 'bc'), "Right_To_Left"); +} +is($charinfo->{decomposition}, undef); +is(charprop($cp, 'dm'), "\x{590}"); +is($charinfo->{decimal}, undef); +is($charinfo->{digit}, undef); +is($charinfo->{numeric}, undef); +is(charprop($cp, 'nv'), "NaN"); +is($charinfo->{mirrored}, undef); +is(charprop($cp, 'bidim'), "No"); +is($charinfo->{unicode10}, undef); +is(charprop($cp, 'na1'), ""); +is($charinfo->{comment}, undef); +is(charprop($cp, 'isc'), ""); +is($charinfo->{upper}, undef); +is(charprop($cp, 'uc'), "\x{590}"); +is($charinfo->{lower}, undef); +is(charprop($cp, 'lc'), "\x{590}"); +is($charinfo->{title}, undef); +is(charprop($cp, 'tc'), "\x{590}"); +is($charinfo->{block}, undef); +is(charprop($cp, 'block'), "Hebrew"); +is($charinfo->{script}, undef); +is(charprop($cp, 'script'), $unknown_script) if $v_unicode_version gt +v3.0.1; # 0x05d0 is in the Hebrew block and used. -$charinfo = charinfo(0x5d0); - -is($charinfo->{code}, '05D0', '05D0 - used Hebrew'); -is($charinfo->{name}, 'HEBREW LETTER ALEF'); -is($charinfo->{category}, 'Lo'); -is($charinfo->{combining}, '0'); -is($charinfo->{bidi}, 'R'); -is($charinfo->{decomposition}, ''); -is($charinfo->{decimal}, ''); -is($charinfo->{digit}, ''); -is($charinfo->{numeric}, ''); -is($charinfo->{mirrored}, 'N'); -is($charinfo->{unicode10}, ''); -is($charinfo->{comment}, ''); -is($charinfo->{upper}, ''); -is($charinfo->{lower}, ''); -is($charinfo->{title}, ''); -is($charinfo->{block}, 'Hebrew'); -is($charinfo->{script}, 'Hebrew'); +$cp = 0x5d0; +$charinfo = charinfo($cp); + +is($charinfo->{code}, "05D0", "05D0 - used Hebrew"); +is($charinfo->{name}, "HEBREW LETTER ALEF"); +is(charprop($cp, 'name'), "HEBREW LETTER ALEF"); +is($charinfo->{category}, "Lo"); +is(charprop($cp, 'gc'), "Other_Letter"); +is($charinfo->{combining}, "0"); +is(charprop($cp, 'ccc'), "Not_Reordered"); +is($charinfo->{bidi}, "R"); +is(charprop($cp, 'bc'), "Right_To_Left"); +is($charinfo->{decomposition}, ""); +is(charprop($cp, 'dm'), "\x{5d0}"); +is($charinfo->{decimal}, ""); +is($charinfo->{digit}, ""); +is($charinfo->{numeric}, ""); +is(charprop($cp, 'nv'), "NaN"); +is($charinfo->{mirrored}, "N"); +is(charprop($cp, 'bidim'), "No"); +is($charinfo->{unicode10}, ""); +is(charprop($cp, 'na1'), ""); +is($charinfo->{comment}, ""); +is(charprop($cp, 'isc'), ""); +is($charinfo->{upper}, ""); +is(charprop($cp, 'uc'), "\x{5d0}"); +is($charinfo->{lower}, ""); +is(charprop($cp, 'lc'), "\x{5d0}"); +is($charinfo->{title}, ""); +is(charprop($cp, 'tc'), "\x{5d0}"); +is($charinfo->{block}, "Hebrew"); +is(charprop($cp, 'block'), "Hebrew"); +is($charinfo->{script}, "Hebrew") if $v_unicode_version gt v3.0.1; +is(charprop($cp, 'script'), "Hebrew") if $v_unicode_version gt v3.0.1; # An open syllable in Hangul. -$charinfo = charinfo(0xAC00); - -is($charinfo->{code}, 'AC00', 'HANGUL SYLLABLE U+AC00'); -is($charinfo->{name}, 'HANGUL SYLLABLE GA'); -is($charinfo->{category}, 'Lo'); -is($charinfo->{combining}, '0'); -is($charinfo->{bidi}, 'L'); -is($charinfo->{decomposition}, '1100 1161'); -is($charinfo->{decimal}, ''); -is($charinfo->{digit}, ''); -is($charinfo->{numeric}, ''); -is($charinfo->{mirrored}, 'N'); -is($charinfo->{unicode10}, ''); -is($charinfo->{comment}, ''); -is($charinfo->{upper}, ''); -is($charinfo->{lower}, ''); -is($charinfo->{title}, ''); -is($charinfo->{block}, 'Hangul Syllables'); -is($charinfo->{script}, 'Hangul'); +$cp = 0xAC00; +$charinfo = charinfo($cp); + +is($charinfo->{code}, "AC00", "HANGUL SYLLABLE U+AC00"); +is($charinfo->{name}, "HANGUL SYLLABLE GA"); +is(charprop($cp, 'name'), "HANGUL SYLLABLE GA"); +is($charinfo->{category}, "Lo"); +is(charprop($cp, 'gc'), "Other_Letter"); +is($charinfo->{combining}, "0"); +is(charprop($cp, 'ccc'), "Not_Reordered"); +is($charinfo->{bidi}, "L"); +is(charprop($cp, 'bc'), "Left_To_Right"); +is($charinfo->{decomposition}, "1100 1161"); +is(charprop($cp, 'dm'), "\x{1100}\x{1161}"); +is($charinfo->{decimal}, ""); +is($charinfo->{digit}, ""); +is($charinfo->{numeric}, ""); +is(charprop($cp, 'nv'), "NaN"); +is($charinfo->{mirrored}, "N"); +is(charprop($cp, 'bidim'), "No"); +is($charinfo->{unicode10}, ""); +is(charprop($cp, 'na1'), ""); +is($charinfo->{comment}, ""); +is(charprop($cp, 'isc'), ""); +is($charinfo->{upper}, ""); +is(charprop($cp, 'uc'), "\x{AC00}"); +is($charinfo->{lower}, ""); +is(charprop($cp, 'lc'), "\x{AC00}"); +is($charinfo->{title}, ""); +is(charprop($cp, 'tc'), "\x{AC00}"); +is($charinfo->{block}, "Hangul Syllables"); +is(charprop($cp, 'block'), "Hangul_Syllables"); +is($charinfo->{script}, "Hangul") if $v_unicode_version gt v3.0.1; +is(charprop($cp, 'script'), "Hangul") if $v_unicode_version gt v3.0.1; # A closed syllable in Hangul. -$charinfo = charinfo(0xAE00); - -is($charinfo->{code}, 'AE00', 'HANGUL SYLLABLE U+AE00'); -is($charinfo->{name}, 'HANGUL SYLLABLE GEUL'); -is($charinfo->{category}, 'Lo'); -is($charinfo->{combining}, '0'); -is($charinfo->{bidi}, 'L'); +$cp = 0xAE00; +$charinfo = charinfo($cp); + +is($charinfo->{code}, "AE00", "HANGUL SYLLABLE U+AE00"); +is($charinfo->{name}, "HANGUL SYLLABLE GEUL"); +is(charprop($cp, 'name'), "HANGUL SYLLABLE GEUL"); +is($charinfo->{category}, "Lo"); +is(charprop($cp, 'gc'), "Other_Letter"); +is($charinfo->{combining}, "0"); +is(charprop($cp, 'ccc'), "Not_Reordered"); +is($charinfo->{bidi}, "L"); +is(charprop($cp, 'bc'), "Left_To_Right"); is($charinfo->{decomposition}, "1100 1173 11AF"); -is($charinfo->{decimal}, ''); -is($charinfo->{digit}, ''); -is($charinfo->{numeric}, ''); -is($charinfo->{mirrored}, 'N'); -is($charinfo->{unicode10}, ''); -is($charinfo->{comment}, ''); -is($charinfo->{upper}, ''); -is($charinfo->{lower}, ''); -is($charinfo->{title}, ''); -is($charinfo->{block}, 'Hangul Syllables'); -is($charinfo->{script}, 'Hangul'); - -$charinfo = charinfo(0x1D400); - -is($charinfo->{code}, '1D400', 'MATHEMATICAL BOLD CAPITAL A'); -is($charinfo->{name}, 'MATHEMATICAL BOLD CAPITAL A'); -is($charinfo->{category}, 'Lu'); -is($charinfo->{combining}, '0'); -is($charinfo->{bidi}, 'L'); -is($charinfo->{decomposition}, '<font> 0041'); -is($charinfo->{decimal}, ''); -is($charinfo->{digit}, ''); -is($charinfo->{numeric}, ''); -is($charinfo->{mirrored}, 'N'); -is($charinfo->{unicode10}, ''); -is($charinfo->{comment}, ''); -is($charinfo->{upper}, ''); -is($charinfo->{lower}, ''); -is($charinfo->{title}, ''); -is($charinfo->{block}, 'Mathematical Alphanumeric Symbols'); -is($charinfo->{script}, 'Common'); - -$charinfo = charinfo(0x9FBA); #Bug 58428 - -is($charinfo->{code}, '9FBA', 'U+9FBA'); -is($charinfo->{name}, 'CJK UNIFIED IDEOGRAPH-9FBA'); -is($charinfo->{category}, 'Lo'); -is($charinfo->{combining}, '0'); -is($charinfo->{bidi}, 'L'); -is($charinfo->{decomposition}, ''); -is($charinfo->{decimal}, ''); -is($charinfo->{digit}, ''); -is($charinfo->{numeric}, ''); -is($charinfo->{mirrored}, 'N'); -is($charinfo->{unicode10}, ''); -is($charinfo->{comment}, ''); -is($charinfo->{upper}, ''); -is($charinfo->{lower}, ''); -is($charinfo->{title}, ''); -is($charinfo->{block}, 'CJK Unified Ideographs'); -is($charinfo->{script}, 'Han'); +is(charprop($cp, 'dm'), "\x{1100}\x{1173}\x{11AF}"); +is($charinfo->{decimal}, ""); +is($charinfo->{digit}, ""); +is($charinfo->{numeric}, ""); +is(charprop($cp, 'nv'), "NaN"); +is($charinfo->{mirrored}, "N"); +is(charprop($cp, 'bidim'), "No"); +is($charinfo->{unicode10}, ""); +is(charprop($cp, 'na1'), ""); +is($charinfo->{comment}, ""); +is(charprop($cp, 'isc'), ""); +is($charinfo->{upper}, ""); +is(charprop($cp, 'uc'), "\x{AE00}"); +is($charinfo->{lower}, ""); +is(charprop($cp, 'lc'), "\x{AE00}"); +is($charinfo->{title}, ""); +is(charprop($cp, 'tc'), "\x{AE00}"); +is($charinfo->{block}, "Hangul Syllables"); +is(charprop($cp, 'block'), "Hangul_Syllables"); +is($charinfo->{script}, "Hangul") if $v_unicode_version gt v3.0.1; +is(charprop($cp, 'script'), "Hangul") if $v_unicode_version gt v3.0.1; + +if ($v_unicode_version gt v3.0.1) { + $cp = 0x1D400; + $charinfo = charinfo($cp); + + is($charinfo->{code}, "1D400", "MATHEMATICAL BOLD CAPITAL A"); + is($charinfo->{name}, "MATHEMATICAL BOLD CAPITAL A"); + is(charprop($cp, 'name'), "MATHEMATICAL BOLD CAPITAL A"); + is($charinfo->{category}, "Lu"); + is(charprop($cp, 'gc'), "Uppercase_Letter"); + is($charinfo->{combining}, "0"); + is(charprop($cp, 'ccc'), "Not_Reordered"); + is($charinfo->{bidi}, "L"); + is(charprop($cp, 'bc'), "Left_To_Right"); + is($charinfo->{decomposition}, "<font> $A_code"); + is(charprop($cp, 'dm'), "A"); + is($charinfo->{decimal}, ""); + is($charinfo->{digit}, ""); + is($charinfo->{numeric}, ""); + is(charprop($cp, 'nv'), "NaN"); + is($charinfo->{mirrored}, "N"); + is(charprop($cp, 'bidim'), "No"); + is($charinfo->{unicode10}, ""); + is(charprop($cp, 'na1'), ""); + is($charinfo->{comment}, ""); + is(charprop($cp, 'isc'), ""); + is($charinfo->{upper}, ""); + is(charprop($cp, 'uc'), "\x{1D400}"); + is($charinfo->{lower}, ""); + is(charprop($cp, 'lc'), "\x{1D400}"); + is($charinfo->{title}, ""); + is(charprop($cp, 'tc'), "\x{1D400}"); + is($charinfo->{block}, "Mathematical Alphanumeric Symbols"); + is(charprop($cp, 'block'), "Mathematical_Alphanumeric_Symbols"); + is($charinfo->{script}, "Common"); + is(charprop($cp, 'script'), "Common"); +} + +if ($v_unicode_version ge v4.1.0) { + $cp = 0x9FBA; #Bug 58428 + $charinfo = charinfo(0x9FBA); + + is($charinfo->{code}, "9FBA", "U+9FBA"); + is($charinfo->{name}, "CJK UNIFIED IDEOGRAPH-9FBA"); + is(charprop($cp, 'name'), "CJK UNIFIED IDEOGRAPH-9FBA"); + is($charinfo->{category}, "Lo"); + is(charprop($cp, 'gc'), "Other_Letter"); + is($charinfo->{combining}, "0"); + is(charprop($cp, 'ccc'), "Not_Reordered"); + is($charinfo->{bidi}, "L"); + is(charprop($cp, 'bc'), "Left_To_Right"); + is($charinfo->{decomposition}, ""); + is(charprop($cp, 'dm'), "\x{9FBA}"); + is($charinfo->{decimal}, ""); + is($charinfo->{digit}, ""); + is($charinfo->{numeric}, ""); + is(charprop($cp, 'nv'), "NaN"); + is($charinfo->{mirrored}, "N"); + is(charprop($cp, 'bidim'), "No"); + is($charinfo->{unicode10}, ""); + is(charprop($cp, 'na1'), ""); + is($charinfo->{comment}, ""); + is(charprop($cp, 'isc'), ""); + is($charinfo->{upper}, ""); + is(charprop($cp, 'uc'), "\x{9FBA}"); + is($charinfo->{lower}, ""); + is(charprop($cp, 'lc'), "\x{9FBA}"); + is($charinfo->{title}, ""); + is(charprop($cp, 'tc'), "\x{9FBA}"); + is($charinfo->{block}, "CJK Unified Ideographs"); + is(charprop($cp, 'block'), "CJK_Unified_Ideographs"); + is($charinfo->{script}, "Han"); + is(charprop($cp, 'script'), "Han"); +} use Unicode::UCD qw(charblock charscript); # 0x0590 is in the Hebrew block but unused. -is(charblock(0x590), 'Hebrew', '0x0590 - Hebrew unused charblock'); -is(charscript(0x590), 'Unknown', '0x0590 - Hebrew unused charscript'); -is(charblock(0x1FFFF), 'No_Block', '0x1FFFF - unused charblock'); - -$charinfo = charinfo(0xbe); - -is($charinfo->{code}, '00BE', 'VULGAR FRACTION THREE QUARTERS'); -is($charinfo->{name}, 'VULGAR FRACTION THREE QUARTERS'); -is($charinfo->{category}, 'No'); -is($charinfo->{combining}, '0'); -is($charinfo->{bidi}, 'ON'); -is($charinfo->{decomposition}, '<fraction> 0033 2044 0034'); -is($charinfo->{decimal}, ''); -is($charinfo->{digit}, ''); -is($charinfo->{numeric}, '3/4'); -is($charinfo->{mirrored}, 'N'); -is($charinfo->{unicode10}, 'FRACTION THREE QUARTERS'); -is($charinfo->{comment}, ''); -is($charinfo->{upper}, ''); -is($charinfo->{lower}, ''); -is($charinfo->{title}, ''); -is($charinfo->{block}, 'Latin-1 Supplement'); -is($charinfo->{script}, 'Common'); +is(charblock(0x590), "Hebrew", "0x0590 - Hebrew unused charblock"); +is(charscript(0x590), $unknown_script, "0x0590 - Hebrew unused charscript") if $v_unicode_version gt v3.0.1; +is(charblock(0x1FFFF), "No_Block", "0x1FFFF - unused charblock"); + +{ + my @warnings; + local $SIG{__WARN__} = sub { push @warnings, @_ }; + is(charblock(chr(0x6237)), undef, + "Verify charblock of non-code point returns <undef>"); + cmp_ok(scalar @warnings, '==', 1, " ... and generates 1 warning"); + like($warnings[0], qr/unknown code/, " ... with the right text"); +} + +my $fraction_3_4_code = sprintf("%04X", utf8::unicode_to_native(0xbe)); +$cp = $fraction_3_4_code; +$charinfo = charinfo($fraction_3_4_code); + +is($charinfo->{code}, $fraction_3_4_code, "VULGAR FRACTION THREE QUARTERS"); +is($charinfo->{name}, "VULGAR FRACTION THREE QUARTERS"); +is(charprop($cp, 'name'), "VULGAR FRACTION THREE QUARTERS"); +is($charinfo->{category}, "No"); +is(charprop($cp, 'gc'), "Other_Number"); +is($charinfo->{combining}, "0"); +is(charprop($cp, 'ccc'), "Not_Reordered"); +is($charinfo->{bidi}, "ON"); +is(charprop($cp, 'bc'), "Other_Neutral"); +is($charinfo->{decomposition}, "<fraction> " + . sprintf("%04X", ord "3") + . " 2044 " + . sprintf("%04X", ord "4")); +is(charprop($cp, 'dm'), "3\x{2044}4"); +is($charinfo->{decimal}, ""); +is($charinfo->{digit}, ""); +is($charinfo->{numeric}, "3/4"); +is(charprop($cp, 'nv'), "0.75"); +is($charinfo->{mirrored}, "N"); +is(charprop($cp, 'bidim'), "No"); +is($charinfo->{unicode10}, "FRACTION THREE QUARTERS"); +is(charprop($cp, 'na1'), "FRACTION THREE QUARTERS"); +is($charinfo->{comment}, ""); +is(charprop($cp, 'isc'), ""); +is($charinfo->{upper}, ""); +is(charprop($cp, 'uc'), chr hex $cp); +is($charinfo->{lower}, ""); +is(charprop($cp, 'lc'), chr hex $cp); +is($charinfo->{title}, ""); +is(charprop($cp, 'tc'), chr hex $cp); +is($charinfo->{block}, "Latin-1 Supplement"); +is(charprop($cp, 'block'), "Latin_1_Supplement"); +is($charinfo->{script}, "Common") if $v_unicode_version gt v3.0.1; +is(charprop($cp, 'script'), "Common") if $v_unicode_version gt v3.0.1; # This is to test a case where both simple and full lowercases exist and # differ -$charinfo = charinfo(0x130); - -is($charinfo->{code}, '0130', 'LATIN CAPITAL LETTER I WITH DOT ABOVE'); -is($charinfo->{name}, 'LATIN CAPITAL LETTER I WITH DOT ABOVE'); -is($charinfo->{category}, 'Lu'); -is($charinfo->{combining}, '0'); -is($charinfo->{bidi}, 'L'); -is($charinfo->{decomposition}, '0049 0307'); -is($charinfo->{decimal}, ''); -is($charinfo->{digit}, ''); -is($charinfo->{numeric}, ''); -is($charinfo->{mirrored}, 'N'); -is($charinfo->{unicode10}, 'LATIN CAPITAL LETTER I DOT'); -is($charinfo->{comment}, ''); -is($charinfo->{upper}, ''); -is($charinfo->{lower}, '0069'); -is($charinfo->{title}, ''); -is($charinfo->{block}, 'Latin Extended-A'); -is($charinfo->{script}, 'Latin'); +$cp = 0x130; +$charinfo = charinfo($cp); +my $I_code = sprintf("%04X", ord("I")); +my $i_code = sprintf("%04X", ord("i")); + +is($charinfo->{code}, "0130", "LATIN CAPITAL LETTER I WITH DOT ABOVE"); +is($charinfo->{name}, "LATIN CAPITAL LETTER I WITH DOT ABOVE"); +is(charprop($cp, 'name'), "LATIN CAPITAL LETTER I WITH DOT ABOVE"); +is($charinfo->{category}, "Lu"); +is(charprop($cp, 'gc'), "Uppercase_Letter"); +is($charinfo->{combining}, "0"); +is(charprop($cp, 'ccc'), "Not_Reordered"); +is($charinfo->{bidi}, "L"); +is(charprop($cp, 'bc'), "Left_To_Right"); +is($charinfo->{decomposition}, "$I_code 0307"); +is(charprop($cp, 'dm'), "I\x{0307}"); +is($charinfo->{decimal}, ""); +is($charinfo->{digit}, ""); +is($charinfo->{numeric}, ""); +is(charprop($cp, 'nv'), "NaN"); +is($charinfo->{mirrored}, "N"); +is(charprop($cp, 'bidim'), "No"); +is($charinfo->{unicode10}, "LATIN CAPITAL LETTER I DOT"); +is(charprop($cp, 'na1'), "LATIN CAPITAL LETTER I DOT"); +is($charinfo->{comment}, ""); +is(charprop($cp, 'isc'), ""); +is($charinfo->{upper}, ""); +is(charprop($cp, 'uc'), "\x{130}"); +is($charinfo->{lower}, $i_code); +is(charprop($cp, 'lc'), "i\x{307}") if $v_unicode_version ge v3.2.0; +is($charinfo->{title}, ""); +is(charprop($cp, 'tc'), "\x{130}"); +is($charinfo->{block}, "Latin Extended-A"); +is(charprop($cp, 'block'), "Latin_Extended_A"); +is($charinfo->{script}, "Latin") if $v_unicode_version gt v3.0.1; +is(charprop($cp, 'script'), "Latin") if $v_unicode_version gt v3.0.1; # This is to test a case where both simple and full uppercases exist and # differ -$charinfo = charinfo(0x1F80); - -is($charinfo->{code}, '1F80', 'GREEK SMALL LETTER ALPHA WITH PSILI AND YPOGEGRAMMENI'); -is($charinfo->{name}, 'GREEK SMALL LETTER ALPHA WITH PSILI AND YPOGEGRAMMENI'); -is($charinfo->{category}, 'Ll'); -is($charinfo->{combining}, '0'); -is($charinfo->{bidi}, 'L'); -is($charinfo->{decomposition}, '1F00 0345'); -is($charinfo->{decimal}, ''); -is($charinfo->{digit}, ''); -is($charinfo->{numeric}, ''); -is($charinfo->{mirrored}, 'N'); -is($charinfo->{unicode10}, ''); -is($charinfo->{comment}, ''); -is($charinfo->{upper}, '1F88'); -is($charinfo->{lower}, ''); -is($charinfo->{title}, '1F88'); -is($charinfo->{block}, 'Greek Extended'); -is($charinfo->{script}, 'Greek'); +$cp = 0x1F80; +$charinfo = charinfo($cp); + +is($charinfo->{code}, "1F80", "GREEK SMALL LETTER ALPHA WITH PSILI AND YPOGEGRAMMENI"); +is($charinfo->{name}, "GREEK SMALL LETTER ALPHA WITH PSILI AND YPOGEGRAMMENI"); +is(charprop($cp, "name"), "GREEK SMALL LETTER ALPHA WITH PSILI AND YPOGEGRAMMENI"); +is($charinfo->{category}, "Ll"); +is(charprop($cp, "gc"), "Lowercase_Letter"); +is($charinfo->{combining}, "0"); +is(charprop($cp, "ccc"), "Not_Reordered"); +is($charinfo->{bidi}, "L"); +is(charprop($cp, "bc"), "Left_To_Right"); +is($charinfo->{decomposition}, "1F00 0345"); +is(charprop($cp, "dm"), "\x{1F00}\x{0345}"); +is($charinfo->{decimal}, ""); +is($charinfo->{digit}, ""); +is($charinfo->{numeric}, ""); +is(charprop($cp, "nv"), "NaN"); +is($charinfo->{mirrored}, "N"); +is(charprop($cp, "bidim"), "No"); +is($charinfo->{unicode10}, ""); +is(charprop($cp, "na1"), ""); +is($charinfo->{comment}, ""); +is(charprop($cp, "isc"), ""); +is($charinfo->{upper}, "1F88"); +is(charprop($cp, "uc"), "\x{1F08}\x{0399}"); +is(charprop($cp, "suc"), "\x{1F88}"); +is($charinfo->{lower}, ""); +is(charprop($cp, "lc"), "\x{1F80}"); +is($charinfo->{title}, "1F88"); +is(charprop($cp, "tc"), "\x{1F88}"); +is($charinfo->{block}, "Greek Extended"); +is(charprop($cp, "block"), "Greek_Extended"); +is($charinfo->{script}, "Greek") if $v_unicode_version gt v3.0.1; +is(charprop($cp, "script"), "Greek") if $v_unicode_version gt v3.0.1; + +is(charprop(ord("A"), "foo"), undef, + "Verify charprop of unknown property returns <undef>"); + +# These were created from inspection of the code to exercise the branches +if ($v_unicode_version ge v6.3.0) { + is(charprop(ord("("), "bpb"), ")", + "Verify charprop figures out that s-type properties can be char"); +} +is(charprop(ord("9"), "nv"), 9, + "Verify charprop can adjust an ar-type property"); +if ($v_unicode_version ge v5.2.0) { + is(charprop(utf8::unicode_to_native(0xAD), "NFKC_Casefold"), "", + "Verify charprop can handle an \"\" in ae-type property"); +} + +my $mark_props_ref = charprops_all(0x300); +is($mark_props_ref->{'Bidi_Class'}, "Nonspacing_Mark", + "Next tests are charprops_all of 0x300"); +is($mark_props_ref->{'Bidi_Mirrored'}, "No"); +is($mark_props_ref->{'Canonical_Combining_Class'}, "Above"); +is($mark_props_ref->{'Case_Folding'}, "\x{300}"); +is($mark_props_ref->{'Decomposition_Mapping'}, "\x{300}"); +is($mark_props_ref->{'Decomposition_Type'}, ($v_unicode_version le v4.0.0) + ? "none" + : "None"); +is($mark_props_ref->{'General_Category'}, "Nonspacing_Mark"); +if ($v_unicode_version gt v5.1.0) { + is($mark_props_ref->{'ISO_Comment'}, ""); +} +is($mark_props_ref->{'Lowercase_Mapping'}, "\x{300}"); +is($mark_props_ref->{'Name'}, "COMBINING GRAVE ACCENT"); +is($mark_props_ref->{'Numeric_Type'}, "None"); +is($mark_props_ref->{'Numeric_Value'}, "NaN"); +is($mark_props_ref->{'Simple_Case_Folding'}, "\x{300}"); +is($mark_props_ref->{'Simple_Lowercase_Mapping'}, "\x{300}"); +is($mark_props_ref->{'Simple_Titlecase_Mapping'}, "\x{300}"); +is($mark_props_ref->{'Simple_Uppercase_Mapping'}, "\x{300}"); +is($mark_props_ref->{'Titlecase_Mapping'}, "\x{300}"); +is($mark_props_ref->{'Unicode_1_Name'}, "NON-SPACING GRAVE"); +is($mark_props_ref->{'Uppercase_Mapping'}, "\x{300}"); use Unicode::UCD qw(charblocks charscripts); @@ -297,36 +555,40 @@ ok(exists $charblocks->{Thai}, 'Thai charblock exists'); is($charblocks->{Thai}->[0]->[0], hex('0e00')); ok(!exists $charblocks->{PigLatin}, 'PigLatin charblock does not exist'); -my $charscripts = charscripts(); +if ($v_unicode_version gt v3.0.1) { + my $charscripts = charscripts(); -ok(exists $charscripts->{Armenian}, 'Armenian charscript exists'); -is($charscripts->{Armenian}->[0]->[0], hex('0531')); -ok(!exists $charscripts->{PigLatin}, 'PigLatin charscript does not exist'); + ok(exists $charscripts->{Armenian}, 'Armenian charscript exists'); + is($charscripts->{Armenian}->[0]->[0], hex('0531')); + ok(!exists $charscripts->{PigLatin}, 'PigLatin charscript does not exist'); -my $charscript; + my $charscript; -$charscript = charscript("12ab"); -is($charscript, 'Ethiopic', 'Ethiopic charscript'); + $charscript = charscript("12ab"); + is($charscript, 'Ethiopic', 'Ethiopic charscript'); -$charscript = charscript("0x12ab"); -is($charscript, 'Ethiopic'); + $charscript = charscript("0x12ab"); + is($charscript, 'Ethiopic'); -$charscript = charscript("U+12ab"); -is($charscript, 'Ethiopic'); + $charscript = charscript("U+12ab"); + is($charscript, 'Ethiopic'); -my $ranges; + my $ranges; -$ranges = charscript('Ogham'); -is($ranges->[0]->[0], hex('1680'), 'Ogham charscript'); -is($ranges->[0]->[1], hex('169C')); + if ($v_unicode_version gt v4.0.0) { + $ranges = charscript('Ogham'); + is($ranges->[0]->[0], hex('1680'), 'Ogham charscript'); + is($ranges->[0]->[1], hex('169C')); + } -use Unicode::UCD qw(charinrange); + use Unicode::UCD qw(charinrange); -$ranges = charscript('Cherokee'); -ok(!charinrange($ranges, "139f"), 'Cherokee charscript'); -ok( charinrange($ranges, "13a0")); -ok( charinrange($ranges, "13f4")); -ok(!charinrange($ranges, "13f5")); + $ranges = charscript('Cherokee'); + ok(!charinrange($ranges, "139f"), 'Cherokee charscript'); + ok( charinrange($ranges, "13a0")); + ok( charinrange($ranges, "13f4")); + ok(!charinrange($ranges, "13ff")); +} use Unicode::UCD qw(general_categories); @@ -346,7 +608,8 @@ is($bt->{AL}, 'Right-to-Left Arabic', 'AL is Right-to-Left Arabic'); # If this fails, then maybe one should look at the Unicode changes to see # what else might need to be updated. -is(Unicode::UCD::UnicodeVersion, '6.3.0', 'UnicodeVersion'); +ok($current_version le $expected_version, + "Verify there isn't a new Unicode version to upgrade to"); use Unicode::UCD qw(compexcl); @@ -359,97 +622,131 @@ use Unicode::UCD qw(casefold); my $casefold; -$casefold = casefold(0x41); - -is($casefold->{code}, '0041', 'casefold 0x41 code'); -is($casefold->{status}, 'C', 'casefold 0x41 status'); -is($casefold->{mapping}, '0061', 'casefold 0x41 mapping'); -is($casefold->{full}, '0061', 'casefold 0x41 full'); -is($casefold->{simple}, '0061', 'casefold 0x41 simple'); -is($casefold->{turkic}, "", 'casefold 0x41 turkic'); - -$casefold = casefold(0xdf); - -is($casefold->{code}, '00DF', 'casefold 0xDF code'); -is($casefold->{status}, 'F', 'casefold 0xDF status'); -is($casefold->{mapping}, '0073 0073', 'casefold 0xDF mapping'); -is($casefold->{full}, '0073 0073', 'casefold 0xDF full'); -is($casefold->{simple}, "", 'casefold 0xDF simple'); -is($casefold->{turkic}, "", 'casefold 0xDF turkic'); - -# Do different tests depending on if version < 3.2, or not. -my $v_unicode_version = pack "C*", split /\./, Unicode::UCD::UnicodeVersion(); -if ($v_unicode_version lt v3.2.0) { - $casefold = casefold(0x130); - - is($casefold->{code}, '0130', 'casefold 0x130 code'); - is($casefold->{status}, 'I' , 'casefold 0x130 status'); - is($casefold->{mapping}, '0069', 'casefold 0x130 mapping'); - is($casefold->{full}, '0069', 'casefold 0x130 full'); - is($casefold->{simple}, "0069", 'casefold 0x130 simple'); - is($casefold->{turkic}, "0069", 'casefold 0x130 turkic'); - - $casefold = casefold(0x131); - - is($casefold->{code}, '0131', 'casefold 0x131 code'); - is($casefold->{status}, 'I' , 'casefold 0x131 status'); - is($casefold->{mapping}, '0069', 'casefold 0x131 mapping'); - is($casefold->{full}, '0069', 'casefold 0x131 full'); - is($casefold->{simple}, "0069", 'casefold 0x131 simple'); - is($casefold->{turkic}, "0069", 'casefold 0x131 turkic'); -} else { - $casefold = casefold(0x49); - - is($casefold->{code}, '0049', 'casefold 0x49 code'); - is($casefold->{status}, 'C' , 'casefold 0x49 status'); - is($casefold->{mapping}, '0069', 'casefold 0x49 mapping'); - is($casefold->{full}, '0069', 'casefold 0x49 full'); - is($casefold->{simple}, "0069", 'casefold 0x49 simple'); - is($casefold->{turkic}, "0131", 'casefold 0x49 turkic'); - - $casefold = casefold(0x130); - - is($casefold->{code}, '0130', 'casefold 0x130 code'); - is($casefold->{status}, 'F' , 'casefold 0x130 status'); - is($casefold->{mapping}, '0069 0307', 'casefold 0x130 mapping'); - is($casefold->{full}, '0069 0307', 'casefold 0x130 full'); - is($casefold->{simple}, "", 'casefold 0x130 simple'); - is($casefold->{turkic}, "0069", 'casefold 0x130 turkic'); -} +$casefold = casefold(utf8::unicode_to_native(0x41)); + +is($casefold->{code}, $A_code, 'casefold native(0x41) code'); +is($casefold->{status}, 'C', 'casefold native(0x41) status'); +is($casefold->{mapping}, $a_code, 'casefold native(0x41) mapping'); +is($casefold->{full}, $a_code, 'casefold native(0x41) full'); +is($casefold->{simple}, $a_code, 'casefold native(0x41) simple'); +is($casefold->{turkic}, "", 'casefold native(0x41) turkic'); + +my $sharp_s_code = sprintf("%04X", utf8::unicode_to_native(0xdf)); +my $S_code = sprintf("%04X", ord "S"); +my $s_code = sprintf("%04X", ord "s"); + +if ($v_unicode_version gt v3.0.0) { # These special ones don't work on early + # perls + $casefold = casefold(utf8::unicode_to_native(0xdf)); + + is($casefold->{code}, $sharp_s_code, 'casefold native(0xDF) code'); + is($casefold->{status}, 'F', 'casefold native(0xDF) status'); + is($casefold->{mapping}, "$s_code $s_code", 'casefold native(0xDF) mapping'); + is($casefold->{full}, "$s_code $s_code", 'casefold native(0xDF) full'); + is($casefold->{simple}, "", 'casefold native(0xDF) simple'); + is($casefold->{turkic}, "", 'casefold native(0xDF) turkic'); + + # Do different tests depending on if version < 3.2, or not. + if ($v_unicode_version eq v3.0.1) { + # In this release, there was no special Turkic values. + # Both 0x130 and 0x131 folded to 'i'. + + $casefold = casefold(0x130); + + is($casefold->{code}, '0130', 'casefold 0x130 code'); + is($casefold->{status}, 'C' , 'casefold 0x130 status'); + is($casefold->{mapping}, $i_code, 'casefold 0x130 mapping'); + is($casefold->{full}, $i_code, 'casefold 0x130 full'); + is($casefold->{simple}, $i_code, 'casefold 0x130 simple'); + is($casefold->{turkic}, "", 'casefold 0x130 turkic'); + + $casefold = casefold(0x131); + + is($casefold->{code}, '0131', 'casefold 0x131 code'); + is($casefold->{status}, 'C' , 'casefold 0x131 status'); + is($casefold->{mapping}, $i_code, 'casefold 0x131 mapping'); + is($casefold->{full}, $i_code, 'casefold 0x131 full'); + is($casefold->{simple}, $i_code, 'casefold 0x131 simple'); + is($casefold->{turkic}, "", 'casefold 0x131 turkic'); + } + elsif ($v_unicode_version lt v3.2.0) { + $casefold = casefold(0x130); + + is($casefold->{code}, '0130', 'casefold 0x130 code'); + is($casefold->{status}, 'I' , 'casefold 0x130 status'); + is($casefold->{mapping}, $i_code, 'casefold 0x130 mapping'); + is($casefold->{full}, $i_code, 'casefold 0x130 full'); + is($casefold->{simple}, $i_code, 'casefold 0x130 simple'); + is($casefold->{turkic}, $i_code, 'casefold 0x130 turkic'); + + $casefold = casefold(0x131); + + is($casefold->{code}, '0131', 'casefold 0x131 code'); + is($casefold->{status}, 'I' , 'casefold 0x131 status'); + is($casefold->{mapping}, $i_code, 'casefold 0x131 mapping'); + is($casefold->{full}, $i_code, 'casefold 0x131 full'); + is($casefold->{simple}, $i_code, 'casefold 0x131 simple'); + is($casefold->{turkic}, $i_code, 'casefold 0x131 turkic'); + } else { + $casefold = casefold(utf8::unicode_to_native(0x49)); + + is($casefold->{code}, $I_code, 'casefold native(0x49) code'); + is($casefold->{status}, 'C' , 'casefold native(0x49) status'); + is($casefold->{mapping}, $i_code, 'casefold native(0x49) mapping'); + is($casefold->{full}, $i_code, 'casefold native(0x49) full'); + is($casefold->{simple}, $i_code, 'casefold native(0x49) simple'); + is($casefold->{turkic}, "0131", 'casefold native(0x49) turkic'); + + $casefold = casefold(0x130); + + is($casefold->{code}, '0130', 'casefold 0x130 code'); + is($casefold->{status}, 'F' , 'casefold 0x130 status'); + is($casefold->{mapping}, "$i_code 0307", 'casefold 0x130 mapping'); + is($casefold->{full}, "$i_code 0307", 'casefold 0x130 full'); + is($casefold->{simple}, "", 'casefold 0x130 simple'); + is($casefold->{turkic}, $i_code, 'casefold 0x130 turkic'); + } -$casefold = casefold(0x1F88); + if ($v_unicode_version gt v3.0.1) { + $casefold = casefold(0x1F88); -is($casefold->{code}, '1F88', 'casefold 0x1F88 code'); -is($casefold->{status}, 'S' , 'casefold 0x1F88 status'); -is($casefold->{mapping}, '1F80', 'casefold 0x1F88 mapping'); -is($casefold->{full}, '1F00 03B9', 'casefold 0x1F88 full'); -is($casefold->{simple}, '1F80', 'casefold 0x1F88 simple'); -is($casefold->{turkic}, "", 'casefold 0x1F88 turkic'); + is($casefold->{code}, '1F88', 'casefold 0x1F88 code'); + is($casefold->{status}, 'S' , 'casefold 0x1F88 status'); + is($casefold->{mapping}, '1F80', 'casefold 0x1F88 mapping'); + is($casefold->{full}, '1F00 03B9', 'casefold 0x1F88 full'); + is($casefold->{simple}, '1F80', 'casefold 0x1F88 simple'); + is($casefold->{turkic}, "", 'casefold 0x1F88 turkic'); + } +} -ok(!casefold(0x20)); +ok(!casefold(utf8::unicode_to_native(0x20))); use Unicode::UCD qw(casespec); my $casespec; -ok(!casespec(0x41)); +ok(!casespec(utf8::unicode_to_native(0x41))); -$casespec = casespec(0xdf); +$casespec = casespec(utf8::unicode_to_native(0xdf)); -ok($casespec->{code} eq '00DF' && - $casespec->{lower} eq '00DF' && - $casespec->{title} eq '0053 0073' && - $casespec->{upper} eq '0053 0053' && - !defined $casespec->{condition}, 'casespec 0xDF'); +ok($casespec->{code} eq $sharp_s_code && + $casespec->{lower} eq $sharp_s_code && + $casespec->{title} eq "$S_code $s_code" && + $casespec->{upper} eq "$S_code $S_code" && + !defined $casespec->{condition}, 'casespec native(0xDF)'); $casespec = casespec(0x307); -ok($casespec->{az}->{code} eq '0307' && - !defined $casespec->{az}->{lower} && - $casespec->{az}->{title} eq '0307' && - $casespec->{az}->{upper} eq '0307' && - $casespec->{az}->{condition} eq 'az After_I', - 'casespec 0x307'); +if ($v_unicode_version gt v3.1.0) { + ok($casespec->{az}->{code} eq '0307' + && !defined $casespec->{az}->{lower} + && $casespec->{az}->{title} eq '0307' + && $casespec->{az}->{upper} eq '0307' + && $casespec->{az}->{condition} eq ($v_unicode_version le v3.2) + ? 'az After_Soft_Dotted' + : 'az After_I', + 'casespec 0x307'); +} # perl #7305 UnicodeCD::compexcl is weird @@ -471,11 +768,25 @@ is(Unicode::UCD::_getcode('x123'), undef, "_getcode(x123)"); is(Unicode::UCD::_getcode('0x123x'), undef, "_getcode(x123)"); is(Unicode::UCD::_getcode('U+123x'), undef, "_getcode(x123)"); +SKIP: { + skip("Script property not in this release", 3) if $v_unicode_version lt v3.1.0; + + { + my @warnings; + local $SIG{__WARN__} = sub { push @warnings, @_ }; + is(charscript(chr(0x6237)), undef, + "Verify charscript of non-code point returns <undef>"); + cmp_ok(scalar @warnings, '==', 1, " ... and generates 1 warning"); + like($warnings[0], qr/unknown code/, " ... with the right text"); + } + my $r1 = charscript('Latin'); if (ok(defined $r1, "Found Latin script")) { + skip("Latin range count will be wrong when using older Unicode release", + 2) if $current_version lt $expected_version; my $n1 = @$r1; - is($n1, 30, "number of ranges in Latin script (Unicode 6.1.0)"); + is($n1, 31, "number of ranges in Latin script (Unicode $expected_version)") if $::IS_ASCII; shift @$r1 while @$r1; my $r2 = charscript('Latin'); is(@$r2, $n1, "modifying results should not mess up internal caches"); @@ -486,36 +797,72 @@ is(Unicode::UCD::_getcode('U+123x'), undef, "_getcode(x123)"); is(charinfo(0xdeadbeef), undef, "[perl #23273] warnings in Unicode::UCD"); } -use Unicode::UCD qw(namedseq); - -is(namedseq("KATAKANA LETTER AINU P"), "\x{31F7}\x{309A}", "namedseq"); -is(namedseq("KATAKANA LETTER AINU Q"), undef); -is(namedseq(), undef); -is(namedseq(qw(foo bar)), undef); -my @ns = namedseq("KATAKANA LETTER AINU P"); -is(scalar @ns, 2); -is($ns[0], 0x31F7); -is($ns[1], 0x309A); -my %ns = namedseq(); -is($ns{"KATAKANA LETTER AINU P"}, "\x{31F7}\x{309A}"); -@ns = namedseq(42); -is(@ns, 0); +if ($v_unicode_version ge v4.1.0) { + use Unicode::UCD qw(namedseq); + + is(namedseq("KATAKANA LETTER AINU P"), "\x{31F7}\x{309A}", "namedseq"); + is(namedseq("KATAKANA LETTER AINU Q"), undef); + is(namedseq(), undef); + is(namedseq(qw(foo bar)), undef); + my @ns = namedseq("KATAKANA LETTER AINU P"); + is(scalar @ns, 2); + is($ns[0], 0x31F7); + is($ns[1], 0x309A); + my %ns = namedseq(); + is($ns{"KATAKANA LETTER AINU P"}, "\x{31F7}\x{309A}"); + @ns = namedseq(42); + is(@ns, 0); +} use Unicode::UCD qw(num); -use charnames ":full"; +use charnames (); # Don't use \N{} on things not in original Unicode + # version; else will get a compilation error when this .t + # is run on an older version. is(num("0"), 0, 'Verify num("0") == 0'); is(num("98765"), 98765, 'Verify num("98765") == 98765'); -ok(! defined num("98765\N{FULLWIDTH DIGIT FOUR}"), 'Verify num("98765\N{FULLWIDTH DIGIT FOUR}") isnt defined'); -is(num("\N{NEW TAI LUE DIGIT TWO}\N{NEW TAI LUE DIGIT ONE}"), 21, 'Verify num("\N{NEW TAI LUE DIGIT TWO}\N{NEW TAI LUE DIGIT ONE}") == 21'); -ok(! defined num("\N{NEW TAI LUE DIGIT TWO}\N{NEW TAI LUE THAM DIGIT ONE}"), 'Verify num("\N{NEW TAI LUE DIGIT TWO}\N{NEW TAI LUE THAM DIGIT ONE}") isnt defined'); -is(num("\N{CHAM DIGIT ZERO}\N{CHAM DIGIT THREE}"), 3, 'Verify num("\N{CHAM DIGIT ZERO}\N{CHAM DIGIT THREE}") == 3'); -ok(! defined num("\N{CHAM DIGIT ZERO}\N{JAVANESE DIGIT NINE}"), 'Verify num("\N{CHAM DIGIT ZERO}\N{JAVANESE DIGIT NINE}") isnt defined'); +ok(! defined num("98765\N{FULLWIDTH DIGIT FOUR}"), + 'Verify num("98765\N{FULLWIDTH DIGIT FOUR}") isnt defined'); +my $tai_lue_2; +if ($v_unicode_version ge v4.1.0) { + my $tai_lue_1 = charnames::string_vianame("NEW TAI LUE DIGIT ONE"); + $tai_lue_2 = charnames::string_vianame("NEW TAI LUE DIGIT TWO"); + is(num($tai_lue_2), 2, 'Verify num("\N{NEW TAI LUE DIGIT TWO}") == 2'); + is(num($tai_lue_1), 1, 'Verify num("\N{NEW TAI LUE DIGIT ONE}") == 1'); + is(num($tai_lue_2 . $tai_lue_1), 21, + 'Verify num("\N{NEW TAI LUE DIGIT TWO}\N{NEW TAI LUE DIGIT ONE}") == 21'); +} +if ($v_unicode_version ge v5.2.0) { + ok(! defined num($tai_lue_2 + . charnames::string_vianame("NEW TAI LUE THAM DIGIT ONE")), + 'Verify num("\N{NEW TAI LUE DIGIT TWO}\N{NEW TAI LUE THAM DIGIT ONE}") isnt defined'); +} +if ($v_unicode_version ge v5.1.0) { + my $cham_0 = charnames::string_vianame("CHAM DIGIT ZERO"); + is(num($cham_0 . charnames::string_vianame("CHAM DIGIT THREE")), 3, + 'Verify num("\N{CHAM DIGIT ZERO}\N{CHAM DIGIT THREE}") == 3'); + if ($v_unicode_version ge v5.2.0) { + ok(! defined num( $cham_0 + . charnames::string_vianame("JAVANESE DIGIT NINE")), + 'Verify num("\N{CHAM DIGIT ZERO}\N{JAVANESE DIGIT NINE}") isnt defined'); + } +} is(num("\N{SUPERSCRIPT TWO}"), 2, 'Verify num("\N{SUPERSCRIPT TWO} == 2'); -is(num("\N{ETHIOPIC NUMBER TEN THOUSAND}"), 10000, 'Verify num("\N{ETHIOPIC NUMBER TEN THOUSAND}") == 10000'); -is(num("\N{NORTH INDIC FRACTION ONE HALF}"), .5, 'Verify num("\N{NORTH INDIC FRACTION ONE HALF}") == .5'); -is(num("\N{U+12448}"), 9, 'Verify num("\N{U+12448}") == 9'); -is(num("\N{U+5146}"), 1000000000000, 'Verify num("\N{U+5146}") == 1000000000000'); +if ($v_unicode_version ge v3.0.0) { + is(num(charnames::string_vianame("ETHIOPIC NUMBER TEN THOUSAND")), 10000, + 'Verify num("\N{ETHIOPIC NUMBER TEN THOUSAND}") == 10000'); +} +if ($v_unicode_version ge v5.2.0) { + is(num(charnames::string_vianame("NORTH INDIC FRACTION ONE HALF")), + .5, + 'Verify num("\N{NORTH INDIC FRACTION ONE HALF}") == .5'); + is(num("\N{U+12448}"), 9, 'Verify num("\N{U+12448}") == 9'); +} +if ($v_unicode_version gt v3.2.0) { # Is missing from non-Unihan files before + # this + is(num("\N{U+5146}"), 1000000000000, + 'Verify num("\N{U+5146}") == 1000000000000'); +} # Create a user-defined property sub InKana {<<'END'} @@ -678,6 +1025,15 @@ foreach my $alias (sort keys %utf8::loose_to_file_of) { } } +# Some of the Perl extensions should always be built; make sure they have the +# correct full name, etc. +for my $prop (qw(Alnum Blank Cntrl Digit Graph Print Word XDigit)) { + my @expected = ( $prop, "XPosix$prop" ); + my @got = prop_aliases($prop); + splice @got, 2; + is_deeply(\@got, \@expected, "Got expected aliases for $prop"); +} + my $done_equals = 0; foreach my $alias (keys %utf8::stricter_to_file_of) { if ($alias =~ /=/) { # Only test one case where there is an equals @@ -698,7 +1054,7 @@ foreach my $alias (keys %utf8::stricter_to_file_of) { } } -use Unicode::UCD qw(prop_value_aliases); +use Unicode::UCD qw(prop_values prop_value_aliases); is(prop_value_aliases("unknown property", "unknown value"), undef, "prop_value_aliases(<unknown property>, <unknown value>) returns <undef>"); @@ -707,6 +1063,8 @@ is(prop_value_aliases(undef, undef), undef, is((prop_value_aliases("na", "A")), "A", "test that prop_value_aliases returns its input for properties that don't have synonyms"); is(prop_value_aliases("isgc", "C"), undef, "prop_value_aliases('isgc', 'C') returns <undef> since is not covered Perl extension"); is(prop_value_aliases("gc", "isC"), undef, "prop_value_aliases('gc', 'isC') returns <undef> since is not covered Perl extension"); +is(prop_value_aliases("Any", "None"), undef, "prop_value_aliases('Any', 'None') returns <undef> since is Perl extension and 'None' is not valid"); +is(prop_value_aliases("lc", "A"), "A", "prop_value_aliases('lc', 'A') returns its input, as docs say it does"); # We have no way of knowing if mktables omitted a Perl extension that it # shouldn't have, but we can check if it omitted an official Unicode property @@ -720,6 +1078,12 @@ skip "PropValueAliases.txt is not in this Unicode version", 1 if $v_unicode_vers open my $propvalues, "<", "../lib/unicore/PropValueAliases.txt" or die "Can't open Unicode PropValueAliases.txt"; local $/ = "\n"; + +# Each examined line in the file is for a single value for a property. We +# accumulate all the values for each property using these two variables. +my $prev_prop = ""; +my @this_prop_values; + while (<$propvalues>) { s/\s*#.*//; # Remove comments next if /^\s* $/x; # Ignore empty and comment lines @@ -731,6 +1095,38 @@ while (<$propvalues>) { my @fields = split /\s*;\s*/; # Fields are separated by semi-colons my $prop = shift @fields; # 0th field is the property, + + # 'qc' is short in early versions of the file for any of the quick check + # properties. Choose one of them. + if ($prop eq 'qc' && $v_unicode_version le v4.0.0) { + $prop = "NFKC_QC"; + } + + # When changing properties, we examine the accumulated values for the old + # one to see if our function that returns them matches. + if ($prev_prop ne $prop) { + if ($prev_prop ne "") { # Skip for the first time through + my @ucd_function_values = prop_values($prev_prop); + @ucd_function_values = () unless @ucd_function_values; + + # The file didn't include strictly numeric values until after this + if ($prev_prop eq 'ccc' && $v_unicode_version le v6.0.0) { + @ucd_function_values = grep { /\D/ } @ucd_function_values; + } + + # This perl extension doesn't appear in the official file + push @this_prop_values, "Non_Canon" if $prev_prop eq 'dt'; + + my @file_values = undef; + @file_values = sort { lc($a =~ s/_//gr) cmp lc($b =~ s/_//gr) } + @this_prop_values if @this_prop_values; + is_deeply(\@ucd_function_values, \@file_values, + "prop_values('$prev_prop') returns correct list of values"); + } + $prev_prop = $prop; + undef @this_prop_values; + } + my $count = 0; # 0th field in line (after shifting off the property) is # short name; 1th is long name my $short_name; @@ -740,6 +1136,12 @@ while (<$propvalues>) { # characters that are ignored under loose matching to test that my $mod_prop = "$extra_chars$prop"; + if ($prop eq 'blk' && $v_unicode_version le v5.0.0) { + foreach my $element (@fields) { + $element =~ s/-/_/g; + } + } + if ($fields[0] eq 'n/a') { # See comments in input file, essentially # means full name and short name are identical $fields[0] = $fields[1]; @@ -765,6 +1167,7 @@ while (<$propvalues>) { my $loose_prop = &utf8::_loose_name(lc $prop); my $suppressed = grep { $_ eq $loose_prop } @Unicode::UCD::suppressed_properties; + push @this_prop_values, $fields[0] unless $suppressed; foreach my $value (@fields) { if ($suppressed) { is(prop_value_aliases($prop, $value), undef, "prop_value_aliases('$prop', '$value') returns undef for suppressed property $prop"); @@ -893,44 +1296,67 @@ use Unicode::UCD qw(prop_invlist prop_invmap MAX_CP); # elements are; just look at the first element to see if are getting the # distinction right. The general inversion map testing below will test the # whole thing. -my $prop = "uc"; -my ($invlist_ref, $invmap_ref, $format, $missing) = prop_invmap($prop); -is($format, 'al', "prop_invmap() format of '$prop' is 'al'"); -is($missing, '0', "prop_invmap() missing of '$prop' is '0'"); -is($invlist_ref->[1], 0x61, "prop_invmap('$prop') list[1] is 0x61"); -is($invmap_ref->[1], 0x41, "prop_invmap('$prop') map[1] is 0x41"); - -$prop = "upper"; -($invlist_ref, $invmap_ref, $format, $missing) = prop_invmap($prop); -is($format, 's', "prop_invmap() format of '$prop' is 's"); -is($missing, 'N', "prop_invmap() missing of '$prop' is 'N'"); -is($invlist_ref->[1], 0x41, "prop_invmap('$prop') list[1] is 0x41"); -is($invmap_ref->[1], 'Y', "prop_invmap('$prop') map[1] is 'Y'"); - -$prop = "lower"; -($invlist_ref, $invmap_ref, $format, $missing) = prop_invmap($prop); -is($format, 's', "prop_invmap() format of '$prop' is 's'"); -is($missing, 'N', "prop_invmap() missing of '$prop' is 'N'"); -is($invlist_ref->[1], 0x61, "prop_invmap('$prop') list[1] is 0x61"); -is($invmap_ref->[1], 'Y', "prop_invmap('$prop') map[1] is 'Y'"); - -$prop = "lc"; -($invlist_ref, $invmap_ref, $format, $missing) = prop_invmap($prop); -is($format, 'al', "prop_invmap() format of '$prop' is 'al'"); -is($missing, '0', "prop_invmap() missing of '$prop' is '0'"); -is($invlist_ref->[1], 0x41, "prop_invmap('$prop') list[1] is 0x41"); -is($invmap_ref->[1], 0x61, "prop_invmap('$prop') map[1] is 0x61"); + +my $prop; +my ($invlist_ref, $invmap_ref, $format, $missing); +if ($::IS_ASCII) { # On EBCDIC, other things will come first, and can vary + # according to code page + $prop = "uc"; + ($invlist_ref, $invmap_ref, $format, $missing) = prop_invmap($prop); + is($format, 'al', "prop_invmap() format of '$prop' is 'al'"); + is($missing, '0', "prop_invmap() missing of '$prop' is '0'"); + is($invlist_ref->[1], 0x61, "prop_invmap('$prop') list[1] is 0x61"); + is($invmap_ref->[1], 0x41, "prop_invmap('$prop') map[1] is 0x41"); + + $prop = "upper"; + ($invlist_ref, $invmap_ref, $format, $missing) = prop_invmap($prop); + is($format, 's', "prop_invmap() format of '$prop' is 's"); + is($missing, 'N', "prop_invmap() missing of '$prop' is 'N'"); + is($invlist_ref->[1], 0x41, "prop_invmap('$prop') list[1] is 0x41"); + is($invmap_ref->[1], 'Y', "prop_invmap('$prop') map[1] is 'Y'"); + + $prop = "lower"; + ($invlist_ref, $invmap_ref, $format, $missing) = prop_invmap($prop); + is($format, 's', "prop_invmap() format of '$prop' is 's'"); + is($missing, 'N', "prop_invmap() missing of '$prop' is 'N'"); + is($invlist_ref->[1], 0x61, "prop_invmap('$prop') list[1] is 0x61"); + is($invmap_ref->[1], 'Y', "prop_invmap('$prop') map[1] is 'Y'"); + + $prop = "lc"; + ($invlist_ref, $invmap_ref, $format, $missing) = prop_invmap($prop); + my $lc_format = ($v_unicode_version ge v3.2.0) ? 'al' : 'a'; + is($format, $lc_format, "prop_invmap() format of '$prop' is '$lc_format"); + is($missing, '0', "prop_invmap() missing of '$prop' is '0'"); + is($invlist_ref->[1], 0x41, "prop_invmap('$prop') list[1] is 0x41"); + is($invmap_ref->[1], 0x61, "prop_invmap('$prop') map[1] is 0x61"); +} # This property is stable and small, so can test all of it -$prop = "ASCII_Hex_Digit"; -($invlist_ref, $invmap_ref, $format, $missing) = prop_invmap($prop); -is($format, 's', "prop_invmap() format of '$prop' is 's'"); -is($missing, 'N', "prop_invmap() missing of '$prop' is 'N'"); -is_deeply($invlist_ref, [ 0x0000, 0x0030, 0x003A, 0x0041, - 0x0047, 0x0061, 0x0067, 0x110000 ], - "prop_invmap('$prop') code point list is correct"); -is_deeply($invmap_ref, [ 'N', 'Y', 'N', 'Y', 'N', 'Y', 'N', 'N' ] , - "prop_invmap('$prop') map list is correct"); +if ($v_unicode_version gt v3.1.0) { + $prop = "ASCII_Hex_Digit"; + ($invlist_ref, $invmap_ref, $format, $missing) = prop_invmap($prop); + is($format, 's', "prop_invmap() format of '$prop' is 's'"); + is($missing, 'N', "prop_invmap() missing of '$prop' is 'N'"); + if ($::IS_ASCII) { + is_deeply($invlist_ref, [ 0x0000, 0x0030, 0x003A, + 0x0041, 0x0047, + 0x0061, 0x0067, 0x110000 + ], + "prop_invmap('$prop') code point list is correct"); + } + elsif ($::IS_EBCDIC) { + is_deeply($invlist_ref, [ + utf8::unicode_to_native(0x0000), + utf8::unicode_to_native(0x0061), utf8::unicode_to_native(0x0066) + 1, + utf8::unicode_to_native(0x0041), utf8::unicode_to_native(0x0046) + 1, + utf8::unicode_to_native(0x0030), utf8::unicode_to_native(0x0039) + 1, + utf8::unicode_to_native(0x110000) + ], + "prop_invmap('$prop') code point list is correct"); + } + is_deeply($invmap_ref, [ 'N', 'Y', 'N', 'Y', 'N', 'Y', 'N', 'N' ] , + "prop_invmap('$prop') map list is correct"); +} is(prop_invlist("Unknown property"), undef, "prop_invlist(<Unknown property>) returns undef"); is(prop_invlist(undef), undef, "prop_invlist(undef) returns undef"); @@ -951,14 +1377,39 @@ is(prop_invlist("InKana"), undef, "prop_invlist(<user-defined property returns u # are there in the files. As a small hedge against that, test some # prop_invlist() tables fully with the known correct result. We choose # ASCII_Hex_Digit again, as it is stable. -@invlist = prop_invlist("AHex"); -is_deeply(\@invlist, [ 0x0030, 0x003A, 0x0041, - 0x0047, 0x0061, 0x0067 ], - "prop_invlist('AHex') is exactly the expected set of points"); -@invlist = prop_invlist("AHex=f"); -is_deeply(\@invlist, [ 0x0000, 0x0030, 0x003A, 0x0041, - 0x0047, 0x0061, 0x0067 ], - "prop_invlist('AHex=f') is exactly the expected set of points"); +if ($v_unicode_version gt v3.1.0) { + if ($::IS_ASCII) { + @invlist = prop_invlist("AHex"); + is_deeply(\@invlist, [ 0x0030, 0x003A, 0x0041, + 0x0047, 0x0061, 0x0067 ], + "prop_invlist('AHex') is exactly the expected set of points"); + @invlist = prop_invlist("AHex=f"); + is_deeply(\@invlist, [ 0x0000, 0x0030, 0x003A, 0x0041, + 0x0047, 0x0061, 0x0067 ], + "prop_invlist('AHex=f') is exactly the expected set of points"); + } + elsif ($::IS_EBCDIC) { # Relies on the ranges 0-9, a-f, and A-F each being + # contiguous + @invlist = prop_invlist("AHex"); + is_deeply(\@invlist, [ + utf8::unicode_to_native(0x0061), utf8::unicode_to_native(0x0066) + 1, + utf8::unicode_to_native(0x0041), utf8::unicode_to_native(0x0046) + 1, + utf8::unicode_to_native(0x0030), utf8::unicode_to_native(0x0039) + 1, + ], + "prop_invlist('AHex') is exactly the expected set of points"); + @invlist = prop_invlist("AHex=f"); + is_deeply(\@invlist, [ + utf8::unicode_to_native(0x0000), + utf8::unicode_to_native(0x0061), + utf8::unicode_to_native(0x0066) + 1, + utf8::unicode_to_native(0x0041), + utf8::unicode_to_native(0x0046) + 1, + utf8::unicode_to_native(0x0030), + utf8::unicode_to_native(0x0039) + 1, + ], + "prop_invlist('AHex=f') is exactly the expected set of points"); + } +} sub fail_with_diff ($$$$) { # For use below to output better messages @@ -1235,7 +1686,7 @@ foreach my $prop (sort(keys %props), sort keys %legacy_props) { fail("prop_invmap('$prop')"); diag("is unknown to prop_aliases(), and we need it in order to test prop_invmap"); } - next PROPERTY; + next PROPERTY; } } @@ -1243,6 +1694,14 @@ foreach my $prop (sort(keys %props), sort keys %legacy_props) { # normalized version. $name = &utf8::_loose_name(lc $name); + # In the case of a combination property, both a map table and a match + # table are generated. For all the tests except prop_invmap(), this is + # irrelevant, but for prop_invmap, having an 'is' prefix forces it to + # return the match table; otherwise the map. We thus need to distinguish + # between the two forms. The property name is what has this information. + $name = &utf8::_loose_name(lc $prop) + if exists $Unicode::UCD::combination_property{$name}; + # Add in the characters that are supposed to be ignored to test loose # matching, which the tested function applies to all properties $display_prop = "$extra_chars$prop" unless $display_prop; @@ -1452,23 +1911,46 @@ foreach my $prop (sort(keys %props), sort keys %legacy_props) { if ($name eq 'blk') { # The blk property is special. The original file with old block - # names is retained, and the default is to not write out a - # new-name file. What we do is get the old names into a data - # structure, and from that create what the new file would look - # like. $base_file is needed to be defined, just to avoid a - # message below. + # names is retained, and the default (on ASCII platforms) is to + # not write out a new-name file. What we do is get the old names + # into a data structure, and from that create what the new file + # would look like. $base_file is needed to be defined, just to + # avoid a message below. $base_file = "This is a dummy name"; my $blocks_ref = charblocks(); + + if ($::IS_EBCDIC) { + # On EBCDIC, the first two blocks can each contain multiple + # ranges. We create a new version with each of these + # flattened, so have one level. ($index is used as a dummy + # key.) + my %new_blocks; + my $index = 0; + foreach my $block (values %$blocks_ref) { + foreach my $range (@$block) { + $new_blocks{$index++}[0] = $range; + } + } + $blocks_ref = \%new_blocks; + } $official = ""; for my $range (sort { $a->[0][0] <=> $b->[0][0] } values %$blocks_ref) { # Translate the charblocks() data structure to what the file - # would like. - $official .= sprintf"%X\t%X\t%s\n", - $range->[0][0], - $range->[0][1], - $range->[0][2]; + # would look like. (The sub range is for EBCDIC platforms + # where Latin1 and ASCII are intermixed.) + if ($range->[0][0] == $range->[0][1]) { + $official .= sprintf("%X\t\t%s\n", + $range->[0][0], + $range->[0][2]); + } + else { + $official .= sprintf("%X\t%X\t%s\n", + $range->[0][0], + $range->[0][1], + $range->[0][2]); + } } } else { @@ -1587,10 +2069,13 @@ foreach my $prop (sort(keys %props), sort keys %legacy_props) { # including the ones that are overridden by the specials. These # need to be removed as the list is for just the full ones. - # Go through any special mappings one by one. They are packed. + # Go through any special mappings one by one. The keys are the + # UTF-8 representation of code points. my $i = 0; foreach my $utf8_cp (sort keys %$specials_ref) { - my $cp = unpack("C0U", $utf8_cp); + my $cp = $utf8_cp; + utf8::decode($cp); + $cp = ord $cp; # Find the spot in the @list of simple mappings that this # special applies to; uses a linear search. @@ -1736,20 +2221,22 @@ foreach my $prop (sort(keys %props), sort keys %legacy_props) { elsif ($format =~ / ^ al e? $/x) { # For an al property, the stringified result should be in - # the specials hash. The key is the packed code point, - # and the value is the packed map. + # the specials hash. The key is the utf8 bytes of the + # code point, and the value is its map as a utf-8 string. my $value; - if (! defined ($value = delete $specials{pack("C0U", - $invlist_ref->[$i]) })) - { + my $key = chr $invlist_ref->[$i]; + utf8::encode($key); + if (! defined ($value = delete $specials{$key})) { fail("prop_invmap('$display_prop')"); diag(sprintf "There was no specials element for %04X", $invlist_ref->[$i]); next PROPERTY; } - my $packed = pack "U*", @{$invmap_ref->[$i]}; + my $packed = pack "W*", @{$invmap_ref->[$i]}; + utf8::upgrade($packed); if ($value ne $packed) { fail("prop_invmap('$display_prop')"); - diag(sprintf "For %04X, expected the mapping to be '$packed', but got '$value'"); + diag(sprintf "For %04X, expected the mapping to be " + . "'$packed', but got '$value'", $invlist_ref->[$i]); next PROPERTY; } @@ -1813,12 +2300,12 @@ foreach my $prop (sort(keys %props), sort keys %legacy_props) { if ($format eq 'ale' && $invmap_ref->[$i] eq "") { # ale properties have maps to the empty string that also - # should be in the specials hash, with the key the packed - # code point, and the map just empty. + # should be in the specials hash, with the key the utf8 + # bytes representing the code point, and the map just empty. my $value; - if (! defined ($value = delete $specials{pack("C0U", - $invlist_ref->[$i]) })) - { + my $key = chr $invlist_ref->[$i]; + utf8::encode($key); + if (! defined ($value = delete $specials{$key})) { fail("prop_invmap('$display_prop')"); diag(sprintf "There was no specials element for %04X", $invlist_ref->[$i]); next PROPERTY; @@ -1957,13 +2444,19 @@ foreach my $prop (sort(keys %props), sort keys %legacy_props) { # shouldn't be in the property. This gets rid of the two ranges in # one fell swoop, and also all the Unicode1_Name values that may not # be in Name_Alias. - $official =~ s/ 00000 \t .* 0001F .*? \n//xs; - $official =~ s/ 0007F \t .* 0009F .*? \n//xs; + if ($::IS_ASCII) { + $official =~ s/ 00000 \t .* 0001F .*? \n//xs; + $official =~ s/ 0007F \t .* 0009F .*? \n//xs; + } + elsif ($::IS_EBCDIC) { # Won't work for POSIX-BC + $official =~ s/ 00000 \t .* 0003F .*? \n//xs; + $official =~ s/ 000FF \t .* 000FF .*? \n//xs; + } # And remove the aliases. We read in the Name_Alias property, and go # through them one by one. my ($aliases_code_points, $aliases_maps, undef, undef) - = &prop_invmap('Name_Alias'); + = &prop_invmap('_Perl_Name_Alias', '_perl_core_internal_ok'); for (my $i = 0; $i < @$aliases_code_points; $i++) { my $code_point = $aliases_code_points->[$i]; @@ -2188,11 +2681,13 @@ foreach my $prop (sort(keys %props), sort keys %legacy_props) { # A few tests of search_invlist use Unicode::UCD qw(search_invlist); -my ($scripts_ranges_ref, $scripts_map_ref) = prop_invmap("Script"); -my $index = search_invlist($scripts_ranges_ref, 0x390); -is($scripts_map_ref->[$index], "Greek", "U+0390 is Greek"); -my @alpha_invlist = prop_invlist("Alpha"); -is(search_invlist(\@alpha_invlist, ord("\t")), undef, "search_invlist returns undef for code points before first one on the list"); +if ($v_unicode_version ge v3.1.0) { # No Script property before this + my ($scripts_ranges_ref, $scripts_map_ref) = prop_invmap("Script"); + my $index = search_invlist($scripts_ranges_ref, 0x390); + is($scripts_map_ref->[$index], "Greek", "U+0390 is Greek"); + my @alpha_invlist = prop_invlist("Alpha"); + is(search_invlist(\@alpha_invlist, ord("\t")), undef, "search_invlist returns undef for code points before first one on the list"); +} ok($/ eq $input_record_separator, "The record separator didn't get overridden"); |