import perl 5.16.3 from CPAN - worked on by Andrew Fresh and myself

author: sthen <sthen@openbsd.org> 2013-03-25 20:06:16 +0000
committer: sthen <sthen@openbsd.org> 2013-03-25 20:06:16 +0000
commit: 898184e3e61f9129feb5978fad5a8c6865f00b92 (patch)
tree: 56f32aefc1eed60b534611007c7856f82697a205 /gnu/usr.bin/perl/lib/Unicode
parent: PGSHIFT -> PAGE_SHIFT (diff)
download: wireguard-openbsd-898184e3e61f9129feb5978fad5a8c6865f00b92.tar.xz
wireguard-openbsd-898184e3e61f9129feb5978fad5a8c6865f00b92.zip
2 files changed, 4013 insertions, 263 deletions
diff --git a/gnu/usr.bin/perl/lib/Unicode/UCD.pm b/gnu/usr.bin/perl/lib/Unicode/UCD.pm
index c6ee8e05fe2..724fb62785b 100644
--- a/gnu/usr.bin/perl/lib/Unicode/UCD.pm
+++ b/gnu/usr.bin/perl/lib/Unicode/UCD.pm
@@ -2,8 +2,11 @@ package Unicode::UCD;
 
 use strict;
 use warnings;
+no warnings 'surrogate';    # surrogates can be inputs to this
+use charnames ();
+use Unicode::Normalize qw(getCombinClass NFD);
 
-our $VERSION = '0.27';
+our $VERSION = '0.43';
 
 use Storable qw(dclone);
 
@@ -18,7 +21,14 @@ our @EXPORT_OK = qw(charinfo
 		    general_categories bidi_types
 		    compexcl
 		    casefold casespec
-		    namedseq);
+		    namedseq
+                    num
+                    prop_aliases
+                    prop_value_aliases
+                    prop_invlist
+                    prop_invmap
+                    MAX_CP
+                );
 
 use Carp;
 
@@ -57,6 +67,19 @@ Unicode::UCD - Unicode character database
     my $categories = general_categories();
     my $types = bidi_types();
 
+    use Unicode::UCD 'prop_aliases';
+    my @space_names = prop_aliases("space");
+
+    use Unicode::UCD 'prop_value_aliases';
+    my @gc_punct_names = prop_value_aliases("Gc", "Punct");
+
+    use Unicode::UCD 'prop_invlist';
+    my @puncts = prop_invlist("gc=punctuation");
+
+    use Unicode::UCD 'prop_invmap';
+    my ($list_ref, $map_ref, $format, $missing)
+                                      = prop_invmap("General Category");
+
     use Unicode::UCD 'compexcl';
     my $compexcl = compexcl($codepoint);
 
@@ -65,6 +88,9 @@ Unicode::UCD - Unicode character database
 
     my $unicode_version = Unicode::UCD::UnicodeVersion();
 
+    my $convert_to_numeric =
+              Unicode::UCD::num("\N{RUMI DIGIT ONE}\N{RUMI DIGIT TWO}");
+
 =head1 DESCRIPTION
 
 The Unicode::UCD module offers a series of functions that
@@ -78,16 +104,13 @@ a decimal or a hexadecimal scalar designating a Unicode code point, or C<U+>
 followed by hexadecimals designating a Unicode code point.  In other words, if
 you want a code point to be interpreted as a hexadecimal number, you must
 prefix it with either C<0x> or C<U+>, because a string like e.g. C<123> will be
-interpreted as a decimal code point.  Also note that Unicode is B<not> limited
-to 16 bits (the number of Unicode code points is open-ended, in theory
-unlimited): you may have more than 4 hexdigits.
+interpreted as a decimal code point.  Note that the largest code point in
+Unicode is U+10FFFF.
+
 =cut
 
-my $UNICODEFH;
 my $BLOCKSFH;
-my $SCRIPTSFH;
 my $VERSIONFH;
-my $COMPEXCLFH;
 my $CASEFOLDFH;
 my $CASESPECFH;
 my $NAMEDSEQFH;
@@ -121,7 +144,7 @@ standard.  If the L</code point argument> is not assigned in the standard
 (i.e., has the general category C<Cn> meaning C<Unassigned>)
 or is a non-character (meaning it is guaranteed to never be assigned in
 the standard),
-B<undef> is returned.
+C<undef> is returned.
 
 Fields that aren't applicable to the particular code point argument exist in the
 returned hash, and are empty. 
@@ -150,6 +173,9 @@ C<E<lt>controlE<gt>>.
 The short name of the general category of I<code>.
 This will match one of the keys in the hash returned by L</general_categories()>.
 
+The L</prop_value_aliases()> function can be used to get all the synonyms
+of the category name.
+
 =item B<combining>
 
 the combining class number for I<code> used in the Canonical Ordering Algorithm.
@@ -157,26 +183,36 @@ For Unicode 5.1, this is described in Section 3.11 C<Canonical Ordering Behavior
 available at
 L<http://www.unicode.org/versions/Unicode5.1.0/>
 
+The L</prop_value_aliases()> function can be used to get all the synonyms
+of the combining class number.
+
 =item B<bidi>
 
 bidirectional type of I<code>.
 This will match one of the keys in the hash returned by L</bidi_types()>.
 
+The L</prop_value_aliases()> function can be used to get all the synonyms
+of the bidi type name.
+
 =item B<decomposition>
 
 is empty if I<code> has no decomposition; or is one or more codes
-(separated by spaces) that taken in order represent a decomposition for
+(separated by spaces) that, taken in order, represent a decomposition for
 I<code>.  Each has at least four hexdigits.
 The codes may be preceded by a word enclosed in angle brackets then a space,
 like C<E<lt>compatE<gt> >, giving the type of decomposition
 
+This decomposition may be an intermediate one whose components are also
+decomposable.  Use L<Unicode::Normalize> to get the final decomposition.
+
 =item B<decimal>
 
 if I<code> is a decimal digit this is its integer numeric value
 
 =item B<digit>
 
-if I<code> represents a whole number, this is its integer numeric value
+if I<code> represents some other digit-like number, this is its integer
+numeric value
 
 =item B<numeric>
 
@@ -194,14 +230,12 @@ existed for this code point and is different from the current name
 
 =item B<comment>
 
-ISO 10646 comment field.
-It appears in parentheses in the ISO 10646 names list,
-or contains an asterisk to indicate there is
-a note for this code point in Annex P of that standard.
+As of Unicode 6.0, this is always empty.
 
 =item B<upper>
 
-is empty if there is no single code point uppercase mapping for I<code>;
+is empty if there is no single code point uppercase mapping for I<code>
+(its uppercase mapping is itself);
 otherwise it is that mapping expressed as at least four hexdigits.
 (L</casespec()> should be used in addition to B<charinfo()>
 for case mappings when the calling program can cope with multiple code point
@@ -209,7 +243,8 @@ mappings.)
 
 =item B<lower>
 
-is empty if there is no single code point lowercase mapping for I<code>;
+is empty if there is no single code point lowercase mapping for I<code>
+(its lowercase mapping is itself);
 otherwise it is that mapping expressed as at least four hexdigits.
 (L</casespec()> should be used in addition to B<charinfo()>
 for case mappings when the calling program can cope with multiple code point
@@ -217,7 +252,8 @@ mappings.)
 
 =item B<title>
 
-is empty if there is no single code point titlecase mapping for I<code>;
+is empty if there is no single code point titlecase mapping for I<code>
+(its titlecase mapping is itself);
 otherwise it is that mapping expressed as at least four hexdigits.
 (L</casespec()> should be used in addition to B<charinfo()>
 for case mappings when the calling program can cope with multiple code point
@@ -225,13 +261,13 @@ mappings.)
 
 =item B<block>
 
-block I<code> belongs to (used in \p{In...}).
+the block I<code> belongs to (used in C<\p{Blk=...}>).
 See L</Blocks versus Scripts>.
 
 
 =item B<script>
 
-script I<code> belongs to.
+the script I<code> belongs to.
 See L</Blocks versus Scripts>.
 
 =back
@@ -242,7 +278,7 @@ you will need also the L</compexcl()>, and L</casespec()> functions.
 
 =cut
 
-# NB: This function is duplicated in charnames.pm
+# NB: This function is nearly duplicated in charnames.pm
 sub _getcode {
     my $arg = shift;
 
@@ -255,134 +291,137 @@ sub _getcode {
     return;
 }
 
-# Lingua::KO::Hangul::Util not part of the standard distribution
-# but it will be used if available.
+# Populated by _num.  Converts real number back to input rational
+my %real_to_rational;
 
-eval { require Lingua::KO::Hangul::Util };
-my $hasHangulUtil = ! $@;
-if ($hasHangulUtil) {
-    Lingua::KO::Hangul::Util->import();
-}
-
-sub hangul_decomp { # internal: called from charinfo
-    if ($hasHangulUtil) {
-	my @tmp = decomposeHangul(shift);
-	return sprintf("%04X %04X",      @tmp) if @tmp == 2;
-	return sprintf("%04X %04X %04X", @tmp) if @tmp == 3;
-    }
-    return;
-}
+# To store the contents of files found on disk.
+my @BIDIS;
+my @CATEGORIES;
+my @DECOMPOSITIONS;
+my @NUMERIC_TYPES;
+my %SIMPLE_LOWER;
+my %SIMPLE_TITLE;
+my %SIMPLE_UPPER;
+my %UNICODE_1_NAMES;
 
-sub hangul_charname { # internal: called from charinfo
-    return sprintf("HANGUL SYLLABLE-%04X", shift);
-}
+sub charinfo {
 
-sub han_charname { # internal: called from charinfo
-    return sprintf("CJK UNIFIED IDEOGRAPH-%04X", shift);
-}
+    # This function has traditionally mimicked what is in UnicodeData.txt,
+    # warts and all.  This is a re-write that avoids UnicodeData.txt so that
+    # it can be removed to save disk space.  Instead, this assembles
+    # information gotten by other methods that get data from various other
+    # files.  It uses charnames to get the character name; and various
+    # mktables tables.
 
-# Overwritten by data in file
-my %first_last = (
-   'CJK Ideograph Extension A' => [ 0x3400,   0x4DB5   ],
-   'CJK Ideograph'             => [ 0x4E00,   0x9FA5   ],
-   'CJK Ideograph Extension B' => [ 0x20000,  0x2A6D6  ],
-);
-
-get_charinfo_ranges();
-
-sub get_charinfo_ranges {
-   my @blocks = keys %first_last;
-   
-   my $fh;
-   openunicode( \$fh, 'UnicodeData.txt' );
-   if( defined $fh ){
-      while( my $line = <$fh> ){
-         next unless $line =~ /(?:First|Last)/;
-         if( grep{ $line =~ /[^;]+;<$_\s*,\s*(?:First|Last)>/ }@blocks ){
-            my ($number,$block,$type);
-            ($number,$block) = split /;/, $line;
-            $block =~ s/<|>//g;
-            ($block,$type) = split /, /, $block;
-            my $index = $type eq 'First' ? 0 : 1;
-            $first_last{ $block }->[$index] = hex $number;
-         }
-      }
-   }
-}
+    use feature 'unicode_strings';
 
-my @CharinfoRanges = (
-# block name
-# [ first, last, coderef to name, coderef to decompose ],
-# CJK Ideographs Extension A
-  [ @{ $first_last{'CJK Ideograph Extension A'} },        \&han_charname,   undef  ],
-# CJK Ideographs
-  [ @{ $first_last{'CJK Ideograph'} },                    \&han_charname,   undef  ],
-# Hangul Syllables
-  [ 0xAC00,   0xD7A3,   $hasHangulUtil ? \&getHangulName : \&hangul_charname,  \&hangul_decomp ],
-# Non-Private Use High Surrogates
-  [ 0xD800,   0xDB7F,   undef,   undef  ],
-# Private Use High Surrogates
-  [ 0xDB80,   0xDBFF,   undef,   undef  ],
-# Low Surrogates
-  [ 0xDC00,   0xDFFF,   undef,   undef  ],
-# The Private Use Area
-  [ 0xE000,   0xF8FF,   undef,   undef  ],
-# CJK Ideographs Extension B
-  [ @{ $first_last{'CJK Ideograph Extension B'} },        \&han_charname,   undef  ],
-# Plane 15 Private Use Area
-  [ 0xF0000,  0xFFFFD,  undef,   undef  ],
-# Plane 16 Private Use Area
-  [ 0x100000, 0x10FFFD, undef,   undef  ],
-);
-
-sub charinfo {
     my $arg  = shift;
     my $code = _getcode($arg);
-    croak __PACKAGE__, "::charinfo: unknown code '$arg'"
-	unless defined $code;
-    my $hexk = sprintf("%06X", $code);
-    my($rcode,$rname,$rdec);
-    foreach my $range (@CharinfoRanges){
-      if ($range->[0] <= $code && $code <= $range->[1]) {
-        $rcode = $hexk;
-	$rcode =~ s/^0+//;
-	$rcode =  sprintf("%04X", hex($rcode));
-        $rname = $range->[2] ? $range->[2]->($code) : '';
-        $rdec  = $range->[3] ? $range->[3]->($code) : '';
-        $hexk  = sprintf("%06X", $range->[0]); # replace by the first
-        last;
-      }
+    croak __PACKAGE__, "::charinfo: unknown code '$arg'" unless defined $code;
+
+    # Non-unicode implies undef.
+    return if $code > 0x10FFFF;
+
+    my %prop;
+    my $char = chr($code);
+
+    @CATEGORIES =_read_table("To/Gc.pl") unless @CATEGORIES;
+    $prop{'category'} = _search(\@CATEGORIES, 0, $#CATEGORIES, $code)
+                        // $utf8::SwashInfo{'ToGc'}{'missing'};
+
+    return if $prop{'category'} eq 'Cn';    # Unassigned code points are undef
+
+    $prop{'code'} = sprintf "%04X", $code;
+    $prop{'name'} = ($char =~ /\p{Cntrl}/) ? '<control>'
+                                           : (charnames::viacode($code) // "");
+
+    $prop{'combining'} = getCombinClass($code);
+
+    @BIDIS =_read_table("To/Bc.pl") unless @BIDIS;
+    $prop{'bidi'} = _search(\@BIDIS, 0, $#BIDIS, $code)
+                    // $utf8::SwashInfo{'ToBc'}{'missing'};
+
+    # For most code points, we can just read in "unicore/Decomposition.pl", as
+    # its contents are exactly what should be output.  But that file doesn't
+    # contain the data for the Hangul syllable decompositions, which can be
+    # algorithmically computed, and NFD() does that, so we call NFD() for
+    # those.  We can't use NFD() for everything, as it does a complete
+    # recursive decomposition, and what this function has always done is to
+    # return what's in UnicodeData.txt which doesn't show that recursiveness.
+    # Fortunately, the NFD() of the Hanguls doesn't have any recursion
+    # issues.
+    # Having no decomposition implies an empty field; otherwise, all but
+    # "Canonical" imply a compatible decomposition, and the type is prefixed
+    # to that, as it is in UnicodeData.txt
+    if ($char =~ /\p{Block=Hangul_Syllables}/) {
+        # The code points of the decomposition are output in standard Unicode
+        # hex format, separated by blanks.
+        $prop{'decomposition'} = join " ", map { sprintf("%04X", $_)}
+                                           unpack "U*", NFD($char);
     }
-    openunicode(\$UNICODEFH, "UnicodeData.txt");
-    if (defined $UNICODEFH) {
-	use Search::Dict 1.02;
-	if (look($UNICODEFH, "$hexk;", { xfrm => sub { $_[0] =~ /^([^;]+);(.+)/; sprintf "%06X;$2", hex($1) } } ) >= 0) {
-	    my $line = <$UNICODEFH>;
-	    return unless defined $line;
-	    chomp $line;
-	    my %prop;
-	    @prop{qw(
-		     code name category
-		     combining bidi decomposition
-		     decimal digit numeric
-		     mirrored unicode10 comment
-		     upper lower title
-		    )} = split(/;/, $line, -1);
-	    $hexk =~ s/^0+//;
-	    $hexk =  sprintf("%04X", hex($hexk));
-	    if ($prop{code} eq $hexk) {
-		$prop{block}  = charblock($code);
-		$prop{script} = charscript($code);
-		if(defined $rname){
-                    $prop{code} = $rcode;
-                    $prop{name} = $rname;
-                    $prop{decomposition} = $rdec;
-                }
-		return \%prop;
-	    }
-	}
+    else {
+        @DECOMPOSITIONS = _read_table("Decomposition.pl")
+                          unless @DECOMPOSITIONS;
+        $prop{'decomposition'} = _search(\@DECOMPOSITIONS, 0, $#DECOMPOSITIONS,
+                                                                $code) // "";
     }
-    return;
+
+    # Can use num() to get the numeric values, if any.
+    if (! defined (my $value = num($char))) {
+        $prop{'decimal'} = $prop{'digit'} = $prop{'numeric'} = "";
+    }
+    else {
+        if ($char =~ /\d/) {
+            $prop{'decimal'} = $prop{'digit'} = $prop{'numeric'} = $value;
+        }
+        else {
+
+            # For non-decimal-digits, we have to read in the Numeric type
+            # to distinguish them.  It is not just a matter of integer vs.
+            # rational, as some whole number values are not considered digits,
+            # e.g., TAMIL NUMBER TEN.
+            $prop{'decimal'} = "";
+
+            @NUMERIC_TYPES =_read_table("To/Nt.pl") unless @NUMERIC_TYPES;
+            if ((_search(\@NUMERIC_TYPES, 0, $#NUMERIC_TYPES, $code) // "")
+                eq 'Digit')
+            {
+                $prop{'digit'} = $prop{'numeric'} = $value;
+            }
+            else {
+                $prop{'digit'} = "";
+                $prop{'numeric'} = $real_to_rational{$value} // $value;
+            }
+        }
+    }
+
+    $prop{'mirrored'} = ($char =~ /\p{Bidi_Mirrored}/) ? 'Y' : 'N';
+
+    %UNICODE_1_NAMES =_read_table("To/Na1.pl", "use_hash") unless %UNICODE_1_NAMES;
+    $prop{'unicode10'} = $UNICODE_1_NAMES{$code} // "";
+
+    # This is true starting in 6.0, but, num() also requires 6.0, so
+    # don't need to test for version again here.
+    $prop{'comment'} = "";
+
+    %SIMPLE_UPPER = _read_table("To/Uc.pl", "use_hash") unless %SIMPLE_UPPER;
+    $prop{'upper'} = (defined $SIMPLE_UPPER{$code})
+                     ? sprintf("%04X", $SIMPLE_UPPER{$code})
+                     : "";
+
+    %SIMPLE_LOWER = _read_table("To/Lc.pl", "use_hash") unless %SIMPLE_LOWER;
+    $prop{'lower'} = (defined $SIMPLE_LOWER{$code})
+                     ? sprintf("%04X", $SIMPLE_LOWER{$code})
+                     : "";
+
+    %SIMPLE_TITLE = _read_table("To/Tc.pl", "use_hash") unless %SIMPLE_TITLE;
+    $prop{'title'} = (defined $SIMPLE_TITLE{$code})
+                     ? sprintf("%04X", $SIMPLE_TITLE{$code})
+                     : "";
+
+    $prop{block}  = charblock($code);
+    $prop{script} = charscript($code);
+    return \%prop;
 }
 
 sub _search { # Binary search in a [[lo,hi,prop],[...],...] table.
@@ -405,6 +444,75 @@ sub _search { # Binary search in a [[lo,hi,prop],[...],...] table.
     }
 }
 
+sub _read_table ($;$) {
+
+    # Returns the contents of the mktables generated table file located at $1
+    # in the form of either an array of arrays or a hash, depending on if the
+    # optional second parameter is true (for hash return) or not.  In the case
+    # of a hash return, each key is a code point, and its corresponding value
+    # is what the table gives as the code point's corresponding value.  In the
+    # case of an array return, each outer array denotes a range with [0] the
+    # start point of that range; [1] the end point; and [2] the value that
+    # every code point in the range has.  The hash return is useful for fast
+    # lookup when the table contains only single code point ranges.  The array
+    # return takes much less memory when there are large ranges.
+    #
+    # This function has the side effect of setting
+    # $utf8::SwashInfo{$property}{'format'} to be the mktables format of the
+    #                                       table; and
+    # $utf8::SwashInfo{$property}{'missing'} to be the value for all entries
+    #                                        not listed in the table.
+    # where $property is the Unicode property name, preceded by 'To' for map
+    # properties., e.g., 'ToSc'.
+    #
+    # Table entries look like one of:
+    # 0000	0040	Common	# [65]
+    # 00AA		Latin
+
+    my $table = shift;
+    my $return_hash = shift;
+    $return_hash = 0 unless defined $return_hash;
+    my @return;
+    my %return;
+    local $_;
+    my $list = do "unicore/$table";
+
+    # Look up if this property requires adjustments, which we do below if it
+    # does.
+    require "unicore/Heavy.pl";
+    my $property = $table =~ s/\.pl//r;
+    $property = $utf8::file_to_swash_name{$property};
+    my $to_adjust = defined $property
+                    && $utf8::SwashInfo{$property}{'format'} eq 'a';
+
+    for (split /^/m, $list) {
+        my ($start, $end, $value) = / ^ (.+?) \t (.*?) \t (.+?)
+                                        \s* ( \# .* )?  # Optional comment
+                                        $ /x;
+        my $decimal_start = hex $start;
+        my $decimal_end = ($end eq "") ? $decimal_start : hex $end;
+        if ($return_hash) {
+            foreach my $i ($decimal_start .. $decimal_end) {
+                $return{$i} = ($to_adjust)
+                              ? $value + $i - $decimal_start
+                              : $value;
+            }
+        }
+        elsif (! $to_adjust
+               && @return
+               && $return[-1][1] == $decimal_start - 1
+               && $return[-1][2] eq $value)
+        {
+            # If this is merely extending the previous range, do just that.
+            $return[-1]->[1] = $decimal_end;
+        }
+        else {
+            push @return, [ $decimal_start, $decimal_end, $value ];
+        }
+    }
+    return ($return_hash) ? %return : @return;
+}
+
 sub charinrange {
     my ($range, $arg) = @_;
     my $code = _getcode($arg);
@@ -425,18 +533,24 @@ sub charinrange {
     my $range     = charblock('Armenian');
 
 With a L</code point argument> charblock() returns the I<block> the code point
-belongs to, e.g.  C<Basic Latin>.
+belongs to, e.g.  C<Basic Latin>.  The old-style block name is returned (see
+L</Old-style versus new-style block names>).
 If the code point is unassigned, this returns the block it would belong to if
-it were assigned (which it may in future versions of the Unicode Standard).
+it were assigned.
 
 See also L</Blocks versus Scripts>.
 
-If supplied with an argument that can't be a code point, charblock() tries
-to do the opposite and interpret the argument as a code point block. The
-return value is a I<range>: an anonymous list of lists that contain
-I<start-of-range>, I<end-of-range> code point pairs. You can test whether
-a code point is in a range using the L</charinrange()> function. If the
-argument is not a known code point block, B<undef> is returned.
+If supplied with an argument that can't be a code point, charblock() tries to
+do the opposite and interpret the argument as an old-style block name. The
+return value
+is a I<range set> with one range: an anonymous list with a single element that
+consists of another anonymous list whose first element is the first code point
+in the block, and whose second (and final) element is the final code point in
+the block.  (The extra list consisting of just one element is so that the same
+program logic can be used to handle both this return, and the return from
+L</charscript()> which can have multiple ranges.) You can test whether a code
+point is in a range using the L</charinrange()> function.  If the argument is
+not a known block, C<undef> is returned.
 
 =cut
 
@@ -444,9 +558,13 @@ my @BLOCKS;
 my %BLOCKS;
 
 sub _charblocks {
+
+    # Can't read from the mktables table because it loses the hyphens in the
+    # original.
     unless (@BLOCKS) {
 	if (openunicode(\$BLOCKSFH, "Blocks.txt")) {
 	    local $_;
+	    local $/ = "\n";
 	    while (<$BLOCKSFH>) {
 		if (/^([0-9A-F]+)\.\.([0-9A-F]+);\s+(.+)/) {
 		    my ($lo, $hi) = (hex($1), hex($2));
@@ -468,13 +586,12 @@ sub charblock {
     my $code = _getcode($arg);
 
     if (defined $code) {
-	_search(\@BLOCKS, 0, $#BLOCKS, $code);
-    } else {
-	if (exists $BLOCKS{$arg}) {
-	    return dclone $BLOCKS{$arg};
-	} else {
-	    return;
-	}
+	my $result = _search(\@BLOCKS, 0, $#BLOCKS, $code);
+        return $result if defined $result;
+        return 'No_Block';
+    }
+    elsif (exists $BLOCKS{$arg}) {
+        return dclone $BLOCKS{$arg};
     }
 }
 
@@ -490,14 +607,14 @@ sub charblock {
 
 With a L</code point argument> charscript() returns the I<script> the
 code point belongs to, e.g.  C<Latin>, C<Greek>, C<Han>.
-If the code point is unassigned, it returns B<undef>
+If the code point is unassigned, it returns C<"Unknown">.
 
 If supplied with an argument that can't be a code point, charscript() tries
-to do the opposite and interpret the argument as a code point script. The
-return value is a I<range>: an anonymous list of lists that contain
+to do the opposite and interpret the argument as a script name. The
+return value is a I<range set>: an anonymous list of lists that contain
 I<start-of-range>, I<end-of-range> code point pairs. You can test whether a
-code point is in a range using the L</charinrange()> function. If the
-argument is not a known code point script, B<undef> is returned.
+code point is in a range set using the L</charinrange()> function. If the
+argument is not a known script, C<undef> is returned.
 
 See also L</Blocks versus Scripts>.
 
@@ -507,22 +624,10 @@ my @SCRIPTS;
 my %SCRIPTS;
 
 sub _charscripts {
-    unless (@SCRIPTS) {
-	if (openunicode(\$SCRIPTSFH, "Scripts.txt")) {
-	    local $_;
-	    while (<$SCRIPTSFH>) {
-		if (/^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s+;\s+(\w+)/) {
-		    my ($lo, $hi) = (hex($1), $2 ? hex($2) : hex($1));
-		    my $script = lc($3);
-		    $script =~ s/\b(\w)/uc($1)/ge;
-		    my $subrange = [ $lo, $hi, $script ];
-		    push @SCRIPTS, $subrange;
-		    push @{$SCRIPTS{$script}}, $subrange;
-		}
-	    }
-	    close($SCRIPTSFH);
-	    @SCRIPTS = sort { $a->[0] <=> $b->[0] } @SCRIPTS;
-	}
+    @SCRIPTS =_read_table("To/Sc.pl") unless @SCRIPTS;
+    foreach my $entry (@SCRIPTS) {
+        $entry->[2] =~ s/(_\w)/\L$1/g;  # Preserve old-style casing
+        push @{$SCRIPTS{$entry->[2]}}, $entry;
     }
 }
 
@@ -534,14 +639,14 @@ sub charscript {
     my $code = _getcode($arg);
 
     if (defined $code) {
-	_search(\@SCRIPTS, 0, $#SCRIPTS, $code);
-    } else {
-	if (exists $SCRIPTS{$arg}) {
-	    return dclone $SCRIPTS{$arg};
-	} else {
-	    return;
-	}
+	my $result = _search(\@SCRIPTS, 0, $#SCRIPTS, $code);
+        return $result if defined $result;
+        return $utf8::SwashInfo{'ToSc'}{'missing'};
+    } elsif (exists $SCRIPTS{$arg}) {
+        return dclone $SCRIPTS{$arg};
     }
+
+    return;
 }
 
 =head2 B<charblocks()>
@@ -553,6 +658,12 @@ sub charscript {
 charblocks() returns a reference to a hash with the known block names
 as the keys, and the code point ranges (see L</charblock()>) as the values.
 
+The names are in the old-style (see L</Old-style versus new-style block
+names>).
+
+L<prop_invmap("block")|/prop_invmap()> can be used to get this same data in a
+different type of data structure.
+
 See also L</Blocks versus Scripts>.
 
 =cut
@@ -572,6 +683,9 @@ charscripts() returns a reference to a hash with the known script
 names as the keys, and the code point ranges (see L</charscript()>) as
 the values.
 
+L<prop_invmap("script")|/prop_invmap()> can be used to get this same data in a
+different type of data structure.
+
 See also L</Blocks versus Scripts>.
 
 =cut
@@ -583,7 +697,7 @@ sub charscripts {
 
 =head2 B<charinrange()>
 
-In addition to using the C<\p{In...}> and C<\P{In...}> constructs, you
+In addition to using the C<\p{Blk=...}> and C<\P{Blk=...}> constructs, you
 can also test whether a code point is in the I<range> as returned by
 L</charblock()> and L</charscript()> or as the values of the hash returned
 by L</charblocks()> and L</charscripts()> by using charinrange():
@@ -655,6 +769,9 @@ from the long names to the short names.  The general category is the
 one returned from
 L</charinfo()> under the C<category> key.
 
+The L</prop_value_aliases()> function can be used to get all the synonyms of
+the category name.
+
 =cut
 
 my %BIDI_TYPES =
@@ -697,6 +814,9 @@ the Unicode TR9 is recommended reading:
 L<http://www.unicode.org/reports/tr9/>
 (as of Unicode 5.0.0)
 
+The L</prop_value_aliases()> function can be used to get all the synonyms of
+the bidi type name.
+
 =cut
 
 sub bidi_types {
@@ -709,45 +829,43 @@ sub bidi_types {
 
     my $compexcl = compexcl(0x09dc);
 
-This returns B<true> if the
-L</code point argument> should not be produced by composition normalization,
-B<AND> if that fact is not otherwise determinable from the Unicode data base.
-It currently does not return B<true> if the code point has a decomposition
+This routine is included for backwards compatibility, but as of Perl 5.12, for
+most purposes it is probably more convenient to use one of the following
+instead:
+
+    my $compexcl = chr(0x09dc) =~ /\p{Comp_Ex};
+    my $compexcl = chr(0x09dc) =~ /\p{Full_Composition_Exclusion};
+
+or even
+
+    my $compexcl = chr(0x09dc) =~ /\p{CE};
+    my $compexcl = chr(0x09dc) =~ /\p{Composition_Exclusion};
+
+The first two forms return B<true> if the L</code point argument> should not
+be produced by composition normalization.  For the final two forms to return
+B<true>, it is additionally required that this fact not otherwise be
+determinable from the Unicode data base.
+
+This routine behaves identically to the final two forms.  That is,
+it does not return B<true> if the code point has a decomposition
 consisting of another single code point, nor if its decomposition starts
 with a code point whose combining class is non-zero.  Code points that meet
 either of these conditions should also not be produced by composition
-normalization.
+normalization, which is probably why you should use the
+C<Full_Composition_Exclusion> property instead, as shown above.
 
-It returns B<false> otherwise.
+The routine returns B<false> otherwise.
 
 =cut
 
-my %COMPEXCL;
-
-sub _compexcl {
-    unless (%COMPEXCL) {
-	if (openunicode(\$COMPEXCLFH, "CompositionExclusions.txt")) {
-	    local $_;
-	    while (<$COMPEXCLFH>) {
-		if (/^([0-9A-F]+)\s+\#\s+/) {
-		    my $code = hex($1);
-		    $COMPEXCL{$code} = undef;
-		}
-	    }
-	    close($COMPEXCLFH);
-	}
-    }
-}
-
 sub compexcl {
     my $arg  = shift;
     my $code = _getcode($arg);
     croak __PACKAGE__, "::compexcl: unknown code '$arg'"
 	unless defined $code;
 
-    _compexcl() unless %COMPEXCL;
-
-    return exists $COMPEXCL{$code};
+    no warnings "non_unicode";     # So works on non-Unicode code points
+    return chr($code) =~ /\p{Composition_Exclusion}/;
 }
 
 =head2 B<casefold()>
@@ -772,9 +890,11 @@ sub compexcl {
     }
 
 This returns the (almost) locale-independent case folding of the
-character specified by the L</code point argument>.
+character specified by the L</code point argument>.  (Starting in Perl v5.16,
+the core function C<fc()> returns the C<full> mapping (described below)
+faster than this does, and for entire strings.)
 
-If there is no case folding for that code point, B<undef> is returned.
+If there is no case folding for the input code point, C<undef> is returned.
 
 If there is a case folding for that code point, a reference to a hash
 with the following fields is returned:
@@ -788,7 +908,7 @@ added if necessary to make it contain at least four hexdigits
 
 =item B<full>
 
-one or more codes (separated by spaces) that taken in order give the
+one or more codes (separated by spaces) that, taken in order, give the
 code points for the case folding for I<code>.
 Each has at least four hexdigits.
 
@@ -812,25 +932,25 @@ I<code>.  It is defined primarily for backwards compatibility.
 is C<C> (for C<common>) if the best possible fold is a single code point
 (I<simple> equals I<full> equals I<mapping>).  It is C<S> if there are distinct
 folds, I<simple> and I<full> (I<mapping> equals I<simple>).  And it is C<F> if
-there only a I<full> fold (I<mapping> equals I<full>; I<simple> is empty).  Note
-that this
+there is only a I<full> fold (I<mapping> equals I<full>; I<simple> is empty).
+Note that this
 describes the contents of I<mapping>.  It is defined primarily for backwards
 compatibility.
 
-On versions 3.1 and earlier of Unicode, I<status> can also be
+For Unicode versions between 3.1 and 3.1.1 inclusive, I<status> can also be
 C<I> which is the same as C<C> but is a special case for dotted uppercase I and
 dotless lowercase i:
 
 =over
 
-=item B<*>
+=item B<*> If you use this C<I> mapping
 
-If you use this C<I> mapping, the result is case-insensitive,
+the result is case-insensitive,
 but dotless and dotted I's are not distinguished
 
-=item B<*>
+=item B<*> If you exclude this C<I> mapping
 
-If you exclude this C<I> mapping, the result is not fully case-insensitive, but
+the result is not fully case-insensitive, but
 dotless and dotted I's are distinguished
 
 =back
@@ -840,13 +960,14 @@ dotless and dotted I's are distinguished
 contains any special folding for Turkic languages.  For versions of Unicode
 starting with 3.2, this field is empty unless I<code> has a different folding
 in Turkic languages, in which case it is one or more codes (separated by
-spaces) that taken in order give the code points for the case folding for
+spaces) that, taken in order, give the code points for the case folding for
 I<code> in those languages.
 Each code has at least four hexdigits.
 Note that this folding does not maintain canonical equivalence without
 additional processing.
 
-For versions of Unicode 3.1 and earlier, this field is empty unless there is a
+For Unicode versions between 3.1 and 3.1.1 inclusive, this field is empty unless
+there is a
 special folding for Turkic languages, in which case I<status> is C<I>, and
 I<mapping>, I<full>, I<simple>, and I<turkic> are all equal.  
 
@@ -879,6 +1000,7 @@ sub _casefold {
     unless (%CASEFOLD) {
 	if (openunicode(\$CASEFOLDFH, "CaseFolding.txt")) {
 	    local $_;
+	    local $/ = "\n";
 	    while (<$CASEFOLDFH>) {
 		if (/^([0-9A-F]+); ([CFIST]); ([0-9A-F]+(?: [0-9A-F]+)*);/) {
 		    my $code = hex($1);
@@ -949,7 +1071,7 @@ Unicode case mappings as returned by L</charinfo()> never are).
 
 If there are no case mappings for the L</code point argument>, or if all three
 possible mappings (I<lower>, I<title> and I<upper>) result in single code
-points and are locale independent and unconditional, B<undef> is returned
+points and are locale independent and unconditional, C<undef> is returned
 (which means that the case mappings, if any, for the code point are those
 returned by L</charinfo()>).
 
@@ -968,26 +1090,26 @@ added if necessary to make it contain at least four hexdigits
 
 =item B<lower>
 
-one or more codes (separated by spaces) that taken in order give the
+one or more codes (separated by spaces) that, taken in order, give the
 code points for the lower case of I<code>.
 Each has at least four hexdigits.
 
 =item B<title>
 
-one or more codes (separated by spaces) that taken in order give the
+one or more codes (separated by spaces) that, taken in order, give the
 code points for the title case of I<code>.
 Each has at least four hexdigits.
 
-=item B<lower>
+=item B<upper>
 
-one or more codes (separated by spaces) that taken in order give the
+one or more codes (separated by spaces) that, taken in order, give the
 code points for the upper case of I<code>.
 Each has at least four hexdigits.
 
 =item B<condition>
 
 the conditions for the mappings to be valid.
-If B<undef>, the mappings are always valid.
+If C<undef>, the mappings are always valid.
 When defined, this field is a list of conditions,
 all of which must be true for the mappings to be valid.
 The list consists of one or more
@@ -1007,7 +1129,7 @@ These are for context-sensitive casing.
 =back
 
 The hash described above is returned for locale-independent casing, where
-at least one of the mappings has length longer than one.  If B<undef> is 
+at least one of the mappings has length longer than one.  If C<undef> is
 returned, the code point may have mappings, but if so, all are length one,
 and are returned by L</charinfo()>.
 Note that when this function does return a value, it will be for the complete
@@ -1019,7 +1141,7 @@ will be its locale name, defined as a 2-letter ISO 3166 country code, possibly
 followed by a "_" and a 2-letter ISO language code (possibly followed by a "_"
 and a variant code).  You can find the lists of all possible locales, see
 L<Locale::Country> and L<Locale::Language>.
-(In Unicode 5.1, the only locales returned by this function
+(In Unicode 6.0, the only locales returned by this function
 are C<lt>, C<tr>, and C<az>.)
 
 Each locale key is a reference to a hash that has the form above, and gives
@@ -1041,6 +1163,7 @@ sub _casespec {
     unless (%CASESPEC) {
 	if (openunicode(\$CASESPECFH, "SpecialCasing.txt")) {
 	    local $_;
+	    local $/ = "\n";
 	    while (<$CASESPECFH>) {
 		if (/^([0-9A-F]+); ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; (\w+(?: \w+)*)?/) {
 		    my ($hexcode, $lower, $title, $upper, $condition) =
@@ -1111,31 +1234,39 @@ sub casespec {
     my %namedseq = namedseq();
 
 If used with a single argument in a scalar context, returns the string
-consisting of the code points of the named sequence, or B<undef> if no
+consisting of the code points of the named sequence, or C<undef> if no
 named sequence by that name exists.  If used with a single argument in
 a list context, it returns the list of the ordinals of the code points.  If used
 with no
 arguments in a list context, returns a hash with the names of the
 named sequences as the keys and the named sequences as strings as
-the values.  Otherwise, it returns B<undef> or an empty list depending
+the values.  Otherwise, it returns C<undef> or an empty list depending
 on the context.
 
 This function only operates on officially approved (not provisional) named
 sequences.
 
+Note that as of Perl 5.14, C<\N{KATAKANA LETTER AINU P}> will insert the named
+sequence into double-quoted strings, and C<charnames::string_vianame("KATAKANA
+LETTER AINU P")> will return the same string this function does, but will also
+operate on character names that aren't named sequences, without you having to
+know which are which.  See L<charnames>.
+
 =cut
 
 my %NAMEDSEQ;
 
 sub _namedseq {
     unless (%NAMEDSEQ) {
-	if (openunicode(\$NAMEDSEQFH, "NamedSequences.txt")) {
+	if (openunicode(\$NAMEDSEQFH, "Name.pl")) {
 	    local $_;
+	    local $/ = "\n";
 	    while (<$NAMEDSEQFH>) {
-		if (/^(.+)\s*;\s*([0-9A-F]+(?: [0-9A-F]+)*)$/) {
-		    my ($n, $s) = ($1, $2);
-		    my @s = map { chr(hex($_)) } split(' ', $s);
-		    $NAMEDSEQ{$n} = join("", @s);
+		if (/^ [0-9A-F]+ \  /x) {
+                    chomp;
+                    my ($sequence, $name) = split /\t/;
+		    my @s = map { chr(hex($_)) } split(' ', $sequence);
+		    $NAMEDSEQ{$name} = join("", @s);
 		}
 	    }
 	    close($NAMEDSEQFH);
@@ -1144,23 +1275,2046 @@ sub _namedseq {
 }
 
 sub namedseq {
-    _namedseq() unless %NAMEDSEQ;
+
+    # Use charnames::string_vianame() which now returns this information,
+    # unless the caller wants the hash returned, in which case we read it in,
+    # and thereafter use it instead of calling charnames, as it is faster.
+
     my $wantarray = wantarray();
     if (defined $wantarray) {
 	if ($wantarray) {
 	    if (@_ == 0) {
+                _namedseq() unless %NAMEDSEQ;
 		return %NAMEDSEQ;
 	    } elsif (@_ == 1) {
-		my $s = $NAMEDSEQ{ $_[0] };
+		my $s;
+                if (%NAMEDSEQ) {
+                    $s = $NAMEDSEQ{ $_[0] };
+                }
+                else {
+                    $s = charnames::string_vianame($_[0]);
+                }
 		return defined $s ? map { ord($_) } split('', $s) : ();
 	    }
 	} elsif (@_ == 1) {
-	    return $NAMEDSEQ{ $_[0] };
+            return $NAMEDSEQ{ $_[0] } if %NAMEDSEQ;
+            return charnames::string_vianame($_[0]);
 	}
     }
     return;
 }
 
+my %NUMERIC;
+
+sub _numeric {
+
+    # Unicode 6.0 instituted the rule that only digits in a consecutive
+    # block of 10 would be considered decimal digits.  Before that, the only
+    # problematic code point that I'm (khw) aware of is U+019DA, NEW TAI LUE
+    # THAM DIGIT ONE, which is an alternate form of U+019D1, NEW TAI LUE DIGIT
+    # ONE.  The code could be modified to handle that, but not bothering, as
+    # in TUS 6.0, U+19DA was changed to Nt=Di.
+    if ((pack "C*", split /\./, UnicodeVersion()) lt 6.0.0) {
+	croak __PACKAGE__, "::num requires Unicode 6.0 or greater"
+    }
+    my @numbers = _read_table("To/Nv.pl");
+    foreach my $entry (@numbers) {
+        my ($start, $end, $value) = @$entry;
+
+        # If value contains a slash, convert to decimal, add a reverse hash
+        # used by charinfo.
+        if ((my @rational = split /\//, $value) == 2) {
+            my $real = $rational[0] / $rational[1];
+            $real_to_rational{$real} = $value;
+            $value = $real;
+
+            # Should only be single element, but just in case...
+            for my $i ($start .. $end) {
+                $NUMERIC{$i} = $value;
+            }
+        }
+        else {
+            # The values require adjusting, as is in 'a' format
+            for my $i ($start .. $end) {
+                $NUMERIC{$i} = $value + $i - $start;
+            }
+        }
+    }
+
+    # Decided unsafe to use these that aren't officially part of the Unicode
+    # standard.
+    #use Math::Trig;
+    #my $pi = acos(-1.0);
+    #$NUMERIC{0x03C0} = $pi;
+
+    # Euler's constant, not to be confused with Euler's number
+    #$NUMERIC{0x2107} = 0.57721566490153286060651209008240243104215933593992;
+
+    # Euler's number
+    #$NUMERIC{0x212F} = 2.7182818284590452353602874713526624977572;
+
+    return;
+}
+
+=pod
+
+=head2 B<num()>
+
+    use Unicode::UCD 'num';
+
+    my $val = num("123");
+    my $one_quarter = num("\N{VULGAR FRACTION 1/4}");
+
+C<num> returns the numeric value of the input Unicode string; or C<undef> if it
+doesn't think the entire string has a completely valid, safe numeric value.
+
+If the string is just one character in length, the Unicode numeric value
+is returned if it has one, or C<undef> otherwise.  Note that this need
+not be a whole number.  C<num("\N{TIBETAN DIGIT HALF ZERO}")>, for
+example returns -0.5.
+
+=cut
+
+#A few characters to which Unicode doesn't officially
+#assign a numeric value are considered numeric by C<num>.
+#These are:
+
+# EULER CONSTANT             0.5772...  (this is NOT Euler's number)
+# SCRIPT SMALL E             2.71828... (this IS Euler's number)
+# GREEK SMALL LETTER PI      3.14159...
+
+=pod
+
+If the string is more than one character, C<undef> is returned unless
+all its characters are decimal digits (that is, they would match C<\d+>),
+from the same script.  For example if you have an ASCII '0' and a Bengali
+'3', mixed together, they aren't considered a valid number, and C<undef>
+is returned.  A further restriction is that the digits all have to be of
+the same form.  A half-width digit mixed with a full-width one will
+return C<undef>.  The Arabic script has two sets of digits;  C<num> will
+return C<undef> unless all the digits in the string come from the same
+set.
+
+C<num> errs on the side of safety, and there may be valid strings of
+decimal digits that it doesn't recognize.  Note that Unicode defines
+a number of "digit" characters that aren't "decimal digit" characters.
+"Decimal digits" have the property that they have a positional value, i.e.,
+there is a units position, a 10's position, a 100's, etc, AND they are
+arranged in Unicode in blocks of 10 contiguous code points.  The Chinese
+digits, for example, are not in such a contiguous block, and so Unicode
+doesn't view them as decimal digits, but merely digits, and so C<\d> will not
+match them.  A single-character string containing one of these digits will
+have its decimal value returned by C<num>, but any longer string containing
+only these digits will return C<undef>.
+
+Strings of multiple sub- and superscripts are not recognized as numbers.  You
+can use either of the compatibility decompositions in Unicode::Normalize to
+change these into digits, and then call C<num> on the result.
+
+=cut
+
+# To handle sub, superscripts, this could if called in list context,
+# consider those, and return the <decomposition> type in the second
+# array element.
+
+sub num {
+    my $string = $_[0];
+
+    _numeric unless %NUMERIC;
+
+    my $length = length($string);
+    return $NUMERIC{ord($string)} if $length == 1;
+    return if $string =~ /\D/;
+    my $first_ord = ord(substr($string, 0, 1));
+    my $value = $NUMERIC{$first_ord};
+    my $zero_ord = $first_ord - $value;
+
+    for my $i (1 .. $length -1) {
+        my $ord = ord(substr($string, $i, 1));
+        my $digit = $ord - $zero_ord;
+        return unless $digit >= 0 && $digit <= 9;
+        $value = $value * 10 + $digit;
+    }
+    return $value;
+}
+
+=pod
+
+=head2 B<prop_aliases()>
+
+    use Unicode::UCD 'prop_aliases';
+
+    my ($short_name, $full_name, @other_names) = prop_aliases("space");
+    my $same_full_name = prop_aliases("Space");     # Scalar context
+    my ($same_short_name) = prop_aliases("Space");  # gets 0th element
+    print "The full name is $full_name\n";
+    print "The short name is $short_name\n";
+    print "The other aliases are: ", join(", ", @other_names), "\n";
+
+    prints:
+    The full name is White_Space
+    The short name is WSpace
+    The other aliases are: Space
+
+Most Unicode properties have several synonymous names.  Typically, there is at
+least a short name, convenient to type, and a long name that more fully
+describes the property, and hence is more easily understood.
+
+If you know one name for a Unicode property, you can use C<prop_aliases> to find
+either the long name (when called in scalar context), or a list of all of the
+names, somewhat ordered so that the short name is in the 0th element, the long
+name in the next element, and any other synonyms are in the remaining
+elements, in no particular order.
+
+The long name is returned in a form nicely capitalized, suitable for printing.
+
+The input parameter name is loosely matched, which means that white space,
+hyphens, and underscores are ignored (except for the trailing underscore in
+the old_form grandfathered-in C<"L_">, which is better written as C<"LC">, and
+both of which mean C<General_Category=Cased Letter>).
+
+If the name is unknown, C<undef> is returned (or an empty list in list
+context).  Note that Perl typically recognizes property names in regular
+expressions with an optional C<"Is_>" (with or without the underscore)
+prefixed to them, such as C<\p{isgc=punct}>.  This function does not recognize
+those in the input, returning C<undef>.  Nor are they included in the output
+as possible synonyms.
+
+C<prop_aliases> does know about the Perl extensions to Unicode properties,
+such as C<Any> and C<XPosixAlpha>, and the single form equivalents to Unicode
+properties such as C<XDigit>, C<Greek>, C<In_Greek>, and C<Is_Greek>.  The
+final example demonstrates that the C<"Is_"> prefix is recognized for these
+extensions; it is needed to resolve ambiguities.  For example,
+C<prop_aliases('lc')> returns the list C<(lc, Lowercase_Mapping)>, but
+C<prop_aliases('islc')> returns C<(Is_LC, Cased_Letter)>.  This is
+because C<islc> is a Perl extension which is short for
+C<General_Category=Cased Letter>.  The lists returned for the Perl extensions
+will not include the C<"Is_"> prefix (whether or not the input had it) unless
+needed to resolve ambiguities, as shown in the C<"islc"> example, where the
+returned list had one element containing C<"Is_">, and the other without.
+
+It is also possible for the reverse to happen:  C<prop_aliases('isc')> returns
+the list C<(isc, ISO_Comment)>; whereas C<prop_aliases('c')> returns
+C<(C, Other)> (the latter being a Perl extension meaning
+C<General_Category=Other>.
+L<perluniprops/Properties accessible through Unicode::UCD> lists the available
+forms, including which ones are discouraged from use.
+
+Those discouraged forms are accepted as input to C<prop_aliases>, but are not
+returned in the lists.  C<prop_aliases('isL&')> and C<prop_aliases('isL_')>,
+which are old synonyms for C<"Is_LC"> and should not be used in new code, are
+examples of this.  These both return C<(Is_LC, Cased_Letter)>.  Thus this
+function allows you to take a discourarged form, and find its acceptable
+alternatives.  The same goes with single-form Block property equivalences.
+Only the forms that begin with C<"In_"> are not discouraged; if you pass
+C<prop_aliases> a discouraged form, you will get back the equivalent ones that
+begin with C<"In_">.  It will otherwise look like a new-style block name (see.
+L</Old-style versus new-style block names>).
+
+C<prop_aliases> does not know about any user-defined properties, and will
+return C<undef> if called with one of those.  Likewise for Perl internal
+properties, with the exception of "Perl_Decimal_Digit" which it does know
+about (and which is documented below in L</prop_invmap()>).
+
+=cut
+
+# It may be that there are use cases where the discouraged forms should be
+# returned.  If that comes up, an optional boolean second parameter to the
+# function could be created, for example.
+
+# These are created by mktables for this routine and stored in unicore/UCD.pl
+# where their structures are described.
+our %string_property_loose_to_name;
+our %ambiguous_names;
+our %loose_perlprop_to_name;
+our %prop_aliases;
+
+sub prop_aliases ($) {
+    my $prop = $_[0];
+    return unless defined $prop;
+
+    require "unicore/UCD.pl";
+    require "unicore/Heavy.pl";
+    require "utf8_heavy.pl";
+
+    # The property name may be loosely or strictly matched; we don't know yet.
+    # But both types use lower-case.
+    $prop = lc $prop;
+
+    # It is loosely matched if its lower case isn't known to be strict.
+    my $list_ref;
+    if (! exists $utf8::stricter_to_file_of{$prop}) {
+        my $loose = utf8::_loose_name($prop);
+
+        # There is a hash that converts from any loose name to its standard
+        # form, mapping all synonyms for a  name to one name that can be used
+        # as a key into another hash.  The whole concept is for memory
+        # savings, as the second hash doesn't have to have all the
+        # combinations.  Actually, there are two hashes that do the
+        # converstion.  One is used in utf8_heavy.pl (stored in Heavy.pl) for
+        # looking up properties matchable in regexes.  This function needs to
+        # access string properties, which aren't available in regexes, so a
+        # second conversion hash is made for them (stored in UCD.pl).  Look in
+        # the string one now, as the rest can have an optional 'is' prefix,
+        # which these don't.
+        if (exists $string_property_loose_to_name{$loose}) {
+
+            # Convert to its standard loose name.
+            $prop = $string_property_loose_to_name{$loose};
+        }
+        else {
+            my $retrying = 0;   # bool.  ? Has an initial 'is' been stripped
+        RETRY:
+            if (exists $utf8::loose_property_name_of{$loose}
+                && (! $retrying
+                    || ! exists $ambiguous_names{$loose}))
+            {
+                # Found an entry giving the standard form.  We don't get here
+                # (in the test above) when we've stripped off an
+                # 'is' and the result is an ambiguous name.  That is because
+                # these are official Unicode properties (though Perl can have
+                # an optional 'is' prefix meaning the official property), and
+                # all ambiguous cases involve a Perl single-form extension
+                # for the gc, script, or block properties, and the stripped
+                # 'is' means that they mean one of those, and not one of
+                # these
+                $prop = $utf8::loose_property_name_of{$loose};
+            }
+            elsif (exists $loose_perlprop_to_name{$loose}) {
+
+                # This hash is specifically for this function to list Perl
+                # extensions that aren't in the earlier hashes.  If there is
+                # only one element, the short and long names are identical.
+                # Otherwise the form is already in the same form as
+                # %prop_aliases, which is handled at the end of the function.
+                $list_ref = $loose_perlprop_to_name{$loose};
+                if (@$list_ref == 1) {
+                    my @list = ($list_ref->[0], $list_ref->[0]);
+                    $list_ref = \@list;
+                }
+            }
+            elsif (! exists $utf8::loose_to_file_of{$loose}) {
+
+                # loose_to_file_of is a complete list of loose names.  If not
+                # there, the input is unknown.
+                return;
+            }
+            else {
+
+                # Here we found the name but not its aliases, so it has to
+                # exist.  This means it must be one of the Perl single-form
+                # extensions.  First see if it is for a property-value
+                # combination in one of the following properties.
+                my @list;
+                foreach my $property ("gc", "script") {
+                    @list = prop_value_aliases($property, $loose);
+                    last if @list;
+                }
+                if (@list) {
+
+                    # Here, it is one of those property-value combination
+                    # single-form synonyms.  There are ambiguities with some
+                    # of these.  Check against the list for these, and adjust
+                    # if necessary.
+                    for my $i (0 .. @list -1) {
+                        if (exists $ambiguous_names
+                                   {utf8::_loose_name(lc $list[$i])})
+                        {
+                            # The ambiguity is resolved by toggling whether or
+                            # not it has an 'is' prefix
+                            $list[$i] =~ s/^Is_// or $list[$i] =~ s/^/Is_/;
+                        }
+                    }
+                    return @list;
+                }
+
+                # Here, it wasn't one of the gc or script single-form
+                # extensions.  It could be a block property single-form
+                # extension.  An 'in' prefix definitely means that, and should
+                # be looked up without the prefix.  However, starting in
+                # Unicode 6.1, we have to special case 'indic...', as there
+                # is a property that begins with that name.   We shouldn't
+                # strip the 'in' from that.   I'm (khw) generalizing this to
+                # 'indic' instead of the single property, because I suspect
+                # that others of this class may come along in the future.
+                # However, this could backfire and a block created whose name
+                # begins with 'dic...', and we would want to strip the 'in'.
+                # At which point this would have to be tweaked.
+                my $began_with_in = $loose =~ s/^in(?!dic)//;
+                @list = prop_value_aliases("block", $loose);
+                if (@list) {
+                    map { $_ =~ s/^/In_/ } @list;
+                    return @list;
+                }
+
+                # Here still haven't found it.  The last opportunity for it
+                # being valid is only if it began with 'is'.  We retry without
+                # the 'is', setting a flag to that effect so that we don't
+                # accept things that begin with 'isis...'
+                if (! $retrying && ! $began_with_in && $loose =~ s/^is//) {
+                    $retrying = 1;
+                    goto RETRY;
+                }
+
+                # Here, didn't find it.  Since it was in %loose_to_file_of, we
+                # should have been able to find it.
+                carp __PACKAGE__, "::prop_aliases: Unexpectedly could not find '$prop'.  Send bug report to perlbug\@perl.org";
+                return;
+            }
+        }
+    }
+
+    if (! $list_ref) {
+        # Here, we have set $prop to a standard form name of the input.  Look
+        # it up in the structure created by mktables for this purpose, which
+        # contains both strict and loosely matched properties.  Avoid
+        # autovivifying.
+        $list_ref = $prop_aliases{$prop} if exists $prop_aliases{$prop};
+        return unless $list_ref;
+    }
+
+    # The full name is in element 1.
+    return $list_ref->[1] unless wantarray;
+
+    return @{dclone $list_ref};
+}
+
+=pod
+
+=head2 B<prop_value_aliases()>
+
+    use Unicode::UCD 'prop_value_aliases';
+
+    my ($short_name, $full_name, @other_names)
+                                   = prop_value_aliases("Gc", "Punct");
+    my $same_full_name = prop_value_aliases("Gc", "P");   # Scalar cntxt
+    my ($same_short_name) = prop_value_aliases("Gc", "P"); # gets 0th
+                                                           # element
+    print "The full name is $full_name\n";
+    print "The short name is $short_name\n";
+    print "The other aliases are: ", join(", ", @other_names), "\n";
+
+    prints:
+    The full name is Punctuation
+    The short name is P
+    The other aliases are: Punct
+
+Some Unicode properties have a restricted set of legal values.  For example,
+all binary properties are restricted to just C<true> or C<false>; and there
+are only a few dozen possible General Categories.
+
+For such properties, there are usually several synonyms for each possible
+value.  For example, in binary properties, I<truth> can be represented by any of
+the strings "Y", "Yes", "T", or "True"; and the General Category
+"Punctuation" by that string, or "Punct", or simply "P".
+
+Like property names, there is typically at least a short name for each such
+property-value, and a long name.  If you know any name of the property-value,
+you can use C<prop_value_aliases>() to get the long name (when called in
+scalar context), or a list of all the names, with the short name in the 0th
+element, the long name in the next element, and any other synonyms in the
+remaining elements, in no particular order, except that any all-numeric
+synonyms will be last.
+
+The long name is returned in a form nicely capitalized, suitable for printing.
+
+Case, white space, hyphens, and underscores are ignored in the input parameters
+(except for the trailing underscore in the old-form grandfathered-in general
+category property value C<"L_">, which is better written as C<"LC">).
+
+If either name is unknown, C<undef> is returned.  Note that Perl typically
+recognizes property names in regular expressions with an optional C<"Is_>"
+(with or without the underscore) prefixed to them, such as C<\p{isgc=punct}>.
+This function does not recognize those in the property parameter, returning
+C<undef>.
+
+If called with a property that doesn't have synonyms for its values, it
+returns the input value, possibly normalized with capitalization and
+underscores.
+
+For the block property, new-style block names are returned (see
+L</Old-style versus new-style block names>).
+
+To find the synonyms for single-forms, such as C<\p{Any}>, use
+L</prop_aliases()> instead.
+
+C<prop_value_aliases> does not know about any user-defined properties, and
+will return C<undef> if called with one of those.
+
+=cut
+
+# These are created by mktables for this routine and stored in unicore/UCD.pl
+# where their structures are described.
+our %loose_to_standard_value;
+our %prop_value_aliases;
+
+sub prop_value_aliases ($$) {
+    my ($prop, $value) = @_;
+    return unless defined $prop && defined $value;
+
+    require "unicore/UCD.pl";
+    require "utf8_heavy.pl";
+
+    # Find the property name synonym that's used as the key in other hashes,
+    # which is element 0 in the returned list.
+    ($prop) = prop_aliases($prop);
+    return if ! $prop;
+    $prop = utf8::_loose_name(lc $prop);
+
+    # Here is a legal property, but the hash below (created by mktables for
+    # this purpose) only knows about the properties that have a very finite
+    # number of potential values, that is not ones whose value could be
+    # anything, like most (if not all) string properties.  These don't have
+    # synonyms anyway.  Simply return the input.  For example, there is no
+    # synonym for ('Uppercase_Mapping', A').
+    return $value if ! exists $prop_value_aliases{$prop};
+
+    # The value name may be loosely or strictly matched; we don't know yet.
+    # But both types use lower-case.
+    $value = lc $value;
+
+    # If the name isn't found under loose matching, it certainly won't be
+    # found under strict
+    my $loose_value = utf8::_loose_name($value);
+    return unless exists $loose_to_standard_value{"$prop=$loose_value"};
+
+    # Similarly if the combination under loose matching doesn't exist, it
+    # won't exist under strict.
+    my $standard_value = $loose_to_standard_value{"$prop=$loose_value"};
+    return unless exists $prop_value_aliases{$prop}{$standard_value};
+
+    # Here we did find a combination under loose matching rules.  But it could
+    # be that is a strict property match that shouldn't have matched.
+    # %prop_value_aliases is set up so that the strict matches will appear as
+    # if they were in loose form.  Thus, if the non-loose version is legal,
+    # we're ok, can skip the further check.
+    if (! exists $utf8::stricter_to_file_of{"$prop=$value"}
+
+        # We're also ok and skip the further check if value loosely matches.
+        # mktables has verified that no strict name under loose rules maps to
+        # an existing loose name.  This code relies on the very limited
+        # circumstances that strict names can be here.  Strict name matching
+        # happens under two conditions:
+        # 1) when the name begins with an underscore.  But this function
+        #    doesn't accept those, and %prop_value_aliases doesn't have
+        #    them.
+        # 2) When the values are numeric, in which case we need to look
+        #    further, but their squeezed-out loose values will be in
+        #    %stricter_to_file_of
+        && exists $utf8::stricter_to_file_of{"$prop=$loose_value"})
+    {
+        # The only thing that's legal loosely under strict is that can have an
+        # underscore between digit pairs XXX
+        while ($value =~ s/(\d)_(\d)/$1$2/g) {}
+        return unless exists $utf8::stricter_to_file_of{"$prop=$value"};
+    }
+
+    # Here, we know that the combination exists.  Return it.
+    my $list_ref = $prop_value_aliases{$prop}{$standard_value};
+    if (@$list_ref > 1) {
+        # The full name is in element 1.
+        return $list_ref->[1] unless wantarray;
+
+        return @{dclone $list_ref};
+    }
+
+    return $list_ref->[0] unless wantarray;
+
+    # Only 1 element means that it repeats
+    return ( $list_ref->[0], $list_ref->[0] );
+}
+
+# All 1 bits is the largest possible UV.
+$Unicode::UCD::MAX_CP = ~0;
+
+=pod
+
+=head2 B<prop_invlist()>
+
+C<prop_invlist> returns an inversion list (described below) that defines all the
+code points for the binary Unicode property (or "property=value" pair) given
+by the input parameter string:
+
+ use feature 'say';
+ use Unicode::UCD 'prop_invlist';
+ say join ", ", prop_invlist("Any");
+
+ prints:
+ 0, 1114112
+
+An empty list is returned if the input is unknown; the number of elements in
+the list is returned if called in scalar context.
+
+L<perluniprops|perluniprops/Properties accessible through \p{} and \P{}> gives
+the list of properties that this function accepts, as well as all the possible
+forms for them (including with the optional "Is_" prefixes).  (Except this
+function doesn't accept any Perl-internal properties, some of which are listed
+there.) This function uses the same loose or tighter matching rules for
+resolving the input property's name as is done for regular expressions.  These
+are also specified in L<perluniprops|perluniprops/Properties accessible
+through \p{} and \P{}>.  Examples of using the "property=value" form are:
+
+ say join ", ", prop_invlist("Script=Shavian");
+
+ prints:
+ 66640, 66688
+
+ say join ", ", prop_invlist("ASCII_Hex_Digit=No");
+
+ prints:
+ 0, 48, 58, 65, 71, 97, 103
+
+ say join ", ", prop_invlist("ASCII_Hex_Digit=Yes");
+
+ prints:
+ 48, 58, 65, 71, 97, 103
+
+Inversion lists are a compact way of specifying Unicode property-value
+definitions.  The 0th item in the list is the lowest code point that has the
+property-value.  The next item (item [1]) is the lowest code point beyond that
+one that does NOT have the property-value.  And the next item beyond that
+([2]) is the lowest code point beyond that one that does have the
+property-value, and so on.  Put another way, each element in the list gives
+the beginning of a range that has the property-value (for even numbered
+elements), or doesn't have the property-value (for odd numbered elements).
+The name for this data structure stems from the fact that each element in the
+list toggles (or inverts) whether the corresponding range is or isn't on the
+list.
+
+In the final example above, the first ASCII Hex digit is code point 48, the
+character "0", and all code points from it through 57 (a "9") are ASCII hex
+digits.  Code points 58 through 64 aren't, but 65 (an "A") through 70 (an "F")
+are, as are 97 ("a") through 102 ("f").  103 starts a range of code points
+that aren't ASCII hex digits.  That range extends to infinity, which on your
+computer can be found in the variable C<$Unicode::UCD::MAX_CP>.  (This
+variable is as close to infinity as Perl can get on your platform, and may be
+too high for some operations to work; you may wish to use a smaller number for
+your purposes.)
+
+Note that the inversion lists returned by this function can possibly include
+non-Unicode code points, that is anything above 0x10FFFF.  This is in
+contrast to Perl regular expression matches on those code points, in which a
+non-Unicode code point always fails to match.  For example, both of these have
+the same result:
+
+ chr(0x110000) =~ \p{ASCII_Hex_Digit=True}      # Fails.
+ chr(0x110000) =~ \p{ASCII_Hex_Digit=False}     # Fails!
+
+And both raise a warning that a Unicode property is being used on a
+non-Unicode code point.  It is arguable as to which is the correct thing to do
+here.  This function has chosen the way opposite to the Perl regular
+expression behavior.  This allows you to easily flip to to the Perl regular
+expression way (for you to go in the other direction would be far harder).
+Simply add 0x110000 at the end of the non-empty returned list if it isn't
+already that value; and pop that value if it is; like:
+
+ my @list = prop_invlist("foo");
+ if (@list) {
+     if ($list[-1] == 0x110000) {
+         pop @list;  # Defeat the turning on for above Unicode
+     }
+     else {
+         push @list, 0x110000; # Turn off for above Unicode
+     }
+ }
+
+It is a simple matter to expand out an inversion list to a full list of all
+code points that have the property-value:
+
+ my @invlist = prop_invlist($property_name);
+ die "empty" unless @invlist;
+ my @full_list;
+ for (my $i = 0; $i < @invlist; $i += 2) {
+    my $upper = ($i + 1) < @invlist
+                ? $invlist[$i+1] - 1      # In range
+                : $Unicode::UCD::MAX_CP;  # To infinity.  You may want
+                                          # to stop much much earlier;
+                                          # going this high may expose
+                                          # perl deficiencies with very
+                                          # large numbers.
+    for my $j ($invlist[$i] .. $upper) {
+        push @full_list, $j;
+    }
+ }
+
+C<prop_invlist> does not know about any user-defined nor Perl internal-only
+properties, and will return C<undef> if called with one of those.
+
+=cut
+
+# User-defined properties could be handled with some changes to utf8_heavy.pl;
+# and implementing here of dealing with EXTRAS.  If done, consideration should
+# be given to the fact that the user subroutine could return different results
+# with each call; security issues need to be thought about.
+
+# These are created by mktables for this routine and stored in unicore/UCD.pl
+# where their structures are described.
+our %loose_defaults;
+our $MAX_UNICODE_CODEPOINT;
+
+sub prop_invlist ($) {
+    my $prop = $_[0];
+    return if ! defined $prop;
+
+    require "utf8_heavy.pl";
+
+    # Warnings for these are only for regexes, so not applicable to us
+    no warnings 'deprecated';
+
+    # Get the swash definition of the property-value.
+    my $swash = utf8::SWASHNEW(__PACKAGE__, $prop, undef, 1, 0);
+
+    # Fail if not found, or isn't a boolean property-value, or is a
+    # user-defined property, or is internal-only.
+    return if ! $swash
+              || ref $swash eq ""
+              || $swash->{'BITS'} != 1
+              || $swash->{'USER_DEFINED'}
+              || $prop =~ /^\s*_/;
+
+    if ($swash->{'EXTRAS'}) {
+        carp __PACKAGE__, "::prop_invlist: swash returned for $prop unexpectedly has EXTRAS magic";
+        return;
+    }
+    if ($swash->{'SPECIALS'}) {
+        carp __PACKAGE__, "::prop_invlist: swash returned for $prop unexpectedly has SPECIALS magic";
+        return;
+    }
+
+    my @invlist;
+
+    # The input lines look like:
+    # 0041\t005A   # [26]
+    # 005F
+
+    # Split into lines, stripped of trailing comments
+    foreach my $range (split "\n",
+                            $swash->{'LIST'} =~ s/ \s* (?: \# .* )? $ //xmgr)
+    {
+        # And find the beginning and end of the range on the line
+        my ($hex_begin, $hex_end) = split "\t", $range;
+        my $begin = hex $hex_begin;
+
+        # If the new range merely extends the old, we remove the marker
+        # created the last time through the loop for the old's end, which
+        # causes the new one's end to be used instead.
+        if (@invlist && $begin == $invlist[-1]) {
+            pop @invlist;
+        }
+        else {
+            # Add the beginning of the range
+            push @invlist, $begin;
+        }
+
+        if (defined $hex_end) { # The next item starts with the code point 1
+                                # beyond the end of the range.
+            push @invlist, hex($hex_end) + 1;
+        }
+        else {  # No end of range, is a single code point.
+            push @invlist, $begin + 1;
+        }
+    }
+
+    require "unicore/UCD.pl";
+    my $FIRST_NON_UNICODE = $MAX_UNICODE_CODEPOINT + 1;
+
+    # Could need to be inverted: add or subtract a 0 at the beginning of the
+    # list.  And to keep it from matching non-Unicode, add or subtract the
+    # first non-unicode code point.
+    if ($swash->{'INVERT_IT'}) {
+        if (@invlist && $invlist[0] == 0) {
+            shift @invlist;
+        }
+        else {
+            unshift @invlist, 0;
+        }
+        if (@invlist && $invlist[-1] == $FIRST_NON_UNICODE) {
+            pop @invlist;
+        }
+        else {
+            push @invlist, $FIRST_NON_UNICODE;
+        }
+    }
+
+    # Here, the list is set up to include only Unicode code points.  But, if
+    # the table is the default one for the property, it should contain all
+    # non-Unicode code points.  First calculate the loose name for the
+    # property.  This is done even for strict-name properties, as the data
+    # structure that mktables generates for us is set up so that we don't have
+    # to worry about that.  The property-value needs to be split if compound,
+    # as the loose rules need to be independently calculated on each part.  We
+    # know that it is syntactically valid, or SWASHNEW would have failed.
+
+    $prop = lc $prop;
+    my ($prop_only, $table) = split /\s*[:=]\s*/, $prop;
+    if ($table) {
+
+        # May have optional prefixed 'is'
+        $prop = utf8::_loose_name($prop_only) =~ s/^is//r;
+        $prop = $utf8::loose_property_name_of{$prop};
+        $prop .= "=" . utf8::_loose_name($table);
+    }
+    else {
+        $prop = utf8::_loose_name($prop);
+    }
+    if (exists $loose_defaults{$prop}) {
+
+        # Here, is the default table.  If a range ended with 10ffff, instead
+        # continue that range to infinity, by popping the 110000; otherwise,
+        # add the range from 11000 to infinity
+        if (! @invlist || $invlist[-1] != $FIRST_NON_UNICODE) {
+            push @invlist, $FIRST_NON_UNICODE;
+        }
+        else {
+            pop @invlist;
+        }
+    }
+
+    return @invlist;
+}
+
+sub _search_invlist {
+    # Find the range in the inversion list which contains a code point; that
+    # is, find i such that l[i] <= code_point < l[i+1]
+
+    # If this is ever made public, could use to speed up .t specials.  Would
+    # need to use code point argument, as in other functions in this pm
+
+    my $list_ref = shift;
+    my $code_point = shift;
+    # Verify non-neg numeric  XXX
+
+    my $max_element = @$list_ref - 1;
+    return if ! $max_element < 0;     # Undef if list is empty.
+
+    # Short cut something at the far-end of the table.  This also allows us to
+    # refer to element [$i+1] without fear of being out-of-bounds in the loop
+    # below.
+    return $max_element if $code_point >= $list_ref->[$max_element];
+
+    use integer;        # want integer division
+
+    my $i = $max_element / 2;
+
+    my $lower = 0;
+    my $upper = $max_element;
+    while (1) {
+
+        if ($code_point >= $list_ref->[$i]) {
+
+            # Here we have met the lower constraint.  We can quit if we
+            # also meet the upper one.
+            last if $code_point < $list_ref->[$i+1];
+
+            $lower = $i;        # Still too low.
+
+        }
+        else {
+
+            # Here, $code_point < $list_ref[$i], so look lower down.
+            $upper = $i;
+        }
+
+        # Split search domain in half to try again.
+        my $temp = ($upper + $lower) / 2;
+
+        # No point in continuing unless $i changes for next time
+        # in the loop.
+        return $i if $temp == $i;
+        $i = $temp;
+    } # End of while loop
+
+    # Here we have found the offset
+    return $i;
+}
+
+=pod
+
+=head2 B<prop_invmap()>
+
+ use Unicode::UCD 'prop_invmap';
+ my ($list_ref, $map_ref, $format, $missing)
+                                      = prop_invmap("General Category");
+
+C<prop_invmap> is used to get the complete mapping definition for a property,
+in the form of an inversion map.  An inversion map consists of two parallel
+arrays.  One is an ordered list of code points that mark range beginnings, and
+the other gives the value (or mapping) that all code points in the
+corresponding range have.
+
+C<prop_invmap> is called with the name of the desired property.  The name is
+loosely matched, meaning that differences in case, white-space, hyphens, and
+underscores are not meaningful (except for the trailing underscore in the
+old-form grandfathered-in property C<"L_">, which is better written as C<"LC">,
+or even better, C<"Gc=LC">).
+
+Many Unicode properties have more than one name (or alias).  C<prop_invmap>
+understands all of these, including Perl extensions to them.  Ambiguities are
+resolved as described above for L</prop_aliases()>.  The Perl internal
+property "Perl_Decimal_Digit, described below, is also accepted.  C<undef> is
+returned if the property name is unknown.
+See L<perluniprops/Properties accessible through Unicode::UCD> for the
+properties acceptable as inputs to this function.
+
+It is a fatal error to call this function except in list context.
+
+In addition to the the two arrays that form the inversion map, C<prop_invmap>
+returns two other values; one is a scalar that gives some details as to the
+format of the entries of the map array; the other is used for specialized
+purposes, described at the end of this section.
+
+This means that C<prop_invmap> returns a 4 element list.  For example,
+
+ my ($blocks_ranges_ref, $blocks_maps_ref, $format, $default)
+                                                 = prop_invmap("Block");
+
+In this call, the two arrays will be populated as shown below (for Unicode
+6.0):
+
+ Index  @blocks_ranges  @blocks_maps
+   0        0x0000      Basic Latin
+   1        0x0080      Latin-1 Supplement
+   2        0x0100      Latin Extended-A
+   3        0x0180      Latin Extended-B
+   4        0x0250      IPA Extensions
+   5        0x02B0      Spacing Modifier Letters
+   6        0x0300      Combining Diacritical Marks
+   7        0x0370      Greek and Coptic
+   8        0x0400      Cyrillic
+  ...
+ 233        0x2B820     No_Block
+ 234        0x2F800     CJK Compatibility Ideographs Supplement
+ 235        0x2FA20     No_Block
+ 236        0xE0000     Tags
+ 237        0xE0080     No_Block
+ 238        0xE0100     Variation Selectors Supplement
+ 239        0xE01F0     No_Block
+ 240        0xF0000     Supplementary Private Use Area-A
+ 241        0x100000    Supplementary Private Use Area-B
+ 242        0x110000    No_Block
+
+The first line (with Index [0]) means that the value for code point 0 is "Basic
+Latin".  The entry "0x0080" in the @blocks_ranges column in the second line
+means that the value from the first line, "Basic Latin", extends to all code
+points in the range from 0 up to but not including 0x0080, that is, through
+127.  In other words, the code points from 0 to 127 are all in the "Basic
+Latin" block.  Similarly, all code points in the range from 0x0080 up to (but
+not including) 0x0100 are in the block named "Latin-1 Supplement", etc.
+(Notice that the return is the old-style block names; see L</Old-style versus
+new-style block names>).
+
+The final line (with Index [242]) means that the value for all code points above
+the legal Unicode maximum code point have the value "No_Block", which is the
+term Unicode uses for a non-existing block.
+
+The arrays completely specify the mappings for all possible code points.
+The final element in an inversion map returned by this function will always be
+for the range that consists of all the code points that aren't legal Unicode,
+but that are expressible on the platform.  (That is, it starts with code point
+0x110000, the first code point above the legal Unicode maximum, and extends to
+infinity.) The value for that range will be the same that any typical
+unassigned code point has for the specified property.  (Certain unassigned
+code points are not "typical"; for example the non-character code points, or
+those in blocks that are to be written right-to-left.  The above-Unicode
+range's value is not based on these atypical code points.)  It could be argued
+that, instead of treating these as unassigned Unicode code points, the value
+for this range should be C<undef>.  If you wish, you can change the returned
+arrays accordingly.
+
+The maps are almost always simple scalars that should be interpreted as-is.
+These values are those given in the Unicode-supplied data files, which may be
+inconsistent as to capitalization and as to which synonym for a property-value
+is given.  The results may be normalized by using the L</prop_value_aliases()>
+function.
+
+There are exceptions to the simple scalar maps.  Some properties have some
+elements in their map list that are themselves lists of scalars; and some
+special strings are returned that are not to be interpreted as-is.  Element
+[2] (placed into C<$format> in the example above) of the returned four element
+list tells you if the map has any of these special elements or not, as follows:
+
+=over
+
+=item B<C<s>>
+
+means all the elements of the map array are simple scalars, with no special
+elements.  Almost all properties are like this, like the C<block> example
+above.
+
+=item B<C<sl>>
+
+means that some of the map array elements have the form given by C<"s">, and
+the rest are lists of scalars.  For example, here is a portion of the output
+of calling C<prop_invmap>() with the "Script Extensions" property:
+
+ @scripts_ranges  @scripts_maps
+      ...
+      0x0953      Devanagari
+      0x0964      [ Bengali, Devanagari, Gurumukhi, Oriya ]
+      0x0966      Devanagari
+      0x0970      Common
+
+Here, the code points 0x964 and 0x965 are both used in Bengali,
+Devanagari, Gurmukhi, and Oriya, but no other scripts.
+
+The Name_Alias property is also of this form.  But each scalar consists of two
+components:  1) the name, and 2) the type of alias this is.  They are
+separated by a colon and a space.  In Unicode 6.1, there are several alias types:
+
+=over
+
+=item C<correction>
+
+indicates that the name is a corrected form for the
+original name (which remains valid) for the same code point.
+
+=item C<control>
+
+adds a new name for a control character.
+
+=item C<alternate>
+
+is an alternate name for a character
+
+=item C<figment>
+
+is a name for a character that has been documented but was never in any
+actual standard.
+
+=item C<abbreviation>
+
+is a common abbreviation for a character
+
+=back
+
+The lists are ordered (roughly) so the most preferred names come before less
+preferred ones.
+
+For example,
+
+ @aliases_ranges        @alias_maps
+    ...
+    0x009E        [ 'PRIVACY MESSAGE: control', 'PM: abbreviation' ]
+    0x009F        [ 'APPLICATION PROGRAM COMMAND: control',
+                    'APC: abbreviation'
+                  ]
+    0x00A0        'NBSP: abbreviation'
+    0x00A1        ""
+    0x00AD        'SHY: abbreviation'
+    0x00AE        ""
+    0x01A2        'LATIN CAPITAL LETTER GHA: correction'
+    0x01A3        'LATIN SMALL LETTER GHA: correction'
+    0x01A4        ""
+    ...
+
+A map to the empty string means that there is no alias defined for the code
+point.
+
+=item B<C<a>>
+
+is like C<"s"> in that all the map array elements are scalars, but here they are
+restricted to all being integers, and some have to be adjusted (hence the name
+C<"a">) to get the correct result.  For example, in:
+
+ my ($uppers_ranges_ref, $uppers_maps_ref, $format)
+                          = prop_invmap("Simple_Uppercase_Mapping");
+
+the returned arrays look like this:
+
+ @$uppers_ranges_ref    @$uppers_maps_ref   Note
+       0                      0
+      97                     65          'a' maps to 'A', b => B ...
+     123                      0
+     181                    924          MICRO SIGN => Greek Cap MU
+     182                      0
+     ...
+
+Let's start with the second line.  It says that the uppercase of code point 97
+is 65; or C<uc("a")> == "A".  But the line is for the entire range of code
+points 97 through 122.  To get the mapping for any code point in a range, you
+take the offset it has from the beginning code point of the range, and add
+that to the mapping for that first code point.  So, the mapping for 122 ("z")
+is derived by taking the offset of 122 from 97 (=25) and adding that to 65,
+yielding 90 ("z").  Likewise for everything in between.
+
+The first line works the same way.  The first map in a range is always the
+correct value for its code point (because the adjustment is 0).  Thus the
+C<uc(chr(0))> is just itself.  Also, C<uc(chr(1))> is also itself, as the
+adjustment is 0+1-0 .. C<uc(chr(96))> is 96.
+
+Requiring this simple adjustment allows the returned arrays to be
+significantly smaller than otherwise, up to a factor of 10, speeding up
+searching through them.
+
+=item B<C<al>>
+
+means that some of the map array elements have the form given by C<"a">, and
+the rest are ordered lists of code points.
+For example, in:
+
+ my ($uppers_ranges_ref, $uppers_maps_ref, $format)
+                                 = prop_invmap("Uppercase_Mapping");
+
+the returned arrays look like this:
+
+ @$uppers_ranges_ref    @$uppers_maps_ref
+       0                      0
+      97                     65
+     123                      0
+     181                    924
+     182                      0
+     ...
+    0x0149              [ 0x02BC 0x004E ]
+    0x014A                    0
+    0x014B                  330
+     ...
+
+This is the full Uppercase_Mapping property (as opposed to the
+Simple_Uppercase_Mapping given in the example for format C<"a">).  The only
+difference between the two in the ranges shown is that the code point at
+0x0149 (LATIN SMALL LETTER N PRECEDED BY APOSTROPHE) maps to a string of two
+characters, 0x02BC (MODIFIER LETTER APOSTROPHE) followed by 0x004E (LATIN
+CAPITAL LETTER N).
+
+No adjustments are needed to entries that are references to arrays; each such
+entry will have exactly one element in its range, so the offset is always 0.
+
+=item B<C<ae>>
+
+This is like C<"a">, but some elements are the empty string, and should not be
+adjusted.
+The one internal Perl property accessible by C<prop_invmap> is of this type:
+"Perl_Decimal_Digit" returns an inversion map which gives the numeric values
+that are represented by the Unicode decimal digit characters.  Characters that
+don't represent decimal digits map to the empty string, like so:
+
+ @digits    @values
+ 0x0000       ""
+ 0x0030        0
+ 0x003A:      ""
+ 0x0660:       0
+ 0x066A:      ""
+ 0x06F0:       0
+ 0x06FA:      ""
+ 0x07C0:       0
+ 0x07CA:      ""
+ 0x0966:       0
+ ...
+
+This means that the code points from 0 to 0x2F do not represent decimal digits;
+the code point 0x30 (DIGIT ZERO) represents 0;  code point 0x31, (DIGIT ONE),
+represents 0+1-0 = 1; ... code point 0x39, (DIGIT NINE), represents 0+9-0 = 9;
+... code points 0x3A through 0x65F do not represent decimal digits; 0x660
+(ARABIC-INDIC DIGIT ZERO), represents 0; ... 0x07C1 (NKO DIGIT ONE),
+represents 0+1-0 = 1 ...
+
+=item B<C<ale>>
+
+is a combination of the C<"al"> type and the C<"ae"> type.  Some of
+the map array elements have the forms given by C<"al">, and
+the rest are the empty string.  The property C<NFKC_Casefold> has this form.
+An example slice is:
+
+ @$ranges_ref  @$maps_ref         Note
+    ...
+   0x00AA       97                FEMININE ORDINAL INDICATOR => 'a'
+   0x00AB        0
+   0x00AD                         SOFT HYPHEN => ""
+   0x00AE        0
+   0x00AF     [ 0x0020, 0x0304 ]  MACRON => SPACE . COMBINING MACRON
+   0x00B0        0
+   ...
+
+=item B<C<ar>>
+
+means that all the elements of the map array are either rational numbers or
+the string C<"NaN">, meaning "Not a Number".  A rational number is either an
+integer, or two integers separated by a solidus (C<"/">).  The second integer
+represents the denominator of the division implied by the solidus, and is
+actually always positive, so it is guaranteed not to be 0 and to not to be
+signed.  When the element is a plain integer (without the
+solidus), it may need to be adjusted to get the correct value by adding the
+offset, just as other C<"a"> properties.  No adjustment is needed for
+fractions, as the range is guaranteed to have just a single element, and so
+the offset is always 0.
+
+If you want to convert the returned map to entirely scalar numbers, you
+can use something like this:
+
+ my ($invlist_ref, $invmap_ref, $format) = prop_invmap($property);
+ if ($format && $format eq "ar") {
+     map { $_ = eval $_ } @$invmap_ref;
+ }
+
+Here's some entries from the output of the property "Nv", which has format
+C<"ar">.
+
+ @numerics_ranges  @numerics_maps       Note
+        0x00           "NaN"
+        0x30             0           DIGIT 0 .. DIGIT 9
+        0x3A           "NaN"
+        0xB2             2           SUPERSCRIPTs 2 and 3
+        0xB4           "NaN"
+        0xB9             1           SUPERSCRIPT 1
+        0xBA           "NaN"
+        0xBC            1/4          VULGAR FRACTION 1/4
+        0xBD            1/2          VULGAR FRACTION 1/2
+        0xBE            3/4          VULGAR FRACTION 3/4
+        0xBF           "NaN"
+        0x660            0           ARABIC-INDIC DIGIT ZERO .. NINE
+        0x66A          "NaN"
+
+=item B<C<n>>
+
+means the Name property.  All the elements of the map array are simple
+scalars, but some of them contain special strings that require more work to
+get the actual name.
+
+Entries such as:
+
+ CJK UNIFIED IDEOGRAPH-<code point>
+
+mean that the name for the code point is "CJK UNIFIED IDEOGRAPH-"
+with the code point (expressed in hexadecimal) appended to it, like "CJK
+UNIFIED IDEOGRAPH-3403" (similarly for S<C<CJK COMPATIBILITY IDEOGRAPH-E<lt>code
+pointE<gt>>>).
+
+Also, entries like
+
+ <hangul syllable>
+
+means that the name is algorithmically calculated.  This is easily done by
+the function L<charnames/charnames::viacode(code)>.
+
+Note that for control characters (C<Gc=cc>), Unicode's data files have the
+string "C<E<lt>controlE<gt>>", but the real name of each of these characters is the empty
+string.  This function returns that real name, the empty string.  (There are
+names for these characters, but they are considered aliases, not the Name
+property name, and are contained in the C<Name_Alias> property.)
+
+=item B<C<ad>>
+
+means the Decomposition_Mapping property.  This property is like C<"al">
+properties, except that one of the scalar elements is of the form:
+
+ <hangul syllable>
+
+This signifies that this entry should be replaced by the decompositions for
+all the code points whose decomposition is algorithmically calculated.  (All
+of them are currently in one range and no others outisde the range are likely
+to ever be added to Unicode; the C<"n"> format
+has this same entry.)  These can be generated via the function
+L<Unicode::Normalize::NFD()|Unicode::Normalize>.
+
+Note that the mapping is the one that is specified in the Unicode data files,
+and to get the final decomposition, it may need to be applied recursively.
+
+=back
+
+Note that a format begins with the letter "a" if and only the property it is
+for requires adjustments by adding the offsets in multi-element ranges.  For
+all these properties, an entry should be adjusted only if the map is a scalar
+which is an integer.  That is, it must match the regular expression:
+
+    / ^ -? \d+ $ /xa
+
+Further, the first element in a range never needs adjustment, as the
+adjustment would be just adding 0.
+
+A binary search can be used to quickly find a code point in the inversion
+list, and hence its corresponding mapping.
+
+The final element (index [3], assigned to C<$default> in the "block" example) in
+the four element list returned by this function may be useful for applications
+that wish to convert the returned inversion map data structure into some
+other, such as a hash.  It gives the mapping that most code points map to
+under the property.  If you establish the convention that any code point not
+explicitly listed in your data structure maps to this value, you can
+potentially make your data structure much smaller.  As you construct your data
+structure from the one returned by this function, simply ignore those ranges
+that map to this value, generally called the "default" value.  For example, to
+convert to the data structure searchable by L</charinrange()>, you can follow
+this recipe for properties that don't require adjustments:
+
+ my ($list_ref, $map_ref, $format, $missing) = prop_invmap($property);
+ my @range_list;
+
+ # Look at each element in the list, but the -2 is needed because we
+ # look at $i+1 in the loop, and the final element is guaranteed to map
+ # to $missing by prop_invmap(), so we would skip it anyway.
+ for my $i (0 .. @$list_ref - 2) {
+    next if $map_ref->[$i] eq $missing;
+    push @range_list, [ $list_ref->[$i],
+                        $list_ref->[$i+1],
+                        $map_ref->[$i]
+                      ];
+ }
+
+ print charinrange(\@range_list, $code_point), "\n";
+
+With this, C<charinrange()> will return C<undef> if its input code point maps
+to C<$missing>.  You can avoid this by omitting the C<next> statement, and adding
+a line after the loop to handle the final element of the inversion map.
+
+Similarly, this recipe can be used for properties that do require adjustments:
+
+ for my $i (0 .. @$list_ref - 2) {
+    next if $map_ref->[$i] eq $missing;
+
+    # prop_invmap() guarantees that if the mapping is to an array, the
+    # range has just one element, so no need to worry about adjustments.
+    if (ref $map_ref->[$i]) {
+        push @range_list,
+                   [ $list_ref->[$i], $list_ref->[$i], $map_ref->[$i] ];
+    }
+    else {  # Otherwise each element is actually mapped to a separate
+            # value, so the range has to be split into single code point
+            # ranges.
+
+        my $adjustment = 0;
+
+        # For each code point that gets mapped to something...
+        for my $j ($list_ref->[$i] .. $list_ref->[$i+1] -1 ) {
+
+            # ... add a range consisting of just it mapping to the
+            # original plus the adjustment, which is incremented for the
+            # next time through the loop, as the offset increases by 1
+            # for each element in the range
+            push @range_list,
+                             [ $j, $j, $map_ref->[$i] + $adjustment++ ];
+        }
+    }
+ }
+
+Note that the inversion maps returned for the C<Case_Folding> and
+C<Simple_Case_Folding> properties do not include the Turkic-locale mappings.
+Use L</casefold()> for these.
+
+C<prop_invmap> does not know about any user-defined properties, and will
+return C<undef> if called with one of those.
+
+=cut
+
+# User-defined properties could be handled with some changes to utf8_heavy.pl;
+# if done, consideration should be given to the fact that the user subroutine
+# could return different results with each call, which could lead to some
+# security issues.
+
+# One could store things in memory so they don't have to be recalculated, but
+# it is unlikely this will be called often, and some properties would take up
+# significant memory.
+
+# These are created by mktables for this routine and stored in unicore/UCD.pl
+# where their structures are described.
+our @algorithmic_named_code_points;
+our $HANGUL_BEGIN;
+our $HANGUL_COUNT;
+
+sub prop_invmap ($) {
+
+    croak __PACKAGE__, "::prop_invmap: must be called in list context" unless wantarray;
+
+    my $prop = $_[0];
+    return unless defined $prop;
+
+    # Fail internal properties
+    return if $prop =~ /^_/;
+
+    # The values returned by this function.
+    my (@invlist, @invmap, $format, $missing);
+
+    # The swash has two components we look at, the base list, and a hash,
+    # named 'SPECIALS', containing any additional members whose mappings don't
+    # fit into the the base list scheme of things.  These generally 'override'
+    # any value in the base list for the same code point.
+    my $overrides;
+
+    require "utf8_heavy.pl";
+    require "unicore/UCD.pl";
+
+RETRY:
+
+    # If there are multiple entries for a single code point
+    my $has_multiples = 0;
+
+    # Try to get the map swash for the property.  They have 'To' prepended to
+    # the property name, and 32 means we will accept 32 bit return values.
+    # The 0 means we aren't calling this from tr///.
+    my $swash = utf8::SWASHNEW(__PACKAGE__, "To$prop", undef, 32, 0);
+
+    # If didn't find it, could be because needs a proxy.  And if was the
+    # 'Block' or 'Name' property, use a proxy even if did find it.  Finding it
+    # in these cases would be the result of the installation changing mktables
+    # to output the Block or Name tables.  The Block table gives block names
+    # in the new-style, and this routine is supposed to return old-style block
+    # names.  The Name table is valid, but we need to execute the special code
+    # below to add in the algorithmic-defined name entries.
+    # And NFKCCF needs conversion, so handle that here too.
+    if (ref $swash eq ""
+        || $swash->{'TYPE'} =~ / ^ To (?: Blk | Na | NFKCCF ) $ /x)
+    {
+
+        # Get the short name of the input property, in standard form
+        my ($second_try) = prop_aliases($prop);
+        return unless $second_try;
+        $second_try = utf8::_loose_name(lc $second_try);
+
+        if ($second_try eq "in") {
+
+            # This property is identical to age for inversion map purposes
+            $prop = "age";
+            goto RETRY;
+        }
+        elsif ($second_try =~ / ^ s ( cf | [ltu] c ) $ /x) {
+
+            # These properties use just the LIST part of the full mapping,
+            # which includes the simple maps that are otherwise overridden by
+            # the SPECIALS.  So all we need do is to not look at the SPECIALS;
+            # set $overrides to indicate that
+            $overrides = -1;
+
+            # The full name is the simple name stripped of its initial 's'
+            $prop = $second_try =~ s/^s//r;
+            goto RETRY;
+        }
+        elsif ($second_try eq "blk") {
+
+            # We use the old block names.  Just create a fake swash from its
+            # data.
+            _charblocks();
+            my %blocks;
+            $blocks{'LIST'} = "";
+            $blocks{'TYPE'} = "ToBlk";
+            $utf8::SwashInfo{ToBlk}{'missing'} = "No_Block";
+            $utf8::SwashInfo{ToBlk}{'format'} = "s";
+
+            foreach my $block (@BLOCKS) {
+                $blocks{'LIST'} .= sprintf "%x\t%x\t%s\n",
+                                           $block->[0],
+                                           $block->[1],
+                                           $block->[2];
+            }
+            $swash = \%blocks;
+        }
+        elsif ($second_try eq "na") {
+
+            # Use the combo file that has all the Name-type properties in it,
+            # extracting just the ones that are for the actual 'Name'
+            # property.  And create a fake swash from it.
+            my %names;
+            $names{'LIST'} = "";
+            my $original = do "unicore/Name.pl";
+            my $algorithm_names = \@algorithmic_named_code_points;
+
+            # We need to remove the names from it that are aliases.  For that
+            # we need to also read in that table.  Create a hash with the keys
+            # being the code points, and the values being a list of the
+            # aliases for the code point key.
+            my ($aliases_code_points, $aliases_maps, undef, undef) =
+                                                &prop_invmap('Name_Alias');
+            my %aliases;
+            for (my $i = 0; $i < @$aliases_code_points; $i++) {
+                my $code_point = $aliases_code_points->[$i];
+                $aliases{$code_point} = $aliases_maps->[$i];
+
+                # If not already a list, make it into one, so that later we
+                # can treat things uniformly
+                if (! ref $aliases{$code_point}) {
+                    $aliases{$code_point} = [ $aliases{$code_point} ];
+                }
+
+                # Remove the alias type from the entry, retaining just the
+                # name.
+                map { s/:.*// } @{$aliases{$code_point}};
+            }
+
+            my $i = 0;
+            foreach my $line (split "\n", $original) {
+                my ($hex_code_point, $name) = split "\t", $line;
+
+                # Weeds out all comments, blank lines, and named sequences
+                next if $hex_code_point =~ /\P{ASCII_HEX_DIGIT}/;
+
+                my $code_point = hex $hex_code_point;
+
+                # The name of all controls is the default: the empty string.
+                # The set of controls is immutable, so these hard-coded
+                # constants work.
+                next if $code_point <= 0x9F
+                        && ($code_point <= 0x1F || $code_point >= 0x7F);
+
+                # If this is a name_alias, it isn't a name
+                next if grep { $_ eq $name } @{$aliases{$code_point}};
+
+                # If we are beyond where one of the special lines needs to
+                # be inserted ...
+                while ($i < @$algorithm_names
+                    && $code_point > $algorithm_names->[$i]->{'low'})
+                {
+
+                    # ... then insert it, ahead of what we were about to
+                    # output
+                    $names{'LIST'} .= sprintf "%x\t%x\t%s\n",
+                                            $algorithm_names->[$i]->{'low'},
+                                            $algorithm_names->[$i]->{'high'},
+                                            $algorithm_names->[$i]->{'name'};
+
+                    # Done with this range.
+                    $i++;
+
+                    # We loop until all special lines that precede the next
+                    # regular one are output.
+                }
+
+                # Here, is a normal name.
+                $names{'LIST'} .= sprintf "%x\t\t%s\n", $code_point, $name;
+            } # End of loop through all the names
+
+            $names{'TYPE'} = "ToNa";
+            $utf8::SwashInfo{ToNa}{'missing'} = "";
+            $utf8::SwashInfo{ToNa}{'format'} = "n";
+            $swash = \%names;
+        }
+        elsif ($second_try =~ / ^ ( d [mt] ) $ /x) {
+
+            # The file is a combination of dt and dm properties.  Create a
+            # fake swash from the portion that we want.
+            my $original = do "unicore/Decomposition.pl";
+            my %decomps;
+
+            if ($second_try eq 'dt') {
+                $decomps{'TYPE'} = "ToDt";
+                $utf8::SwashInfo{'ToDt'}{'missing'} = "None";
+                $utf8::SwashInfo{'ToDt'}{'format'} = "s";
+            }   # 'dm' is handled below, with 'nfkccf'
+
+            $decomps{'LIST'} = "";
+
+            # This property has one special range not in the file: for the
+            # hangul syllables
+            my $done_hangul = 0;    # Have we done the hangul range.
+            foreach my $line (split "\n", $original) {
+                my ($hex_lower, $hex_upper, $type_and_map) = split "\t", $line;
+                my $code_point = hex $hex_lower;
+                my $value;
+                my $redo = 0;
+
+                # The type, enclosed in <...>, precedes the mapping separated
+                # by blanks
+                if ($type_and_map =~ / ^ < ( .* ) > \s+ (.*) $ /x) {
+                    $value = ($second_try eq 'dt') ? $1 : $2
+                }
+                else {  # If there is no type specified, it's canonical
+                    $value = ($second_try eq 'dt')
+                             ? "Canonical" :
+                             $type_and_map;
+                }
+
+                # Insert the hangul range at the appropriate spot.
+                if (! $done_hangul && $code_point > $HANGUL_BEGIN) {
+                    $done_hangul = 1;
+                    $decomps{'LIST'} .=
+                                sprintf "%x\t%x\t%s\n",
+                                        $HANGUL_BEGIN,
+                                        $HANGUL_BEGIN + $HANGUL_COUNT - 1,
+                                        ($second_try eq 'dt')
+                                        ? "Canonical"
+                                        : "<hangul syllable>";
+                }
+
+                # And append this to our constructed LIST.
+                $decomps{'LIST'} .= "$hex_lower\t$hex_upper\t$value\n";
+
+                redo if $redo;
+            }
+            $swash = \%decomps;
+        }
+        elsif ($second_try ne 'nfkccf') { # Don't know this property. Fail.
+            return;
+        }
+
+        if ($second_try eq 'nfkccf' || $second_try eq 'dm') {
+
+            # The 'nfkccf' property is stored in the old format for backwards
+            # compatibility for any applications that has read its file
+            # directly before prop_invmap() existed.
+            # And the code above has extracted the 'dm' property from its file
+            # yielding the same format.  So here we convert them to adjusted
+            # format for compatibility with the other properties similar to
+            # them.
+            my %revised_swash;
+
+            # We construct a new converted list.
+            my $list = "";
+
+            my @ranges = split "\n", $swash->{'LIST'};
+            for (my $i = 0; $i < @ranges; $i++) {
+                my ($hex_begin, $hex_end, $map) = split "\t", $ranges[$i];
+
+                # The dm property has maps that are space separated sequences
+                # of code points, as well as the special entry "<hangul
+                # syllable>, which also contains a blank.
+                my @map = split " ", $map;
+                if (@map > 1) {
+
+                    # If it's just the special entry, append as-is.
+                    if ($map eq '<hangul syllable>') {
+                        $list .= "$ranges[$i]\n";
+                    }
+                    else {
+
+                        # These should all single-element ranges.
+                        croak __PACKAGE__, "::prop_invmap: Not expecting a mapping with multiple code points in a multi-element range, $ranges[$i]" if $hex_end ne "";
+
+                        # Convert them to decimal, as that's what's expected.
+                        $list .= "$hex_begin\t\t"
+                            . join(" ", map { hex } @map)
+                            . "\n";
+                    }
+                    next;
+                }
+
+                # Here, the mapping doesn't have a blank, is for a single code
+                # point.
+                my $begin = hex $hex_begin;
+                my $end = (defined $hex_end && $hex_end ne "")
+                        ? hex $hex_end
+                        : $begin;
+
+                # Again, the output is to be in decimal.
+                my $decimal_map = hex $map;
+
+                # We know that multi-element ranges with the same mapping
+                # should not be adjusted, as after the adjustment
+                # multi-element ranges are for consecutive increasing code
+                # points.  Further, the final element in the list won't be
+                # adjusted, as there is nothing after it to include in the
+                # adjustment
+                if ($begin != $end || $i == @ranges -1) {
+
+                    # So just convert these to single-element ranges
+                    foreach my $code_point ($begin .. $end) {
+                        $list .= sprintf("%04X\t\t%d\n",
+                                        $code_point, $decimal_map);
+                    }
+                }
+                else {
+
+                    # Here, we have a candidate for adjusting.  What we do is
+                    # look through the subsequent adjacent elements in the
+                    # input.  If the map to the next one differs by 1 from the
+                    # one before, then we combine into a larger range with the
+                    # initial map.  Loop doing this until we find one that
+                    # can't be combined.
+
+                    my $offset = 0;     # How far away are we from the initial
+                                        # map
+                    my $squished = 0;   # ? Did we squish at least two
+                                        # elements together into one range
+                    for ( ; $i < @ranges; $i++) {
+                        my ($next_hex_begin, $next_hex_end, $next_map)
+                                                = split "\t", $ranges[$i+1];
+
+                        # In the case of 'dm', the map may be a sequence of
+                        # multiple code points, which are never combined with
+                        # another range
+                        last if $next_map =~ / /;
+
+                        $offset++;
+                        my $next_decimal_map = hex $next_map;
+
+                        # If the next map is not next in sequence, it
+                        # shouldn't be combined.
+                        last if $next_decimal_map != $decimal_map + $offset;
+
+                        my $next_begin = hex $next_hex_begin;
+
+                        # Likewise, if the next element isn't adjacent to the
+                        # previous one, it shouldn't be combined.
+                        last if $next_begin != $begin + $offset;
+
+                        my $next_end = (defined $next_hex_end
+                                        && $next_hex_end ne "")
+                                            ? hex $next_hex_end
+                                            : $next_begin;
+
+                        # And finally, if the next element is a multi-element
+                        # range, it shouldn't be combined.
+                        last if $next_end != $next_begin;
+
+                        # Here, we will combine.  Loop to see if we should
+                        # combine the next element too.
+                        $squished = 1;
+                    }
+
+                    if ($squished) {
+
+                        # Here, 'i' is the element number of the last element to
+                        # be combined, and the range is single-element, or we
+                        # wouldn't be combining.  Get it's code point.
+                        my ($hex_end, undef, undef) = split "\t", $ranges[$i];
+                        $list .= "$hex_begin\t$hex_end\t$decimal_map\n";
+                    } else {
+
+                        # Here, no combining done.  Just appen the initial
+                        # (and current) values.
+                        $list .= "$hex_begin\t\t$decimal_map\n";
+                    }
+                }
+            } # End of loop constructing the converted list
+
+            # Finish up the data structure for our converted swash
+            my $type = ($second_try eq 'nfkccf') ? 'ToNFKCCF' : 'ToDm';
+            $revised_swash{'LIST'} = $list;
+            $revised_swash{'TYPE'} = $type;
+            $revised_swash{'SPECIALS'} = $swash->{'SPECIALS'};
+            $swash = \%revised_swash;
+
+            $utf8::SwashInfo{$type}{'missing'} = 0;
+            $utf8::SwashInfo{$type}{'format'} = 'a';
+        }
+    }
+
+    if ($swash->{'EXTRAS'}) {
+        carp __PACKAGE__, "::prop_invmap: swash returned for $prop unexpectedly has EXTRAS magic";
+        return;
+    }
+
+    # Here, have a valid swash return.  Examine it.
+    my $returned_prop = $swash->{'TYPE'};
+
+    # All properties but binary ones should have 'missing' and 'format'
+    # entries
+    $missing = $utf8::SwashInfo{$returned_prop}{'missing'};
+    $missing = 'N' unless defined $missing;
+
+    $format = $utf8::SwashInfo{$returned_prop}{'format'};
+    $format = 'b' unless defined $format;
+
+    my $requires_adjustment = $format =~ /^a/;
+
+    # The LIST input lines look like:
+    # ...
+    # 0374\t\tCommon
+    # 0375\t0377\tGreek   # [3]
+    # 037A\t037D\tGreek   # [4]
+    # 037E\t\tCommon
+    # 0384\t\tGreek
+    # ...
+    #
+    # Convert them to like
+    # 0374 => Common
+    # 0375 => Greek
+    # 0378 => $missing
+    # 037A => Greek
+    # 037E => Common
+    # 037F => $missing
+    # 0384 => Greek
+    #
+    # For binary properties, the final non-comment column is absent, and
+    # assumed to be 'Y'.
+
+    foreach my $range (split "\n", $swash->{'LIST'}) {
+        $range =~ s/ \s* (?: \# .* )? $ //xg; # rmv trailing space, comments
+
+        # Find the beginning and end of the range on the line
+        my ($hex_begin, $hex_end, $map) = split "\t", $range;
+        my $begin = hex $hex_begin;
+        my $end = (defined $hex_end && $hex_end ne "")
+                  ? hex $hex_end
+                  : $begin;
+
+        # Each time through the loop (after the first):
+        # $invlist[-2] contains the beginning of the previous range processed
+        # $invlist[-1] contains the end+1 of the previous range processed
+        # $invmap[-2] contains the value of the previous range processed
+        # $invmap[-1] contains the default value for missing ranges ($missing)
+        #
+        # Thus, things are set up for the typical case of a new non-adjacent
+        # range of non-missings to be added.  But, if the new range is
+        # adjacent, it needs to replace the [-1] element; and if the new
+        # range is a multiple value of the previous one, it needs to be added
+        # to the [-2] map element.
+
+        # The first time through, everything will be empty.  If the property
+        # doesn't have a range that begins at 0, add one that maps to $missing
+        if (! @invlist) {
+            if ($begin != 0) {
+                push @invlist, 0;
+                push @invmap, $missing;
+            }
+        }
+        elsif (@invlist > 1 && $invlist[-2] == $begin) {
+
+            # Here we handle the case where the input has multiple entries for
+            # each code point.  mktables should have made sure that each such
+            # range contains only one code point.  At this point, $invlist[-1]
+            # is the $missing that was added at the end of the last loop
+            # iteration, and [-2] is the last real input code point, and that
+            # code point is the same as the one we are adding now, making the
+            # new one a multiple entry.  Add it to the existing entry, either
+            # by pushing it to the existing list of multiple entries, or
+            # converting the single current entry into a list with both on it.
+            # This is all we need do for this iteration.
+
+            if ($end != $begin) {
+                croak __PACKAGE__, ":prop_invmap: Multiple maps per code point in '$prop' require single-element ranges: begin=$begin, end=$end, map=$map";
+            }
+            if (! ref $invmap[-2]) {
+                $invmap[-2] = [ $invmap[-2], $map ];
+            }
+            else {
+                push @{$invmap[-2]}, $map;
+            }
+            $has_multiples = 1;
+            next;
+        }
+        elsif ($invlist[-1] == $begin) {
+
+            # If the input isn't in the most compact form, so that there are
+            # two adjacent ranges that map to the same thing, they should be
+            # combined (EXCEPT where the arrays require adjustments, in which
+            # case everything is already set up correctly).  This happens in
+            # our constructed dt mapping, as Element [-2] is the map for the
+            # latest range so far processed.  Just set the beginning point of
+            # the map to $missing (in invlist[-1]) to 1 beyond where this
+            # range ends.  For example, in
+            # 12\t13\tXYZ
+            # 14\t17\tXYZ
+            # we have set it up so that it looks like
+            # 12 => XYZ
+            # 14 => $missing
+            #
+            # We now see that it should be
+            # 12 => XYZ
+            # 18 => $missing
+            if (! $requires_adjustment && @invlist > 1 && ( (defined $map)
+                                  ? $invmap[-2] eq $map
+                                  : $invmap[-2] eq 'Y'))
+            {
+                $invlist[-1] = $end + 1;
+                next;
+            }
+
+            # Here, the range started in the previous iteration that maps to
+            # $missing starts at the same code point as this range.  That
+            # means there is no gap to fill that that range was intended for,
+            # so we just pop it off the parallel arrays.
+            pop @invlist;
+            pop @invmap;
+        }
+
+        # Add the range beginning, and the range's map.
+        push @invlist, $begin;
+        if ($returned_prop eq 'ToDm') {
+
+            # The decomposition maps are either a line like <hangul syllable>
+            # which are to be taken as is; or a sequence of code points in hex
+            # and separated by blanks.  Convert them to decimal, and if there
+            # is more than one, use an anonymous array as the map.
+            if ($map =~ /^ < /x) {
+                push @invmap, $map;
+            }
+            else {
+                my @map = split " ", $map;
+                if (@map == 1) {
+                    push @invmap, $map[0];
+                }
+                else {
+                    push @invmap, \@map;
+                }
+            }
+        }
+        else {
+
+            # Otherwise, convert hex formatted list entries to decimal; add a
+            # 'Y' map for the missing value in binary properties, or
+            # otherwise, use the input map unchanged.
+            $map = ($format eq 'x')
+                ? hex $map
+                : $format eq 'b'
+                  ? 'Y'
+                  : $map;
+            push @invmap, $map;
+        }
+
+        # We just started a range.  It ends with $end.  The gap between it and
+        # the next element in the list must be filled with a range that maps
+        # to the default value.  If there is no gap, the next iteration will
+        # pop this, unless there is no next iteration, and we have filled all
+        # of the Unicode code space, so check for that and skip.
+        if ($end < $MAX_UNICODE_CODEPOINT) {
+            push @invlist, $end + 1;
+            push @invmap, $missing;
+        }
+    }
+
+    # If the property is empty, make all code points use the value for missing
+    # ones.
+    if (! @invlist) {
+        push @invlist, 0;
+        push @invmap, $missing;
+    }
+
+    # And add in standard element that all non-Unicode code points map to:
+    # $missing
+    push @invlist, $MAX_UNICODE_CODEPOINT + 1;
+    push @invmap, $missing;
+
+    # The second component of the map are those values that require
+    # non-standard specification, stored in SPECIALS.  These override any
+    # duplicate code points in LIST.  If we are using a proxy, we may have
+    # already set $overrides based on the proxy.
+    $overrides = $swash->{'SPECIALS'} unless defined $overrides;
+    if ($overrides) {
+
+        # A negative $overrides implies that the SPECIALS should be ignored,
+        # and a simple 'a' list is the value.
+        if ($overrides < 0) {
+            $format = 'a';
+        }
+        else {
+
+            # Currently, all overrides are for properties that normally map to
+            # single code points, but now some will map to lists of code
+            # points (but there is an exception case handled below).
+            $format = 'al';
+
+            # Look through the overrides.
+            foreach my $cp_maybe_utf8 (keys %$overrides) {
+                my $cp;
+                my @map;
+
+                # If the overrides came from SPECIALS, the code point keys are
+                # packed UTF-8.
+                if ($overrides == $swash->{'SPECIALS'}) {
+                    $cp = unpack("C0U", $cp_maybe_utf8);
+                    @map = unpack "U0U*", $swash->{'SPECIALS'}{$cp_maybe_utf8};
+
+                    # The empty string will show up unpacked as an empty
+                    # array.
+                    $format = 'ale' if @map == 0;
+                }
+                else {
+
+                    # But if we generated the overrides, we didn't bother to
+                    # pack them, and we, so far, do this only for properties
+                    # that are 'a' ones.
+                    $cp = $cp_maybe_utf8;
+                    @map = hex $overrides->{$cp};
+                    $format = 'a';
+                }
+
+                # Find the range that the override applies to.
+                my $i = _search_invlist(\@invlist, $cp);
+                if ($cp < $invlist[$i] || $cp >= $invlist[$i + 1]) {
+                    croak __PACKAGE__, "::prop_invmap: wrong_range, cp=$cp; i=$i, current=$invlist[$i]; next=$invlist[$i + 1]"
+                }
+
+                # And what that range currently maps to
+                my $cur_map = $invmap[$i];
+
+                # If there is a gap between the next range and the code point
+                # we are overriding, we have to add elements to both arrays to
+                # fill that gap, using the map that applies to it, which is
+                # $cur_map, since it is part of the current range.
+                if ($invlist[$i + 1] > $cp + 1) {
+                    #use feature 'say';
+                    #say "Before splice:";
+                    #say 'i-2=[', $i-2, ']', sprintf("%04X maps to %s", $invlist[$i-2], $invmap[$i-2]) if $i >= 2;
+                    #say 'i-1=[', $i-1, ']', sprintf("%04X maps to %s", $invlist[$i-1], $invmap[$i-1]) if $i >= 1;
+                    #say 'i  =[', $i, ']', sprintf("%04X maps to %s", $invlist[$i], $invmap[$i]);
+                    #say 'i+1=[', $i+1, ']', sprintf("%04X maps to %s", $invlist[$i+1], $invmap[$i+1]) if $i < @invlist + 1;
+                    #say 'i+2=[', $i+2, ']', sprintf("%04X maps to %s", $invlist[$i+2], $invmap[$i+2]) if $i < @invlist + 2;
+
+                    splice @invlist, $i + 1, 0, $cp + 1;
+                    splice @invmap, $i + 1, 0, $cur_map;
+
+                    #say "After splice:";
+                    #say 'i-2=[', $i-2, ']', sprintf("%04X maps to %s", $invlist[$i-2], $invmap[$i-2]) if $i >= 2;
+                    #say 'i-1=[', $i-1, ']', sprintf("%04X maps to %s", $invlist[$i-1], $invmap[$i-1]) if $i >= 1;
+                    #say 'i  =[', $i, ']', sprintf("%04X maps to %s", $invlist[$i], $invmap[$i]);
+                    #say 'i+1=[', $i+1, ']', sprintf("%04X maps to %s", $invlist[$i+1], $invmap[$i+1]) if $i < @invlist + 1;
+                    #say 'i+2=[', $i+2, ']', sprintf("%04X maps to %s", $invlist[$i+2], $invmap[$i+2]) if $i < @invlist + 2;
+                }
+
+                # If the remaining portion of the range is multiple code
+                # points (ending with the one we are replacing, guaranteed by
+                # the earlier splice).  We must split it into two
+                if ($invlist[$i] < $cp) {
+                    $i++;   # Compensate for the new element
+
+                    #use feature 'say';
+                    #say "Before splice:";
+                    #say 'i-2=[', $i-2, ']', sprintf("%04X maps to %s", $invlist[$i-2], $invmap[$i-2]) if $i >= 2;
+                    #say 'i-1=[', $i-1, ']', sprintf("%04X maps to %s", $invlist[$i-1], $invmap[$i-1]) if $i >= 1;
+                    #say 'i  =[', $i, ']', sprintf("%04X maps to %s", $invlist[$i], $invmap[$i]);
+                    #say 'i+1=[', $i+1, ']', sprintf("%04X maps to %s", $invlist[$i+1], $invmap[$i+1]) if $i < @invlist + 1;
+                    #say 'i+2=[', $i+2, ']', sprintf("%04X maps to %s", $invlist[$i+2], $invmap[$i+2]) if $i < @invlist + 2;
+
+                    splice @invlist, $i, 0, $cp;
+                    splice @invmap, $i, 0, 'dummy';
+
+                    #say "After splice:";
+                    #say 'i-2=[', $i-2, ']', sprintf("%04X maps to %s", $invlist[$i-2], $invmap[$i-2]) if $i >= 2;
+                    #say 'i-1=[', $i-1, ']', sprintf("%04X maps to %s", $invlist[$i-1], $invmap[$i-1]) if $i >= 1;
+                    #say 'i  =[', $i, ']', sprintf("%04X maps to %s", $invlist[$i], $invmap[$i]);
+                    #say 'i+1=[', $i+1, ']', sprintf("%04X maps to %s", $invlist[$i+1], $invmap[$i+1]) if $i < @invlist + 1;
+                    #say 'i+2=[', $i+2, ']', sprintf("%04X maps to %s", $invlist[$i+2], $invmap[$i+2]) if $i < @invlist + 2;
+                }
+
+                # Here, the range we are overriding contains a single code
+                # point.  The result could be the empty string, a single
+                # value, or a list.  If the last case, we use an anonymous
+                # array.
+                $invmap[$i] = (scalar @map == 0)
+                               ? ""
+                               : (scalar @map > 1)
+                                  ? \@map
+                                  : $map[0];
+            }
+        }
+    }
+    elsif ($format eq 'x') {
+
+        # All hex-valued properties are really to code points, and have been
+        # converted to decimal.
+        $format = 's';
+    }
+    elsif ($returned_prop eq 'ToDm') {
+        $format = 'ad';
+    }
+    elsif ($format eq 'sw') { # blank-separated elements to form a list.
+        map { $_ = [ split " ", $_  ] if $_ =~ / / } @invmap;
+        $format = 'sl';
+    }
+    elsif ($returned_prop eq 'ToNameAlias') {
+
+        # This property currently doesn't have any lists, but theoretically
+        # could
+        $format = 'sl';
+    }
+    elsif ($returned_prop eq 'ToPerlDecimalDigit') {
+        $format = 'ae';
+    }
+    elsif ($returned_prop eq 'ToNv') {
+
+        # The one property that has this format is stored as a delta, so needs
+        # to indicate that need to add code point to it.
+        $format = 'ar';
+    }
+    elsif ($format ne 'n' && $format ne 'a') {
+
+        # All others are simple scalars
+        $format = 's';
+    }
+    if ($has_multiples &&  $format !~ /l/) {
+	croak __PACKAGE__, "::prop_invmap: Wrong format '$format' for prop_invmap('$prop'); should indicate has lists";
+    }
+
+    return (\@invlist, \@invmap, $format, $missing);
+}
+
 =head2 Unicode::UCD::UnicodeVersion
 
 This returns the version of the Unicode Character Database, in other words, the
@@ -1174,6 +3328,7 @@ my $UNICODEVERSION;
 sub UnicodeVersion {
     unless (defined $UNICODEVERSION) {
 	openunicode(\$VERSIONFH, "version");
+	local $/ = "\n";
 	chomp($UNICODEVERSION = <$VERSIONFH>);
 	close($VERSIONFH);
 	croak __PACKAGE__, "::VERSION: strange version '$UNICODEVERSION'"
@@ -1204,26 +3359,51 @@ For scripts see UTR #24: L<http://www.unicode.org/unicode/reports/tr24/>
 
 Scripts are matched with the regular-expression construct
 C<\p{...}> (e.g. C<\p{Tibetan}> matches characters of the Tibetan script),
-while C<\p{In...}> is used for blocks (e.g. C<\p{InTibetan}> matches
+while C<\p{Blk=...}> is used for blocks (e.g. C<\p{Blk=Tibetan}> matches
 any of the 256 code points in the Tibetan block).
 
+=head2 Old-style versus new-style block names
+
+Unicode publishes the names of blocks in two different styles, though the two
+are equivalent under Unicode's loose matching rules.
+
+The original style uses blanks and hyphens in the block names (except for
+C<No_Block>), like so:
 
-=head2 Implementation Note
+ Miscellaneous Mathematical Symbols-B
 
-The first use of charinfo() opens a read-only filehandle to the Unicode
-Character Database (the database is included in the Perl distribution).
-The filehandle is then kept open for further queries.  In other words,
-if you are wondering where one of your filehandles went, that's where.
+The newer style replaces these with underscores, like this:
+
+ Miscellaneous_Mathematical_Symbols_B
+
+This newer style is consistent with the values of other Unicode properties.
+To preserve backward compatibility, all the functions in Unicode::UCD that
+return block names (except one) return the old-style ones.  That one function,
+L</prop_value_aliases()> can be used to convert from old-style to new-style:
+
+ my $new_style = prop_values_aliases("block", $old_style);
+
+Perl also has single-form extensions that refer to blocks, C<In_Cyrillic>,
+meaning C<Block=Cyrillic>.  These have always been written in the new style.
+
+To convert from new-style to old-style, follow this recipe:
+
+ $old_style = charblock((prop_invlist("block=$new_style"))[0]);
+
+(which finds the range of code points in the block using C<prop_invlist>,
+gets the lower end of the range (0th element) and then looks up the old name
+for its block using C<charblock>).
+
+Note that starting in Unicode 6.1, many of the block names have shorter
+synonyms.  These are always given in the new style.
 
 =head1 BUGS
 
 Does not yet support EBCDIC platforms.
 
-L</compexcl()> should give a complete list of excluded code points.
-
 =head1 AUTHOR
 
-Jarkko Hietaniemi
+Jarkko Hietaniemi.  Now maintained by perl5 porters.
 
 =cut
 
diff --git a/gnu/usr.bin/perl/lib/Unicode/UCD.t b/gnu/usr.bin/perl/lib/Unicode/UCD.t
index a2f972e5f2d..2e5a741f0f9 100644
--- a/gnu/usr.bin/perl/lib/Unicode/UCD.t
+++ b/gnu/usr.bin/perl/lib/Unicode/UCD.t
@@ -6,7 +6,6 @@ BEGIN {
     }
     chdir 't' if -d 't';
     @INC = '../lib';
-    @INC = "::lib" if $^O eq 'MacOS'; # module parses @INC itself
     require Config; import Config;
     if ($Config{'extensions'} !~ /\bStorable\b/) {
         print "1..0 # Skip: Storable was not built; Unicode::UCD uses Storable\n";
@@ -18,12 +17,34 @@ use strict;
 use Unicode::UCD;
 use Test::More;
 
-BEGIN { plan tests => 239 };
-
 use Unicode::UCD 'charinfo';
 
+$/ = 7;
+
 my $charinfo;
 
+is(charinfo(0x110000), undef, "Verify charinfo() of non-unicode is undef");
+
+$charinfo = charinfo(0);    # Null is often problematic, so test it.
+
+is($charinfo->{code},           '0000', '<control>');
+is($charinfo->{name},           '<control>');
+is($charinfo->{category},       'Cc');
+is($charinfo->{combining},      '0');
+is($charinfo->{bidi},           'BN');
+is($charinfo->{decomposition},  '');
+is($charinfo->{decimal},        '');
+is($charinfo->{digit},          '');
+is($charinfo->{numeric},        '');
+is($charinfo->{mirrored},       'N');
+is($charinfo->{unicode10},      'NULL');
+is($charinfo->{comment},        '');
+is($charinfo->{upper},          '');
+is($charinfo->{lower},          '');
+is($charinfo->{title},          '');
+is($charinfo->{block},          'Basic Latin');
+is($charinfo->{script},         'Common');
+
 $charinfo = charinfo(0x41);
 
 is($charinfo->{code},           '0041', 'LATIN CAPITAL LETTER A');
@@ -112,12 +133,12 @@ is($charinfo->{script},         'Hebrew');
 
 $charinfo = charinfo(0xAC00);
 
-is($charinfo->{code},           'AC00', 'HANGUL SYLLABLE-AC00');
-is($charinfo->{name},           'HANGUL SYLLABLE-AC00');
+is($charinfo->{code},           'AC00', 'HANGUL SYLLABLE U+AC00');
+is($charinfo->{name},           'HANGUL SYLLABLE GA');
 is($charinfo->{category},       'Lo');
 is($charinfo->{combining},      '0');
 is($charinfo->{bidi},           'L');
-is($charinfo->{decomposition},  undef);
+is($charinfo->{decomposition},  '1100 1161');
 is($charinfo->{decimal},        '');
 is($charinfo->{digit},          '');
 is($charinfo->{numeric},        '');
@@ -134,12 +155,12 @@ is($charinfo->{script},         'Hangul');
 
 $charinfo = charinfo(0xAE00);
 
-is($charinfo->{code},           'AE00', 'HANGUL SYLLABLE-AE00');
-is($charinfo->{name},           'HANGUL SYLLABLE-AE00');
+is($charinfo->{code},           'AE00', 'HANGUL SYLLABLE U+AE00');
+is($charinfo->{name},           'HANGUL SYLLABLE GEUL');
 is($charinfo->{category},       'Lo');
 is($charinfo->{combining},      '0');
 is($charinfo->{bidi},           'L');
-is($charinfo->{decomposition},  undef);
+is($charinfo->{decomposition},  "1100 1173 11AF");
 is($charinfo->{decimal},        '');
 is($charinfo->{digit},          '');
 is($charinfo->{numeric},        '');
@@ -197,7 +218,8 @@ use Unicode::UCD qw(charblock charscript);
 # 0x0590 is in the Hebrew block but unused.
 
 is(charblock(0x590),          'Hebrew', '0x0590 - Hebrew unused charblock');
-is(charscript(0x590),         undef,    '0x0590 - Hebrew unused charscript');
+is(charscript(0x590),         'Unknown',    '0x0590 - Hebrew unused charscript');
+is(charblock(0x1FFFF),        'No_Block', '0x1FFFF - unused charblock');
 
 $charinfo = charinfo(0xbe);
 
@@ -219,6 +241,50 @@ is($charinfo->{title},          '');
 is($charinfo->{block},          'Latin-1 Supplement');
 is($charinfo->{script},         'Common');
 
+# This is to test a case where both simple and full lowercases exist and
+# differ
+$charinfo = charinfo(0x130);
+
+is($charinfo->{code},           '0130', 'LATIN CAPITAL LETTER I WITH DOT ABOVE');
+is($charinfo->{name},           'LATIN CAPITAL LETTER I WITH DOT ABOVE');
+is($charinfo->{category},       'Lu');
+is($charinfo->{combining},      '0');
+is($charinfo->{bidi},           'L');
+is($charinfo->{decomposition},  '0049 0307');
+is($charinfo->{decimal},        '');
+is($charinfo->{digit},          '');
+is($charinfo->{numeric},        '');
+is($charinfo->{mirrored},       'N');
+is($charinfo->{unicode10},      'LATIN CAPITAL LETTER I DOT');
+is($charinfo->{comment},        '');
+is($charinfo->{upper},          '');
+is($charinfo->{lower},          '0069');
+is($charinfo->{title},          '');
+is($charinfo->{block},          'Latin Extended-A');
+is($charinfo->{script},         'Latin');
+
+# This is to test a case where both simple and full uppercases exist and
+# differ
+$charinfo = charinfo(0x1F80);
+
+is($charinfo->{code},           '1F80', 'GREEK SMALL LETTER ALPHA WITH PSILI AND YPOGEGRAMMENI');
+is($charinfo->{name},           'GREEK SMALL LETTER ALPHA WITH PSILI AND YPOGEGRAMMENI');
+is($charinfo->{category},       'Ll');
+is($charinfo->{combining},      '0');
+is($charinfo->{bidi},           'L');
+is($charinfo->{decomposition},  '1F00 0345');
+is($charinfo->{decimal},        '');
+is($charinfo->{digit},          '');
+is($charinfo->{numeric},        '');
+is($charinfo->{mirrored},       'N');
+is($charinfo->{unicode10},      '');
+is($charinfo->{comment},        '');
+is($charinfo->{upper},          '1F88');
+is($charinfo->{lower},          '');
+is($charinfo->{title},          '1F88');
+is($charinfo->{block},          'Greek Extended');
+is($charinfo->{script},         'Greek');
+
 use Unicode::UCD qw(charblocks charscripts);
 
 my $charblocks = charblocks();
@@ -247,8 +313,8 @@ is($charscript, 'Ethiopic');
 my $ranges;
 
 $ranges = charscript('Ogham');
-is($ranges->[1]->[0], hex('1681'), 'Ogham charscript');
-is($ranges->[1]->[1], hex('169a'));
+is($ranges->[0]->[0], hex('1680'), 'Ogham charscript');
+is($ranges->[0]->[1], hex('169C'));
 
 use Unicode::UCD qw(charinrange);
 
@@ -276,11 +342,13 @@ is($bt->{AL}, 'Right-to-Left Arabic', 'AL is Right-to-Left Arabic');
 
 # If this fails, then maybe one should look at the Unicode changes to see
 # what else might need to be updated.
-is(Unicode::UCD::UnicodeVersion, '5.2.0', 'UnicodeVersion');
+is(Unicode::UCD::UnicodeVersion, '6.1.0', 'UnicodeVersion');
 
 use Unicode::UCD qw(compexcl);
 
 ok(!compexcl(0x0100), 'compexcl');
+ok(!compexcl(0xD801), 'compexcl of surrogate');
+ok(!compexcl(0x110000), 'compexcl of non-Unicode code point');
 ok( compexcl(0x0958));
 
 use Unicode::UCD qw(casefold);
@@ -402,7 +470,7 @@ is(Unicode::UCD::_getcode('U+123x'),  undef, "_getcode(x123)");
 {
     my $r1 = charscript('Latin');
     my $n1 = @$r1;
-    is($n1, 42, "number of ranges in Latin script (Unicode 5.1.0)");
+    is($n1, 30, "number of ranges in Latin script (Unicode 6.1.0)");
     shift @$r1 while @$r1;
     my $r2 = charscript('Latin');
     is(@$r2, $n1, "modifying results should not mess up internal caches");
@@ -427,3 +495,1505 @@ is($ns{"KATAKANA LETTER AINU P"}, "\x{31F7}\x{309A}");
 @ns = namedseq(42);
 is(@ns, 0);
 
+use Unicode::UCD qw(num);
+use charnames ":full";
+
+is(num("0"), 0, 'Verify num("0") == 0');
+is(num("98765"), 98765, 'Verify num("98765") == 98765');
+ok(! defined num("98765\N{FULLWIDTH DIGIT FOUR}"), 'Verify num("98765\N{FULLWIDTH DIGIT FOUR}") isnt defined');
+is(num("\N{NEW TAI LUE DIGIT TWO}\N{NEW TAI LUE DIGIT ONE}"), 21, 'Verify num("\N{NEW TAI LUE DIGIT TWO}\N{NEW TAI LUE DIGIT ONE}") == 21');
+ok(! defined num("\N{NEW TAI LUE DIGIT TWO}\N{NEW TAI LUE THAM DIGIT ONE}"), 'Verify num("\N{NEW TAI LUE DIGIT TWO}\N{NEW TAI LUE THAM DIGIT ONE}") isnt defined');
+is(num("\N{CHAM DIGIT ZERO}\N{CHAM DIGIT THREE}"), 3, 'Verify num("\N{CHAM DIGIT ZERO}\N{CHAM DIGIT THREE}") == 3');
+ok(! defined num("\N{CHAM DIGIT ZERO}\N{JAVANESE DIGIT NINE}"), 'Verify num("\N{CHAM DIGIT ZERO}\N{JAVANESE DIGIT NINE}") isnt defined');
+is(num("\N{SUPERSCRIPT TWO}"), 2, 'Verify num("\N{SUPERSCRIPT TWO} == 2');
+is(num("\N{ETHIOPIC NUMBER TEN THOUSAND}"), 10000, 'Verify num("\N{ETHIOPIC NUMBER TEN THOUSAND}") == 10000');
+is(num("\N{NORTH INDIC FRACTION ONE HALF}"), .5, 'Verify num("\N{NORTH INDIC FRACTION ONE HALF}") == .5');
+is(num("\N{U+12448}"), 9, 'Verify num("\N{U+12448}") == 9');
+is(num("\N{U+5146}"), 1000000000000, 'Verify num("\N{U+5146}") == 1000000000000');
+
+# Create a user-defined property
+sub InKana {<<'END'}
+3040    309F
+30A0    30FF
+END
+
+use Unicode::UCD qw(prop_aliases);
+
+is(prop_aliases(undef), undef, "prop_aliases(undef) returns <undef>");
+is(prop_aliases("unknown property"), undef,
+                "prop_aliases(<unknown property>) returns <undef>");
+is(prop_aliases("InKana"), undef,
+                "prop_aliases(<user-defined property>) returns <undef>");
+is(prop_aliases("Perl_Decomposition_Mapping"), undef, "prop_aliases('Perl_Decomposition_Mapping') returns <undef> since internal-Perl-only");
+is(prop_aliases("Perl_Charnames"), undef,
+    "prop_aliases('Perl_Charnames') returns <undef> since internal-Perl-only");
+is(prop_aliases("isgc"), undef,
+    "prop_aliases('isgc') returns <undef> since is not covered Perl extension");
+is(prop_aliases("Is_Is_Any"), undef,
+                "prop_aliases('Is_Is_Any') returns <undef> since two is's");
+
+require 'utf8_heavy.pl';
+require "unicore/Heavy.pl";
+
+# Keys are lists of properties. Values are defined if have been tested.
+my %props;
+
+# To test for loose matching, add in the characters that are ignored there.
+my $extra_chars = "-_ ";
+
+# The one internal property we accept
+$props{'Perl_Decimal_Digit'} = 1;
+my @list = prop_aliases("perldecimaldigit");
+is_deeply(\@list,
+          [ "Perl_Decimal_Digit",
+            "Perl_Decimal_Digit"
+          ], "prop_aliases('perldecimaldigit') returns Perl_Decimal_Digit as both short and full names");
+
+# Get the official Unicode property name synonyms and test them.
+open my $props, "<", "../lib/unicore/PropertyAliases.txt"
+                or die "Can't open Unicode PropertyAliases.txt";
+$/ = "\n";
+while (<$props>) {
+    s/\s*#.*//;           # Remove comments
+    next if /^\s* $/x;    # Ignore empty and comment lines
+
+    chomp;
+    my $count = 0;  # 0th field in line is short name; 1th is long name
+    my $short_name;
+    my $full_name;
+    my @names_via_short;
+    foreach my $alias (split /\s*;\s*/) {    # Fields are separated by
+                                             # semi-colons
+        # Add in the characters that are supposed to be ignored, to test loose
+        # matching, which the tested function does on all inputs.
+        my $mod_name = "$extra_chars$alias";
+
+        my $loose = &utf8::_loose_name(lc $alias);
+
+        # Indicate we have tested this.
+        $props{$loose} = 1;
+
+        my @all_names = prop_aliases($mod_name);
+        if (grep { $_ eq $loose } @Unicode::UCD::suppressed_properties) {
+            is(@all_names, 0, "prop_aliases('$mod_name') returns undef since $alias is not installed");
+            next;
+        }
+        elsif (! @all_names) {
+            fail("prop_aliases('$mod_name')");
+            diag("'$alias' is unknown to prop_aliases()");
+            next;
+        }
+
+        if ($count == 0) {  # Is short name
+
+            @names_via_short = prop_aliases($mod_name);
+
+            # If the 0th test fails, no sense in continuing with the others
+            last unless is($names_via_short[0], $alias,
+                    "prop_aliases: '$alias' is the short name for '$mod_name'");
+            $short_name = $alias;
+        }
+        elsif ($count == 1) {   # Is full name
+
+            # Some properties have the same short and full name; no sense
+            # repeating the test if the same.
+            if ($alias ne $short_name) {
+                my @names_via_full = prop_aliases($mod_name);
+                is_deeply(\@names_via_full, \@names_via_short, "prop_aliases() returns the same list for both '$short_name' and '$mod_name'");
+            }
+
+            # Tests scalar context
+            is(prop_aliases($short_name), $alias,
+                "prop_aliases: '$alias' is the long name for '$short_name'");
+        }
+        else {  # Is another alias
+            is_deeply(\@all_names, \@names_via_short, "prop_aliases() returns the same list for both '$short_name' and '$mod_name'");
+            ok((grep { $_ =~ /^$alias$/i } @all_names),
+                "prop_aliases: '$alias' is listed as an alias for '$mod_name'");
+        }
+
+        $count++;
+    }
+}
+
+# Now test anything we can find that wasn't covered by the tests of the
+# official properties.  We have no way of knowing if mktables omitted a Perl
+# extension or not, but we do the best we can from its generated lists
+
+foreach my $alias (keys %utf8::loose_to_file_of) {
+    next if $alias =~ /=/;
+    my $lc_name = lc $alias;
+    my $loose = &utf8::_loose_name($lc_name);
+    next if exists $props{$loose};  # Skip if already tested
+    $props{$loose} = 1;
+    my $mod_name = "$extra_chars$alias";    # Tests loose matching
+    my @aliases = prop_aliases($mod_name);
+    my $found_it = grep { &utf8::_loose_name(lc $_) eq $lc_name } @aliases;
+    if ($found_it) {
+        pass("prop_aliases: '$lc_name' is listed as an alias for '$mod_name'");
+    }
+    elsif ($lc_name =~ /l[_&]$/) {
+
+        # These two names are special in that they don't appear in the
+        # returned list because they are discouraged from use.  Verify
+        # that they return the same list as a non-discouraged version.
+        my @LC = prop_aliases('Is_LC');
+        is_deeply(\@aliases, \@LC, "prop_aliases: '$lc_name' returns the same list as 'Is_LC'");
+    }
+    else {
+        my $stripped = $lc_name =~ s/^is//;
+
+        # Could be that the input includes a prefix 'is', which is rarely
+        # returned as an alias, so having successfully stripped it off above,
+        # try again.
+        if ($stripped) {
+            $found_it = grep { &utf8::_loose_name(lc $_) eq $lc_name } @aliases;
+        }
+
+        # If that didn't work, it could be that it's a block, which is always
+        # returned with a leading 'In_' to avoid ambiguity.  Try comparing
+        # with that stripped off.
+        if (! $found_it) {
+            $found_it = grep { &utf8::_loose_name(s/^In_(.*)/\L$1/r) eq $lc_name }
+                              @aliases;
+            # Could check that is a real block, but tests for invmap will
+            # likely pickup any errors, since this will be tested there.
+            $lc_name = "in$lc_name" if $found_it;   # Change for message below
+        }
+        my $message = "prop_aliases: '$lc_name' is listed as an alias for '$mod_name'";
+        ($found_it) ? pass($message) : fail($message);
+    }
+}
+
+my $done_equals = 0;
+foreach my $alias (keys %utf8::stricter_to_file_of) {
+    if ($alias =~ /=/) {    # Only test one case where there is an equals
+        next if $done_equals;
+        $done_equals = 1;
+    }
+    my $lc_name = lc $alias;
+    my @list = prop_aliases($alias);
+    if ($alias =~ /^_/) {
+        is(@list, 0, "prop_aliases: '$lc_name' returns an empty list since it is internal_only");
+    }
+    elsif ($alias =~ /=/) {
+        is(@list, 0, "prop_aliases: '$lc_name' returns an empty list since is illegal property name");
+    }
+    else {
+        ok((grep { lc $_ eq $lc_name } @list),
+                "prop_aliases: '$lc_name' is listed as an alias for '$alias'");
+    }
+}
+
+use Unicode::UCD qw(prop_value_aliases);
+
+is(prop_value_aliases("unknown property", "unknown value"), undef,
+    "prop_value_aliases(<unknown property>, <unknown value>) returns <undef>");
+is(prop_value_aliases(undef, undef), undef,
+                           "prop_value_aliases(undef, undef) returns <undef>");
+is((prop_value_aliases("na", "A")), "A", "test that prop_value_aliases returns its input for properties that don't have synonyms");
+is(prop_value_aliases("isgc", "C"), undef, "prop_value_aliases('isgc', 'C') returns <undef> since is not covered Perl extension");
+is(prop_value_aliases("gc", "isC"), undef, "prop_value_aliases('gc', 'isC') returns <undef> since is not covered Perl extension");
+
+# We have no way of knowing if mktables omitted a Perl extension that it
+# shouldn't have, but we can check if it omitted an official Unicode property
+# name synonym.  And for those, we can check if the short and full names are
+# correct.
+
+my %pva_tested;   # List of things already tested.
+open my $propvalues, "<", "../lib/unicore/PropValueAliases.txt"
+     or die "Can't open Unicode PropValueAliases.txt";
+while (<$propvalues>) {
+    s/\s*#.*//;           # Remove comments
+    next if /^\s* $/x;    # Ignore empty and comment lines
+    chomp;
+
+    # Fix typo in official input file
+    s/CCC133/CCC132/g if $version eq "6.1.0";
+
+    my @fields = split /\s*;\s*/; # Fields are separated by semi-colons
+    my $prop = shift @fields;   # 0th field is the property,
+    my $count = 0;  # 0th field in line (after shifting off the property) is
+                    # short name; 1th is long name
+    my $short_name;
+    my @names_via_short;    # Saves the values between iterations
+
+    # The property on the lhs of the = is always loosely matched.  Add in
+    # characters that are ignored under loose matching to test that
+    my $mod_prop = "$extra_chars$prop";
+
+    if ($fields[0] eq 'n/a') {  # See comments in input file, essentially
+                                # means full name and short name are identical
+        $fields[0] = $fields[1];
+    }
+    elsif ($fields[0] ne $fields[1]
+           && &utf8::_loose_name(lc $fields[0])
+               eq &utf8::_loose_name(lc $fields[1])
+           && $fields[1] !~ /[[:upper:]]/)
+    {
+        # Also, there is a bug in the file in which "n/a" is omitted, and
+        # the two fields are identical except for case, and the full name
+        # is all lower case.  Copy the "short" name unto the full one to
+        # give it some upper case.
+
+        $fields[1] = $fields[0];
+    }
+
+    # The ccc property in the file is special; has an extra numeric field
+    # (0th), which should go at the end, since we use the next two fields as
+    # the short and full names, respectively.  See comments in input file.
+    splice (@fields, 0, 0, splice(@fields, 1, 2)) if $prop eq 'ccc';
+
+    my $loose_prop = &utf8::_loose_name(lc $prop);
+    my $suppressed = grep { $_ eq $loose_prop }
+                          @Unicode::UCD::suppressed_properties;
+    foreach my $value (@fields) {
+        if ($suppressed) {
+            is(prop_value_aliases($prop, $value), undef, "prop_value_aliases('$prop', '$value') returns undef for suppressed property $prop");
+            next;
+        }
+        elsif (grep { $_ eq ("$loose_prop=" . &utf8::_loose_name(lc $value)) } @Unicode::UCD::suppressed_properties) {
+            is(prop_value_aliases($prop, $value), undef, "prop_value_aliases('$prop', '$value') returns undef for suppressed property $prop=$value");
+            next;
+        }
+
+        # Add in test for loose matching.
+        my $mod_value = "$extra_chars$value";
+
+        # If the value is a number, optionally negative, including a floating
+        # point or rational numer, it should be only strictly matched, so the
+        # loose matching should fail.
+        if ($value =~ / ^ -? \d+ (?: [\/.] \d+ )? $ /x) {
+            is(prop_value_aliases($mod_prop, $mod_value), undef, "prop_value_aliases('$mod_prop', '$mod_value') returns undef because '$mod_value' should be strictly matched");
+
+            # And reset so below tests just the strict matching.
+            $mod_value = $value;
+        }
+
+        if ($count == 0) {
+
+            @names_via_short = prop_value_aliases($mod_prop, $mod_value);
+
+            # If the 0th test fails, no sense in continuing with the others
+            last unless is($names_via_short[0], $value, "prop_value_aliases: In '$prop', '$value' is the short name for '$mod_value'");
+            $short_name = $value;
+        }
+        elsif ($count == 1) {
+
+            # Some properties have the same short and full name; no sense
+            # repeating the test if the same.
+            if ($value ne $short_name) {
+                my @names_via_full =
+                            prop_value_aliases($mod_prop, $mod_value);
+                is_deeply(\@names_via_full, \@names_via_short, "In '$prop', prop_value_aliases() returns the same list for both '$short_name' and '$mod_value'");
+            }
+
+            # Tests scalar context
+            is(prop_value_aliases($prop, $short_name), $value, "'$value' is the long name for prop_value_aliases('$prop', '$short_name')");
+        }
+        else {
+            my @all_names = prop_value_aliases($mod_prop, $mod_value);
+            is_deeply(\@all_names, \@names_via_short, "In '$prop', prop_value_aliases() returns the same list for both '$short_name' and '$mod_value'");
+            ok((grep { &utf8::_loose_name(lc $_) eq &utf8::_loose_name(lc $value) } prop_value_aliases($prop, $short_name)), "'$value' is listed as an alias for prop_value_aliases('$prop', '$short_name')");
+        }
+
+        $pva_tested{&utf8::_loose_name(lc $prop) . "=" . &utf8::_loose_name(lc $value)} = 1;
+        $count++;
+    }
+}
+
+# And test as best we can, the non-official pva's that mktables generates.
+foreach my $hash (\%utf8::loose_to_file_of, \%utf8::stricter_to_file_of) {
+    foreach my $test (keys %$hash) {
+        next if exists $pva_tested{$test};  # Skip if already tested
+
+        my ($prop, $value) = split "=", $test;
+        next unless defined $value; # prop_value_aliases() requires an input
+                                    # 'value'
+        my $mod_value;
+        if ($hash == \%utf8::loose_to_file_of) {
+
+            # Add extra characters to test loose-match rhs value
+            $mod_value = "$extra_chars$value";
+        }
+        else { # Here value is strictly matched.
+
+            # Extra elements are added by mktables to this hash so that
+            # something like "age=6.0" has a synonym of "age=6".  It's not
+            # clear to me (khw) if we should be encouraging those synonyms, so
+            # don't test for them.
+            next if $value !~ /\D/ && exists $hash->{"$prop=$value.0"};
+
+            # Verify that loose matching fails when only strict is called for.
+            next unless is(prop_value_aliases($prop, "$extra_chars$value"), undef,
+                        "prop_value_aliases('$prop', '$extra_chars$value') returns undef since '$value' should be strictly matched"),
+
+            # Strict matching does allow for underscores between digits.  Test
+            # for that.
+            $mod_value = $value;
+            while ($mod_value =~ s/(\d)(\d)/$1_$2/g) {}
+        }
+
+        # The lhs property is always loosely matched, so add in extra
+        # characters to test that.
+        my $mod_prop = "$extra_chars$prop";
+
+        if ($prop eq 'gc' && $value =~ /l[_&]$/) {
+            # These two names are special in that they don't appear in the
+            # returned list because they are discouraged from use.  Verify
+            # that they return the same list as a non-discouraged version.
+            my @LC = prop_value_aliases('gc', 'lc');
+            my @l_ = prop_value_aliases($mod_prop, $mod_value);
+            is_deeply(\@l_, \@LC, "prop_value_aliases('$mod_prop', '$mod_value) returns the same list as prop_value_aliases('gc', 'lc')");
+        }
+        else {
+            ok((grep { &utf8::_loose_name(lc $_) eq &utf8::_loose_name(lc $value) }
+                prop_value_aliases($mod_prop, $mod_value)),
+                "'$value' is listed as an alias for prop_value_aliases('$mod_prop', '$mod_value')");
+        }
+    }
+}
+
+undef %pva_tested;
+
+no warnings 'once'; # We use some values once from 'required' modules.
+
+use Unicode::UCD qw(prop_invlist prop_invmap MAX_CP);
+
+# There were some problems with caching interfering with prop_invlist() vs
+# prop_invmap() on binary properties, and also between the 3 properties where
+# Perl used the same 'To' name as another property (see utf8_heavy.pl).
+# So, before testing all of prop_invlist(),
+#   1)  call prop_invmap() to try both orders of these name issues.  This uses
+#       up two of the 3 properties;  the third will be left so that invlist()
+#       on it gets called before invmap()
+#   2)  call prop_invmap() on a generic binary property, ahead of invlist().
+# This should test that the caching works in both directions.
+
+# These properties are not stable between Unicode versions, but the first few
+# elements are; just look at the first element to see if are getting the
+# distinction right.  The general inversion map testing below will test the
+# whole thing.
+my $prop = "uc";
+my ($invlist_ref, $invmap_ref, $format, $missing) = prop_invmap($prop);
+is($format, 'al', "prop_invmap() format of '$prop' is 'al'");
+is($missing, '0', "prop_invmap() missing of '$prop' is '0'");
+is($invlist_ref->[1], 0x61, "prop_invmap('$prop') list[1] is 0x61");
+is($invmap_ref->[1], 0x41, "prop_invmap('$prop') map[1] is 0x41");
+
+$prop = "upper";
+($invlist_ref, $invmap_ref, $format, $missing) = prop_invmap($prop);
+is($format, 's', "prop_invmap() format of '$prop' is 's");
+is($missing, 'N', "prop_invmap() missing of '$prop' is 'N'");
+is($invlist_ref->[1], 0x41, "prop_invmap('$prop') list[1] is 0x41");
+is($invmap_ref->[1], 'Y', "prop_invmap('$prop') map[1] is 'Y'");
+
+$prop = "lower";
+($invlist_ref, $invmap_ref, $format, $missing) = prop_invmap($prop);
+is($format, 's', "prop_invmap() format of '$prop' is 's'");
+is($missing, 'N', "prop_invmap() missing of '$prop' is 'N'");
+is($invlist_ref->[1], 0x61, "prop_invmap('$prop') list[1] is 0x61");
+is($invmap_ref->[1], 'Y', "prop_invmap('$prop') map[1] is 'Y'");
+
+$prop = "lc";
+($invlist_ref, $invmap_ref, $format, $missing) = prop_invmap($prop);
+is($format, 'al', "prop_invmap() format of '$prop' is 'al'");
+is($missing, '0', "prop_invmap() missing of '$prop' is '0'");
+is($invlist_ref->[1], 0x41, "prop_invmap('$prop') list[1] is 0x41");
+is($invmap_ref->[1], 0x61, "prop_invmap('$prop') map[1] is 0x61");
+
+# This property is stable and small, so can test all of it
+$prop = "ASCII_Hex_Digit";
+($invlist_ref, $invmap_ref, $format, $missing) = prop_invmap($prop);
+is($format, 's', "prop_invmap() format of '$prop' is 's'");
+is($missing, 'N', "prop_invmap() missing of '$prop' is 'N'");
+is_deeply($invlist_ref, [ 0x0000, 0x0030, 0x003A, 0x0041,
+                          0x0047, 0x0061, 0x0067, 0x110000 ],
+          "prop_invmap('$prop') code point list is correct");
+is_deeply($invmap_ref, [ 'N', 'Y', 'N', 'Y', 'N', 'Y', 'N', 'N' ] ,
+          "prop_invmap('$prop') map list is correct");
+
+is(prop_invlist("Unknown property"), undef, "prop_invlist(<Unknown property>) returns undef");
+is(prop_invlist(undef), undef, "prop_invlist(undef) returns undef");
+is(prop_invlist("Any"), 2, "prop_invlist('Any') returns the number of elements in scalar context");
+my @invlist = prop_invlist("Is_Any");
+is_deeply(\@invlist, [ 0, 0x110000 ], "prop_invlist works on 'Is_' prefixes");
+is(prop_invlist("Is_Is_Any"), undef, "prop_invlist('Is_Is_Any') returns <undef> since two is's");
+
+use Storable qw(dclone);
+
+is(prop_invlist("InKana"), undef, "prop_invlist(<user-defined property returns undef>)");
+
+# The way both the tests for invlist and invmap work is that they take the
+# lists returned by the functions and construct from them what the original
+# file should look like, which are then compared with the file.  If they are
+# identical, the test passes.  What this tests isn't that the results are
+# correct, but that invlist and invmap haven't introduced errors beyond what
+# are there in the files.  As a small hedge against that, test some
+# prop_invlist() tables fully with the known correct result.  We choose
+# ASCII_Hex_Digit again, as it is stable.
+@invlist = prop_invlist("AHex");
+is_deeply(\@invlist, [ 0x0030, 0x003A, 0x0041,
+                                 0x0047, 0x0061, 0x0067 ],
+          "prop_invlist('AHex') is exactly the expected set of points");
+@invlist = prop_invlist("AHex=f");
+is_deeply(\@invlist, [ 0x0000, 0x0030, 0x003A, 0x0041,
+                                 0x0047, 0x0061, 0x0067 ],
+          "prop_invlist('AHex=f') is exactly the expected set of points");
+
+sub fail_with_diff ($$$$) {
+    # For use below to output better messages
+    my ($prop, $official, $constructed, $tested_function_name) = @_;
+
+    is($constructed, $official, "$tested_function_name('$prop')");
+    diag("Comment out lines " . (__LINE__ - 1) . " through " . (__LINE__ + 1) . " in '$0' on Un*x-like systems to see just the differences.  Uses the 'diff' first in your \$PATH");
+    return;
+
+    fail("$tested_function_name('$prop')");
+
+    require File::Temp;
+    my $off = File::Temp->new();
+    chomp $official;
+    print $off $official, "\n";
+    close $off || die "Can't close official";
+
+    chomp $constructed;
+    my $gend = File::Temp->new();
+    print $gend $constructed, "\n";
+    close $gend || die "Can't close gend";
+
+    my $diff = File::Temp->new();
+    system("diff $off $gend > $diff");
+
+    open my $fh, "<", $diff || die "Can't open $diff";
+    my @diffs = <$fh>;
+    diag("In the diff output below '<' marks lines from the filesystem tables;\n'>' are from $tested_function_name()");
+    diag(@diffs);
+}
+
+my %tested_invlist;
+
+# Look at everything we think that mktables tells us exists, both loose and
+# strict
+foreach my $set_of_tables (\%utf8::stricter_to_file_of, \%utf8::loose_to_file_of)
+{
+    foreach my $table (keys %$set_of_tables) {
+
+        my $mod_table;
+        my ($prop_only, $value) = split "=", $table;
+        if (defined $value) {
+
+            # If this is to be loose matched, add in characters to test that.
+            if ($set_of_tables == \%utf8::loose_to_file_of) {
+                $value = "$extra_chars$value";
+            }
+            else {  # Strict match
+
+                # Verify that loose matching fails when only strict is called
+                # for.
+                next unless is(prop_invlist("$prop_only=$extra_chars$value"), undef, "prop_invlist('$prop_only=$extra_chars$value') returns undef since should be strictly matched");
+
+                # Strict matching does allow for underscores between digits.
+                # Test for that.
+                while ($value =~ s/(\d)(\d)/$1_$2/g) {}
+            }
+
+            # The property portion in compound form specifications always
+            # matches loosely
+            $mod_table = "$extra_chars$prop_only = $value";
+        }
+        else {  # Single-form.
+
+            # Like above, use looose if required, and insert underscores
+            # between digits if strict.
+            if ($set_of_tables == \%utf8::loose_to_file_of) {
+                $mod_table = "$extra_chars$table";
+            }
+            else {
+                $mod_table = $table;
+                while ($mod_table =~ s/(\d)(\d)/$1_$2/g) {}
+            }
+        }
+
+        my @tested = prop_invlist($mod_table);
+        if ($table =~ /^_/) {
+            is(@tested, 0, "prop_invlist('$mod_table') returns an empty list since is internal-only");
+            next;
+        }
+
+        # If we have already tested a property that uses the same file, this
+        # list should be identical to the one that was tested, and can bypass
+        # everything else.
+        my $file = $set_of_tables->{$table};
+        if (exists $tested_invlist{$file}) {
+            is_deeply(\@tested, $tested_invlist{$file}, "prop_invlist('$mod_table') gave same results as its name synonym");
+            next;
+        }
+        $tested_invlist{$file} = dclone \@tested;
+
+        # A leading '!' in the file name means that it is to be inverted.
+        my $invert = $file =~ s/^!//;
+        my $official = do "unicore/lib/$file.pl";
+
+        # Get rid of any trailing space and comments in the file.
+        $official =~ s/\s*(#.*)?$//mg;
+        chomp $official;
+
+        # If we are to test against an inverted file, it is easier to invert
+        # our array than the file.
+        # The file only is valid for Unicode code points, while the inversion
+        # list is valid for all possible code points.  Therefore, we must test
+        # just the Unicode part against the file.  Later we will test for
+        # the non-Unicode part.
+
+        my $before_invert;  # Saves the pre-inverted table.
+        if ($invert) {
+            $before_invert = dclone \@tested;
+            if (@tested && $tested[0] == 0) {
+                shift @tested;
+            } else {
+                unshift @tested, 0;
+            }
+            if (@tested && $tested[-1] == 0x110000) {
+                pop @tested;
+            }
+            else {
+                push @tested, 0x110000;
+            }
+        }
+
+        # Now construct a string from the list that should match the file.
+        # The file gives ranges of code points with starting and ending values
+        # in hex, like this:
+        # 0041\t005A
+        # 0061\t007A
+        # 00AA
+        # Our list has even numbered elements start ranges that are in the
+        # list, and odd ones that aren't in the list.  Therefore the odd
+        # numbered ones are one beyond the end of the previous range, but
+        # otherwise don't get reflected in the file.
+        my $tested = "";
+        my $i = 0;
+        for (; $i < @tested - 1; $i += 2) {
+            my $start = $tested[$i];
+            my $end = $tested[$i+1] - 1;
+            if ($start == $end) {
+                $tested .= sprintf("%04X\n", $start);
+            }
+            else {
+                $tested .= sprintf "%04X\t%04X\n", $start, $end;
+            }
+        }
+
+        # As mentioned earlier, the disk files only go up through Unicode,
+        # whereas the prop_invlist() ones go as high as necessary.  The
+        # comparison is only valid through max Unicode.
+        if ($i == @tested - 1 && $tested[$i] <= 0x10FFFF) {
+            $tested .= sprintf("%04X\t10FFFF\n", $tested[$i]);
+        }
+        chomp $tested;
+        if ($tested ne $official) {
+            fail_with_diff($mod_table, $official, $tested, "prop_invlist");
+            next;
+        }
+
+        # Here, it matched the table.  Now need to check for if it is correct
+        # for beyond Unicode.  First, calculate if is the default table or
+        # not.  This is the same algorithm as used internally in
+        # prop_invlist(), so if it is wrong there, this test won't catch it.
+        my $prop = lc $table;
+        ($prop_only, $table) = split /\s*[:=]\s*/, $prop;
+        if (defined $table) {
+
+            # May have optional prefixed 'is'
+            $prop = &utf8::_loose_name($prop_only) =~ s/^is//r;
+            $prop = $utf8::loose_property_name_of{$prop};
+            $prop .= "=" . &utf8::_loose_name($table);
+        }
+        else {
+            $prop = &utf8::_loose_name($prop);
+        }
+        my $is_default = exists $Unicode::UCD::loose_defaults{$prop};
+
+        @tested = @$before_invert if $invert;    # Use the original
+        if (@tested % 2 == 0) {
+
+            # If there are an even number of elements, the final one starts a
+            # range (going to infinity) of code points that are not in the
+            # list.
+            if ($is_default) {
+                fail("prop_invlist('$mod_table')");
+                diag("default table doesn't goto infinity");
+                use Data::Dumper;
+                diag Dumper \@tested;
+                next;
+            }
+        }
+        else {
+            # An odd number of elements means the final one starts a range
+            # (going to infinity of code points that are in the list.
+            if (! $is_default) {
+                fail("prop_invlist('$mod_table')");
+                diag("non-default table needs to stop in the Unicode range");
+                use Data::Dumper;
+                diag Dumper \@tested;
+                next;
+            }
+        }
+
+        pass("prop_invlist('$mod_table')");
+    }
+}
+
+# Now test prop_invmap().
+
+@list = prop_invmap("Unknown property");
+is (@list, 0, "prop_invmap(<Unknown property>) returns an empty list");
+@list = prop_invmap(undef);
+is (@list, 0, "prop_invmap(undef) returns an empty list");
+ok (! eval "prop_invmap('gc')" && $@ ne "",
+                                "prop_invmap('gc') dies in scalar context");
+@list = prop_invmap("_X_Begin");
+is (@list, 0, "prop_invmap(<internal property>) returns an empty list");
+@list = prop_invmap("InKana");
+is(@list, 0, "prop_invmap(<user-defined property returns undef>)");
+@list = prop_invmap("Perl_Decomposition_Mapping"), undef,
+is(@list, 0, "prop_invmap('Perl_Decomposition_Mapping') returns <undef> since internal-Perl-only");
+@list = prop_invmap("Perl_Charnames"), undef,
+is(@list, 0, "prop_invmap('Perl_Charnames') returns <undef> since internal-Perl-only");
+@list = prop_invmap("Is_Is_Any");
+is(@list, 0, "prop_invmap('Is_Is_Any') returns <undef> since two is's");
+
+# The set of properties to test on has already been compiled into %props by
+# the prop_aliases() tests.
+
+my %tested_invmaps;
+
+# Like prop_invlist(), prop_invmap() is tested by comparing the results
+# returned by the function with the tables that mktables generates.  Some of
+# these tables are directly stored as files on disk, in either the unicore or
+# unicore/To directories, and most should be listed in the mktables generated
+# hash %utf8::loose_property_to_file_of, with a few additional ones that this
+# handles specially.  For these, the files are read in directly, massaged, and
+# compared with what invmap() returns.  The SPECIALS hash in some of these
+# files overrides values in the main part of the file.
+#
+# The other properties are tested indirectly by generating all the possible
+# inversion lists for the property, and seeing if those match the inversion
+# lists returned by prop_invlist(), which has already been tested.
+
+PROPERTY:
+foreach my $prop (keys %props) {
+    my $loose_prop = &utf8::_loose_name(lc $prop);
+    my $suppressed = grep { $_ eq $loose_prop }
+                          @Unicode::UCD::suppressed_properties;
+
+    # Find the short and full names that this property goes by
+    my ($name, $full_name) = prop_aliases($prop);
+    if (! $name) {
+        if (! $suppressed) {
+            fail("prop_invmap('$prop')");
+            diag("is unknown to prop_aliases(), and we need it in order to test prop_invmap");
+        }
+        next PROPERTY;
+    }
+
+    # Normalize the short name, as it is stored in the hashes under the
+    # normalized version.
+    $name = &utf8::_loose_name(lc $name);
+
+    # Add in the characters that are supposed to be ignored to test loose
+    # matching, which the tested function applies to all properties
+    my $mod_prop = "$extra_chars$prop";
+
+    my ($invlist_ref, $invmap_ref, $format, $missing) = prop_invmap($mod_prop);
+    my $return_ref = [ $invlist_ref, $invmap_ref, $format, $missing ];
+
+    # If have already tested this property under a different name, merely
+    # compare the return from now with the saved one from before.
+    if (exists $tested_invmaps{$name}) {
+        is_deeply($return_ref, $tested_invmaps{$name}, "prop_invmap('$mod_prop') gave same results as its synonym, '$name'");
+        next PROPERTY;
+    }
+    $tested_invmaps{$name} = dclone $return_ref;
+
+    # If prop_invmap() returned nothing, is ok iff is a property whose file is
+    # not generated.
+    if ($suppressed) {
+        if (defined $format) {
+            fail("prop_invmap('$mod_prop')");
+            diag("did not return undef for suppressed property $prop");
+        }
+        next PROPERTY;
+    }
+    elsif (!defined $format) {
+        fail("prop_invmap('$mod_prop')");
+        diag("'$prop' is unknown to prop_invmap()");
+        next PROPERTY;
+    }
+
+    # The two parallel arrays must have the same number of elements.
+    if (@$invlist_ref != @$invmap_ref) {
+        fail("prop_invmap('$mod_prop')");
+        diag("invlist has "
+             . scalar @$invlist_ref
+             . " while invmap has "
+             . scalar @$invmap_ref
+             . " elements");
+        next PROPERTY;
+    }
+
+    # The last element must be for the above-Unicode code points, and must be
+    # for the default value.
+    if ($invlist_ref->[-1] != 0x110000) {
+        fail("prop_invmap('$mod_prop')");
+        diag("The last inversion list element is not 0x110000");
+        next PROPERTY;
+    }
+    if ($invmap_ref->[-1] ne $missing) {
+        fail("prop_invmap('$mod_prop')");
+        diag("The last inversion list element is '$invmap_ref->[-1]', and should be '$missing'");
+        next PROPERTY;
+    }
+
+    if ($name eq 'bmg') {   # This one has an atypical $missing
+        if ($missing ne "") {
+            fail("prop_invmap('$mod_prop')");
+            diag("The missings should be \"\"; got '$missing'");
+            next PROPERTY;
+        }
+    }
+    elsif ($format =~ /^ a (?!r) /x) {
+        if ($full_name eq 'Perl_Decimal_Digit') {
+            if ($missing ne "") {
+                fail("prop_invmap('$mod_prop')");
+                diag("The missings should be \"\"; got '$missing'");
+                next PROPERTY;
+            }
+        }
+        elsif ($missing ne "0") {
+            fail("prop_invmap('$mod_prop')");
+            diag("The missings should be '0'; got '$missing'");
+            next PROPERTY;
+        }
+    }
+    elsif ($missing =~ /[<>]/) {
+        fail("prop_invmap('$mod_prop')");
+        diag("The missings should NOT be something with <...>'");
+        next PROPERTY;
+
+        # I don't want to hard code in what all the missings should be, so
+        # those don't get fully tested.
+    }
+
+    # Certain properties don't have their own files, but must be constructed
+    # using proxies.
+    my $proxy_prop = $name;
+    if ($full_name eq 'Present_In') {
+        $proxy_prop = "age";    # The maps for these two props are identical
+    }
+    elsif ($full_name eq 'Simple_Case_Folding'
+           || $full_name =~ /Simple_ (.) .*? case_Mapping  /x)
+    {
+        if ($full_name eq 'Simple_Case_Folding') {
+            $proxy_prop = 'cf';
+        }
+        else {
+            # We captured the U, L, or T, leading to uc, lc, or tc.
+            $proxy_prop = lc $1 . "c";
+        }
+        if ($format ne "a") {
+            fail("prop_invmap('$mod_prop')");
+            diag("The format should be 'a'; got '$format'");
+            next PROPERTY;
+        }
+    }
+
+    if ($format !~ / ^ (?: a [der]? | ale? | n | sl? ) $ /x) {
+        fail("prop_invmap('$mod_prop')");
+        diag("Unknown format '$format'");
+        next PROPERTY;
+    }
+
+    my $base_file;
+    my $official;
+
+    # Handle the properties that have full disk files for them (except the
+    # Name property which is structurally enough different that it is handled
+    # separately below.)
+    if ($name ne 'na'
+        && ($name eq 'blk'
+            || defined
+                    ($base_file = $utf8::loose_property_to_file_of{$proxy_prop})
+            || exists $utf8::loose_to_file_of{$proxy_prop}
+            || $name eq "dm"))
+    {
+        # In the above, blk is done unconditionally, as we need to test that
+        # the old-style block names are returned, even if mktables has
+        # generated a file for the new-style; the test for dm comes afterward,
+        # so that if a file has been generated for it explicitly, we use that
+        # file (which is valid, unlike blk) instead of the combo
+        # Decomposition.pl files.
+        my $file;
+        my $is_binary = 0;
+        if ($name eq 'blk') {
+
+            # The blk property is special.  The original file with old block
+            # names is retained, and the default is to not write out a
+            # new-name file.  What we do is get the old names into a data
+            # structure, and from that create what the new file would look
+            # like.  $base_file is needed to be defined, just to avoid a
+            # message below.
+            $base_file = "This is a dummy name";
+            my $blocks_ref = charblocks();
+            $official = "";
+            for my $range (sort { $a->[0][0] <=> $b->[0][0] }
+                           values %$blocks_ref)
+            {
+                # Translate the charblocks() data structure to what the file
+                # would like.
+                $official .= sprintf"%04X\t%04X\t%s\n",
+                             $range->[0][0],
+                             $range->[0][1],
+                             $range->[0][2];
+            }
+        }
+        else {
+            $base_file = "Decomposition" if $format eq 'ad';
+
+            # Above leaves $base_file undefined only if it came from the hash
+            # below.  This should happen only when it is a binary property
+            # (and are accessing via a single-form name, like 'In_Latin1'),
+            # and so it is stored in a different directory than the To ones.
+            # XXX Currently, the only cases where it is complemented are the
+            # ones that have no code points.  And it works out for these that
+            # 1) complementing them, and then 2) adding or subtracting the
+            # initial 0 and final 110000 cancel each other out.  But further
+            # work would be needed in the unlikely event that an inverted
+            # property comes along without these characteristics
+            if (!defined $base_file) {
+                $base_file = $utf8::loose_to_file_of{$proxy_prop};
+                $is_binary = ($base_file =~ s/^!//) ? -1 : 1;
+                $base_file = "lib/$base_file";
+            }
+
+            # Read in the file
+            $file = "unicore/$base_file.pl";
+            $official = do $file;
+
+            # Get rid of any trailing space and comments in the file.
+            $official =~ s/\s*(#.*)?$//mg;
+
+            if ($format eq 'ad') {
+                my @official = split /\n/, $official;
+                $official = "";
+                foreach my $line (@official) {
+                    my ($start, $end, $value)
+                                    = $line =~ / ^ (.+?) \t (.*?) \t (.+?)
+                                                \s* ( \# .* )? $ /x;
+                    # Decomposition.pl also has the <compatible> types in it,
+                    # which should be removed.
+                    $value =~ s/<.*?> //;
+                    $official .= "$start\t\t$value\n";
+
+                    # If this is a multi-char range, we turn it into as many
+                    # single character ranges as necessary.  This makes things
+                    # easier below.
+                    if ($end ne "") {
+                        for my $i (hex($start) + 1 .. hex $end) {
+                            $official .= sprintf "%04X\t\t%s\n", $i, $value;
+                        }
+                    }
+                }
+            }
+        }
+        chomp $official;
+
+        # If there are any special elements, get a reference to them.
+        my $swash_name = $utf8::file_to_swash_name{$base_file};
+        my $specials_ref;
+        if ($swash_name) {
+            $specials_ref = $utf8::SwashInfo{$swash_name}{'specials_name'};
+            if ($specials_ref) {
+
+                # Convert from the name to the actual reference.
+                no strict 'refs';
+                $specials_ref = \%{$specials_ref};
+            }
+        }
+
+        # Certain of the proxy properties have to be adjusted to match the
+        # real ones.
+        if ($full_name =~ /^(Case_Folding|(Lower|Title|Upper)case_Mapping)/) {
+
+            # Here we have either
+            #   1) Case_Folding; or
+            #   2) a proxy that is a full mapping, which means that what the
+            #      real property is is the equivalent simple mapping.
+            # In both cases, the file will have a standard list containing
+            # simple mappings (to a single code point), and a specials hash
+            # which contains all the mappings that are to multiple code
+            # points.  First, extract a list containing all the file's simple
+            # mappings.
+            my @list;
+            for (split "\n", $official) {
+                my ($start, $end, $value) = / ^ (.+?) \t (.*?) \t (.+?)
+                                                \s* ( \# .* )? $ /x;
+                $end = $start if $end eq "";
+                push @list, [ hex $start, hex $end, $value ];
+            }
+
+            # For these mappings, the file contains all the simple mappings,
+            # including the ones that are overridden by the specials.  These
+            # need to be removed as the list is for just the full ones.
+
+            # Go through any special mappings one by one.  They are packed.
+            my $i = 0;
+            foreach my $utf8_cp (sort keys %$specials_ref) {
+                my $cp = unpack("C0U", $utf8_cp);
+
+                # Find the spot in the @list of simple mappings that this
+                # special applies to; uses a linear search.
+                while ($i < @list -1 ) {
+                    last if  $cp <= $list[$i][1];
+                    $i++;
+                }
+
+                # Here $i is such that it points to the first range which ends
+                # at or above cp, and hence is the only range that could
+                # possibly contain it.
+
+                # If not in this range, no range contains it: nothing to
+                # remove.
+                next if $cp < $list[$i][0];
+
+                # Otherwise, remove the existing entry.  If it is the first
+                # element of the range...
+                if ($cp == $list[$i][0]) {
+
+                    # ... and there are other elements in the range, just shorten
+                    # the range to exclude this code point.
+                    if ($list[$i][1] > $list[$i][0]) {
+                        $list[$i][0]++;
+                    }
+
+                    # ... but if it is the only element in the range, remove
+                    # it entirely.
+                    else {
+                        splice @list, $i, 1;
+                    }
+                }
+                else { # Is somewhere in the middle of the range
+                    # Split the range into two, excluding this one in the
+                    # middle
+                    splice @list, $i, 1,
+                           [ $list[$i][0], $cp - 1, $list[$i][2] ],
+                           [ $cp + 1, $list[$i][1], $list[$i][2] ];
+                }
+            }
+
+            # Here, have gone through all the specials, modifying @list as
+            # needed.  Turn it back into what the file should look like.
+            $official = "";
+            for my $element (@list) {
+                $official .= "\n" if $official;
+                if ($element->[1] == $element->[0]) {
+                    $official .= sprintf "%04X\t\t%s", $element->[0], $element->[2];
+                }
+                else {
+                    $official .= sprintf "%04X\t%04X\t%s", $element->[0], $element->[1], $element->[2];
+                }
+            }
+        }
+        elsif ($full_name =~ /Simple_(Case_Folding|(Lower|Title|Upper)case_Mapping)/)
+        {
+
+            # These properties have everything in the regular array, and the
+            # specials are superfluous.
+            undef $specials_ref;
+        }
+        elsif ($name eq 'bmg') {
+
+            # For this property, the file is output using hex notation for the
+            # map, with all ranges equal to length 1.  Convert from hex to
+            # decimal.
+            my @lines = split "\n", $official;
+            foreach my $line (@lines) {
+                my ($code_point, $map) = split "\t\t", $line;
+                $line = $code_point . "\t\t" . hex $map;
+            }
+            $official = join "\n", @lines;
+        }
+
+        # Here, in $official, we have what the file looks like, or should like
+        # if we've had to fix it up.  Now take the invmap() output and reverse
+        # engineer from that what the file should look like.  Each iteration
+        # appends the next line to the running string.
+        my $tested_map = "";
+
+        # Create a copy of the file's specials hash.  (It has been undef'd if
+        # we know it isn't relevant to this property, so if it exists, it's an
+        # error or is relevant).  As we go along, we delete from that copy.
+        # If a delete fails, or something is left over after we are done,
+        # it's an error
+        my %specials = %$specials_ref if $specials_ref;
+
+        # The extra -1 is because the final element has been tested above to
+        # be for anything above Unicode.  The file doesn't go that high.
+        for (my $i = 0; $i <  @$invlist_ref - 1; $i++) {
+
+            # If the map element is a reference, have to stringify it (but
+            # don't do so if the format doesn't allow references, so that an
+            # improper format will generate an error.
+            if (ref $invmap_ref->[$i]
+                && ($format eq 'ad' || $format =~ /^ . l /x))
+            {
+                # The stringification depends on the format.
+                if ($format eq 'sl') {
+
+                    # At the time of this writing, there are two types of 'sl'
+                    # format  One, in Name_Alias, has multiple separate entries
+                    # for each code point; the other, in Script_Extension, is space
+                    # separated.  Assume the latter for non-Name_Alias.
+                    if ($full_name ne 'Name_Alias') {
+                        $invmap_ref->[$i] = join " ", @{$invmap_ref->[$i]};
+                    }
+                    else {
+                        # For Name_Alias, we emulate the file.  Entries with
+                        # just one value don't need any changes, but we
+                        # convert the list entries into a series of lines for
+                        # the file, starting with the first name.  The
+                        # succeeding entries are on separate lines, with the
+                        # code point repeated for each one and then two tabs,
+                        # then the value.  Code at the end of the loop will
+                        # set up the first line with its code point and two
+                        # tabs before the value, just as it does for every
+                        # other property; thus the special handling of the
+                        # first line.
+                        if (ref $invmap_ref->[$i]) {
+                            my $hex_cp = sprintf("%04X", $invlist_ref->[$i]);
+                            my $concatenated = $invmap_ref->[$i][0];
+                            for (my $j = 1; $j < @{$invmap_ref->[$i]}; $j++) {
+                                $concatenated .= "\n$hex_cp\t\t" . $invmap_ref->[$i][$j];
+                            }
+                            $invmap_ref->[$i] = $concatenated;
+                        }
+                    }
+                }
+                elsif ($format =~ / ^ al e? $/x) {
+
+                    # For a al property, the stringified result should be in
+                    # the specials hash.  The key is the packed code point,
+                    # and the value is the packed map.
+                    my $value;
+                    if (! defined ($value = delete $specials{pack("C0U", $invlist_ref->[$i]) })) {
+                        fail("prop_invmap('$mod_prop')");
+                        diag(sprintf "There was no specials element for %04X", $invlist_ref->[$i]);
+                        next PROPERTY;
+                    }
+                    my $packed = pack "U*", @{$invmap_ref->[$i]};
+                    if ($value ne $packed) {
+                        fail("prop_invmap('$mod_prop')");
+                        diag(sprintf "For %04X, expected the mapping to be '$packed', but got '$value'");
+                        next PROPERTY;
+                    }
+
+                    # As this doesn't get tested when we later compare with
+                    # the actual file, it could be out of order and we
+                    # wouldn't know it.
+                    if (($i > 0 && $invlist_ref->[$i] <= $invlist_ref->[$i-1])
+                        || $invlist_ref->[$i] >= $invlist_ref->[$i+1])
+                    {
+                        fail("prop_invmap('$mod_prop')");
+                        diag(sprintf "Range beginning at %04X is out-of-order.", $invlist_ref->[$i]);
+                        next PROPERTY;
+                    }
+                    next;
+                }
+                elsif ($format eq 'ad') {
+
+                    # The decomposition mapping file has the code points as
+                    # a string of space-separated hex constants.
+                    $invmap_ref->[$i] = join " ", map { sprintf "%04X", $_ } @{$invmap_ref->[$i]};
+                }
+                else {
+                    fail("prop_invmap('$mod_prop')");
+                    diag("Can't handle format '$format'");
+                    next PROPERTY;
+                }
+            }
+            elsif ($format eq 'ad' || $format eq 'ale') {
+
+                # The numerics in the returned map are stored as adjusted
+                # decimal integers.  The defaults are 0, and don't appear in
+                # $official, and are excluded later, but the elements must be
+                # converted back to their hex values before comparing with
+                # $official, as these files, for backwards compatibility, are
+                # not stored as adjusted.  (There currently is only one ale
+                # property, nfkccf.  If that changed this would also have to.)
+                if ($invmap_ref->[$i] =~ / ^ -? \d+ $ /x
+                    && $invmap_ref->[$i] != 0)
+                {
+                    my $next = $invmap_ref->[$i] + 1;
+                    $invmap_ref->[$i] = sprintf("%04X", $invmap_ref->[$i]);
+
+                    # If there are other elements in this range they need to
+                    # be adjusted; they must individually be re-mapped.  Do
+                    # this by splicing in a new element into the list and the
+                    # map containing the remainder of the range.  Next time
+                    # through we will look at that (possibly splicing again
+                    # until the whole range is processed).
+                    if ($invlist_ref->[$i+1] > $invlist_ref->[$i] + 1) {
+                        splice @$invlist_ref, $i+1, 0,
+                                $invlist_ref->[$i] + 1;
+                        splice @$invmap_ref, $i+1, 0, $next;
+                    }
+                }
+                if ($format eq 'ale' && $invmap_ref->[$i] eq "") {
+
+                    # ale properties have maps to the empty string that also
+                    # should be in the specials hash, with the key the packed
+                    # code point, and the map just empty.
+                    my $value;
+                    if (! defined ($value = delete $specials{pack("C0U", $invlist_ref->[$i]) })) {
+                        fail("prop_invmap('$mod_prop')");
+                        diag(sprintf "There was no specials element for %04X", $invlist_ref->[$i]);
+                        next PROPERTY;
+                    }
+                    if ($value ne "") {
+                        fail("prop_invmap('$mod_prop')");
+                        diag(sprintf "For %04X, expected the mapping to be \"\", but got '$value'", $invlist_ref->[$i]);
+                        next PROPERTY;
+                    }
+
+                    # As this doesn't get tested when we later compare with
+                    # the actual file, it could be out of order and we
+                    # wouldn't know it.
+                    if (($i > 0 && $invlist_ref->[$i] <= $invlist_ref->[$i-1])
+                        || $invlist_ref->[$i] >= $invlist_ref->[$i+1])
+                    {
+                        fail("prop_invmap('$mod_prop')");
+                        diag(sprintf "Range beginning at %04X is out-of-order.", $invlist_ref->[$i]);
+                        next PROPERTY;
+                    }
+                    next;
+                }
+            }
+            elsif ($is_binary) { # These binary files don't have an explicit Y
+                $invmap_ref->[$i] =~ s/Y//;
+            }
+
+            # The file doesn't include entries that map to $missing, so don't
+            # include it in the built-up string.  But make sure that it is in
+            # the correct order in the input.
+            if ($invmap_ref->[$i] eq $missing) {
+                if (($i > 0 && $invlist_ref->[$i] <= $invlist_ref->[$i-1])
+                    || $invlist_ref->[$i] >= $invlist_ref->[$i+1])
+                {
+                    fail("prop_invmap('$mod_prop')");
+                    diag(sprintf "Range beginning at %04X is out-of-order.", $invlist_ref->[$i]);
+                    next PROPERTY;
+                }
+                next;
+            }
+
+            # The ad property has one entry which isn't in the file.
+            # Ignore it, but make sure it is in order.
+            if ($format eq 'ad'
+                && $invmap_ref->[$i] eq '<hangul syllable>'
+                && $invlist_ref->[$i] == 0xAC00)
+            {
+                if (($i > 0 && $invlist_ref->[$i] <= $invlist_ref->[$i-1])
+                    || $invlist_ref->[$i] >= $invlist_ref->[$i+1])
+                {
+                    fail("prop_invmap('$mod_prop')");
+                    diag(sprintf "Range beginning at %04X is out-of-order.", $invlist_ref->[$i]);
+                    next PROPERTY;
+                }
+                next;
+            }
+
+            # Finally have figured out what the map column in the file should
+            # be.  Append the line to the running string.
+            my $start = $invlist_ref->[$i];
+            my $end = $invlist_ref->[$i+1] - 1;
+            $end = ($start == $end) ? "" : sprintf("%04X", $end);
+            if ($invmap_ref->[$i] ne "") {
+                $tested_map .= sprintf "%04X\t%s\t%s\n", $start, $end, $invmap_ref->[$i];
+            }
+            elsif ($end ne "") {
+                $tested_map .= sprintf "%04X\t%s\n", $start, $end;
+            }
+            else {
+                $tested_map .= sprintf "%04X\n", $start;
+            }
+        } # End of looping over all elements.
+
+        # Here are done with generating what the file should look like
+
+        chomp $tested_map;
+
+        # And compare.
+        if ($tested_map ne $official) {
+            fail_with_diff($mod_prop, $official, $tested_map, "prop_invmap");
+            next PROPERTY;
+        }
+
+        # There shouldn't be any specials unaccounted for.
+        if (keys %specials) {
+            fail("prop_invmap('$mod_prop')");
+            diag("Unexpected specials: " . join ", ", keys %specials);
+            next PROPERTY;
+        }
+    }
+    elsif ($format eq 'n') {
+
+        # Handle the Name property similar to the above.  But the file is
+        # sufficiently different that it is more convenient to make a special
+        # case for it.  It is a combination of the Name, Unicode1_Name, and
+        # Name_Alias properties, and named sequences.  We need to remove all
+        # but the Name in order to do the comparison.
+
+        if ($missing ne "") {
+            fail("prop_invmap('$mod_prop')");
+            diag("The missings should be \"\"; got \"missing\"");
+            next PROPERTY;
+        }
+
+        $official = do "unicore/Name.pl";
+
+        # Get rid of the named sequences portion of the file.  These don't
+        # have a tab before the first blank on a line.
+        $official =~ s/ ^ [^\t]+ \  .*? \n //xmg;
+
+        # And get rid of the controls.  These are named in the file, but
+        # shouldn't be in the property.  This gets rid of the two ranges in
+        # one fell swoop, and also all the Unicode1_Name values that may not
+        # be in Name_Alias.
+        $official =~ s/ 00000 \t .* 0001F .*? \n//xs;
+        $official =~ s/ 0007F \t .* 0009F .*? \n//xs;
+
+        # And remove the aliases.  We read in the Name_Alias property, and go
+        # through them one by one.
+        my ($aliases_code_points, $aliases_maps, undef, undef)
+                                                = &prop_invmap('Name_Alias');
+        for (my $i = 0; $i < @$aliases_code_points; $i++) {
+            my $code_point = $aliases_code_points->[$i];
+
+            # Already removed these above.
+            next if $code_point <= 0x1F
+                    || ($code_point >= 0x7F && $code_point <= 0x9F);
+
+            my $hex_code_point = sprintf "%05X", $code_point;
+
+            # Convert to a list if not already to make the following loop
+            # control uniform.
+            $aliases_maps->[$i] = [ $aliases_maps->[$i] ]
+                                                if ! ref $aliases_maps->[$i];
+
+            # Remove each alias for this code point from the file
+            foreach my $alias (@{$aliases_maps->[$i]}) {
+
+                # Remove the alias type from the entry, retaining just the name.
+                $alias =~ s/:.*//;
+
+                $alias = quotemeta($alias);
+                $official =~ s/$hex_code_point \t $alias \n //x;
+            }
+        }
+        chomp $official;
+
+        # Here have adjusted the file.  We also have to adjust the returned
+        # inversion map by checking and deleting all the lines in it that
+        # won't be in the file.  These are the lines that have generated
+        # things, like <hangul syllable>.
+        my $tested_map = "";        # Current running string
+        my @code_point_in_names =
+                               @Unicode::UCD::code_points_ending_in_code_point;
+
+        for my $i (0 .. @$invlist_ref - 1 - 1) {
+            my $start = $invlist_ref->[$i];
+            my $end = $invlist_ref->[$i+1] - 1;
+            if ($invmap_ref->[$i] eq $missing) {
+                if (($i > 0 && $invlist_ref->[$i] <= $invlist_ref->[$i-1])
+                    || $invlist_ref->[$i] >= $invlist_ref->[$i+1])
+                {
+                    fail("prop_invmap('$mod_prop')");
+                    diag(sprintf "Range beginning at %04X is out-of-order.", $invlist_ref->[$i]);
+                    next PROPERTY;
+                }
+                next;
+            }
+            if ($invmap_ref->[$i] =~ / (.*) ( < .*? > )/x) {
+                my $name = $1;
+                my $type = $2;
+                if (($i > 0 && $invlist_ref->[$i] <= $invlist_ref->[$i-1])
+                    || $invlist_ref->[$i] >= $invlist_ref->[$i+1])
+                {
+                    fail("prop_invmap('$mod_prop')");
+                    diag(sprintf "Range beginning at %04X is out-of-order.", $invlist_ref->[$i]);
+                    next PROPERTY;
+                }
+                if ($type eq "<hangul syllable>") {
+                    if ($name ne "") {
+                        fail("prop_invmap('$mod_prop')");
+                        diag("Unexpected text in $invmap_ref->[$i]");
+                        next PROPERTY;
+                    }
+                    if ($start != 0xAC00) {
+                        fail("prop_invmap('$mod_prop')");
+                        diag(sprintf("<hangul syllables> should begin at 0xAC00, got %04X", $start));
+                        next PROPERTY;
+                    }
+                    if ($end != $start + 11172 - 1) {
+                        fail("prop_invmap('$mod_prop')");
+                        diag(sprintf("<hangul syllables> should end at %04X, got %04X", $start + 11172 -1, $end));
+                        next PROPERTY;
+                    }
+                }
+                elsif ($type ne "<code point>") {
+                    fail("prop_invmap('$mod_prop')");
+                    diag("Unexpected text '$type' in $invmap_ref->[$i]");
+                    next PROPERTY;
+                }
+                else {
+
+                    # Look through the array of names that end in code points,
+                    # and look for this start and end.  If not found is an
+                    # error.  If found, delete it, and at the end, make sure
+                    # have deleted everything.
+                    for my $i (0 .. @code_point_in_names - 1) {
+                        my $hash = $code_point_in_names[$i];
+                        if ($hash->{'low'} == $start
+                            && $hash->{'high'} == $end
+                            && "$hash->{'name'}-" eq $name)
+                        {
+                            splice @code_point_in_names, $i, 1;
+                            last;
+                        }
+                        else {
+                            fail("prop_invmap('$mod_prop')");
+                            diag("Unexpected code-point-in-name line '$invmap_ref->[$i]'");
+                            next PROPERTY;
+                        }
+                    }
+                }
+
+                next;
+            }
+
+            # Have adjusted the map, as needed.  Append to running string.
+            $end = ($start == $end) ? "" : sprintf("%05X", $end);
+            $tested_map .= sprintf "%05X\t%s\n", $start, $invmap_ref->[$i];
+        }
+
+        # Finished creating the string from the inversion map.  Can compare
+        # with what the file is.
+        chomp $tested_map;
+        if ($tested_map ne $official) {
+            fail_with_diff($mod_prop, $official, $tested_map, "prop_invmap");
+            next PROPERTY;
+        }
+        if (@code_point_in_names) {
+            fail("prop_invmap('$mod_prop')");
+            use Data::Dumper;
+            diag("Missing code-point-in-name line(s)" . Dumper \@code_point_in_names);
+            next PROPERTY;
+        }
+    }
+    elsif ($format eq 's') {
+
+        # Here the map is not more or less directly from a file stored on
+        # disk.  We try a different tack.  These should all be properties that
+        # have just a few possible values (most of them are  binary).  We go
+        # through the map list, sorting each range into buckets, one for each
+        # map value.  Thus for binary properties there will be a bucket for Y
+        # and one for N.  The buckets are inversion lists.  We compare each
+        # constructed inversion list with what we would get for it using
+        # prop_invlist(), which has already been tested.  If they all match,
+        # the whole map must have matched.
+        my %maps;
+        my $previous_map;
+
+        # (The extra -1 is to not look at the final element in the loop, which
+        # we know is the one that starts just beyond Unicode and goes to
+        # infinity.)
+        for my $i (0 .. @$invlist_ref - 1 - 1) {
+            my $range_start = $invlist_ref->[$i];
+
+            # Because we are sorting into buckets, things could be
+            # out-of-order here, and still be in the correct order in the
+            # bucket, and hence wouldn't show up as an error; so have to
+            # check.
+            if (($i > 0 && $range_start <= $invlist_ref->[$i-1])
+                || $range_start >= $invlist_ref->[$i+1])
+            {
+                fail("prop_invmap('$mod_prop')");
+                diag(sprintf "Range beginning at %04X is out-of-order.", $invlist_ref->[$i]);
+                next PROPERTY;
+            }
+
+            # This new range closes out the range started in the previous
+            # iteration.
+            push @{$maps{$previous_map}}, $range_start if defined $previous_map;
+
+            # And starts a range which will be closed in the next iteration.
+            $previous_map = $invmap_ref->[$i];
+            push @{$maps{$previous_map}}, $range_start;
+        }
+
+        # The range we just started hasn't been closed, and we didn't look at
+        # the final element of the loop.  If that range is for the default
+        # value, it shouldn't be closed, as it is to extend to infinity.  But
+        # otherwise, it should end at the final Unicode code point, and the
+        # list that maps to the default value should have another element that
+        # does go to infinity for every above Unicode code point.
+
+        if (@$invlist_ref > 1) {
+            my $penultimate_map = $invmap_ref->[-2];
+            if ($penultimate_map ne $missing) {
+
+                # The -1th element contains the first non-Unicode code point.
+                push @{$maps{$penultimate_map}}, $invlist_ref->[-1];
+                push @{$maps{$missing}}, $invlist_ref->[-1];
+            }
+        }
+
+        # Here, we have the buckets (inversion lists) all constructed.  Go
+        # through each and verify that matches what prop_invlist() returns.
+        # We could use is_deeply() for the comparison, but would get multiple
+        # messages for each $prop.
+        foreach my $map (keys %maps) {
+            my @off_invlist = prop_invlist("$prop = $map");
+            my $min = (@off_invlist >= @{$maps{$map}})
+                       ? @off_invlist
+                       : @{$maps{$map}};
+            for my $i (0 .. $min- 1) {
+                if ($i > @off_invlist - 1) {
+                    fail("prop_invmap('$mod_prop')");
+                    diag("There is no element [$i] for $prop=$map from prop_invlist(), while [$i] in the implicit one constructed from prop_invmap() is '$maps{$map}[$i]'");
+                    next PROPERTY;
+                }
+                elsif ($i > @{$maps{$map}} - 1) {
+                    fail("prop_invmap('$mod_prop')");
+                    diag("There is no element [$i] from the implicit $prop=$map constructed from prop_invmap(), while [$i] in the one from prop_invlist() is '$off_invlist[$i]'");
+                    next PROPERTY;
+                }
+                elsif ($maps{$map}[$i] ne $off_invlist[$i]) {
+                    fail("prop_invmap('$mod_prop')");
+                    diag("Element [$i] of the implicit $prop=$map constructed from prop_invmap() is '$maps{$map}[$i]', and the one from prop_invlist() is '$off_invlist[$i]'");
+                    next PROPERTY;
+                }
+            }
+        }
+    }
+    else {  # Don't know this property nor format.
+
+        fail("prop_invmap('$mod_prop')");
+        diag("Unknown format '$format'");
+    }
+
+    pass("prop_invmap('$mod_prop')");
+}
+
+done_testing();
author	sthen <sthen@openbsd.org>	2013-03-25 20:06:16 +0000
committer	sthen <sthen@openbsd.org>	2013-03-25 20:06:16 +0000
commit	898184e3e61f9129feb5978fad5a8c6865f00b92 (patch)
tree	56f32aefc1eed60b534611007c7856f82697a205 /gnu/usr.bin/perl/lib/Unicode
parent	PGSHIFT -> PAGE_SHIFT (diff)
download	wireguard-openbsd-898184e3e61f9129feb5978fad5a8c6865f00b92.tar.xz wireguard-openbsd-898184e3e61f9129feb5978fad5a8c6865f00b92.zip