diff options
author | 2004-04-07 21:12:13 +0000 | |
---|---|---|
committer | 2004-04-07 21:12:13 +0000 | |
commit | 1b0c1ed84083c2966ca2436c25340d7804b6abd2 (patch) | |
tree | fb235961cc7f3cddfb00cd2cb125a6b8608c3603 /gnu/usr.bin/perl/lib/Unicode | |
parent | reflect reality. with lots of help from jmc@ (diff) | |
download | wireguard-openbsd-1b0c1ed84083c2966ca2436c25340d7804b6abd2.tar.xz wireguard-openbsd-1b0c1ed84083c2966ca2436c25340d7804b6abd2.zip |
perl 5.8.3 from CPAN
Diffstat (limited to 'gnu/usr.bin/perl/lib/Unicode')
-rw-r--r-- | gnu/usr.bin/perl/lib/Unicode/Collate.pm | 327 | ||||
-rw-r--r-- | gnu/usr.bin/perl/lib/Unicode/Collate/Changes | 24 | ||||
-rw-r--r-- | gnu/usr.bin/perl/lib/Unicode/Collate/README | 42 | ||||
-rwxr-xr-x | gnu/usr.bin/perl/lib/Unicode/Collate/t/altern.t | 108 | ||||
-rw-r--r-- | gnu/usr.bin/perl/lib/Unicode/Collate/t/contract.t | 2 | ||||
-rwxr-xr-x | gnu/usr.bin/perl/lib/Unicode/Collate/t/illegal.t | 180 | ||||
-rwxr-xr-x | gnu/usr.bin/perl/lib/Unicode/Collate/t/illegalp.t | 80 | ||||
-rwxr-xr-x | gnu/usr.bin/perl/lib/Unicode/Collate/t/rearrang.t | 97 | ||||
-rw-r--r-- | gnu/usr.bin/perl/lib/Unicode/Collate/t/test.t | 223 | ||||
-rw-r--r-- | gnu/usr.bin/perl/lib/Unicode/Collate/t/version.t | 2 | ||||
-rwxr-xr-x | gnu/usr.bin/perl/lib/Unicode/Collate/t/view.t | 239 |
11 files changed, 1008 insertions, 316 deletions
diff --git a/gnu/usr.bin/perl/lib/Unicode/Collate.pm b/gnu/usr.bin/perl/lib/Unicode/Collate.pm index a4d6d80cd1a..4f5032ef824 100644 --- a/gnu/usr.bin/perl/lib/Unicode/Collate.pm +++ b/gnu/usr.bin/perl/lib/Unicode/Collate.pm @@ -12,9 +12,11 @@ use warnings; use Carp; use File::Spec; +no warnings 'utf8'; + require Exporter; -our $VERSION = '0.30'; +our $VERSION = '0.33'; our $PACKAGE = __PACKAGE__; our @ISA = qw(Exporter); @@ -47,12 +49,6 @@ use constant Min3Wt => 0x02; # Shifted weight at 4th level use constant Shift4Wt => 0xFFFF; -# Variable weight at 1st level. -# This is a negative value but should be regarded as zero on collation. -# This is for distinction of variable chars from level 3 ignorable chars. -use constant Var1Wt => -1; - - # A boolean for Variable and 16-bit weights at 4 levels of Collation Element # PROBLEM: The Default Unicode Collation Element Table # has weights over 0xFFFF at the 4th level. @@ -185,11 +181,13 @@ sub change { sub _checkLevel { my $level = shift; - my $key = shift; - croak sprintf "Illegal level %d (in \$self->{%s}) lower than %d.", - $level, $key, MinLevel if MinLevel > $level; - croak sprintf "Unsupported level %d (in \$self->{%s}) higher than %d ", - $level, $key, MaxLevel if MaxLevel < $level; + my $key = shift; # 'level' or 'backwards' + MinLevel <= $level or croak sprintf + "Illegal level %d (in value for key '%s') lower than %d.", + $level, $key, MinLevel; + $level <= MaxLevel or croak sprintf + "Unsupported level %d (in value for key '%s') higher than %d.", + $level, $key, MaxLevel; } my %DerivCode = ( @@ -206,7 +204,7 @@ sub checkCollator { or croak "Illegal UCA version (passed $self->{UCA_Version})."; $self->{variable} ||= $self->{alternate} || $self->{variableTable} || - $self->{alternateTable} || $self->{alternate} || 'shifted'; + $self->{alternateTable} || 'shifted'; $self->{variable} = $self->{alternate} = lc($self->{variable}); exists $VariableOK{ $self->{variable} } or croak "$PACKAGE unknown variable tag name: $self->{variable}"; @@ -280,9 +278,9 @@ sub new $self->{level} ||= MaxLevel; $self->{UCA_Version} ||= UCA_Version(); - $self->{overrideHangul} = '' + $self->{overrideHangul} = FALSE if ! exists $self->{overrideHangul}; - $self->{overrideCJK} = '' + $self->{overrideCJK} = FALSE if ! exists $self->{overrideCJK}; $self->{normalization} = 'NFD' if ! exists $self->{normalization}; @@ -298,36 +296,36 @@ sub new sub read_table { my $self = shift; - my $file = $self->{table} ne '' ? $self->{table} : $KeyFile; - my $filepath = File::Spec->catfile($Path, $file); + my $filepath = File::Spec->catfile($Path, $self->{table}); open my $fk, "<$filepath" or croak "File does not exist at $filepath"; while (<$fk>) { next if /^\s*#/; - if (/^\s*\@/) { - if (/^\s*\@version\s*(\S*)/) { - $self->{versionTable} ||= $1; - } - elsif (/^\s*\@variable\s+(\S*)/) { # since UTS #10-9 - $self->{variableTable} ||= $1; - } - elsif (/^\s*\@alternate\s+(\S*)/) { # till UTS #10-8 - $self->{alternateTable} ||= $1; - } - elsif (/^\s*\@backwards\s+(\S*)/) { - push @{ $self->{backwardsTable} }, $1; - } - elsif (/^\s*\@forwards\s+(\S*)/) { # parhaps no use - push @{ $self->{forwardsTable} }, $1; - } - elsif (/^\s*\@rearrange\s+(.*)/) { # (\S*) is NG - push @{ $self->{rearrangeTable} }, _getHexArray($1); - } + unless (s/^\s*\@//) { + $self->parseEntry($_); next; } - $self->parseEntry($_); + + if (/^version\s*(\S*)/) { + $self->{versionTable} ||= $1; + } + elsif (/^variable\s+(\S*)/) { # since UTS #10-9 + $self->{variableTable} ||= $1; + } + elsif (/^alternate\s+(\S*)/) { # till UTS #10-8 + $self->{alternateTable} ||= $1; + } + elsif (/^backwards\s+(\S*)/) { + push @{ $self->{backwardsTable} }, $1; + } + elsif (/^forwards\s+(\S*)/) { # parhaps no use + push @{ $self->{forwardsTable} }, $1; + } + elsif (/^rearrange\s+(.*)/) { # (\S*) is NG + push @{ $self->{rearrangeTable} }, _getHexArray($1); + } } close $fk; } @@ -405,23 +403,28 @@ sub parseEntry ## -## arrayref[weights] = varCE(VCE) +## VCE = _varCE(variable term, VCE) ## -sub varCE +sub _varCE { - my $self = shift; - my($var, @wt) = unpack(VCE_TEMPLATE, shift); - - $self->{variable} eq 'blanked' ? - $var ? [Var1Wt, 0, 0, $wt[3]] : \@wt : - $self->{variable} eq 'non-ignorable' ? - \@wt : - $self->{variable} eq 'shifted' ? - $var ? [Var1Wt, 0, 0, $wt[0] ] - : [ @wt[0..2], $wt[0]+$wt[1]+$wt[2] ? Shift4Wt : 0 ] : - $self->{variable} eq 'shift-trimmed' ? - $var ? [Var1Wt, 0, 0, $wt[0] ] : [ @wt[0..2], 0 ] : - croak "$PACKAGE unknown variable name: $self->{variable}"; + my $vbl = shift; + my $vce = shift; + if ($vbl eq 'non-ignorable') { + return $vce; + } + my ($var, @wt) = unpack VCE_TEMPLATE, $vce; + + if ($var) { + return pack(VCE_TEMPLATE, $var, 0, 0, 0, + $vbl eq 'blanked' ? $wt[3] : $wt[0]); + } + elsif ($vbl eq 'blanked') { + return $vce; + } + else { + return pack(VCE_TEMPLATE, $var, @wt[0..2], + $vbl eq 'shifted' && $wt[0]+$wt[1]+$wt[2] ? Shift4Wt : 0); + } } sub viewSortKey @@ -491,18 +494,16 @@ sub splitEnt } } - if ($ver9) { - # To remove a character marked as a completely ignorable. - for (my $i = 0; $i < @src; $i++) { - $src[$i] = undef if $ign->{ $src[$i] }; - } + # To remove a character marked as a completely ignorable. + for (my $i = 0; $i < @src; $i++) { + $src[$i] = undef + if _isIllegal($src[$i]) || ($ver9 && $ign->{ $src[$i] }); } for (my $i = 0; $i < @src; $i++) { - next if _isNonCharacter($src[$i]); - - my $i_orig = $i; my $jcps = $src[$i]; + next if ! defined $jcps; + my $i_orig = $i; if ($max->{$jcps}) { # contract my $temp_jcps = $jcps; @@ -548,9 +549,8 @@ sub splitEnt } if ($wLen) { - for (my $p = $i + 1; $p < @src; $p++) { - last if defined $src[$p]; - $i = $p; + for (; $i + 1 < @src; $i++) { + last if defined $src[$i + 1]; } } @@ -561,17 +561,18 @@ sub splitEnt ## -## list of arrayrefs of weights = getWt(JCPS) +## list of VCE = getWt(JCPS) ## sub getWt { my $self = shift; my $u = shift; + my $vbl = $self->{variable}; my $map = $self->{mapping}; my $der = $self->{derivCode}; return if !defined $u; - return map($self->varCE($_), @{ $map->{$u} }) + return map(_varCE($vbl, $_), @{ $map->{$u} }) if $map->{$u}; # JCPS must not be a contraction, then it's a code point. @@ -613,13 +614,13 @@ sub getWt $map->{$_} ? @{ $map->{$_} } : $der->($_); } @decH); } - return map $self->varCE($_), @hangulCE; + return map _varCE($vbl, $_), @hangulCE; } elsif (CJK_UidIni <= $u && $u <= CJK_UidFin || CJK_ExtAIni <= $u && $u <= CJK_ExtAFin || CJK_ExtBIni <= $u && $u <= CJK_ExtBFin) { my $cjk = $self->{overrideCJK}; - return map $self->varCE($_), + return map _varCE($vbl, $_), $cjk ? map(pack(VCE_TEMPLATE, NON_VAR, @$_), &$cjk($u)) : defined $cjk && $self->{UCA_Version} <= 8 && $u <= BMP_Max @@ -627,7 +628,7 @@ sub getWt : $der->($u); } else { - return map $self->varCE($_), $der->($u); + return map _varCE($vbl, $_), $der->($u); } } @@ -641,11 +642,9 @@ sub getSortKey my $lev = $self->{level}; my $rEnt = $self->splitEnt(shift); # get an arrayref of JCPS my $ver9 = $self->{UCA_Version} >= 9; - my $v2i = $self->{variable} ne 'non-ignorable'; - - # weight arrays - my (@wts, @buf, $last_is_variable); + my $v2i = $ver9 && $self->{variable} ne 'non-ignorable'; + my @buf; # weight arrays if ($self->{hangul_terminator}) { my $preHST = ''; foreach my $jcps (@$rEnt) { @@ -659,38 +658,40 @@ sub getSortKey $preHST =~ /V\z/ && $curHST =~ /^L/ || $preHST =~ /T\z/ && $curHST =~ /^[LV]/) { - push @wts, $self->varCE_HangulTerm; + push @buf, $self->getWtHangulTerm(); } $preHST = $curHST; - push @wts, $self->getWt($jcps); + push @buf, $self->getWt($jcps); } $preHST # end at hangul - and push @wts, $self->varCE_HangulTerm; + and push @buf, $self->getWtHangulTerm(); } else { foreach my $jcps (@$rEnt) { - push @wts, $self->getWt($jcps); + push @buf, $self->getWt($jcps); } } - foreach my $wt (@wts) { - if ($v2i && $ver9) { - if ($wt->[0] == 0) { # ignorable + # make sort key + my @ret = ([],[],[],[]); + my $last_is_variable; + + foreach my $vwt (@buf) { + my($var, @wt) = unpack(VCE_TEMPLATE, $vwt); + if ($v2i) { + if ($var) { + $last_is_variable = TRUE; + } + elsif (!$wt[0]) { # ignorable next if $last_is_variable; - } else { - $last_is_variable = ($wt->[0] == Var1Wt); + } + else { + $last_is_variable = FALSE; } } - push @buf, $wt; - } - - # make sort key - my @ret = ([],[],[],[]); - foreach my $v (0..$lev-1) { - foreach my $b (@buf) { - push @{ $ret[$v] }, $b->[$v] - if 0 < $b->[$v]; + foreach my $v (0..$lev-1) { + 0 < $wt[$v] and push @{ $ret[$v] }, $wt[$v]; } } @@ -772,10 +773,10 @@ sub _derivCE_8 { } -sub varCE_HangulTerm { +sub getWtHangulTerm { my $self = shift; - return $self->varCE(pack(VCE_TEMPLATE, - NON_VAR, $self->{hangul_terminator}, 0,0,0)); + return _varCE($self->{variable}, + pack(VCE_TEMPLATE, NON_VAR, $self->{hangul_terminator}, 0,0,0)); } @@ -801,7 +802,7 @@ sub _decompHangul { ); } -sub _isNonCharacter { +sub _isIllegal { my $code = shift; return ! defined $code # removed || ($code < 0 || 0x10FFFF < $code) # out of range @@ -888,63 +889,68 @@ sub index ? map([$_, 0], $temp..$len) : wantarray ? ($temp,0) : $temp; } - if ($len < $pos) { - return wantarray ? () : NOMATCHPOS; - } + $len < $pos + and return wantarray ? () : NOMATCHPOS; my $strE = $self->splitEnt($pos ? substr($str, $pos) : $str, TRUE); - if (! @$strE) { - return wantarray ? () : NOMATCHPOS; - } - my $last_is_variable; + @$strE + or return wantarray ? () : NOMATCHPOS; + my(@strWt, @iniPos, @finPos, @subWt, @g_ret); - $last_is_variable = FALSE; - for my $wt (map $self->getWt($_), @$subE) { - my $to_be_pushed = _nonIgnorAtLevel($wt,$lev); + my $last_is_variable; + for my $vwt (map $self->getWt($_), @$subE) { + my($var, @wt) = unpack(VCE_TEMPLATE, $vwt); + my $to_be_pushed = _nonIgnorAtLevel(\@wt,$lev); if ($v2i && $ver9) { - if ($wt->[0] == 0) { + if ($var) { + $last_is_variable = TRUE; + } + elsif (!$wt[0]) { # ignorable $to_be_pushed = FALSE if $last_is_variable; - } else { - $last_is_variable = ($wt->[0] == Var1Wt); + } + else { + $last_is_variable = FALSE; } } - if (@subWt && $wt->[0] == 0) { - push @{ $subWt[-1] }, $wt if $to_be_pushed; + if (@subWt && !$var && !$wt[0]) { + push @{ $subWt[-1] }, \@wt if $to_be_pushed; } else { - $wt->[0] = 0 if $wt->[0] == Var1Wt; - push @subWt, [ $wt ]; + push @subWt, [ \@wt ]; } } my $count = 0; my $end = @$strE - 1; - $last_is_variable = FALSE; - + $last_is_variable = FALSE; # reuse for (my $i = 0; $i <= $end; ) { # no $i++ my $found_base = 0; # fetch a grapheme while ($i <= $end && $found_base == 0) { - for my $wt ($self->getWt($strE->[$i][0])) { - my $to_be_pushed = _nonIgnorAtLevel($wt,$lev); + for my $vwt ($self->getWt($strE->[$i][0])) { + my($var, @wt) = unpack(VCE_TEMPLATE, $vwt); + my $to_be_pushed = _nonIgnorAtLevel(\@wt,$lev); if ($v2i && $ver9) { - if ($wt->[0] == 0) { + if ($var) { + $last_is_variable = TRUE; + } + elsif (!$wt[0]) { # ignorable $to_be_pushed = FALSE if $last_is_variable; - } else { - $last_is_variable = ($wt->[0] == Var1Wt); + } + else { + $last_is_variable = FALSE; } } - if (@strWt && $wt->[0] == 0) { - push @{ $strWt[-1] }, $wt if $to_be_pushed; + if (@strWt && !$var && !$wt[0]) { + push @{ $strWt[-1] }, \@wt if $to_be_pushed; $finPos[-1] = $strE->[$i][2]; } elsif ($to_be_pushed) { - $wt->[0] = 0 if $wt->[0] == Var1Wt; - push @strWt, [ $wt ]; + push @strWt, [ \@wt ]; push @iniPos, $found_base ? NOMATCHPOS : $strE->[$i][1]; $finPos[-1] = NOMATCHPOS if $found_base; push @finPos, $strE->[$i][2]; @@ -1144,17 +1150,43 @@ in the collation element table through C<table>, mapping to collation elements is overrided. If it does not exist, the mapping is defined additionally. - entry => <<'ENTRIES', # use the UCA file format -00E6 ; [.0861.0020.0002.00E6] [.08B1.0020.0002.00E6] # ligature <ae> as <a><e> -0063 0068 ; [.0893.0020.0002.0063] # "ch" in traditional Spanish -0043 0068 ; [.0893.0020.0008.0043] # "Ch" in traditional Spanish -ENTRIES + entry => <<'ENTRY', # for DUCET v4.0.0 (allkeys-4.0.0.txt) +0063 0068 ; [.0E6A.0020.0002.0063] # ch +0043 0068 ; [.0E6A.0020.0007.0043] # Ch +0043 0048 ; [.0E6A.0020.0008.0043] # CH +006C 006C ; [.0F4C.0020.0002.006C] # ll +004C 006C ; [.0F4C.0020.0007.004C] # Ll +004C 004C ; [.0F4C.0020.0008.004C] # LL +006E 0303 ; [.0F7B.0020.0002.006E] # n-tilde +004E 0303 ; [.0F7B.0020.0008.004E] # N-tilde +ENTRY + + entry => <<'ENTRY', # for DUCET v4.0.0 (allkeys-4.0.0.txt) +00E6 ; [.0E33.0020.0002.00E6][.0E8B.0020.0002.00E6] # ae ligature as <a><e> +00C6 ; [.0E33.0020.0008.00C6][.0E8B.0020.0008.00C6] # AE ligature as <A><E> +ENTRY B<NOTE:> The code point in the UCA file format (before C<';'>) -B<must> be a Unicode code point, but not a native code point. +B<must> be a Unicode code point (defined as hexadecimal), +but not a native code point. So C<0063> must always denote C<U+0063>, but not a character of C<"\x63">. +Weighting may vary depending on collation element table. +So ensure the weights defined in C<entry> will be consistent with +those in the collation element table loaded via C<table>. + +In DUCET v4.0.0, primary weight of C<C> is C<0E60> +and that of C<D> is C<0E6D>. So setting primary weight of C<CH> to C<0E6A> +(as a value between C<0E60> and C<0E6D>) +makes ordering as C<C E<lt> CH E<lt> D>. +Exactly speaking DUCET already has some characters between C<C> and C<D>: +C<small capital C> (C<U+1D04>) with primary weight C<0E64>, +C<c-hook/C-hook> (C<U+0188/U+0187>) with C<0E65>, +and C<c-curl> (C<U+0255>) with C<0E69>. +Then primary weight C<0E6A> for C<CH> makes C<CH> +ordered between C<c-curl> and C<D>. + =item hangul_terminator -- see Condition B.2. in 7.1.4 Trailing Weights, UTS #10. @@ -1232,10 +1264,10 @@ though they are not concerned with C<Unicode::Normalize::normalize()>. If C<undef> (not a string C<"undef">) is passed explicitly as the value for this key, any normalization is not carried out (this may make tailoring easier -if any normalization is not desired). -Under C<(normalization =E<gt> undef)>, only contiguous contractions -are resolved; e.g. C<A-cedilla-ring> would be primary equal to C<A>, -even if C<A-ring> (and C<A-ring-cedilla>) is ordered after C<Z>. +if any normalization is not desired). Under C<(normalization =E<gt> undef)>, +only contiguous contractions are resolved; +e.g. even if C<A-ring> (and C<A-ring-cedilla>) is ordered after C<Z>, +C<A-cedilla-ring> would be primary equal to C<A>. In this point, C<(normalization =E<gt> undef, preprocess =E<gt> sub { NFD(shift) })> B<is not> equivalent to C<(normalization =E<gt> 'NFD')>. @@ -1289,14 +1321,15 @@ in table or C<entry> is still valid. -- see 7.1 Derived Collation Elements, UTS #10. -By default, Hangul Syllables are decomposed into Hangul Jamo. +By default, Hangul Syllables are decomposed into Hangul Jamo, +even if C<(normalization =E<gt> undef)>. But the mapping of Hangul Syllables may be overrided. This tag works like C<overrideCJK>, so see there for examples. If you want to override the mapping of Hangul Syllables, -the Normalization Forms D and KD are not appropriate -(they will be decomposed before overriding). +NFD, NFKD, and FCD are not appropriate, +since they will decompose Hangul Syllables before overriding. If C<undef> is passed explicitly as the value for this key, weight for Hangul Syllables is treated as undefined @@ -1344,11 +1377,10 @@ but it is not warned at present.> You can use another collation element table if desired. The table file must be put into a directory -where F<Unicode/Collate.pm> is installed. -E.g. in F<perl/lib/Unicode/Collate> directory -when you have F<perl/lib/Unicode/Collate.pm>. +where F<Unicode/Collate.pm> is installed; e.g. into +F<perl/lib/Unicode/Collate/> if you have F<perl/lib/Unicode/Collate.pm>. -By default, the filename F<"allkeys.txt"> is used. +By default, the filename F<allkeys.txt> is used. If C<undef> is passed explicitly as the value for this key, no file is read (but you can define collation elements via C<entry>). @@ -1422,7 +1454,7 @@ By default (if specification is omitted), 'shifted' is adopted. 'Blanked' Variable elements are made ignorable at levels 1 through 3; considered at the 4th level. - 'Non-ignorable' Variable elements are not reset to ignorable. + 'Non-Ignorable' Variable elements are not reset to ignorable. 'Shifted' Variable elements are made ignorable at levels 1 through 3 their level 4 weight is replaced by the old level 1 weight. @@ -1680,9 +1712,8 @@ assign C<normalization =E<gt> undef> explicitly. =head2 Conformance Test -The Conformance Test for the UCA is provided -in L<http://www.unicode.org/reports/tr10/CollationTest.html> -and L<http://www.unicode.org/reports/tr10/CollationTest.zip> +The Conformance Test for the UCA is available +under L<http://www.unicode.org/Public/UCA/>. For F<CollationTest_SHIFTED.txt>, a collator via C<Unicode::Collate-E<gt>new( )> should be used; @@ -1693,7 +1724,7 @@ B<Unicode::Normalize is required to try The Conformance Test.> =head1 AUTHOR -SADAHIRO Tomoyuki, <SADAHIRO@cpan.org> +SADAHIRO Tomoyuki <SADAHIRO@cpan.org> http://homepage1.nifty.com/nomenclator/perl/ @@ -1712,17 +1743,17 @@ L<http://www.unicode.org/reports/tr10/> =item The Default Unicode Collation Element Table (DUCET) -L<http://www.unicode.org/reports/tr10/allkeys.txt> +L<http://www.unicode.org/Public/UCA/latest/allkeys.txt> =item The conformance test for the UCA -L<http://www.unicode.org/reports/tr10/CollationTest.html> +L<http://www.unicode.org/Public/UCA/latest/CollationTest.html> -L<http://www.unicode.org/reports/tr10/CollationTest.zip> +L<http://www.unicode.org/Public/UCA/latest/CollationTest.zip> =item Hangul Syllable Type -http://www.unicode.org/Public/UNIDATA/HangulSyllableType.txt +L<http://www.unicode.org/Public/UNIDATA/HangulSyllableType.txt> =item Unicode Normalization Forms - UAX #15 diff --git a/gnu/usr.bin/perl/lib/Unicode/Collate/Changes b/gnu/usr.bin/perl/lib/Unicode/Collate/Changes index 7f92d7aad18..a59ffa0e55e 100644 --- a/gnu/usr.bin/perl/lib/Unicode/Collate/Changes +++ b/gnu/usr.bin/perl/lib/Unicode/Collate/Changes @@ -1,14 +1,36 @@ Revision history for Perl module Unicode::Collate. +0.33 Sat Dec 13 14:07:27 2003 + - documentation improvement: in "entry", "overrideHangul", etc. + +0.32 Wed Dec 3 23:38:18 2003 + - A matching part from index(), match() etc. will include illegal + code points (as well as ignorable characters) following a grapheme. + - Contraction with illegal code point will be invalid. + - Added some tests in illegal.t; added view.t. + - Some tests are separated from test.t into altern.t and rearrang.t. + - modified XSUB internals. + +0.31 Sun Nov 16 15:40:15 2003 + - Illegal code points (surrogate and noncharacter; they are definitely + ignorable) will be distinguished from NULL ("\0"); + but porting is not successful in the case of ((Pure Perl) and + (Perl 5.7.3 or before)). If perl 5.6.X is used, XSUB may help it + in place of broken CORE::unpack('U*') in older perl. + - added illegal.t and illegalp.t. + - added XSUB (EXPERIMENTAL!) where some functions are implemented + in XSUB. Pure Perl is also supported. + 0.30 Mon Oct 13 21:26:37 2003 - fix: Completely ignorable in table should be able to be overrided by non-ignorable in entry. - fix: Maximum length for contraction must not be shortened - by a shorter contraction following. + by a shorter contraction following in table and/or entry. - added normal.t. - some doc fixes 0.29 Mon Oct 13 12:18:23 2003 + - now UCA Version 11 (but no functionality is different from Version 9). - supported hangul_terminator. - fix: Base_Unicode_Version falsely returns Perl's Unicode version. C4 in UTS #10 requires UTS's Unicode version. diff --git a/gnu/usr.bin/perl/lib/Unicode/Collate/README b/gnu/usr.bin/perl/lib/Unicode/Collate/README index 6a4b712a8b0..376a0c2c13c 100644 --- a/gnu/usr.bin/perl/lib/Unicode/Collate/README +++ b/gnu/usr.bin/perl/lib/Unicode/Collate/README @@ -1,4 +1,4 @@ -Unicode/Collate version 0.30 +Unicode/Collate version 0.33 =============================== NAME @@ -23,6 +23,22 @@ SYNOPSIS INSTALLATION Perl 5.6.1 or later +(recommended: Perl 5.8.0 or later) + +To use this module, it is recommended to install a table file +in the UCA format, by copying it into the directory +where F<Unicode/Collate.pm> is installed; +e.g. into F<perl/lib/Unicode/Collate/> directory +if you have F<perl/lib/Unicode/Collate.pm>. + +The most preferable one is "The Default Unicode Collation Element Table", +available from the Unicode consortium's website: + + http://www.unicode.org/Public/UCA/latest/allkeys.txt (latest version) + +Though this distribution contains a subset of allkeys.txt, named "keys.txt", +this one is intended only for doing a test of this module +and practically useless for any other purpose. To install this module type the following: @@ -31,17 +47,25 @@ To install this module type the following: make test make install -To use this module, it is better to install a table file in the UCA format, -by copying it into the lib/Unicode/Collate directory. +(!! XSUB for Unicode::Collate is an EXPERIMENTAL support !!) +If you have a C compiler and want to use XSUB edition, +type the following (!! "enableXS" must run before "Makefile.PL" !!): -The most preferable one is "The Default Unicode Collation Element Table", -available from the Unicode consortium's website: + perl enableXS + perl Makefile.PL + make + make test + make install - http://www.unicode.org/reports/tr10/allkeys.txt +If you decide to install pure Perl (i.e. non-XS) edition after trying +to build XSUB, type the following: -Though this distribution contains a subset of allkeys.txt, named "keys.txt", -this one is intended only for doing a test of this module -and practically useless for any other purpose. + make clean + perl disableXS + perl Makefile.PL + make + make test + make install DEPENDENCIES diff --git a/gnu/usr.bin/perl/lib/Unicode/Collate/t/altern.t b/gnu/usr.bin/perl/lib/Unicode/Collate/t/altern.t new file mode 100755 index 00000000000..d48e168b696 --- /dev/null +++ b/gnu/usr.bin/perl/lib/Unicode/Collate/t/altern.t @@ -0,0 +1,108 @@ + +BEGIN { + unless ("A" eq pack('U', 0x41)) { + print "1..0 # Unicode::Collate " . + "cannot stringify a Unicode code point\n"; + exit 0; + } +} + +BEGIN { + if ($ENV{PERL_CORE}) { + chdir('t') if -d 't'; + @INC = $^O eq 'MacOS' ? qw(::lib) : qw(../lib); + } +} + +use Test; +BEGIN { plan tests => 37 }; + +use strict; +use warnings; +use Unicode::Collate; + +ok(1); + +######################### + +sub _pack_U { Unicode::Collate::pack_U(@_) } +sub _unpack_U { Unicode::Collate::unpack_U(@_) } + +my $A_acute = _pack_U(0xC1); +my $acute = _pack_U(0x0301); + +my $Collator = Unicode::Collate->new( + table => 'keys.txt', + normalization => undef, +); + +my %origAlt = $Collator->change(alternate => 'Blanked'); + +ok($Collator->lt("death", "de luge")); +ok($Collator->lt("de luge", "de-luge")); +ok($Collator->lt("de-luge", "deluge")); +ok($Collator->lt("deluge", "de\x{2010}luge")); +ok($Collator->lt("deluge", "de Luge")); + +$Collator->change(alternate => 'Non-ignorable'); + +ok($Collator->lt("de luge", "de Luge")); +ok($Collator->lt("de Luge", "de-luge")); +ok($Collator->lt("de-Luge", "de\x{2010}luge")); +ok($Collator->lt("de-luge", "death")); +ok($Collator->lt("death", "deluge")); + +$Collator->change(alternate => 'Shifted'); + +ok($Collator->lt("death", "de luge")); +ok($Collator->lt("de luge", "de-luge")); +ok($Collator->lt("de-luge", "deluge")); +ok($Collator->lt("deluge", "de Luge")); +ok($Collator->lt("de Luge", "deLuge")); + +$Collator->change(alternate => 'Shift-Trimmed'); + +ok($Collator->lt("death", "deluge")); +ok($Collator->lt("deluge", "de luge")); +ok($Collator->lt("de luge", "de-luge")); +ok($Collator->lt("de-luge", "deLuge")); +ok($Collator->lt("deLuge", "de Luge")); + +$Collator->change(%origAlt); + +ok($Collator->{alternate}, 'shifted'); + +############## + +# ignorable after alternate + +# Shifted; +ok($Collator->eq("?\x{300}!\x{301}\x{315}", "?!")); +ok($Collator->eq("?\x{300}A\x{301}", "?$A_acute")); +ok($Collator->eq("?\x{300}", "?")); +ok($Collator->eq("?\x{344}", "?")); # U+0344 has two CEs. + +$Collator->change(level => 3); +ok($Collator->eq("\cA", "?")); + +$Collator->change(alternate => 'blanked', level => 4); +ok($Collator->eq("?\x{300}!\x{301}\x{315}", "?!")); +ok($Collator->eq("?\x{300}A\x{301}", "?$A_acute")); +ok($Collator->eq("?\x{300}", "?")); +ok($Collator->eq("?\x{344}", "?")); # U+0344 has two CEs. + +$Collator->change(level => 3); +ok($Collator->eq("\cA", "?")); + +$Collator->change(alternate => 'Non-ignorable', level => 4); + +ok($Collator->lt("?\x{300}", "?!")); +ok($Collator->gt("?\x{300}A$acute", "?$A_acute")); +ok($Collator->gt("?\x{300}", "?")); +ok($Collator->gt("?\x{344}", "?")); + +$Collator->change(level => 3); +ok($Collator->lt("\cA", "?")); + +$Collator->change(alternate => 'Shifted', level => 4); + diff --git a/gnu/usr.bin/perl/lib/Unicode/Collate/t/contract.t b/gnu/usr.bin/perl/lib/Unicode/Collate/t/contract.t index 1c6658d5724..18a0cfbdc93 100644 --- a/gnu/usr.bin/perl/lib/Unicode/Collate/t/contract.t +++ b/gnu/usr.bin/perl/lib/Unicode/Collate/t/contract.t @@ -72,7 +72,7 @@ $sortkeys{'KAta'} = $kjeNoN->viewSortKey("\x{043A}\x{0334}\x{0301}"); $sortkeys{'KAat'} = $kjeNoN->viewSortKey("\x{043A}\x{0301}\x{0334}"); eval { require Unicode::Normalize }; -if (!$@ && !$IsEBCDIC) { +if (!$@) { my $kjeNFD = Unicode::Collate->new( level => 1, table => undef, diff --git a/gnu/usr.bin/perl/lib/Unicode/Collate/t/illegal.t b/gnu/usr.bin/perl/lib/Unicode/Collate/t/illegal.t new file mode 100755 index 00000000000..803e2f6739a --- /dev/null +++ b/gnu/usr.bin/perl/lib/Unicode/Collate/t/illegal.t @@ -0,0 +1,180 @@ + +BEGIN { + unless ("A" eq pack('U', 0x41)) { + print "1..0 # Unicode::Collate " . + "cannot stringify a Unicode code point\n"; + exit 0; + } +} + +BEGIN { + if ($ENV{PERL_CORE}) { + chdir('t') if -d 't'; + @INC = $^O eq 'MacOS' ? qw(::lib) : qw(../lib); + } +} + +use Test; +use strict; +use warnings; + +BEGIN { + use Unicode::Collate; + + unless (exists &Unicode::Collate::bootstrap or 5.008 <= $]) { + print "1..0 # skipped: XSUB, or Perl 5.8.0 or later". + " needed for this test\n"; + print $@; + exit; + } +} + +BEGIN { plan tests => 40 }; + +ok(1); + +######################### + +no warnings 'utf8'; + +# NULL is tailorable but illegal code points are not. +# illegal code points should be always ingored +# (cf. UCA, 7.1.1 Illegal code points). + +my $illeg = Unicode::Collate->new( + entry => <<'ENTRIES', +0000 ; [.0020.0000.0000.0000] # [0000] NULL +0001 ; [.0021.0000.0000.0001] # [0001] START OF HEADING +FFFE ; [.0022.0000.0000.FFFE] # <noncharacter-FFFE> (invalid) +FFFF ; [.0023.0000.0000.FFFF] # <noncharacter-FFFF> (invalid) +D800 ; [.0024.0000.0000.D800] # <surrogate-D800> (invalid) +DFFF ; [.0025.0000.0000.DFFF] # <surrogate-DFFF> (invalid) +FDD0 ; [.0026.0000.0000.FDD0] # <noncharacter-FDD0> (invalid) +FDEF ; [.0027.0000.0000.FDEF] # <noncharacter-FDEF> (invalid) +0002 ; [.0030.0000.0000.0002] # [0002] START OF TEXT +10FFFF; [.0040.0000.0000.10FFFF] # <noncharacter-10FFFF> (invalid) +110000; [.0041.0000.0000.110000] # <out-of-range 110000> (invalid) +0041 ; [.1000.0020.0008.0041] # latin A +0041 0000 ; [.1100.0020.0008.0041] # latin A + NULL +0041 FFFF ; [.1200.0020.0008.0041] # latin A + FFFF (invalid) +ENTRIES + level => 1, + table => undef, + normalization => undef, +); + +# 2..12 +ok($illeg->lt("", "\x00")); +ok($illeg->lt("", "\x01")); +ok($illeg->eq("", "\x{FFFE}")); +ok($illeg->eq("", "\x{FFFF}")); +ok($illeg->eq("", "\x{D800}")); +ok($illeg->eq("", "\x{DFFF}")); +ok($illeg->eq("", "\x{FDD0}")); +ok($illeg->eq("", "\x{FDEF}")); +ok($illeg->lt("", "\x02")); +ok($illeg->eq("", "\x{10FFFF}")); +ok($illeg->eq("", "\x{110000}")); + +# 13..22 +ok($illeg->lt("\x00", "\x01")); +ok($illeg->lt("\x01", "\x02")); +ok($illeg->ne("\0", "\x{D800}")); +ok($illeg->ne("\0", "\x{DFFF}")); +ok($illeg->ne("\0", "\x{FDD0}")); +ok($illeg->ne("\0", "\x{FDEF}")); +ok($illeg->ne("\0", "\x{FFFE}")); +ok($illeg->ne("\0", "\x{FFFF}")); +ok($illeg->ne("\0", "\x{10FFFF}")); +ok($illeg->ne("\0", "\x{110000}")); + +# 23..26 +ok($illeg->eq("A", "A\x{FFFF}")); +ok($illeg->gt("A\0", "A\x{FFFF}")); +ok($illeg->lt("A", "A\0")); +ok($illeg->lt("AA", "A\0")); + +################## + +my($match, $str, $sub, $ret); + +my $Collator = Unicode::Collate->new( + table => 'keys.txt', + level => 1, + normalization => undef, +); + +$sub = "pe"; + + +$str = "Pe\x{300}\x{301}rl"; +$ret = "Pe\x{300}\x{301}"; +($match) = $Collator->match($str, $sub); +ok($match, $ret); + +$str = "Pe\x{300}\0\0\x{301}rl"; +$ret = "Pe\x{300}\0\0\x{301}"; +($match) = $Collator->match($str, $sub); +ok($match, $ret); + +$str = "Pe\x{DA00}\x{301}\x{DFFF}rl"; +$ret = "Pe\x{DA00}\x{301}\x{DFFF}"; +($match) = $Collator->match($str, $sub); +ok($match, $ret); + +$str = "Pe\x{FFFF}\x{301}rl"; +$ret = "Pe\x{FFFF}\x{301}"; +($match) = $Collator->match($str, $sub); +ok($match, $ret); + +$str = "Pe\x{110000}\x{301}rl"; +$ret = "Pe\x{110000}\x{301}"; +($match) = $Collator->match($str, $sub); +ok($match, $ret); + +$str = "Pe\x{300}\x{d801}\x{301}rl"; +$ret = "Pe\x{300}\x{d801}\x{301}"; +($match) = $Collator->match($str, $sub); +ok($match, $ret); + +$str = "Pe\x{300}\x{ffff}\x{301}rl"; +$ret = "Pe\x{300}\x{ffff}\x{301}"; +($match) = $Collator->match($str, $sub); +ok($match, $ret); + +$str = "Pe\x{300}\x{110000}\x{301}rl"; +$ret = "Pe\x{300}\x{110000}\x{301}"; +($match) = $Collator->match($str, $sub); +ok($match, $ret); + +$str = "Pe\x{D9ab}\x{DFFF}rl"; +$ret = "Pe\x{D9ab}\x{DFFF}"; +($match) = $Collator->match($str, $sub); +ok($match, $ret); + +$str = "Pe\x{FFFF}rl"; +$ret = "Pe\x{FFFF}"; +($match) = $Collator->match($str, $sub); +ok($match, $ret); + +$str = "Pe\x{110000}rl"; +$ret = "Pe\x{110000}"; +($match) = $Collator->match($str, $sub); +ok($match, $ret); + +$str = "Pe\x{300}\x{D800}\x{DFFF}rl"; +$ret = "Pe\x{300}\x{D800}\x{DFFF}"; +($match) = $Collator->match($str, $sub); +ok($match, $ret); + +$str = "Pe\x{300}\x{FFFF}rl"; +$ret = "Pe\x{300}\x{FFFF}"; +($match) = $Collator->match($str, $sub); +ok($match, $ret); + +$str = "Pe\x{300}\x{110000}rl"; +$ret = "Pe\x{300}\x{110000}"; +($match) = $Collator->match($str, $sub); +ok($match, $ret); + + diff --git a/gnu/usr.bin/perl/lib/Unicode/Collate/t/illegalp.t b/gnu/usr.bin/perl/lib/Unicode/Collate/t/illegalp.t new file mode 100755 index 00000000000..690c88d0bb1 --- /dev/null +++ b/gnu/usr.bin/perl/lib/Unicode/Collate/t/illegalp.t @@ -0,0 +1,80 @@ + +BEGIN { + unless ("A" eq pack('U', 0x41)) { + print "1..0 # Unicode::Collate " . + "cannot stringify a Unicode code point\n"; + exit 0; + } +} + +BEGIN { + if ($ENV{PERL_CORE}) { + chdir('t') if -d 't'; + @INC = $^O eq 'MacOS' ? qw(::lib) : qw(../lib); + } +} + +use Test; +BEGIN { plan tests => 17 }; + +use strict; +use warnings; + +ok(1); + +# +# No test for Unicode::Collate is included in this .t file. +# +# UCA conformance test requires completely ignorable characters +# (including noncharacters) must be able to be ordered in code point order; +# If not so, Unicode::Collate must not be compliant with UCA. +# +# ~~~ CollationTest_SHIFTED.txt in CollationTest-4.0.0 +# +# 206F 0021; # ! NOMINAL DIGIT SHAPES [| | | 0251] +# D800 0021; # ! <surrogate-D800> [| | | 0251] +# DFFF 0021; # ! <surrogate-DFFF> [| | | 0251] +# FDD0 0021; # ! <noncharacter-FDD0> [| | | 0251] +# FFFB 0021; # ! INTERLINEAR ANNOTATION TERMINATOR [| | | 0251] +# FFFE 0021; # ! <noncharacter-FFFE> [| | | 0251] +# FFFF 0021; # ! <noncharacter-FFFF> [| | | 0251] +# 1D165 0021; # ! MS. Cm. STEM [| | | 0251] +# +# ~~~ CollationTest_NON_IGNORABLE.txt in CollationTest-4.0.0 +# +# 206F 0021; # ! NOMINAL DIGIT SHAPES [0251 | 0020 | 0002 |] +# D800 0021; # ! <surrogate-D800> [0251 | 0020 | 0002 |] +# DFFF 0021; # ! <surrogate-DFFF> [0251 | 0020 | 0002 |] +# FDD0 0021; # ! <noncharacter-FDD0> [0251 | 0020 | 0002 |] +# FFFB 0021; # ! INTERLINEAR ANNOTATION TERMINATOR [0251 | 0020 | 0002 |] +# FFFE 0021; # ! <noncharacter-FFFE> [0251 | 0020 | 0002 |] +# FFFF 0021; # ! <noncharacter-FFFF> [0251 | 0020 | 0002 |] +# 1D165 0021; # ! MS. Cm. STEM [0251 | 0020 | 0002 |] +# + +no warnings 'utf8'; + +ok("\x{206F}!" lt "\x{D800}!"); +ok(pack('U*', 0x206F, 0x21) lt pack('U*', 0xD800, 0x21)); + +ok("\x{D800}!" lt "\x{DFFF}!"); +ok(pack('U*', 0xD800, 0x21) lt pack('U*', 0xDFFF, 0x21)); + +ok("\x{DFFF}!" lt "\x{FDD0}!"); +ok(pack('U*', 0xDFFF, 0x21) lt pack('U*', 0xFDD0, 0x21) ); + +ok("\x{FDD0}!" lt "\x{FFFB}!"); +ok(pack('U*', 0xFDD0, 0x21) lt pack('U*', 0xFFFB, 0x21)); + +ok("\x{FFFB}!" lt "\x{FFFE}!"); +ok(pack('U*', 0xFFFB, 0x21) lt pack('U*', 0xFFFE, 0x21)); + +ok("\x{FFFE}!" lt "\x{FFFF}!"); +ok(pack('U*', 0xFFFE, 0x21) lt pack('U*', 0xFFFF, 0x21)); + +ok("\x{FFFF}!" lt "\x{1D165}!"); +ok(pack('U*', 0xFFFF, 0x21) lt pack('U*', 0x1D165, 0x21)); + +ok("\000!" lt "\x{FFFF}!"); +ok(pack('U*', 0, 0x21) lt pack('U*', 0xFFFF, 0x21)); + diff --git a/gnu/usr.bin/perl/lib/Unicode/Collate/t/rearrang.t b/gnu/usr.bin/perl/lib/Unicode/Collate/t/rearrang.t new file mode 100755 index 00000000000..cc02fa9f796 --- /dev/null +++ b/gnu/usr.bin/perl/lib/Unicode/Collate/t/rearrang.t @@ -0,0 +1,97 @@ + +BEGIN { + unless ("A" eq pack('U', 0x41)) { + print "1..0 # Unicode::Collate " . + "cannot stringify a Unicode code point\n"; + exit 0; + } +} + +BEGIN { + if ($ENV{PERL_CORE}) { + chdir('t') if -d 't'; + @INC = $^O eq 'MacOS' ? qw(::lib) : qw(../lib); + } +} + +use Test; +BEGIN { plan tests => 23 }; + +use strict; +use warnings; +use Unicode::Collate; + +ok(1); + +######################### + +my $Collator = Unicode::Collate->new( + table => 'keys.txt', + normalization => undef, +); + +# rearrange : 0x0E40..0x0E44, 0x0EC0..0x0EC4 (default) + +##### 2..9 + +my %old_rearrange = $Collator->change(rearrange => undef); + +ok($Collator->gt("\x{0E41}A", "\x{0E40}B")); +ok($Collator->gt("A\x{0E41}A", "A\x{0E40}B")); + +$Collator->change(rearrange => [ 0x61 ]); + # U+0061, 'a': This is a Unicode value, never a native value. + +ok($Collator->gt("ab", "AB")); # as 'ba' > 'AB' + +$Collator->change(%old_rearrange); + +ok($Collator->lt("ab", "AB")); +ok($Collator->lt("\x{0E40}", "\x{0E41}")); +ok($Collator->lt("\x{0E40}A", "\x{0E41}B")); +ok($Collator->lt("\x{0E41}A", "\x{0E40}B")); +ok($Collator->lt("A\x{0E41}A", "A\x{0E40}B")); + +##### 10..13 + +my $all_undef_8 = Unicode::Collate->new( + table => undef, + normalization => undef, + overrideCJK => undef, + overrideHangul => undef, + UCA_Version => 8, +); + +ok($all_undef_8->lt("\x{0E40}", "\x{0E41}")); +ok($all_undef_8->lt("\x{0E40}A", "\x{0E41}B")); +ok($all_undef_8->lt("\x{0E41}A", "\x{0E40}B")); +ok($all_undef_8->lt("A\x{0E41}A", "A\x{0E40}B")); + +##### 14..18 + +my $no_rearrange = Unicode::Collate->new( + table => undef, + normalization => undef, + rearrange => [], +); + +ok($no_rearrange->lt("A", "B")); +ok($no_rearrange->lt("\x{0E40}", "\x{0E41}")); +ok($no_rearrange->lt("\x{0E40}A", "\x{0E41}B")); +ok($no_rearrange->gt("\x{0E41}A", "\x{0E40}B")); +ok($no_rearrange->gt("A\x{0E41}A", "A\x{0E40}B")); + +##### 19..23 + +my $undef_rearrange = Unicode::Collate->new( + table => undef, + normalization => undef, + rearrange => undef, +); + +ok($undef_rearrange->lt("A", "B")); +ok($undef_rearrange->lt("\x{0E40}", "\x{0E41}")); +ok($undef_rearrange->lt("\x{0E40}A", "\x{0E41}B")); +ok($undef_rearrange->gt("\x{0E41}A", "\x{0E40}B")); +ok($undef_rearrange->gt("A\x{0E41}A", "A\x{0E40}B")); + diff --git a/gnu/usr.bin/perl/lib/Unicode/Collate/t/test.t b/gnu/usr.bin/perl/lib/Unicode/Collate/t/test.t index 8a7eb8b59f9..53fa7ca8792 100644 --- a/gnu/usr.bin/perl/lib/Unicode/Collate/t/test.t +++ b/gnu/usr.bin/perl/lib/Unicode/Collate/t/test.t @@ -15,14 +15,12 @@ BEGIN { } use Test; -BEGIN { plan tests => 203 }; +BEGIN { plan tests => 160 }; use strict; use warnings; use Unicode::Collate; -our $IsEBCDIC = ord("A") != 0x41; - ok(1); ##### 2..6 @@ -73,11 +71,10 @@ ok($Collator->lt("A", $A_acute)); ok($Collator->lt("A", $a_acute)); ok($Collator->lt($a_acute, $A_acute)); -##### 17..20 +##### 18..20 eval { require Unicode::Normalize }; - -if (!$@ && !$IsEBCDIC) { +if (!$@) { my $NFD = Unicode::Collate->new( table => 'keys.txt', level => 1, @@ -102,7 +99,7 @@ else { ok(1); } -##### 21..30 +##### 21..34 my $trad = Unicode::Collate->new( table => 'keys.txt', @@ -127,10 +124,20 @@ ok( join(':', $Collator->sort( qw/ acha aca ada acia acka / ) ), join(':', qw/ aca acha acia acka ada / ), ); + ok($trad->eq("ocho", "oc\cAho")); # UCA v9 ok($trad->eq("ocho", "oc\0\cA\0\cBho")); # UCA v9 -ok($trad->eq("-", "")); # also UCA v8 -ok($trad->lt("oc-ho", "ocho")); # also UCA v8 +ok($trad->eq("-", "")); +ok($trad->gt("ocho", "oc-ho")); + +$trad->change(UCA_Version => 8); + +ok($trad->gt("ocho", "oc\cAho")); +ok($trad->gt("ocho", "oc\0\cA\0\cBho")); +ok($trad->eq("-", "")); +ok($trad->gt("ocho", "oc-ho")); + +$trad->change(UCA_Version => 9); my $hiragana = "\x{3042}\x{3044}"; my $katakana = "\x{30A2}\x{30A4}"; @@ -141,7 +148,7 @@ ok($trad->eq("", $katakana)); ok($trad->eq($hiragana, $katakana)); ok($trad->eq($katakana, $hiragana)); -##### 31..37 +##### 35..41 $Collator->change(level => 2); @@ -154,7 +161,7 @@ ok( $Collator->cmp($hiragana, $katakana), 0); ok( $Collator->eq($hiragana, $katakana) ); ok( $Collator->ge($hiragana, $katakana) ); -##### 38..43 +##### 42..47 # hangul ok( $Collator->eq("a\x{AC00}b", "a\x{1100}\x{1161}b") ); @@ -164,7 +171,7 @@ ok( $Collator->lt("a\x{AC00}b", "a\x{AE00}b") ); ok( $Collator->gt("a\x{D7A3}b", "a\x{C544}b") ); ok( $Collator->lt("a\x{C544}b", "a\x{30A2}b") ); # hangul < hiragana -##### 44..52 +##### 48..56 $Collator->change(%old_level, katakana_before_hiragana => 1); @@ -179,7 +186,7 @@ ok( $Collator->ne($hiragana, $katakana) ); ok( $Collator->gt($hiragana, $katakana) ); ok( $Collator->ge($hiragana, $katakana) ); -##### 53..58 +##### 57..62 $Collator->change(upper_before_lower => 1); @@ -190,7 +197,7 @@ ok( $Collator->cmp($hiragana, $katakana), 1); ok( $Collator->ge($hiragana, $katakana), 1); ok( $Collator->gt($hiragana, $katakana), 1); -##### 59..64 +##### 63..68 $Collator->change(katakana_before_hiragana => 0); @@ -204,7 +211,7 @@ ok( $Collator->le("abc", "ABC") ); ok( $Collator->cmp($hiragana, $katakana), -1); ok( $Collator->lt($hiragana, $katakana) ); -##### 65..66 +##### 69..70 my $ignoreAE = Unicode::Collate->new( table => 'keys.txt', @@ -215,7 +222,7 @@ my $ignoreAE = Unicode::Collate->new( ok($ignoreAE->eq("element","lament")); ok($ignoreAE->eq("Perl","ePrl")); -##### 67 +##### 71 my $onlyABC = Unicode::Collate->new( table => undef, @@ -235,7 +242,7 @@ ok( join(':', qw/ A aB Ab ABA BAC cAc cc / ), ); -##### 68..71 +##### 72..75 my $undefAE = Unicode::Collate->new( table => 'keys.txt', @@ -248,7 +255,7 @@ ok($Collator->lt("edge","fog")); ok($undefAE ->gt("lake","like")); ok($Collator->lt("lake","like")); -##### 72..81 +##### 76..85 # Table is undefined, then no entry is defined. @@ -282,7 +289,7 @@ ok($undef_table->lt("\x{4E00}","\x{4E8C}")); # U+4E8C: Ideograph "TWO" -##### 82..86 +##### 86..90 my $few_entries = Unicode::Collate->new( entry => <<'ENTRIES', @@ -313,7 +320,7 @@ ok($few_entries->lt("\x{AE30}", "\x{AC00}")); ok($few_entries->eq("\x{AC00}", "\x{1100}\x{1161}")); -##### 87..91 +##### 91..95 my $all_undef_8 = Unicode::Collate->new( table => undef, @@ -332,7 +339,7 @@ ok($all_undef_8->lt("\x{4E00}", "\x{AC00}")); ok($all_undef_8->gt("\x{AC00}", "\x{1100}\x{1161}")); ok($all_undef_8->gt("\x{AC00}", "\x{ABFF}")); -##### 92..96 +##### 96..100 my $all_undef_9 = Unicode::Collate->new( table => undef, @@ -351,7 +358,7 @@ ok($all_undef_9->lt("\x{20000}", "\x{AC00}")); ok($all_undef_9->gt("\x{AC00}", "\x{1100}\x{1161}")); ok($all_undef_9->gt("\x{AC00}", "\x{ABFF}")); # U+ABFF: not assigned -##### 97..101 +##### 101..105 my $ignoreCJK = Unicode::Collate->new( table => undef, @@ -370,7 +377,7 @@ ok($ignoreCJK->eq("Pe\x{4E00}rl", "Perl")); # U+4E00 is a CJK. ok($ignoreCJK->gt("\x{4DFF}", "\x{4E00}")); # U+4DFF is not CJK. ok($ignoreCJK->lt("Pe\x{5B57}rl", "Perl")); # 'r' is unassigned. -##### 102..106 +##### 106..110 my $ignoreHangul = Unicode::Collate->new( table => undef, @@ -389,45 +396,7 @@ ok($ignoreHangul->lt("\x{AC00}", "\x{AE00}")); ok($ignoreHangul->lt("\x{AC00}", "\x{1100}\x{1161}")); # Jamo are not ignored. ok($ignoreHangul->lt("Pe\x{AE00}rl", "Perl")); # 'r' is unassigned. -##### 107..127 - -my %origAlter = $Collator->change(alternate => 'Blanked'); - -ok($Collator->lt("death", "de luge")); -ok($Collator->lt("de luge", "de-luge")); -ok($Collator->lt("de-luge", "deluge")); -ok($Collator->lt("deluge", "de\x{2010}luge")); -ok($Collator->lt("deluge", "de Luge")); - -$Collator->change(alternate => 'Non-ignorable'); - -ok($Collator->lt("de luge", "de Luge")); -ok($Collator->lt("de Luge", "de-luge")); -ok($Collator->lt("de-Luge", "de\x{2010}luge")); -ok($Collator->lt("de-luge", "death")); -ok($Collator->lt("death", "deluge")); - -$Collator->change(alternate => 'Shifted'); - -ok($Collator->lt("death", "de luge")); -ok($Collator->lt("de luge", "de-luge")); -ok($Collator->lt("de-luge", "deluge")); -ok($Collator->lt("deluge", "de Luge")); -ok($Collator->lt("de Luge", "deLuge")); - -$Collator->change(alternate => 'Shift-Trimmed'); - -ok($Collator->lt("death", "deluge")); -ok($Collator->lt("deluge", "de luge")); -ok($Collator->lt("de luge", "de-luge")); -ok($Collator->lt("de-luge", "deLuge")); -ok($Collator->lt("deLuge", "de Luge")); - -$Collator->change(%origAlter); - -ok($Collator->{alternate}, 'shifted'); - -##### 128..132 +##### 111..115 my $overCJK = Unicode::Collate->new( table => undef, @@ -449,62 +418,7 @@ ok($overCJK->lt("A\x{4E03}", "A\x{4E00}")); ok($overCJK->lt("A\x{4E03}", "a\x{4E00}")); ok($overCJK->lt("a\x{4E03}", "A\x{4E00}")); -##### 133..144 - -# rearrange : 0x0E40..0x0E44, 0x0EC0..0x0EC4 (default) - -my %old_rearrange = $Collator->change(rearrange => undef); - -ok($Collator->gt("\x{0E41}A", "\x{0E40}B")); -ok($Collator->gt("A\x{0E41}A", "A\x{0E40}B")); - -$Collator->change(rearrange => [ 0x61 ]); - # U+0061, 'a': This is a Unicode value, never a native value. - -ok($Collator->gt("ab", "AB")); # as 'ba' > 'AB' - -$Collator->change(%old_rearrange); - -ok($Collator->lt("ab", "AB")); -ok($Collator->lt("\x{0E40}", "\x{0E41}")); -ok($Collator->lt("\x{0E40}A", "\x{0E41}B")); -ok($Collator->lt("\x{0E41}A", "\x{0E40}B")); -ok($Collator->lt("A\x{0E41}A", "A\x{0E40}B")); - -ok($all_undef_8->lt("\x{0E40}", "\x{0E41}")); -ok($all_undef_8->lt("\x{0E40}A", "\x{0E41}B")); -ok($all_undef_8->lt("\x{0E41}A", "\x{0E40}B")); -ok($all_undef_8->lt("A\x{0E41}A", "A\x{0E40}B")); - -##### 145..149 - -my $no_rearrange = Unicode::Collate->new( - table => undef, - normalization => undef, - rearrange => [], -); - -ok($no_rearrange->lt("A", "B")); -ok($no_rearrange->lt("\x{0E40}", "\x{0E41}")); -ok($no_rearrange->lt("\x{0E40}A", "\x{0E41}B")); -ok($no_rearrange->gt("\x{0E41}A", "\x{0E40}B")); -ok($no_rearrange->gt("A\x{0E41}A", "A\x{0E40}B")); - -##### 150..154 - -my $undef_rearrange = Unicode::Collate->new( - table => undef, - normalization => undef, - rearrange => undef, -); - -ok($undef_rearrange->lt("A", "B")); -ok($undef_rearrange->lt("\x{0E40}", "\x{0E41}")); -ok($undef_rearrange->lt("\x{0E40}A", "\x{0E41}B")); -ok($undef_rearrange->gt("\x{0E41}A", "\x{0E40}B")); -ok($undef_rearrange->gt("A\x{0E41}A", "A\x{0E40}B")); - -##### 155..159 +##### 116..120 my $dropArticles = Unicode::Collate->new( table => "keys.txt", @@ -522,7 +436,7 @@ ok($dropArticles->lt("the pen", "a pencil")); ok($Collator->lt("Perl", "The Perl")); ok($Collator->gt("the pen", "a pencil")); -##### 160..161 +##### 121..122 my $backLevel1 = Unicode::Collate->new( table => undef, @@ -535,7 +449,7 @@ my $backLevel1 = Unicode::Collate->new( ok($backLevel1->gt("AB", "BA")); ok($backLevel1->gt("\x{3042}\x{3044}", "\x{3044}\x{3042}")); -##### 162..169 +##### 123..130 my $backLevel2 = Unicode::Collate->new( table => "keys.txt", @@ -557,41 +471,7 @@ ok($backLevel2->lt("\x{4E03}", $katakana)); ok($Collator ->gt("\x{4E00}", $hiragana)); ok($Collator ->gt("\x{4E03}", $katakana)); -##### 170..184 - -# ignorable after variable - -# Shifted; -ok($Collator->eq("?\x{300}!\x{301}\x{315}", "?!")); -ok($Collator->eq("?\x{300}A\x{301}", "?$A_acute")); -ok($Collator->eq("?\x{300}", "?")); -ok($Collator->eq("?\x{344}", "?")); # U+0344 has two CEs. - -$Collator->change(level => 3); -ok($Collator->eq("\cA", "?")); - -$Collator->change(alternate => 'blanked', level => 4); -ok($Collator->eq("?\x{300}!\x{301}\x{315}", "?!")); -ok($Collator->eq("?\x{300}A\x{301}", "?$A_acute")); -ok($Collator->eq("?\x{300}", "?")); -ok($Collator->eq("?\x{344}", "?")); # U+0344 has two CEs. - -$Collator->change(level => 3); -ok($Collator->eq("\cA", "?")); - -$Collator->change(alternate => 'Non-ignorable', level => 4); - -ok($Collator->lt("?\x{300}", "?!")); -ok($Collator->gt("?\x{300}A$acute", "?$A_acute")); -ok($Collator->gt("?\x{300}", "?")); -ok($Collator->gt("?\x{344}", "?")); - -$Collator->change(level => 3); -ok($Collator->lt("\cA", "?")); - -$Collator->change(alternate => 'Shifted', level => 4); - -##### 185..196 +##### 131..142 # According to Conformance Test, # a L3-ignorable is treated as a completely ignorable. @@ -631,7 +511,7 @@ ok($L3ignorable->eq("\x{1D1BC}", "\x{1D1BA}")); ok($L3ignorable->eq("\x{1D1BB}", "\x{1D1B9}\x{1D165}")); ok($L3ignorable->eq("\x{1D1BC}", "\x{1D1BA}\x{1D165}")); -##### 197..203 +##### 143..149 my $O_str = Unicode::Collate->new( table => "keys.txt", @@ -665,4 +545,35 @@ ok($Collator->eq("\x{200B}", "\0")); ok($O_str ->gt("\x{200B}", "\0")); ok($O_str ->gt("\x{200B}", "A")); +##### 150..159 + +my %origVer = $Collator->change(UCA_Version => 8); + +$Collator->change(level => 3); + +ok($Collator->gt("!\x{300}", "")); +ok($Collator->gt("!\x{300}", "!")); +ok($Collator->eq("!\x{300}", "\x{300}")); + +$Collator->change(level => 2); + +ok($Collator->eq("!\x{300}", "\x{300}")); + +$Collator->change(level => 4); + +ok($Collator->gt("!\x{300}", "!")); +ok($Collator->lt("!\x{300}", "\x{300}")); + +$Collator->change(%origVer, level => 3); + +ok($Collator->eq("!\x{300}", "")); +ok($Collator->eq("!\x{300}", "!")); +ok($Collator->lt("!\x{300}", "\x{300}")); + +$Collator->change(level => 4); + +ok($Collator->gt("!\x{300}", "")); +ok($Collator->eq("!\x{300}", "!")); + ##### + diff --git a/gnu/usr.bin/perl/lib/Unicode/Collate/t/version.t b/gnu/usr.bin/perl/lib/Unicode/Collate/t/version.t index 0a6d448e1e3..fec144c9d79 100644 --- a/gnu/usr.bin/perl/lib/Unicode/Collate/t/version.t +++ b/gnu/usr.bin/perl/lib/Unicode/Collate/t/version.t @@ -25,7 +25,7 @@ ok(1); ######################### -# Fix me when UCA and/or key.txt is upgraded. +# Fix me when UCA and/or keys.txt is upgraded. my $UCA_Version = "11"; my $Base_Unicode_Version = "4.0"; my $Key_Version = "3.1.1"; diff --git a/gnu/usr.bin/perl/lib/Unicode/Collate/t/view.t b/gnu/usr.bin/perl/lib/Unicode/Collate/t/view.t new file mode 100755 index 00000000000..578d4843e57 --- /dev/null +++ b/gnu/usr.bin/perl/lib/Unicode/Collate/t/view.t @@ -0,0 +1,239 @@ + +BEGIN { + unless ("A" eq pack('U', 0x41)) { + print "1..0 # Unicode::Collate " . + "cannot stringify a Unicode code point\n"; + exit 0; + } +} + +BEGIN { + if ($ENV{PERL_CORE}) { + chdir('t') if -d 't'; + @INC = $^O eq 'MacOS' ? qw(::lib) : qw(../lib); + } +} + +use Test; +BEGIN { plan tests => 53 }; + +use strict; +use warnings; +use Unicode::Collate; + +######################### + +ok(1); + +my $Collator = Unicode::Collate->new( + table => 'keys.txt', + normalization => undef, +); + +############## + +ok($Collator->viewSortKey(""), "[| | |]"); + +ok($Collator->viewSortKey("A"), "[0A15 | 0020 | 0008 | FFFF]"); + +ok($Collator->viewSortKey("ABC"), + "[0A15 0A29 0A3D | 0020 0020 0020 | 0008 0008 0008 | FFFF FFFF FFFF]"); + +ok($Collator->viewSortKey("(12)"), + "[0A0C 0A0D | 0020 0020 | 0002 0002 | 027A FFFF FFFF 027B]"); + +ok($Collator->viewSortKey("!\x{300}"), "[| | | 024B]"); + +ok($Collator->viewSortKey("\x{300}"), "[| 0035 | 0002 | FFFF]"); + +$Collator->change(level => 3); +ok($Collator->viewSortKey("A"), "[0A15 | 0020 | 0008 |]"); + +$Collator->change(level => 2); +ok($Collator->viewSortKey("A"), "[0A15 | 0020 | |]"); + +$Collator->change(level => 1); +ok($Collator->viewSortKey("A"), "[0A15 | | |]"); + +### Version 8 + +$Collator->change(level => 4, UCA_Version => 8); + +ok($Collator->viewSortKey(""), "[|||]"); + +ok($Collator->viewSortKey("A"), "[0A15|0020|0008|FFFF]"); + +ok($Collator->viewSortKey("ABC"), + "[0A15 0A29 0A3D|0020 0020 0020|0008 0008 0008|FFFF FFFF FFFF]"); + +ok($Collator->viewSortKey("(12)"), + "[0A0C 0A0D|0020 0020|0002 0002|027A FFFF FFFF 027B]"); + +ok($Collator->viewSortKey("!\x{300}"), "[|0035|0002|024B FFFF]"); + +ok($Collator->viewSortKey("\x{300}"), "[|0035|0002|FFFF]"); + +$Collator->change(level => 3); +ok($Collator->viewSortKey("A"), "[0A15|0020|0008|]"); + +$Collator->change(level => 2); +ok($Collator->viewSortKey("A"), "[0A15|0020||]"); + +$Collator->change(level => 1); +ok($Collator->viewSortKey("A"), "[0A15|||]"); + +# Version 9 + +$Collator->change(level => 3, UCA_Version => 9); +ok($Collator->viewSortKey("A\x{300}z\x{301}"), + "[0A15 0C13 | 0020 0035 0020 0032 | 0008 0002 0002 0002 |]"); + +$Collator->change(backwards => 1); +ok($Collator->viewSortKey("A\x{300}z\x{301}"), + "[0C13 0A15 | 0020 0035 0020 0032 | 0008 0002 0002 0002 |]"); + +$Collator->change(backwards => 2); +ok($Collator->viewSortKey("A\x{300}z\x{301}"), + "[0A15 0C13 | 0032 0020 0035 0020 | 0008 0002 0002 0002 |]"); + +$Collator->change(backwards => [1,3]); +ok($Collator->viewSortKey("A\x{300}z\x{301}"), + "[0C13 0A15 | 0020 0035 0020 0032 | 0002 0002 0002 0008 |]"); + +$Collator->change(backwards => [2]); +ok($Collator->viewSortKey("\x{300}\x{301}\x{302}\x{303}"), + "[| 004E 003C 0032 0035 | 0002 0002 0002 0002 |]"); + +$Collator->change(backwards => []); +ok($Collator->viewSortKey("A\x{300}z\x{301}"), + "[0A15 0C13 | 0020 0035 0020 0032 | 0008 0002 0002 0002 |]"); + +$Collator->change(level => 4); + +# Variable + +our %origVar = $Collator->change(variable => 'Blanked'); +ok($Collator->viewSortKey("1+2"), + '[0A0C 0A0D | 0020 0020 | 0002 0002 | 0031 002B 0032]'); + +ok($Collator->viewSortKey("?\x{300}!\x{301}\x{315}."), + '[| | | 003F 0021 002E]'); + +ok($Collator->viewSortKey("?!."), '[| | | 003F 0021 002E]'); + +$Collator->change(variable => 'Non-ignorable'); +ok($Collator->viewSortKey("1+2"), + '[0A0C 039F 0A0D | 0020 0020 0020 | 0002 0002 0002 | 0031 002B 0032]'); + +ok($Collator->viewSortKey("?\x{300}!"), + '[024E 024B | 0020 0035 0020 | 0002 0002 0002 | 003F 0300 0021]'); + +ok($Collator->viewSortKey("?!."), + '[024E 024B 0255 | 0020 0020 0020 | 0002 0002 0002 | 003F 0021 002E]'); + +$Collator->change(variable => 'Shifted'); +ok($Collator->viewSortKey("1+2"), + '[0A0C 0A0D | 0020 0020 | 0002 0002 | FFFF 039F FFFF]'); + +ok($Collator->viewSortKey("?\x{300}!\x{301}\x{315}."), + '[| | | 024E 024B 0255]'); + +ok($Collator->viewSortKey("?!."), '[| | | 024E 024B 0255]'); + +$Collator->change(variable => 'Shift-Trimmed'); +ok($Collator->viewSortKey("1+2"), + '[0A0C 0A0D | 0020 0020 | 0002 0002 | 039F]'); + +ok($Collator->viewSortKey("?\x{300}!\x{301}\x{315}."), + '[| | | 024E 024B 0255]'); + +ok($Collator->viewSortKey("?!."), '[| | | 024E 024B 0255]'); + +$Collator->change(%origVar); + +##### + +# Level 3 weight + +ok($Collator->viewSortKey("a\x{3042}"), + '[0A15 1921 | 0020 0020 | 0002 000E | FFFF FFFF]'); + +ok($Collator->viewSortKey("A\x{30A2}"), + '[0A15 1921 | 0020 0020 | 0008 0011 | FFFF FFFF]'); + +$Collator->change(upper_before_lower => 1); + +ok($Collator->viewSortKey("a\x{3042}"), + '[0A15 1921 | 0020 0020 | 0008 000E | FFFF FFFF]'); + +ok($Collator->viewSortKey("A\x{30A2}"), + '[0A15 1921 | 0020 0020 | 0002 0011 | FFFF FFFF]'); + +$Collator->change(katakana_before_hiragana => 1); + +ok($Collator->viewSortKey("a\x{3042}"), + '[0A15 1921 | 0020 0020 | 0008 0013 | FFFF FFFF]'); +ok($Collator->viewSortKey("A\x{30A2}"), + '[0A15 1921 | 0020 0020 | 0002 000F | FFFF FFFF]'); + +$Collator->change(upper_before_lower => 0); + +ok($Collator->viewSortKey("a\x{3042}"), + '[0A15 1921 | 0020 0020 | 0002 0013 | FFFF FFFF]'); + +ok($Collator->viewSortKey("A\x{30A2}"), + '[0A15 1921 | 0020 0020 | 0008 000F | FFFF FFFF]'); + +$Collator->change(katakana_before_hiragana => 0); + +ok($Collator->viewSortKey("a\x{3042}"), + '[0A15 1921 | 0020 0020 | 0002 000E | FFFF FFFF]'); + +ok($Collator->viewSortKey("A\x{30A2}"), + '[0A15 1921 | 0020 0020 | 0008 0011 | FFFF FFFF]'); + +##### + +our $el = Unicode::Collate->new( + entry => <<'ENTRY', +006C ; [.0B03.0020.0002.006C] # LATIN SMALL LETTER L +FF4C ; [.0B03.0020.0003.FF4C] # FULLWIDTH LATIN SMALL LETTER L; QQK +217C ; [.0B03.0020.0004.217C] # SMALL ROMAN NUMERAL FIFTY; QQK +2113 ; [.0B03.0020.0005.2113] # SCRIPT SMALL L; QQK +24DB ; [.0B03.0020.0006.24DB] # CIRCLED LATIN SMALL LETTER L; QQK +004C ; [.0B03.0020.0008.004C] # LATIN CAPITAL LETTER L +FF2C ; [.0B03.0020.0009.FF2C] # FULLWIDTH LATIN CAPITAL LETTER L; QQK +216C ; [.0B03.0020.000A.216C] # ROMAN NUMERAL FIFTY; QQK +2112 ; [.0B03.0020.000B.2112] # SCRIPT CAPITAL L; QQK +24C1 ; [.0B03.0020.000C.24C1] # CIRCLED LATIN CAPITAL LETTER L; QQK +ENTRY + table => undef, + normalization => undef, +); + +our $el12 = '0B03 0B03 0B03 0B03 0B03 | 0020 0020 0020 0020 0020'; + +ok($el->viewSortKey("l\x{FF4C}\x{217C}\x{2113}\x{24DB}"), + "[$el12 | 0002 0003 0004 0005 0006 | FFFF FFFF FFFF FFFF FFFF]"); + +ok($el->viewSortKey("L\x{FF2C}\x{216C}\x{2112}\x{24C1}"), + "[$el12 | 0008 0009 000A 000B 000C | FFFF FFFF FFFF FFFF FFFF]"); + +$el->change(upper_before_lower => 1); + +ok($el->viewSortKey("l\x{FF4C}\x{217C}\x{2113}\x{24DB}"), + "[$el12 | 0008 0009 000A 000B 000C | FFFF FFFF FFFF FFFF FFFF]"); + +ok($el->viewSortKey("L\x{FF2C}\x{216C}\x{2112}\x{24C1}"), + "[$el12 | 0002 0003 0004 0005 0006 | FFFF FFFF FFFF FFFF FFFF]"); + +$el->change(upper_before_lower => 0); + +ok($el->viewSortKey("l\x{FF4C}\x{217C}\x{2113}\x{24DB}"), + "[$el12 | 0002 0003 0004 0005 0006 | FFFF FFFF FFFF FFFF FFFF]"); + +ok($el->viewSortKey("L\x{FF2C}\x{216C}\x{2112}\x{24C1}"), + "[$el12 | 0008 0009 000A 000B 000C | FFFF FFFF FFFF FFFF FFFF]"); + +##### + |