diff options
author | 2019-02-13 21:15:00 +0000 | |
---|---|---|
committer | 2019-02-13 21:15:00 +0000 | |
commit | 9f11ffb7133c203312a01e4b986886bc88c7d74b (patch) | |
tree | 6618511204c614b20256e4ef9dea39a7b311d638 /gnu/usr.bin/perl/pod | |
parent | Import perl-5.28.1 (diff) | |
download | wireguard-openbsd-9f11ffb7133c203312a01e4b986886bc88c7d74b.tar.xz wireguard-openbsd-9f11ffb7133c203312a01e4b986886bc88c7d74b.zip |
Fix merge issues, remove excess files - match perl-5.28.1 dist
looking good sthen@, Great! bluhm@
Diffstat (limited to 'gnu/usr.bin/perl/pod')
65 files changed, 6023 insertions, 3973 deletions
diff --git a/gnu/usr.bin/perl/pod/buildtoc b/gnu/usr.bin/perl/pod/buildtoc index acb8cf62a76..004a726a0fb 100644 --- a/gnu/usr.bin/perl/pod/buildtoc +++ b/gnu/usr.bin/perl/pod/buildtoc @@ -1,12 +1,12 @@ #!/usr/bin/perl -w use strict; -use vars qw($Quiet); use File::Spec; use FindBin; use Text::Wrap; use Getopt::Long; +our $Quiet; no locale; # Assumption is that we're either already being run from the top level (*nix, @@ -14,11 +14,11 @@ no locale; BEGIN { my $Top = File::Spec->catdir($FindBin::Bin, File::Spec->updir); chdir $Top or die "Can't chdir to $Top: $!"; - require 'Porting/pod_lib.pl'; + require './Porting/pod_lib.pl'; } die "$0: Usage: $0 [--quiet]\n" - unless GetOptions (quiet => \$Quiet) && !@ARGV; + unless GetOptions ('q|quiet' => \$Quiet) && !@ARGV; my $state = get_pod_metadata(0, sub { warn @_ if @_ }, 'pod/perltoc.pod'); @@ -186,7 +186,7 @@ my ($inhead1, $inhead2, $initem); sub podset { my ($pod, $file) = @_; - open my $fh, '<', $file or my_die "Can't open file '$file' for $pod: $!"; + open my $fh, '<:raw', $file or my_die "Can't open file '$file' for $pod: $!"; local *_; my $found_pod; diff --git a/gnu/usr.bin/perl/pod/perl.pod b/gnu/usr.bin/perl/pod/perl.pod index 32eed57f1ce..fbbb952b6bb 100644 --- a/gnu/usr.bin/perl/pod/perl.pod +++ b/gnu/usr.bin/perl/pod/perl.pod @@ -34,7 +34,7 @@ For ease of access, the Perl manual has been split up into several sections. # This section is parsed by Porting/pod_lib.pl for use by pod/buildtoc etc -flag =g perlmodlib perlapi perlintern +flag =g perluniprops perlmodlib perlapi perlintern flag =go perltoc flag =ro perlcn perljp perlko perltw flag = perlvms @@ -44,7 +44,7 @@ path perlglossary cpan/perlfaq/lib/ path perlxs(?:tut|typemap)? 
dist/ExtUtils-ParseXS/lib/ path perldoc cpan/Pod-Perldoc/ -aux c2ph h2ph h2xs perlbug pl2pm pod2html pod2man splain xsubpp +aux h2ph h2xs perlbug pl2pm pod2html pod2man splain xsubpp =end buildtoc @@ -98,6 +98,7 @@ aux c2ph h2ph h2xs perlbug pl2pm pod2html pod2man splain xsubpp perlpodspec Perl plain old documentation format specification perlpodstyle Perl POD style guide perldiag Perl diagnostic messages + perldeprecation Perl deprecations perllexwarn Perl warnings and their control perldebug Perl debugging perlvar Perl predefined variables @@ -180,6 +181,12 @@ aux c2ph h2ph h2xs perlbug pl2pm pod2html pod2man splain xsubpp perlhist Perl history records perldelta Perl changes since previous version + perl5280delta Perl changes in version 5.28.0 + perl5262delta Perl changes in version 5.26.2 + perl5261delta Perl changes in version 5.26.1 + perl5260delta Perl changes in version 5.26.0 + perl5244delta Perl changes in version 5.24.4 + perl5243delta Perl changes in version 5.24.3 perl5242delta Perl changes in version 5.24.2 perl5241delta Perl changes in version 5.24.1 perl5240delta Perl changes in version 5.24.0 diff --git a/gnu/usr.bin/perl/pod/perl5200delta.pod b/gnu/usr.bin/perl/pod/perl5200delta.pod index 874d8d10de0..427a2a017cb 100644 --- a/gnu/usr.bin/perl/pod/perl5200delta.pod +++ b/gnu/usr.bin/perl/pod/perl5200delta.pod @@ -2247,7 +2247,7 @@ Introduced by L<perl #113536|https://rt.perl.org/Public/Bug/Display.html?id=113536>, a memory leak on every call to C<system> and backticks (C< `` >), on most Win32 Perls starting from 5.18.0 has been fixed. The memory leak only occurred if you -enabled psuedo-fork in your build of Win32 Perl, and were running that build on +enabled pseudo-fork in your build of Win32 Perl, and were running that build on Server 2003 R2 or newer OS. The leak does not appear on WinXP SP3. [L<perl #121676|https://rt.perl.org/Public/Bug/Display.html?id=121676>] @@ -2736,7 +2736,7 @@ don't depend on the locale. 
[perl #120675] =item * -Under certain conditions, Perl would throw an error if in an lookbehind +Under certain conditions, Perl would throw an error if in a lookbehind assertion in a regexp, the assertion referred to a named subpattern, complaining the lookbehind was variable when it wasn't. This has been fixed. [perl #120600], [perl #120618]. The current fix may be improved diff --git a/gnu/usr.bin/perl/pod/perl58delta.pod b/gnu/usr.bin/perl/pod/perl58delta.pod index 8b81d4c2739..1997ff91d62 100644 --- a/gnu/usr.bin/perl/pod/perl58delta.pod +++ b/gnu/usr.bin/perl/pod/perl58delta.pod @@ -318,7 +318,7 @@ tainted data and in some future release they will produce fatal errors. The existing behaviour when localising tied arrays and hashes is wrong, and will be changed in a future release, so do not rely on the existing -behaviour. See L<"Localising Tied Arrays and Hashes Is Broken">. +behaviour. See L</"Localising Tied Arrays and Hashes Is Broken">. =back diff --git a/gnu/usr.bin/perl/pod/perlapio.pod b/gnu/usr.bin/perl/pod/perlapio.pod index 8e0f82e705b..4aa3c113f38 100644 --- a/gnu/usr.bin/perl/pod/perlapio.pod +++ b/gnu/usr.bin/perl/pod/perlapio.pod @@ -58,7 +58,7 @@ perlapio - perl's IO abstraction interface. SSize_t PerlIO_get_bufsiz(PerlIO *f); PerlIO *PerlIO_importFILE(FILE *stdio, const char *mode); - FILE *PerlIO_exportFILE(PerlIO *f, int flags); + FILE *PerlIO_exportFILE(PerlIO *f, const char *mode); FILE *PerlIO_findFILE(PerlIO *f); void PerlIO_releaseFILE(PerlIO *f,FILE *stdio); @@ -82,7 +82,7 @@ C<PerlIO *> takes the place of FILE *. Like FILE * it should be treated as opaque (it is probably safe to assume it is a pointer to something). -There are currently three implementations: +There are currently two implementations: =over 4 @@ -138,7 +138,7 @@ when C<NULL> is returned if this limit is exceeded. 
=item B<PerlIO_reopen(path,mode,f)> -While this currently exists in all three implementations perl itself +While this currently exists in both implementations, perl itself does not use it. I<As perl does not use it, it is not well tested.> Perl prefers to C<dup> the new low-level descriptor to the descriptor diff --git a/gnu/usr.bin/perl/pod/perlcall.pod b/gnu/usr.bin/perl/pod/perlcall.pod index c41d8357911..9a268aa4c10 100644 --- a/gnu/usr.bin/perl/pod/perlcall.pod +++ b/gnu/usr.bin/perl/pod/perlcall.pod @@ -1223,8 +1223,6 @@ I<PrintList>. static void call_PrintList() { - dSP; - call_argv("PrintList", G_DISCARD, words); } diff --git a/gnu/usr.bin/perl/pod/perlcheat.pod b/gnu/usr.bin/perl/pod/perlcheat.pod index 6e4e919ff50..99a8dfc5472 100644 --- a/gnu/usr.bin/perl/pod/perlcheat.pod +++ b/gnu/usr.bin/perl/pod/perlcheat.pod @@ -41,7 +41,7 @@ already be overwhelming. && /i case insensitive ^ string begin || // /m line based ^$ $ str end (bfr \n) .. ... /s . includes \n + one or more - ?: /x ignore wh.space * zero or more + ?: /x /xx ign. wh.space * zero or more = += last goto /p preserve ? zero or one , => /a ASCII /aa safe {3,7} repeat in range list ops /l locale /d dual | alternation diff --git a/gnu/usr.bin/perl/pod/perlcommunity.pod b/gnu/usr.bin/perl/pod/perlcommunity.pod index 4b86740afd6..bb55b67a6c7 100644 --- a/gnu/usr.bin/perl/pod/perlcommunity.pod +++ b/gnu/usr.bin/perl/pod/perlcommunity.pod @@ -32,9 +32,6 @@ contributors. If you don't see a certain project listed at L<http://lists.perl.org>, check the particular website for that project. Most mailing lists are archived at L<http://nntp.perl.org/>. -There are also plenty of Perl related newsgroups located under -C<comp.lang.perl.*>. - =head2 IRC The Perl community has a rather large IRC presence. 
For starters, it has its diff --git a/gnu/usr.bin/perl/pod/perldata.pod b/gnu/usr.bin/perl/pod/perldata.pod index a285eb7d43e..d03fe257730 100644 --- a/gnu/usr.bin/perl/pod/perldata.pod +++ b/gnu/usr.bin/perl/pod/perldata.pod @@ -25,7 +25,7 @@ be a chain of identifiers, separated by C<::> (or by the slightly archaic C<'>); all but the last are interpreted as names of packages, to locate the namespace in which to look up the final identifier (see L<perlmod/Packages> for details). For a more in-depth discussion -on identifiers, see L<Identifier parsing>. It's possible to +on identifiers, see L</Identifier parsing>. It's possible to substitute for a simple identifier, an expression that produces a reference to the value at runtime. This is described in more detail below and in L<perlref>. @@ -212,12 +212,12 @@ example is C<${^GLOBAL_PHASE}>. =item 5. -A sigil, followed by any single character in the range C<[\x80-\xFF]> +A sigil, followed by any single character in the range C<[\xA1-\xAC\xAE-\xFF]> when not under C<S<"use utf8">>. (Under C<S<"use utf8">>, the normal identifier rules given earlier in this section apply.) Use of non-graphic characters (the C1 controls, the NO-BREAK SPACE, and the -SOFT HYPHEN) is deprecated and will be forbidden in a future Perl -version. The use of the other characters is unwise, as these are all +SOFT HYPHEN) has been disallowed since v5.26.0. +The use of the other characters is unwise, as these are all reserved to have special meaning to Perl, and none of them currently do have special meaning, though this could change without notice. @@ -236,9 +236,9 @@ where the first character is any one of the characters in the range C<[\x80-\xFF]> followed by ASCII word characters up to the trailing brace. -The same caveats as the previous form apply: The non-graphic characters -are deprecated, it is unwise to use this form at all, and utf8ness makes -a big difference. 
+The same caveats as the previous form apply: The non-graphic +characters are no longer allowed with S<"use utf8">, it is unwise +to use this form at all, and utf8ness makes a big difference. =back @@ -320,12 +320,17 @@ are considered pretty much the same thing for nearly all purposes, references are strongly-typed, uncastable pointers with builtin reference-counting and destructor invocation. +X<truth> X<falsehood> X<true> X<false> X<!> X<not> X<negation> X<0> +X<boolean> X<bool> A scalar value is interpreted as FALSE in the Boolean sense if it is undefined, the null string or the number 0 (or its string equivalent, "0"), and TRUE if it is anything else. The Boolean context is just a special kind of scalar context where no conversion to a string or a number is ever performed. -X<boolean> X<bool> X<true> X<false> X<truth> +Negation of a true value by C<!> or C<not> returns a special false value. +When evaluated as a string it is treated as C<"">, but as a number, it +is treated as 0. Most Perl operators +that return true or false behave this way. There are actually two varieties of null strings (sometimes referred to as "empty" strings), a defined one and an undefined one. The @@ -399,18 +404,25 @@ leave nothing to doubt: $element_count = scalar(@whatever); -If you evaluate a hash in scalar context, it returns false if the -hash is empty. If there are any key/value pairs, it returns true; -more precisely, the value returned is a string consisting of the +If you evaluate a hash in scalar context, it returns a false value if +the hash is empty. If there are any key/value pairs, it returns a +true value. A more precise definition is version dependent. + +Prior to Perl 5.25 the value returned was a string consisting of the number of used buckets and the number of allocated buckets, separated by a slash. This is pretty much useful only to find out whether Perl's internal hashing algorithm is performing poorly on your data set. 
For example, you stick 10,000 things in a hash, but evaluating %HASH in scalar context reveals C<"1/16">, which means only one out of sixteen buckets has been touched, and presumably contains all -10,000 of your items. This isn't supposed to happen. If a tied hash -is evaluated in scalar context, the C<SCALAR> method is called (with a -fallback to C<FIRSTKEY>). +10,000 of your items. This isn't supposed to happen. + +As of Perl 5.25 the return was changed to be the count of keys in the +hash. If you need access to the old behavior you can use +C<Hash::Util::bucket_ratio()> instead. + +If a tied hash is evaluated in scalar context, the C<SCALAR> method is +called (with a fallback to C<FIRSTKEY>). X<hash, scalar context> X<hash, bucket> X<bucket> You can preallocate space for a hash by assigning to the keys() function. @@ -531,7 +543,7 @@ The infinity and not-a-number have their own special arithmetic rules. The general rule is that they are "contagious": C<Inf> plus one is C<Inf>, and C<NaN> plus one is C<NaN>. Where things get interesting is when you combine infinities and not-a-numbers: C<Inf> minus C<Inf> -and C<Inf> divided by C<INf> are C<NaN> (while C<Inf> plus C<Inf> is +and C<Inf> divided by C<Inf> are C<NaN> (while C<Inf> plus C<Inf> is C<Inf> and C<Inf> times C<Inf> is C<Inf>). C<NaN> is also curious in that it does not equal any number, I<including> itself: C<NaN> != C<NaN>. @@ -607,6 +619,17 @@ introduced, __END__ behaves like __DATA__ in the top level script (but not in files loaded with C<require> or C<do>) and leaves the remaining contents of the file accessible via C<main::DATA>. +The C<DATA> file handle by default has whatever PerlIO layers were +in place when Perl read the file to parse the source. Normally that +means that the file is being read bytewise, as if it were encoded in +Latin-1, but there are two major ways for it to be otherwise. 
Firstly, +if the C<__END__>/C<__DATA__> token is in the scope of a C<use utf8> +pragma then the C<DATA> handle will be in UTF-8 mode. And secondly, +if the source is being read from perl's standard input then the C<DATA> +file handle is actually aliased to the C<STDIN> file handle, and may +be in UTF-8 mode because of the C<PERL_UNICODE> environment variable or +perl's command-line switches. + See L<SelfLoader> for more description of __DATA__, and an example of its use. Note that you cannot read from the DATA filehandle in a BEGIN block: the BEGIN block is executed as soon @@ -766,6 +789,53 @@ As of Perl 5.22, you can also use C<(undef)x2> instead of C<undef, undef>. (You can also do C<($x) x 2>, which is less useful, because it assigns to the same variable twice, clobbering the first value assigned.) +When you assign a list of scalars to an array, all previous values in that +array are wiped out and the number of elements in the array will now be equal to +the number of elements in the right-hand list -- the list from which +assignment was made. The array will automatically resize itself to precisely +accommodate each element in the right-hand list. + + use warnings; + my (@xyz, $x, $y, $z); + + @xyz = (1, 2, 3); + print "@xyz\n"; # 1 2 3 + + @xyz = ('al', 'be', 'ga', 'de'); + print "@xyz\n"; # al be ga de + + @xyz = (101, 102); + print "@xyz\n"; # 101 102 + +When, however, you assign a list of scalars to another list of scalars, the +results differ according to whether the left-hand list -- the list being +assigned to -- has the same, more or fewer elements than the right-hand list. + + ($x, $y, $z) = (1, 2, 3); + print "$x $y $z\n"; # 1 2 3 + + ($x, $y, $z) = ('al', 'be', 'ga', 'de'); + print "$x $y $z\n"; # al be ga + + ($x, $y, $z) = (101, 102); + print "$x $y $z\n"; # 101 102 + # Use of uninitialized value $z in concatenation (.) + # or string at [program] line [line number]. 
+ +If the number of scalars in the left-hand list is less than that in the +right-hand list, the "extra" scalars in the right-hand list will simply not be +assigned. + +If the number of scalars in the left-hand list is greater than that in the +right-hand list, the "missing" scalars will become undefined. + + ($x, $y, $z) = (101, 102); + for my $el ($x, $y, $z) { + (defined $el) ? print "$el " : print "<undef>"; + } + print "\n"; + # 101 102 <undef> + List assignment in scalar context returns the number of elements produced by the expression on the right side of the assignment: @@ -1007,8 +1077,11 @@ returning a list of key/value pairs rather than just values: %h = (blonk => 2, foo => 3, squink => 5, bar => 8); %subset = %h{'foo', 'bar'}; # key/value hash slice # %subset is now (foo => 3, bar => 8) + %removed = delete %h{'foo', 'bar'}; + # %removed is now (foo => 3, bar => 8) + # %h is now (blonk => 2, squink => 5) -However, the result of such a slice cannot be localized, deleted or used +However, the result of such a slice cannot be localized or used in assignment. These are otherwise very much consistent with hash slices using the @ symbol. @@ -1021,6 +1094,12 @@ of index/value pairs: @a = "a".."z"; @list = %a[3,4,6]; # @list is now (3, "d", 4, "e", 6, "g") + @removed = delete %a[3,4,6]; + # @removed is now (3, "d", 4, "e", 6, "g") + # @a[3,4,6] are now undef + +Note that calling L<C<delete>|perlfunc/delete EXPR> on array values is +strongly discouraged. =head2 Typeglobs and Filehandles X<typeglob> X<filehandle> X<*> diff --git a/gnu/usr.bin/perl/pod/perldebguts.pod b/gnu/usr.bin/perl/pod/perldebguts.pod index c463ff0315b..99c177159ba 100644 --- a/gnu/usr.bin/perl/pod/perldebguts.pod +++ b/gnu/usr.bin/perl/pod/perldebguts.pod @@ -94,9 +94,13 @@ unless C<< $^D & (1<<30) >> is true. =item * When execution of the program reaches a subroutine call, a call to -C<&DB::sub>(I<args>) is made instead, with C<$DB::sub> holding the -name of the called subroutine.
(This doesn't happen if the subroutine -was compiled in the C<DB> package.) +C<&DB::sub>(I<args>) is made instead, with C<$DB::sub> set to identify +the called subroutine. (This doesn't happen if the calling subroutine +was compiled in the C<DB> package.) C<$DB::sub> normally holds the name +of the called subroutine, if it has a name by which it can be looked up. +Failing that, C<$DB::sub> will hold a reference to the called subroutine. +Either way, the C<&DB::sub> subroutine can use C<$DB::sub> as a reference +by which to call the called subroutine, which it will normally want to do. X<&DB::lsub>If the call is to an lvalue subroutine, and C<&DB::lsub> is defined C<&DB::lsub>(I<args>) is called instead, otherwise falling @@ -104,10 +108,15 @@ back to C<&DB::sub>(I<args>). =item * -When execution of the program uses C<goto> to enter a non-XS -subroutine and the 0x80 bit is set in C<$^P>, a call to C<&DB::goto> -is made, with C<$DB::sub> holding the name of the subroutine being -entered. +When execution of the program uses C<goto> to enter a non-XS subroutine +and the 0x80 bit is set in C<$^P>, a call to C<&DB::goto> is made, with +C<$DB::sub> set to identify the subroutine being entered. The call to +C<&DB::goto> does not replace the C<goto>; the requested subroutine will +still be entered once C<&DB::goto> has returned. C<$DB::sub> normally +holds the name of the subroutine being entered, if it has one. Failing +that, C<$DB::sub> will hold a reference to the subroutine being entered. +Unlike when C<&DB::sub> is called, it is not guaranteed that C<$DB::sub> +can be used as a reference to operate on the subroutine being entered. =back @@ -556,62 +565,68 @@ will be lost. # Exit points - END no End of program. - SUCCEED no Return from a subroutine, basically. + END no End of program. + SUCCEED no Return from a subroutine, basically. 
# Line Start Anchors: - SBOL no Match "" at beginning of line: /^/, /\A/ - MBOL no Same, assuming multiline: /^/m + SBOL no Match "" at beginning of line: /^/, /\A/ + MBOL no Same, assuming multiline: /^/m # Line End Anchors: - SEOL no Match "" at end of line: /$/ - MEOL no Same, assuming multiline: /$/m - EOS no Match "" at end of string: /\z/ + SEOL no Match "" at end of line: /$/ + MEOL no Same, assuming multiline: /$/m + EOS no Match "" at end of string: /\z/ # Match Start Anchors: - GPOS no Matches where last m//g left off. + GPOS no Matches where last m//g left off. # Word Boundary Opcodes: - BOUND no Like BOUNDA for non-utf8, otherwise match "" - between any Unicode \w\W or \W\w - BOUNDL no Like BOUND/BOUNDU, but \w and \W are defined - by current locale - BOUNDU no Match "" at any boundary of a given type - using Unicode rules - BOUNDA no Match "" at any boundary between \w\W or - \W\w, where \w is [_a-zA-Z0-9] - NBOUND no Like NBOUNDA for non-utf8, otherwise match - "" between any Unicode \w\w or \W\W - NBOUNDL no Like NBOUND/NBOUNDU, but \w and \W are - defined by current locale - NBOUNDU no Match "" at any non-boundary of a given type - using using Unicode rules - NBOUNDA no Match "" betweeen any \w\w or \W\W, where \w - is [_a-zA-Z0-9] + BOUND no Like BOUNDA for non-utf8, otherwise match + "" between any Unicode \w\W or \W\w + BOUNDL no Like BOUND/BOUNDU, but \w and \W are + defined by current locale + BOUNDU no Match "" at any boundary of a given type + using Unicode rules + BOUNDA no Match "" at any boundary between \w\W or + \W\w, where \w is [_a-zA-Z0-9] + NBOUND no Like NBOUNDA for non-utf8, otherwise match + "" between any Unicode \w\w or \W\W + NBOUNDL no Like NBOUND/NBOUNDU, but \w and \W are + defined by current locale + NBOUNDU no Match "" at any non-boundary of a given + type using Unicode rules + NBOUNDA no Match "" between any \w\w or \W\W, where + \w is [_a-zA-Z0-9] # [Special] alternatives: - REG_ANY no Match any one character
(except newline). - SANY no Match any one character. - ANYOF sv 1 Match character in (or not in) this class, - single char match only - ANYOFD sv 1 Like ANYOF, but /d is in effect - ANYOFL sv 1 Like ANYOF, but /l is in effect + REG_ANY no Match any one character (except newline). + SANY no Match any one character. + ANYOF sv 1 Match character in (or not in) this class, + single char match only + ANYOFD sv 1 Like ANYOF, but /d is in effect + ANYOFL sv 1 Like ANYOF, but /l is in effect + ANYOFM byte 1 Like ANYOF, but matches an invariant byte + as determined by the mask and arg # POSIX Character Classes: - POSIXD none Some [[:class:]] under /d; the FLAGS field - gives which one - POSIXL none Some [[:class:]] under /l; the FLAGS field - gives which one - POSIXU none Some [[:class:]] under /u; the FLAGS field - gives which one - POSIXA none Some [[:class:]] under /a; the FLAGS field - gives which one - NPOSIXD none complement of POSIXD, [[:^class:]] - NPOSIXL none complement of POSIXL, [[:^class:]] - NPOSIXU none complement of POSIXU, [[:^class:]] - NPOSIXA none complement of POSIXA, [[:^class:]] - - CLUMP no Match any extended grapheme cluster sequence + POSIXD none Some [[:class:]] under /d; the FLAGS field + gives which one + POSIXL none Some [[:class:]] under /l; the FLAGS field + gives which one + POSIXU none Some [[:class:]] under /u; the FLAGS field + gives which one + POSIXA none Some [[:class:]] under /a; the FLAGS field + gives which one + NPOSIXD none complement of POSIXD, [[:^class:]] + NPOSIXL none complement of POSIXL, [[:^class:]] + NPOSIXU none complement of POSIXU, [[:^class:]] + NPOSIXA none complement of POSIXA, [[:^class:]] + + ASCII none [[:ascii:]] + NASCII none [[:^ascii:]] + + CLUMP no Match any extended grapheme cluster + sequence # Alternation @@ -624,39 +639,40 @@ will be lost. # pointer of each individual branch points; each branch # starts with the operand node of a BRANCH node. # - BRANCH node Match this alternative, or the next... 
+ BRANCH node Match this alternative, or the next... # Literals - EXACT str Match this string (preceded by length). - EXACTL str Like EXACT, but /l is in effect (used so - locale-related warnings can be checked for). - EXACTF str Match this non-UTF-8 string (not guaranteed - to be folded) using /id rules (w/len). - EXACTFL str Match this string (not guaranteed to be - folded) using /il rules (w/len). - EXACTFU str Match this string (folded iff in UTF-8, - length in folding doesn't change if not in - UTF-8) using /iu rules (w/len). - EXACTFA str Match this string (not guaranteed to be - folded) using /iaa rules (w/len). - - EXACTFU_SS str Match this string (folded iff in UTF-8, - length in folding may change even if not in - UTF-8) using /iu rules (w/len). - EXACTFLU8 str Rare cirucmstances: like EXACTFU, but is - under /l, UTF-8, folded, and everything in - it is above 255. - EXACTFA_NO_TRIE str Match this string (which is not trie-able; - not guaranteed to be folded) using /iaa - rules (w/len). + EXACT str Match this string (preceded by length). + EXACTL str Like EXACT, but /l is in effect (used so + locale-related warnings can be checked + for). + EXACTF str Match this non-UTF-8 string (not guaranteed + to be folded) using /id rules (w/len). + EXACTFL str Match this string (not guaranteed to be + folded) using /il rules (w/len). + EXACTFU str Match this string (folded iff in UTF-8, + length in folding doesn't change if not in + UTF-8) using /iu rules (w/len). + EXACTFAA str Match this string (not guaranteed to be + folded) using /iaa rules (w/len). + + EXACTFU_SS str Match this string (folded iff in UTF-8, + length in folding may change even if not in + UTF-8) using /iu rules (w/len). + EXACTFLU8 str Rare circumstances: like EXACTFU, but is + under /l, UTF-8, folded, and everything in + it is above 255. + EXACTFAA_NO_TRIE str Match this string (which is not trie-able; + not guaranteed to be folded) using /iaa + rules (w/len). 
# Do nothing types - NOTHING no Match empty string. + NOTHING no Match empty string. # A variant of above which delimits a group, thus stops optimizations - TAIL no Match empty string. Can jump here from - outside. + TAIL no Match empty string. Can jump here from + outside. # Loops @@ -665,69 +681,74 @@ will be lost. # (one character per match) are implemented with STAR # and PLUS for speed and to minimize recursive plunges. # - STAR node Match this (simple) thing 0 or more times. - PLUS node Match this (simple) thing 1 or more times. + STAR node Match this (simple) thing 0 or more times. + PLUS node Match this (simple) thing 1 or more times. - CURLY sv 2 Match this simple thing {n,m} times. - CURLYN no 2 Capture next-after-this simple thing - CURLYM no 2 Capture this medium-complex thing {n,m} - times. - CURLYX sv 2 Match this complex thing {n,m} times. + CURLY sv 2 Match this simple thing {n,m} times. + CURLYN no 2 Capture next-after-this simple thing + CURLYM no 2 Capture this medium-complex thing {n,m} + times. + CURLYX sv 2 Match this complex thing {n,m} times. # This terminator creates a loop structure for CURLYX - WHILEM no Do curly processing and see if rest matches. + WHILEM no Do curly processing and see if rest + matches. # Buffer related # OPEN,CLOSE,GROUPP ...are numbered at compile time. - OPEN num 1 Mark this point in input as start of #n. - CLOSE num 1 Analogous to OPEN. - - REF num 1 Match some already matched string - REFF num 1 Match already matched string, folded using - native charset rules for non-utf8 - REFFL num 1 Match already matched string, folded in loc. - REFFU num 1 Match already matched string, folded using - unicode rules for non-utf8 - REFFA num 1 Match already matched string, folded using - unicode rules for non-utf8, no mixing ASCII, - non-ASCII + OPEN num 1 Mark this point in input as start of #n. + CLOSE num 1 Close corresponding OPEN of #n. 
+ SROPEN none Same as OPEN, but for script run + SRCLOSE none Close preceding SROPEN + + REF num 1 Match some already matched string + REFF num 1 Match already matched string, folded using + native charset rules for non-utf8 + REFFL num 1 Match already matched string, folded in + loc. + REFFU num 1 Match already matched string, folded using + unicode rules for non-utf8 + REFFA num 1 Match already matched string, folded using + unicode rules for non-utf8, no mixing + ASCII, non-ASCII # Named references. Code in regcomp.c assumes that these all are after # the numbered references - NREF no-sv 1 Match some already matched string - NREFF no-sv 1 Match already matched string, folded using - native charset rules for non-utf8 - NREFFL no-sv 1 Match already matched string, folded in loc. - NREFFU num 1 Match already matched string, folded using - unicode rules for non-utf8 - NREFFA num 1 Match already matched string, folded using - unicode rules for non-utf8, no mixing ASCII, - non-ASCII + NREF no-sv 1 Match some already matched string + NREFF no-sv 1 Match already matched string, folded using + native charset rules for non-utf8 + NREFFL no-sv 1 Match already matched string, folded in + loc. + NREFFU num 1 Match already matched string, folded using + unicode rules for non-utf8 + NREFFA num 1 Match already matched string, folded using + unicode rules for non-utf8, no mixing + ASCII, non-ASCII # Support for long RE - LONGJMP off 1 1 Jump far away. - BRANCHJ off 1 1 BRANCH with long offset. + LONGJMP off 1 1 Jump far away. + BRANCHJ off 1 1 BRANCH with long offset. # Special Case Regops - IFMATCH off 1 2 Succeeds if the following matches. - UNLESSM off 1 2 Fails if the following matches. - SUSPEND off 1 1 "Independent" sub-RE. - IFTHEN off 1 1 Switch, should be preceded by switcher. - GROUPP num 1 Whether the group matched. + IFMATCH off 1 2 Succeeds if the following matches. + UNLESSM off 1 2 Fails if the following matches. + SUSPEND off 1 1 "Independent" sub-RE. 
+ IFTHEN off 1 1 Switch, should be preceded by switcher. + GROUPP num 1 Whether the group matched. # The heavy worker - EVAL evl/flags Execute some Perl code. - 2L + EVAL evl/flags Execute some Perl code. + 2L # Modifiers - MINMOD no Next operator is not greedy. - LOGICAL no Next opcode should set the flag only. + MINMOD no Next operator is not greedy. + LOGICAL no Next opcode should set the flag only. # This is not used yet - RENUM off 1 1 Group with independently numbered parens. + RENUM off 1 1 Group with independently numbered parens. # Trie Related @@ -735,60 +756,60 @@ will be lost. # have inline charclass data (ascii only), the 'C' store it in the # structure. - TRIE trie 1 Match many EXACT(F[ALU]?)? at once. - flags==type - TRIEC trie Same as TRIE, but with embedded charclass - charclass data + TRIE trie 1 Match many EXACT(F[ALU]?)? at once. + flags==type + TRIEC trie Same as TRIE, but with embedded charclass + charclass data - AHOCORASICK trie 1 Aho Corasick stclass. flags==type - AHOCORASICKC trie Same as AHOCORASICK, but with embedded - charclass charclass data + AHOCORASICK trie 1 Aho Corasick stclass. flags==type + AHOCORASICKC trie Same as AHOCORASICK, but with embedded + charclass charclass data # Regex Subroutines - GOSUB num/ofs 2L recurse to paren arg1 at (signed) ofs arg2 + GOSUB num/ofs 2L recurse to paren arg1 at (signed) ofs arg2 # Special conditionals - NGROUPP no-sv 1 Whether the group matched. - INSUBP num 1 Whether we are in a specific recurse. - DEFINEP none 1 Never execute directly. + NGROUPP no-sv 1 Whether the group matched. + INSUBP num 1 Whether we are in a specific recurse. + DEFINEP none 1 Never execute directly. 
# Backtracking Verbs - ENDLIKE none Used only for the type field of verbs - OPFAIL no-sv 1 Same as (?!), but with verb arg - ACCEPT no-sv/num Accepts the current matched string, with - 2L verbar + ENDLIKE none Used only for the type field of verbs + OPFAIL no-sv 1 Same as (?!), but with verb arg + ACCEPT no-sv/num Accepts the current matched string, with + 2L verbar # Verbs With Arguments - VERB no-sv 1 Used only for the type field of verbs - PRUNE no-sv 1 Pattern fails at this startpoint if no- - backtracking through this - MARKPOINT no-sv 1 Push the current location for rollback by - cut. - SKIP no-sv 1 On failure skip forward (to the mark) before - retrying - COMMIT no-sv 1 Pattern fails outright if backtracking - through this - CUTGROUP no-sv 1 On failure go to the next alternation in the - group + VERB no-sv 1 Used only for the type field of verbs + PRUNE no-sv 1 Pattern fails at this startpoint if no- + backtracking through this + MARKPOINT no-sv 1 Push the current location for rollback by + cut. + SKIP no-sv 1 On failure skip forward (to the mark) + before retrying + COMMIT no-sv 1 Pattern fails outright if backtracking + through this + CUTGROUP no-sv 1 On failure go to the next alternation in + the group # Control what to keep in $&. - KEEPS no $& begins here. + KEEPS no $& begins here. # New charclass like patterns - LNBREAK none generic newline pattern + LNBREAK none generic newline pattern # SPECIAL REGOPS # This is not really a node, but an optimized away piece of a "long" # node. To simplify debugging output, we mark it as if it were a node - OPTIMIZED off Placeholder for dump. + OPTIMIZED off Placeholder for dump. # Special opcode with the property that no opcode in a compiled program # will ever be of this type. Thus it can be used as a flag value that # no other opcode has been seen. END is used similarly, in that an END # node cant be optimized. So END implies "unoptimizable" and PSEUDO # mean "not seen anything to optimize yet". 
- PSEUDO off Pseudo opcode for internal use. + PSEUDO off Pseudo opcode for internal use. =for regcomp.pl end diff --git a/gnu/usr.bin/perl/pod/perldebug.pod b/gnu/usr.bin/perl/pod/perldebug.pod index 5762235aa9d..53c3d602871 100644 --- a/gnu/usr.bin/perl/pod/perldebug.pod +++ b/gnu/usr.bin/perl/pod/perldebug.pod @@ -8,10 +8,12 @@ perldebug - Perl debugging First of all, have you tried using L<C<use strict;>|strict> and L<C<use warnings;>|warnings>? - If you're new to the Perl debugger, you may prefer to read L<perldebtut>, which is a tutorial introduction to the debugger. +If you're looking for the nitty gritty details of how the debugger is +I<implemented>, you may prefer to read L<perldebguts>. + =head1 The Perl Debugger If you invoke Perl with the B<-d> switch, your script runs under the @@ -126,7 +128,7 @@ hashes, you'll probably prefer 'x \%h' rather than 'x %h'. See L<Dumpvalue> if you'd like to do this yourself. The output format is governed by multiple options described under -L<"Configurable Options">. +L</"Configurable Options">. If the C<maxdepth> is included, it must be a numeral I<N>; the value is dumped only I<N> levels deep, as if the C<dumpDepth> option had been @@ -474,7 +476,7 @@ For historical reasons, the C<=value> is optional, but defaults to 1 only where it is safe to do so--that is, mostly for Boolean options. It is always better to assign a specific value using C<=>. The C<option> can be abbreviated, but for clarity probably should -not be. Several options can be set together. See L<"Configurable Options"> +not be. Several options can be set together. See L</"Configurable Options"> for a list of these. =item < ? 
diff --git a/gnu/usr.bin/perl/pod/perldelta.pod b/gnu/usr.bin/perl/pod/perldelta.pod index d22e0f20486..ad87d40297a 100644 --- a/gnu/usr.bin/perl/pod/perldelta.pod +++ b/gnu/usr.bin/perl/pod/perldelta.pod @@ -2,43 +2,39 @@ =head1 NAME -perldelta - what is new for perl v5.24.3 +perldelta - what is new for perl v5.28.1 =head1 DESCRIPTION -This document describes differences between the 5.24.2 release and the 5.24.3 +This document describes differences between the 5.28.0 release and the 5.28.1 release. -If you are upgrading from an earlier release such as 5.24.1, first read -L<perl5242delta>, which describes differences between 5.24.1 and 5.24.2. +If you are upgrading from an earlier release such as 5.26.0, first read +L<perl5280delta>, which describes differences between 5.26.0 and 5.28.0. =head1 Security -=head2 [CVE-2017-12837] Heap buffer overflow in regular expression compiler +=head2 [CVE-2018-18311] Integer overflow leading to buffer overflow and segmentation fault -Compiling certain regular expression patterns with the case-insensitive -modifier could cause a heap buffer overflow and crash perl. This has now been -fixed. -L<[perl #131582]|https://rt.perl.org/Public/Bug/Display.html?id=131582> +Integer arithmetic in C<Perl_my_setenv()> could wrap when the combined length +of the environment variable name and value exceeded around 0x7fffffff. This +could lead to writing beyond the end of an allocated buffer with attacker +supplied data. -=head2 [CVE-2017-12883] Buffer over-read in regular expression parser +L<[perl #133204]|https://rt.perl.org/Ticket/Display.html?id=133204> -For certain types of syntax error in a regular expression pattern, the error -message could either contain the contents of a random, possibly large, chunk of -memory, or could crash perl. This has now been fixed. 
-L<[perl #131598]|https://rt.perl.org/Public/Bug/Display.html?id=131598> +=head2 [CVE-2018-18312] Heap-buffer-overflow write in S_regatom (regcomp.c) -=head2 [CVE-2017-12814] C<$ENV{$key}> stack buffer overflow on Windows +A crafted regular expression could cause heap-buffer-overflow write during +compilation, potentially allowing arbitrary code execution. -A possible stack buffer overflow in the C<%ENV> code on Windows has been fixed -by removing the buffer completely since it was superfluous anyway. -L<[perl #131665]|https://rt.perl.org/Public/Bug/Display.html?id=131665> +L<[perl #133423]|https://rt.perl.org/Ticket/Display.html?id=133423> =head1 Incompatible Changes -There are no changes intentionally incompatible with 5.24.2. If any exist, -they are bugs, and we request that you submit a report. See L</Reporting -Bugs> below. +There are no changes intentionally incompatible with 5.28.0. If any exist, +they are bugs, and we request that you submit a report. See +L</Reporting Bugs> below. =head1 Modules and Pragmata @@ -48,83 +44,7 @@ Bugs> below. =item * -L<Module::CoreList> has been upgraded from version 5.20170715_24 to -5.20170922_24. - -=item * - -L<POSIX> has been upgraded from version 1.65 to 1.65_01. - -=item * - -L<Time::HiRes> has been upgraded from version 1.9733 to 1.9741. - -L<[perl #128427]|https://rt.perl.org/Public/Bug/Display.html?id=128427> -L<[perl #128445]|https://rt.perl.org/Public/Bug/Display.html?id=128445> -L<[perl #128972]|https://rt.perl.org/Public/Bug/Display.html?id=128972> -L<[cpan #120032]|https://rt.cpan.org/Public/Bug/Display.html?id=120032> - -=back - -=head1 Configuration and Compilation - -=over 4 - -=item * - -When building with GCC 6 and link-time optimization (the B<-flto> option to -B<gcc>), F<Configure> was treating all probed symbols as present on the system, -regardless of whether they actually exist. This has been fixed. 
-L<[perl #128131]|https://rt.perl.org/Public/Bug/Display.html?id=128131> - -=item * - -F<Configure> now aborts if both C<-Duselongdouble> and C<-Dusequadmath> are -requested. -L<[perl #126203]|https://rt.perl.org/Public/Bug/Display.html?id=126203> - -=item * - -Fixed a bug in which F<Configure> could append C<-quadmath> to the archname -even if it was already present. -L<[perl #128538]|https://rt.perl.org/Public/Bug/Display.html?id=128538> - -=item * - -Clang builds with C<-DPERL_GLOBAL_STRUCT> or C<-DPERL_GLOBAL_STRUCT_PRIVATE> -have been fixed (by disabling Thread Safety Analysis for these configurations). - -=back - -=head1 Platform Support - -=head2 Platform-Specific Notes - -=over 4 - -=item VMS - -=over 4 - -=item * - -C<configure.com> now recognizes the VSI-branded C compiler. - -=back - -=item Windows - -=over 4 - -=item * - -Building XS modules with GCC 6 in a 64-bit build of Perl failed due to -incorrect mapping of C<strtoll> and C<strtoull>. This has now been fixed. -L<[perl #131726]|https://rt.perl.org/Public/Bug/Display.html?id=131726> -L<[cpan #121683]|https://rt.cpan.org/Public/Bug/Display.html?id=121683> -L<[cpan #122353]|https://rt.cpan.org/Public/Bug/Display.html?id=122353> - -=back +L<Module::CoreList> has been upgraded from version 5.20180622 to 5.20181129_28. =back @@ -134,163 +54,41 @@ L<[cpan #122353]|https://rt.cpan.org/Public/Bug/Display.html?id=122353> =item * -C<< /@0{0*-E<gt>@*/*0 >> and similar contortions used to crash, but no longer -do, but merely produce a syntax error. -L<[perl #128171]|https://rt.perl.org/Public/Bug/Display.html?id=128171> - -=item * - -C<do> or C<require> with an argument which is a reference or typeglob which, -when stringified, contains a null character, started crashing in Perl 5.20, but +Perl 5.28 introduced an C<index()> optimization when comparing to -1 (or +indirectly, e.g. >= 0). When this optimization was triggered inside a C<when> +clause it caused a warning ("Argument %s isn't numeric in smart match"). 
This has now been fixed. -L<[perl #128182]|https://rt.perl.org/Public/Bug/Display.html?id=128182> +L<[perl #133368]|https://rt.perl.org/Ticket/Display.html?id=133368> =item * -Expressions containing an C<&&> or C<||> operator (or their synonyms C<and> and -C<or>) were being compiled incorrectly in some cases. If the left-hand side -consisted of either a negated bareword constant or a negated C<do {}> block -containing a constant expression, and the right-hand side consisted of a -negated non-foldable expression, one of the negations was effectively ignored. -The same was true of C<if> and C<unless> statement modifiers, though with the -left-hand and right-hand sides swapped. This long-standing bug has now been -fixed. -L<[perl #127952]|https://rt.perl.org/Public/Bug/Display.html?id=127952> +Matching of decimal digits in script runs, introduced in Perl 5.28, had a bug +that led to C<"1\N{THAI DIGIT FIVE}"> matching C</^(*sr:\d+)$/> when it should +not. This has now been fixed. =item * -C<reset> with an argument no longer crashes when encountering stash entries -other than globs. -L<[perl #128106]|https://rt.perl.org/Public/Bug/Display.html?id=128106> - -=item * - -Assignment of hashes to, and deletion of, typeglobs named C<*::::::> no longer -causes crashes. -L<[perl #128086]|https://rt.perl.org/Public/Bug/Display.html?id=128086> - -=item * - -Assignment variants of any bitwise ops under the C<bitwise> feature would crash -if the left-hand side was an array or hash. -L<[perl #128204]|https://rt.perl.org/Public/Bug/Display.html?id=128204> - -=item * - -C<socket> now leaves the error code returned by the system in C<$!> on failure. -L<[perl #128316]|https://rt.perl.org/Public/Bug/Display.html?id=128316> - -=item * - -Parsing bad POSIX charclasses no longer leaks memory. -L<[perl #128313]|https://rt.perl.org/Public/Bug/Display.html?id=128313> - -=item * - -Since Perl 5.20, line numbers have been off by one when perl is invoked with -the B<-x> switch. 
This has been fixed. -L<[perl #128508]|https://rt.perl.org/Public/Bug/Display.html?id=128508> - -=item * - -Some obscure cases of subroutines and file handles being freed at the same time -could result in crashes, but have been fixed. The crash was introduced in Perl -5.22. -L<[perl #128597]|https://rt.perl.org/Public/Bug/Display.html?id=128597> - -=item * - -Some regular expression parsing glitches could lead to assertion failures with -regular expressions such as C</(?E<lt>=/> and C</(?E<lt>!/>. This has now been -fixed. -L<[perl #128170]|https://rt.perl.org/Public/Bug/Display.html?id=128170> - -=item * - -C<gethostent> and similar functions now perform a null check internally, to -avoid crashing with the torsocks library. This was a regression from Perl -5.22. -L<[perl #128740]|https://rt.perl.org/Public/Bug/Display.html?id=128740> - -=item * - -Mentioning the same constant twice in a row (which is a syntax error) no longer -fails an assertion under debugging builds. This was a regression from Perl -5.20. -L<[perl #126482]|https://rt.perl.org/Public/Bug/Display.html?id=126482> - -=item * - -In Perl 5.24 C<fchown> was changed not to accept negative one as an argument -because in some platforms that is an error. However, in some other platforms -that is an acceptable argument. This change has been reverted. -L<[perl #128967]|https://rt.perl.org/Public/Bug/Display.html?id=128967>. - -=item * - -C<@{x> followed by a newline where C<"x"> represents a control or non-ASCII -character no longer produces a garbled syntax error message or a crash. -L<[perl #128951]|https://rt.perl.org/Public/Bug/Display.html?id=128951> - -=item * - -A regression in Perl 5.24 with C<tr/\N{U+...}/foo/> when the code point was -between 128 and 255 has been fixed. -L<[perl #128734]|https://rt.perl.org/Public/Bug/Display.html?id=128734>. - -=item * - -Many issues relating to C<printf "%a"> of hexadecimal floating point were -fixed. 
In addition, the "subnormals" (formerly known as "denormals") floating -point numbers are now supported both with the plain IEEE 754 floating point -numbers (64-bit or 128-bit) and the x86 80-bit "extended precision". Note that -subnormal hexadecimal floating point literals will give a warning about -"exponent underflow". -L<[perl #128843]|https://rt.perl.org/Public/Bug/Display.html?id=128843> -L<[perl #128888]|https://rt.perl.org/Public/Bug/Display.html?id=128888> -L<[perl #128889]|https://rt.perl.org/Public/Bug/Display.html?id=128889> -L<[perl #128890]|https://rt.perl.org/Public/Bug/Display.html?id=128890> -L<[perl #128893]|https://rt.perl.org/Public/Bug/Display.html?id=128893> -L<[perl #128909]|https://rt.perl.org/Public/Bug/Display.html?id=128909> -L<[perl #128919]|https://rt.perl.org/Public/Bug/Display.html?id=128919> - -=item * - -The parser could sometimes crash if a bareword came after C<evalbytes>. -L<[perl #129196]|https://rt.perl.org/Public/Bug/Display.html?id=129196> - -=item * - -Fixed a place where the regex parser was not setting the syntax error correctly -on a syntactically incorrect pattern. -L<[perl #129122]|https://rt.perl.org/Public/Bug/Display.html?id=129122> - -=item * - -A vulnerability in Perl's C<sprintf> implementation has been fixed by avoiding -a possible memory wrap. -L<[perl #131260]|https://rt.perl.org/Public/Bug/Display.html?id=131260> +The new in-place editing code no longer leaks directory handles. +L<[perl #133314]|https://rt.perl.org/Ticket/Display.html?id=133314> =back =head1 Acknowledgements -Perl 5.24.3 represents approximately 2 months of development since Perl 5.24.2 -and contains approximately 3,200 lines of changes across 120 files from 23 +Perl 5.28.1 represents approximately 5 months of development since Perl 5.28.0 +and contains approximately 6,100 lines of changes across 44 files from 12 authors. 
Excluding auto-generated files, documentation and release tools, there were -approximately 1,600 lines of changes to 56 .pm, .t, .c and .h files. +approximately 700 lines of changes to 12 .pm, .t, .c and .h files. -Perl continues to flourish into its third decade thanks to a vibrant community +Perl continues to flourish into its fourth decade thanks to a vibrant community of users and developers. The following people are known to have contributed -the improvements that became Perl 5.24.3: +the improvements that became Perl 5.28.1: -Aaron Crane, Craig A. Berry, Dagfinn Ilmari Mannsåker, Dan Collins, Daniel -Dragan, Dave Cross, David Mitchell, Eric Herman, Father Chrysostomos, H.Merijn -Brand, Hugo van der Sanden, James E Keenan, Jarkko Hietaniemi, John SJ -Anderson, Karl Williamson, Ken Brown, Lukas Mai, Matthew Horsfall, Stevan -Little, Steve Hay, Steven Humphrey, Tony Cook, Yves Orton. +Aaron Crane, Abigail, Chris 'BinGOs' Williams, Dagfinn Ilmari Mannsåker, David +Mitchell, James E Keenan, John SJ Anderson, Karen Etheridge, Karl Williamson, +Sawyer X, Steve Hay, Tony Cook. The list above is almost certainly incomplete as it is automatically generated from version control history. In particular, it does not include the names of @@ -306,9 +104,8 @@ the F<AUTHORS> file in the Perl source distribution. =head1 Reporting Bugs -If you find what you think is a bug, you might check the articles recently -posted to the comp.lang.perl.misc newsgroup and the perl bug database at -L<https://rt.perl.org/> . There may also be information at +If you find what you think is a bug, you might check the perl bug database +at L<https://rt.perl.org/> . There may also be information at L<http://www.perl.org/> , the Perl Home Page. If you believe you have an unreported bug, please run the L<perlbug> program @@ -318,8 +115,17 @@ will be sent off to perlbug@perl.org to be analysed by the Perl porting team. 
If the bug you are reporting has security implications which make it inappropriate to send to a publicly archived mailing list, then see -L<perlsec/SECURITY VULNERABILITY CONTACT INFORMATION> for details of how to -report the issue. +L<perlsec/SECURITY VULNERABILITY CONTACT INFORMATION> +for details of how to report the issue. + +=head1 Give Thanks + +If you wish to thank the Perl 5 Porters for the work we had done in Perl 5, +you can do so by running the C<perlthanks> program: + + perlthanks + +This will send an email to the Perl 5 Porters list with your show of thanks. =head1 SEE ALSO diff --git a/gnu/usr.bin/perl/pod/perldiag.pod b/gnu/usr.bin/perl/pod/perldiag.pod index 644b8140086..607bfc54697 100644 --- a/gnu/usr.bin/perl/pod/perldiag.pod +++ b/gnu/usr.bin/perl/pod/perldiag.pod @@ -221,6 +221,22 @@ Auto-decrement> for details. (W syntax) You called stat() on an array, but the array will be coerced to a scalar - the number of elements in the array. +=item A signature parameter must start with '$', '@' or '%' + +(F) Each subroutine signature parameter declaration must start with a valid +sigil; for example: + + sub foo ($a, $, $b = 1, @c) {} + +=item A slurpy parameter may not have a default value + +(F) Only scalar subroutine signature parameters may have a default value; +for example: + + sub foo ($a = 1) {} # legal + sub foo (@a = (1)) {} # invalid + sub foo (%a = (a => b)) {} # invalid + =item assertion botched: %s (X) The malloc package that comes with Perl had an internal failure. @@ -298,7 +314,7 @@ the current set of allowed keys of a restricted hash. (F) You wrote C<bless $foo> with one argument after somehow causing the current package to be freed. Perl cannot figure out what to -do, so it throws up in hands in despair. +do, so it throws up its hands in despair. =item Attempt to bless into a reference @@ -402,26 +418,12 @@ assigning through that reference. For example used as an lvalue, which is pretty strange. 
Perhaps you forgot to dereference it first. See L<perlfunc/substr>. -=item Attribute "locked" is deprecated - -(D deprecated) You have used the attributes pragma to modify the -"locked" attribute on a code reference. The :locked attribute is -obsolete, has had no effect since 5005 threads were removed, and -will be removed in a future release of Perl 5. - =item Attribute prototype(%s) discards earlier prototype attribute in same sub (W misc) A sub was declared as sub foo : prototype(A) : prototype(B) {}, for example. Since each sub can only have one prototype, the earlier declaration(s) are discarded while the last one is applied. -=item Attribute "unique" is deprecated - -(D deprecated) You have used the attributes pragma to modify -the "unique" attribute on an array, hash or scalar reference. -The :unique attribute has had no effect since Perl 5.8.8, and -will be removed in a future release of Perl 5. - =item av_reify called on tied array (S debugging) This indicates that something went wrong and Perl got I<very> @@ -532,6 +534,22 @@ a bareword: The C<strict> pragma is useful in avoiding such errors. +=item Bareword in require contains "%s" + +=item Bareword in require maps to disallowed filename "%s" + +=item Bareword in require maps to empty filename + +(F) The bareword form of require has been invoked with a filename which could +not have been generated by a valid bareword permitted by the parser. You +shouldn't be able to get this error from Perl code, but XS code may throw it +if it passes an invalid module name to C<Perl_load_module>. + +=item Bareword in require must not start with a double-colon: "%s" + +(F) In C<require Bare::Word>, the bareword is not allowed to start with a +double-colon. Write C<require ::Foo::Bar> as C<require Foo::Bar> instead. + =item Bareword "%s" not allowed while "strict subs" in use (F) With "strict subs" in use, a bareword is only allowed as a @@ -634,15 +652,17 @@ checking. 
Alternatively, if you are certain that you're calling the function correctly, you may put an ampersand before the name to avoid the warning. See L<perlsub>. -=item Calling POSIX::%s() is deprecated - -(D deprecated) You called a function whose use is deprecated. See -the function's name in L<POSIX> for details. - =item Cannot chr %f (F) You passed an invalid number (like an infinity or not-a-number) to C<chr>. +=item Cannot complete in-place edit of %s: %s + +(F) Your perl script appears to have changed directory while +performing an in-place edit of a file specified by a relative path, +and your system doesn't include the directory relative POSIX functions +needed to handle that. + =item Cannot compress %f in pack (F) You tried compressing an infinity or not-a-number as an unsigned @@ -677,6 +697,20 @@ be directly assigned to. (S io) You tried to apply an encoding that did not exist to a filehandle, either with open() or binmode(). +=item Cannot open %s as a dirhandle: it is already open as a filehandle + +(F) You tried to use opendir() to associate a dirhandle to a symbol (glob +or scalar) that already holds a filehandle. Since this idiom might render +your code confusing, it was deprecated in Perl 5.10. As of Perl 5.28, it +is a fatal error. + +=item Cannot open %s as a filehandle: it is already open as a dirhandle + +(F) You tried to use open() to associate a filehandle to a symbol (glob +or scalar) that already holds a dirhandle. Since this idiom might render +your code confusing, it was deprecated in Perl 5.10. As of Perl 5.28, it +is a fatal error. + =item Cannot pack %f with '%c' (F) You tried converting an infinity or not-a-number to an integer, @@ -703,9 +737,9 @@ Perl code, but are only used internally. (F) Some XS code tried to use C<sv_catpvfn()> or a related function with a format string that specifies explicit indexes for some of the elements, and -using a C-style variable-argument list (a C<va_list>). This is not currently -supported. 
XS authors wanting to do this must instead construct a C array of -C<SV*> scalars containing the arguments. +using a C-style variable-argument list (a C<va_list>). This is not currently +supported. XS authors wanting to do this must instead construct a C array +of C<SV*> scalars containing the arguments. =item Can only compress unsigned integers in pack @@ -812,6 +846,13 @@ C<foreach> loop nor a C<given> block. (Note that this error is issued on exit from the C<default> block, so you won't get the error if you use an explicit C<continue>.) +=item Can't determine class of operator %s, assuming BASEOP + +(S) This warning indicates something wrong in the internals of perl. +Perl was trying to find the class (e.g. LISTOP) of a particular OP, +and was unable to do so. This is likely to be due to a bug in the perl +internals, or due to a bug in XS code which manipulates perl optrees. + =item Can't do inplace edit: %s is not a regular file (S inplace) You tried to use the B<-i> switch on a special file, such as @@ -990,6 +1031,25 @@ pipe, Perl can't retrieve its name for later use. (P) An error peculiar to VMS. Perl asked $GETSYI how big you want your mailbox buffers to be, and didn't get an answer. +=item Can't "goto" into a binary or list expression + +(F) A "goto" statement was executed to jump into the middle of a binary +or list expression. You can't get there from here. The reason for this +restriction is that the interpreter would get confused as to how many +arguments there are, resulting in stack corruption or crashes. This +error occurs in cases such as these: + + goto F; + print do { F: }; # Can't jump into the arguments to print + + goto G; + $x + do { G: $y }; # How is + supposed to get its first operand? + +=item Can't "goto" into a "given" block + +(F) A "goto" statement was executed to jump into the middle of a C<given> +block. You can't get there from here. See L<perlfunc/goto>. 
+ =item Can't "goto" into the middle of a foreach loop (F) A "goto" statement was executed to jump into the middle of a foreach @@ -1146,6 +1206,8 @@ a NULL. =item Can't modify non-lvalue subroutine call of &%s +=item Can't modify non-lvalue subroutine call of &%s in %s + (F) Subroutines meant to be used in lvalue context should be declared as such. See L<perlsub/"Lvalue subroutines">. @@ -1273,9 +1335,14 @@ loops once. See L<perlfunc/redo>. file. Perl was unable to remove the original file to replace it with the modified file. The file was left unmodified. +=item Can't rename in-place work file '%s' to '%s': %s + +(F) When closed implicitly, the temporary file for in-place editing +couldn't be renamed to the original filename. + =item Can't rename %s to %s: %s, skipping file -(S inplace) The rename done by the B<-i> switch failed for some reason, +(F) The rename done by the B<-i> switch failed for some reason, probably because you don't have write permission to the directory. =item Can't reopen input pipe (name: %s) in binary mode @@ -1346,6 +1413,11 @@ with Perl, though, if you really want to do that. however, redefine it while it's running, and you can even undef the redefined subroutine while the old routine is running. Go figure. +=item Can't unweaken a nonreference + +(F) You attempted to unweaken something that was not a reference. Only +references can be unweakened. + =item Can't upgrade %s (%d) to %d (P) The internal sv_upgrade routine adds "members" to an SV, making it @@ -1599,7 +1671,8 @@ uses the character values modulus 256 instead, as if you had provided: unpack("s", "\x{f3}b") -=item charnames alias definitions may not contain a sequence of multiple spaces +=item charnames alias definitions may not contain a sequence of multiple +spaces; marked by S<<-- HERE> in %s (F) You defined a character name which had multiple space characters in a row. Change them to single spaces. 
Usually these names are @@ -1607,7 +1680,8 @@ defined in the C<:alias> import argument to C<use charnames>, but they could be defined by a translator installed into C<$^H{charnames}>. See L<charnames/CUSTOM ALIASES>. -=item charnames alias definitions may not contain trailing white-space +=item charnames alias definitions may not contain trailing white-space; +marked by S<<-- HERE> in %s (F) You defined a character name which ended in a space character. Remove the trailing space(s). Usually these names are @@ -1702,7 +1776,23 @@ being readable by a later Perl. instead of Perl. Check the #! line, or manually feed your script into Perl yourself. The #! line at the top of your file could look like - #!/usr/bin/perl -w + #!/usr/bin/perl + +=item %s: command not found + +(A) You've accidentally run your script through B<bash> or another shell +instead of Perl. Check the #! line, or manually feed your script into +Perl yourself. The #! line at the top of your file could look like + + #!/usr/bin/perl + +=item %s: command not found: %s + +(A) You've accidentally run your script through B<zsh> or another shell +instead of Perl. Check the #! line, or manually feed your script into +Perl yourself. The #! line at the top of your file could look like + + #!/usr/bin/perl =item Compilation failed in require @@ -1749,7 +1839,7 @@ usually indicates a syntax error in dereferencing the constant value. See L<perlsub/"Constant Functions"> and L<constant>. =item Constants from lexical variables potentially modified elsewhere are -deprecated +deprecated. This will not be allowed in Perl 5.32 (D deprecated) You wrote something like @@ -1768,8 +1858,8 @@ breaks the behavior of closures, in which the subroutine captures the variable itself, rather than its value, so future changes to the variable are reflected in the subroutine's return value. -This usage is deprecated, because the behavior is likely to change -in a future version of Perl. 
+This usage is deprecated, and will no longer be allowed in Perl 5.32, +making it possible to change the behavior in the future. If you intended for the subroutine to be eligible for inlining, then make sure the variable is not referenced elsewhere, possibly by @@ -1861,6 +1951,18 @@ valid magic number. you have also specified an explicit size for the string. See L<perlfunc/pack>. +=item Declaring references is experimental + +(S experimental::declared_refs) This warning is emitted if you use +a reference constructor on the right-hand side of C<my>, C<state>, C<our>, or +C<local>. Simply suppress the warning if you want to use the feature, but +know that in doing so you are taking the risk of using an experimental +feature which may change or be removed in a future Perl version: + + no warnings "experimental::declared_refs"; + use feature "declared_refs"; + $fooref = my \$foo; + =for comment The following are used in lib/diagnostics.t for testing two =items that share the same description. Changes here need to be propagated to there @@ -1892,17 +1994,6 @@ discovered. (F) You said something like "use Module 42" but in the Module file there are neither package declarations nor a C<$VERSION>. -=item delete argument is index/value array slice, use array slice - -(F) You used index/value array slice syntax (C<%array[...]>) as -the argument to C<delete>. You probably meant C<@array[...]> with -an @ symbol instead. - -=item delete argument is key/value hash slice, use hash slice - -(F) You used key/value hash slice syntax (C<%hash{...}>) as the argument to -C<delete>. You probably meant C<@hash{...}> with an @ symbol instead. 
- =item delete argument is not a HASH or ARRAY element or slice (F) The argument to C<delete> must be either a hash or array element, @@ -1916,13 +2007,18 @@ or a hash or array slice, such as: @foo[$bar, $baz, $xyzzy] @{$ref->[12]}{"susie", "queue"} +or a hash key/value or array index/value slice, such as: + + %foo[$bar, $baz, $xyzzy] + %{$ref->[12]}{"susie", "queue"} + =item Delimiter for here document is too long (F) In a here document construct like C<<<FOO>, the label C<FOO> is too long for Perl to handle. You have to be seriously twisted to write code that triggers this error. -=item Deprecated use of my() in false conditional +=item Deprecated use of my() in false conditional. This will be a fatal error in Perl 5.30 (D deprecated) You used a declaration similar to C<my $x if 0>. There has been a long-standing bug in Perl that causes a lexical variable @@ -1943,6 +2039,9 @@ lexicals that are initialized only once (see L<feature>): sub f { state $x; return $x++ } +This use of C<my()> in a false conditional has been deprecated since +Perl 5.10, and it will become a fatal error in Perl 5.30. + =item DESTROY created new reference to dead object '%s' (F) A DESTROY() method created a new reference to the object which is @@ -1951,7 +2050,7 @@ than to create a dangling reference. =item Did not produce a valid header -See Server error. +See L</500 Server error>. =item %s did not return a true value @@ -1967,7 +2066,7 @@ some such. =item (Did you mean "local" instead of "our"?) -(W misc) Remember that "our" does not localize the declared global +(W shadow) Remember that "our" does not localize the declared global variable. You have declared it again in the same lexical scope, which seems superfluous. @@ -1984,7 +2083,7 @@ you called it with no args and C<$@> was empty. =item Document contains no data -See Server error. +See L</500 Server error>. =item %s does not define %s::VERSION--version check failed @@ -1996,6 +2095,14 @@ define a C<$VERSION>. 
(F) You cannot put a repeat count of any kind right after the '/' code. See L<perlfunc/pack>. +=item do "%s" failed, '.' is no longer in @INC; did you mean do "./%s"? + +(D deprecated) Previously C< do "somefile"; > would search the current +directory for the specified file. Since perl v5.26.0, F<.> has been +removed from C<@INC> by default, so this is no longer true. To search the +current directory (and only the current directory) you can write +C< do "./somefile"; >. + =item Don't know how to get file name (P) C<PerlIO_getname>, a perl internal I/O function specific to VMS, was @@ -2020,10 +2127,15 @@ something that isn't defined yet, you don't actually have to define the subroutine or package before the current location. You can use an empty "sub foo;" or "package FOO;" to enter a "forward" declaration. -=item dump() better written as CORE::dump() +=item dump() better written as CORE::dump(). dump() will no longer be available in Perl 5.30 + +(D deprecated, misc) You used the obsolescent C<dump()> built-in function, +without fully qualifying it as C<CORE::dump()>. Maybe it's a typo. -(W misc) You used the obsolescent C<dump()> built-in function, without fully -qualifying it as C<CORE::dump()>. Maybe it's a typo. See L<perlfunc/dump>. +Use of an unqualified C<dump()> was deprecated in Perl 5.8.0, and this +will not be available in Perl 5.30. + +See L<perlfunc/dump>. =item dump is not supported @@ -2054,6 +2166,14 @@ unlikely to be what you want. described in L<perlunicode> and L<perlre>. You used C<\p> or C<\P> in a regular expression without specifying the property name. +=item ${^ENCODING} is no longer supported + +(F) The special variable C<${^ENCODING}>, formerly used to implement +the C<encoding> pragma, is no longer supported as of Perl 5.26.0. + +Setting it to anything other than C<undef> is a fatal error as of Perl +5.28. 
+ =item entering effective %s failed (F) While under the C<use filetest> pragma, switching the real and @@ -2116,7 +2236,7 @@ variable and glob that. (F) The C<exec> function is not implemented on some systems, e.g., Symbian OS. See L<perlport>. -=item Execution of %s aborted due to compilation errors. +=item %sExecution of %s aborted due to compilation errors. (F) The final summary message when a Perl compilation fails. @@ -2170,6 +2290,26 @@ to denote a capturing group of the form L<C<(?I<PARNO>)>|perlre/(?PARNO) (?-PARNO) (?+PARNO) (?R) (?0)>, but omitted the C<")">. +=item Expecting close paren for nested extended charclass in regex; marked +by <-- HERE in m/%s/ + +(F) While parsing a nested extended character class like: + + (?[ ... (?flags:(?[ ... ])) ... ]) + ^ + +we expected to see a close paren ')' (marked by ^) but did not. + +=item Expecting close paren for wrapper for nested extended charclass in +regex; marked by <-- HERE in m/%s/ + +(F) While parsing a nested extended character class like: + + (?[ ... (?flags:(?[ ... ])) ... ]) + ^ + +we expected to see a close paren ')' (marked by ^) but did not. + =item Expecting '(?flags:(?[...' in regex; marked by S<<-- HERE> in m/%s/ (F) The C<(?[...])> extended character class regular expression construct @@ -2204,14 +2344,6 @@ has been removed. The C<postderef> feature may meet your needs better. use feature "signatures"; sub foo ($left, $right) { ... } -=item Experimental "%s" subs not enabled - -(F) To use lexical subs, you must first enable them: - - no warnings 'experimental::lexical_subs'; - use feature 'lexical_subs'; - my sub foo { ... } - =item Explicit blessing to '' (assuming package main) (W misc) You are blessing a reference to a zero length string. This has @@ -2230,7 +2362,7 @@ Check the #! line, or manually feed your script into Perl yourself. CHECK, INIT, or END subroutine. Processing of the remainder of the queue of such routines has been prematurely ended. 
-=item Failed to close in-place edit file %s: %s +=item Failed to close in-place work file %s: %s (F) Closing an output file from in-place editing, as with the C<-i> command-line switch, failed. @@ -2268,6 +2400,20 @@ which can't encode values above 63. So there is no point in asking for a line length bigger than that. Perl will behave as if you specified C<u63> as the format. +=item File::Glob::glob() will disappear in perl 5.30. Use File::Glob::bsd_glob() instead. + +(D deprecated) C<< File::Glob >> has a function called C<< glob >>, which +just calls C<< bsd_glob >>. However, its prototype is different from the +prototype of C<< CORE::glob >>, and hence, C<< File::Glob::glob >> should +not be used. + +C<< File::Glob::glob() >> was deprecated in perl 5.8.0. A deprecation +message was issued from perl 5.26.0 onwards, and the function will +disappear in perl 5.30.0. + +Code using C<< File::Glob::glob() >> should call +C<< File::Glob::bsd_glob() >> instead. + =item Filehandle %s opened only for input (W io) You tried to write on a read-only filehandle. If you intended @@ -2464,13 +2610,6 @@ created on an emergency basis to prevent a core dump. (F) The parser has given up trying to parse the program after 10 errors. Further error messages would likely be uninformative. -=item Having more than one /%c regexp modifier is deprecated - -(D deprecated, regexp) You used the indicated regular expression pattern -modifier at least twice in a string of modifiers. It is deprecated to -do this with this particular modifier, to allow future extensions to the -Perl language. - =item Hexadecimal float: exponent overflow (W overflow) The hexadecimal floating point has a larger exponent @@ -2527,7 +2666,7 @@ zero-length sequence. When such an escape is used in a character class its behavior is not well defined. Check that the correct escape has been used, and the correct charname handler is in scope. 
-=item Illegal binary digit %s +=item Illegal binary digit '%c' (F) You used a digit other than 0 or 1 in a binary number. @@ -2546,11 +2685,27 @@ or '%', since those two will accept 0 or more final parameters. =item Illegal character \%o (carriage return) -(F) Perl normally treats carriage returns in the program text as it -would any other whitespace, which means you should never see this error -when Perl was built using standard options. For some reason, your -version of Perl appears to have been built without this support. Talk -to your Perl administrator. +(F) Perl normally treats carriage returns in the program text as +it would any other whitespace, which means you should never see +this error when Perl was built using standard options. For some +reason, your version of Perl appears to have been built without +this support. Talk to your Perl administrator. + +=item Illegal character following sigil in a subroutine signature + +(F) A parameter in a subroutine signature contained an unexpected character +following the C<$>, C<@> or C<%> sigil character. Normally the sigil +should be followed by the variable name or C<=> etc. Perhaps you are +trying to use a prototype while in the scope of C<use feature 'signatures'>? +For example: + + sub foo ($$) {} # legal - a prototype + + use feature 'signatures'; + sub foo ($$) {} # illegal - was expecting a signature + sub foo ($a, $b) + :prototype($$) {} # legal + =item Illegal character in prototype for %s : %s @@ -2591,7 +2746,7 @@ numbers don't take to this kindly. (F) The number of bits in vec() (the third argument) must be a power of two from 1 to 32 (or 64, if your platform supports that). -=item Illegal octal digit %s +=item Illegal octal digit '%c' (F) You used an 8 or 9 in an octal number. @@ -2600,6 +2755,17 @@ two from 1 to 32 (or 64, if your platform supports that). (W digit) You may have tried to use an 8 or 9 in an octal number. Interpretation of the octal number stopped before the 8 or 9. 
+=item Illegal operator following parameter in a subroutine signature + +(F) A parameter in a subroutine signature was followed by something +other than C<=> introducing a default, C<,> or C<)>. + + use feature 'signatures'; + sub foo ($=1) {} # legal + sub foo ($a = 1) {} # legal + sub foo ($a += 1) {} # illegal + sub foo ($a == 1) {} # illegal + =item Illegal pattern in regex; marked by S<<-- HERE> in m/%s/ (F) You wrote something like @@ -2667,19 +2833,45 @@ parent '%s' C3-consistent, and you have enabled the C3 MRO for this class. See the C3 documentation in L<mro> for more information. +=item Indentation on line %d of here-doc doesn't match delimiter + +(F) You have an indented here-document where one or more of its lines +have whitespace at the beginning that does not match the closing +delimiter. + +For example, line 2 below is wrong because it does not have at least +2 spaces, but lines 1 and 3 are fine because they have at least 2: + + if ($something) { + print <<~EOF; + Line 1 + Line 2 not + Line 3 + EOF + } + +Note that tabs and spaces are compared strictly, meaning 1 tab will +not match 8 spaces. + =item Infinite recursion in regex (F) You used a pattern that references itself without consuming any input text. You should check the pattern to ensure that recursive patterns either consume text or fail. -=item Initialization of state variables in list context currently forbidden +=item Infinite recursion via empty pattern -(F) Currently the implementation of "state" only permits the -initialization of scalar variables in scalar context. Re-write -C<state ($a) = 42> as C<state $a = 42> to change from list to scalar -context. Constructions such as C<state (@a) = foo()> will be -supported in a future perl release. +(F) You tried to use the empty pattern inside of a regex code block, +for instance C</(?{ s!!! })/>, which resulted in re-executing +the same pattern, which is an infinite loop which is broken by +throwing an exception. 
+ +=item Initialization of state variables in list currently forbidden + +(F) C<state> only permits initializing a single variable, specified +without parentheses. So C<state $a = 42> and C<state @a = qw(a b c)> are +allowed, but not C<state ($a) = 42> or C<(state $a) = 42>. To initialize +more than one C<state> variable, initialize them one at a time. =item %%s[%s] in scalar context better written as $%s[%s] @@ -2817,6 +3009,14 @@ expression pattern should be an indivisible token, with nothing intervening between the C<"("> and the C<"?">, but you separated them with whitespace. +=item In '(*...)', the '(' and '*' must be adjacent in regex; +marked by S<<-- HERE> in m/%s/ + +(F) The two-character sequence C<"(*"> in this context in a regular +expression pattern should be an indivisible token, with nothing +intervening between the C<"("> and the C<"*">, but you separated them. +Fix the pattern and retry. + =item Invalid %s attribute: %s (F) The indicated attribute for a subroutine or variable was not recognized @@ -2974,10 +3174,9 @@ an arbitrary reference was blessed into the "version" class. =item In '(*VERB...)', the '(' and '*' must be adjacent in regex; marked by S<<-- HERE> in m/%s/ -(F) The two-character sequence C<"(*"> in -this context in a regular expression pattern should be an -indivisible token, with nothing intervening between the C<"("> -and the C<"*">, but you separated them. +(F) The two-character sequence C<"(*"> in this context in a regular +expression pattern should be an indivisible token, with nothing +intervening between the C<"("> and the C<"*">, but you separated them. =item ioctl is not implemented @@ -3006,9 +3205,9 @@ neither as a system call nor an ioctl call (SIOCATMARK). Perl. The current valid ones are given in L<perlrebackslash/\b{}, \b, \B{}, \B>. -=item %s() is deprecated on :utf8 handles +=item %s() is deprecated on :utf8 handles. 
This will be a fatal error in Perl 5.30 -(W deprecated) The sysread(), recv(), syswrite() and send() operators are +(D deprecated) The sysread(), recv(), syswrite() and send() operators are deprecated on handles that have the C<:utf8> layer, either explicitly, or implicitly, eg., with the C<:encoding(UTF-16LE)> layer. @@ -3022,19 +3221,19 @@ the layer is some different encoding, such as the example above. Ideally, all of these operators would completely ignore the C<:utf8> state, working only with bytes, but this would result in silently breaking existing -code. To avoid this a future version of perl will throw an exception when -any of sysread(), recv(), syswrite() or send() are called on handle with the -C<:utf8> layer. +code. + +In Perl 5.30, it will no longer be possible to use sysread(), recv(), +syswrite() or send() to read or send bytes from/to :utf8 handles. =item "%s" is more clearly written simply as "%s" in regex; marked by S<<-- HERE> in m/%s/ (W regexp) (only under C<S<use re 'strict'>> or within C<(?[...])>) -You specified a character that has the given plainer way of writing it, -and which is also portable to platforms running with different character -sets. +You specified a character that has the given plainer way of writing it, and +which is also portable to platforms running with different character sets. -=item $* is no longer supported +=item $* is no longer supported. Its use will be fatal in Perl 5.30 (D deprecated, syntax) The special variable C<$*>, deprecated in older perls, has been removed as of 5.10.0 and is no longer supported. In @@ -3046,12 +3245,16 @@ modifiers. You can enable C</m> for a lexical scope (even a whole file) with C<use re '/m'>. (In older versions: when C<$*> was set to a true value then all regular expressions behaved as if they were written using C</m>.) -=item $# is no longer supported +Use of this variable will be a fatal error in Perl 5.30. + +=item $# is no longer supported. 
Its use will be fatal in Perl 5.30 (D deprecated, syntax) The special variable C<$#>, deprecated in older perls, has been removed as of 5.10.0 and is no longer supported. You should use the printf/sprintf functions instead. +Use of this variable will be a fatal error in Perl 5.30. + =item '%s' is not a code reference (W overload) The second (fourth, sixth, ...) argument of @@ -3144,6 +3347,22 @@ L<perlfunc/listen>. form of C<open> does not support pipes, such as C<open($pipe, '|-', @args)>. Use the two-argument C<open($pipe, '|prog arg1 arg2...')> form instead. +=item Literal vertical space in [] is illegal except under /x in regex; +marked by S<<-- HERE> in m/%s/ + +(F) (only under C<S<use re 'strict'>> or within C<(?[...])>) + +Likely you forgot the C</x> modifier or there was a typo in the pattern. +For example, did you really mean to match a form-feed? If so, all the +ASCII vertical space control characters are representable by escape +sequences which won't present such a jarring appearance as your pattern +does when displayed. + + \r carriage return + \f form feed + \n line feed + \cK vertical tab + =item %s: loadable library and perl binaries are mismatched (got handshake key %p, needed %p) (P) A dynamic loading library C<.so> or C<.dll> was being loaded into the @@ -3151,6 +3370,45 @@ process that was built against a different build of perl than the said library was compiled against. Reinstalling the XS module will likely fix this error. +=item Locale '%s' contains (at least) the following characters which +have unexpected meanings: %s The Perl program will use the expected +meanings + +(W locale) You are using the named UTF-8 locale. UTF-8 locales are +expected to have very particular behavior, which most do. This message +arises when perl found some departures from the expectations, and is +notifying you that the expected behavior overrides these differences. 
+In some cases the differences are caused by the locale definition being +defective, but the most common causes of this warning are when there are +ambiguities and conflicts in following the Standard, and the locale has +chosen an approach that differs from Perl's. + +One of these is that, contrary to the claims, Unicode is not +completely locale insensitive. Turkish and some related languages +have two types of C<"I"> characters. One is dotted in both upper- and +lowercase, and the other is dotless in both cases. Unicode allows a +locale to use either the Turkish rules, or the rules used in all other +instances, where there is only one type of C<"I">, which is dotless in +the uppercase, and dotted in the lower. The perl core does not (yet) +handle the Turkish case, and this message warns you of that. Instead, +the L<Unicode::Casing> module allows you to mostly implement the Turkish +casing rules. + +The other common cause is for the characters + + $ + < = > ^ ` | ~ + +These are problematic. The C standard says that these should be +considered punctuation in the C locale (and the POSIX standard defers to +the C standard), and Unicode is generally considered a superset of +the C locale. But Unicode has added an extra category, "Symbol", and +classifies these particular characters as being symbols. Most UTF-8 +locales have them treated as punctuation, so that L<ispunct(3)> returns +non-zero for them. But a few locales have it return 0. Perl takes +the first approach, not using C<ispunct()> at all (see L<Note [5] in +perlrecharclass|perlrecharclass/[5]>), and this message is raised to notify you that you +are getting Perl's approach, not the locale's. 
+ =item Locale '%s' may not work well.%s (W locale) You are using the named locale, which is a non-UTF-8 one, and @@ -3279,34 +3537,41 @@ Perhaps the function's author was trying to write a subroutine signature but didn't enable that feature first (C<use feature 'signatures'>), so the signature was instead interpreted as a bad prototype. -=item Malformed UTF-8 character (%s) +=item Malformed UTF-8 character%s -(S utf8)(F) Perl detected a string that didn't comply with UTF-8 -encoding rules, even though it had the UTF8 flag on. +(S utf8)(F) Perl detected a string that should be UTF-8, but didn't +comply with UTF-8 encoding rules, or represents a code point whose +ordinal integer value doesn't fit into the word size of the current +platform (overflows). Details as to the exact malformation are given in +the variable, C<%s>, part of the message. One possible cause is that you set the UTF8 flag yourself for data that -you thought to be in UTF-8 but it wasn't (it was for example legacy -8-bit data). To guard against this, you can use Encode::decode_utf8. +you thought to be in UTF-8 but it wasn't (it was for example legacy 8-bit +data). To guard against this, you can use C<Encode::decode('UTF-8', ...)>. If you use the C<:encoding(UTF-8)> PerlIO layer for input, invalid byte -sequences are handled gracefully, but if you use C<:utf8>, the flag is -set without validating the data, possibly resulting in this error -message. +sequences are handled gracefully, but if you use C<:utf8>, the flag is set +without validating the data, possibly resulting in this error message. See also L<Encode/"Handling Malformed Data">. -=item Malformed UTF-8 character immediately after '%s' - -(F) You said C<use utf8>, but the program file doesn't comply with UTF-8 -encoding rules. The message prints out the properly encoded characters -just before the first bad one. If C<utf8> warnings are enabled, a -warning is generated that gives more details about the type of -malformation. 
- =item Malformed UTF-8 returned by \N{%s} immediately after '%s' (F) The charnames handler returned malformed UTF-8. +=item Malformed UTF-8 string in "%s" + +(F) This message indicates a bug either in the Perl core or in XS +code. Such code was trying to find out if a character, allegedly +stored internally encoded as UTF-8, was of a given type, such as +being punctuation or a digit. But the character was not encoded +in legal UTF-8. The C<%s> is replaced by a string that can be used +by knowledgeable people to determine what the type being checked +against was. + +Passing malformed strings was deprecated in Perl 5.18, and +became fatal in Perl 5.26. + =item Malformed UTF-8 string in '%c' format in unpack (F) You tried to unpack something that didn't comply with UTF-8 encoding @@ -3341,7 +3606,7 @@ not be portable (S non_unicode) Perl allows strings to contain a superset of Unicode code points; each code point may be as large as what is storable -in an unsigned integer on your system, but these may not be accepted by +in a signed integer on your system, but these may not be accepted by other languages/systems. This message occurs when you matched a string containing such a code point against a regular expression pattern, and the code point was matched against a Unicode property, C<\p{...}> or @@ -3402,7 +3667,7 @@ doesn't resolve to a valid subroutine. See L<overload>. =item Method %s not permitted -See Server error. +See L</500 Server error>. =item Might be a runaway multi-line %s string starting on line %d @@ -3415,6 +3680,11 @@ ended earlier on the current line. (W syntax) An underscore (underbar) in a numeric constant did not separate two digits. +=item Missing argument for %n in %s + +(F) A C<%n> was used in a format string with no corresponding argument for +perl to write the current string length to. + =item Missing argument in %s (W missing) You called a function with fewer arguments than other @@ -3480,11 +3750,12 @@ can vary from one line to the next. 
(S syntax) This is an educated guess made in conjunction with the message "%s found where operator expected". Often the missing operator is a comma. -=item Missing or undefined argument to require +=item Missing or undefined argument to %s -(F) You tried to call require with no argument or with an undefined +(F) You tried to call require or do with no argument or with an undefined value as an argument. Require expects either a package name or a -file-specification as an argument. See L<perlfunc/require>. +file-specification as an argument; do expects a filename. See +L<perlfunc/require EXPR> and L<perlfunc/do EXPR>. =item Missing right brace on \%c{} in regex; marked by S<<-- HERE> in m/%s/ @@ -3609,6 +3880,15 @@ mutable before freeing the ops. (W syntax) Multidimensional arrays aren't written like C<$foo[1,2,3]>. They're written like C<$foo[1][2][3]>, as in C. +=item Multiple slurpy parameters not allowed + +(F) In subroutine signatures, a slurpy parameter (C<@> or C<%>) must be +the last parameter, and there must not be more than one of them; for +example: + + sub foo ($a, @b) {} # legal + sub foo ($a, @b, %) {} # invalid + =item '/' must follow a numeric type in unpack (F) You had an unpack template that contained a '/', but this did not @@ -3619,7 +3899,7 @@ See L<perlfunc/pack>. (F) Transliteration (C<tr///> and C<y///>) transliterates individual characters. But a named sequence by definition is more than an -individual charater, and hence doing this operation on it doesn't make +individual character, and hence doing this operation on it doesn't make sense. =item "my sub" not yet implemented @@ -3786,14 +4066,6 @@ setgid script to even be allowed to attempt. Generally speaking there will be another way to do what you want that is, if not secure, at least securable. See L<perlsec>. -=item NO-BREAK SPACE in a charnames alias definition is deprecated - -(D deprecated) You defined a character name which contained a no-break -space character. 
Change it to a regular space. Usually these names are -defined in the C<:alias> import argument to C<use charnames>, but they -could be defined by a translator installed into C<$^H{charnames}>. See -L<charnames/CUSTOM ALIASES>. - =item No code specified for -%c (F) Perl's B<-e> and B<-E> command-line options require an argument. If @@ -3926,11 +4198,13 @@ doesn't know where you wanted to redirect stdout. redirection, and found a '>' or a '>>' on the command line, but can't find the name of the file to which to write data destined for stdout. +=item No package name allowed for subroutine %s in "our" + =item No package name allowed for variable %s in "our" -(F) Fully qualified variable names are not allowed in "our" -declarations, because that doesn't make much sense under existing -rules. Such syntax is reserved for future extensions. +(F) Fully qualified subroutine and variable names are not allowed in "our" +declarations, because that doesn't make much sense under existing rules. +Such syntax is reserved for future extensions. =item No Perl script found in input @@ -3996,18 +4270,25 @@ kind of ref it really was. See L<perlref>. reference to something else instead. You can use the ref() function to find out what kind of ref it really was. See L<perlref>. +=item '#' not allowed immediately following a sigil in a subroutine signature + +(F) In a subroutine signature definition, a comment following a sigil +(C<$>, C<@> or C<%>), needs to be separated by whitespace or a comma etc., in +particular to avoid confusion with the C<$#> variable. For example: + + # bad + sub f ($# ignore first arg + , $b) {} + # good + sub f ($, # ignore first arg + $b) {} + =item Not an ARRAY reference (F) Perl was trying to evaluate a reference to an array value, but found a reference to something else instead. You can use the ref() function to find out what kind of ref it really was. See L<perlref>. 
-=item Not an unblessed ARRAY reference - -(F) You passed a reference to a blessed array to C<push>, C<shift> or -another array function. These only accept unblessed array references -or arrays beginning explicitly with C<@>. - =item Not a SCALAR reference (F) Perl was trying to evaluate a reference to a scalar value, but found @@ -4099,14 +4380,16 @@ the braces. (4294967295) and therefore non-portable between systems. See L<perlport> for more on portability concerns. -=item Odd name/value argument for subroutine +=item Odd name/value argument for subroutine '%s' (F) A subroutine using a slurpy hash parameter in its signature received an odd number of arguments to populate the hash. It requires the arguments to be paired, with the same number of keys as values. -The caller of the subroutine is presumably at fault. Inconveniently, -this error will be reported at the location of the subroutine, not that -of the caller. +The caller of the subroutine is presumably at fault. + +The message attempts to include the name of the called subroutine. If the +subroutine has been aliased, the subroutine's original name will be shown, +regardless of what name the caller used. =item Odd number of arguments for overload::constant @@ -4133,6 +4416,13 @@ C<sysread()>ing a file, or when seeking past the end of a scalar opened for I/O (in anticipation of future reads and to imitate the behavior with real files). +=item Old package separator used in string + +(W syntax) You used the old package separator, "'", in a variable +named inside a double-quoted string; e.g., C<"In $name's house">. This +is equivalent to C<"In $name::s house">. If you meant the former, put +a backslash before the apostrophe (C<"In $name\'s house">). + =item %s() on unopened %s (W unopened) An I/O operation was attempted on a filehandle that was @@ -4152,20 +4442,6 @@ that isn't open. Check your control flow. See also L<perlfunc/-X>. (S internal) An internal warning that the grammar is screwed up. 
-=item Opening dirhandle %s also as a file - -(D io, deprecated) You used open() to associate a filehandle to -a symbol (glob or scalar) that already holds a dirhandle. -Although legal, this idiom might render your code confusing -and is deprecated. - -=item Opening filehandle %s also as a directory - -(D io, deprecated) You used opendir() to associate a dirhandle to -a symbol (glob or scalar) that already holds a filehandle. -Although legal, this idiom might render your code confusing -and is deprecated. - =item Operand with no preceding operator in regex; marked by S<<-- HERE> in m/%s/ @@ -4227,7 +4503,7 @@ have a specific default. You probably want "$a = undef". =item "our" variable %s redeclared -(W misc) You seem to have already declared the same global once before +(W shadow) You seem to have already declared the same global once before in the current lexical scope. =item Out of memory! @@ -4349,10 +4625,6 @@ able to initialize properly. (P) Failed an internal consistency check trying to compile a grep. -=item panic: ck_split, type=%u - -(P) Failed an internal consistency check trying to compile a split. - =item panic: corrupt saved stack index %ld (P) The savestack was requested to restore more localized values than @@ -4479,10 +4751,6 @@ and freeing temporaries and lexicals from. (P) The internal pp_match() routine was called with invalid operational data. -=item panic: pp_split, pm=%p, s=%p - -(P) Something terrible went wrong in setting up for the split. - =item panic: realloc, %s (P) Something requested a negative number of bytes of realloc. @@ -4537,6 +4805,12 @@ was string. (P) The compiler is screwed up and attempted to use an op that isn't permitted at run time. +=item panic: unknown OA_*: %x + +(P) The internal routine that handles arguments to C<&CORE::foo()> +subroutine calls was unable to determine what type of arguments +were expected. 
+ =item panic: utf16_to_utf8: odd bytelen (P) Something tried to call utf16_to_utf8 with an odd (as opposed @@ -4568,17 +4842,6 @@ Remember that "my", "our", "local" and "state" bind tighter than comma. (F) Parsing code supplied by an extension violated the parser's API in a detectable way. -=item Passing malformed UTF-8 to "%s" is deprecated - -(D deprecated, utf8) This message indicates a bug either in the Perl -core or in XS code. Such code was trying to find out if a character, -allegedly stored internally encoded as UTF-8, was of a given type, such -as being punctuation or a digit. But the character was not encoded in -legal UTF-8. The C<%s> is replaced by a string that can be used by -knowledgeable people to determine what the type being checked against -was. If C<utf8> warnings are enabled, a further message is raised, -giving details of the malformation. - =item Pattern subroutine nesting without pos change exceeded limit in regex (F) You used a pattern that uses too many nested subpattern calls without @@ -4882,7 +5145,7 @@ of "||". =item Premature end of script headers -See Server error. +See L</500 Server error>. =item printf() on closed filehandle %s @@ -5077,6 +5340,11 @@ to use parens. In any case, a hash requires key/value B<pairs>. (W misc) You have attempted to weaken a reference that is already weak. Doing so has no effect. +=item Reference is not weak + +(W misc) You have attempted to unweaken a reference that is not weak. +Doing so has no effect. + =item Reference to invalid group 0 in regex; marked by S<<-- HERE> in m/%s/ (F) You used C<\g0> or similar in a regular expression. You may refer @@ -5171,6 +5439,11 @@ terminates. You might use ^# instead. See L<perlform>. search list. So the additional elements in the replacement list are meaningless. +=item '(*%s' requires a terminating ':' in regex; marked by <-- HERE in m/%s/ + +(F) You used a construct that needs a colon and pattern argument. 
+Supply these or check that you are using the right construct. + =item '%s' resolved to '\o{%s}%d' (W misc, regexp) You wrote something like C<\08>, or C<\179> in a @@ -5378,7 +5651,7 @@ in the regular expression the problem was discovered. (F) An C<(?R)> or C<(?0)> sequence in a regular expression was missing the final parenthesis. -=item Server error (a.k.a. "500 Server error") +=item Z<>500 Server error (A) This is the error message generally seen in a browser window when trying to run a CGI program (including SSI) over the web. The @@ -5438,26 +5711,20 @@ didn't think so. forget to check the return value of your socket() call? See L<perlfunc/setsockopt>. -=item Setting ${^ENCODING} is deprecated +=item Setting $/ to a reference to %s is forbidden -(D deprecated) You assigned a non-C<undef> value to C<${^ENCODING}>. -This is deprecated; see C<L<perlvar/${^ENCODING}>> for details. - -=item Setting $/ to a reference to %s as a form of slurp is deprecated, treating as undef - -(D deprecated) You assigned a reference to a scalar to C<$/> where the -referenced item is not a positive integer. In older perls this B<appeared> -to work the same as setting it to C<undef> but was in fact internally -different, less efficient and with very bad luck could have resulted in -your file being split by a stringified form of the reference. +(F) You assigned a reference to a scalar to C<$/> where the referenced item is +not a positive integer. In older perls this B<appeared> to work the same as +setting it to C<undef> but was in fact internally different, less efficient +and with very bad luck could have resulted in your file being split by a +stringified form of the reference. In Perl 5.20.0 this was changed so that it would be B<exactly> the same as -setting C<$/> to undef, with the exception that this warning would be -thrown. +setting C<$/> to undef, with the exception that this warning would be thrown. 
-You are recommended to change your code to set C<$/> to C<undef> explicitly -if you wish to slurp the file. In future versions of Perl assigning -a reference to will throw a fatal error. +You are recommended to change your code to set C<$/> to C<undef> explicitly if +you wish to slurp the file. As of Perl 5.28 assigning C<$/> to a reference +to an integer which isn't positive is a fatal error. =item Setting $/ to %s reference is forbidden @@ -5527,6 +5794,13 @@ Perl. Particularly, its current behavior is noticed for being unnecessarily complex and unintuitive, and is very likely to be overhauled. +=item Sorry, hash keys must be smaller than 2**31 bytes + +(F) You tried to create a hash containing a very large key, where "very +large" means that it needs at least 2 gigabytes to store. Unfortunately, +Perl doesn't yet handle such large hash keys. You should +reconsider your design to avoid hashing such a long string directly. + =item sort is now a reserved word (F) An ancient error message that almost nobody ever runs into anymore. @@ -5597,6 +5871,15 @@ model on-disk files and can only contain bytes. stubs. Stubs should never be implicitly created, but explicit calls to C<can> may break this. +=item Subroutine attributes must come before the signature + +(F) When subroutine signatures are enabled, any subroutine attributes must +come before the signature. Note that this order was the opposite in +versions 5.22..5.26. So: + + sub foo :lvalue ($a, $b) { ... } # 5.20 and 5.28 + + sub foo ($a, $b) :lvalue { ... } # 5.22 .. 5.26 + =item Subroutine "&%s" is not available (W closure) During compilation, an inner named subroutine or eval is @@ -5629,7 +5912,7 @@ being executed, so its &a is not available for capture. 
=item "%s" subroutine &%s masks earlier declaration in same %s -(W misc) A "my" or "state" subroutine has been redeclared in the +(W shadow) A "my" or "state" subroutine has been redeclared in the current scope or statement, effectively eliminating all access to the previous instance. This is almost always a typographical error. Note that the earlier subroutine will still exist until the end of @@ -5841,17 +6124,11 @@ as a compiler directive. You may say only one of This is to prevent the problem of one module changing the array base out from under another module inadvertently. See L<perlvar/$[> and L<arybase>. -=item The bitwise feature is experimental - -(S experimental::bitwise) This warning is emitted if you use bitwise -operators (C<& | ^ ~ &. |. ^. ~.>) with the "bitwise" feature enabled. -Simply suppress the warning if you want to use the feature, but know -that in doing so you are taking the risk of using an experimental -feature which may change or be removed in a future Perl version: +=item The alpha_assertions feature is experimental - no warnings "experimental::bitwise"; - use feature "bitwise"; - $x |.= $y; +(S experimental::alpha_assertions) This feature is experimental +and its behavior may change in any future release of perl. See +L<perlre/Extended Patterns>. =item The crypt() function is unimplemented due to excessive paranoia. @@ -5861,35 +6138,37 @@ think the U.S. Government thinks it's a secret, or at least that they will continue to pretend that it is. And if you quote me on that, I will deny it. -=item The %s function is unimplemented +=item The experimental declared_refs feature is not enabled -(F) The function indicated isn't implemented on this architecture, -according to the probings of Configure. 
+(F) To declare references to variables, as in C<my \%x>, you must first enable +the feature: -=item The lexical_subs feature is experimental + no warnings "experimental::declared_refs"; + use feature "declared_refs"; -(S experimental::lexical_subs) This warning is emitted if you -declare a sub with C<my> or C<state>. Simply suppress the warning -if you want to use the feature, but know that in doing so you -are taking the risk of using an experimental feature which may -change or be removed in a future Perl version: +=item The %s function is unimplemented - no warnings "experimental::lexical_subs"; - use feature "lexical_subs"; - my sub foo { ... } +(F) The function indicated isn't implemented on this architecture, +according to the probings of Configure. =item The regex_sets feature is experimental (S experimental::regex_sets) This warning is emitted if you use the syntax S<C<(?[ ])>> in a regular expression. The details of this feature are subject to change. -if you want to use it, but know that in doing so you +If you want to use it, but know that in doing so you are taking the risk of using an experimental feature which may change in a future Perl version, you can do this to silence the warning: no warnings "experimental::regex_sets"; +=item The script_run feature is experimental + +(S experimental::script_run) This feature is experimental +and its behavior may change in any future release of perl. See +L<perlre/Script Runs>. + =item The signatures feature is experimental (S experimental::signatures) This warning is emitted if you unwrap a @@ -5970,12 +6249,14 @@ See L<perlunicode/"User-Defined Character Properties">. (F) There has to be at least one argument to syscall() to specify the system call to call, silly dilly. -=item Too few arguments for subroutine +=item Too few arguments for subroutine '%s' -(F) A subroutine using a signature received fewer arguments than required -by the signature. The caller of the subroutine is presumably at fault.
-Inconveniently, this error will be reported at the location of the -subroutine, not that of the caller. +(F) A subroutine using a signature received fewer arguments than required by the +signature. The caller of the subroutine is presumably at fault. + +The message attempts to include the name of the called subroutine. If +the subroutine has been aliased, the subroutine's original name will be +shown, regardless of what name the caller used. =item Too late for "-%s" option @@ -6007,12 +6288,14 @@ BEGIN block. (F) The function requires fewer arguments than you specified. -=item Too many arguments for subroutine +=item Too many arguments for subroutine '%s' -(F) A subroutine using a signature received more arguments than required +(F) A subroutine using a signature received more arguments than permitted by the signature. The caller of the subroutine is presumably at fault. -Inconveniently, this error will be reported at the location of the -subroutine, not that of the caller. + +The message attempts to include the name of the called subroutine. If the +subroutine has been aliased, the subroutine's original name will be shown, +regardless of what name the caller used. =item Too many )'s @@ -6136,47 +6419,144 @@ C<undef *foo>. (A) You've accidentally run your script through B<csh> instead of Perl. Check the #! line, or manually feed your script into Perl yourself. -=item Unescaped left brace in regex is deprecated, passed through in regex; +=item Unescaped left brace in regex is deprecated here (and will be fatal in Perl 5.30), passed through in regex; marked by S<<-- HERE> in m/%s/ + +=item Unescaped left brace in regex is deprecated here (and will be fatal in Perl 5.32), passed through in regex; marked by S<<-- HERE> in m/%s/ + +(D deprecated, regexp) The simple rule to remember, if you want to +match a literal C<{> character (U+007B C<LEFT CURLY BRACKET>) in a +regular expression pattern, is to escape each literal instance of it in +some way.
Generally easiest is to precede it with a backslash, like +C<\{> or enclose it in square brackets (C<[{]>). If the pattern +delimiters are also braces, any matching right brace (C<}>) should +also be escaped to avoid confusing the parser, for example, + + qr{abc\{def\}ghi} + +Forcing literal C<{> characters to be escaped will enable the Perl +language to be extended in various ways in future releases. To avoid +needlessly breaking existing code, the restriction is not enforced in +contexts where there are unlikely to ever be extensions that could +conflict with the use there of C<{> as a literal. + +In this release of Perl, some literal uses of C<{> are fatal, and some +still just deprecated. This is because of an oversight: some uses of a +literal C<{> that should have raised a deprecation warning starting in +v5.20 did not warn until v5.26. By making the already-warned uses fatal +now, some of the planned extensions can be made to the language sooner. +The cases which are still allowed will be fatal in Perl 5.30 or 5.32. + +The contexts where no warnings or errors are raised are: + +=over 4 + +=item * + +as the first character in a pattern, or following C<^> indicating to +anchor the match to the beginning of a line. + +=item * + +as the first character following a C<|> indicating alternation. + +=item * + +as the first character in a parenthesized grouping like + + /foo({bar)/ + /foo(?:{bar)/ + +=item * + +as the first character following a quantifier + + /\s*{/ + +=back + +=for comment +The text of the message above is duplicated below to allow splain (and +'use diagnostics') to work. Since one is fatal, and one not, they can't +be combined as one message. And since the non-fatal one is temporary, +there's no real need to enhance perldiag to handle this transient case. + +=item Unescaped left brace in regex is illegal here in regex; marked by S<<-- HERE> in m/%s/ -(D deprecated, regexp) You used a literal C<"{"> character in a regular -expression pattern.
You should change to use C<"\{"> instead, because a -future version of Perl (tentatively v5.26) will consider this to be a -syntax error. If the pattern delimiters are also braces, any matching -right brace (C<"}">) should also be escaped to avoid confusing the parser, -for example, +(F) The simple rule to remember, if you want to +match a literal C<"{"> character (U+007B C<LEFT CURLY BRACKET>) in a +regular expression pattern, is to escape each literal instance of it in +some way. Generally easiest is to precede it with a backslash, like +C<"\{"> or enclose it in square brackets (C<"[{]">). If the pattern +delimiters are also braces, any matching right brace (C<"}">) should +also be escaped to avoid confusing the parser, for example, - qr{abc\{def\}ghi} + qr{abc\{def\}ghi} -=item unexec of %s into %s failed! +Forcing literal C<"{"> characters to be escaped will enable the Perl +language to be extended in various ways in future releases. To avoid +needlessly breaking existing code, the restriction is not enforced in +contexts where there are unlikely to ever be extensions that could +conflict with the use there of C<"{"> as a literal. -(F) The unexec() routine failed for some reason. See your local FSF -representative, who probably put it there in the first place. +In this release of Perl, some literal uses of C<"{"> are fatal, and some +still just deprecated. This is because of an oversight: some uses of a +literal C<"{"> that should have raised a deprecation warning starting in +v5.20 did not warn until v5.26. By making the already-warned uses fatal +now, some of the planned extensions can be made to the language sooner. -=item Unexpected ']' with no following ')' in (?[...
in regex; marked by <-- HERE in m/%s/ +The contexts where no warnings or errors are raised are: -(F) While parsing an extended character class a ']' character was encountered -at a point in the definition where the only legal use of ']' is to close the -character class definition as part of a '])', you may have forgotten the close -paren, or otherwise confused the parser. +=over 4 -=item Expecting close paren for nested extended charclass in regex; marked by <-- HERE in m/%s/ +=item * -(F) While parsing a nested extended character class like: +as the first character in a pattern, or following C<"^"> indicating to +anchor the match to the beginning of a line. - (?[ ... (?flags:(?[ ... ])) ... ]) - ^ +=item * -we expected to see a close paren ')' (marked by ^) but did not. +as the first character following a C<"|"> indicating alternation. -=item Expecting close paren for wrapper for nested extended charclass in regex; marked by <-- HERE in m/%s/ +=item * -(F) While parsing a nested extended character class like: +as the first character in a parenthesized grouping like - (?[ ... (?flags:(?[ ... ])) ... ]) - ^ + /foo({bar)/ + /foo(?:{bar)/ -we expected to see a close paren ')' (marked by ^) but did not. +=item * + +as the first character following a quantifier + + /\s*{/ + +=back + +=item Unescaped literal '%c' in regex; marked by <-- HERE in m/%s/ + +(W regexp) (only under C<S<use re 'strict'>>) + +Within the scope of C<S<use re 'strict'>> in a regular expression +pattern, you included an unescaped C<}> or C<]> which was interpreted +literally. These two characters are sometimes metacharacters, and +sometimes literals, depending on what precedes them in the +pattern. This is unlike the similar C<)> which is always a +metacharacter unless escaped. + +This action at a distance, perhaps a large distance, can lead to Perl +silently misinterpreting what you meant, so when you specify that you +want extra checking by C<S<use re 'strict'>>, this warning is generated. 
+If you meant the character as a literal, simply confirm that to Perl by +preceding the character with a backslash, or make it into a bracketed +character class (like C<[}]>). If you meant it as closing a +corresponding C<[> or C<{>, you'll need to look back through the pattern +to find out why that isn't happening. + +=item unexec of %s into %s failed! + +(F) The unexec() routine failed for some reason. See your local FSF +representative, who probably put it there in the first place. =item Unexpected binary operator '%c' with no preceding operand in regex; marked by S<<-- HERE> in m/%s/ @@ -6227,6 +6607,14 @@ The C<")"> is out-of-place. Something apparently was supposed to be combined with the digits, or the C<"+"> shouldn't be there, or something like that. Perl can't figure out what was intended. +=item Unexpected ']' with no following ')' in (?[... in regex; marked by +<-- HERE in m/%s/ + +(F) While parsing an extended character class a ']' character was +encountered at a point in the definition where the only legal use of +']' is to close the character class definition as part of a '])', you +may have forgotten the close paren, or otherwise confused the parser. + =item Unexpected '(' with no preceding operator in regex; marked by S<<-- HERE> in m/%s/ @@ -6263,12 +6651,6 @@ problems when being input or output, which is likely where this message came from. If you really really know what you are doing you can turn off this warning by C<no warnings 'surrogate';>. -=item Unknown charname '' is deprecated - -(D deprecated) You had a C<\N{}> with nothing between the braces. This -usage is deprecated, and will be made a syntax error in a future Perl -version. - =item Unknown charname '%s' (F) The name you used inside C<\N{}> is unknown to Perl. Check the @@ -6279,11 +6661,24 @@ exactly, regardless of whether C<:loose> is used or not.) This error may also happen if the C<\N{}> is not in the scope of the corresponding C<S<use charnames>>. 
+=item Unknown '(*...)' construct '%s' in regex; marked by <-- HERE in m/%s/ + +(F) The C<(*> was followed by something that the regular expression +compiler does not recognize. Check your spelling. + =item Unknown error (P) Perl was about to print an error message in C<$@>, but the C<$@> variable did not exist, even after an attempt to create it. +=item Unknown locale category %d; can't set it to %s + +(W locale) You used a locale category that perl doesn't recognize, so it +cannot carry out your request. Check that you are using a valid +category. If so, see L<perllocale/Multi-threaded> for advice on +reporting this as a bug, and for modifying perl locally to accommodate +your needs. + =item Unknown open() mode '%s' (F) The second argument of 3-argument open() is not among the list @@ -6306,7 +6701,7 @@ iterating over it, and someone else stuck a message in the stream of data Perl expected. Someone's very confused, or perhaps trying to subvert Perl's population of %ENV for nefarious purposes. -=item Unknown regex modifier "%s" +=item Unknown regexp modifier "/%s" (F) Alphanumerics immediately following the closing delimiter of a regular expression pattern are interpreted by Perl as modifier @@ -6331,15 +6726,21 @@ m/%s/ (F) The condition part of a (?(condition)if-clause|else-clause) construct is not known. The condition must be one of the following: - (1) (2) ... true if 1st, 2nd, etc., capture matched - (<NAME>) ('NAME') true if named capture matched - (?=...) (?<=...) true if subpattern matches - (?!...) (?<!...) true if subpattern fails to match - (?{ CODE }) true if code returns a true value - (R) true if evaluating inside recursion - (R1) (R2) ... true if directly inside capture group 1, 2, etc. - (R&NAME) true if directly inside named capture - (DEFINE) always false; for defining named subpatterns + (1) (2) ... true if 1st, 2nd, etc., capture matched + (<NAME>) ('NAME') true if named capture matched + (?=...) (?<=...) true if subpattern matches + (*pla:...) 
(*plb:...) true if subpattern matches; also + (*positive_lookahead:...) + (*positive_lookbehind:...) + (*nla:...) (*nlb:...) true if subpattern fails to match; also + (*negative_lookahead:...) + (*negative_lookbehind:...) + (?{ CODE }) true if code returns a true value + (R) true if evaluating inside recursion + (R1) (R2) ... true if directly inside capture group 1, 2, + etc. + (R&NAME) true if directly inside named capture + (DEFINE) always false; for defining named subpatterns The S<<-- HERE> shows whereabouts in the regular expression the problem was discovered. See L<perlre>. @@ -6482,6 +6883,11 @@ declares it to be in a Unicode encoding that Perl cannot read. (F) Your machine doesn't support the Berkeley socket mechanism, or at least that's what Configure thought. +=item Unterminated '(*...' argument in regex; marked by <-- HERE in m/%s/ + +(F) You used a pattern of the form C<(*...:...)> but did not terminate +the pattern with a C<)>. Fix the pattern and retry. + =item Unterminated attribute list (F) The lexer found something other than a simple identifier at the @@ -6502,6 +6908,11 @@ character to get your parentheses to balance. See L<attributes>. compressed integer format and could not be converted to an integer. See L<perlfunc/pack>. +=item Unterminated '(*...' construct in regex; marked by <-- HERE in m/%s/ + +(F) You used a pattern of the form C<(*...)> but did not terminate +the pattern with a C<)>. Fix the pattern and retry. + =item Unterminated delimiter for here document (F) This message occurs when a here document label has an initial @@ -6714,16 +7125,20 @@ you can write it as C<push(@tied_array,())> to avoid this warning. (F) The "use" keyword is recognized and executed at compile time, and returns no useful value. See L<perlmod>. -=item Use of assignment to $[ is deprecated +=item Use of assignment to $[ is deprecated, and will be fatal in 5.30 (D deprecated) The C<$[> variable (index of the first element in an array) -is deprecated. 
See L<perlvar/"$[">. -=item Use of bare << to mean <<"" is deprecated +=item Use of bare << to mean <<"" is forbidden -(D deprecated) You are now encouraged to use the explicitly quoted -form if you wish to use an empty line as the terminator of the -here-document. +(F) You are now required to use the explicitly quoted form if you wish +to use an empty line as the terminator of the here-document. + +Use of a bare terminator was deprecated in Perl 5.000, and is a fatal +error as of Perl 5.28. =item Use of /c modifier is meaningless in s/// @@ -6736,24 +7151,24 @@ modifier is not presently meaningful in substitutions. use the /g modifier. Currently, /c is meaningful only when /g is used. (This may change in the future.) -=item Use of code point 0x%s is deprecated; the permissible max is 0x%s +=item Use of code point 0x%s is not allowed; the permissible max is 0x%x + +=item Use of code point 0x%s is not allowed; the permissible max is 0x%x +in regex; marked by <-- HERE in m/%s/ -(D deprecated) You used a code point that will not be allowed in a -future perl version, because it is too large. Unicode only allows code -points up to 0x10FFFF, but Perl allows much larger ones. However, the -largest possible ones break the perl interpreter in some constructs, -including causing it to hang in a few cases. The known problem areas -are in C<tr///>, regular expression pattern matching using quantifiers, -and as the upper limits in loops. +(F) You used a code point that is not allowed, because it is too large. +Unicode only allows code points up to 0x10FFFF, but Perl allows much +larger ones. Earlier versions of Perl allowed code points above IV_MAX +(0x7FFFFFFF on 32-bit platforms, 0x7FFFFFFFFFFFFFFF on 64-bit platforms), +however, this could possibly break the perl interpreter in some constructs, +including causing it to hang in a few cases.
If your code is to run on various platforms, keep in mind that the upper limit depends on the platform. It is much larger on 64-bit word sizes than 32-bit ones. -=item Use of comma-less variable list is deprecated - -(D deprecated) The values you give to a format should be -separated by commas, not just aligned on a line. +The use of out of range code points was deprecated in Perl 5.24, and +became a fatal error in Perl 5.28. =item Use of each() on hash after insertion without resetting hash iterator results in undefined behavior @@ -6792,11 +7207,6 @@ For speed and efficiency reasons, Perl internally does not do full reference-counting of iterated items, hence deleting such an item in the middle of an iteration causes Perl to see a freed value. -=item Use of *glob{FILEHANDLE} is deprecated - -(D deprecated) You are now encouraged to use the shorter *glob{IO} form -to access the filehandle slot within a typeglob. - =item Use of /g modifier is meaningless in split (W regexp) You used the /g modifier on the pattern for a C<split> @@ -6808,73 +7218,34 @@ repeatedly, the C</g> has no effect. (D deprecated) Using C<goto> to jump from an outer scope into an inner scope is deprecated and should be avoided. -=item Use of inherited AUTOLOAD for non-method %s() is deprecated +This was deprecated in Perl 5.12. + +=item Use of '%s' in \p{} or \P{} is deprecated because: %s -(D deprecated) As an (ahem) accidental feature, C<AUTOLOAD> -subroutines are looked up as methods (using the C<@ISA> hierarchy) -even when the subroutines to be autoloaded were called as plain -functions (e.g. C<Foo::bar()>), not as methods (e.g. C<< Foo->bar() >> or -C<< $obj->bar() >>). +(D deprecated) Certain properties are deprecated by Unicode, and may +eventually be removed from the Standard, at which time Perl will follow +along. In the meantime, this message is raised to notify you. -This bug will be rectified in future by using method lookup only for -methods' C<AUTOLOAD>s. 
However, there is a significant base of existing -code that may be using the old behavior. So, as an interim step, Perl -currently issues an optional warning when non-methods use inherited -C<AUTOLOAD>s. +=item Use of inherited AUTOLOAD for non-method %s::%s() is no longer allowed -The simple rule is: Inheritance will not work when autoloading -non-methods. The simple fix for old code is: In any module that used -to depend on inheriting C<AUTOLOAD> for non-methods from a base class -named C<BaseClass>, execute C<*AUTOLOAD = \&BaseClass::AUTOLOAD> during -startup. +(F) As an accidental feature, C<AUTOLOAD> subroutines were looked up as +methods (using the C<@ISA> hierarchy), even when the subroutines to be +autoloaded were called as plain functions (e.g. C<Foo::bar()>), not as +methods (e.g. C<< Foo->bar() >> or C<< $obj->bar() >>). -In code that currently says C<use AutoLoader; @ISA = qw(AutoLoader);> -you should remove AutoLoader from @ISA and change C<use AutoLoader;> to -C<use AutoLoader 'AUTOLOAD';>. +This was deprecated in Perl 5.004, and was made fatal in Perl 5.28. =item Use of %s in printf format not supported (F) You attempted to use a feature of printf that is accessible from only C. This usually means there's a better way to do it in Perl. -=item Use of %s is deprecated - -(D deprecated) The construct indicated is no longer recommended for use, -generally because there's a better way to do it, and also because the -old way has bad side effects. - -=item Use of literal control characters in variable names is deprecated - -=item Use of literal non-graphic characters in variable names is deprecated - -(D deprecated) Using literal non-graphic (including control) -characters in the source to refer to the ^FOO variables, like C<$^X> and -C<${^GLOBAL_PHASE}> is now deprecated. (We use C<^X> and C<^G> here for -legibility. 
They actually represent the non-printable control -characters, code points 0x18 and 0x07, respectively; C<^A> would mean -the control character whose code point is 0x01.) This only affects -code like C<$\cT>, where C<\cT> is a control in the source code; C<${"\cT"}> and -C<$^T> remain valid. Things that are non-controls and also not graphic -are NO-BREAK SPACE and SOFT HYPHEN, which were previously only allowed -for historical reasons. - =item Use of -l on filehandle%s (W io) A filehandle represents an opened file, and when you opened the file it already went past any symlink you are presumably trying to look for. The operation returned C<undef>. Use a filename instead. -=item Use of %s on a handle without * is deprecated - -(D deprecated) You used C<tie>, C<tied> or C<untie> on a scalar but that scalar -happens to hold a typeglob, which means its filehandle will be tied. If -you mean to tie a handle, use an explicit * as in C<tie *$handle>. - -This was a long-standing bug that was removed in Perl 5.16, as there was -no way to tie the scalar itself when it held a typeglob, and no way to -untie a scalar that had had a typeglob assigned to it. If you see this -message, you must be using an older version. - =item Use of reference "%s" as array index (W misc) You tried to use a reference as an array index; this probably @@ -6886,19 +7257,23 @@ C<$array[0+$ref]>. This warning is not given for overloaded objects, however, because you can overload the numification and stringification operators and then you presumably know what you are doing. -=item Use of state $_ is experimental +=item Use of strings with code points over 0xFF as arguments to %s +operator is not allowed -(S experimental::lexical_topic) Lexical $_ is an experimental feature and -its behavior may change or even be removed in any future release of perl. -See the explanation under L<perlvar/$_>. 
+(F) You tried to use one of the string bitwise operators (C<&> or C<|> or C<^> or +C<~>) on a string containing a code point over 0xFF. The string bitwise +operators treat their operands as strings of bytes, and values beyond +0xFF are nonsensical in this context. -=item Use of strings with code points over 0xFF as arguments to %s -operator is deprecated +This became fatal in Perl 5.28. + +=item Use of strings with code points over 0xFF as arguments to C<vec> +is deprecated. This will be a fatal error in Perl 5.32 + +(D deprecated) You tried to use L<C<vec>|perlfunc/vec EXPR,OFFSET,BITS> +on a string containing a code point over 0xFF, which is nonsensical here. -(D deprecated) You tried to use one of the string bitwise operators -(C<&> or C<|> or C<^> or C<~>) on a string containing a code point over -0xFF. The string bitwise operators treat their operands as strings of -bytes, and values beyond 0xFF are nonsensical in this context. +Such usage will be a fatal error in Perl 5.32. =item Use of tainted arguments in %s is deprecated @@ -6907,6 +7282,31 @@ arguments and at least one of them is tainted. This used to be allowed but will become a fatal error in a future version of perl. Untaint your arguments. See L<perlsec>. +=item Use of unassigned code point or non-standalone grapheme for a +delimiter will be a fatal error starting in Perl 5.30 + +(D deprecated) +A grapheme is what appears to a native-speaker of a language to be a +character. In Unicode (and hence Perl) a grapheme may actually be +several adjacent characters that together form a complete grapheme. For +example, there can be a base character, like "R" and an accent, like a +circumflex "^", that appear when displayed to be a single character with +the circumflex hovering over the "R". Perl currently allows things like +that circumflex to be delimiters of strings, patterns, I<etc>. When +displayed, the circumflex would look like it belongs to the character +just to the left of it. 
In order to move the language to be able to +accept graphemes as delimiters, we have to deprecate the use of +delimiters which aren't graphemes by themselves. Also, a delimiter must +already be assigned (or known to be never going to be assigned) to try +to future-proof code, for otherwise code that works today would fail to +compile if the currently unassigned delimiter ends up being something +that isn't a stand-alone grapheme. Because Unicode is never going to +assign +L<non-character code points|perlunicode/Noncharacter code points>, nor +L<code points that are above the legal Unicode maximum| +perlunicode/Beyond Unicode code points>, those can be delimiters, and +their use won't raise this warning. + =item Use of uninitialized value%s (W uninitialized) An undefined value was used as if it were already @@ -6965,7 +7365,7 @@ of the returned sequence, which is not likely what you want. (W regexp) You used a Unicode boundary (C<\b{...}> or C<\B{...}>) in a portion of a regular expression where the character set modifiers C</a> or C</aa> are in effect. These two modifiers indicate an ASCII -interpretation, and this doesn't make sense for a Unicode defintion. +interpretation, and this doesn't make sense for a Unicode definition. The generated regular expression will compile so that the boundary uses all of Unicode. No other portion of the regular expression is affected. @@ -7046,22 +7446,50 @@ front of your variable. (F) Lookbehind is allowed only for subexpressions whose length is fixed and known at compile time. For positive lookbehind, you can use the C<\K> regex construct as a way to get the equivalent functionality. See -L<perlre/(?<=pattern) \K>. - -There are non-obvious Unicode rules under C</i> that can match variably, -but which you might not think could. For example, the substring C<"ss"> -can match the single character LATIN SMALL LETTER SHARP S. 
There are -other sequences of ASCII characters that can match single ligature -characters, such as LATIN SMALL LIGATURE FFI matching C<qr/ffi/i>. -Starting in Perl v5.16, if you only care about ASCII matches, adding the -C</aa> modifier to the regex will exclude all these non-obvious matches, -thus getting rid of this message. You can also say C<S<use re qw(/aa)>> +L<(?<=pattern) and \K in perlre|perlre/\K>. + +Starting in Perl 5.18, there are non-obvious Unicode rules under C</i> +that can match variably, but which you might not think could. For +example, the substring C<"ss"> can match the single character LATIN +SMALL LETTER SHARP S. Here's a complete list of the current ones +affecting ASCII characters: + + ASCII + sequence Matches single letter under /i + FF U+FB00 LATIN SMALL LIGATURE FF + FFI U+FB03 LATIN SMALL LIGATURE FFI + FFL U+FB04 LATIN SMALL LIGATURE FFL + FI U+FB01 LATIN SMALL LIGATURE FI + FL U+FB02 LATIN SMALL LIGATURE FL + SS U+00DF LATIN SMALL LETTER SHARP S + U+1E9E LATIN CAPITAL LETTER SHARP S + ST U+FB06 LATIN SMALL LIGATURE ST + U+FB05 LATIN SMALL LIGATURE LONG S T + +This list is subject to change, but is quite unlikely to. +Each ASCII sequence can be any combination of upper- and lowercase. + +You can avoid this by using a bracketed character class in the +lookbehind assertion, like + + (?<![sS]t) + (?<![fF]f[iI]) + +This fools Perl into not matching the ligatures. + +Another option for Perls starting with 5.16, if you only care about +ASCII matches, is to add the C</aa> modifier to the regex. This will +exclude all these non-obvious matches, thus getting rid of this message. +You can also say + + use if $] ge 5.016, re => '/aa'; + to apply C</aa> to all regular expressions compiled within its scope. See L<re>. 
=item "%s" variable %s masks earlier declaration in same %s -(W misc) A "my", "our" or "state" variable has been redeclared in the +(W shadow) A "my", "our" or "state" variable has been redeclared in the current scope or statement, effectively eliminating all access to the previous instance. This is almost always a typographical error. Note that the earlier variable will still exist until the end of the scope @@ -7107,6 +7535,12 @@ S<<-- HERE> in m/%s/ (F) You used a verb pattern that is not allowed an argument. Remove the argument or check that you are using the right verb. +=item Version control conflict marker + +(F) The parser found a line starting with C<E<lt><<<<<<>, +C<E<gt>E<gt>E<gt>E<gt>E<gt>E<gt>E<gt>>, or C<=======>. These may be left by a +version control system to mark conflicts after a failed merge operation. + =item Version number must be a constant number (P) The attempt to translate a C<use Module n.n LIST> statement into @@ -7143,8 +7577,8 @@ when its reference count reached zero while it was still open, e.g.: Because various errors may only be detected by close() (e.g. buffering could allow the C<print> in this example to return true even when the disk is full), -it is dangerous to ignore its result. So when it happens implicitly, perl will -signal errors by warning. +it is dangerous to ignore its result. So when it happens implicitly, perl +will signal errors by warning. B<Prior to version 5.22.0, perl ignored such errors>, so the common idiom shown above was liable to cause B<silent data loss>. @@ -7178,14 +7612,21 @@ under L<perlsyn/Experimental Details on given and when>. =item Wide character in %s -(S utf8) Perl met a wide character (>255) when it wasn't expecting -one. This warning is by default on for I/O (like print). The easiest -way to quiet this warning is simply to add the C<:utf8> layer to the -output, e.g. C<binmode STDOUT, ':utf8'>. 
Another way to turn off the -warning is to add C<no warnings 'utf8';> but that is often closer to +(S utf8) Perl met a wide character (ordinal >255) when it wasn't +expecting one. This warning is by default on for I/O (like print). + +If this warning does come from I/O, the easiest +way to quiet it is simply to add the C<:utf8> layer, I<e.g.>, +S<C<binmode STDOUT, ':utf8'>>. Another way to turn off the warning is +to add S<C<no warnings 'utf8';>> but that is often closer to cheating. In general, you are supposed to explicitly mark the filehandle with an encoding, see L<open> and L<perlfunc/binmode>. +If the warning comes from other than I/O, this diagnostic probably +indicates that incorrect results are being obtained. You should examine +your code to determine how a wide character is getting to an operation +that doesn't handle them. + =item Wide character (U+%X) in %s (W locale) While in a single-byte locale (I<i.e.>, a non-UTF-8 @@ -7242,8 +7683,10 @@ the end of the string being unpacked. See L<perlfunc/pack>. (F) And you probably never will, because you probably don't have the sources to your kernel, and your vendor probably doesn't give a rip -about what you want. Your best bet is to put a setuid C wrapper around -your script. +about what you want. There is a vulnerability anywhere that you have a +set-id script, and to close it you need to remove the set-id bit from +the script that you're attempting to run. To actually run the script +set-id, your best bet is to put a set-id C wrapper around your script. =item You need to quote "%s" diff --git a/gnu/usr.bin/perl/pod/perlebcdic.pod b/gnu/usr.bin/perl/pod/perlebcdic.pod index a53879c796d..188d01f13c6 100644 --- a/gnu/usr.bin/perl/pod/perlebcdic.pod +++ b/gnu/usr.bin/perl/pod/perlebcdic.pod @@ -1524,7 +1524,7 @@ some user education. This is completely general, but the most computationally expensive strategy. Choose one or the other character set and transform to that -for every sort comparision. 
Here's a complete example that transforms +for every sort comparison. Here's a complete example that transforms to ASCII sort order: sub native_to_uni($) { @@ -1752,7 +1752,7 @@ and vice versa. Internationalization (I18N) and localization (L10N) are supported at least in principle even on EBCDIC platforms. The details are system-dependent -and discussed under the L<OS ISSUES> section below. +and discussed under the L</OS ISSUES> section below. =head1 MULTI-OCTET CHARACTER SETS @@ -1855,9 +1855,6 @@ EBCDIC platforms. And some of the failures are real bugs. If you compile and do a C<make test> on Perl, all tests on the C</cpan> directory are skipped. -In particular, the (now deprecated) L<encoding> pragma is not supported -under EBCDIC. - L<Encode> partially works. =item * diff --git a/gnu/usr.bin/perl/pod/perlembed.pod b/gnu/usr.bin/perl/pod/perlembed.pod index 596f28781ab..d6391f7a26f 100644 --- a/gnu/usr.bin/perl/pod/perlembed.pod +++ b/gnu/usr.bin/perl/pod/perlembed.pod @@ -118,7 +118,7 @@ Execute this statement for a hint about where to find CORE: perl -MConfig -e 'print $Config{archlib}' Here's how you'd compile the example in the next section, -L<Adding a Perl interpreter to your C program>, on my Linux box: +L</Adding a Perl interpreter to your C program>, on my Linux box: % gcc -O2 -Dbool=char -DHAS_BOOL -I/usr/local/include -I/usr/local/lib/perl5/i586-linux/5.003/CORE @@ -128,7 +128,7 @@ L<Adding a Perl interpreter to your C program>, on my Linux box: (That's all one line.) 
On my DEC Alpha running old 5.003_05, the incantation is a bit different: - % cc -O2 -Olimit 2900 -DSTANDARD_C -I/usr/local/include + % cc -O2 -Olimit 2900 -I/usr/local/include -I/usr/local/lib/perl5/alpha-dec_osf/5.00305/CORE -L/usr/local/lib/perl5/alpha-dec_osf/5.00305/CORE -L/usr/local/lib -D__LANGUAGE_C__ -D_NO_PROTO -o interp interp.c -lperl -lm @@ -161,7 +161,7 @@ you: If the B<ExtUtils::Embed> module isn't part of your Perl distribution, you can retrieve it from -http://www.perl.com/perl/CPAN/modules/by-module/ExtUtils/ +L<http://www.perl.com/perl/CPAN/modules/by-module/ExtUtils/> (If this documentation came from your Perl distribution, then you're running 5.004 or better and you already have it.) @@ -192,6 +192,7 @@ version of I<miniperlmain.c> containing the essentials of embedding: perl_destruct(my_perl); perl_free(my_perl); PERL_SYS_TERM(); + exit(EXIT_SUCCESS); } Notice that we don't use the C<env> pointer. Normally handed to @@ -211,6 +212,9 @@ Also notice that no matter what arguments you pass to perl_parse(), PERL_SYS_INIT3() must be invoked on the C main() argc, argv and env and only once. +Mind that argv[argc] must be NULL, same as those passed to a main +function in C. + Now compile this program (I'll call it I<interp.c>) into an executable: % cc -o interp interp.c `perl -MExtUtils::Embed -e ccopts -e ldopts` @@ -264,6 +268,7 @@ That's shown below, in a program I'll call I<showtime.c>. perl_destruct(my_perl); perl_free(my_perl); PERL_SYS_TERM(); + exit(EXIT_SUCCESS); } where I<showtime> is a Perl subroutine that takes no arguments (that's the @@ -297,7 +302,7 @@ If you want to pass arguments to the Perl subroutine, you can add strings to the C<NULL>-terminated C<args> list passed to I<call_argv>. For other data types, or to examine return values, you'll need to manipulate the Perl stack. That's demonstrated in -L<Fiddling with the Perl stack from your C program>. +L</Fiddling with the Perl stack from your C program>. 
=head2 Evaluating a Perl statement from your C program @@ -322,7 +327,7 @@ the first, a C<float> from the second, and a C<char *> from the third. main (int argc, char **argv, char **env) { - char *embedding[] = { "", "-e", "0" }; + char *embedding[] = { "", "-e", "0", NULL }; PERL_SYS_INIT3(&argc,&argv,&env); my_perl = perl_alloc(); @@ -426,7 +431,7 @@ been wrapped here): PUTBACK; if (croak_on_error && SvTRUE(ERRSV)) - croak(SvPVx_nolen(ERRSV)); + croak_sv(ERRSV); return retval; } @@ -501,7 +506,7 @@ been wrapped here): main (int argc, char **argv, char **env) { - char *embedding[] = { "", "-e", "0" }; + char *embedding[] = { "", "-e", "0", NULL }; AV *match_list; I32 num_matches, i; SV *text; @@ -642,7 +647,7 @@ deep breath... int main (int argc, char **argv, char **env) { - char *my_argv[] = { "", "power.pl" }; + char *my_argv[] = { "", "power.pl", NULL }; PERL_SYS_INIT3(&argc,&argv,&env); my_perl = perl_alloc(); @@ -657,6 +662,7 @@ deep breath... perl_destruct(my_perl); perl_free(my_perl); PERL_SYS_TERM(); + exit(EXIT_SUCCESS); } @@ -791,25 +797,25 @@ with L<perlfunc/my> whenever possible. 
int main(int argc, char **argv, char **env) { - char *embedding[] = { "", "persistent.pl" }; + char *embedding[] = { "", "persistent.pl", NULL }; char *args[] = { "", DO_CLEAN, NULL }; char filename[BUFFER_SIZE]; - int exitstatus = 0; + int failing, exitstatus; PERL_SYS_INIT3(&argc,&argv,&env); if((my_perl = perl_alloc()) == NULL) { fprintf(stderr, "no memory!"); - exit(1); + exit(EXIT_FAILURE); } perl_construct(my_perl); PL_origalen = 1; /* don't let $0 assignment update the proctitle or embedding[0] */ - exitstatus = perl_parse(my_perl, NULL, 2, embedding, NULL); + failing = perl_parse(my_perl, NULL, 2, embedding, NULL); PL_exit_flags |= PERL_EXIT_DESTRUCT_END; - if(!exitstatus) { - exitstatus = perl_run(my_perl); - + if(!failing) + failing = perl_run(my_perl); + if(!failing) { while(printf("Enter file name: ") && fgets(filename, BUFFER_SIZE, stdin)) { @@ -827,7 +833,7 @@ with L<perlfunc/my> whenever possible. } PL_perl_destruct_level = 0; - perl_destruct(my_perl); + exitstatus = perl_destruct(my_perl); perl_free(my_perl); PERL_SYS_TERM(); exit(exitstatus); @@ -948,8 +954,8 @@ Let's give it a try: int main(int argc, char **argv, char **env) { PerlInterpreter *one_perl, *two_perl; - char *one_args[] = { "one_perl", SAY_HELLO }; - char *two_args[] = { "two_perl", SAY_HELLO }; + char *one_args[] = { "one_perl", SAY_HELLO, NULL }; + char *two_args[] = { "two_perl", SAY_HELLO, NULL }; PERL_SYS_INIT3(&argc,&argv,&env); one_perl = perl_alloc(); @@ -980,6 +986,7 @@ Let's give it a try: PERL_SET_CONTEXT(two_perl); perl_free(two_perl); PERL_SYS_TERM(); + exit(EXIT_SUCCESS); } Note the calls to PERL_SET_CONTEXT(). These are necessary to initialize @@ -1090,17 +1097,22 @@ to use the system's default locale. This is often, but not necessarily, the "C" or "POSIX" locale. Absent a S<C<"use locale">> within the perl code, this mostly has no effect (but see L<perllocale/Not within the scope of "use locale">). 
Also, there is not a problem if the -locale you want to use in your embedded Perl is the same as the system +locale you want to use in your embedded perl is the same as the system default. However, this doesn't work if you have set up and want to use a locale that isn't the system default one. Starting in Perl v5.20, you can tell the embedded Perl interpreter that the locale is already properly set up, and to skip doing its own normal initialization. It skips if the environment variable C<PERL_SKIP_LOCALE_INIT> is set (even -if set to 0 or C<"">). A Perl that has this capability will define the +if set to 0 or C<"">). A perl that has this capability will define the C pre-processor symbol C<HAS_SKIP_LOCALE_INIT>. This allows code that has to work with multiple Perl versions to do some sort of work-around when confronted with an earlier Perl. +If your program is using the POSIX 2008 multi-thread locale +functionality, you should switch into the global locale and set that up +properly before starting the Perl interpreter. It will then properly +switch back to using the thread-safe functions. + =head1 Hiding Perl_ If you completely hide the short forms of the Perl public API, @@ -1134,7 +1146,7 @@ Christiansen, Guy Decoux, Hallvard Furuseth, Dov Grobgeld, and Ilya Zakharevich. Doug MacEachern has an article on embedding in Volume 1, Issue 4 of -The Perl Journal ( http://www.tpj.com/ ). Doug is also the developer of the +The Perl Journal ( L<http://www.tpj.com/> ). Doug is also the developer of the most widely-used Perl embedding: the mod_perl system (perl.apache.org), which embeds Perl in the Apache web server. Oracle, Binary Evolution, ActiveState, and Ben Sugars's nsapi_perl diff --git a/gnu/usr.bin/perl/pod/perlexperiment.pod b/gnu/usr.bin/perl/pod/perlexperiment.pod index a3052df8cfe..7963c05ac73 100644 --- a/gnu/usr.bin/perl/pod/perlexperiment.pod +++ b/gnu/usr.bin/perl/pod/perlexperiment.pod @@ -16,15 +16,6 @@ their inception, versions, etc. 
There's a lot of speculation here. =over 8 -=item C<our> can now have an experimental optional attribute C<unique> - -Introduced in Perl 5.8.0 - -Deprecated in Perl 5.10.0 - -The ticket for this feature is -L<[perl #119313]|https://rt.perl.org/rt3/Ticket/Display.html?id=119313>. - =item Smart match (C<~~>) Introduced in Perl 5.10.0 @@ -46,18 +37,6 @@ See L<perlapi/PL_keyword_plugin> for the mechanism. Introduced in Perl 5.11.2 -=item Lexical subroutines - -Introduced in Perl 5.18 - -See also: L<perlsub/Lexical Subroutines> - -Using this feature triggers warnings in the category -C<experimental::lexical_subs>. - -The ticket for this feature is -L<[perl #120085]|https://rt.perl.org/rt3/Ticket/Display.html?id=120085>. - =item Regular Expression Set Operations Introduced in Perl 5.18 @@ -113,24 +92,24 @@ C<experimental::re_strict>. See L<re/'strict' mode> -=item String- and number-specific bitwise operators +=item The <:win32> IO pseudolayer -Introduced in Perl 5.22.0 +The ticket for this feature is +L<[perl #119453]|https://rt.perl.org/rt3/Ticket/Display.html?id=119453>. -See also: L<perlop/Bitwise String Operators> +See also L<perlrun> -Using this feature triggers warnings in the category -C<experimental::bitwise>. +=item Declaring a reference to a variable -The ticket for this feature is -L<[perl #123707]|https://rt.perl.org/rt3/Ticket/Display.html?id=123707>. +Introduced in Perl 5.26.0 -=item The <:win32> IO pseudolayer +Using this feature triggers warnings in the category +C<experimental::declared_refs>. The ticket for this feature is -L<[perl #119453]|https://rt.perl.org/rt3/Ticket/Display.html?id=119453>. +L<[perl #128654]|https://rt.perl.org/rt3/Ticket/Display.html?id=128654>. -See also L<perlrun> +See also: L<perlref/Declaring a Reference to a Variable> =item There is an C<installhtml> target in the Makefile. @@ -139,6 +118,24 @@ L<[perl #116487]|https://rt.perl.org/rt3/Ticket/Display.html?id=116487>. 
=item Unicode in Perl on EBCDIC +=item Script runs + +Introduced in Perl 5.28.0 + +Using this feature triggers warnings in the category +C<experimental::script_run>. + +See also: L<perlre/Script Runs> + +=item Alphabetic assertions + +Introduced in Perl 5.28.0 + +Using this feature triggers warnings in the category +C<experimental::alpha_assertions>. + +See also: L<perlre/Extended Patterns>. + =back =head2 Accepted features @@ -249,6 +246,18 @@ Introduced in Perl 5.20.0 Accepted in Perl 5.24.0 +=item Lexical subroutines + +Introduced in Perl 5.18.0 + +Accepted in Perl 5.26.0 + +=item String- and number-specific bitwise operators + +Introduced in Perl 5.22.0 + +Accepted in Perl 5.28.0 + =back =head2 Removed features @@ -323,6 +332,14 @@ Introduced in Perl 5.14.0 Removed in Perl 5.24.0 +=item C<our> can have an experimental optional attribute C<unique> + +Introduced in Perl 5.8.0 + +Deprecated in Perl 5.10.0 + +Removed in Perl 5.28.0 + =back =head1 SEE ALSO diff --git a/gnu/usr.bin/perl/pod/perlform.pod b/gnu/usr.bin/perl/pod/perlform.pod index 84b6fc59dfd..1a57b5fc107 100644 --- a/gnu/usr.bin/perl/pod/perlform.pod +++ b/gnu/usr.bin/perl/pod/perlform.pod @@ -169,7 +169,7 @@ characters B<without> an embedded "."), the character used for the decimal point is determined by the current LC_NUMERIC locale if C<use locale> is in effect. This means that, if, for example, the run-time environment happens to specify a German locale, "," will be used instead of the default ".". See -L<perllocale> and L<"WARNINGS"> for more information. +L<perllocale> and L</"WARNINGS"> for more information. =head2 Using Fill Mode diff --git a/gnu/usr.bin/perl/pod/perlfunc.pod b/gnu/usr.bin/perl/pod/perlfunc.pod index b10b6323600..b55068f1f97 100644 --- a/gnu/usr.bin/perl/pod/perlfunc.pod +++ b/gnu/usr.bin/perl/pod/perlfunc.pod @@ -104,7 +104,8 @@ X<function> Here are Perl's functions (including things that look like functions, like some keywords and named operators) arranged by category.
Some functions appear in more -than one place. +than one place. Any warnings, including those produced by +keywords, are described in L<perldiag> and L<warnings>. =over 4 @@ -199,7 +200,7 @@ L<C<flock>|/flock FILEHANDLE,OPERATION>, L<C<format>|/format>, L<C<getc>|/getc FILEHANDLE>, L<C<print>|/print FILEHANDLE LIST>, L<C<printf>|/printf FILEHANDLE FORMAT, LIST>, L<C<read>|/read FILEHANDLE,SCALAR,LENGTH,OFFSET>, -L<C<readdir>|/readdir DIRHANDLE>, L<C<readline>|/readline EXPR> +L<C<readdir>|/readdir DIRHANDLE>, L<C<readline>|/readline EXPR>, L<C<rewinddir>|/rewinddir DIRHANDLE>, L<C<say>|/say FILEHANDLE LIST>, L<C<seek>|/seek FILEHANDLE,POSITION,WHENCE>, L<C<seekdir>|/seekdir DIRHANDLE,POS>, @@ -241,7 +242,7 @@ L<C<chroot>|/chroot FILENAME>, L<C<fcntl>|/fcntl FILEHANDLE,FUNCTION,SCALAR>, L<C<glob>|/glob EXPR>, L<C<ioctl>|/ioctl FILEHANDLE,FUNCTION,SCALAR>, L<C<link>|/link OLDFILE,NEWFILE>, L<C<lstat>|/lstat FILEHANDLE>, -L<C<mkdir>|/mkdir FILENAME,MASK>, L<C<open>|/open FILEHANDLE,EXPR>, +L<C<mkdir>|/mkdir FILENAME,MODE>, L<C<open>|/open FILEHANDLE,EXPR>, L<C<opendir>|/opendir DIRHANDLE,EXPR>, L<C<readlink>|/readlink EXPR>, L<C<rename>|/rename OLDNAME,NEWNAME>, L<C<rmdir>|/rmdir FILENAME>, L<C<select>|/select FILEHANDLE>, L<C<stat>|/stat FILEHANDLE>, @@ -258,7 +259,7 @@ X<control flow> L<C<break>|/break>, L<C<caller>|/caller EXPR>, L<C<continue>|/continue BLOCK>, L<C<die>|/die LIST>, L<C<do>|/do BLOCK>, L<C<dump>|/dump LABEL>, L<C<eval>|/eval EXPR>, -L<C<evalbytes>|/evalbytes EXPR> L<C<exit>|/exit EXPR>, +L<C<evalbytes>|/evalbytes EXPR>, L<C<exit>|/exit EXPR>, L<C<__FILE__>|/__FILE__>, L<C<goto>|/goto LABEL>, L<C<last>|/last LABEL>, L<C<__LINE__>|/__LINE__>, L<C<next>|/next LABEL>, L<C<__PACKAGE__>|/__PACKAGE__>, @@ -893,7 +894,9 @@ X<bless> =for Pod::Functions create an object This function tells the thingy referenced by REF that it is now an object -in the CLASSNAME package. If CLASSNAME is omitted, the current package +in the CLASSNAME package. 
If CLASSNAME is an empty string, it is +interpreted as referring to the C<main> package. +If CLASSNAME is omitted, the current package is used. Because a L<C<bless>|/bless REF,CLASSNAME> is often the last thing in a constructor, it returns the reference for convenience. Always use the two-argument version if a derived class might inherit the @@ -903,8 +906,9 @@ method doing the blessing. See L<perlobj> for more about the blessing Consider always blessing objects in CLASSNAMEs that are mixed case. Namespaces with all lowercase names are considered reserved for Perl pragmas. Builtin types have all uppercase names. To prevent -confusion, you may wish to avoid such package names as well. Make sure -that CLASSNAME is a true value. +confusion, you may wish to avoid such package names as well. +It is advised to avoid the class name C<0>, because much code erroneously +uses the result of L<C<ref>|/ref EXPR> as a truth value. See L<perlmod/"Perl Modules">. @@ -1541,10 +1545,9 @@ makes it spring into existence the first time that it is called; see L<perlsub>. Use of L<C<defined>|/defined EXPR> on aggregates (hashes and arrays) is -deprecated. It -used to report whether memory for that aggregate had ever been -allocated. This behavior may disappear in future versions of Perl. -You should instead use a simple test for size: +no longer supported. It used to report whether memory for that +aggregate had ever been allocated. You should instead use a simple +test for size: if (@an_array) { print "has array elements\n" } if (%a_hash) { print "has hash members\n" } @@ -1592,10 +1595,13 @@ so that L<C<exists>|/exists EXPR> on that element no longer returns true. Setting a hash element to the undefined value does not remove its key, but deleting it does; see L<C<exists>|/exists EXPR>. -In list context, returns the value or values deleted, or the last such -element in scalar context. 
The return list's length always matches that of +In list context, usually returns the value or values deleted, or the last such +element in scalar context. The return list's length corresponds to that of the argument list: deleting non-existent elements returns the undefined value -in their corresponding positions. +in their corresponding positions. When a +L<keyE<sol>value hash slice|perldata/KeyE<sol>Value Hash Slices> is passed to +C<delete>, the return value is a list of key/value pairs (two elements for each +item deleted from the hash). L<C<delete>|/delete EXPR> may also be used on arrays and array slices, but its behavior is less straightforward. Although @@ -1670,22 +1676,27 @@ X<die> X<throw> X<exception> X<raise> X<$@> X<abort> =for Pod::Functions raise an exception or bail out -L<C<die>|/die LIST> raises an exception. Inside an -L<C<eval>|/eval EXPR> the error message is stuffed into -L<C<$@>|perlvar/$@> and the L<C<eval>|/eval EXPR> is terminated with the -undefined value. If the exception is outside of all enclosing -L<C<eval>|/eval EXPR>s, then the uncaught exception prints LIST to -C<STDERR> and exits with a non-zero value. If you need to exit the -process with a specific exit code, see L<C<exit>|/exit EXPR>. +L<C<die>|/die LIST> raises an exception. Inside an L<C<eval>|/eval EXPR> +the exception is stuffed into L<C<$@>|perlvar/$@> and the L<C<eval>|/eval +EXPR> is terminated with the undefined value. If the exception is +outside of all enclosing L<C<eval>|/eval EXPR>s, then the uncaught +exception is printed to C<STDERR> and perl exits with an exit code +indicating failure. If you need to exit the process with a specific +exit code, see L<C<exit>|/exit EXPR>. 
Equivalent examples: die "Can't cd to spool: $!\n" unless chdir '/usr/spool/news'; chdir '/usr/spool/news' or die "Can't cd to spool: $!\n" -If the last element of LIST does not end in a newline, the current -script line number and input line number (if any) are also printed, -and a newline is supplied. Note that the "input line number" (also +Most of the time, C<die> is called with a string to use as the exception. +You may either give a single non-reference operand to serve as the +exception, or a list of two or more items, which will be stringified +and concatenated to make the exception. + +If the string exception does not end in a newline, the current +script line number and input line number (if any) and a newline +are appended to it. Note that the "input line number" (also known as "chunk") is subject to whatever notion of "line" happens to be currently in effect, and is also available as the special variable L<C<$.>|perlvar/$.>. See L<perlvar/"$/"> and L<perlvar/"$.">. @@ -1702,49 +1713,45 @@ produce, respectively /etc/games is no good at canasta line 123. /etc/games is no good, stopped at canasta line 123. -If the output is empty and L<C<$@>|perlvar/$@> already contains a value -(typically from a previous L<C<eval>|/eval EXPR>) that value is reused after +If LIST was empty or made an empty string, and L<C<$@>|perlvar/$@> +already contains an exception value (typically from a previous +L<C<eval>|/eval EXPR>), then that value is reused after appending C<"\t...propagated">. This is useful for propagating exceptions: eval { ... }; die unless $@ =~ /Expected exception/; -If the output is empty and L<C<$@>|perlvar/$@> contains an object +If LIST was empty or made an empty string, +and L<C<$@>|perlvar/$@> contains an object reference that has a C<PROPAGATE> method, that method will be called with additional file and line number parameters. 
The return value replaces the value in L<C<$@>|perlvar/$@>; i.e., as if C<< $@ = eval { $@->PROPAGATE(__FILE__, __LINE__) }; >> were called. -If L<C<$@>|perlvar/$@> is empty, then the string C<"Died"> is used. - -If an uncaught exception results in interpreter exit, the exit code is -determined from the values of L<C<$!>|perlvar/$!> and -L<C<$?>|perlvar/$?> with this pseudocode: - - exit $! if $!; # errno - exit $? >> 8 if $? >> 8; # child exit status - exit 255; # last resort - -As with L<C<exit>|/exit EXPR>, L<C<$?>|perlvar/$?> is set prior to -unwinding the call stack; any C<DESTROY> or C<END> handlers can then -alter this value, and thus Perl's exit code. - -The intent is to squeeze as much possible information about the likely cause -into the limited space of the system exit code. However, as -L<C<$!>|perlvar/$!> is the value of C's C<errno>, which can be set by -any system call, this means that the value of the exit code used by -L<C<die>|/die LIST> can be non-predictable, so should not be relied -upon, other than to be non-zero. +If LIST was empty or made an empty string, and L<C<$@>|perlvar/$@> +is also empty, then the string C<"Died"> is used. You can also call L<C<die>|/die LIST> with a reference argument, and if this is trapped within an L<C<eval>|/eval EXPR>, L<C<$@>|perlvar/$@> contains that reference. This permits more elaborate exception handling using objects that maintain arbitrary state about the exception. Such a scheme is sometimes preferable to matching particular string values of -L<C<$@>|perlvar/$@> with regular expressions. Because -L<C<$@>|perlvar/$@> is a global variable and L<C<eval>|/eval EXPR> may -be used within object implementations, be careful that analyzing the -error object doesn't replace the reference in the global variable. It's +L<C<$@>|perlvar/$@> with regular expressions. + +Because Perl stringifies uncaught exception messages before display, +you'll probably want to overload stringification operations on +exception objects. 
See L<overload> for details about that. +The stringified message should be non-empty, and should end in a newline, +in order to fit in with the treatment of string exceptions. +Also, because an exception object reference cannot be stringified +without destroying it, Perl doesn't attempt to append location or other +information to a reference exception. If you want location information +with a complex exception object, you'll have to arrange to put the +location information into the object yourself. + +Because L<C<$@>|perlvar/$@> is a global variable, be careful that +analyzing an exception caught by C<eval> doesn't replace the reference +in the global variable. It's easiest to make a local copy of the reference before any manipulations. Here's an example: @@ -1761,14 +1768,30 @@ Here's an example: } } -Because Perl stringifies uncaught exception messages before display, -you'll probably want to overload stringification operations on -exception objects. See L<overload> for details about that. +If an uncaught exception results in interpreter exit, the exit code is +determined from the values of L<C<$!>|perlvar/$!> and +L<C<$?>|perlvar/$?> with this pseudocode: + + exit $! if $!; # errno + exit $? >> 8 if $? >> 8; # child exit status + exit 255; # last resort + +As with L<C<exit>|/exit EXPR>, L<C<$?>|perlvar/$?> is set prior to +unwinding the call stack; any C<DESTROY> or C<END> handlers can then +alter this value, and thus Perl's exit code. + +The intent is to squeeze as much possible information about the likely cause +into the limited space of the system exit code. However, as +L<C<$!>|perlvar/$!> is the value of C's C<errno>, which can be set by +any system call, this means that the value of the exit code used by +L<C<die>|/die LIST> can be non-predictable, so should not be relied +upon, other than to be non-zero. You can arrange for a callback to be run just before the L<C<die>|/die LIST> does its deed, by setting the L<C<$SIG{__DIE__}>|perlvar/%SIG> hook. 
The associated handler is called -with the error text and can change the error message, if it sees fit, by +with the exception as an argument, and can change the exception, +if it sees fit, by calling L<C<die>|/die LIST> again. See L<perlvar/%SIG> for details on setting L<C<%SIG>|perlvar/%SIG> entries, and L<C<eval>|/eval EXPR> for some examples. Although this feature was to be run only right before your @@ -1806,22 +1829,42 @@ See L<perlsyn> for alternative strategies. X<do> Uses the value of EXPR as a filename and executes the contents of the -file as a Perl script. +file as a Perl script: + + # load the exact specified file (./ and ../ special-cased) + do '/foo/stat.pl'; + do './stat.pl'; + do '../foo/stat.pl'; + # search for the named file within @INC do 'stat.pl'; + do 'foo/stat.pl'; -is largely like +C<do './stat.pl'> is largely like eval `cat stat.pl`; -except that it's more concise, runs no external processes, keeps track of -the current filename for error messages, searches the -L<C<@INC>|perlvar/@INC> directories, and updates L<C<%INC>|perlvar/%INC> -if the file is found. See L<perlvar/@INC> and L<perlvar/%INC> for these -variables. It also differs in that code evaluated with C<do FILE> -cannot see lexicals in the enclosing scope; C<eval STRING> does. It's -the same, however, in that it does reparse the file every time you call -it, so you probably don't want to do this inside a loop. +except that it's more concise, runs no external processes, and keeps +track of the current filename for error messages. It also differs in that +code evaluated with C<do FILE> cannot see lexicals in the enclosing +scope; C<eval STRING> does. It's the same, however, in that it does +reparse the file every time you call it, so you probably don't want +to do this inside a loop. 
+ +Using C<do> with a relative path (except for F<./> and F<../>), like + + do 'foo/stat.pl'; + +will search the L<C<@INC>|perlvar/@INC> directories, and update +L<C<%INC>|perlvar/%INC> if the file is found. See L<perlvar/@INC> +and L<perlvar/%INC> for these variables. In particular, note that +whilst historically L<C<@INC>|perlvar/@INC> contained '.' (the +current directory) making these two cases equivalent, that is no +longer necessarily the case, as '.' is not included in C<@INC> by default +in perl versions 5.26.0 onwards. Instead, perl will now warn: + + do "stat.pl" failed, '.' is no longer in @INC; + did you mean do "./stat.pl"? If L<C<do>|/do EXPR> can read the file but cannot compile it, it returns L<C<undef>|/undef EXPR> and sets an error message in @@ -1839,7 +1882,8 @@ if there's a problem. You might like to use L<C<do>|/do EXPR> to read in a program configuration file. Manual error checking can be done this way: - # read in config files: system first, then user + # Read in config files: system first, then user. + # Beware of using relative pathnames here. for $file ("/share/prog/defaults.rc", "$ENV{HOME}/.someprogrc") { @@ -1925,7 +1969,8 @@ its own internal iterator, accessed by L<C<each>|/each HASH>, L<C<keys>|/keys HASH>, and L<C<values>|/values HASH>. The iterator is implicitly reset when L<C<each>|/each HASH> has reached the end as just described; it can be explicitly reset by calling L<C<keys>|/keys HASH> -or L<C<values>|/values HASH> on the hash or array. If you add or delete +or L<C<values>|/values HASH> on the hash or array, or by referencing +the hash (but not array) in list context. If you add or delete a hash's elements while iterating over it, the effect on the iterator is unspecified; for example, entries may be skipped or duplicated--so don't do that. 
Exception: It is always safe to delete the item most recently @@ -1939,6 +1984,21 @@ returned by L<C<each>|/each HASH>, so the following code works properly: Tied hashes may have a different ordering behaviour to perl's hash implementation. +The iterator used by C<each> is attached to the hash or array, and is +shared between all iteration operations applied to the same hash or array. +Thus all uses of C<each> on a single hash or array advance the same +iterator location. All uses of C<each> are also subject to having the +iterator reset by any use of C<keys> or C<values> on the same hash or +array, or by the hash (but not array) being referenced in list context. +This makes C<each>-based loops quite fragile: it is easy to arrive at +such a loop with the iterator already part way through the object, or to +accidentally clobber the iterator state during execution of the loop body. +It's easy enough to explicitly reset the iterator before starting a loop, +but there is no way to insulate the iterator state used by a loop from +the iterator state used by anything else that might execute during the +loop body. To avoid these problems, use a C<foreach> loop rather than +C<while>-C<each>. + This prints out your environment like the L<printenv(1)> program, but in a different order: @@ -1952,6 +2012,10 @@ been deemed unsuccessful, and was removed as of Perl 5.24. As of Perl 5.18 you can use a bare L<C<each>|/each HASH> in a C<while> loop, which will set L<C<$_>|perlvar/$_> on every iteration. +If either an C<each> expression or an explicit assignment of an C<each> +expression to a scalar is used as a C<while>/C<for> condition, then +the condition actually tests for definedness of the expression's value, +not for its regular truth value. 
while (each %ENV) { print "$_=$ENV{$_}\n"; @@ -2036,86 +2100,187 @@ X<error, handling> X<exception, handling> =for Pod::Functions catch exceptions or compile and run code -In the first form, often referred to as a "string eval", the return -value of EXPR is parsed and executed as if it -were a little Perl program. The value of the expression (which is itself -determined within scalar context) is first parsed, and if there were no -errors, executed as a block within the lexical context of the current Perl -program. This means, that in particular, any outer lexical variables are -visible to it, and any package variable settings or subroutine and format -definitions remain afterwards. - -Note that the value is parsed every time the L<C<eval>|/eval EXPR> -executes. If EXPR is omitted, evaluates L<C<$_>|perlvar/$_>. This form -is typically used to delay parsing and subsequent execution of the text -of EXPR until run time. - -If the -L<C<"unicode_eval"> feature|feature/The 'unicode_eval' and 'evalbytes' features> -is enabled (which is the default under a -C<use 5.16> or higher declaration), EXPR or L<C<$_>|perlvar/$_> is -treated as a string of characters, so L<C<use utf8>|utf8> declarations -have no effect, and source filters are forbidden. In the absence of the -L<C<"unicode_eval"> feature|feature/The 'unicode_eval' and 'evalbytes' features>, -will sometimes be treated as characters and sometimes as bytes, -depending on the internal encoding, and source filters activated within -the L<C<eval>|/eval EXPR> exhibit the erratic, but historical, behaviour -of affecting some outer file scope that is still compiling. See also -the L<C<evalbytes>|/evalbytes EXPR> operator, which always treats its -input as a byte stream and works properly with source filters, and the -L<feature> pragma. - -Problems can arise if the string expands a scalar containing a floating -point number. 
That scalar can expand to letters, such as C<"NaN"> or -C<"Infinity">; or, within the scope of a L<C<use locale>|locale>, the -decimal point character may be something other than a dot (such as a -comma). None of these are likely to parse as you are likely expecting. - -In the second form, the code within the BLOCK is parsed only once--at the -same time the code surrounding the L<C<eval>|/eval EXPR> itself was -parsed--and executed +C<eval> in all its forms is used to execute a little Perl program, +trapping any errors encountered so they don't crash the calling program. + +Plain C<eval> with no argument is just C<eval EXPR>, where the +expression is understood to be contained in L<C<$_>|perlvar/$_>. Thus +there are only two real C<eval> forms; the one with an EXPR is often +called "string eval". In a string eval, the value of the expression +(which is itself determined within scalar context) is first parsed, and +if there were no errors, executed as a block within the lexical context +of the current Perl program. This form is typically used to delay +parsing and subsequent execution of the text of EXPR until run time. +Note that the value is parsed every time the C<eval> executes. + +The other form is called "block eval". It is less general than string +eval, but the code within the BLOCK is parsed only once (at the same +time the code surrounding the C<eval> itself was parsed) and executed within the context of the current Perl program. This form is typically -used to trap exceptions more efficiently than the first (see below), while -also providing the benefit of checking the code within BLOCK at compile -time. - -The final semicolon, if any, may be omitted from the value of EXPR or within -the BLOCK. +used to trap exceptions more efficiently than the first, while also +providing the benefit of checking the code within BLOCK at compile time. +BLOCK is parsed and compiled just once. 
Since errors are trapped, it +often is used to check if a given feature is available. In both forms, the value returned is the value of the last expression -evaluated inside the mini-program; a return statement may be also used, just +evaluated inside the mini-program; a return statement may also be used, just as with subroutines. The expression providing the return value is evaluated in void, scalar, or list context, depending on the context of the -L<C<eval>|/eval EXPR> itself. See L<C<wantarray>|/wantarray> for more +C<eval> itself. See L<C<wantarray>|/wantarray> for more on how the evaluation context can be determined. If there is a syntax error or runtime error, or a L<C<die>|/die LIST> -statement is executed, L<C<eval>|/eval EXPR> returns -L<C<undef>|/undef EXPR> in scalar context or an empty list in list +statement is executed, C<eval> returns +L<C<undef>|/undef EXPR> in scalar context, or an empty list in list context, and L<C<$@>|perlvar/$@> is set to the error message. (Prior to 5.16, a bug caused L<C<undef>|/undef EXPR> to be returned in list context for syntax errors, but not for runtime errors.) If there was no error, L<C<$@>|perlvar/$@> is set to the empty string. A control flow operator like L<C<last>|/last LABEL> or L<C<goto>|/goto LABEL> can bypass the setting of L<C<$@>|perlvar/$@>. Beware that using -L<C<eval>|/eval EXPR> neither silences Perl from printing warnings to +C<eval> neither silences Perl from printing warnings to STDERR, nor does it stuff the text of warning messages into L<C<$@>|perlvar/$@>. To do either of those, you have to use the L<C<$SIG{__WARN__}>|perlvar/%SIG> facility, or turn off warnings inside the BLOCK or EXPR using S<C<no warnings 'all'>>. See L<C<warn>|/warn LIST>, L<perlvar>, and L<warnings>. 
-Note that, because L<C<eval>|/eval EXPR> traps otherwise-fatal errors, +Note that, because C<eval> traps otherwise-fatal errors, it is useful for determining whether a particular feature (such as L<C<socket>|/socket SOCKET,DOMAIN,TYPE,PROTOCOL> or L<C<symlink>|/symlink OLDFILE,NEWFILE>) is implemented. It is also Perl's exception-trapping mechanism, where the L<C<die>|/die LIST> operator is used to raise exceptions. -If you want to trap errors when loading an XS module, some problems with -the binary interface (such as Perl version skew) may be fatal even with -L<C<eval>|/eval EXPR> unless C<$ENV{PERL_DL_NONLAZY}> is set. See -L<perlrun>. +Before Perl 5.14, the assignment to L<C<$@>|perlvar/$@> occurred before +restoration +of localized variables, which means that for your code to run on older +versions, a temporary is required if you want to mask some, but not all +errors: + + # alter $@ on nefarious repugnancy only + { + my $e; + { + local $@; # protect existing $@ + eval { test_repugnancy() }; + # $@ =~ /nefarious/ and die $@; # Perl 5.14 and higher only + $@ =~ /nefarious/ and $e = $@; + } + die $e if defined $e + } + +There are some different considerations for each form: + +=over 4 + +=item String eval + +Since the return value of EXPR is executed as a block within the lexical +context of the current Perl program, any outer lexical variables are +visible to it, and any package variable settings or subroutine and +format definitions remain afterwards. + +=over 4 + +=item Under the L<C<"unicode_eval"> feature|feature/The 'unicode_eval' and 'evalbytes' features> + +If this feature is enabled (which is the default under a C<use 5.16> or +higher declaration), EXPR is considered to be +in the same encoding as the surrounding program. Thus if +S<L<C<use utf8>|utf8>> is in effect, the string will be treated as being +UTF-8 encoded. Otherwise, the string is considered to be a sequence of +independent bytes. 
Bytes that correspond to ASCII-range code points +will have their normal meanings for operators in the string. The +treatment of the other bytes depends on whether the +L<C<"unicode_strings"> feature|feature/The 'unicode_strings' feature> is +in effect. + +In a plain C<eval> without an EXPR argument, being in S<C<use utf8>> or +not is irrelevant; the UTF-8ness of C<$_> itself determines the +behavior. + +Any S<C<use utf8>> or S<C<no utf8>> declarations within the string have +no effect, and source filters are forbidden. (C<unicode_strings>, +however, can appear within the string.) See also the +L<C<evalbytes>|/evalbytes EXPR> operator, which works properly with +source filters. + +Variables defined outside the C<eval> and used inside it retain their +original UTF-8ness. Everything inside the string follows the normal +rules for a Perl program with the given state of S<C<use utf8>>. + +=item Outside the C<"unicode_eval"> feature + +In this case, the behavior is problematic and is not so easily +described. Here are two bugs that cannot easily be fixed without +breaking existing programs: + +=over 4 + +=item * + +It can lose track of whether something should be encoded as UTF-8 or +not. + +=item * + +Source filters activated within C<eval> leak out into whichever file +scope is currently being compiled. To give an example with the CPAN module +L<Semi::Semicolons>: + + BEGIN { eval "use Semi::Semicolons; # not filtered" } + # filtered here! + +L<C<evalbytes>|/evalbytes EXPR> fixes that to work the way one would +expect: + + use feature "evalbytes"; + BEGIN { evalbytes "use Semi::Semicolons; # filtered" } + # not filtered + +=back + +=back + +Problems can arise if the string expands a scalar containing a floating +point number. That scalar can expand to letters, such as C<"NaN"> or +C<"Infinity">; or, within the scope of a L<C<use locale>|locale>, the +decimal point character may be something other than a dot (such as a +comma).
None of these are likely to parse as you are likely expecting. + +You should be especially careful to remember what's being looked at +when: + + eval $x; # CASE 1 + eval "$x"; # CASE 2 + + eval '$x'; # CASE 3 + eval { $x }; # CASE 4 + + eval "\$$x++"; # CASE 5 + $$x++; # CASE 6 + +Cases 1 and 2 above behave identically: they run the code contained in +the variable $x. (Although case 2 has misleading double quotes making +the reader wonder what else might be happening (nothing is).) Cases 3 +and 4 likewise behave in the same way: they run the code C<'$x'>, which +does nothing but return the value of $x. (Case 4 is preferred for +purely visual reasons, but it also has the advantage of compiling at +compile-time instead of at run-time.) Case 5 is a place where +normally you I<would> like to use double quotes, except that in this +particular situation, you can just use symbolic references instead, as +in case 6. + +An C<eval ''> executed within a subroutine defined +in the C<DB> package doesn't see the usual +surrounding lexical scope, but rather the scope of the first non-DB piece +of code that called it. You don't normally need to worry about this unless +you are writing a Perl debugger. + +The final semicolon, if any, may be omitted from the value of EXPR. + +=item Block eval If the code to be executed doesn't vary, you may use the eval-BLOCK form to trap run-time errors without incurring the penalty of @@ -2135,6 +2300,11 @@ Examples: # a run-time error eval '$answer ='; # sets $@ +If you want to trap errors when loading an XS module, some problems with +the binary interface (such as Perl version skew) may be fatal even with +C<eval> unless C<$ENV{PERL_DL_NONLAZY}> is set. See +L<perlrun>. + Using the C<eval {}> form as an exception trap in libraries does have some issues. Due to the current arguably broken state of C<__DIE__> hooks, you may wish not to trigger any C<__DIE__> hooks that user code may have installed. 
@@ -2160,56 +2330,13 @@ messages: Because this promotes action at a distance, this counterintuitive behavior may be fixed in a future release. -With an L<C<eval>|/eval EXPR>, you should be especially careful to -remember what's being looked at when: - - eval $x; # CASE 1 - eval "$x"; # CASE 2 - - eval '$x'; # CASE 3 - eval { $x }; # CASE 4 - - eval "\$$x++"; # CASE 5 - $$x++; # CASE 6 - -Cases 1 and 2 above behave identically: they run the code contained in -the variable $x. (Although case 2 has misleading double quotes making -the reader wonder what else might be happening (nothing is).) Cases 3 -and 4 likewise behave in the same way: they run the code C<'$x'>, which -does nothing but return the value of $x. (Case 4 is preferred for -purely visual reasons, but it also has the advantage of compiling at -compile-time instead of at run-time.) Case 5 is a place where -normally you I<would> like to use double quotes, except that in this -particular situation, you can just use symbolic references instead, as -in case 6. - -Before Perl 5.14, the assignment to L<C<$@>|perlvar/$@> occurred before -restoration -of localized variables, which means that for your code to run on older -versions, a temporary is required if you want to mask some but not all -errors: - - # alter $@ on nefarious repugnancy only - { - my $e; - { - local $@; # protect existing $@ - eval { test_repugnancy() }; - # $@ =~ /nefarious/ and die $@; # Perl 5.14 and higher only - $@ =~ /nefarious/ and $e = $@; - } - die $e if defined $e - } - C<eval BLOCK> does I<not> count as a loop, so the loop control statements L<C<next>|/next LABEL>, L<C<last>|/last LABEL>, or L<C<redo>|/redo LABEL> cannot be used to leave or restart the block. -An C<eval ''> executed within a subroutine defined -in the C<DB> package doesn't see the usual -surrounding lexical scope, but rather the scope of the first non-DB piece -of code that called it. You don't normally need to worry about this unless -you are writing a Perl debugger. 
+The final semicolon, if any, may be omitted from within the BLOCK. + +=back =item evalbytes EXPR X<evalbytes> @@ -2218,18 +2345,42 @@ X<evalbytes> =for Pod::Functions +evalbytes similar to string eval, but intend to parse a bytestream -This function is like L<C<eval>|/eval EXPR> with a string argument, -except it always parses its argument, or L<C<$_>|perlvar/$_> if EXPR is -omitted, as a string of bytes. A string containing characters whose -ordinal value exceeds 255 results in an error. Source filters activated -within the evaluated code apply to the code itself. +This function is similar to a L<string eval|/eval EXPR>, except it +always parses its argument (or L<C<$_>|perlvar/$_> if EXPR is omitted) +as a string of independent bytes. -L<C<evalbytes>|/evalbytes EXPR> is available only if the -L<C<"evalbytes"> feature|feature/The 'unicode_eval' and 'evalbytes' features> -is enabled or if it is prefixed with C<CORE::>. The +If called when S<C<use utf8>> is in effect, the string will be assumed +to be encoded in UTF-8, and C<evalbytes> will make a temporary copy to +work from, downgraded to non-UTF-8. If this is not possible +(because one or more characters in it require UTF-8), the C<evalbytes> +will fail with the error stored in C<$@>. + +Bytes that correspond to ASCII-range code points will have their normal +meanings for operators in the string. The treatment of the other bytes +depends on whether the L<C<"unicode_strings"> feature|feature/The +'unicode_strings' feature> is in effect. + +Of course, variables that are UTF-8 and are referred to in the string +retain that: + + my $a = "\x{100}"; + evalbytes 'print ord $a, "\n"'; + +prints + + 256 + +and C<$@> is empty. + +Source filters activated within the evaluated code apply to the code +itself. + +L<C<evalbytes>|/evalbytes EXPR> is available starting in Perl v5.16.
To +access it, you must say C<CORE::evalbytes>, but you can omit the +C<CORE::> if the L<C<"evalbytes"> feature|feature/The 'unicode_eval' and 'evalbytes' features> -is enabled automatically with a C<use v5.16> (or higher) declaration in -the current scope. +is enabled. This is enabled automatically with a C<use v5.16> (or +higher) declaration in the current scope. =item exec LIST X<exec> X<execute> @@ -2379,10 +2530,6 @@ This happens anywhere the arrow operator is used, including even here: if (exists $ref->{"Some key"}) { } print $ref; # prints HASH(0x80d3d5c) -This surprising autovivification in what does not at first--or even -second--glance appear to be an lvalue context may be fixed in a future -release. - Use of a subroutine call, rather than a subroutine name, as an argument to L<C<exists>|/exists EXPR> is an error. @@ -2554,9 +2701,12 @@ A special token that returns the name of the file in which it occurs. =item fileno FILEHANDLE X<fileno> +=item fileno DIRHANDLE + =for Pod::Functions return file descriptor from filehandle -Returns the file descriptor for a filehandle, or undefined if the +Returns the file descriptor for a filehandle or directory handle, +or undefined if the filehandle is not open. If there is no real file descriptor at the OS level, as can happen with filehandles connected to memory objects via L<C<open>|/open FILEHANDLE,EXPR> with a reference for the third @@ -2653,8 +2803,8 @@ Here's a mailbox appender for BSD systems. sub lock { my ($fh) = @_; flock($fh, LOCK_EX) or die "Cannot lock mailbox - $!\n"; - - # and, in case someone appended while we were waiting... + # and, in case we're running on a very old UNIX + # variant without the modern O_APPEND semantics... seek($fh, 0, SEEK_END) or die "Cannot seek - $!\n"; } @@ -2879,6 +3029,9 @@ Returns the current priority for a process, a process group, or a user. (See L<getpriority(2)>.) Will raise a fatal exception if used on a machine that doesn't implement L<getpriority(2)>. 
+C<WHICH> can be any of C<PRIO_PROCESS>, C<PRIO_PGRP> or C<PRIO_USER> +imported from L<POSIX/RESOURCE CONSTANTS>. + Portability issues: L<perlport/getpriority>. =item getpwnam NAME @@ -3127,6 +3280,17 @@ Even though it looks as though they're the same method calls (uid), they aren't, because a C<File::stat> object is different from a C<User::pwent> object. +Many of these functions are not safe in a multi-threaded environment +where more than one thread can be using them. In particular, functions +like C<getpwent()> iterate per-process and not per-thread, so if two +threads are simultaneously iterating, neither will get all the records. + +Some systems have thread-safe versions of some of the functions, such as +C<getpwnam_r()> instead of C<getpwnam()>. There, Perl automatically and +invisibly substitutes the thread-safe version, without notice. This +means that code that safely runs on some systems can fail on others that +lack the thread-safe versions. + Portability issues: L<perlport/getpwnam> to L<perlport/endservent>. =item getsockname SOCKET @@ -3229,6 +3393,13 @@ See L<File::Glob> for details, including L<C<bsd_glob>|File::Glob/C<bsd_glob>>, which does not treat whitespace as a pattern separator. +If a C<glob> expression is used as the condition of a C<while> or C<for> +loop, then it will be implicitly assigned to C<$_>. If either a C<glob> +expression or an explicit assignment of a C<glob> expression to a scalar +is used as a C<while>/C<for> condition, then the condition actually +tests for definedness of the expression's value, not for its regular +truth value. + Portability issues: L<perlport/glob>. =item gmtime EXPR @@ -3288,7 +3459,12 @@ assignment. Use of C<goto LABEL> or C<goto EXPR> to jump into a construct is deprecated and will issue a warning. Even then, it may not be used to go into any construct that requires initialization, such as a -subroutine or a C<foreach> loop. 
It also can't be used to go into a +subroutine, a C<foreach> loop, or a C<given> +block. In general, it may not be used to jump into the parameter +of a binary or list operator, but it may be used to jump into the +I<first> parameter of a binary operator. (The C<=> +assignment operator's "first" operand is its right-hand +operand.) It also can't be used to go into a construct that is optimized away. The C<goto &NAME> form is quite different from the other forms of @@ -3507,7 +3683,8 @@ may behave differently to Perl's hashes with respect to changes in order on insertion and deletion of items. As a side effect, calling L<C<keys>|/keys HASH> resets the internal -iterator of the HASH or ARRAY (see L<C<each>|/each HASH>). In +iterator of the HASH or ARRAY (see L<C<each>|/each HASH>) before +yielding the keys. In particular, calling L<C<keys>|/keys HASH> in void context resets the iterator with no other overhead. @@ -3655,9 +3832,10 @@ L<C<continue>|/continue BLOCK> block, if any, is not executed: #... } -L<C<last>|/last LABEL> cannot be used to exit a block that returns a -value such as C<eval {}>, C<sub {}>, or C<do {}>, and should not be used -to exit a L<C<grep>|/grep BLOCK LIST> or L<C<map>|/map BLOCK LIST> +L<C<last>|/last LABEL> cannot return a value from a block that typically +returns a value, such as C<eval {}>, C<sub {}>, or C<do {}>. It will perform +its flow control behavior, which precludes any return value. It should not be +used to exit a L<C<grep>|/grep BLOCK LIST> or L<C<map>|/map BLOCK LIST> operation. Note that a block by itself is semantically identical to a loop @@ -3764,8 +3942,8 @@ many elements these have. For that, use C<scalar @array> and C<scalar keys Like all Perl character operations, L<C<length>|/length EXPR> normally deals in logical characters, not physical bytes. For how many bytes a string encoded as -UTF-8 would take up, use C<length(Encode::encode_utf8(EXPR))> (you'll have -to C<use Encode> first). See L<Encode> and L<perlunicode>. 
+UTF-8 would take up, use C<length(Encode::encode('UTF-8', EXPR))> +(you'll have to C<use Encode> first). See L<Encode> and L<perlunicode>. =item __LINE__ X<__LINE__> @@ -3964,12 +4142,11 @@ X<map> =for Pod::Functions apply a change to a list to get back a new list with the changes Evaluates the BLOCK or EXPR for each element of LIST (locally setting -L<C<$_>|perlvar/$_> to each element) and returns the list value composed -of the -results of each such evaluation. In scalar context, returns the -total number of elements so generated. Evaluates BLOCK or EXPR in -list context, so each element of LIST may produce zero, one, or -more elements in the returned value. +L<C<$_>|perlvar/$_> to each element) and composes a list of the results of +each such evaluation. Each element of LIST may produce zero, one, or more +elements in the generated list, so the number of elements in the generated +list may differ from that in LIST. In scalar context, returns the total +number of elements so generated. In list context, returns the generated list. my @chars = map(chr, @numbers); @@ -4036,7 +4213,7 @@ or to force an anon hash constructor use C<+{>: to get a list of anonymous hashes each with only one entry apiece. -=item mkdir FILENAME,MASK +=item mkdir FILENAME,MODE X<mkdir> X<md> X<directory, create> =item mkdir FILENAME @@ -4046,19 +4223,19 @@ X<mkdir> X<md> X<directory, create> =for Pod::Functions create a directory Creates the directory specified by FILENAME, with permissions -specified by MASK (as modified by L<C<umask>|/umask EXPR>). If it +specified by MODE (as modified by L<C<umask>|/umask EXPR>). If it succeeds it returns true; otherwise it returns false and sets L<C<$!>|perlvar/$!> (errno). -MASK defaults to 0777 if omitted, and FILENAME defaults +MODE defaults to 0777 if omitted, and FILENAME defaults to L<C<$_>|perlvar/$_> if omitted. 
-In general, it is better to create directories with a permissive MASK +In general, it is better to create directories with a permissive MODE and let the user modify that with their L<C<umask>|/umask EXPR> than it is to supply -a restrictive MASK and give the user no way to be more permissive. +a restrictive MODE and give the user no way to be more permissive. The exceptions to this rule are when the file or directory should be kept private (mail files, for instance). The documentation for -L<C<umask>|/umask EXPR> discusses the choice of MASK in more detail. +L<C<umask>|/umask EXPR> discusses the choice of MODE in more detail. Note that according to the POSIX 1003.1-1996 the FILENAME may have any number of trailing slashes. Some operating and filesystems do not get @@ -4186,9 +4363,10 @@ refers to the innermost enclosing loop. The C<next EXPR> form, available as of Perl 5.18.0, allows a label name to be computed at run time, being otherwise identical to C<next LABEL>. -L<C<next>|/next LABEL> cannot be used to exit a block which returns a -value such as C<eval {}>, C<sub {}>, or C<do {}>, and should not be used -to exit a L<C<grep>|/grep BLOCK LIST> or L<C<map>|/map BLOCK LIST> +L<C<next>|/next LABEL> cannot return a value from a block that typically +returns a value, such as C<eval {}>, C<sub {}>, or C<do {}>. It will perform +its flow control behavior, which precludes any return value. It should not be +used to exit a L<C<grep>|/grep BLOCK LIST> or L<C<map>|/map BLOCK LIST> operation. Note that a block by itself is semantically identical to a loop @@ -4350,7 +4528,7 @@ opens the UTF8-encoded file containing Unicode characters; see L<perluniintro>. Note that if layers are specified in the three-argument form, then default layers stored in ${^OPEN} (see L<perlvar>; usually set by the L<open> pragma or the switch C<-CioD>) are ignored. 
-Those layers will also be ignored if you specifying a colon with no name +Those layers will also be ignored if you specify a colon with no name following it. In that case the default layer for the operating system (:raw on Unix, :crlf on Windows) is used. @@ -4406,9 +4584,9 @@ argument being L<C<undef>|/undef EXPR>: open(my $tmp, "+>", undef) or die ... -opens a filehandle to an anonymous temporary file. Also using C<< +< >> -works for symmetry, but you really should consider writing something -to the temporary file first. You will need to +opens a filehandle to a newly created empty anonymous temporary file. +(This happens under any mode, which makes C<< +> >> the only useful and +sensible mode to use.) You will need to L<C<seek>|/seek FILEHANDLE,POSITION,WHENCE> to do the reading. Perl is built using PerlIO by default. Unless you've @@ -4423,6 +4601,13 @@ To (re)open C<STDOUT> or C<STDERR> as an in-memory file, close it first: open(STDOUT, ">", \$variable) or die "Can't open STDOUT: $!"; +The scalars for in-memory files are treated as octet strings: unless +the file is being opened with truncation the scalar may not contain +any code points over 0xFF. + +Opening in-memory files I<can> fail for a variety of reasons. As with +any other C<open>, check the return value for success. + See L<perliol> for detailed info on PerlIO. General examples: @@ -4668,7 +4853,8 @@ DIRHANDLE may be an expression whose value can be used as an indirect dirhandle, usually the real dirhandle name. If DIRHANDLE is an undefined scalar variable (or array or hash element), the variable is assigned a reference to a new anonymous dirhandle; that is, it's autovivified. -DIRHANDLEs have their own namespace separate from FILEHANDLEs. +Dirhandles are the same objects as filehandles; an I/O object can only +be open as one of these handle types at once. See the example at L<C<readdir>|/readdir DIRHANDLE>. @@ -4852,7 +5038,7 @@ of values, as follows: those. Raises an exception otherwise.) 
i A signed integer value. - I A unsigned integer value. + I An unsigned integer value. (This 'integer' is _at_least_ 32 bits wide. Its exact size depends on what a local C compiler calls 'int'.) @@ -5604,7 +5790,9 @@ X<pos> X<match, position> Returns the offset of where the last C<m//g> search left off for the variable in question (L<C<$_>|perlvar/$_> is used when the variable is not -specified). Note that 0 is a valid match offset. +specified). This offset is in characters unless the +(no-longer-recommended) L<C<use bytes>|bytes> pragma is in effect, in +which case the offset is in bytes. Note that 0 is a valid match offset. L<C<undef>|/undef EXPR> indicates that the search position is reset (usually due to match failure, but can also be because no match has yet been run on the scalar). @@ -5667,7 +5855,7 @@ returning the filehandle value instead, in which case the LIST may not be omitted: print { $files[$i] } "stuff\n"; - print { $OK ? STDOUT : STDERR } "stuff\n"; + print { $OK ? *STDOUT : *STDERR } "stuff\n"; Printing to a closed pipe or socket will generate a SIGPIPE signal. See L<perlipc> for more on signal handling. @@ -5970,6 +6158,10 @@ it would have been testing the wrong file. As of Perl 5.12 you can use a bare L<C<readdir>|/readdir DIRHANDLE> in a C<while> loop, which will set L<C<$_>|perlvar/$_> on every iteration. +If either a C<readdir> expression or an explicit assignment of a +C<readdir> expression to a scalar is used as a C<while>/C<for> condition, +then the condition actually tests for definedness of the expression's +value, not for its regular truth value. opendir(my $dh, $some_dir) || die "Can't open $some_dir: $!"; while (readdir $dh) { @@ -6039,6 +6231,13 @@ L<C<eof>|/eof FILEHANDLE> handles C<ARGV> differently. } } +Like the C<< <EXPR> >> operator, if a C<readline> expression is +used as the condition of a C<while> or C<for> loop, then it will be +implicitly assigned to C<$_>. 
If either a C<readline> expression or +an explicit assignment of a C<readline> expression to a scalar is used +as a C<while>/C<for> condition, then the condition actually tests for +definedness of the expression's value, not for its regular truth value. + =item readlink EXPR X<readlink> @@ -6089,7 +6288,7 @@ Note the I<characters>: depending on the status of the socket, either (8-bit) bytes or characters are received. By default all sockets operate on bytes, but for example if the socket has been changed using L<C<binmode>|/binmode FILEHANDLE, LAYER> to operate with the -C<:encoding(utf8)> I/O layer (see the L<open> pragma), the I/O will +C<:encoding(UTF-8)> I/O layer (see the L<open> pragma), the I/O will operate on UTF8-encoded Unicode characters, not bytes. Similarly for the C<:encoding> layer: in that case pretty much any characters can be read. @@ -6129,9 +6328,10 @@ normally use this command: print; } -L<C<redo>|/redo LABEL> cannot be used to retry a block that returns a -value such as C<eval {}>, C<sub {}>, or C<do {}>, and should not be used -to exit a L<C<grep>|/grep BLOCK LIST> or L<C<map>|/map BLOCK LIST> +L<C<redo>|/redo LABEL> cannot return a value from a block that typically +returns a value, such as C<eval {}>, C<sub {}>, or C<do {}>. It will perform +its flow control behavior, which precludes any return value. It should not be +used to exit a L<C<grep>|/grep BLOCK LIST> or L<C<map>|/map BLOCK LIST> operation. Note that a block by itself is semantically identical to a loop @@ -6154,59 +6354,46 @@ X<ref> X<reference> =for Pod::Functions find out the type of thing being referenced -Returns a non-empty string if EXPR is a reference, the empty -string otherwise. If EXPR is not specified, L<C<$_>|perlvar/$_> will be -used. The value returned depends on the type of thing the reference is -a reference to. 
- -Builtin types include: - - SCALAR - ARRAY - HASH - CODE - REF - GLOB - LVALUE - FORMAT - IO - VSTRING - Regexp - -You can think of L<C<ref>|/ref EXPR> as a C<typeof> operator. - - if (ref($r) eq "HASH") { - print "r is a reference to a hash.\n"; - } - unless (ref($r)) { - print "r is not a reference at all.\n"; - } - -The return value C<LVALUE> indicates a reference to an lvalue that is not -a variable. You get this from taking the reference of function calls like -L<C<pos>|/pos SCALAR> or -L<C<substr>|/substr EXPR,OFFSET,LENGTH,REPLACEMENT>. C<VSTRING> is -returned if the reference points to a -L<version string|perldata/"Version Strings">. - -The result C<Regexp> indicates that the argument is a regular expression -resulting from L<C<qrE<sol>E<sol>>|/qrE<sol>STRINGE<sol>>. - -If the referenced object has been blessed into a package, then that package -name is returned instead. But don't use that, as it's now considered -"bad practice". For one reason, an object could be using a class called -C<Regexp> or C<IO>, or even C<HASH>. Also, L<C<ref>|/ref EXPR> doesn't -take into account subclasses, like -L<C<isa>|UNIVERSAL/C<< $obj->isa( TYPE ) >>> does. - -Instead, use L<C<blessed>|Scalar::Util/blessed> (in the L<Scalar::Util> -module) for boolean checks, L<C<isa>|UNIVERSAL/C<< $obj->isa( TYPE ) >>> -for specific class checks and L<C<reftype>|Scalar::Util/reftype> (also -from L<Scalar::Util>) for type checks. (See L<perlobj> for details and -a L<C<blessed>|Scalar::Util/blessed>/L<C<isa>|UNIVERSAL/C<< $obj->isa( TYPE ) >>> -example.) - -See also L<perlref>. +Examines the value of EXPR, expecting it to be a reference, and returns +a string giving information about the reference and the type of referent. +If EXPR is not specified, L<C<$_>|perlvar/$_> will be used. + +If the operand is not a reference, then the empty string will be returned. +An empty string will only be returned in this situation. 
C<ref> is often +useful to just test whether a value is a reference, which can be done +by comparing the result to the empty string. It is a common mistake +to use the result of C<ref> directly as a truth value: this goes wrong +because C<0> (which is false) can be returned for a reference. + +If the operand is a reference to a blessed object, then the name of +the class into which the referent is blessed will be returned. C<ref> +doesn't care what the physical type of the referent is; blessing takes +precedence over such concerns. Beware that exact comparison of C<ref> +results against a class name doesn't perform a class membership test: +a class's members also include objects blessed into subclasses, for +which C<ref> will return the name of the subclass. Also beware that +class names can clash with the built-in type names (described below). + +If the operand is a reference to an unblessed object, then the return +value indicates the type of object. If the unblessed referent is not +a scalar, then the return value will be one of the strings C<ARRAY>, +C<HASH>, C<CODE>, C<FORMAT>, or C<IO>, indicating only which kind of +object it is. If the unblessed referent is a scalar, then the return +value will be one of the strings C<SCALAR>, C<VSTRING>, C<REF>, C<GLOB>, +C<LVALUE>, or C<REGEXP>, depending on the kind of value the scalar +currently has. Beware that these built-in type names can also be used as +class names, so C<ref> returning one of these names doesn't unambiguously +indicate that the referent is of the kind to which the name refers. + +The ambiguity between built-in type names and class names significantly +limits the utility of C<ref>. For unambiguous information, use +L<C<Scalar::Util::blessed()>|Scalar::Util/blessed> for information about +blessing, and L<C<Scalar::Util::reftype()>|Scalar::Util/reftype> for +information about physical types. 
Use L<the C<isa> method|UNIVERSAL/C<< +$obj->isa( TYPE ) >>> for class membership tests, though one must be +sure of blessedness before attempting a method call. + +See also L<perlref> and L<perlobj>. =item rename OLDNAME,NEWNAME X<rename> X<move> X<mv> X<ren> @@ -6240,23 +6427,24 @@ X<require> Demands a version of Perl specified by VERSION, or demands some semantics specified by EXPR or by L<C<$_>|perlvar/$_> if EXPR is not supplied. -VERSION may be either a numeric argument such as 5.006, which will be -compared to L<C<$]>|perlvar/$]>, or a literal of the form v5.6.1, which -will be compared to L<C<$^V>|perlvar/$^V> (or C<$PERL_VERSION> in -L<English>). An exception is raised if VERSION is greater than the -version of the current Perl interpreter. Compare with +VERSION may be either a literal such as v5.24.1, which will be +compared to L<C<$^V>|perlvar/$^V> (or C<$PERL_VERSION> in L<English>), +or a numeric argument of the form 5.024001, which will be compared to +L<C<$]>|perlvar/$]>. An exception is raised if VERSION is greater than +the version of the current Perl interpreter. Compare with L<C<use>|/use Module VERSION LIST>, which can do a similar check at compile time. -Specifying VERSION as a literal of the form v5.6.1 should generally be -avoided, because it leads to misleading error messages under earlier -versions of Perl that do not support this syntax. The equivalent numeric -version should be used instead. +Specifying VERSION as a numeric argument of the form 5.024001 should +generally be avoided as older less readable syntax compared to +v5.24.1. Before perl 5.8.0 (released in 2002), the more verbose numeric +form was the only supported syntax, which is why you might see it in +older code. 
- require v5.6.1; # run time version check - require 5.6.1; # ditto - require 5.006_001; # ditto; preferred for backwards - compatibility + require v5.24.1; # run time version check + require 5.24.1; # ditto + require 5.024_001; # ditto; older syntax compatible + with perl 5.6 Otherwise, L<C<require>|/require VERSION> demands that a library file be included if it hasn't already been included. The file is included via @@ -6323,14 +6511,16 @@ statements. If EXPR is a bareword, L<C<require>|/require VERSION> assumes a F<.pm> extension and replaces C<::> with C</> in the filename for you, to make it easy to load standard modules. This form of loading of -modules does not risk altering your namespace. +modules does not risk altering your namespace, however it will autovivify +the stash for the required module. In other words, if you try this: require Foo::Bar; # a splendid bareword The require function will actually look for the F<Foo/Bar.pm> file in the -directories specified in the L<C<@INC>|perlvar/@INC> array. +directories specified in the L<C<@INC>|perlvar/@INC> array, and it will +autovivify the C<Foo::Bar::> stash at compile time. But if you try this: @@ -6345,12 +6535,20 @@ will complain about not finding F<Foo::Bar> there. In this case you can do: eval "require $class"; +or you could do + + require "Foo/Bar.pm"; + +Neither of these forms will autovivify any stashes at compile time and +only have run time effects. + Now that you understand how L<C<require>|/require VERSION> looks for files with a bareword argument, there is a little extra functionality going on behind the scenes. Before L<C<require>|/require VERSION> looks for a F<.pm> extension, it will first look for a similar filename with a F<.pmc> extension. If this file is found, it will be loaded in place of -any file ending in a F<.pm> extension. +any file ending in a F<.pm> extension. This applies to both the explicit +C<require "Foo/Bar.pm";> form and the C<require Foo::Bar;> form. 
You can also insert hooks into the import facility by putting Perl code directly into the L<C<@INC>|perlvar/@INC> array. There are three forms @@ -6384,11 +6582,12 @@ subroutine will be called to act as a simple source filter, with the line as read in L<C<$_>|perlvar/$_>. Again, return 1 for each valid line, and 0 after all lines have been returned. +For historical reasons the subroutine will receive a meaningless argument +(in fact always the numeric value zero) as C<$_[0]>. =item 4 -Optional state for the subroutine. The state is passed in as C<$_[1]>. A -reference to the subroutine itself is passed in as C<$_[0]>. +Optional state for the subroutine. The state is passed in as C<$_[1]>. =back @@ -6648,12 +6847,13 @@ C<SEEK_CUR>, and C<SEEK_END> (start of the file, current position, end of the file) from the L<Fcntl> module. Returns C<1> on success, false otherwise. -Note the I<in bytes>: even if the filehandle has been set to -operate on characters (for example by using the C<:encoding(utf8)> open -layer), L<C<tell>|/tell FILEHANDLE> will return byte offsets, not -character offsets (because implementing that would render -L<C<seek>|/seek FILEHANDLE,POSITION,WHENCE> and -L<C<tell>|/tell FILEHANDLE> rather slow). +Note the emphasis on bytes: even if the filehandle has been set to operate +on characters (for example using the C<:encoding(UTF-8)> I/O layer), the +L<C<seek>|/seek FILEHANDLE,POSITION,WHENCE>, +L<C<tell>|/tell FILEHANDLE>, and +L<C<sysseek>|/sysseek FILEHANDLE,POSITION,WHENCE> +family of functions use byte offsets, not character offsets, +because seeking to a character offset would be very slow in a UTF-8 file. If you want to position the file for L<C<sysread>|/sysread FILEHANDLE,SCALAR,LENGTH,OFFSET> or @@ -6887,7 +7087,7 @@ Note the I<characters>: depending on the status of the socket, either (8-bit) bytes or characters are sent. 
By default all sockets operate on bytes, but for example if the socket has been changed using L<C<binmode>|/binmode FILEHANDLE, LAYER> to operate with the -C<:encoding(utf8)> I/O layer (see L<C<open>|/open FILEHANDLE,EXPR>, or +C<:encoding(UTF-8)> I/O layer (see L<C<open>|/open FILEHANDLE,EXPR>, or the L<open> pragma), the I/O will operate on UTF-8 encoded Unicode characters, not bytes. Similarly for the C<:encoding> layer: in that case pretty much any characters can be sent. @@ -6916,6 +7116,9 @@ Sets the current priority for a process, a process group, or a user. (See L<setpriority(2)>.) Raises an exception when used on a machine that doesn't implement L<setpriority(2)>. +C<WHICH> can be any of C<PRIO_PROCESS>, C<PRIO_PGRP> or C<PRIO_USER> +imported from L<POSIX/RESOURCE CONSTANTS>. + Portability issues: L<perlport/setpriority>. =item setsockopt SOCKET,LEVEL,OPTNAME,OPTVAL @@ -7138,7 +7341,7 @@ sockets but not socketpair. Portability issues: L<perlport/socketpair>. =item sort SUBNAME LIST -X<sort> X<qsort> X<quicksort> X<mergesort> +X<sort> =item sort BLOCK LIST @@ -7165,9 +7368,7 @@ If the subroutine's prototype is C<($$)>, the elements to be compared are passed by reference in L<C<@_>|perlvar/@_>, as for a normal subroutine. This is slower than unprototyped subroutines, where the elements to be compared are passed into the subroutine as the package global variables -C<$a> and C<$b> (see example below). Note that in the latter case, it -is usually highly counter-productive to declare C<$a> and C<$b> as -lexicals. +C<$a> and C<$b> (see example below). If the subroutine is an XSUB, the elements to be compared are pushed on to the stack, the way arguments are usually passed to XSUBs. C<$a> and @@ -7192,19 +7393,9 @@ L<C<grep>|/grep BLOCK LIST>) actually modifies the element in the original list. This is usually something to be avoided when writing clear code. -Perl 5.6 and earlier used a quicksort algorithm to implement sort. 
-That algorithm was not stable and I<could> go quadratic. (A I<stable> sort -preserves the input order of elements that compare equal. Although -quicksort's run time is O(NlogN) when averaged over all arrays of -length N, the time can be O(N**2), I<quadratic> behavior, for some -inputs.) In 5.7, the quicksort implementation was replaced with -a stable mergesort algorithm whose worst-case behavior is O(NlogN). -But benchmarks indicated that for some inputs, on some platforms, -the original quicksort was faster. 5.8 has a L<sort> pragma for -limited control of the sort. Its rather blunt control of the -underlying algorithm may not persist into future Perls, but the -ability to characterize the input or output in implementation -independent ways quite probably will. +Historically Perl has varied in whether sorting is stable by default. +If stability matters, it can be controlled explicitly by using the +L<sort> pragma. Examples: @@ -7287,14 +7478,10 @@ Examples: package main; my @new = sort Other::backwards @old; - # guarantee stability, regardless of algorithm + # guarantee stability use sort 'stable'; my @new = sort { substr($a, 3, 5) cmp substr($b, 3, 5) } @old; - # force use of mergesort (not portable outside Perl 5.8) - use sort '_mergesort'; # note discouraging _ - my @new = sort { substr($a, 3, 5) cmp substr($b, 3, 5) } @old; - Warning: syntactical care is required when sorting the list returned from a function. If you want to sort the list returned by the function call C<find_records(@key)>, you can use: @@ -7312,16 +7499,63 @@ C<find_records()> then you can use: my @contact = sort(find_records @key); my @contact = sort(find_records (@key)); -You I<must not> declare C<$a> -and C<$b> as lexicals. They are package globals. 
That means -that if you're in the C<main> package and type - - my @articles = sort {$b <=> $a} @files; - -then C<$a> and C<$b> are C<$main::a> and C<$main::b> (or C<$::a> and C<$::b>), -but if you're in the C<FooPack> package, it's the same as typing - - my @articles = sort {$FooPack::b <=> $FooPack::a} @files; +C<$a> and C<$b> are set as package globals in the package the sort() is +called from. That means C<$main::a> and C<$main::b> (or C<$::a> and +C<$::b>) in the C<main> package, C<$FooPack::a> and C<$FooPack::b> in the +C<FooPack> package, etc. If the sort block is in scope of a C<my> or +C<state> declaration of C<$a> and/or C<$b>, you I<must> spell out the full +name of the variables in the sort block : + + package main; + my $a = "C"; # DANGER, Will Robinson, DANGER !!! + + print sort { $a cmp $b } qw(A C E G B D F H); + # WRONG + sub badlexi { $a cmp $b } + print sort badlexi qw(A C E G B D F H); + # WRONG + # the above prints BACFEDGH or some other incorrect ordering + + print sort { $::a cmp $::b } qw(A C E G B D F H); + # OK + print sort { our $a cmp our $b } qw(A C E G B D F H); + # also OK + print sort { our ($a, $b); $a cmp $b } qw(A C E G B D F H); + # also OK + sub lexi { our $a cmp our $b } + print sort lexi qw(A C E G B D F H); + # also OK + # the above print ABCDEFGH + +With proper care you may mix package and my (or state) C<$a> and/or C<$b>: + + my $a = { + tiny => -2, + small => -1, + normal => 0, + big => 1, + huge => 2 + }; + + say sort { $a->{our $a} <=> $a->{our $b} } + qw{ huge normal tiny small big}; + + # prints tinysmallnormalbighuge + +C<$a> and C<$b> are implicitly local to the sort() execution and regain their +former values upon completing the sort. + +Sort subroutines written using C<$a> and C<$b> are bound to their calling +package. 
It is possible, but of limited interest, to define them in a +different package, since the subroutine must still refer to the calling +package's C<$a> and C<$b> : + + package Foo; + sub lexi { $Bar::a cmp $Bar::b } + package Bar; + ... sort Foo::lexi ... + +Use the prototyped versions (see above) for a more generic alternative. The comparison function is required to behave. If it returns inconsistent results (sometimes saying C<$x[1]> is less than C<$x[2]> and @@ -7403,6 +7637,8 @@ X<split> Splits the string EXPR into a list of strings and returns the list in list context, or the size of the list in scalar context. +(Prior to Perl 5.11, it also overwrote C<@_> with the list in +void and scalar context. If you target old perls, beware.) If only PATTERN is given, EXPR defaults to L<C<$_>|perlvar/$_>. @@ -7439,6 +7675,10 @@ If PATTERN is C</^/>, then it is treated as if it used the L<multiline modifier|perlreref/OPERATORS> (C</^/m>), since it isn't much use otherwise. +C<E<sol>m> and any of the other pattern modifiers valid for C<qr> +(summarized in L<perlop/qrE<sol>STRINGE<sol>msixpodualn>) may be +specified explicitly. + As another special case, L<C<split>|/split E<sol>PATTERNE<sol>,EXPR,LIMIT> emulates the default behavior of the @@ -7455,6 +7695,14 @@ special case was restricted to the use of a plain S<C<" ">> as the pattern argument to split; in Perl 5.18.0 and later this special case is triggered by any expression which evaluates to the simple string S<C<" ">>. +As of Perl 5.28, this special-cased whitespace splitting works as expected in +the scope of L<< S<C<"use feature 'unicode_strings">>|feature/The +'unicode_strings' feature >>. In previous versions, and outside the scope of +that feature, it exhibits L<perlunicode/The "Unicode Bug">: characters that are +whitespace according to Unicode rules but not according to ASCII rules can be +treated as part of fields rather than as field separators, depending on the +string's internal encoding. 
+ If omitted, PATTERN defaults to a single space, S<C<" ">>, triggering the previously described I<awk> emulation. @@ -7745,9 +7993,8 @@ For example: printf '<%e>', 10; # prints "<1.000000e+01>" printf '<%.1e>', 10; # prints "<1.0e+01>" -For "g" and "G", this specifies the maximum number of digits to show, -including those prior to the decimal point and those after it; for -example: +For "g" and "G", this specifies the maximum number of significant digits to +show; for example: # These examples are subject to system-specific variation. printf '<%g>', 1; # prints "<1>" @@ -7757,6 +8004,9 @@ example: printf '<%.2g>', 100.01; # prints "<1e+02>" printf '<%.5g>', 100.01; # prints "<100.01>" printf '<%.4g>', 100.01; # prints "<100>" + printf '<%.1g>', 0.0111; # prints "<0.01>" + printf '<%.2g>', 0.0111; # prints "<0.011>" + printf '<%.3g>', 0.0111; # prints "<0.0111>" For integer conversions, specifying a precision implies that the output of the number itself should be zero-padded to this width, @@ -7818,8 +8068,8 @@ as supported by the compiler used to build Perl: h interpret integer as C type "short" or "unsigned short" j interpret integer as C type "intmax_t" on Perl - 5.14 or later, and only with a C99 compiler - (unportable) + 5.14 or later; and only with a C99 compiler + prior to Perl 5.30 (unportable) l interpret integer as C type "long" or "unsigned long" q, L, or ll interpret integer as C type "long long", @@ -8044,6 +8294,15 @@ L<C<lstat>|/lstat FILEHANDLE>, or filetest are returned. Example: (This works on machines only for which the device number is negative under NFS.) +On some platforms inode numbers are of a type larger than perl knows how +to handle as integer numerical values. If necessary, an inode number will +be returned as a decimal string in order to preserve the entire value. +If used in a numeric context, this will be converted to a floating-point +numerical value, with rounding, a fate that is best avoided. 
Therefore, +you should prefer to compare inode numbers using C<eq> rather than C<==>. +C<eq> will work fine on inode numbers that are represented numerically, +as well as those represented as strings. + Because the mode contains both the file type and its permissions, you should mask off the file type portion and (s)printf using a C<"%o"> if you want to see the real permissions. @@ -8154,7 +8413,7 @@ If more than one variable is listed, the list must be placed in parentheses. With a parenthesised list, L<C<undef>|/undef EXPR> can be used as a dummy placeholder. However, since initialization of state variables in -list context is currently not possible this would serve no purpose. +such lists is currently not possible this would serve no purpose. L<C<state>|/state VARLIST> is available only if the L<C<"state"> feature|feature/The 'state' feature> is enabled or if it is @@ -8169,68 +8428,16 @@ X<study> =item study -=for Pod::Functions optimize input data for repeated searches - -B<Note that since Perl version 5.16 this function has been a no-op, but -this might change in a future release.> - -May take extra time to study SCALAR (L<C<$_>|perlvar/$_> if unspecified) -in anticipation -of doing many pattern matches on the string before it is next modified. -This may or may not save time, depending on the nature and number of -patterns you are searching and the distribution of character -frequencies in the string to be searched; you probably want to compare -run times with and without it to see which is faster. Those loops -that scan for many short constant strings (including the constant -parts of more complex patterns) will benefit most. +=for Pod::Functions no-op, formerly optimized input data for repeated searches -(The way L<C<study>|/study SCALAR> used to work is this: a linked list -of every -character in the string to be searched is made, so we know, for -example, where all the C<'k'> characters are. 
From each search string, -the rarest character is selected, based on some static frequency tables -constructed from some C programs and English text. Only those places -that contain this "rarest" character are examined.) +At this time, C<study> does nothing. This may change in the future. -For example, here is a loop that inserts index producing entries -before any line containing a certain pattern: +Prior to Perl version 5.16, it would create an inverted index of all characters +that occurred in the given SCALAR (or L<C<$_>|perlvar/$_> if unspecified). When +matching a pattern, the rarest character from the pattern would be looked up in +this index. Rarity was based on some static frequency tables constructed from +some C programs and English text. - while (<>) { - study; - print ".IX foo\n" if /\bfoo\b/; - print ".IX bar\n" if /\bbar\b/; - print ".IX blurfl\n" if /\bblurfl\b/; - # ... - print; - } - -In searching for C</\bfoo\b/>, only locations in L<C<$_>|perlvar/$_> -that contain C<f> -will be looked at, because C<f> is rarer than C<o>. In general, this is -a big win except in pathological cases. The only question is whether -it saves you more time than it took to build the linked list in the -first place. - -Note that if you have to look for strings that you don't know till -runtime, you can build an entire loop as a string and L<C<eval>|/eval -EXPR> that to avoid recompiling all your patterns all the time. -Together with undefining L<C<$E<sol>>|perlvar/$E<sol>> to input entire -files as one record, this can be quite -fast, often faster than specialized programs like L<fgrep(1)>. 
The following -scans a list of files (C<@files>) for a list of words (C<@words>), and prints -out the names of those files that contain a match: - - my $search = 'local $/; while (<>) { study;'; - foreach my $word (@words) { - $search .= "++\$seen{\$ARGV} if /\\b$word\\b/;\n"; - } - $search .= "}"; - @ARGV = @files; - my %seen; - eval $search; # this screams - foreach my $file (sort keys(%seen)) { - print $file, "\n"; - } =item sub NAME BLOCK X<sub> @@ -8529,17 +8736,19 @@ X<sysseek> X<lseek> =for Pod::Functions +5.004 position I/O pointer on handle used with sysread and syswrite -Sets FILEHANDLE's system position in bytes using L<lseek(2)>. FILEHANDLE may +Sets FILEHANDLE's system position I<in bytes> using L<lseek(2)>. FILEHANDLE may be an expression whose value gives the name of the filehandle. The values for WHENCE are C<0> to set the new position to POSITION; C<1> to set it to the current position plus POSITION; and C<2> to set it to EOF plus POSITION, typically negative. -Note the I<in bytes>: even if the filehandle has been set to operate -on characters (for example by using the C<:encoding(utf8)> I/O layer), -L<C<tell>|/tell FILEHANDLE> will return byte offsets, not character -offsets (because implementing that would render -L<C<sysseek>|/sysseek FILEHANDLE,POSITION,WHENCE> unacceptably slow). +Note the emphasis on bytes: even if the filehandle has been set to operate +on characters (for example using the C<:encoding(UTF-8)> I/O layer), the +L<C<seek>|/seek FILEHANDLE,POSITION,WHENCE>, +L<C<tell>|/tell FILEHANDLE>, and +L<C<sysseek>|/sysseek FILEHANDLE,POSITION,WHENCE> +family of functions use byte offsets, not character offsets, +because seeking to a character offset would be very slow in a UTF-8 file. L<C<sysseek>|/sysseek FILEHANDLE,POSITION,WHENCE> bypasses normal buffered IO, so mixing it with reads other than @@ -8700,19 +8909,21 @@ error. FILEHANDLE may be an expression whose value gives the name of the actual filehandle.
If FILEHANDLE is omitted, assumes the file last read. -Note the I<in bytes>: even if the filehandle has been set to -operate on characters (for example by using the C<:encoding(utf8)> open -layer), L<C<tell>|/tell FILEHANDLE> will return byte offsets, not -character offsets (because that would render -L<C<seek>|/seek FILEHANDLE,POSITION,WHENCE> and -L<C<tell>|/tell FILEHANDLE> rather slow). +Note the emphasis on bytes: even if the filehandle has been set to operate +on characters (for example using the C<:encoding(UTF-8)> I/O layer), the +L<C<seek>|/seek FILEHANDLE,POSITION,WHENCE>, +L<C<tell>|/tell FILEHANDLE>, and +L<C<sysseek>|/sysseek FILEHANDLE,POSITION,WHENCE> +family of functions use byte offsets, not character offsets, +because seeking to a character offset would be very slow in a UTF-8 file. The return value of L<C<tell>|/tell FILEHANDLE> for the standard streams like the STDIN depends on the operating system: it may return -1 or something else. L<C<tell>|/tell FILEHANDLE> on pipes, fifos, and sockets usually returns -1. -There is no C<systell> function. Use C<sysseek($fh, 0, 1)> for that. +There is no C<systell> function. Use +L<C<sysseek($fh, 0, 1)>|/sysseek FILEHANDLE,POSITION,WHENCE> for that. Do not use L<C<tell>|/tell FILEHANDLE> (or other buffered I/O operations) on a filehandle that has been manipulated by @@ -8959,7 +9170,7 @@ The Unix permission C<rwxr-x---> is represented as three sets of three bits, or three octal digits: C<0750> (the leading 0 indicates octal and isn't one of the digits). The L<C<umask>|/umask EXPR> value is such a number representing disabled permissions bits. 
The permission (or -"mode") values you pass L<C<mkdir>|/mkdir FILENAME,MASK> or +"mode") values you pass L<C<mkdir>|/mkdir FILENAME,MODE> or L<C<sysopen>|/sysopen FILEHANDLE,FILENAME,MODE> are modified by your umask, so even if you tell L<C<sysopen>|/sysopen FILEHANDLE,FILENAME,MODE> to create a file with @@ -8972,7 +9183,7 @@ file with mode C<0640> (because C<0666 &~ 027> is C<0640>). Here's some advice: supply a creation mode of C<0666> for regular files (in L<C<sysopen>|/sysopen FILEHANDLE,FILENAME,MODE>) and one of -C<0777> for directories (in L<C<mkdir>|/mkdir FILENAME,MASK>) and +C<0777> for directories (in L<C<mkdir>|/mkdir FILENAME,MODE>) and executable files. This gives users the freedom of choice: if they want protected files, they might choose process umasks of C<022>, C<027>, or even the particularly antisocial mask of C<077>. @@ -9177,25 +9388,24 @@ package. It is exactly equivalent to except that Module I<must> be a bareword. The importation can be made conditional by using the L<if> module. -In the peculiar C<use VERSION> form, VERSION may be either a positive -decimal fraction such as 5.006, which will be compared to -L<C<$]>|perlvar/$]>, or a v-string of the form v5.6.1, which will be -compared to L<C<$^V>|perlvar/$^V> (aka $PERL_VERSION). An -exception is raised if VERSION is greater than the version of the -current Perl interpreter; Perl will not attempt to parse the rest of the -file. Compare with L<C<require>|/require VERSION>, which can do a -similar check at run time. -Symmetrically, C<no VERSION> allows you to specify that you want a version -of Perl older than the specified one. - -Specifying VERSION as a literal of the form v5.6.1 should generally be -avoided, because it leads to misleading error messages under earlier -versions of Perl (that is, prior to 5.6.0) that do not support this -syntax. The equivalent numeric version should be used instead. 
- - use v5.6.1; # compile time version check - use 5.6.1; # ditto - use 5.006_001; # ditto; preferred for backwards compatibility +In the C<use VERSION> form, VERSION may be either a v-string such as +v5.24.1, which will be compared to L<C<$^V>|perlvar/$^V> (aka +$PERL_VERSION), or a numeric argument of the form 5.024001, which will +be compared to L<C<$]>|perlvar/$]>. An exception is raised if VERSION +is greater than the version of the current Perl interpreter; Perl will +not attempt to parse the rest of the file. Compare with +L<C<require>|/require VERSION>, which can do a similar check at run +time. Symmetrically, C<no VERSION> allows you to specify that you +want a version of Perl older than the specified one. + +Specifying VERSION as a numeric argument of the form 5.024001 should +generally be avoided as older less readable syntax compared to +v5.24.1. Before perl 5.8.0 (released in 2002), the more verbose numeric +form was the only supported syntax, which is why you might see it in older code. + + use v5.24.1; # compile time version check + use 5.24.1; # ditto + use 5.024_001; # ditto; older syntax compatible with perl 5.6 This is often useful if you need to check the current Perl version before L<C<use>|/use Module VERSION LIST>ing library modules that won't work @@ -9254,6 +9464,15 @@ The L<default C<VERSION> method|UNIVERSAL/C<VERSION ( [ REQUIRE ] )>>, inherited from the L<C<UNIVERSAL>|UNIVERSAL> class, croaks if the given version is larger than the value of the variable C<$Module::VERSION>. +The VERSION argument cannot be an arbitrary expression. It only counts +as a VERSION argument if it is a version number literal, starting with +either a digit or C<v> followed by a digit. Anything that doesn't +look like a version literal will be parsed as the start of the LIST.
+Nevertheless, many attempts to use an arbitrary expression as a VERSION +argument will appear to work, because L<Exporter>'s C<import> method +handles numeric arguments specially, performing version checks rather +than treating them as things to export. + Again, there is a distinction between omitting LIST (L<C<import>|/import LIST> called with no arguments) and an explicit empty LIST C<()> (L<C<import>|/import LIST> not called). Note that there is no comma @@ -9270,7 +9489,7 @@ pragmas are: use strict qw(subs vars refs); use subs qw(afunc blurfl); use warnings qw(all); - use sort qw(stable _quicksort _mergesort); + use sort qw(stable); Some of these pseudo-modules import semantics into the current block scope (like L<C<strict>|strict> or L<C<integer>|integer>, unlike @@ -9385,9 +9604,12 @@ may behave differently to Perl's hashes with respect to changes in order on insertion and deletion of items. As a side effect, calling L<C<values>|/values HASH> resets the HASH or -ARRAY's internal iterator, see L<C<each>|/each HASH>. (In particular, +ARRAY's internal iterator (see L<C<each>|/each HASH>) before yielding the +values. In particular, calling L<C<values>|/values HASH> in void context resets the iterator -with no other overhead. Apart from resetting the iterator, +with no other overhead. + +Apart from resetting the iterator, C<values @array> in list context is the same as plain C<@array>. (We recommend that you use void context C<keys @array> for this, but reasoned that taking C<values @array> out would require more @@ -9453,10 +9675,12 @@ extend the string with sufficiently many zero bytes. It is an error to try to write off the beginning of the string (i.e., negative OFFSET). If the string happens to be encoded as UTF-8 internally (and thus has -the UTF8 flag set), this is ignored by L<C<vec>|/vec EXPR,OFFSET,BITS>, -and it operates on the -internal byte string, not the conceptual character string, even if you -only have characters with values less than 256. 
+the UTF8 flag set), L<C<vec>|/vec EXPR,OFFSET,BITS> tries to convert it +to use a one-byte-per-character internal representation. However, if the +string contains characters with values of 256 or higher, that conversion +will fail, and a deprecation message will be raised. In that situation, +C<vec> will operate on the underlying buffer regardless, in its internal +UTF-8 representation. In Perl 5.32, this will be a fatal error. Strings created with L<C<vec>|/vec EXPR,OFFSET,BITS> can also be manipulated with the logical @@ -9748,21 +9972,19 @@ X<warn> X<warning> X<STDERR> =for Pod::Functions print debugging info -Prints the value of LIST to STDERR. If the last element of LIST does -not end in a newline, it appends the same file/line number text as -L<C<die>|/die LIST> does. - -If the output is empty and L<C<$@>|perlvar/$@> already contains a value -(typically from a previous eval) that value is used after appending -C<"\t...caught"> to L<C<$@>|perlvar/$@>. This is useful for staying -almost, but not entirely similar to L<C<die>|/die LIST>. - -If L<C<$@>|perlvar/$@> is empty, then the string -C<"Warning: Something's wrong"> is used. - -No message is printed if there is a L<C<$SIG{__WARN__}>|perlvar/%SIG> -handler -installed. It is the handler's responsibility to deal with the message +Emits a warning, usually by printing it to C<STDERR>. C<warn> interprets +its operand LIST in the same way as C<die>, but is slightly different +in what it defaults to when LIST is empty or makes an empty string. +If it is empty and L<C<$@>|perlvar/$@> already contains an exception +value then that value is used after appending C<"\t...caught">. If it +is empty and C<$@> is also empty then the string C<"Warning: Something's +wrong"> is used. + +By default, the exception derived from the operand LIST is stringified +and printed to C<STDERR>. This behaviour can be altered by installing +a L<C<$SIG{__WARN__}>|perlvar/%SIG> handler. 
If there is such a +handler then no message is automatically printed; it is the handler's +responsibility to deal with the exception as it sees fit (like, for instance, converting it into a L<C<die>|/die LIST>). Most handlers must therefore arrange to actually display the diff --git a/gnu/usr.bin/perl/pod/perlgit.pod b/gnu/usr.bin/perl/pod/perlgit.pod index 9d3edccba9f..000d6ac0d4f 100644 --- a/gnu/usr.bin/perl/pod/perlgit.pod +++ b/gnu/usr.bin/perl/pod/perlgit.pod @@ -481,6 +481,13 @@ the "first commit where the bug is solved". C<git help bisect> has much more information on how you can tweak your binary searches. +Following bisection you may wish to configure, build and test perl at +commits identified by the bisection process. Sometimes, particularly +with older perls, C<make> may fail during this process. In this case +you may be able to patch the source code at the older commit point. To +do so, please follow the suggestions provided in +L<perlhack/Building perl at older commits>. + =head2 Topic branches and rewriting history Individual committers should create topic branches under @@ -845,6 +852,9 @@ on other OSes test the change before you commit it to blead. Fortunately, there is a way to get your change smoke-tested on various OSes: push it to a "smoke-me" branch and wait for certain automated smoke-testers to report the results from their OSes. +A "smoke-me" branch is identified by the branch name: specifically, as +seen on perl5.git.perl.org it must be a local branch whose first name +component is precisely C<smoke-me>. The procedure for doing this is roughly as follows (using the example of tonyc's smoke-me branch called win32stat): @@ -918,11 +928,11 @@ general testing and development. Dromedary syncs the git tree from camel every few minutes, you should not push there. Both machines also have a full CPAN mirror in F</srv/CPAN>, please use this.
To share files with the general public, dromedary serves your F<~/public_html/> as -C<http://users.perl5.git.perl.org/~yourlogin/> +C<L<http://users.perl5.git.perl.org/~yourlogin/>> These hosts have fairly strict firewalls to the outside. Outgoing, only rsync, ssh and git are allowed. For http and ftp, you can use -http://webproxy:3128 as proxy. Incoming, the firewall tries to detect +L<http://webproxy:3128> as proxy. Incoming, the firewall tries to detect attacks and blocks IP addresses with suspicious activity. This sometimes (but very rarely) has false positives and you might get blocked. The quickest way to get unblocked is to notify the admins. diff --git a/gnu/usr.bin/perl/pod/perlguts.pod b/gnu/usr.bin/perl/pod/perlguts.pod index 42ebb8df228..5dce945cf17 100644 --- a/gnu/usr.bin/perl/pod/perlguts.pod +++ b/gnu/usr.bin/perl/pod/perlguts.pod @@ -56,7 +56,7 @@ The seven routines are: SV* newSVpvf(const char*, ...); SV* newSVsv(SV*); -C<STRLEN> is an integer type (Size_t, usually defined as size_t in +C<STRLEN> is an integer type (C<Size_t>, usually defined as C<size_t> in F<config.h>) guaranteed to be large enough to represent the size of any string that perl can handle. @@ -79,7 +79,7 @@ To change the value of an I<already-existing> SV, there are eight routines: void sv_setpvn(SV*, const char*, STRLEN) void sv_setpvf(SV*, const char*, ...); void sv_vsetpvfn(SV*, const char*, STRLEN, va_list *, - SV **, I32, bool *); + SV **, Size_t, bool *); void sv_setsv(SV*, SV*); Notice that you can choose to specify the length of the string to be @@ -103,7 +103,7 @@ important. Note that this function requires you to specify the length of the format. The C<sv_set*()> functions are not generic enough to operate on values -that have "magic". See L<Magic Virtual Tables> later in this document. +that have "magic". See L</Magic Virtual Tables> later in this document. All SVs that contain strings should be terminated with a C<NUL> character. 
If it is not C<NUL>-terminated there is a risk of @@ -186,7 +186,7 @@ sv_insert() or sv_insert_flags(). If you don't need the existing content of the SV, you can avoid some copying with: - sv_setpvn(sv, "", 0); + SvPVCLEAR(sv); s = SvGROW(sv, needlen + 1); /* something that modifies up to needlen bytes at s, but modifies newlen bytes @@ -252,7 +252,7 @@ SV with the string stored in the second SV. It also forces the second SV to be interpreted as a string. The C<sv_cat*()> functions are not generic enough to operate on values that -have "magic". See L<Magic Virtual Tables> later in this document. +have "magic". See L</Magic Virtual Tables> later in this document. If you know the name of a scalar variable, you can get a pointer to its SV by using the following: @@ -282,7 +282,7 @@ But won't work when called as: So to repeat always use SvOK() to check whether an sv is defined. Also you have to be careful when using C<&PL_sv_undef> as a value in -AVs or HVs (see L<AVs, HVs and undefined values>). +AVs or HVs (see L</AVs, HVs and undefined values>). There are also the two values C<PL_sv_yes> and C<PL_sv_no>, which contain boolean TRUE and FALSE values, respectively. Like C<PL_sv_undef>, their @@ -304,7 +304,7 @@ bus error, or just weird results. Change the zero to C<&PL_sv_undef> in the first line and all will be well. To free an SV that you've created, call C<SvREFCNT_dec(SV*)>. Normally this -call is not necessary (see L<Reference Counts and Mortality>). +call is not necessary (see L</Reference Counts and Mortality>). =head2 Offsets @@ -461,7 +461,7 @@ by using the following: This returns NULL if the variable does not exist. -See L<Understanding the Magic of Tied Hashes and Arrays> for more +See L</Understanding the Magic of Tied Hashes and Arrays> for more information on how to use the array access functions on tied arrays. 
=head2 Working with HVs @@ -545,7 +545,7 @@ The exact implementation of this macro varies by architecture and version of perl, and the return value may change per invocation, so the value is only valid for the duration of a single perl process. -See L<Understanding the Magic of Tied Hashes and Arrays> for more +See L</Understanding the Magic of Tied Hashes and Arrays> for more information on how to use the hash access functions on tied hashes. =head2 Hash API Extensions @@ -702,7 +702,7 @@ A reference can be blessed into a package with the following function: The C<sv> argument must be a reference value. The C<stash> argument specifies which class the reference will belong to. See -L<Stashes and Globs> for information on converting class names into stashes. +L</Stashes and Globs> for information on converting class names into stashes. /* Still under construction */ @@ -798,68 +798,116 @@ Perl uses a reference count-driven garbage collection mechanism. SVs, AVs, or HVs (xV for short in the following) start their life with a reference count of 1. If the reference count of an xV ever drops to 0, then it will be destroyed and its memory made available for reuse. - -This normally doesn't happen at the Perl level unless a variable is -undef'ed or the last variable holding a reference to it is changed or -overwritten. At the internal level, however, reference counts can be -manipulated with the following macros: +At the most basic internal level, reference counts can be manipulated +with the following macros: int SvREFCNT(SV* sv); SV* SvREFCNT_inc(SV* sv); void SvREFCNT_dec(SV* sv); -However, there is one other function which manipulates the reference -count of its argument. The C<newRV_inc> function, you will recall, -creates a reference to the specified argument. As a side effect, -it increments the argument's reference count. If this is not what -you want, use C<newRV_noinc> instead. - -For example, imagine you want to return a reference from an XSUB function. 
-Inside the XSUB routine, you create an SV which initially has a reference -count of one. Then you call C<newRV_inc>, passing it the just-created SV. -This returns the reference as a new SV, but the reference count of the -SV you passed to C<newRV_inc> has been incremented to two. Now you -return the reference from the XSUB routine and forget about the SV. -But Perl hasn't! Whenever the returned reference is destroyed, the -reference count of the original SV is decreased to one and nothing happens. -The SV will hang around without any way to access it until Perl itself -terminates. This is a memory leak. - -The correct procedure, then, is to use C<newRV_noinc> instead of -C<newRV_inc>. Then, if and when the last reference is destroyed, -the reference count of the SV will go to zero and it will be destroyed, -stopping any memory leak. +(There are also suffixed versions of the increment and decrement macros, +for situations where the full generality of these basic macros can be +exchanged for some performance.) + +However, the way a programmer should think about references is not so +much in terms of the bare reference count, but in terms of I<ownership> +of references. A reference to an xV can be owned by any of a variety +of entities: another xV, the Perl interpreter, an XS data structure, +a piece of running code, or a dynamic scope. An xV generally does not +know what entities own the references to it; it only knows how many +references there are, which is the reference count. + +To correctly maintain reference counts, it is essential to keep track +of what references the XS code is manipulating. The programmer should +always know where a reference has come from and who owns it, and be +aware of any creation or destruction of references, and any transfers +of ownership. 
Because ownership isn't represented explicitly in the xV +data structures, only the reference count need be actually maintained +by the code, and that means that this understanding of ownership is not +actually evident in the code. For example, transferring ownership of a +reference from one owner to another doesn't change the reference count +at all, so may be achieved with no actual code. (The transferring code +doesn't touch the referenced object, but does need to ensure that the +former owner knows that it no longer owns the reference, and that the +new owner knows that it now does.) + +An xV that is visible at the Perl level should not become unreferenced +and thus be destroyed. Normally, an object will only become unreferenced +when it is no longer visible, often by the same means that makes it +invisible. For example, a Perl reference value (RV) owns a reference to +its referent, so if the RV is overwritten that reference gets destroyed, +and the no-longer-reachable referent may be destroyed as a result. + +Many functions have some kind of reference manipulation as +part of their purpose. Sometimes this is documented in terms +of ownership of references, and sometimes it is (less helpfully) +documented in terms of changes to reference counts. For example, the +L<newRV_inc()|perlapi/newRV_inc> function is documented to create a new RV +(with reference count 1) and increment the reference count of the referent +that was supplied by the caller. This is best understood as creating +a new reference to the referent, which is owned by the created RV, +and returning to the caller ownership of the sole reference to the RV. +The L<newRV_noinc()|perlapi/newRV_noinc> function instead does not +increment the reference count of the referent, but the RV nevertheless +ends up owning a reference to the referent. 
It is therefore implied +that the caller of C<newRV_noinc()> is relinquishing a reference to the +referent, making this conceptually a more complicated operation even +though it does less to the data structures. + +For example, imagine you want to return a reference from an XSUB +function. Inside the XSUB routine, you create an SV which initially +has just a single reference, owned by the XSUB routine. This reference +needs to be disposed of before the routine is complete, otherwise it +will leak, preventing the SV from ever being destroyed. So to create +an RV referencing the SV, it is most convenient to pass the SV to +C<newRV_noinc()>, which consumes that reference. Now the XSUB routine +no longer owns a reference to the SV, but does own a reference to the RV, +which in turn owns a reference to the SV. The ownership of the reference +to the RV is then transferred by the process of returning the RV from +the XSUB. There are some convenience functions available that can help with the destruction of xVs. These functions introduce the concept of "mortality". -An xV that is mortal has had its reference count marked to be decremented, -but not actually decremented, until "a short time later". Generally the -term "short time later" means a single Perl statement, such as a call to -an XSUB function. The actual determinant for when mortal xVs have their -reference count decremented depends on two macros, SAVETMPS and FREETMPS. -See L<perlcall> and L<perlxs> for more details on these macros. - -"Mortalization" then is at its simplest a deferred C<SvREFCNT_dec>. -However, if you mortalize a variable twice, the reference count will -later be decremented twice. - -"Mortal" SVs are mainly used for SVs that are placed on perl's stack. -For example an SV which is created just to pass a number to a called sub -is made mortal to have it cleaned up automatically when it's popped off -the stack. 
Similarly, results returned by XSUBs (which are pushed on the -stack) are often made mortal. - -To create a mortal variable, use the functions: +Much documentation speaks of an xV itself being mortal, but this is +misleading. It is really I<a reference to> an xV that is mortal, and it +is possible for there to be more than one mortal reference to a single xV. +For a reference to be mortal means that it is owned by the temps stack, +one of perl's many internal stacks, which will destroy that reference +"a short time later". Usually the "short time later" is the end of +the current Perl statement. However, it gets more complicated around +dynamic scopes: there can be multiple sets of mortal references hanging +around at the same time, with different death dates. Internally, the +actual determinant for when mortal xV references are destroyed depends +on two macros, SAVETMPS and FREETMPS. See L<perlcall> and L<perlxs> +for more details on these macros. + +Mortal references are mainly used for xVs that are placed on perl's +main stack. The stack is problematic for reference tracking, because it +contains a lot of xV references, but doesn't own those references: they +are not counted. Currently, there are many bugs resulting from xVs being +destroyed while referenced by the stack, because the stack's uncounted +references aren't enough to keep the xVs alive. So when putting an +(uncounted) reference on the stack, it is vitally important to ensure that +there will be a counted reference to the same xV that will last at least +as long as the uncounted reference. But it's also important that that +counted reference be cleaned up at an appropriate time, and not unduly +prolong the xV's life. For there to be a mortal reference is often the +best way to satisfy this requirement, especially if the xV was created +especially to be put on the stack and would otherwise be unreferenced. 
+ +To create a mortal reference, use the functions: SV* sv_newmortal() - SV* sv_2mortal(SV*) SV* sv_mortalcopy(SV*) + SV* sv_2mortal(SV*) -The first call creates a mortal SV (with no value), the second converts an existing -SV to a mortal SV (and thus defers a call to C<SvREFCNT_dec>), and the -third creates a mortal copy of an existing SV. -Because C<sv_newmortal> gives the new SV no value, it must normally be given one -via C<sv_setpv>, C<sv_setiv>, etc. : +C<sv_newmortal()> creates an SV (with the undefined value) whose sole +reference is mortal. C<sv_mortalcopy()> creates an xV whose value is a +copy of a supplied xV and whose sole reference is mortal. C<sv_2mortal()> +mortalises an existing xV reference: it transfers ownership of a reference +from the caller to the temps stack. Because C<sv_newmortal> gives the new +SV no value, it must normally be given one via C<sv_setpv>, C<sv_setiv>, +etc. : SV *tmp = sv_newmortal(); sv_setiv(tmp, an_integer); @@ -868,17 +916,6 @@ As that is multiple C statements it is quite common to see this idiom instead: SV *tmp = sv_2mortal(newSViv(an_integer)); - -You should be careful about creating mortal variables. Strange things -can happen if you make the same value mortal within multiple contexts, -or if you make a variable mortal multiple -times. Thinking of "Mortalization" -as deferred C<SvREFCNT_dec> should help to minimize such problems. -For example if you are passing an SV which you I<know> has a high enough REFCNT -to survive its use on the stack you need not do any mortalization. -If you are not sure then doing an C<SvREFCNT_inc> and C<sv_2mortal>, or -making a C<sv_mortalcopy> is safer. - The mortal routines are not just for SVs; AVs and HVs can be made mortal by passing their address (type-casted to C<SV*>) to the C<sv_2mortal> or C<sv_mortalcopy> routines. @@ -1076,7 +1113,7 @@ to contain an C<SV*> and is stored as-is with its REFCNT incremented. 
The sv_magic function uses C<how> to determine which, if any, predefined "Magic Virtual Table" should be assigned to the C<mg_virtual> field. -See the L<Magic Virtual Tables> section below. The C<how> argument is also +See the L</Magic Virtual Tables> section below. The C<how> argument is also stored in the C<mg_type> field. The value of C<how> should be chosen from the set of macros C<PERL_MAGIC_foo> found in F<perl.h>. Note that before @@ -1087,8 +1124,9 @@ referring to 'U' magic rather than C<PERL_MAGIC_uvar> for example. The C<obj> argument is stored in the C<mg_obj> field of the C<MAGIC> structure. If it is not the same as the C<sv> argument, the reference count of the C<obj> object is incremented. If it is the same, or if -the C<how> argument is C<PERL_MAGIC_arylen>, or if it is a NULL pointer, -then C<obj> is merely stored, without the reference count being incremented. +the C<how> argument is C<PERL_MAGIC_arylen>, C<PERL_MAGIC_regdatum>, +C<PERL_MAGIC_regdata>, or if it is a NULL pointer, then C<obj> is merely +stored, without the reference count being incremented. See also C<sv_magicext> in L<perlapi> for a more flexible way to add magic to an SV. @@ -1123,16 +1161,16 @@ applied to that variable. 
The C<MGVTBL> has five (or sometimes eight) pointers to the following routine types: - int (*svt_get)(SV* sv, MAGIC* mg); - int (*svt_set)(SV* sv, MAGIC* mg); - U32 (*svt_len)(SV* sv, MAGIC* mg); - int (*svt_clear)(SV* sv, MAGIC* mg); - int (*svt_free)(SV* sv, MAGIC* mg); + int (*svt_get) (pTHX_ SV* sv, MAGIC* mg); + int (*svt_set) (pTHX_ SV* sv, MAGIC* mg); + U32 (*svt_len) (pTHX_ SV* sv, MAGIC* mg); + int (*svt_clear)(pTHX_ SV* sv, MAGIC* mg); + int (*svt_free) (pTHX_ SV* sv, MAGIC* mg); - int (*svt_copy)(SV *sv, MAGIC* mg, SV *nsv, + int (*svt_copy) (pTHX_ SV *sv, MAGIC* mg, SV *nsv, const char *name, I32 namlen); - int (*svt_dup)(MAGIC *mg, CLONE_PARAMS *param); - int (*svt_local)(SV *nsv, MAGIC *mg); + int (*svt_dup) (pTHX_ MAGIC *mg, CLONE_PARAMS *param); + int (*svt_local)(pTHX_ SV *nsv, MAGIC *mg); This MGVTBL structure is set at compile-time in F<perl.h> and there are @@ -1233,6 +1271,8 @@ will be lost. v PERL_MAGIC_vec vtbl_vec vec() lvalue w PERL_MAGIC_utf8 vtbl_utf8 Cached UTF-8 information x PERL_MAGIC_substr vtbl_substr substr() lvalue + Y PERL_MAGIC_nonelem vtbl_nonelem Array element that does not + exist y PERL_MAGIC_defelem vtbl_defelem Shadow "foreach" iterator variable / smart parameter vivification @@ -1371,7 +1411,7 @@ creates a second hash which it blesses into the class which will implement the tie methods. Lastly it ties the two hashes together, and returns a reference to the new tied hash. Note that the code below does NOT call the TIEHASH method in the MyTie class - -see L<Calling Perl Routines from within C Programs> for details on how +see L</Calling Perl Routines from within C Programs> for details on how to do this. SV* @@ -1733,7 +1773,7 @@ reuse specially assigned SVs (I<target>s) which are (as a corollary) not constantly freed/created. 
Each of the targets is created only once (but see -L<Scratchpads and recursion> below), and when an opcode needs to put +L</Scratchpads and recursion> below), and when an opcode needs to put an integer, a double, or a string on stack, it just sets the corresponding parts of its I<target> and puts the I<target> on stack. @@ -2674,6 +2714,20 @@ whatever the compiler has. If you are printing addresses of pointers, use UVxf combined with PTR2UV(), do not use %lx or %p. +=head2 Formatted Printing of C<Size_t> and C<SSize_t> + +The most general way to do this is to cast them to a UV or IV, and +print as in the +L<previous section|/Formatted Printing of IVs, UVs, and NVs>. + +But if you're using C<PerlIO_printf()>, it's less typing and visual +clutter to use the C<"%z"> length modifier (for I<siZe>): + + PerlIO_printf("STRLEN is %zu\n", len); + +This modifier is not portable, so its use should be restricted to +C<PerlIO_printf()>. + =head2 Pointer-To-Integer and Integer-To-Pointer Because pointer size does not necessarily equal integer size, @@ -2855,10 +2909,13 @@ so you can test if you need to do something special with this character like this (the C<UTF8_IS_INVARIANT()> is a macro that tests whether the byte is encoded as a single byte even in UTF-8): - U8 *utf; - U8 *utf_end; /* 1 beyond buffer pointed to by utf */ - UV uv; /* Note: a UV, not a U8, not a char */ - STRLEN len; /* length of character in bytes */ + U8 *utf; /* Initialize this to point to the beginning of the + sequence to convert */ + U8 *utf_end; /* Initialize this to 1 beyond the end of the sequence + pointed to by 'utf' */ + UV uv; /* Returned code point; note: a UV, not a U8, not a + char */ + STRLEN len; /* Returned length of character in bytes */ if (!UTF8_IS_INVARIANT(*utf)) /* Must treat this as UTF-8 */ diff --git a/gnu/usr.bin/perl/pod/perlhack.pod b/gnu/usr.bin/perl/pod/perlhack.pod index 4e790633747..a5ac7a6082b 100644 --- a/gnu/usr.bin/perl/pod/perlhack.pod +++ 
b/gnu/usr.bin/perl/pod/perlhack.pod @@ -90,6 +90,16 @@ When prompted, pick a subject that summarizes your changes. The porters appreciate the time you spent helping to make Perl better. Thank you! +=item * Acknowledgement + +All contributors are credited (by name and email address) in the +AUTHORS file, which is part of the perl distribution, as well as the +Git commit history. + +If you don’t want to be included in the AUTHORS file, just let us +know. Otherwise we will take your submission of a patch as permission +to credit you in the AUTHORS file. + =item * Next time The next time you wish to make a patch, you need to start from the @@ -219,6 +229,9 @@ email from our ticket tracking system. This email will give you a ticket number. Once your patch has made it to the ticket tracking system, it will also be sent to the perl5-porters@perl.org list. +If your patch is related to an already-opened ticket you can also +attach your patch to that ticket, without having to use perlbug. + Patches are reviewed and discussed on the p5p list. Simple, uncontroversial patches will usually be applied without any discussion. When the patch is applied, the ticket will be updated and you will @@ -356,11 +369,16 @@ sources: =item * -8-wide tabs (no exceptions!) +4-wide indents for code, 2-wide indents for nested CPP C<#define>s, +with 8-wide tabstops. =item * -4-wide indents for code, 2-wide indents for nested CPP #defines +Use spaces for indentation, not tab characters. + +The codebase is a mixture of tabs and spaces for indentation, and we +are moving to spaces only. Converting lines you're patching from 8-wide +tabs to spaces will help this migration. =item * @@ -1036,6 +1054,55 @@ is broken (for example, the utf8 length cache on long utf8 strings). Add a test that will take a fraction of a second normally, and minutes otherwise, causing the test file to time out on failure. 
+=head2 Building perl at older commits + +In the course of hacking on the Perl core distribution, you may have occasion +to configure, build and test perl at an old commit. Sometimes C<make> will +fail during this process. If that happens, you may be able to salvage the +situation by using the Devel::PatchPerl library from CPAN (not included in the +core) to bring the source code at that commit to a buildable state. + +Here's a real world example, taken from work done to resolve +L<perl #72414|https://rt.perl.org/Ticket/Display.html?id=72414>. +Use of F<Porting/bisect.pl> had identified commit +C<ba77e4cc9d1ceebf472c9c5c18b2377ee47062e6> as the commit in which a bug was +corrected. To confirm, a P5P developer wanted to configure and build perl at +commit C<ba77e4c^> (presumably "bad") and then at C<ba77e4c> (presumably +"good"). Normal configuration and build was attempted: + + $ sh ./Configure -des -Dusedevel + $ make test_prep + +C<make>, however, failed with output (excerpted) like this: + + cc -fstack-protector -L/usr/local/lib -o miniperl \ + gv.o toke.o perly.o pad.o regcomp.o dump.o util.o \ + mg.o reentr.o mro.o hv.o av.o run.o pp_hot.o sv.o \ + pp.o scope.o pp_ctl.o pp_sys.o doop.o doio.o regexec.o \ + utf8.o taint.o deb.o universal.o globals.o perlio.o \ + perlapi.o numeric.o mathoms.o locale.o pp_pack.o pp_sort.o \ + miniperlmain.o opmini.o perlmini.o + pp.o: In function `Perl_pp_pow': + pp.c:(.text+0x2db9): undefined reference to `pow' + ... + collect2: error: ld returned 1 exit status + makefile:348: recipe for target 'miniperl' failed + make: *** [miniperl] Error 1 + +Another P5P contributor recommended installation and use of Devel::PatchPerl +for this situation, first to determine the version of perl at the commit in +question, then to patch the source code at that point to facilitate a build. 
+ + $ perl -MDevel::PatchPerl -e \ + 'print Devel::PatchPerl->determine_version("/path/to/sourcecode"), "\n";' + 5.11.1 + $ perl -MDevel::PatchPerl -e \ + 'Devel::PatchPerl->patch_source("5.11.1", "/path/to/sourcecode");' + +Once the source was patched, C<./Configure> and C<make test_prep> were called +and completed successfully, enabling confirmation of the findings in RT +#72414. + =head1 MORE READING FOR GUTS HACKERS To hack on the Perl guts, you'll need to read the following things: @@ -1102,11 +1169,11 @@ wanting to go about Perl development. =head1 CPAN TESTERS AND PERL SMOKERS -The CPAN testers ( http://testers.cpan.org/ ) are a group of volunteers +The CPAN testers ( L<http://testers.cpan.org/> ) are a group of volunteers who test CPAN modules on a variety of platforms. -Perl Smokers ( http://www.nntp.perl.org/group/perl.daily-build/ and -http://www.nntp.perl.org/group/perl.daily-build.reports/ ) +Perl Smokers ( L<http://www.nntp.perl.org/group/perl.daily-build/> and +L<http://www.nntp.perl.org/group/perl.daily-build.reports/> ) automatically test Perl source releases on platforms with various configurations. diff --git a/gnu/usr.bin/perl/pod/perlhacktips.pod b/gnu/usr.bin/perl/pod/perlhacktips.pod index 1dd715a0bbb..cbf1d3c9f62 100644 --- a/gnu/usr.bin/perl/pod/perlhacktips.pod +++ b/gnu/usr.bin/perl/pod/perlhacktips.pod @@ -20,8 +20,7 @@ to do that first. =head1 COMMON PROBLEMS -Perl source plays by ANSI C89 rules: no C99 (or C++) extensions. In -some cases we have to take pre-ANSI requirements into consideration. +Perl source plays by ANSI C89 rules: no C99 (or C++) extensions. You don't care about some particular platform having broken Perl? I hear there is still a strong demand for J2EE programmers. @@ -134,7 +133,7 @@ Use the Configure C<-Dgccansipedantic> flag to enable the gcc C<-ansi -pedantic> flags which enforce stricter ANSI rules. 
If using the C<gcc -Wall> note that not all the possible warnings (like -C<-Wunitialized>) are given unless you also compile with C<-O>. +C<-Wuninitialized>) are given unless you also compile with C<-O>. Note that if using gcc, starting from Perl 5.9.5 the Perl core source code files (the ones at the top level of the source code distribution, @@ -516,6 +515,9 @@ Or you can try casting to a "wide enough" type: printf("i = %"IVdf"\n", (IV)something_very_small_and_signed); +See L<perlguts/Formatted Printing of Size_t and SSize_t> for how to +print those. + Also remember that the C<%p> format really does require a void pointer: U8* p = ...; @@ -733,28 +735,39 @@ happened, or how did we end up having wrong or unexpected results. To really poke around with Perl, you'll probably want to build Perl for debugging, like this: - ./Configure -d -D optimize=-g + ./Configure -d -DDEBUGGING make -C<-g> is a flag to the C compiler to have it produce debugging -information which will allow us to step through a running program, and -to see in which C function we are at (without the debugging information -we might see only the numerical addresses of the functions, which is -not very helpful). - -F<Configure> will also turn on the C<DEBUGGING> compilation symbol -which enables all the internal debugging code in Perl. There are a -whole bunch of things you can debug with this: L<perlrun> lists them -all, and the best way to find out about them is to play about with -them. The most useful options are probably +C<-DDEBUGGING> turns on the C compiler's C<-g> flag to have it produce +debugging information which will allow us to step through a running +program, and to see in which C function we are at (without the debugging +information we might see only the numerical addresses of the functions, +which is not very helpful). It will also turn on the C<DEBUGGING> +compilation symbol which enables all the internal debugging code in Perl. 
+There are a whole bunch of things you can debug with this: L<perlrun> +lists them all, and the best way to find out about them is to play about +with them. The most useful options are probably l Context (loop) stack processing + s Stack snapshots (with v, displays all stacks) t Trace execution o Method and overloading resolution c String/numeric conversions -Some of the functionality of the debugging code can be achieved using -XS modules. +For example + + $ perl -Dst -e '$a + 1' + .... + (-e:1) gvsv(main::a) + => UNDEF + (-e:1) const(IV(1)) + => UNDEF IV(1) + (-e:1) add + => NV(1) + + +Some of the functionality of the debugging code can be achieved with a +non-debugging perl by using XS modules: -Dr => use re 'debug' -Dx => use O 'Debug' @@ -1023,23 +1036,19 @@ and looking at the resulting graph, what does it tell about the execution and data flows. As a matter of fact, this is exactly how C compilers know to give warnings about dubious code. -=head2 lint, splint +=head2 lint The good old C code quality inspector, C<lint>, is available in several platforms, but please be aware that there are several different implementations of it by different vendors, which means that the flags are not identical across different platforms. -There is a lint variant called C<splint> (Secure Programming Lint) -available from http://www.splint.org/ that should compile on any -Unix-like platform. - -There are C<lint> and <splint> targets in Makefile, but you may have to +There is a C<lint> target in Makefile, but you may have to diddle with the flags (see above). =head2 Coverity -Coverity (http://www.coverity.com/) is a product similar to lint and as +Coverity (L<http://www.coverity.com/>) is a product similar to lint and as a testbed for their product they periodically check several open source projects, and they give out accounts to open source developers to the defect databases. @@ -1062,8 +1071,8 @@ cut-and-pasted code changes, all the other spots should probably be changed, too. 
Therefore such code should probably be turned into a subroutine or a macro. -cpd (http://pmd.sourceforge.net/cpd.html) is part of the pmd project -(http://pmd.sourceforge.net/). pmd was originally written for static +cpd (L<http://pmd.sourceforge.net/cpd.html>) is part of the pmd project +(L<http://pmd.sourceforge.net/>). pmd was originally written for static analysis of Java code, but later the cpd part of it was extended to parse also C and C++. @@ -1096,7 +1105,7 @@ being a prime example). If Configure C<-Dgccansipedantic> is used, the C<cflags> frontend selects C<-ansi -pedantic> for the platforms where they are known to be safe. -Starting from Perl 5.9.4 the following extra flags are added: +The following extra flags are added: =over 4 @@ -1110,7 +1119,19 @@ C<-Wextra> =item * -C<-Wdeclaration-after-statement> +C<-Wc++-compat> + +=item * + +C<-Wwrite-strings> + +=item * + +C<-Werror=declaration-after-statement> + +=item * + +C<-Werror=pointer-arith> =back @@ -1121,10 +1142,6 @@ their own Augean stablemaster: =item * -C<-Wpointer-arith> - -=item * - C<-Wshadow> =item * @@ -1419,7 +1436,8 @@ documentation for more information. Also, spawned threads do the equivalent of setting this variable to the value 1.) If, at the end of a run you get the message I<N scalars leaked>, you -can recompile with C<-DDEBUG_LEAKING_SCALARS>, which will cause the +can recompile with C<-DDEBUG_LEAKING_SCALARS>, +(C<Configure -Accflags=-DDEBUG_LEAKING_SCALARS>), which will cause the addresses of all those leaked SVs to be dumped along with details as to where each SV was originally allocated. This information is also displayed by Devel::Peek. Note that the extra details recorded with @@ -1617,8 +1635,10 @@ bugs in the past. =head2 When is a bool not a bool? On pre-C99 compilers, C<bool> is defined as equivalent to C<char>. -Consequently assignment of any larger type to a C<bool> is unsafe and may -be truncated. The C<cBOOL> macro exists to cast it correctly. 
+Consequently assignment of any larger type to a C<bool> is unsafe and may be +truncated. The C<cBOOL> macro exists to cast it correctly; you may also find +that using it is shorter and clearer than writing out the equivalent +conditional expression longhand. On those platforms and compilers where C<bool> really is a boolean (C++, C99), it is easy to forget the cast. You can force C<bool> to be a C<char> @@ -1630,6 +1650,10 @@ run C<Configure> with something like or your compiler's equivalent to make it easier to spot any unsafe truncations that show up. +The C<TRUE> and C<FALSE> macros are available for situations where using them +would clarify intent. (But they always just mean the same as the integers 1 and +0 regardless, so using them isn't compulsory.) + =head2 The .i Targets You can expand the macros in a F<foo.c> file by saying diff --git a/gnu/usr.bin/perl/pod/perlhist.pod b/gnu/usr.bin/perl/pod/perlhist.pod index b67040e796c..fc10ad87511 100644 --- a/gnu/usr.bin/perl/pod/perlhist.pod +++ b/gnu/usr.bin/perl/pod/perlhist.pod @@ -34,7 +34,7 @@ Chris C<BinGOs> Williams, Zefram, Ævar Arnfjörð Bjarmason, Stevan Little, Dave Rolsky, Max Maischein, Abigail, Jesse Luehrs, Tony Cook, Dominic Hargreaves, Aaron Crane, Aristotle Pagaltzis, Matthew Horsfall, Peter Martini, Sawyer X, Chad 'Exodist' Granum, Renee Bäcker, Eric Herman, -and John SJ Anderson. +John SJ Anderson, and Karen Etheridge. =head2 PUMPKIN? @@ -604,6 +604,8 @@ the strings?). Steve 5.24.2 2017-Jul-15 Steve 5.24.3-RC1 2017-Sep-10 Steve 5.24.3 2017-Sep-22 + Steve 5.24.4-RC1 2018-Mar-24 + Steve 5.24.4 2018-Apr-14 Ricardo 5.25.0 2016-May-09 The 5.25 development track Sawyer X 5.25.1 2016-May-20 @@ -624,12 +626,36 @@ the strings?). 
Sawyer X 5.26.0 2017-May-30 Steve 5.26.1-RC1 2017-Sep-10 Steve 5.26.1 2017-Sep-22 + Steve 5.26.2-RC1 2018-Mar-24 + Steve 5.26.2 2018-Apr-14 + Steve 5.26.3-RC1 2018-Nov-08 + Steve 5.26.3 2018-Nov-29 Sawyer X 5.27.0 2017-May-31 The 5.27 development track Eric 5.27.1 2017-Jun-20 Aaron 5.27.2 2017-Jul-20 Matthew H 5.27.3 2017-Aug-21 John 5.27.4 2017-Sep-20 + Steve 5.27.5 2017-Oct-20 + Ether 5.27.6 2017-Nov-20 + BinGOs 5.27.7 2017-Dec-20 + Abigail 5.27.8 2018-Jan-20 + Renee 5.27.9 2018-Feb-20 + Todd 5.27.10 2018-Mar-20 + Sawyer X 5.27.11 2018-Apr-20 + + Sawyer X 5.28.0-RC1 2018-May-21 The 5.28 maintenance track + Sawyer X 5.28.0-RC2 2018-Jun-06 + Sawyer X 5.28.0-RC3 2018-Jun-18 + Sawyer X 5.28.0-RC4 2018-Jun-19 + Sawyer X 5.28.0 2018-Jun-22 + Steve 5.28.1-RC1 2018-Nov-08 + Steve 5.28.1 2018-Nov-29 + + Sawyer X 5.29.0 2018-Jun-26 The 5.29 development track + Steve 5.29.1 2018-Jul-20 + BinGOs 5.29.2 2018-Aug-20 + John 5.29.3 2018-Sep-20 =head2 SELECTED RELEASE SIZES @@ -714,6 +740,7 @@ explained below. 5.22.0 7819 115 1284 77 19121 2635 9772 2434 5615 176 5.24.0 7922 113 1287 77 19535 2677 9994 2465 5702 177 5.26.0 9140 121 24925 1200 40643 3017 10514 2614 7854 211 + 5.28.0 13056 128 27267 1230 41745 3130 10952 2715 8185 218 The "core"..."doc" mean the following files from the Perl source code distribution. The glob notation ** means recursively, (.) means @@ -1040,25 +1067,25 @@ the Perl source distribution for somewhat more selected releases. 
====================================================================== - 5.26.0 - - Configure 593 1 - Cross 122 15 - djgpp 21 7 - h2pl 24 15 - hints 376 87 - mad - - - NetWare 499 61 - os2 552 70 - plan9 322 17 - Porting 1380 73 - qnx 5 4 - symbian 315 54 - utils 578 50 - vms 527 12 - vos 12 7 - win32 1313 65 - x2p - - + 5.26.0 5.28.0 + + Configure 593 1 580 1 + Cross 122 15 125 15 + djgpp 21 7 21 7 + h2pl 24 15 24 15 + hints 376 87 364 85 + mad - - - - + NetWare 499 61 493 61 + os2 552 70 552 70 + plan9 322 17 309 17 + Porting 1380 73 1462 75 + qnx 5 4 5 4 + symbian 315 54 315 54 + utils 578 50 584 50 + vms 527 12 526 12 + vos 12 7 12 7 + win32 1313 65 1326 65 + x2p - - - - =head2 SELECTED PATCH SIZES diff --git a/gnu/usr.bin/perl/pod/perlintro.pod b/gnu/usr.bin/perl/pod/perlintro.pod index 9559cb1d12d..5c168c18e1e 100644 --- a/gnu/usr.bin/perl/pod/perlintro.pod +++ b/gnu/usr.bin/perl/pod/perlintro.pod @@ -675,7 +675,7 @@ in using third-party modules, which are documented below. =head2 Using Perl modules Perl modules provide a range of features to help you avoid reinventing -the wheel, and can be downloaded from CPAN ( http://www.cpan.org/ ). A +the wheel, and can be downloaded from CPAN ( L<http://www.cpan.org/> ). A number of popular modules are included with the Perl distribution itself. diff --git a/gnu/usr.bin/perl/pod/perliol.pod b/gnu/usr.bin/perl/pod/perliol.pod index 55aaf147f7f..b70a510aadd 100644 --- a/gnu/usr.bin/perl/pod/perliol.pod +++ b/gnu/usr.bin/perl/pod/perliol.pod @@ -505,6 +505,14 @@ arguments passed to them, I<n> is the index into that array of the layer being called. The macro C<PerlIOArg> will return a (possibly C<NULL>) SV * for the argument passed to the layer. +Where a layer opens or takes ownership of a file descriptor, that layer is +responsible for getting the file descriptor's close-on-exec flag into the +correct state. 
The flag should be clear for a file descriptor numbered +less than or equal to C<PL_maxsysfd>, and set for any file descriptor +numbered higher. For thread safety, when a layer opens a new file +descriptor it should if possible open it with the close-on-exec flag +initially set. + The I<mode> string is an "C<fopen()>-like" string which would match the regular expression C</^[I#]?[rwa]\+?[bt]?$/>. @@ -525,6 +533,9 @@ If I<fd> not negative then it is the numeric file descriptor I<fd>, which will be open in a manner compatible with the supplied mode string, the call is thus equivalent to C<PerlIO_fdopen>. In this case I<nargs> will be zero. +The file descriptor may have the close-on-exec flag either set or clear; +it is the responsibility of the layer that takes ownership of it to get +the flag into the correct state. If I<nargs> is greater than zero then it gives the number of arguments passed to C<open>, otherwise it will be 1 if for example diff --git a/gnu/usr.bin/perl/pod/perlipc.pod b/gnu/usr.bin/perl/pod/perlipc.pod index e3b74a55b95..902655dee6a 100644 --- a/gnu/usr.bin/perl/pod/perlipc.pod +++ b/gnu/usr.bin/perl/pod/perlipc.pod @@ -67,8 +67,8 @@ so it doesn't kill itself: # block scope for local { local $SIG{HUP} = "IGNORE"; - kill HUP => -$$; - # snazzy writing of: kill("HUP", -$$) + kill HUP => -getpgrp(); + # snazzy writing of: kill("HUP", -getpgrp()) } Another interesting signal to send is signal number zero. This doesn't diff --git a/gnu/usr.bin/perl/pod/perllocale.pod b/gnu/usr.bin/perl/pod/perllocale.pod index 018f916231f..a32f72c88e8 100644 --- a/gnu/usr.bin/perl/pod/perllocale.pod +++ b/gnu/usr.bin/perl/pod/perllocale.pod @@ -22,9 +22,14 @@ these kinds of matters is called B<internationalization> (often abbreviated as B<i18n>); telling such an application about a particular set of preferences is known as B<localization> (B<l10n>). -Perl has been extended to support the locale system. 
This -is controlled per application by using one pragma, one function call, -and several environment variables. +Perl has been extended to support certain types of locales available in +the locale system. This is controlled per application by using one +pragma, one function call, and several environment variables. + +Perl supports single-byte locales that are supersets of ASCII, such as +the ISO 8859 ones, and one multi-byte-type locale, UTF-8 ones, described +in the next paragraph. Perl doesn't support any other multi-byte +locales, such as the ones for East Asian languages. Unfortunately, there are quite a few deficiencies with the design (and often, the implementations) of locales. Unicode was invented (see @@ -33,9 +38,11 @@ design deficiencies, and nowadays, there is a series of "UTF-8 locales", based on Unicode. These are locales whose character set is Unicode, encoded in UTF-8. Starting in v5.20, Perl fully supports UTF-8 locales, except for sorting and string comparisons like C<lt> and -C<ge>. (Use L<Unicode::Collate> for these.) Perl continues to support -the old non UTF-8 locales as well. There are currently no UTF-8 locales -for EBCDIC platforms. +C<ge>. Starting in v5.26, Perl can handle these reasonably as well, +depending on the platform's implementation. However, for earlier +releases or for better control, use L<Unicode::Collate>. Perl continues to +support the old non UTF-8 locales as well. There are currently no UTF-8 +locales for EBCDIC platforms. (Unicode is also creating C<CLDR>, the "Common Locale Data Repository", L<http://cldr.unicode.org/> which includes more types of information than @@ -142,7 +149,7 @@ C<define>. 
If you want a Perl application to process and present your data according to a particular locale, the application code should include -the S<C<use locale>> pragma (see L<The "use locale" pragma>) where +the S<C<use locale>> pragma (see L</The "use locale" pragma>) where appropriate, and B<at least one> of the following must be true: =over 4 @@ -156,7 +163,7 @@ by yourself or by whomever set up your system account; or =item 2 B<The application must set its own locale> using the method described in -L<The setlocale function>. +L</The setlocale function>. =back @@ -164,14 +171,17 @@ L<The setlocale function>. =head2 The C<"use locale"> pragma -WARNING! Do NOT use this pragma in scripts that have multiple -L<threads|threads> active. The locale is not local to a single thread. -Another thread may change the locale at any time, which could cause at a -minimum that a given thread is operating in a locale it isn't expecting -to be in. On some platforms, segfaults can also occur. The locale -change need not be explicit; some operations cause perl to change the -locale itself. You are vulnerable simply by having done a C<"use -locale">. +Starting in Perl 5.28, this pragma may be used in +L<multi-threaded|threads> applications on systems that have thread-safe +locale ability. Some caveats apply, see L</Multi-threaded> below. On +systems without this capability, or in earlier Perls, do NOT use this +pragma in scripts that have multiple L<threads|threads> active. The +locale in these cases is not local to a single thread. Another thread +may change the locale at any time, which could cause at a minimum that a +given thread is operating in a locale it isn't expecting to be in. On +some platforms, segfaults can also occur. The locale change need not be +explicit; some operations cause perl to change the locale itself. You +are vulnerable simply by having done a S<C<"use locale">>. By default, Perl itself (outside the L<POSIX> module) ignores the current locale. 
The S<C<use locale>> @@ -196,8 +206,8 @@ The operations that are affected by locale are: =item B<Not within the scope of C<"use locale">> -Only certain operations originating outside Perl should be affected, as -follows: +Only certain operations (all originating outside Perl) should be +affected, as follows: =over 4 @@ -218,6 +228,8 @@ C<POSIX::strxfrm()> use C<LC_COLLATE>. All such functions will behave according to the current underlying locale, even if that locale isn't exposed to Perl space. +This applies as well to L<I18N::Langinfo>. + =item * XS modules for all categories but C<LC_NUMERIC> get the underlying @@ -334,7 +346,7 @@ the outer scope's rules at the end of the inner scope. The string result of any operation that uses locale information is tainted, as it is possible for a locale to be -untrustworthy. See L<"SECURITY">. +untrustworthy. See L</"SECURITY">. Starting in Perl v5.16 in a very limited way, and more generally in v5.22, you can restrict which category or categories are enabled by this @@ -359,8 +371,8 @@ will be locale aware. Everything else is unaffected. Since Perl doesn't currently do anything with the C<LC_MONETARY> category, specifying C<:monetary> does effectively nothing. Some -systems have other categories, such as C<LC_PAPER_SIZE>, but Perl -also doesn't know anything about them, and there is no way to specify +systems have other categories, such as C<LC_PAPER>, but Perl +also doesn't do anything with them, and there is no way to specify them in this pragma's arguments. You can also easily say to use all categories but one, by either, for @@ -398,12 +410,13 @@ this, as described in L</Unicode and UTF-8>. =head2 The setlocale function -WARNING! Do NOT use this function in a L<thread|threads>. The locale -will change in all other threads at the same time, and should your -thread get paused by the operating system, and another started, that -thread will not have the locale it is expecting. 
On some platforms, -there can be a race leading to segfaults if two threads call this -function nearly simultaneously. +WARNING! Prior to Perl 5.28 or on a system that does not support +thread-safe locale operations, do NOT use this function in a +L<thread|threads>. The locale will change in all other threads at the +same time, and should your thread get paused by the operating system, +and another started, that thread will not have the locale it is +expecting. On some platforms, there can be a race leading to segfaults +if two threads call this function nearly simultaneously. You can switch locales as often as you wish at run time with the C<POSIX::setlocale()> function: @@ -476,9 +489,59 @@ If C<set_locale()> fails for some reason (for example, an attempt to set to a locale unknown to the system), the locale for the category is not changed, and the function returns C<undef>. +Starting in Perl 5.28, on multi-threaded perls compiled on systems that +implement POSIX 2008 thread-safe locale operations, this function +doesn't actually call the system C<setlocale>. Instead those +thread-safe operations are used to emulate the C<setlocale> function, +but in a thread-safe manner. For further information about the categories, consult L<setlocale(3)>. +=head2 Multi-threaded operation + +Beginning in Perl 5.28, multi-threaded locale operation is supported on +systems that implement either the POSIX 2008 or Windows-specific +thread-safe locale operations. Many modern systems, such as various +Unix variants and Darwin do have this. + +You can tell if using locales is safe on your system by looking at the +read-only boolean variable C<${^SAFE_LOCALES}>. The value is 1 if the +perl is not threaded, or if it is using thread-safe locale operations. + +Thread-safe operations are supported in Windows starting in Visual Studio +2005, and in systems compatible with POSIX 2008. 
Some platforms claim +to support POSIX 2008, but have buggy implementations, so that the hints +files for compiling to run on them turn off attempting to use +thread-safety. C<${^SAFE_LOCALES}> will be 0 on them. + +Be aware that writing a multi-threaded application will not be portable +to a platform which lacks the native thread-safe locale support. On +systems that do have it, you automatically get this behavior for +threaded perls, without having to do anything. If for some reason, you +don't want to use this capability (perhaps the POSIX 2008 support is +buggy on your system), you can manually compile Perl to use the old +non-thread-safe implementation by passing the argument +C<-Accflags='-DNO_THREAD_SAFE_LOCALE'> to F<Configure>. +Except on Windows, this will continue to use certain of the POSIX 2008 +functions in some situations. If these are buggy, you can pass the +following to F<Configure> instead or additionally: +C<-Accflags='-DNO_POSIX_2008_LOCALE'>. This will also keep the code +from using thread-safe locales. +C<${^SAFE_LOCALES}> will be 0 on systems that turn off the thread-safe +operations. + +The initial program is started up using the locale specified from the +environment, as currently described in L</ENVIRONMENT>. All newly +created threads start with C<LC_ALL> set to C<"C">. Each thread may +use C<POSIX::setlocale()> to query or switch its locale at any time, +without affecting any other thread. All locale-dependent operations +automatically use their thread's locale. + +This should be completely transparent to any applications written +entirely in Perl (minus a few rarely encountered caveats given in the +L</Multi-threaded> section). Information for XS module writers is given +in L<perlxs/Locale-aware XS code>. 
+ =head2 Finding locales For locales available in your system, consult also L<setlocale(3)> to @@ -590,8 +653,8 @@ than the C<PERL_BADLANG> approach, but setting C<LC_ALL> (or other locale variables) may affect other programs as well, not just Perl. In particular, external programs run from within Perl will see these changes. If you make the new settings permanent (read on), all -programs you run see the changes. See L<"ENVIRONMENT"> for -the full list of relevant environment variables and L<USING LOCALES> +programs you run see the changes. See L</"ENVIRONMENT"> for +the full list of relevant environment variables and L</"USING LOCALES"> for their effects in Perl. Effects in other programs are easily deducible. For example, the variable C<LC_COLLATE> may well affect your B<sort> program (or whatever the program that arranges "records" @@ -625,7 +688,7 @@ fix the misconfiguration of your own environment variables. The mis(sing)configuration of the whole system's locales usually requires the help of your friendly system administrator. -First, see earlier in this document about L<Finding locales>. That tells +First, see earlier in this document about L</Finding locales>. That tells how to find which locales are really supported--and more importantly, installed--on your system. In our example error message, environment variables affecting the locale are listed in the order of decreasing @@ -637,7 +700,7 @@ Second, if using the listed commands you see something B<exactly> (prefix matches do not count and case usually counts) like "En_US" without the quotes, then you should be okay because you are using a locale name that should be installed and available in your system. -In this case, see L<Permanently fixing your system's locale configuration>. +In this case, see L</Permanently fixing your system's locale configuration>. =head2 Permanently fixing your system's locale configuration @@ -654,14 +717,14 @@ the same. 
In this case, try running under a locale that you can list and which somehow matches what you tried. The rules for matching locale names are a bit vague because standardization is weak in this area. See again the -L<Finding locales> about general rules. +L</Finding locales> about general rules. =head2 Fixing system locale configuration Contact a system administrator (preferably your own) and report the exact error message you get, and ask them to read this same documentation you are now reading. They should be able to check whether there is something -wrong with the locale configuration of the system. The L<Finding locales> +wrong with the locale configuration of the system. The L</Finding locales> section is unfortunately a bit vague about the exact commands and places because these things are not that standardized. @@ -673,7 +736,7 @@ underlying C<LC_NUMERIC> and C<LC_MONETARY> locales (regardless of whether called from within the scope of C<S<use locale>> or not). (If you just want the name of the current locale for a particular category, use C<POSIX::setlocale()> -with a single parameter--see L<The setlocale function>.) +with a single parameter--see L</The setlocale function>.) use POSIX qw(locale_h); @@ -739,8 +802,7 @@ hash will be missing. =head2 I18N::Langinfo Another interface for querying locale-dependent information is the -C<I18N::Langinfo::langinfo()> function, available at least in Unix-like -systems and VMS. +C<I18N::Langinfo::langinfo()> function. The following example will import the C<langinfo()> function itself and three constants to be used as arguments to C<langinfo()>: a constant for @@ -766,9 +828,9 @@ See L<I18N::Langinfo> for more information. The following subsections describe basic locale categories. Beyond these, some combination categories allow manipulation of more than one -basic category at a time. See L<"ENVIRONMENT"> for a discussion of these. +basic category at a time. See L</"ENVIRONMENT"> for a discussion of these. 
-=head2 Category C<LC_COLLATE>: Collation +=head2 Category C<LC_COLLATE>: Collation: Text Comparisons and Sorting In the scope of a S<C<use locale>> form that includes collation, Perl looks to the C<LC_COLLATE> @@ -802,7 +864,7 @@ locale>> has appeared earlier in the same block) must be used for sorting raw binary data, whereas the locale-dependent collation of the first example is useful for natural text. -As noted in L<USING LOCALES>, C<cmp> compares according to the current +As noted in L</USING LOCALES>, C<cmp> compares according to the current collation locale when C<use locale> is in effect, but falls back to a char-by-char comparison for strings that the locale says are equal. You can use C<POSIX::strcoll()> if you don't want this fall-back: @@ -815,10 +877,31 @@ C<$equal_in_locale> will be true if the collation locale specifies a dictionary-like ordering that ignores space characters completely and which folds case. -Perl currently only supports single-byte locales for C<LC_COLLATE>. This means -that a UTF-8 locale likely will just give you machine-native ordering. -Use L<Unicode::Collate> for the full implementation of the Unicode -Collation Algorithm. +Perl uses the platform's C library collation functions C<strcoll()> and +C<strxfrm()>. That means you get whatever they give. On some +platforms, these functions work well on UTF-8 locales, giving +a reasonable default collation for the code points that are important in +that locale. (And if they aren't working well, the problem may only be +that the locale definition is deficient, so can be fixed by using a +better definition file. Unicode's definitions (see L</Freely available +locale definitions>) provide reasonable UTF-8 locale collation +definitions.) Starting in Perl v5.26, Perl's use of these functions has +been made more seamless. This may be sufficient for your needs. 
For +more control, and to make sure strings containing any code point (not +just the ones important in the locale) collate properly, the +L<Unicode::Collate> module is suggested. + +In non-UTF-8 locales (hence single byte), code points above 0xFF are +technically invalid. But if present, again starting in v5.26, they will +collate to the same position as the highest valid code point does. This +generally gives good results, but the collation order may be skewed if +the valid code point gets special treatment when it forms particular +sequences with other characters as defined by the locale. +When two strings collate identically, the code point order is used as a +tie breaker. + +If Perl detects that there are problems with the locale collation order, +it reverts to using non-locale collation rules for that locale. If you have a single string that you want to check for "equality in locale" against several others, you might think you could gain a little @@ -845,7 +928,7 @@ string the first time it's needed in a comparison, then keeps this version aroun in case it's needed again. An example rewritten the easy way with C<cmp> runs just about as fast. It also copes with null characters embedded in strings; if you call C<strxfrm()> directly, it treats the first -null it finds as a terminator. don't expect the transformed strings +null it finds as a terminator. Don't expect the transformed strings it produces to be portable across systems--or even from one revision of your operating system to the next. In short, don't call C<strxfrm()> directly: let Perl do it for you. @@ -877,17 +960,18 @@ characters between lower and uppercase. 
This affects the case-mapping functions--C<fc()>, C<lc()>, C<lcfirst()>, C<uc()>, and C<ucfirst()>; case-mapping interpolation with C<\F>, C<\l>, C<\L>, C<\u>, or C<\U> in double-quoted -strings and C<s///> substitutions; and case-independent regular expression +strings and C<s///> substitutions; and case-insensitive regular expression pattern matching using the C<i> modifier. Starting in v5.20, Perl supports UTF-8 locales for C<LC_CTYPE>, but otherwise Perl only supports single-byte locales, such as the ISO 8859 series. This means that wide character locales, for example for Asian -languages, are not well-supported. (If the platform has the capability -for Perl to detect such a locale, starting in Perl v5.22, -L<Perl will warn, default enabled|warnings/Category Hierarchy>, -using the C<locale> warning category, whenever such a locale is switched -into.) The UTF-8 locale support is actually a +languages, are not well-supported. Use of these locales may cause core +dumps. If the platform has the capability for Perl to detect such a +locale, starting in Perl v5.22, L<Perl will warn, default +enabled|warnings/Category Hierarchy>, using the C<locale> warning +category, whenever such a locale is switched into. The UTF-8 locale +support is actually a superset of POSIX locales, because it is really full Unicode behavior as if no C<LC_CTYPE> locale were in effect at all (except for tainting; see L</SECURITY>). POSIX locales, even UTF-8 ones, @@ -926,7 +1010,7 @@ B<Note:> A broken or malicious C<LC_CTYPE> locale definition may result in clearly ineligible characters being considered to be alphanumeric by your application. For strict matching of (mundane) ASCII letters and digits--for example, in command strings--locale-aware applications -should use C<\w> with the C</a> regular expression modifier. See L<"SECURITY">. +should use C<\w> with the C</a> regular expression modifier. See L</"SECURITY">. 
=head2 Category C<LC_NUMERIC>: Numeric Formatting @@ -937,7 +1021,7 @@ of how numbers should be formatted for human readability. In most implementations the only effect is to change the character used for the decimal point--perhaps from "." to ",". The functions aren't aware of such niceties as thousands separation and -so on. (See L<The localeconv function> if you care about these things.) +so on. (See L</The localeconv function> if you care about these things.) use POSIX qw(strtod setlocale LC_NUMERIC); use locale; @@ -964,7 +1048,7 @@ that is affected by its contents. (Those with experience of standards committees will recognize that the working group decided to punt on the issue.) Consequently, Perl essentially takes no notice of it. If you really want to use C<LC_MONETARY>, you can query its contents--see -L<The localeconv function>--and use the information that it returns in your +L</The localeconv function>--and use the information that it returns in your application's own formatting of currency amounts. However, you may well find that the information, voluminous and complex though it may be, still does not quite meet your requirements: currency formatting is a hard nut @@ -972,7 +1056,7 @@ to crack. See also L<I18N::Langinfo> and C<CRNCYSTR>. -=head2 C<LC_TIME> +=head2 Category C<LC_TIME>: Representation of time Output produced by C<POSIX::strftime()>, which builds a formatted human-readable date/time string, is affected by the current C<LC_TIME> @@ -1337,10 +1421,10 @@ You could also exclude C<LC_NUMERIC>, if you don't need it, by Versions of Perl prior to 5.004 B<mostly> ignored locale information, generally behaving as if something similar to the C<"C"> locale were always in force, even if the program environment suggested otherwise -(see L<The setlocale function>). By default, Perl still behaves this +(see L</The setlocale function>). By default, Perl still behaves this way for backward compatibility. 
If you want a Perl application to pay attention to locale information, you B<must> use the S<C<use locale>> -pragma (see L<The "use locale" pragma>) or, in the unlikely event +pragma (see L</The "use locale" pragma>) or, in the unlikely event that you want to do so for just pattern matching, the C</l> regular expression modifier (see L<perlre/Character set modifiers>) to instruct it to do so. @@ -1402,21 +1486,22 @@ the same way, "localization" is often abbreviated to B<l10n>. =head2 An imperfect standard Internationalization, as defined in the C and POSIX standards, can be -criticized as incomplete, ungainly, and having too large a granularity. -(Locales apply to a whole process, when it would arguably be more useful -to have them apply to a single thread, window group, or whatever.) They -also have a tendency, like standards groups, to divide the world into -nations, when we all know that the world can equally well be divided -into bankers, bikers, gamers, and so on. +criticized as incomplete and ungainly. They also have a tendency, like +standards groups, to divide the world into nations, when we all know +that the world can equally well be divided into bankers, bikers, gamers, +and so on. =head1 Unicode and UTF-8 The support of Unicode is new starting from Perl version v5.6, and more fully implemented in versions v5.8 and later. See L<perluniintro>. -Starting in Perl v5.20, UTF-8 locales are supported in Perl, except for -C<LC_COLLATE> (use L<Unicode::Collate> instead). If you have Perl v5.16 -or v5.18 and can't upgrade, you can use +Starting in Perl v5.20, UTF-8 locales are supported in Perl, except +C<LC_COLLATE> is only partially supported; collation support is improved +in Perl v5.26 to a level that may be sufficient for your needs +(see L</Category C<LC_COLLATE>: Collation: Text Comparisons and Sorting>). + +If you have Perl v5.16 or v5.18 and can't upgrade, you can use use locale ':not_characters'; @@ -1442,10 +1527,7 @@ command line switch. 
This form of the pragma allows essentially seamless handling of locales with Unicode. The collation order will be by Unicode code point order. -It is strongly -recommended that when you need to order and sort strings that you use -the standard module L<Unicode::Collate> which gives much better results -in many instances than you can get with the old-style locale handling. +L<Unicode::Collate> can be used to get Unicode rules collation. All the modules and switches just described can be used in v5.20 with just plain C<use locale>, and, should the input locales not be UTF-8, @@ -1499,7 +1581,7 @@ for Unicode only, such as C<\p{Alpha}>. They assume that 0xD7 always has its Unicode meaning (or the equivalent on EBCDIC platforms). Since Latin1 is a subset of Unicode and 0xD7 is the multiplication sign in both Latin1 and Unicode, C<\p{Alpha}> will never match it, regardless of locale. A similar -issue occurs with C<\N{...}>. Prior to v5.20, It is therefore a bad +issue occurs with C<\N{...}>. Prior to v5.20, it is therefore a bad idea to use C<\p{}> or C<\N{}> under plain C<use locale>--I<unless> you can guarantee that the locale will be ISO8859-1. Use POSIX character classes instead. @@ -1561,10 +1643,67 @@ consistently to regular expression matching except for bracketed character classes; in v5.14 it was extended to all regex matches; and in v5.16 to the casing operations such as C<\L> and C<uc()>. For collation, in all releases so far, the system's C<strxfrm()> function is -called, and whatever it does is what you get. +called, and whatever it does is what you get. Starting in v5.26, various +bugs are fixed with the way perl uses this function. =head1 BUGS +=head2 Collation of strings containing embedded C<NUL> characters + +C<NUL> characters will sort the same as the lowest collating control +character does, or to C<"\001"> in the unlikely event that there are no +control characters at all in the locale. 
In cases where the strings +don't contain this non-C<NUL> control, the results will be correct, and +in many locales, this control, whatever it might be, will rarely be +encountered. But there are cases where a C<NUL> should sort before this +control, but doesn't. If two strings do collate identically, the one +containing the C<NUL> will sort earlier. Prior to 5.26, there were +more bugs. + +=head2 Multi-threaded + +XS code or C-language libraries called from it that use the system +L<C<setlocale(3)>> function (except on Windows) likely will not work +from a multi-threaded application without changes. See +L<perlxs/Locale-aware XS code>. + +An XS module that is locale-dependent could have been written under the +assumption that it will never be called in a multi-threaded environment, +and so uses other non-locale constructs that aren't multi-thread-safe. +See L<perlxs/Thread-aware system interfaces>. + +POSIX does not define a way to get the name of the current per-thread +locale. Some systems, such as Darwin and NetBSD, do implement a +function, L<querylocale(3)>, to do this. On non-Windows systems without +it, such as Linux, there are some additional caveats: + +=over + +=item * + +An embedded perl needs to be started up while the global locale is in +effect. See L<perlembed/Using embedded Perl with POSIX locales>. + +=item * + +It becomes more important for perl to know about all the possible +locale categories on the platform, even if they aren't apparently used +in your program. Perl knows all of the Linux ones. If your platform +has others, you can send email to L<mailto:perlbug@perl.org> for +inclusion of it in the next release. In the meantime, it is possible to +edit the Perl source to teach it about the category, and then recompile. +Search for instances of, say, C<LC_PAPER> in the source, and use that as +a template to add the omitted one. 
+ +=item * + +It is possible, though hard to do, to call C<POSIX::setlocale> with a +locale that it doesn't recognize as syntactically legal, but actually is +legal on that system. This should happen only with embedded perls, or +if you hand-craft a locale name yourself. + +=back + =head2 Broken systems In certain systems, the operating system's locale support @@ -1581,10 +1720,7 @@ for broken locales>. =head1 SEE ALSO L<I18N::Langinfo>, L<perluniintro>, L<perlunicode>, L<open>, -L<POSIX/isalnum>, L<POSIX/isalpha>, -L<POSIX/isdigit>, L<POSIX/isgraph>, L<POSIX/islower>, -L<POSIX/isprint>, L<POSIX/ispunct>, L<POSIX/isspace>, -L<POSIX/isupper>, L<POSIX/isxdigit>, L<POSIX/localeconv>, +L<POSIX/localeconv>, L<POSIX/setlocale>, L<POSIX/strcoll>, L<POSIX/strftime>, L<POSIX/strtod>, L<POSIX/strxfrm>. @@ -1595,4 +1731,4 @@ see L<perlembed/Using embedded Perl with POSIX locales>. Jarkko Hietaniemi's original F<perli18n.pod> heavily hacked by Dominic Dunlop, assisted by the perl5-porters. Prose worked over a bit by -Tom Christiansen, and updated by Perl 5 porters. +Tom Christiansen, and now maintained by Perl 5 porters. diff --git a/gnu/usr.bin/perl/pod/perlmod.pod b/gnu/usr.bin/perl/pod/perlmod.pod index 0ed4bd91347..c87a68d837d 100644 --- a/gnu/usr.bin/perl/pod/perlmod.pod +++ b/gnu/usr.bin/perl/pod/perlmod.pod @@ -28,23 +28,42 @@ Best practices for making a new module. =head2 Packages X<package> X<namespace> X<variable, global> X<global variable> X<global> -Perl provides a mechanism for alternative namespaces to protect -packages from stomping on each other's variables. In fact, there's -really no such thing as a global variable in Perl. The package -statement declares the compilation unit as being in the given -namespace. 
The scope of the package declaration is from the +Unlike Perl 4, in which all the variables were dynamic and shared one +global name space, causing maintainability problems, Perl 5 provides two +mechanisms for protecting code from having its variables stomped on by +other code: lexically scoped variables created with C<my> or C<state> and +namespaced global variables, which are exposed via the C<vars> pragma, +or the C<our> keyword. Any global variable is considered to +be part of a namespace and can be accessed via a "fully qualified form". +Conversely, any lexically scoped variable is considered to be part of +that lexical-scope, and does not have a "fully qualified form". + +In perl namespaces are called "packages" and +the C<package> declaration tells the compiler which +namespace to prefix to C<our> variables and unqualified dynamic names. +This both protects +against accidental stomping and provides an interface for deliberately +clobbering global dynamic variables declared and used in other scopes or +packages, when that is what you want to do. + +The scope of the C<package> declaration is from the declaration itself through the end of the enclosing block, C<eval>, -or file, whichever comes first (the same scope as the my() and -local() operators). Unqualified dynamic identifiers will be in -this namespace, except for those few identifiers that if unqualified, +or file, whichever comes first (the same scope as the my(), our(), state(), and +local() operators, and also the effect +of the experimental "reference aliasing," which may change), or until +the next C<package> declaration. Unqualified dynamic identifiers will be in +this namespace, except for those few identifiers that, if unqualified, default to the main package instead of the current one as described -below. A package statement affects only dynamic variables--including -those you've used local() on--but I<not> lexical variables created -with my(). 
Typically it would be the first declaration in a file -included by the C<do>, C<require>, or C<use> operators. You can -switch into a package in more than one place; it merely influences -which symbol table is used by the compiler for the rest of that -block. You can refer to variables and filehandles in other packages +below. A C<package> statement affects only dynamic global +symbols, including subroutine names, and variables you've used local() +on, but I<not> lexical variables created with my(), our() or state(). + +Typically, a C<package> statement is the first declaration in a file +included in a program by one of the C<do>, C<require>, or C<use> operators. You can +switch into a package in more than one place: C<package> has no +effect beyond specifying which symbol table the compiler will use for +dynamic symbols for the rest of that block or until the next C<package> statement. +You can refer to variables and filehandles in other packages by prefixing the identifier with the package name and a double colon: C<$Package::Variable>. If the package name is null, the C<main> package is assumed. That is, C<$::sail> is equivalent to @@ -69,7 +88,8 @@ are either local to the current package, or must be fully qualified from the outer package name down. For instance, there is nowhere within package C<OUTER> that C<$INNER::var> refers to C<$OUTER::INNER::var>. C<INNER> refers to a totally -separate global package. +separate global package. The custom of treating package names as a +hierarchy is very strong, but the language in no way enforces it. Only identifiers starting with letters (or underscore) are stored in a package's symbol table. All other symbols are kept in package @@ -101,7 +121,9 @@ expressions in the context of the C<main> package (or wherever you came from). See L<perldebug>. The special symbol C<__PACKAGE__> contains the current package, but cannot -(easily) be used to construct variable names. +(easily) be used to construct variable names. 
After C<my($foo)> has hidden +package variable C<$foo>, it can still be accessed, without knowing what +package you are in, as C<${__PACKAGE__.'::foo'}>. See L<perlsub> for other scoping issues related to my() and local(), and L<perlref> regarding closures. diff --git a/gnu/usr.bin/perl/pod/perlmodinstall.pod b/gnu/usr.bin/perl/pod/perlmodinstall.pod index 9a2634e9ab3..72728f69e14 100644 --- a/gnu/usr.bin/perl/pod/perlmodinstall.pod +++ b/gnu/usr.bin/perl/pod/perlmodinstall.pod @@ -7,10 +7,10 @@ perlmodinstall - Installing CPAN Modules You can think of a module as the fundamental unit of reusable Perl code; see L<perlmod> for details. Whenever anyone creates a chunk of Perl code that they think will be useful to the world, they register -as a Perl developer at http://www.cpan.org/modules/04pause.html +as a Perl developer at L<http://www.cpan.org/modules/04pause.html> so that they can then upload their code to the CPAN. The CPAN is the Comprehensive Perl Archive Network and can be accessed at -http://www.cpan.org/ , and searched at http://search.cpan.org/ . +L<http://www.cpan.org/> , and searched at L<http://search.cpan.org/> . This documentation is for people who want to download CPAN modules and install them on their own computer. @@ -68,22 +68,22 @@ directory) and use this approach. B<If you're on a Unix or Unix-like system,> You can use Andreas Koenig's CPAN module -( http://www.cpan.org/modules/by-module/CPAN ) +( L<http://www.cpan.org/modules/by-module/CPAN> ) to automate the following steps, from DECOMPRESS through INSTALL. A. DECOMPRESS Decompress the file with C<gzip -d yourmodule.tar.gz> -You can get gzip from ftp://prep.ai.mit.edu/pub/gnu/ +You can get gzip from L<ftp://prep.ai.mit.edu/pub/gnu/> Or, you can combine this step with the next to save disk space: - gzip -dc yourmodule.tar.gz | tar -xf - + gzip -dc yourmodule.tar.gz | tar -xof - B. UNPACK -Unpack the result with C<tar -xf yourmodule.tar> +Unpack the result with C<tar -xof yourmodule.tar> C. 
BUILD @@ -128,7 +128,7 @@ steps below. A. DECOMPRESS -You can use the shareware Winzip ( http://www.winzip.com ) to +You can use the shareware Winzip ( L<http://www.winzip.com> ) to decompress and unpack modules. B. UNPACK @@ -138,9 +138,9 @@ If you used WinZip, this was already done for you. C. BUILD You'll need the C<nmake> utility, available at -http://download.microsoft.com/download/vc15/Patch/1.52/W95/EN-US/nmake15.exe +L<http://download.microsoft.com/download/vc15/Patch/1.52/W95/EN-US/nmake15.exe> or dmake, available on CPAN. -http://search.cpan.org/dist/dmake/ +L<http://search.cpan.org/dist/dmake/> Does the module require compilation (i.e. does it have files that end in .xs, .c, .h, .y, .cc, .cxx, or .C)? If it does, life is now @@ -148,7 +148,7 @@ officially tough for you, because you have to compile the module yourself (no easy feat on Windows). You'll need a compiler such as Visual C++. Alternatively, you can download a pre-built PPM package from ActiveState. -http://aspn.activestate.com/ASPN/Downloads/ActivePerl/PPM/ +L<http://aspn.activestate.com/ASPN/Downloads/ActivePerl/PPM/> Go into the newly-created directory and type: @@ -164,84 +164,11 @@ While still in that directory, type: =item * -B<If you're using a Macintosh with "Classic" MacOS and MacPerl,> - - -A. DECOMPRESS - -First, make sure you have the latest B<cpan-mac> distribution ( -http://www.cpan.org/authors/id/CNANDOR/ ), which has utilities for -doing all of the steps. Read the cpan-mac directions carefully and -install it. If you choose not to use cpan-mac for some reason, there -are alternatives listed here. - -After installing cpan-mac, drop the module archive on the -B<untarzipme> droplet, which will decompress and unpack for you. - -B<Or>, you can either use the shareware B<StuffIt Expander> program -( http://my.smithmicro.com/mac/stuffit/ ) -or the freeware B<MacGzip> program ( -http://persephone.cps.unizar.es/general/gente/spd/gzip/gzip.html ). - -B. 
UNPACK - -If you're using untarzipme or StuffIt, the archive should be extracted -now. B<Or>, you can use the freeware B<suntar> or I<Tar> ( -http://hyperarchive.lcs.mit.edu/HyperArchive/Archive/cmp/ ). - -C. BUILD - -Check the contents of the distribution. -Read the module's documentation, looking for -reasons why you might have trouble using it with MacPerl. Look for -F<.xs> and F<.c> files, which normally denote that the distribution -must be compiled, and you cannot install it "out of the box." -(See L<"PORTABILITY">.) - -D. INSTALL - -If you are using cpan-mac, just drop the folder on the -B<installme> droplet, and use the module. - -B<Or>, if you aren't using cpan-mac, do some manual labor. - -Make sure the newlines for the modules are in Mac format, not Unix format. -If they are not then you might have decompressed them incorrectly. Check -your decompression and unpacking utilities settings to make sure they are -translating text files properly. - -As a last resort, you can use the perl one-liner: - - perl -i.bak -pe 's/(?:\015)?\012/\015/g' <filenames> - -on the source files. - -Then move the files (probably just the F<.pm> files, though there -may be some additional ones, too; check the module documentation) -to their final destination: This will -most likely be in C<$ENV{MACPERL}site_lib:> (i.e., -C<HD:MacPerl folder:site_lib:>). You can add new paths to -the default C<@INC> in the Preferences menu item in the -MacPerl application (C<$ENV{MACPERL}site_lib:> is added -automagically). Create whatever directory structures are required -(i.e., for C<Some::Module>, create -C<$ENV{MACPERL}site_lib:Some:> and put -C<Module.pm> in that directory). - -Then run the following script (or something like it): - - #!perl -w - use AutoSplit; - my $dir = "${MACPERL}site_perl"; - autosplit("$dir:Some:Module.pm", "$dir:auto", 0, 1, 1); - -=item * - B<If you're on the DJGPP port of DOS,> A. 
DECOMPRESS -djtarx ( ftp://ftp.delorie.com/pub/djgpp/current/v2/ ) +djtarx ( L<ftp://ftp.delorie.com/pub/djgpp/current/v2/> ) will both uncompress and unpack. B. UNPACK @@ -271,7 +198,7 @@ You will need the packages mentioned in F<README.dos> in the Perl distribution. B<If you're on OS/2,> Get the EMX development suite and gzip/tar, from either Hobbes ( -http://hobbes.nmsu.edu ) or Leo ( http://www.leo.org ), and then follow +L<http://hobbes.nmsu.edu> ) or Leo ( L<http://www.leo.org> ), and then follow the instructions for Unix. =item * @@ -319,7 +246,7 @@ Or, if you're fond of VMS command syntax: C. BUILD Make sure you have MMS (from Digital) or the freeware MMK ( available -from MadGoat at http://www.madgoat.com ). Then type this to create +from MadGoat at L<http://www.madgoat.com> ). Then type this to create the DESCRIP.MMS for the module: perl Makefile.PL @@ -350,7 +277,7 @@ A. DECOMPRESS Decompress the file with C<gzip -d yourmodule.tar.gz> You can get gzip from -http://www.s390.ibm.com/products/oe/bpxqp1.html +L<http://www.s390.ibm.com/products/oe/bpxqp1.html> B. UNPACK @@ -360,7 +287,7 @@ Unpack the result with The BUILD and INSTALL steps are identical to those for Unix. Some modules generate Makefiles that work better with GNU make, which is -available from http://www.mks.com/s390/gnu/ +available from L<http://www.mks.com/s390/gnu/> =back @@ -415,8 +342,8 @@ If you have any suggested changes for this page, let me know. Please don't send me mail asking for help on how to install your modules. There are too many modules, and too few Orwants, for me to be able to answer or even acknowledge all your questions. Contact the module -author instead, or post to comp.lang.perl.modules, or ask someone -familiar with Perl on your operating system. +author instead, ask someone familiar with Perl on your operating +system, or if all else fails, file a ticket at L<http://rt.cpan.org/>. 
=head1 AUTHOR diff --git a/gnu/usr.bin/perl/pod/perlmodlib.PL b/gnu/usr.bin/perl/pod/perlmodlib.PL index aa6b18cd3cb..b92f9ca151e 100644 --- a/gnu/usr.bin/perl/pod/perlmodlib.PL +++ b/gnu/usr.bin/perl/pod/perlmodlib.PL @@ -3,7 +3,7 @@ use strict; use warnings; -$ENV{LC_ALL} = 'C'; +local $ENV{LC_ALL} = 'C'; my $Quiet; @ARGV = grep { not($_ eq '-q' and $Quiet = 1) } @ARGV; @@ -13,7 +13,7 @@ if (@ARGV) { chdir $workdir or die "Couldn't chdir to '$workdir': $!"; } -require 'regen/regen_lib.pl'; +require './regen/regen_lib.pl'; # MANIFEST itself is Unix style filenames, so we have to assume that Unix style # filenames will work. @@ -341,11 +341,23 @@ Generated by Porting/make_modlib_cpan.pl =item South Africa - http://cpan.mirror.ac.za/ - ftp://cpan.mirror.ac.za/ http://mirror.is.co.za/pub/cpan/ ftp://ftp.is.co.za/pub/cpan/ + http://cpan.mirror.ac.za/ + ftp://cpan.mirror.ac.za/ + http://cpan.saix.net/ ftp://ftp.saix.net/pub/CPAN/ + http://ftp.wa.co.za/pub/CPAN/ + ftp://ftp.wa.co.za/pub/CPAN/ + +=item Uganda + + http://mirror.ucu.ac.ug/cpan/ + +=item Zimbabwe + + http://mirror.zol.co.zw/CPAN/ + ftp://mirror.zol.co.zw/CPAN/ =back @@ -353,103 +365,123 @@ Generated by Porting/make_modlib_cpan.pl =over 4 -=item China +=item Bangladesh - http://cpan.wenzk.com/ + http://mirror.dhakacom.com/CPAN/ + ftp://mirror.dhakacom.com/CPAN/ -=item Hong Kong +=item China + http://cpan.communilink.net/ http://ftp.cuhk.edu.hk/pub/packages/perl/CPAN/ ftp://ftp.cuhk.edu.hk/pub/packages/perl/CPAN/ - http://mirrors.geoexpat.com/cpan/ + http://mirrors.hust.edu.cn/CPAN/ + http://mirrors.neusoft.edu.cn/cpan/ + http://mirror.lzu.edu.cn/CPAN/ + http://mirrors.163.com/cpan/ + http://mirrors.sohu.com/CPAN/ + http://mirrors.ustc.edu.cn/CPAN/ + ftp://mirrors.ustc.edu.cn/CPAN/ + http://mirrors.xmu.edu.cn/CPAN/ + ftp://mirrors.xmu.edu.cn/CPAN/ + http://mirrors.zju.edu.cn/CPAN/ =item India + http://cpan.excellmedia.net/ http://perlmirror.indialinks.com/ =item Indonesia - http://cpan.biz.net.id/ - 
http://komo.vlsm.org/CPAN/ - ftp://komo.vlsm.org/CPAN/ - http://cpan.cermin.lipi.go.id/ - ftp://cermin.lipi.go.id/pub/CPAN/ + http://kambing.ui.ac.id/cpan/ http://cpan.pesat.net.id/ + http://mirror.poliwangi.ac.id/CPAN/ + http://kartolo.sby.datautama.net.id/CPAN/ + http://mirror.wanxp.id/cpan/ + +=item Iran + + http://mirror.yazd.ac.ir/cpan/ + +=item Israel + + http://biocourse.weizmann.ac.il/CPAN/ =item Japan - ftp://ftp.u-aizu.ac.jp/pub/CPAN + http://ftp.jaist.ac.jp/pub/CPAN/ + ftp://ftp.jaist.ac.jp/pub/CPAN/ + http://mirror.jre655.com/CPAN/ + ftp://mirror.jre655.com/CPAN/ ftp://ftp.kddilabs.jp/CPAN/ http://ftp.nara.wide.ad.jp/pub/CPAN/ ftp://ftp.nara.wide.ad.jp/pub/CPAN/ - http://ftp.jaist.ac.jp/pub/CPAN/ - ftp://ftp.jaist.ac.jp/pub/CPAN/ - ftp://ftp.dti.ad.jp/pub/lang/CPAN/ - ftp://ftp.ring.gr.jp/pub/lang/perl/CPAN/ http://ftp.riken.jp/lang/CPAN/ ftp://ftp.riken.jp/lang/CPAN/ + ftp://ftp.u-aizu.ac.jp/pub/CPAN/ http://ftp.yz.yamagata-u.ac.jp/pub/lang/cpan/ ftp://ftp.yz.yamagata-u.ac.jp/pub/lang/cpan/ +=item Kazakhstan + + http://mirror.neolabs.kz/CPAN/ + ftp://mirror.neolabs.kz/CPAN/ + +=item Philippines + + http://mirror.pregi.net/CPAN/ + ftp://mirror.pregi.net/CPAN/ + http://mirror.rise.ph/cpan/ + ftp://mirror.rise.ph/cpan/ + +=item Qatar + + http://mirror.qnren.qa/CPAN/ + ftp://mirror.qnren.qa/CPAN/ + =item Republic of Korea - http://ftp.kaist.ac.kr/pub/CPAN - ftp://ftp.kaist.ac.kr/pub/CPAN http://cpan.mirror.cdnetworks.com/ ftp://cpan.mirror.cdnetworks.com/CPAN/ - http://cpan.sarang.net/ - ftp://cpan.sarang.net/CPAN/ - -=item Russia - - http://cpan.tomsk.ru/ - ftp://cpan.tomsk.ru/ + http://ftp.kaist.ac.kr/pub/CPAN/ + ftp://ftp.kaist.ac.kr/CPAN/ + http://ftp.kr.freebsd.org/pub/CPAN/ + ftp://ftp.kr.freebsd.org/pub/CPAN/ + http://mirror.navercorp.com/CPAN/ + http://ftp.neowiz.com/CPAN/ + ftp://ftp.neowiz.com/CPAN/ =item Singapore - http://mirror.averse.net/pub/CPAN - ftp://mirror.averse.net/pub/CPAN http://cpan.mirror.choon.net/ - http://cpan.oss.eznetsols.org 
- ftp://ftp.oss.eznetsols.org/cpan + http://mirror.0x.sg/CPAN/ + ftp://mirror.0x.sg/CPAN/ =item Taiwan - http://ftp.cse.yzu.edu.tw/pub/CPAN/ - ftp://ftp.cse.yzu.edu.tw/pub/CPAN/ + http://cpan.cdpa.nsysu.edu.tw/Unix/Lang/CPAN/ + ftp://cpan.cdpa.nsysu.edu.tw/Unix/Lang/CPAN/ + http://cpan.stu.edu.tw/ + ftp://ftp.stu.edu.tw/CPAN/ + http://ftp.yzu.edu.tw/CPAN/ + ftp://ftp.yzu.edu.tw/CPAN/ http://cpan.nctu.edu.tw/ ftp://cpan.nctu.edu.tw/ - ftp://ftp.ncu.edu.tw/CPAN/ - http://cpan.cdpa.nsysu.edu.tw/ - ftp://cpan.cdpa.nsysu.edu.tw/Unix/Lang/CPAN/ - http://cpan.stu.edu.tw - ftp://ftp.stu.edu.tw/CPAN - http://ftp.stu.edu.tw/CPAN - ftp://ftp.stu.edu.tw/pub/CPAN - http://cpan.cs.pu.edu.tw/ - ftp://cpan.cs.pu.edu.tw/pub/CPAN - -=item Thailand - - http://mirrors.issp.co.th/cpan/ - ftp://mirrors.issp.co.th/cpan/ - http://mirror.yourconnect.com/CPAN/ - ftp://mirror.yourconnect.com/CPAN/ + http://ftp.ubuntu-tw.org/mirror/CPAN/ + ftp://ftp.ubuntu-tw.org/mirror/CPAN/ =item Turkey - http://cpan.gazi.edu.tr/ + http://cpan.ulak.net.tr/ + ftp://ftp.ulak.net.tr/pub/perl/CPAN/ + http://mirror.vit.com.tr/mirror/CPAN/ + ftp://mirror.vit.com.tr/CPAN/ -=back - -=head2 Central America - -=over 4 - -=item Costa Rica +=item Viet Nam - http://mirrors.ucr.ac.cr/CPAN/ - ftp://mirrors.ucr.ac.cr/CPAN/ + http://mirrors.digipower.vn/CPAN/ + http://mirror.downloadvn.com/cpan/ + http://mirrors.vinahost.vn/CPAN/ =back @@ -460,27 +492,38 @@ Generated by Porting/make_modlib_cpan.pl =item Austria http://cpan.inode.at/ - ftp://cpan.inode.at + ftp://cpan.inode.at/ + http://mirror.easyname.at/cpan/ + ftp://mirror.easyname.at/cpan/ http://gd.tuwien.ac.at/languages/perl/CPAN/ ftp://gd.tuwien.ac.at/pub/CPAN/ +=item Belarus + + http://ftp.byfly.by/pub/CPAN/ + ftp://ftp.byfly.by/pub/CPAN/ + http://mirror.datacenter.by/pub/CPAN/ + ftp://mirror.datacenter.by/pub/CPAN/ + =item Belgium - http://ftp.belnet.be/mirror/ftp.cpan.org/ + http://ftp.belnet.be/ftp.cpan.org/ ftp://ftp.belnet.be/mirror/ftp.cpan.org/ - 
http://ftp.easynet.be/pub/CPAN/ - http://cpan.weepee.org/ + http://cpan.cu.be/ + http://lib.ugent.be/CPAN/ + http://cpan.weepeetelecom.be/ =item Bosnia and Herzegovina - http://cpan.blic.net/ + http://cpan.mirror.ba/ + ftp://ftp.mirror.ba/CPAN/ =item Bulgaria - http://cpan.cbox.biz/ - ftp://cpan.cbox.biz/cpan/ - http://cpan.digsys.bg/ - ftp://ftp.digsys.bg/pub/CPAN + http://mirrors.neterra.net/CPAN/ + ftp://mirrors.neterra.net/CPAN/ + http://mirrors.netix.net/CPAN/ + ftp://mirrors.netix.net/CPAN/ =item Croatia @@ -489,104 +532,100 @@ Generated by Porting/make_modlib_cpan.pl =item Czech Republic + http://mirror.dkm.cz/cpan/ + ftp://mirror.dkm.cz/cpan/ ftp://ftp.fi.muni.cz/pub/CPAN/ - http://archive.cpan.cz/ + http://mirrors.nic.cz/CPAN/ + ftp://mirrors.nic.cz/pub/CPAN/ + http://cpan.mirror.vutbr.cz/ + ftp://mirror.vutbr.cz/cpan/ =item Denmark - http://mirrors.dotsrc.org/cpan - ftp://mirrors.dotsrc.org/cpan/ http://www.cpan.dk/ - http://mirror.uni-c.dk/pub/CPAN/ + http://mirrors.dotsrc.org/cpan/ + ftp://mirrors.dotsrc.org/cpan/ =item Finland ftp://ftp.funet.fi/pub/languages/perl/CPAN/ - http://mirror.eunet.fi/CPAN =item France - http://cpan.enstimac.fr/ - ftp://ftp.inria.fr/pub/CPAN/ + http://ftp.ciril.fr/pub/cpan/ + ftp://ftp.ciril.fr/pub/cpan/ http://distrib-coffee.ipsl.jussieu.fr/pub/mirrors/cpan/ ftp://distrib-coffee.ipsl.jussieu.fr/pub/mirrors/cpan/ + http://ftp.lip6.fr/pub/perl/CPAN/ ftp://ftp.lip6.fr/pub/perl/CPAN/ - http://mir2.ovh.net/ftp.cpan.org - ftp://mir1.ovh.net/ftp.cpan.org + http://mirror.ibcp.fr/pub/CPAN/ ftp://ftp.oleane.net/pub/CPAN/ - http://ftp.crihan.fr/mirrors/ftp.cpan.org/ - ftp://ftp.crihan.fr/mirrors/ftp.cpan.org/ - http://ftp.u-strasbg.fr/CPAN - ftp://ftp.u-strasbg.fr/CPAN - http://cpan.cict.fr/ - ftp://cpan.cict.fr/pub/CPAN/ + http://cpan.mirrors.ovh.net/ftp.cpan.org/ + ftp://cpan.mirrors.ovh.net/ftp.cpan.org/ + http://cpan.enstimac.fr/ =item Germany - ftp://ftp.fu-berlin.de/unix/languages/perl/ - http://mirrors.softliste.de/cpan/ - 
ftp://ftp.rub.de/pub/CPAN/ - http://www.planet-elektronik.de/CPAN/ + http://mirror.23media.de/cpan/ + ftp://mirror.23media.de/cpan/ + http://artfiles.org/cpan.org/ + ftp://artfiles.org/cpan.org/ + http://mirror.bibleonline.ru/cpan/ + http://mirror.checkdomain.de/CPAN/ + ftp://mirror.checkdomain.de/CPAN/ + http://cpan.noris.de/ + http://mirror.de.leaseweb.net/CPAN/ + ftp://mirror.de.leaseweb.net/CPAN/ + http://cpan.mirror.euserv.net/ + ftp://mirror.euserv.net/cpan/ + http://ftp-stud.hs-esslingen.de/pub/Mirrors/CPAN/ + ftp://mirror.fraunhofer.de/CPAN/ + ftp://ftp.freenet.de/pub/ftp.cpan.org/pub/CPAN/ http://ftp.hosteurope.de/pub/CPAN/ ftp://ftp.hosteurope.de/pub/CPAN/ - http://www.mirrorspace.org/cpan/ - http://mirror.netcologne.de/cpan/ - ftp://mirror.netcologne.de/cpan/ - ftp://ftp.freenet.de/pub/ftp.cpan.org/pub/CPAN/ - http://ftp-stud.hs-esslingen.de/pub/Mirrors/CPAN/ - ftp://ftp-stud.hs-esslingen.de/pub/Mirrors/CPAN/ - http://mirrors.zerg.biz/cpan/ + ftp://ftp.fu-berlin.de/unix/languages/perl/ http://ftp.gwdg.de/pub/languages/perl/CPAN/ ftp://ftp.gwdg.de/pub/languages/perl/CPAN/ - http://dl.ambiweb.de/mirrors/ftp.cpan.org/ - http://cpan.mirror.clusters.kg/ + http://ftp.hawo.stw.uni-erlangen.de/CPAN/ + ftp://ftp.hawo.stw.uni-erlangen.de/CPAN/ http://cpan.mirror.iphh.net/ ftp://cpan.mirror.iphh.net/pub/CPAN/ - http://cpan.mirroring.de/ - http://mirror.informatik.uni-mannheim.de/pub/mirrors/CPAN/ - ftp://mirror.informatik.uni-mannheim.de/pub/mirrors/CPAN/ - http://www.chemmedia.de/mirrors/CPAN/ - http://ftp.cw.net/pub/CPAN/ - ftp://ftp.cw.net/pub/CPAN/ - http://cpan.cpantesters.org/ - ftp://cpan.cpantesters.org/CPAN/ - http://cpan.mirrored.de/ + ftp://ftp.mpi-inf.mpg.de/pub/perl/CPAN/ + http://cpan.netbet.org/ + http://mirror.netcologne.de/cpan/ + ftp://mirror.netcologne.de/cpan/ ftp://mirror.petamem.com/CPAN/ - http://cpan.noris.de/ - ftp://cpan.noris.de/pub/CPAN/ - ftp://ftp.mpi-sb.mpg.de/pub/perl/CPAN/ - ftp://ftp.gmd.de/mirrors/CPAN/ + 
http://www.planet-elektronik.de/CPAN/ + http://ftp.halifax.rwth-aachen.de/cpan/ + ftp://ftp.halifax.rwth-aachen.de/cpan/ + http://mirror.softaculous.com/cpan/ + http://ftp.u-tx.net/CPAN/ + ftp://ftp.u-tx.net/CPAN/ + http://mirror.reismil.ch/CPAN/ =item Greece - ftp://ftp.forthnet.gr/pub/languages/perl/CPAN - ftp://ftp.ntua.gr/pub/lang/perl/ - http://cpan.cc.uoc.gr/ + http://cpan.cc.uoc.gr/mirrors/CPAN/ ftp://ftp.cc.uoc.gr/mirrors/CPAN/ + http://ftp.ntua.gr/pub/lang/perl/ + ftp://ftp.ntua.gr/pub/lang/perl/ =item Hungary - http://cpan.mirrors.enexis.hu/ - ftp://cpan.mirrors.enexis.hu/mirrors/cpan/ - http://cpan.hu/ - -=item Iceland - - http://ftp.rhnet.is/pub/CPAN/ - ftp://ftp.rhnet.is/pub/CPAN/ + http://mirror.met.hu/CPAN/ =item Ireland - http://ftp.esat.net/pub/languages/perl/CPAN/ - ftp://ftp.esat.net/pub/languages/perl/CPAN/ - http://ftp.heanet.ie/mirrors/ftp.perl.org/pub/CPAN - ftp://ftp.heanet.ie/mirrors/ftp.perl.org/pub/CPAN + http://ftp.heanet.ie/mirrors/ftp.perl.org/pub/CPAN/ + ftp://ftp.heanet.ie/mirrors/ftp.perl.org/pub/CPAN/ =item Italy http://bo.mirror.garr.it/mirrors/CPAN/ + ftp://ftp.eutelia.it/CPAN_Mirror/ http://cpan.panu.it/ ftp://ftp.panu.it/pub/mirrors/perl/CPAN/ + http://cpan.muzzy.it/ =item Latvia @@ -597,136 +636,133 @@ Generated by Porting/make_modlib_cpan.pl http://ftp.litnet.lt/pub/CPAN/ ftp://ftp.litnet.lt/pub/CPAN/ -=item Malta +=item Moldova - http://cpan.waldonet.net.mt/ + http://mirror.as43289.net/pub/CPAN/ + ftp://mirror.as43289.net/pub/CPAN/ =item Netherlands - ftp://ftp.quicknet.nl/pub/CPAN/ - http://mirror.hostfuss.com/CPAN/ - ftp://mirror.hostfuss.com/CPAN/ - http://mirrors3.kernel.org/cpan/ - ftp://mirrors3.kernel.org/pub/CPAN/ - http://cpan.mirror.versatel.nl/ - ftp://ftp.mirror.versatel.nl/cpan/ + http://cpan.cs.uu.nl/ + ftp://ftp.cs.uu.nl/pub/CPAN/ + http://mirror.nl.leaseweb.net/CPAN/ + ftp://mirror.nl.leaseweb.net/CPAN/ + http://ftp.nluug.nl/languages/perl/CPAN/ + ftp://ftp.nluug.nl/pub/languages/perl/CPAN/ + 
http://mirror.transip.net/CPAN/ + ftp://mirror.transip.net/CPAN/ + http://cpan.mirror.triple-it.nl/ + http://ftp.tudelft.nl/cpan/ + ftp://ftp.tudelft.nl/pub/CPAN/ ftp://download.xs4all.nl/pub/mirror/CPAN/ - http://mirror.leaseweb.com/CPAN/ - ftp://mirror.leaseweb.com/CPAN/ - ftp://ftp.cpan.nl/pub/CPAN/ - http://archive.cs.uu.nl/mirror/CPAN/ - ftp://ftp.cs.uu.nl/mirror/CPAN/ - http://luxitude.net/cpan/ =item Norway - ftp://ftp.uninett.no/pub/languages/perl/CPAN - ftp://ftp.uit.no/pub/languages/perl/cpan/ + http://cpan.uib.no/ + ftp://cpan.uib.no/pub/CPAN/ + ftp://ftp.uninett.no/pub/languages/perl/CPAN/ + http://cpan.vianett.no/ =item Poland - http://piotrkosoft.net/pub/mirrors/CPAN/ + http://ftp.agh.edu.pl/CPAN/ + ftp://ftp.agh.edu.pl/CPAN/ + http://ftp.piotrkosoft.net/pub/mirrors/CPAN/ ftp://ftp.piotrkosoft.net/pub/mirrors/CPAN/ - http://ftp.man.poznan.pl/pub/CPAN - ftp://ftp.man.poznan.pl/pub/CPAN ftp://ftp.ps.pl/pub/CPAN/ + http://sunsite.icm.edu.pl/pub/CPAN/ ftp://sunsite.icm.edu.pl/pub/CPAN/ - ftp://ftp.tpnet.pl/d4/CPAN/ =item Portugal - http://cpan.dei.uc.pt/ - ftp://ftp.dei.uc.pt/pub/CPAN - ftp://ftp.ist.utl.pt/pub/CPAN/ - http://cpan.perl.pt/ - http://cpan.ip.pt/ - ftp://cpan.ip.pt/pub/cpan/ - http://mirrors.nfsi.pt/CPAN/ - ftp://mirrors.nfsi.pt/pub/CPAN/ http://cpan.dcc.fc.up.pt/ + http://mirrors.fe.up.pt/pub/CPAN/ + http://cpan.perl-hackers.net/ + http://cpan.perl.pt/ =item Romania - http://ftp.astral.ro/pub/CPAN/ - ftp://ftp.astral.ro/pub/CPAN/ - ftp://ftp.lug.ro/CPAN + http://mirrors.hostingromania.ro/cpan.org/ + ftp://ftp.lug.ro/CPAN/ + http://mirrors.m247.ro/CPAN/ + http://mirrors.evowise.com/CPAN/ + http://mirrors.teentelecom.net/CPAN/ + ftp://mirrors.teentelecom.net/CPAN/ http://mirrors.xservers.ro/CPAN/ - http://mirrors.hostingromania.ro/ftp.cpan.org/ - ftp://ftp.hostingromania.ro/mirrors/ftp.cpan.org/ - ftp://ftp.iasi.roedu.net/pub/mirrors/ftp.cpan.org/ -=item Russia +=item Russian Federation ftp://ftp.aha.ru/CPAN/ http://cpan.rinet.ru/ 
ftp://cpan.rinet.ru/pub/mirror/CPAN/ - ftp://ftp.SpringDaemons.com/pub/CPAN/ + http://cpan-mirror.rbc.ru/pub/CPAN/ http://mirror.rol.ru/CPAN/ - http://ftp.silvernet.ru/CPAN/ - http://ftp.spbu.ru/CPAN/ - ftp://ftp.spbu.ru/CPAN/ + http://cpan.uni-altai.ru/ + http://cpan.webdesk.ru/ + ftp://cpan.webdesk.ru/cpan/ + http://mirror.yandex.ru/mirrors/cpan/ + ftp://mirror.yandex.ru/mirrors/cpan/ + +=item Serbia + + http://mirror.sbb.rs/CPAN/ + ftp://mirror.sbb.rs/CPAN/ =item Slovakia - http://cpan.fyxm.net/ + http://cpan.lnx.sk/ + http://tux.rainside.sk/CPAN/ + ftp://tux.rainside.sk/CPAN/ =item Slovenia - http://www.klevze.si/cpan + http://ftp.arnes.si/software/perl/CPAN/ + ftp://ftp.arnes.si/software/perl/CPAN/ =item Spain + http://mirrors.evowise.com/CPAN/ http://osl.ugr.es/CPAN/ + http://ftp.rediris.es/mirror/CPAN/ ftp://ftp.rediris.es/mirror/CPAN/ - http://ftp.gui.uva.es/sites/cpan.org/ - ftp://ftp.gui.uva.es/sites/cpan.org/ =item Sweden - http://mirrors4.kernel.org/cpan/ - ftp://mirrors4.kernel.org/pub/CPAN/ + http://ftp.acc.umu.se/mirror/CPAN/ + ftp://ftp.acc.umu.se/mirror/CPAN/ =item Switzerland - http://cpan.mirror.solnet.ch/ - ftp://ftp.solnet.ch/mirror/CPAN/ - ftp://ftp.adwired.ch/CPAN/ + http://www.pirbot.com/mirrors/cpan/ http://mirror.switch.ch/ftp/mirror/CPAN/ ftp://mirror.switch.ch/mirror/CPAN/ =item Ukraine - http://cpan.makeperl.org/ - ftp://cpan.makeperl.org/pub/CPAN - http://cpan.org.ua/ - http://cpan.gafol.net/ - ftp://ftp.gafol.net/pub/cpan/ + http://cpan.ip-connect.vn.ua/ + ftp://cpan.ip-connect.vn.ua/mirror/cpan/ =item United Kingdom - http://www.mirrorservice.org/sites/ftp.funet.fi/pub/languages/perl/CPAN/ - ftp://ftp.mirrorservice.org/sites/ftp.funet.fi/pub/languages/perl/CPAN/ - http://mirror.tje.me.uk/pub/mirrors/ftp.cpan.org/ - ftp://mirror.tje.me.uk/pub/mirrors/ftp.cpan.org/ - http://www.mirror.8086.net/sites/CPAN/ - ftp://ftp.mirror.8086.net/sites/CPAN/ http://cpan.mirror.anlx.net/ ftp://ftp.mirror.anlx.net/CPAN/ 
http://mirror.bytemark.co.uk/CPAN/ ftp://mirror.bytemark.co.uk/CPAN/ + http://mirrors.coreix.net/CPAN/ http://cpan.etla.org/ - ftp://cpan.etla.org/pub/CPAN - ftp://ftp.demon.co.uk/pub/CPAN/ + ftp://cpan.etla.org/pub/CPAN/ + http://cpan.cpantesters.org/ + http://mirror.sax.uk.as61049.net/CPAN/ http://mirror.sov.uk.goscomb.net/CPAN/ - ftp://mirror.sov.uk.goscomb.net/pub/CPAN/ - http://ftp.plig.net/pub/CPAN/ - ftp://ftp.plig.net/pub/CPAN/ + http://www.mirrorservice.org/sites/cpan.perl.org/CPAN/ + ftp://ftp.mirrorservice.org/sites/cpan.perl.org/CPAN/ + http://mirror.ox.ac.uk/sites/www.cpan.org/ + ftp://mirror.ox.ac.uk/sites/www.cpan.org/ http://ftp.ticklers.org/pub/CPAN/ ftp://ftp.ticklers.org/pub/CPAN/ http://cpan.mirrors.uk2.net/ ftp://mirrors.uk2.net/pub/CPAN/ - http://mirror.ox.ac.uk/sites/www.cpan.org/ - ftp://mirror.ox.ac.uk/sites/www.cpan.org/ + http://mirror.ukhost4u.com/CPAN/ =back @@ -734,25 +770,20 @@ Generated by Porting/make_modlib_cpan.pl =over 4 -=item Bahamas - - http://www.securehost.com/mirror/CPAN/ - =item Canada - http://cpan.arcticnetwork.ca - ftp://mirror.arcticnetwork.ca/pub/CPAN - http://cpan.sunsite.ualberta.ca/ - ftp://cpan.sunsite.ualberta.ca/pub/CPAN/ - http://theoryx5.uwinnipeg.ca/pub/CPAN/ - ftp://theoryx5.uwinnipeg.ca/pub/CPAN/ - http://arwen.cs.dal.ca/mirror/CPAN/ - ftp://arwen.cs.dal.ca/pub/mirror/CPAN/ http://CPAN.mirror.rafal.ca/ ftp://CPAN.mirror.rafal.ca/pub/CPAN/ - ftp://ftp.nrc.ca/pub/CPAN/ - http://mirror.csclub.uwaterloo.ca/pub/CPAN/ - ftp://mirror.csclub.uwaterloo.ca/pub/CPAN/ + http://mirror.csclub.uwaterloo.ca/CPAN/ + ftp://mirror.csclub.uwaterloo.ca/CPAN/ + http://mirrors.gossamer-threads.com/CPAN/ + http://mirror.its.dal.ca/cpan/ + ftp://mirror.its.dal.ca/cpan/ + ftp://ftp.ottix.net/pub/CPAN/ + +=item Costa Rica + + http://mirrors.ucr.ac.cr/CPAN/ =item Mexico @@ -765,58 +796,43 @@ Generated by Porting/make_modlib_cpan.pl =item Alabama - http://mirror.hiwaay.net/CPAN/ - ftp://mirror.hiwaay.net/CPAN/ + 
http://mirror.teklinks.com/CPAN/ =item Arizona - http://cpan.ezarticleinformation.com/ + http://mirror.n5tech.com/CPAN/ + http://mirrors.namecheap.com/CPAN/ + ftp://mirrors.namecheap.com/CPAN/ =item California - http://cpan.knowledgematters.net/ - http://cpan.binkerton.com/ http://cpan.develooper.com/ - http://mirrors.gossamer-threads.com/CPAN - http://cpan.schatt.com/ - http://mirrors.kernel.org/cpan/ - ftp://mirrors.kernel.org/pub/CPAN - http://mirrors2.kernel.org/cpan/ - ftp://mirrors2.kernel.org/pub/CPAN/ - http://cpan.mirror.facebook.net/ - http://mirrors1.kernel.org/cpan/ - ftp://mirrors1.kernel.org/pub/CPAN/ - http://cpan-sj.viaverio.com/ - ftp://cpan-sj.viaverio.com/pub/CPAN/ + http://httpupdate127.cpanel.net/CPAN/ + http://mirrors.sonic.net/cpan/ + ftp://mirrors.sonic.net/cpan/ http://www.perl.com/CPAN/ - -=item Florida - - ftp://ftp.cise.ufl.edu/pub/mirrors/CPAN/ - http://mirror.atlantic.net/pub/CPAN/ - ftp://mirror.atlantic.net/pub/CPAN/ + http://cpan.yimg.com/ =item Idaho - http://mirror.its.uidaho.edu/pub/cpan/ - ftp://mirror.its.uidaho.edu/cpan/ + http://mirrors.syringanetworks.net/CPAN/ + ftp://mirrors.syringanetworks.net/CPAN/ =item Illinois http://cpan.mirrors.hoobly.com/ - http://cpan.uchicago.edu/pub/CPAN/ - ftp://cpan.uchicago.edu/pub/CPAN/ - http://mirrors.servercentral.net/CPAN/ - http://www.stathy.com/CPAN/ - ftp://www.stathy.com/CPAN/ + http://mirror.team-cymru.org/CPAN/ + ftp://mirror.team-cymru.org/CPAN/ =item Indiana - ftp://ftp.uwsg.iu.edu/pub/perl/CPAN/ http://cpan.netnitco.net/ ftp://cpan.netnitco.net/pub/mirrors/CPAN/ - http://ftp.ndlug.nd.edu/pub/perl/ - ftp://ftp.ndlug.nd.edu/pub/perl/ + ftp://ftp.uwsg.iu.edu/pub/perl/CPAN/ + +=item Kansas + + http://mirrors.concertpass.com/cpan/ =item Massachusetts @@ -824,22 +840,27 @@ Generated by Porting/make_modlib_cpan.pl =item Michigan - http://ftp.wayne.edu/cpan/ - ftp://ftp.wayne.edu/cpan/ + http://cpan.cse.msu.edu/ + ftp://cpan.cse.msu.edu/ + http://httpupdate118.cpanel.net/CPAN/ + 
http://mirrors-usa.go-parts.com/cpan/ + http://ftp.wayne.edu/CPAN/ + ftp://ftp.wayne.edu/CPAN/ -=item Minnesota +=item New Hampshire - http://cpan.msi.umn.edu/ + http://mirror.metrocast.net/cpan/ =item New Jersey http://mirror.datapipe.net/CPAN/ ftp://mirror.datapipe.net/pub/CPAN/ + http://www.hoovism.com/CPAN/ + ftp://ftp.hoovism.com/CPAN/ + http://cpan.mirror.nac.net/ =item New York - http://mirrors.24-7-solutions.net/pub/CPAN/ - ftp://mirrors.24-7-solutions.net/pub/CPAN/ http://mirror.cc.columbia.edu/pub/software/cpan/ ftp://mirror.cc.columbia.edu/pub/software/cpan/ http://cpan.belfry.net/ @@ -847,63 +868,62 @@ Generated by Porting/make_modlib_cpan.pl ftp://cpan.erlbaum.net/CPAN/ http://cpan.hexten.net/ ftp://cpan.hexten.net/ - ftp://mirror.nyi.net/CPAN/ - http://mirror.rit.edu/CPAN/ - ftp://mirror.rit.edu/CPAN/ + http://mirror.nyi.net/CPAN/ + ftp://mirror.nyi.net/pub/CPAN/ + http://noodle.portalus.net/CPAN/ + ftp://noodle.portalus.net/CPAN/ + http://mirrors.rit.edu/CPAN/ + ftp://mirrors.rit.edu/CPAN/ =item North Carolina - http://www.ibiblio.org/pub/mirrors/CPAN - ftp://ftp.ncsu.edu/pub/mirror/CPAN/ + http://httpupdate140.cpanel.net/CPAN/ + http://mirrors.ibiblio.org/CPAN/ =item Oregon http://ftp.osuosl.org/pub/CPAN/ ftp://ftp.osuosl.org/pub/CPAN/ + http://mirror.uoregon.edu/CPAN/ =item Pennsylvania - http://ftp.epix.net/CPAN/ - ftp://ftp.epix.net/pub/languages/perl/ http://cpan.pair.com/ ftp://cpan.pair.com/pub/CPAN/ + http://cpan.mirrors.ionfish.org/ =item South Carolina http://cpan.mirror.clemson.edu/ -=item Tennessee - - http://mira.sunsite.utk.edu/CPAN/ - =item Texas - http://mirror.uta.edu/CPAN + http://mirror.uta.edu/CPAN/ =item Utah + http://cpan.cs.utah.edu/ + ftp://cpan.cs.utah.edu/CPAN/ ftp://mirror.xmission.com/CPAN/ =item Virginia - http://cpan-du.viaverio.com/ - ftp://cpan-du.viaverio.com/pub/CPAN/ - http://perl.secsup.org/ - ftp://perl.secsup.org/pub/perl/ + http://mirror.cogentco.com/pub/CPAN/ ftp://mirror.cogentco.com/pub/CPAN/ + 
http://mirror.jmu.edu/pub/CPAN/ + ftp://mirror.jmu.edu/pub/CPAN/ + http://mirror.us.leaseweb.net/CPAN/ + ftp://mirror.us.leaseweb.net/CPAN/ =item Washington http://cpan.llarian.net/ ftp://cpan.llarian.net/pub/CPAN/ - ftp://ftp-mirror.internap.com/pub/CPAN/ =item Wisconsin - http://cpan.mirrors.tds.net - ftp://cpan.mirrors.tds.net/pub/CPAN - http://mirror.sit.wisc.edu/pub/CPAN/ - ftp://mirror.sit.wisc.edu/pub/CPAN/ + http://cpan.mirrors.tds.net/ + ftp://cpan.mirrors.tds.net/pub/CPAN/ =back @@ -915,19 +935,32 @@ Generated by Porting/make_modlib_cpan.pl =item Australia - http://mirror.internode.on.net/pub/cpan/ + http://mirror.as24220.net/pub/cpan/ + ftp://mirror.as24220.net/pub/cpan/ + http://cpan.mirrors.ilisys.com.au/ + http://cpan.mirror.digitalpacific.com.au/ ftp://mirror.internode.on.net/pub/cpan/ - http://cpan.mirror.aussiehq.net.au/ - http://mirror.as24220.net/cpan/ - ftp://mirror.as24220.net/cpan/ + http://mirror.optusnet.com.au/CPAN/ + http://cpan.mirror.serversaustralia.com.au/ + http://cpan.uberglobalmirror.com/ + http://mirror.waia.asn.au/pub/cpan/ + +=item New Caledonia + + http://cpan.lagoon.nc/pub/CPAN/ + ftp://cpan.lagoon.nc/pub/CPAN/ + http://cpan.nautile.nc/CPAN/ + ftp://cpan.nautile.nc/CPAN/ =item New Zealand ftp://ftp.auckland.ac.nz/pub/perl/CPAN/ - http://cpan.inspire.net.nz - ftp://cpan.inspire.net.nz/cpan http://cpan.catalyst.net.nz/CPAN/ ftp://cpan.catalyst.net.nz/pub/CPAN/ + http://cpan.inspire.net.nz/ + ftp://cpan.inspire.net.nz/cpan/ + http://mirror.webtastix.net/CPAN/ + ftp://mirror.webtastix.net/CPAN/ =back @@ -937,107 +970,117 @@ Generated by Porting/make_modlib_cpan.pl =item Argentina - http://cpan.patan.com.ar/ - http://cpan.localhost.net.ar - ftp://mirrors.localhost.net.ar/pub/mirrors/CPAN + http://cpan.mmgdesigns.com.ar/ =item Brazil - ftp://cpan.pop-mg.com.br/pub/CPAN/ - http://ftp.pucpr.br/CPAN - ftp://ftp.pucpr.br/CPAN http://cpan.kinghost.net/ + http://linorg.usp.br/CPAN/ + http://mirror.nbtelecom.com.br/CPAN/ =item Chile 
http://cpan.dcc.uchile.cl/ ftp://cpan.dcc.uchile.cl/pub/lang/cpan/ -=item Colombia - - http://www.laqee.unal.edu.co/CPAN/ - =back =head2 RSYNC Mirrors - mirror.as24220.net::cpan - cpan.inode.at::CPAN - gd.tuwien.ac.at::CPAN - ftp.belnet.be::packages/cpan - rsync.linorg.usp.br::CPAN - rsync.arcticnetwork.ca::CPAN - CPAN.mirror.rafal.ca::CPAN - mirror.csclub.uwaterloo.ca::CPAN - theoryx5.uwinnipeg.ca::CPAN - www.laqee.unal.edu.co::CPAN - mirror.uni-c.dk::CPAN - rsync.nic.funet.fi::CPAN - rsync://distrib-coffee.ipsl.jussieu.fr/pub/mirrors/cpan/ - mir1.ovh.net::CPAN - miroir-francais.fr::cpan - ftp.crihan.fr::CPAN - rsync://mirror.cict.fr/cpan/ - rsync://mirror.netcologne.de/cpan/ - ftp-stud.hs-esslingen.de::CPAN/ - ftp.gwdg.de::FTP/languages/perl/CPAN/ - cpan.mirror.iphh.net::CPAN - cpan.cpantesters.org::cpan - cpan.hu::CPAN - komo.vlsm.org::CPAN - mirror.unej.ac.id::cpan - ftp.esat.net::/pub/languages/perl/CPAN - ftp.heanet.ie::mirrors/ftp.perl.org/pub/CPAN - rsync.panu.it::CPAN - cpan.fastbull.org::CPAN - ftp.kddilabs.jp::cpan - ftp.nara.wide.ad.jp::cpan/ - rsync://ftp.jaist.ac.jp/pub/CPAN/ - rsync://ftp.riken.jp/cpan/ - mirror.linuxiso.kz::CPAN - rsync://mirrors3.kernel.org/mirrors/CPAN/ - rsync://rsync.osmirror.nl/cpan/ - mirror.leaseweb.com::CPAN - cpan.nautile.nc::CPAN - mirror.icis.pcz.pl::CPAN - piotrkosoft.net::mirrors/CPAN - rsync://cpan.perl.pt/ - ftp.kaist.ac.kr::cpan - cpan.sarang.net::CPAN - mirror.averse.net::cpan - rsync.oss.eznetsols.org - mirror.ac.za::cpan - ftp.is.co.za::IS-Mirror/ftp.cpan.org/ - rsync://ftp.gui.uva.es/cpan/ - rsync://mirrors4.kernel.org/mirrors/CPAN/ - ftp.solnet.ch::CPAN - ftp.ulak.net.tr::CPAN - gafol.net::cpan - rsync.mirrorservice.org::ftp.funet.fi/pub/ - rsync://rsync.mirror.8086.net/CPAN/ - rsync.mirror.anlx.net::CPAN - mirror.bytemark.co.uk::CPAN - ftp.plig.net::CPAN - rsync://ftp.ticklers.org:CPAN/ - mirrors.ibiblio.org::CPAN - cpan-du.viaverio.com::CPAN - mirror.hiwaay.net::CPAN - rsync://mira.sunsite.utk.edu/CPAN/ - 
cpan.mirrors.tds.net::CPAN - mirror.its.uidaho.edu::cpan - rsync://mirror.cc.columbia.edu::cpan/ - ftp.fxcorporate.com::CPAN - rsync.atlantic.net::CPAN - mirrors.kernel.org::mirrors/CPAN - rsync://mirrors2.kernel.org/mirrors/CPAN/ - cpan.pair.com::CPAN - rsync://mirror.rit.edu/CPAN/ - rsync://mirror.facebook.net/cpan/ - rsync://mirrors1.kernel.org/mirrors/CPAN/ - cpan-sj.viaverio.com::CPAN + rsync://ftp.is.co.za/IS-Mirror/ftp.cpan.org/ + rsync://mirror.ac.za/CPAN/ + rsync://mirror.zol.co.zw/CPAN/ + rsync://mirror.dhakacom.com/CPAN/ + rsync://mirrors.ustc.edu.cn/CPAN/ + rsync://mirrors.xmu.edu.cn/CPAN/ + rsync://kambing.ui.ac.id/CPAN/ + rsync://ftp.jaist.ac.jp/pub/CPAN/ + rsync://mirror.jre655.com/CPAN/ + rsync://ftp.kddilabs.jp/cpan/ + rsync://ftp.nara.wide.ad.jp/cpan/ + rsync://ftp.riken.jp/cpan/ + rsync://mirror.neolabs.kz/CPAN/ + rsync://mirror.qnren.qa/CPAN/ + rsync://ftp.neowiz.com/CPAN/ + rsync://mirror.0x.sg/CPAN/ + rsync://ftp.yzu.edu.tw/pub/CPAN/ + rsync://ftp.ubuntu-tw.org/CPAN/ + rsync://mirrors.digipower.vn/CPAN/ + rsync://cpan.inode.at/CPAN/ + rsync://ftp.byfly.by/CPAN/ + rsync://mirror.datacenter.by/CPAN/ + rsync://ftp.belnet.be/cpan/ + rsync://cpan.mirror.ba/CPAN/ + rsync://mirrors.neterra.net/CPAN/ + rsync://mirrors.netix.net/CPAN/ + rsync://mirror.dkm.cz/cpan/ + rsync://mirrors.nic.cz/CPAN/ + rsync://cpan.mirror.vutbr.cz/cpan/ + rsync://rsync.nic.funet.fi/CPAN/ + rsync://ftp.ciril.fr/pub/cpan/ + rsync://distrib-coffee.ipsl.jussieu.fr/pub/mirrors/cpan/ + rsync://cpan.mirrors.ovh.net/CPAN/ + rsync://mirror.de.leaseweb.net/CPAN/ + rsync://mirror.euserv.net/cpan/ + rsync://ftp-stud.hs-esslingen.de/CPAN/ + rsync://ftp.gwdg.de/pub/languages/perl/CPAN/ + rsync://ftp.hawo.stw.uni-erlangen.de/CPAN/ + rsync://cpan.mirror.iphh.net/CPAN/ + rsync://mirror.netcologne.de/cpan/ + rsync://ftp.halifax.rwth-aachen.de/cpan/ + rsync://ftp.ntua.gr/CPAN/ + rsync://mirror.met.hu/CPAN/ + rsync://ftp.heanet.ie/mirrors/ftp.perl.org/pub/CPAN/ + rsync://rsync.panu.it/CPAN/ + 
rsync://mirror.as43289.net/CPAN/ + rsync://rsync.cs.uu.nl/CPAN/ + rsync://mirror.nl.leaseweb.net/CPAN/ + rsync://ftp.nluug.nl/CPAN/ + rsync://mirror.transip.net/CPAN/ + rsync://cpan.uib.no/cpan/ + rsync://cpan.vianett.no/CPAN/ + rsync://cpan.perl-hackers.net/CPAN/ + rsync://cpan.perl.pt/cpan/ + rsync://mirrors.m247.ro/CPAN/ + rsync://mirrors.teentelecom.net/CPAN/ + rsync://cpan.webdesk.ru/CPAN/ + rsync://mirror.yandex.ru/mirrors/cpan/ + rsync://mirror.sbb.rs/CPAN/ + rsync://ftp.acc.umu.se/mirror/CPAN/ + rsync://rsync.pirbot.com/ftp/cpan/ + rsync://cpan.ip-connect.vn.ua/CPAN/ + rsync://rsync.mirror.anlx.net/CPAN/ + rsync://mirror.bytemark.co.uk/CPAN/ + rsync://mirror.sax.uk.as61049.net/CPAN/ + rsync://rsync.mirrorservice.org/cpan.perl.org/CPAN/ + rsync://ftp.ticklers.org/CPAN/ + rsync://mirrors.uk2.net/CPAN/ + rsync://CPAN.mirror.rafal.ca/CPAN/ + rsync://mirror.csclub.uwaterloo.ca/CPAN/ + rsync://mirrors.namecheap.com/CPAN/ + rsync://mirrors.syringanetworks.net/CPAN/ + rsync://mirror.team-cymru.org/CPAN/ + rsync://debian.cse.msu.edu/cpan/ + rsync://mirrors-usa.go-parts.com/mirrors/cpan/ + rsync://rsync.hoovism.com/CPAN/ + rsync://mirror.cc.columbia.edu/cpan/ + rsync://noodle.portalus.net/CPAN/ + rsync://mirrors.rit.edu/cpan/ + rsync://mirrors.ibiblio.org/CPAN/ + rsync://cpan.pair.com/CPAN/ + rsync://cpan.cs.utah.edu/CPAN/ + rsync://mirror.cogentco.com/CPAN/ + rsync://mirror.jmu.edu/CPAN/ + rsync://mirror.us.leaseweb.net/CPAN/ + rsync://cpan.mirror.digitalpacific.com.au/cpan/ + rsync://mirror.internode.on.net/cpan/ + rsync://uberglobalmirror.com/cpan/ + rsync://cpan.lagoon.nc/cpan/ + rsync://mirrors.mmgdesigns.com.ar/CPAN/ + For an up-to-date listing of CPAN sites, -see http://www.cpan.org/SITES or ftp://www.cpan.org/SITES . +see L<http://www.cpan.org/SITES> or L<ftp://www.cpan.org/SITES>. =head1 Modules: Creation, Use, and Abuse @@ -1362,43 +1405,22 @@ See perldoc ExtUtils::MakeMaker.pm for details. How to release and distribute a module. 
-It's good idea to post an announcement of the availability of your -module (or the module itself if small) to the comp.lang.perl.announce -Usenet newsgroup. This will at least ensure very wide once-off -distribution. +If possible, register the module with CPAN. Follow the instructions +and links on: -If possible, register the module with CPAN. You should -include details of its location in your announcement. - -Some notes about ftp archives: Please use a long descriptive file -name that includes the version number. Most incoming directories -will not be readable/listable, i.e., you won't be able to see your -file after uploading it. Remember to send your email notification -message as soon as possible after uploading else your file may get -deleted automatically. Allow time for the file to be processed -and/or check the file has been processed before announcing its -location. - -FTP Archives for Perl Modules: - -Follow the instructions and links on: - - http://www.cpan.org/modules/00modlist.long.html http://www.cpan.org/modules/04pause.html -or upload to one of these sites: +and upload to: - https://pause.kbx.de/pause/ http://pause.perl.org/ -and notify <modules@perl.org>. +and notify <modules@perl.org>. This will allow anyone to install +your module using the C<cpan> tool distributed with Perl. By using the WWW interface you can ask the Upload Server to mirror your modules from your ftp or WWW site into your own directory on CPAN! -Please remember to send me an updated entry for the Module list! - =item * Take care when changing a released module. 
diff --git a/gnu/usr.bin/perl/pod/perlmodstyle.pod b/gnu/usr.bin/perl/pod/perlmodstyle.pod index 62390a4917f..73e09b38cdd 100644 --- a/gnu/usr.bin/perl/pod/perlmodstyle.pod +++ b/gnu/usr.bin/perl/pod/perlmodstyle.pod @@ -798,7 +798,7 @@ L<ExtUtils::MakeMaker>, L<Module::Build> L<Test::Simple>, L<Test::Inline>, L<Carp::Assert>, L<Test::More>, L<Test::MockObject> -=item http://pause.perl.org/ +=item L<http://pause.perl.org/> Perl Authors Upload Server. Contains links to information for module authors. diff --git a/gnu/usr.bin/perl/pod/perlnewmod.pod b/gnu/usr.bin/perl/pod/perlnewmod.pod index eae2997aada..61cc7bfc418 100644 --- a/gnu/usr.bin/perl/pod/perlnewmod.pod +++ b/gnu/usr.bin/perl/pod/perlnewmod.pod @@ -78,7 +78,7 @@ Dig into a bunch of modules to see how they're written. I'd suggest starting with L<Text::Tabs|Text::Tabs>, since it's in the standard library and is nice and simple, and then looking at something a little more complex like L<File::Copy|File::Copy>. For object oriented -code, C<WWW::Mechanize> or the C<Email::*> modules provide some good +code, L<WWW::Mechanize> or the C<Email::*> modules provide some good examples. These should give you an overall feel for how modules are laid out and @@ -88,18 +88,18 @@ written. There are a lot of modules on CPAN, and it's easy to miss one that's similar to what you're planning on contributing. Have a good plough -through the L<http://search.cpan.org> and make sure you're not the one +through L<http://metacpan.org> and make sure you're not the one reinventing the wheel! =item Discuss the need You might love it. You might feel that everyone else needs it. But there might not actually be any real demand for it out there. If you're unsure -about the demand your module will have, consider sending out feelers -on the C<comp.lang.perl.modules> newsgroup, or as a last resort, ask the -modules list at C<modules@perl.org>. 
Remember that this is a closed list -with a very long turn-around time - be prepared to wait a good while for -a response from them. +about the demand your module will have, consider asking the +C<module-authors@perl.org> mailing list (send an email to +C<module-authors-subscribe@perl.org> to subscribe; see +L<http://lists.perl.org/list/module-authors.html> for more information +and a link to the archives). =item Choose a name @@ -212,13 +212,18 @@ more than just checking your module will compile. L<Test::Simple|Test::Simple> and L<Test::More|Test::More> are good places to start when writing a test suite. -=item Write the README +=item Write the F<README> If you're uploading to CPAN, the automated gremlins will extract the README file and place that in your CPAN directory. It'll also appear in the main F<by-module> and F<by-category> directories if you make it onto the modules list. It's a good idea to put here what the module actually -does in detail, and the user-visible changes since the last release. +does in detail. + +=item Write F<Changes> + +Add any user-visible changes since the last release to your F<Changes> +file. =back @@ -229,17 +234,18 @@ does in detail, and the user-visible changes since the last release. =item Get a CPAN user ID Every developer publishing modules on CPAN needs a CPAN ID. Visit -C<http://pause.perl.org/>, select "Request PAUSE Account", and wait for +C<L<http://pause.perl.org/>>, select "Request PAUSE Account", and wait for your request to be approved by the PAUSE administrators. -=item C<perl Makefile.PL; make test; make dist> +=item C<perl Makefile.PL; make test; make distcheck; make dist> Once again, C<module-starter> or C<h2xs> has done all the work for you. They produce the standard C<Makefile.PL> you see when you download and install modules, and this produces a Makefile with a C<dist> target. 
Once you've ensured that your module passes its own tests - always a -good thing to make sure - you can C<make dist>, and the Makefile will +good thing to make sure - you can C<make distcheck> to make sure +everything looks OK, followed by C<make dist>, and the Makefile will hopefully produce you a nice tarball of your module, ready for upload. =item Upload the tarball @@ -248,17 +254,8 @@ The email you got when you received your CPAN ID will tell you how to log in to PAUSE, the Perl Authors Upload SErver. From the menus there, you can upload your module to CPAN. -=item Announce to the modules list - -Once uploaded, it'll sit unnoticed in your author directory. If you want -it connected to the rest of the CPAN, you'll need to go to "Register -Namespace" on PAUSE. Once registered, your module will appear in the -by-module and by-category listings on CPAN. - -=item Announce to clpa - -If you have a burning desire to tell the world about your release, post -an announcement to the moderated C<comp.lang.perl.announce> newsgroup. +Alternatively you can use the F<cpan-upload> script, part of the +L<CPAN::Uploader> distribution on CPAN. =item Fix bugs! @@ -279,5 +276,5 @@ Updated by Kirrily "Skud" Robert, C<skud@cpan.org> L<perlmod>, L<perlmodlib>, L<perlmodinstall>, L<h2xs>, L<strict>, L<Carp>, L<Exporter>, L<perlpod>, L<Test::Simple>, L<Test::More> L<ExtUtils::MakeMaker>, L<Module::Build>, L<Module::Starter> -http://www.cpan.org/ , Ken Williams's tutorial on building your own -module at http://mathforum.org/~ken/perl_modules.html +L<http://www.cpan.org/>, Ken Williams' tutorial on building your own +module at L<http://mathforum.org/~ken/perl_modules.html> diff --git a/gnu/usr.bin/perl/pod/perlobj.pod b/gnu/usr.bin/perl/pod/perlobj.pod index 6513d8a8669..0060443e98e 100644 --- a/gnu/usr.bin/perl/pod/perlobj.pod +++ b/gnu/usr.bin/perl/pod/perlobj.pod @@ -131,7 +131,7 @@ documented methods on the object. 
Note, however, that (unlike most other OO languages) Perl does not ensure or enforce encapsulation in any way. If you want objects to actually I<be> opaque you need to arrange for that yourself. This can -be done in a variety of ways, including using L<"Inside-Out objects"> +be done in a variety of ways, including using L</"Inside-Out objects"> or modules from CPAN. =head3 Objects Are Blessed; Variables Are Not @@ -179,12 +179,12 @@ Each package contains a special array called C<@ISA>. The C<@ISA> array contains a list of that class's parent classes, if any. This array is examined when Perl does method resolution, which we will cover later. -It is possible to manually set C<@ISA>, and you may see this in older -Perl code. Much older code also uses the L<base> pragma. For new code, -we recommend that you use the L<parent> pragma to declare your parents. -This pragma will take care of setting C<@ISA>. It will also load the -parent classes and make sure that the package doesn't inherit from -itself. +Calling methods from a package means it must be loaded, of course, so +you will often want to load a module and add it to C<@ISA> at the same +time. You can do so in a single step using the L<parent> pragma. +(In older code you may encounter the L<base> pragma, which is nowadays +discouraged except when you have to work with the equally discouraged +L<fields> pragma.) However the parent classes are set, the package's C<@ISA> variable will contain a list of those parents. This is simply a list of scalars, each @@ -489,8 +489,8 @@ As you can see, we've stored the path and file data in the object itself. Remember, under the hood, this object is still just a hash. Later, we'll write accessors to manipulate this data. 
-For our File::MP3 class, we can check to make sure that the path we're -given ends with ".mp3": +For our C<File::MP3> class, we can check to make sure that the path +we're given ends with ".mp3": package File::MP3; @@ -581,6 +581,34 @@ X<method> Perl supports several other ways to call methods besides the C<< $object->method() >> usage we've seen so far. +=head3 Method Names with a Fully Qualified Name + +Perl allows you to call methods using their fully qualified name (the +package and method name): + + my $mp3 = File::MP3->new( 'Regin.mp3', $data ); + $mp3->File::save(); + +When you call a fully qualified method name like C<File::save>, the method +resolution search for the C<save> method starts in the C<File> class, +skipping any C<save> method the C<File::MP3> class may have defined. It +still searches the C<File> class's parents if necessary. + +While this feature is most commonly used to explicitly call methods +inherited from an ancestor class, there is no technical restriction +that enforces this: + + my $obj = Tree->new(); + $obj->Dog::bark(); + +This calls the C<bark> method from class C<Dog> on an object of class +C<Tree>, even if the two classes are completely unrelated. Use this +with great care. + +The C<SUPER> pseudo-class that was described earlier is I<not> the same +as calling a method with a fully-qualified name. See the earlier +L</Inheritance> section for details. + =head3 Method Names as Strings Perl lets you use a scalar variable containing a string as a method @@ -624,7 +652,7 @@ this idiom in the wild combined with a call to C<can>: $object->$meth(); } -=head3 Deferencing Method Call +=head3 Dereferencing Method Call Perl also lets you use a dereferenced scalar reference in a method call. That's a mouthful, so let's look at some code: @@ -903,15 +931,22 @@ argument. It does not receive any additional arguments. However, the C<$_[0]> variable will be read-only in the destructor, so you cannot assign a value to it. 
-If your C<DESTROY> method throws an error, this error will be ignored. -It will not be sent to C<STDERR> and it will not cause the program to -die. However, if your destructor is running inside an C<eval {}> block, -then the error will change the value of C<$@>. +If your C<DESTROY> method throws an exception, this will not cause +any control transfer beyond exiting the method. The exception will be +reported to C<STDERR> as a warning, marked "(in cleanup)", and Perl will +continue with whatever it was doing before. + +Because C<DESTROY> methods can be called at any time, you should localize +any global status variables that might be set by anything you do in +your C<DESTROY> method. If you are in doubt about a particular status +variable, it doesn't hurt to localize it. There are five global status +variables, and the safest way is to localize all five of them: -Because C<DESTROY> methods can be called at any time, you should -localize any global variables you might update in your C<DESTROY>. In -particular, if you use C<eval {}> you should localize C<$@>, and if you -use C<system> or backticks you should localize C<$?>. + sub DESTROY { + local($., $@, $!, $^E, $?); + my $self = shift; + ...; + } If you define an C<AUTOLOAD> in your class, then Perl will call your C<AUTOLOAD> to handle the C<DESTROY> method. You can prevent this by diff --git a/gnu/usr.bin/perl/pod/perlootut.pod b/gnu/usr.bin/perl/pod/perlootut.pod index 6d0ae03380e..b340dc6ea70 100644 --- a/gnu/usr.bin/perl/pod/perlootut.pod +++ b/gnu/usr.bin/perl/pod/perlootut.pod @@ -232,8 +232,8 @@ from C<File>. An C<File::MP3> B<is-a> I<more specific> type of C<File>. All mp3 files are files, but not all files are mp3 files. We often refer to inheritance relationships as B<parent-child> or -C<superclass>/C<subclass> relationships. Sometimes we say that the child -has an B<is-a> relationship with its parent class. +C<superclass>/C<subclass> relationships. 
Sometimes we say that the +child has an B<is-a> relationship with its parent class. C<File> is a B<superclass> of C<File::MP3>, and C<File::MP3> is a B<subclass> of C<File>. @@ -505,7 +505,7 @@ new C<File>. C<Moose> lets you define roles the same way you define classes: - package HasOnOfSwitch; + package HasOnOffSwitch; use Moose::Role; has is_on => ( @@ -650,8 +650,8 @@ constructor for your class. Finally, we have L<Class::Tiny>. This module truly lives up to its name. It has an incredibly minimal API and absolutely no dependencies -on any recent Perl. Still, we think it's a lot easier to use than writing -your own OO code from scratch. +on any recent Perl. Still, we think it's a lot easier to use than +writing your own OO code from scratch. Here's our C<File> class once more: @@ -715,9 +715,9 @@ to worry about details. =item * L<Role::Tiny> -Use C<Role::Tiny> with C<Class::Accessor> or C<Class::Tiny> if you -find yourself considering multiple inheritance. If you go with -C<Moose>, it comes with its own role implementation. +Use C<Role::Tiny> with C<Class::Accessor> or C<Class::Tiny> if you find +yourself considering multiple inheritance. If you go with C<Moose>, it +comes with its own role implementation. =back @@ -743,9 +743,11 @@ For small systems, L<Class::Tiny> and L<Class::Accessor> both provide minimal object systems that take care of basic boilerplate for you. For bigger projects, L<Moose> provides a rich set of features that will -let you focus on implementing your business logic. +let you focus on implementing your business logic. L<Moo> provides a +nice alternative to L<Moose> when you want a lot of features but need +faster compile time or to avoid XS. -We encourage you to play with and evaluate L<Moose>, +We encourage you to play with and evaluate L<Moose>, L<Moo>, L<Class::Accessor>, and L<Class::Tiny> to see which OO system is right for you. 
diff --git a/gnu/usr.bin/perl/pod/perlop.pod b/gnu/usr.bin/perl/pod/perlop.pod index a9716c3524c..7f0faaa4cbc 100644 --- a/gnu/usr.bin/perl/pod/perlop.pod +++ b/gnu/usr.bin/perl/pod/perlop.pod @@ -27,17 +27,37 @@ X<operator, precedence> X<precedence> X<associativity> Operator precedence and associativity work in Perl more or less like they do in mathematics. -I<Operator precedence> means some operators are evaluated before -others. For example, in S<C<2 + 4 * 5>>, the multiplication has higher -precedence so S<C<4 * 5>> is evaluated first yielding S<C<2 + 20 == -22>> and not S<C<6 * 5 == 30>>. - -I<Operator associativity> defines what happens if a sequence of the -same operators is used one after another: whether the evaluator will -evaluate the left operations first, or the right first. For example, in -S<C<8 - 4 - 2>>, subtraction is left associative so Perl evaluates the -expression left to right. S<C<8 - 4>> is evaluated first making the -expression S<C<4 - 2 == 2>> and not S<C<8 - 2 == 6>>. +I<Operator precedence> means some operators group more tightly than others. +For example, in C<2 + 4 * 5>, the multiplication has higher precedence, so C<4 +* 5> is grouped together as the right-hand operand of the addition, rather +than C<2 + 4> being grouped together as the left-hand operand of the +multiplication. It is as if the expression were written C<2 + (4 * 5)>, not +C<(2 + 4) * 5>. So the expression yields C<2 + 20 == 22>, rather than +C<6 * 5 == 30>. + +I<Operator associativity> defines what happens if a sequence of the same +operators is used one after another: whether they will be grouped at the left +or the right. For example, in C<9 - 3 - 2>, subtraction is left associative, +so C<9 - 3> is grouped together as the left-hand operand of the second +subtraction, rather than C<3 - 2> being grouped together as the right-hand +operand of the first subtraction. It is as if the expression were written +C<(9 - 3) - 2>, not C<9 - (3 - 2)>. 
So the expression yields C<6 - 2 == 4>, +rather than C<9 - 1 == 8>. + +For simple operators that evaluate all their operands and then combine the +values in some way, precedence and associativity (and parentheses) imply some +ordering requirements on those combining operations. For example, in C<2 + 4 * +5>, the grouping implied by precedence means that the multiplication of 4 and +5 must be performed before the addition of 2 and 20, simply because the result +of that multiplication is required as one of the operands of the addition. But +the order of operations is not fully determined by this: in C<2 * 2 + 4 * 5> +both multiplications must be performed before the addition, but the grouping +does not say anything about the order in which the two multiplications are +performed. In fact Perl has a general rule that the operands of an operator +are evaluated in left-to-right order. A few operators such as C<&&=> have +special evaluation rules that can result in an operand not being evaluated at +all; in general, the top-level operator in an expression has control of +operand evaluation. Perl operators have the following associativity and precedence, listed from highest precedence to lowest. Operators borrowed from @@ -71,7 +91,8 @@ values only, not array values. left and left or xor -In the following sections, these operators are covered in precedence order. +In the following sections, these operators are covered in detail, in the +same order in which they appear in the table above. Many operators can be overloaded for objects. See L<overload>. @@ -128,13 +149,13 @@ To do what you meant properly, you must write: print(($foo & 255) + 1, "\n"); -See L<Named Unary Operators> for more discussion of this. +See L</Named Unary Operators> for more discussion of this. Also parsed as terms are the S<C<do {}>> and S<C<eval {}>> constructs, as well as subroutine and method calls, and the anonymous constructors C<[]> and C<{}>. 
-See also L<Quote and Quote-like Operators> toward the end of this section, +See also L</Quote and Quote-like Operators> toward the end of this section, as well as L</"I/O Operators">. =head2 The Arrow Operator @@ -216,8 +237,8 @@ are platform-dependent. =head2 Symbolic Unary Operators X<unary operator> X<operator, unary> -Unary C<"!"> performs logical negation, that is, "not". See also C<not> for a lower -precedence version of this. +Unary C<"!"> performs logical negation, that is, "not". See also +L<C<not>|/Logical Not> for a lower precedence version of this. X<!> Unary C<"-"> performs arithmetic negation if the operand is numeric, @@ -235,25 +256,23 @@ B<Argument "the string" isn't numeric in negation (-) at ...>. X<-> X<negation, arithmetic> Unary C<"~"> performs bitwise negation, that is, 1's complement. For -example, S<C<0666 & ~027>> is 0640. (See also L<Integer Arithmetic> and -L<Bitwise String Operators>.) Note that the width of the result is +example, S<C<0666 & ~027>> is 0640. (See also L</Integer Arithmetic> and +L</Bitwise String Operators>.) Note that the width of the result is platform-dependent: C<~0> is 32 bits wide on a 32-bit platform, but 64 bits wide on a 64-bit platform, so if you are expecting a certain bit width, remember to use the C<"&"> operator to mask off the excess bits. X<~> X<negation, binary> -When complementing strings, if all characters have ordinal values under -256, then their complements will, also. But if they do not, all -characters will be in either 32- or 64-bit complements, depending on your -architecture. So for example, C<~"\x{3B1}"> is C<"\x{FFFF_FC4E}"> on -32-bit machines and C<"\x{FFFF_FFFF_FFFF_FC4E}"> on 64-bit machines. +Starting in Perl 5.28, it is a fatal error to try to complement a string +containing a character with an ordinal value above 255. 
-If the experimental "bitwise" feature is enabled via S<C<use feature -'bitwise'>>, then unary C<"~"> always treats its argument as a number, and an +If the "bitwise" feature is enabled via S<C<use +feature 'bitwise'>> or C<use v5.28>, then unary +C<"~"> always treats its argument as a number, and an alternate form of the operator, C<"~.">, always treats its argument as a string. So C<~0> and C<~"0"> will both give 2**32-1 on 32-bit platforms, -whereas C<~.0> and C<~."0"> will both yield C<"\xff">. This feature -produces a warning unless you use S<C<no warnings 'experimental::bitwise'>>. +whereas C<~.0> and C<~."0"> will both yield C<"\xff">. Until Perl 5.28, +this feature produced a warning in the C<"experimental::bitwise"> category. Unary C<"+"> has no effect whatsoever, even on strings. It is useful syntactically for separating a function name from a parenthesized expression @@ -261,7 +280,12 @@ that would otherwise be interpreted as the complete list of function arguments. (See examples above under L</Terms and List Operators (Leftward)>.) X<+> -Unary C<"\"> creates a reference to whatever follows it. See L<perlreftut> +Unary C<"\"> creates references. If its operand is a single sigilled +thing, it creates a reference to that object. If its operand is a +parenthesised list, then it creates references to the things mentioned +in the list. Otherwise it puts its operand in list context, and creates +a list of references to the scalars in the list provided by the operand. +See L<perlreftut> and L<perlref>. Do not confuse this behavior with the behavior of backslash within a string, although both forms do convey the notion of protecting the next thing from interpolation. @@ -332,11 +356,16 @@ operator is not as well defined for negative operands, but it will execute faster. X<%> X<remainder> X<modulo> X<mod> -Binary C<"x"> is the repetition operator. 
In scalar context or if the left -operand is not enclosed in parentheses, it returns a string consisting -of the left operand repeated the number of times specified by the right -operand. In list context, if the left operand is enclosed in -parentheses or is a list formed by C<qw/I<STRING>/>, it repeats the list. +Binary C<x> is the repetition operator. In scalar context, or if the +left operand is neither enclosed in parentheses nor a C<qw//> list, +it performs a string repetition. In that case it supplies scalar +context to the left operand, and returns a string consisting of the +left operand string repeated the number of times specified by the right +operand. If the C<x> is in list context, and the left operand is either +enclosed in parentheses or a C<qw//> list, it performs a list repetition. +In that case it supplies list context to the left operand, and returns +a list consisting of the left operand list repeated the number of times +specified by the right operand. If the right operand is zero or negative (raising a warning on negative), it returns an empty string or an empty list, depending on the context. @@ -370,13 +399,13 @@ X<shl> X<shr> X<shift, right> X<shift, left> Binary C<<< "<<" >>> returns the value of its left argument shifted left by the number of bits specified by the right argument. Arguments should be -integers. (See also L<Integer Arithmetic>.) +integers. (See also L</Integer Arithmetic>.) Binary C<<< ">>" >>> returns the value of its left argument shifted right by the number of bits specified by the right argument. Arguments should -be integers. (See also L<Integer Arithmetic>.) +be integers. (See also L</Integer Arithmetic>.) -If S<C<use integer>> (see L<Integer Arithmetic>) is in force then +If S<C<use integer>> (see L</Integer Arithmetic>) is in force then signed C integers are used (I<arithmetic shift>), otherwise unsigned C integers are used (I<logical shift>), even for negative shiftees. 
In arithmetic right shift the sign bit is replicated on the left, @@ -403,7 +432,7 @@ If you get tired of being subject to your platform's native integers, the S<C<use bigint>> pragma neatly sidesteps the issue altogether: print 20 << 20; # 20971520 - print 20 << 40; # 5120 on 32-bit machines, + print 20 << 40; # 5120 on 32-bit machines, # 21990232555520 on 64-bit machines use bigint; print 20 << 100; # 25353012004564588029934064107520 @@ -443,12 +472,12 @@ parenthesis rule. That means, for example, that C<-f($file).".bak"> is equivalent to S<C<-f "$file.bak">>. X<-X> X<filetest> X<operator, filetest> -See also L<"Terms and List Operators (Leftward)">. +See also L</"Terms and List Operators (Leftward)">. =head2 Relational Operators X<relational operator> X<operator, relational> -Perl operators that return true or false generally return values +Perl operators that return true or false generally return values that can be safely used as numbers. For example, the relational operators in this section and the equality operators in the next one return C<1> for true and a special version of the defined empty @@ -541,7 +570,7 @@ The standard C<L<Unicode::Collate>> and C<L<Unicode::Collate::Locale>> modules offer much more powerful solutions to collation issues. -For case-insensitive comparisions, look at the L<perlfunc/fc> case-folding +For case-insensitive comparisons, look at the L<perlfunc/fc> case-folding function, available in Perl v5.16 or later: if ( fc($x) eq fc($y) ) { ... } @@ -578,81 +607,81 @@ whose types apply determines the smartmatch behavior. Because what actually happens is mostly determined by the type of the second operand, the table is sorted on the right operand instead of on the left. 
- Left Right Description and pseudocode + Left Right Description and pseudocode =============================================================== - Any undef check whether Any is undefined + Any undef check whether Any is undefined like: !defined Any Any Object invoke ~~ overloading on Object, or die Right operand is an ARRAY: - Left Right Description and pseudocode + Left Right Description and pseudocode =============================================================== ARRAY1 ARRAY2 recurse on paired elements of ARRAY1 and ARRAY2[2] like: (ARRAY1[0] ~~ ARRAY2[0]) && (ARRAY1[1] ~~ ARRAY2[1]) && ... - HASH ARRAY any ARRAY elements exist as HASH keys + HASH ARRAY any ARRAY elements exist as HASH keys like: grep { exists HASH->{$_} } ARRAY Regexp ARRAY any ARRAY elements pattern match Regexp like: grep { /Regexp/ } ARRAY - undef ARRAY undef in ARRAY + undef ARRAY undef in ARRAY like: grep { !defined } ARRAY - Any ARRAY smartmatch each ARRAY element[3] + Any ARRAY smartmatch each ARRAY element[3] like: grep { Any ~~ $_ } ARRAY Right operand is a HASH: - Left Right Description and pseudocode + Left Right Description and pseudocode =============================================================== - HASH1 HASH2 all same keys in both HASHes + HASH1 HASH2 all same keys in both HASHes like: keys HASH1 == grep { exists HASH2->{$_} } keys HASH1 - ARRAY HASH any ARRAY elements exist as HASH keys + ARRAY HASH any ARRAY elements exist as HASH keys like: grep { exists HASH->{$_} } ARRAY - Regexp HASH any HASH keys pattern match Regexp + Regexp HASH any HASH keys pattern match Regexp like: grep { /Regexp/ } keys HASH - undef HASH always false (undef can't be a key) + undef HASH always false (undef can't be a key) like: 0 == 1 - Any HASH HASH key existence + Any HASH HASH key existence like: exists HASH->{Any} Right operand is CODE: - Left Right Description and pseudocode + Left Right Description and pseudocode =============================================================== ARRAY CODE sub 
returns true on all ARRAY elements[1] like: !grep { !CODE->($_) } ARRAY HASH CODE sub returns true on all HASH keys[1] like: !grep { !CODE->($_) } keys HASH - Any CODE sub passed Any returns true + Any CODE sub passed Any returns true like: CODE->(Any) Right operand is a Regexp: - Left Right Description and pseudocode + Left Right Description and pseudocode =============================================================== - ARRAY Regexp any ARRAY elements match Regexp + ARRAY Regexp any ARRAY elements match Regexp like: grep { /Regexp/ } ARRAY - HASH Regexp any HASH keys match Regexp + HASH Regexp any HASH keys match Regexp like: grep { /Regexp/ } keys HASH - Any Regexp pattern match + Any Regexp pattern match like: Any =~ /Regexp/ Other: - Left Right Description and pseudocode + Left Right Description and pseudocode =============================================================== Object Any invoke ~~ overloading on Object, or fall back to... - Any Num numeric equality + Any Num numeric equality like: Any == Num Num nummy[4] numeric equality like: Num == nummy undef Any check whether undefined like: !defined(Any) - Any Any string equality + Any Any string equality like: Any eq Any @@ -661,13 +690,13 @@ Notes: =over =item 1. -Empty hashes or arrays match. +Empty hashes or arrays match. =item 2. That is, each element smartmatches the element of the same index in the other array.[3] =item 3. -If a circular reference is found, fall back to referential equality. +If a circular reference is found, fall back to referential equality. =item 4. Either an actual number, or a string that looks like one. @@ -722,7 +751,7 @@ recursively. my @bigger = ("red", "blue", [ "orange", "green" ] ); if (@little ~~ @bigger) { # true! say "little is contained in bigger"; - } + } Because the smartmatch operator recurses on nested arrays, this will still report that "red" is in the array. 
@@ -736,21 +765,21 @@ If two arrays smartmatch each other, then they are deep copies of each others' values, as this example reports: use v5.12.0; - my @a = (0, 1, 2, [3, [4, 5], 6], 7); - my @b = (0, 1, 2, [3, [4, 5], 6], 7); + my @a = (0, 1, 2, [3, [4, 5], 6], 7); + my @b = (0, 1, 2, [3, [4, 5], 6], 7); if (@a ~~ @b && @b ~~ @a) { say "a and b are deep copies of each other"; - } + } elsif (@a ~~ @b) { say "a smartmatches in b"; - } + } elsif (@b ~~ @a) { say "b smartmatches in a"; - } + } else { say "a and b don't smartmatch each other at all"; - } + } If you were to set S<C<$b[3] = 4>>, then instead of reporting that "a and b @@ -817,7 +846,7 @@ C<I<X>>, overloading may or may not be invoked. For simple strings or numbers, "in" becomes equivalent to this: $object ~~ $number ref($object) == $number - $object ~~ $string ref($object) eq $string + $object ~~ $string ref($object) eq $string For example, this reports that the handle smells IOish (but please don't really do this!): @@ -826,7 +855,7 @@ For example, this reports that the handle smells IOish my $fh = IO::Handle->new(); if ($fh ~~ /\bIO\b/) { say "handle smells IOish"; - } + } That's because it treats C<$fh> as a string like C<"IO::Handle=GLOB(0x8039e0)">, then pattern matches against that. @@ -837,17 +866,17 @@ X<operator, bitwise, and> X<bitwise and> X<&> Binary C<"&"> returns its operands ANDed together bit by bit. Although no warning is currently raised, the result is not well defined when this operation is performed on operands that aren't either numbers (see -L<Integer Arithmetic>) nor bitstrings (see L<Bitwise String Operators>). +L</Integer Arithmetic>) nor bitstrings (see L</Bitwise String Operators>). Note that C<"&"> has lower priority than relational operators, so for example the parentheses are essential in a test like print "Even\n" if ($x & 1) == 0; -If the experimental "bitwise" feature is enabled via S<C<use feature -'bitwise'>>, then this operator always treats its operand as numbers. 
This -feature produces a warning unless you also use C<S<no warnings -'experimental::bitwise'>>. +If the "bitwise" feature is enabled via S<C<use feature 'bitwise'>> or +C<use v5.28>, then this operator always treats its operands as numbers. +Before Perl 5.28 this feature produced a warning in the +C<"experimental::bitwise"> category. =head2 Bitwise Or and Exclusive Or X<operator, bitwise, or> X<bitwise or> X<|> X<operator, bitwise, xor> @@ -859,7 +888,7 @@ Binary C<"^"> returns its operands XORed together bit by bit. Although no warning is currently raised, the results are not well defined when these operations are performed on operands that aren't either -numbers (see L<Integer Arithmetic>) nor bitstrings (see L<Bitwise String +numbers (see L</Integer Arithmetic>) nor bitstrings (see L</Bitwise String Operators>). Note that C<"|"> and C<"^"> have lower priority than relational operators, so @@ -867,10 +896,10 @@ for example the parentheses are essential in a test like print "false\n" if (8 | 2) != 10; -If the experimental "bitwise" feature is enabled via S<C<use feature -'bitwise'>>, then this operator always treats its operand as numbers. This -feature produces a warning unless you also use S<C<no warnings -'experimental::bitwise'>>. +If the "bitwise" feature is enabled via S<C<use feature 'bitwise'>> or +C<use v5.28>, then this operator always treats its operands as numbers. +Before Perl 5.28, this feature produced a warning in the +C<"experimental::bitwise"> category. =head2 C-style Logical And X<&&> X<logical and> X<operator, logical, and> @@ -939,7 +968,7 @@ It would be even more readable to write that this way: unless(unlink("alpha", "beta", "gamma")) { gripe(); next LINE; - } + } Using C<"or"> for assignment is unlikely to do what you want; see below. @@ -1073,6 +1102,12 @@ If the final value specified is not in the sequence that the magical increment would produce, the sequence goes until the next value would be longer than the final value specified.
+As of Perl 5.26, the list-context range operator on strings works as expected +in the scope of L<< S<C<use feature 'unicode_strings'>>|feature/The +'unicode_strings' feature >>. In previous versions, and outside the scope of +that feature, it exhibits L<perlunicode/The "Unicode Bug">: its behavior +depends on the internal encoding of the range endpoint. + If the initial value specified isn't part of a magical increment sequence (that is, a non-empty string matching C</^[a-zA-Z]*[0-9]*\z/>), only the initial value will be returned. So the following will only @@ -1085,9 +1120,9 @@ To get the 25 traditional lowercase Greek letters, including both sigmas, you could use this instead: use charnames "greek"; - my @greek_small = map { chr } ( ord("\N{alpha}") + my @greek_small = map { chr } ( ord("\N{alpha}") .. - ord("\N{omega}") + ord("\N{omega}") ); However, because there are I<many> other lowercase Greek characters than @@ -1199,7 +1234,7 @@ the number of elements produced by the expression on the right hand side of the assignment. The three dotted bitwise assignment operators (C<&.=> C<|.=> C<^.=>) are new in -Perl 5.22 and experimental. See L</Bitwise String Operators>. +Perl 5.22. See L</Bitwise String Operators>.
=head2 Comma Operator X<comma> X<operator, comma> X<,> @@ -1261,16 +1296,19 @@ The only operators with lower precedence are the logical operators C<"and">, C<"or">, and C<"not">, which may be used to evaluate calls to list operators without the need for parentheses: - open HANDLE, "< :utf8", "filename" or die "Can't open: $!\n"; + open HANDLE, "< :encoding(UTF-8)", "filename" + or die "Can't open: $!\n"; However, some people find that code harder to read than writing it with parentheses: - open(HANDLE, "< :utf8", "filename") or die "Can't open: $!\n"; + open(HANDLE, "< :encoding(UTF-8)", "filename") + or die "Can't open: $!\n"; in which case you might as well just use the more customary C<"||"> operator: - open(HANDLE, "< :utf8", "filename") || die "Can't open: $!\n"; + open(HANDLE, "< :encoding(UTF-8)", "filename") + || die "Can't open: $!\n"; See also discussion of list operators in L</Terms and List Operators (Leftward)>. @@ -1388,7 +1426,8 @@ Note, however, that this does not always work for quoting Perl code: is a syntax error. The C<L<Text::Balanced>> module (standard as of v5.8, and from CPAN before then) is able to do this properly. -There can be whitespace between the operator and the quoting +There can (and in some cases, must) be whitespace between the operator +and the quoting characters, except when C<#> is being used as the quoting character. C<q#foo#> is parsed as the string C<foo>, while S<C<q #foo#>> is the operator C<q> followed by a comment. Its argument will be taken @@ -1397,6 +1436,12 @@ from the next line. This allows you to write: s {foo} # Replace foo {bar} # with bar. +The cases where whitespace must be used are when the quoting character +is a word character (meaning it matches C</\w/>): + + q XfooX # Works: means the string 'foo' + qXfooX # WRONG! 
+ The following escape sequences are available in constructs that interpolate, and in transliterations: X<\t> X<\n> X<\r> X<\f> X<\b> X<\a> X<\e> X<\x> X<\0> X<\c> X<\N> X<\N{}> @@ -1555,12 +1600,9 @@ as a Unicode code point no matter what the native encoding is. The name of the character in the 256th position (indexed by 0) in Unicode is C<LATIN CAPITAL LETTER A WITH MACRON>. -There are a couple of exceptions to the above rule. S<C<\N{U+I<hex number>}>> is +An exception to the above rule is that S<C<\N{U+I<hex number>}>> is always interpreted as a Unicode code point, so that C<\N{U+0050}> is C<"P"> even -on EBCDIC platforms. And if C<S<L<use encoding|encoding>>> is in effect, the -number is considered to be in that encoding, and is translated from that into -the platform's native encoding if there is a corresponding native character; -otherwise to Unicode. +on EBCDIC platforms. =back @@ -1618,7 +1660,7 @@ and although they often accept just C<"\012">, they seldom tolerate just C<"\015">. If you get in the habit of using C<"\n"> for networking, you may be burned some day. X<newline> X<line terminator> X<eol> X<end of line> -X<\r> +X<\n> X<\r> X<\r\n> For constructs that do interpolate, variables beginning with "C<$>" or "C<@>" are interpolated. Subscripted variables such as C<$a[3]> or @@ -1675,12 +1717,12 @@ X<qr> X</i> X</m> X</o> X</s> X</x> X</p> This operator quotes (and possibly compiles) its I<STRING> as a regular expression. I<STRING> is interpolated the same way as I<PATTERN> -in C<m/I<PATTERN>/>. If C<"'"> is used as the delimiter, no interpolation -is done. Returns a Perl value which may be used instead of the +in C<m/I<PATTERN>/>. If C<"'"> is used as the delimiter, no variable +interpolation is done. Returns a Perl value which may be used instead of the corresponding C</I<STRING>/msixpodualn> expression. The returned value is a normalized version of the original pattern. 
It magically differs from a string containing the same characters: C<ref(qr/x/)> returns "Regexp"; -however, dereferencing it is not well defined (you currently get the +however, dereferencing it is not well defined (you currently get the normalized version of the original pattern, but this may change). @@ -1729,15 +1771,18 @@ Options (specified by the following modifiers) are: m Treat string as multiple lines. s Treat string as single line. (Make . match a newline) i Do case-insensitive pattern matching. - x Use extended regular expressions. + x Use extended regular expressions; specifying two + x's means \t and the SPACE character are ignored within + square-bracketed character classes p When matching preserve a copy of the matched string so that ${^PREMATCH}, ${^MATCH}, ${^POSTMATCH} will be defined (ignored starting in v5.20) as these are always - defined starting in that relese + defined starting in that release o Compile pattern only once. - a ASCII-restrict: Use ASCII for \d, \s, \w; specifying two - a's further restricts things to that that no ASCII - character will match a non-ASCII one under /i. + a ASCII-restrict: Use ASCII for \d, \s, \w and [[:posix:]] + character classes; specifying two a's adds the further + restriction that no ASCII character will match a + non-ASCII one under /i. l Use the current run-time locale's rules. u Use Unicode rules. d Use Unicode or native charset, as in 5.12 and earlier. @@ -1748,7 +1793,7 @@ of C<"msixpluadn"> will be propagated appropriately. The effect that the C</o> modifier has is not propagated, being restricted to those patterns explicitly using it. -The last four modifiers listed above, added in Perl 5.14, +The C</a>, C</d>, C</l>, and C</u> modifiers (added in Perl 5.14) control the character set rules, but C</a> is the only one you are likely to want to specify explicitly; the other three are selected automatically by various pragmas. @@ -1785,7 +1830,7 @@ as delimiters. 
This is particularly useful for matching path names that contain C<"/">, to avoid LTS (leaning toothpick syndrome). If C<"?"> is the delimiter, then a match-only-once rule applies, described in C<m?I<PATTERN>?> below. If C<"'"> (single quote) is the delimiter, -no interpolation is performed on the I<PATTERN>. +no variable interpolation is performed on the I<PATTERN>. When using a delimiter character valid in an identifier, whitespace is required after the C<m>. @@ -1858,7 +1903,7 @@ If the C</g> option is not used, C<m//> in list context returns a list consisting of the subexpressions matched by the parentheses in the pattern, that is, (C<$1>, C<$2>, C<$3>...) (Note that here C<$1> etc. are also set). When there are no parentheses in the pattern, the return -value is the list C<(1)> for success. +value is the list C<(1)> for success. With or without parentheses, an empty list is returned upon failure. Examples: @@ -1981,8 +2026,8 @@ The last example should print: Notice that the final match matched C<q> instead of C<p>, which a match without the C<\G> anchor would have done. Also note that the final match did not update C<pos>. C<pos> is only updated on a C</g> match. If the -final match did indeed match C<p>, it's a good bet that you're running a -very old (pre-5.6.0) version of Perl. +final match did indeed match C<p>, it's a good bet that you're running an +ancient (pre-5.6.0) version of Perl. A useful idiom for C<lex>-like scanners is C</\G.../gc>. You can combine several regexps like this to process a string part-by-part, @@ -2019,8 +2064,6 @@ Here is the output (split into several lines): =item C<m?I<PATTERN>?msixpodualngc> X<?> X<operator, match-once> -=item C<?I<PATTERN>?msixpodualngc> - This is just like the C<m/I<PATTERN>/> search, except that it matches only once between calls to the C<reset()> operator. This is a useful optimization when you want to see only the first occurrence of @@ -2049,12 +2092,13 @@ syntax error. 
If you encounter this construct in older code, you can just add C<m>. =item C<s/I<PATTERN>/I<REPLACEMENT>/msixpodualngcer> -X<substitute> X<substitution> X<replace> X<regexp, replace> +X<s> X<substitute> X<substitution> X<replace> X<regexp, replace> X<regexp, substitute> X</m> X</s> X</i> X</x> X</p> X</o> X</g> X</c> X</e> X</r> Searches a string for a pattern, and if found, replaces that pattern with the replacement text and returns the number of substitutions -made. Otherwise it returns false (specifically, the empty string). +made. Otherwise it returns false (a value that is both an empty string (C<"">) +and numeric zero (C<0>) as described in L</Relational Operators>). If the C</r> (non-destructive) option is used then it runs the substitution on a copy of the string and instead of returning the @@ -2069,7 +2113,7 @@ the string specified must be a scalar variable, an array element, a hash element, or an assignment to one of those; that is, some sort of scalar lvalue. -If the delimiter chosen is a single quote, no interpolation is +If the delimiter chosen is a single quote, no variable interpolation is done on either the I<PATTERN> or the I<REPLACEMENT>. Otherwise, if the I<PATTERN> contains a C<$> that looks like a variable rather than an end-of-string test, the variable will be interpolated into the pattern @@ -2278,7 +2322,7 @@ On some platforms (notably DOS-like ones), the shell may not be capable of dealing with multiline commands, so putting newlines in the string may not get you what you want. You may be able to evaluate multiple commands in a single line by separating them with the command -separator character, if your shell supports that (for example, C<;> on +separator character, if your shell supports that (for example, C<;> on many Unix shells and C<&> on the Windows NT C<cmd> shell). Perl will attempt to flush all files opened for @@ -2316,6 +2360,12 @@ failure modes by inspecting C<$?> like this: printf "child exited with value %d\n", $? 
>> 8; } +Use the L<open> pragma to control the I/O layers used when reading the +output of the command, for example: + + use open IN => ":encoding(UTF-8)"; + my $x = `cmd-producing-utf-8`; + See L</"I/O Operators"> for more discussion. =item C<qw/I<STRING>/> @@ -2327,7 +2377,8 @@ equivalent to: split(" ", q/STRING/); -the differences being that it generates a real list at compile time, and +the differences being that it only splits on ASCII whitespace, +generates a real list at compile time, and in scalar context it returns the last element in the list. So this expression: @@ -2370,25 +2421,27 @@ of those; in other words, an lvalue. A character range may be specified with a hyphen, so C<tr/A-J/0-9/> does the same replacement as C<tr/ACEGIBDFHJ/0246813579/>. For B<sed> devotees, C<y> is provided as a synonym for C<tr>. If the -I<SEARCHLIST> is delimited by bracketing quotes, the I<REPLACEMENTLIST> has -its own pair of quotes, which may or may not be bracketing quotes; -for example, C<tr[aeiouy][yuoiea]> or C<tr(+\-*/)/ABCD/>. +I<SEARCHLIST> is delimited by bracketing quotes, the I<REPLACEMENTLIST> +must have its own pair of quotes, which may or may not be bracketing +quotes; for example, C<tr[aeiouy][yuoiea]> or C<tr(+\-*/)/ABCD/>. Characters may be literals or any of the escape sequences accepted in -double-quoted strings. But there is no interpolation, so C<"$"> and -C<"@"> are treated as literals. A hyphen at the beginning or end, or +double-quoted strings. But there is no variable interpolation, so C<"$"> +and C<"@"> are treated as literals. A hyphen at the beginning or end, or preceded by a backslash is considered a literal. Escape sequence details are in L<the table near the beginning of this section|/Quote and Quote-like Operators>. Note that C<tr> does B<not> do regular expression character classes such as C<\d> or C<\pL>. The C<tr> operator is not equivalent to the C<L<tr(1)>> -utility. 
If you want to map strings between lower/upper cases, see -L<perlfunc/lc> and L<perlfunc/uc>, and in general consider using the C<s> -operator if you need regular expressions. The C<\U>, C<\u>, C<\L>, and -C<\l> string-interpolation escapes on the right side of a substitution -operator will perform correct case-mappings, but C<tr[a-z][A-Z]> will not -(except sometimes on legacy 7-bit data). +utility. C<tr[a-z][A-Z]> will uppercase the 26 letters "a" through "z", +but for case changing not confined to ASCII, use +L<C<lc>|perlfunc/lc>, L<C<uc>|perlfunc/uc>, +L<C<lcfirst>|perlfunc/lcfirst>, L<C<ucfirst>|perlfunc/ucfirst> +(all documented in L<perlfunc>), or the +L<substitution operator C<sE<sol>I<PATTERN>E<sol>I<REPLACEMENT>E<sol>>|/sE<sol>PATTERNE<sol>REPLACEMENTE<sol>msixpodualngcer> +(with C<\U>, C<\u>, C<\L>, and C<\l> string-interpolation escapes in the +I<REPLACEMENT> portion). Most ranges are unportable between character sets, but certain ones signal Perl to do special handling to make them portable. There are two @@ -2436,22 +2489,36 @@ Options: untouched. If the C</c> modifier is specified, the I<SEARCHLIST> character set -is complemented. If the C</d> modifier is specified, any characters +is complemented. So for example these two are equivalent (the exact +maximum number will depend on your platform): + + tr/\x00-\xfd/ABCD/c + tr/\xfe-\x{7fffffff}/ABCD/ + +If the C</d> modifier is specified, any characters specified by I<SEARCHLIST> not found in I<REPLACEMENTLIST> are deleted. (Note that this is slightly more flexible than the behavior of some B<tr> programs, which delete anything they find in the I<SEARCHLIST>, -period.) If the C</s> modifier is specified, sequences of characters -that were transliterated to the same character are squashed down -to a single instance of the character. +period.) 
+ +If the C</s> modifier is specified, runs of the same character in the +result, where each of those characters was substituted by the +transliteration, are squashed down to a single instance of the character. If the C</d> modifier is used, the I<REPLACEMENTLIST> is always interpreted exactly as specified. Otherwise, if the I<REPLACEMENTLIST> is shorter than the I<SEARCHLIST>, the final character is replicated till it is long enough. If the I<REPLACEMENTLIST> is empty, the I<SEARCHLIST> is replicated. This latter is useful for counting characters in a class or for -squashing character sequences in a class. +squashing character sequences in a class. For example, each of these pairs +is equivalent: -Examples: + tr/abcd// tr/abcd/abcd/ + tr/abcd/AB/ tr/abcd/ABBB/ + tr/abcd//d s/[abcd]//g + tr/abcd/AB/d (tr/ab/AB/ + s/[cd]//g) - but run together + +Some examples: $ARGV[1] =~ tr/A-Z/a-z/; # canonicalize to lower case ASCII @@ -2502,6 +2569,9 @@ syntax. Following a C<< << >> you specify a string to terminate the quoted material, and all lines following the current line down to the terminating string are the value of the item. +Prefixing the terminating string with a C<~> specifies that you +want to use L</Indented Here-docs> (see below). + The terminating string may be either an identifier (a word), or some quoted text. An unquoted identifier works like double quotes. There may not be a space between the C<< << >> and the identifier, @@ -2565,6 +2635,55 @@ the results of the execution returned. =back +=over 4 + +=item Indented Here-docs + +The here-doc modifier C<~> allows you to indent your here-docs to make +the code more readable: + + if ($some_var) { + print <<~EOF; + This is a here-doc + EOF + } + +This will print... + + This is a here-doc + +...with no leading whitespace. + +The delimiter is used to determine the B<exact> whitespace to +remove from the beginning of each line.
All lines B<must> have +at least the same starting whitespace (except lines only +containing a newline) or perl will croak. Tabs and spaces can +be mixed, but are matched exactly. One tab will not be equal to +8 spaces! + +Additional beginning whitespace (beyond what preceded the +delimiter) will be preserved: + + print <<~EOF; + This text is not indented + This text is indented with two spaces + This text is indented with two tabs + EOF + +Finally, the modifier may be used with all of the forms +mentioned above: + + <<~\EOF; + <<~'EOF' + <<~"EOF" + <<~`EOF` + +And whitespace may be used between the C<~> and quoted delimiters: + + <<~ 'EOF'; # ... "EOF", `EOF` + +=back + It is possible to stack multiple here-docs in a row: print <<"foo", <<"bar"; # you can stack them @@ -2597,12 +2716,12 @@ use C<chomp()>. END If you want your here-docs to be indented with the rest of the code, -you'll need to remove leading whitespace from each line manually: +use the C<<< <<~FOO >>> construct described under L</Indented Here-docs>: - ($quote = <<'FINIS') =~ s/^\s+//gm; + $quote = <<~'FINIS'; The Road goes ever on and on, down from the door where it began. - FINIS + FINIS If you use a here-doc within a delimited construct, such as in C<s///eg>, the quoted material must still come on the line following the @@ -2714,7 +2833,7 @@ If the left part is delimited by bracketing punctuation (that is C<()>, C<[]>, C<{}>, or C<< <> >>), the right part needs another pair of delimiters such as C<s(){}> and C<tr[]//>. In these cases, whitespace and comments are allowed between the two parts, although the comment must follow -at least one whitespace character; otherwise a character expected as the +at least one whitespace character; otherwise a character expected as the start of the comment may be regarded as the starting delimiter of the right part. During this search no attention is paid to the semantics of the construct. 
@@ -2837,7 +2956,7 @@ I<sed> hackers who haven't picked up the saner idiom yet. A warning is emitted if the S<C<use warnings>> pragma or the B<-w> command-line flag (that is, the C<$^W> variable) was set. -=item C<RE> in C<?RE?>, C</RE/>, C<m/RE/>, C<s/RE/foo/>, +=item C<RE> in C<m?RE?>, C</RE/>, C<m/RE/>, C<s/RE/foo/>, Processing of C<\Q>, C<\U>, C<\u>, C<\L>, C<\l>, C<\F>, C<\E>, and interpolation happens (almost) as with C<qq//> constructs. @@ -2880,7 +2999,7 @@ finish the regular expression, C<\/> will be stripped to C</> on the previous step, and C<\\/> will be left as is. Because C</> is equivalent to C<\/> inside a regular expression, this does not matter unless the delimiter happens to be character special to the -RE engine, such as in C<s*foo*bar*>, C<m[foo]>, or C<?foo?>; or an +RE engine, such as in C<s*foo*bar*>, C<m[foo]>, or C<m?foo?>; or an alphanumeric char, as in: m m ^ a \s* b mmx; @@ -2989,9 +3108,12 @@ destroying whatever was there previously. (This may seem like an odd thing to you, but you'll use the construct in almost every Perl script you write.) The C<$_> variable is not implicitly localized. You'll have to put a S<C<local $_;>> before the loop if you want that -to happen. +to happen. Furthermore, if the input symbol or an explicit assignment +of the input symbol to a scalar is used as a C<while>/C<for> condition, +then the condition actually tests for definedness of the expression's +value, not for its regular truth value. 
-The following lines are equivalent: +Thus the following lines are equivalent: while (defined($_ = <STDIN>)) { print; } while ($_ = <STDIN>) { print; } @@ -3001,7 +3123,7 @@ The following lines are equivalent: print while ($_ = <STDIN>); print while <STDIN>; -This also behaves similarly, but assigns to a lexical variable +This also behaves similarly, but assigns to a lexical variable instead of to C<$_>: while (my $line = <STDIN>) { print $line } @@ -3197,6 +3319,13 @@ to become confused with the indirect filehandle notation. @files = glob("$dir/*.[ch]"); @files = glob($files[$i]); +If an angle-bracket-based globbing expression is used as the condition of +a C<while> or C<for> loop, then it will be implicitly assigned to C<$_>. +If either a globbing expression or an explicit assignment of a globbing +expression to a scalar is used as a C<while>/C<for> condition, then +the condition actually tests for definedness of the expression's value, +not for its regular truth value. + =head2 Constant Folding X<constant folding> X<folding> @@ -3208,7 +3337,7 @@ variable substitution. Backslash interpolation also happens at compile time. You can say 'Now is the time for all' - . "\n" + . "\n" . 'good men to come to.' and this all reduces to one string internally. Likewise, if @@ -3262,16 +3391,15 @@ operation you intend by using C<""> or C<0+>, as in the examples below. $baz = 0+$foo & 0+$bar; # both ops explicitly numeric $biz = "$foo" ^ "$bar"; # both ops explicitly stringy -This somewhat unpredictable behavior can be avoided with the experimental -"bitwise" feature, new in Perl 5.22. You can enable it via S<C<use feature -'bitwise'>>. By default, it will warn unless the C<"experimental::bitwise"> -warnings category has been disabled. (S<C<use experimental 'bitwise'>> will -enable the feature and disable the warning.) Under this feature, the four +This somewhat unpredictable behavior can be avoided with the "bitwise" +feature, new in Perl 5.22. 
You can enable it via S<C<use feature +'bitwise'>> or C<use v5.28>. Before Perl 5.28, it used to emit a warning +in the C<"experimental::bitwise"> category. Under this feature, the four standard bitwise operators (C<~ | & ^>) are always numeric. Adding a dot after each operator (C<~. |. &. ^.>) forces it to treat its operands as strings: - use experimental "bitwise"; + use feature "bitwise"; $foo = 150 | 105; # yields 255 (0x96 | 0x69 is 0xFF) $foo = '150' | 105; # yields 255 $foo = 150 | '105'; # yields 255 @@ -3287,9 +3415,10 @@ strings: The assignment variants of these operators (C<&= |= ^= &.= |.= ^.=>) behave likewise under the feature. -The behavior of these operators is problematic (and subject to change) -if either or both of the strings are encoded in UTF-8 (see -L<perlunicode/Byte and Character Semantics>. +It is a fatal error if an operand contains a character whose ordinal +value is above 0xFF, and hence not expressible except in UTF-8. The +operation is performed on a non-UTF-8 copy for other operands encoded in +UTF-8. See L<perlunicode/Byte and Character Semantics>. See L<perlfunc/vec> for information on how to manipulate individual bits in a bit vector. @@ -3316,7 +3445,7 @@ still get C<1.4142135623731> or so. Used on numbers, the bitwise operators (C<&> C<|> C<^> C<~> C<< << >> C<< >> >>) always produce integral results. (But see also -L<Bitwise String Operators>.) However, S<C<use integer>> still has meaning for +L</Bitwise String Operators>.) However, S<C<use integer>> still has meaning for them. By default, their results are interpreted as unsigned integers, but if S<C<use integer>> is in effect, their results are interpreted as signed integers. 
For example, C<~0> usually evaluates to a large diff --git a/gnu/usr.bin/perl/pod/perlpacktut.pod b/gnu/usr.bin/perl/pod/perlpacktut.pod index a710f20f3f2..ce3dba1799f 100644 --- a/gnu/usr.bin/perl/pod/perlpacktut.pod +++ b/gnu/usr.bin/perl/pod/perlpacktut.pod @@ -459,7 +459,7 @@ or even: and pass C<$buf> to your send routine. Some protocols demand that the count should include the length of the count itself: then just add 4 -to the data length. (But make sure to read L<"Lengths and Widths"> before +to the data length. (But make sure to read L</"Lengths and Widths"> before you really code this!) @@ -487,7 +487,7 @@ obviously works for C<E<lt>>, where the "little end" touches the code. You will probably find these modifiers even more useful if you have to deal with big- or little-endian C structures. Be sure to read -L<"Packing and Unpacking C Structures"> for more on that. +L</"Packing and Unpacking C Structures"> for more on that. =head2 Floating point Numbers @@ -668,9 +668,10 @@ Usually you'll want to pack or unpack UTF-8 strings: my @hebrew = unpack( 'U*', $utf ); Please note: in the general case, you're better off using -Encode::decode_utf8 to decode a UTF-8 encoded byte string to a Perl -Unicode string, and Encode::encode_utf8 to encode a Perl Unicode string -to UTF-8 bytes. These functions provide means of handling invalid byte +L<C<Encode::decode('UTF-8', $utf)>|Encode/decode> to decode a UTF-8 +encoded byte string to a Perl Unicode string, and +L<C<Encode::encode('UTF-8', $str)>|Encode/encode> to encode a Perl Unicode +string to UTF-8 bytes. These functions provide means of handling invalid byte sequences and generally have a friendlier interface. 
=head2 Another Portable Binary Encoding @@ -798,7 +799,7 @@ C<A4> or C<Z*>: my $txt = unpack( 'A4/A*', $buf ); C</> is not implemented in Perls before 5.6, so if your code is required to -work on older Perls you'll need to C<unpack( 'Z* Z* C')> to get the length, +work on ancient Perls you'll need to C<unpack( 'Z* Z* C')> to get the length, then use it to make a new unpack string. For example # pack a message: ASCIIZ, ASCIIZ, length, string, byte diff --git a/gnu/usr.bin/perl/pod/perlperf.pod b/gnu/usr.bin/perl/pod/perlperf.pod index 87d632f0d1b..260acaba290 100644 --- a/gnu/usr.bin/perl/pod/perlperf.pod +++ b/gnu/usr.bin/perl/pod/perlperf.pod @@ -406,7 +406,7 @@ C<wordmatch> program. The wallclock, user and system, times are at the top of the analysis, and after this are the main columns which define the report. Check the C<dprofpp> docs for details of the many options it supports. -See also C<Apache::DProf> which hooks C<Devel::DProf> into C<mod_perl>. +See also C<L<Apache::DProf>> which hooks C<Devel::DProf> into C<mod_perl>. =head2 Devel::Profiler @@ -470,7 +470,8 @@ As the author of C<Devel::Profiler> writes: YMMV. -See also C<Devel::Apache::Profiler> which hooks C<Devel::Profiler> into C<mod_perl>. +See also C<L<Devel::Apache::Profiler>> which hooks C<Devel::Profiler> +into C<mod_perl>. =head2 Devel::SmallProf @@ -530,7 +531,8 @@ time. That regex line is looking a bit suspicious, for example. Remember that these tools are supposed to be used together, there is no single best way to profile your code, you need to use the best tools for the job. -See also C<Apache::SmallProf> which hooks C<Devel::SmallProf> into C<mod_perl>. +See also C<L<Apache::SmallProf>> which hooks C<Devel::SmallProf> into +C<mod_perl>. =head2 Devel::FastProf @@ -749,7 +751,8 @@ sort of output you can expect from this cool tool. Oodles of very useful information in there - this seems to be the way forward.
-See also C<Devel::NYTProf::Apache> which hooks C<Devel::NYTProf> into C<mod_perl>. +See also C<L<Devel::NYTProf::Apache>> which hooks C<Devel::NYTProf> into +C<mod_perl>. =head1 SORTING diff --git a/gnu/usr.bin/perl/pod/perlpodspec.pod b/gnu/usr.bin/perl/pod/perlpodspec.pod index 65f79f50558..4fea607ba5c 100644 --- a/gnu/usr.bin/perl/pod/perlpodspec.pod +++ b/gnu/usr.bin/perl/pod/perlpodspec.pod @@ -480,7 +480,7 @@ the current document. Discussed briefly in L<perlpod/"Formatting Codes">. -This code is unusual is that it should have no content. That is, +This code is unusual in that it should have no content. That is, a processor may complain if it sees C<ZE<lt>potatoesE<gt>>. Whether or not it complains, the I<potatoes> text should be ignored. @@ -837,7 +837,7 @@ is noncompliant behavior.) Authors of Pod formatters/processors should make every effort to avoid writing their own Pod parser. There are already several in CPAN, with a wide range of interface styles -- and one of them, -Pod::Parser, comes with modern versions of Perl. +Pod::Simple, comes with modern versions of Perl. =item * diff --git a/gnu/usr.bin/perl/pod/perlpodstyle.pod b/gnu/usr.bin/perl/pod/perlpodstyle.pod index 22524a96fcd..a2aecbacb87 100644 --- a/gnu/usr.bin/perl/pod/perlpodstyle.pod +++ b/gnu/usr.bin/perl/pod/perlpodstyle.pod @@ -276,7 +276,7 @@ section numbering conventions. This documentation is maintained as part of the podlators distribution. The current version is always available from its web site at -<http://www.eyrie.org/~eagle/software/podlators/>. +L<http://www.eyrie.org/~eagle/software/podlators/>. =head1 AUTHOR diff --git a/gnu/usr.bin/perl/pod/perlpolicy.pod b/gnu/usr.bin/perl/pod/perlpolicy.pod index ff841fc0db7..148b911b234 100644 --- a/gnu/usr.bin/perl/pod/perlpolicy.pod +++ b/gnu/usr.bin/perl/pod/perlpolicy.pod @@ -84,9 +84,9 @@ the Perl community should expect from Perl's developers: =item * -We "officially" support the two most recent stable release series.
5.20.x -and earlier are now out of support. As of the release of 5.26.0, we will -"officially" end support for Perl 5.22.x, other than providing security +We "officially" support the two most recent stable release series. 5.22.x +and earlier are now out of support. As of the release of 5.28.0, we will +"officially" end support for Perl 5.24.x, other than providing security updates as described below. =item * @@ -359,19 +359,27 @@ be included. Historically, only the pumpking cherry-picked changes from bleadperl into maintperl. This has scaling problems. At the same time, maintenance branches of stable versions of Perl need to be treated with -great care. To that end, as of Perl 5.12, we have a new process for +great care. To that end, as of Perl 5.12, we have a new process for maint branches. -Any committer may cherry-pick any commit from blead to a maint branch if -they send mail to perl5-porters announcing their intent to cherry-pick -a specific commit along with a rationale for doing so and at least two -other committers respond to the list giving their assent. (This policy -applies to current and former pumpkings, as well as other committers.) - -Other voting mechanisms may be used instead, as long as the same number of -votes is gathered in a transparent manner. Specifically, proposals of -which changes to cherry-pick must be visible to everyone on perl5-porters -so that the views of everyone interested may be heard. +Any committer may cherry-pick any commit from blead to a maint branch by +first adding an entry to the relevant voting file in the maint-votes branch +announcing the commit as a candidate for back-porting, and then waiting for +at least two other committers to add their votes in support of this (i.e. a +total of at least three votes is required before a commit may be back-ported). 
+ +Most of the work involved in both rounding up a suitable set of candidate +commits and cherry-picking those for which three votes have been cast will +be done by the maint branch release manager, but anyone else is free to add +other proposals if they're keen to ensure certain fixes don't get overlooked +or fear they already have been. + +Other voting mechanisms may also be used instead (e.g. sending mail to +perl5-porters and at least two other committers responding to the list +giving their assent), as long as the same number of votes is gathered in a +transparent manner. Specifically, proposals of which changes to cherry-pick +must be visible to everyone on perl5-porters so that the views of everyone +interested may be heard. It is not necessary for voting to be held on cherry-picking perldelta entries associated with changes that have already been cherry-picked, nor @@ -537,8 +545,9 @@ it doesn't need to fully describe how all old versions used to work. =head1 STANDARDS OF CONDUCT The official forum for the development of perl is the perl5-porters mailing -list, mentioned above, and its bugtracker at rt.perl.org. All participants in -discussion there are expected to adhere to a standard of conduct. +list, mentioned above, and its bugtracker at rt.perl.org. Posting to the +list and the bugtracker is not a right: all participants in discussion are +expected to adhere to a standard of conduct. =over 4 @@ -546,15 +555,18 @@ discussion there are expected to adhere to a standard of conduct. Always be civil. -=item * +=item * Heed the moderators. =back -Civility is simple: stick to the facts while avoiding demeaning remarks and -sarcasm. It is not enough to be factual. You must also be civil. Responding -in kind to incivility is not acceptable. +Civility is simple: stick to the facts while avoiding demeaning remarks, +belittling other individuals, sarcasm, or a presumption of bad faith. It is +not enough to be factual. You must also be civil. 
Responding in kind to +incivility is not acceptable. If you relay otherwise-unposted comments to +the list from a third party, you take responsibility for the content of +those comments, and you must therefore ensure that they are civil. While civility is required, kindness is encouraged; if you have any doubt about whether you are being civil, simply ask yourself, "Am I being kind?" and aspire @@ -563,16 +575,30 @@ to that. If the list moderators tell you that you are not being civil, carefully consider how your words have appeared before responding in any way. Were they kind? You may protest, but repeated protest in the face of a repeatedly -reaffirmed decision is not acceptable. - -Unacceptable behavior will result in a public and clearly identified warning. -Repeated unacceptable behavior will result in removal from the mailing list and -revocation of rights to update rt.perl.org. The first removal is for one -month. Subsequent removals will double in length. After six months with no -warning, a user's ban length is reset. Removals, like warnings, are public. +reaffirmed decision is not acceptable. Repeatedly protesting about the +moderators' decisions regarding a third party is also unacceptable, as is +continuing to initiate off-list contact with the moderators about their +decisions. + +Unacceptable behavior will result in a public and clearly identified +warning. A second instance of unacceptable behavior from the same +individual will result in removal from the mailing list and rt.perl.org, +for a period of one calendar month. The rationale for this is to +provide an opportunity for the person to change the way they act. + +After the time-limited ban has been lifted, a third instance of +unacceptable behavior will result in a further public warning. A fourth +or subsequent instance will result in an indefinite ban. 
The rationale +is that, in the face of an apparent refusal to change behavior, we must +protect other community members from future unacceptable actions. The +moderators may choose to lift an indefinite ban if the person in +question affirms they will not transgress again. + +Removals, like warnings, are public. The list of moderators will be public knowledge. At present, it is: -Aaron Crane, Andy Dougherty, Ricardo Signes, Sawyer X, Steffen Müller. +Aaron Crane, Andy Dougherty, Karen Etheridge, Ricardo Signes, Sawyer X, +Steffen Müller, Todd Rinaldo. =head1 CREDITS diff --git a/gnu/usr.bin/perl/pod/perlport.pod b/gnu/usr.bin/perl/pod/perlport.pod index 15d411c8eb8..5ad2ffc3e61 100644 --- a/gnu/usr.bin/perl/pod/perlport.pod +++ b/gnu/usr.bin/perl/pod/perlport.pod @@ -67,9 +67,9 @@ The important thing is to decide where the code will run and to be deliberate in your decision. The material below is separated into three main sections: main issues of -portability (L<"ISSUES">), platform-specific issues (L<"PLATFORMS">), and +portability (L</"ISSUES">), platform-specific issues (L</"PLATFORMS">), and built-in Perl functions that behave differently on various ports -(L<"FUNCTION IMPLEMENTATIONS">). +(L</"FUNCTION IMPLEMENTATIONS">). This information should not be considered complete; it includes possibly transient information about idiosyncrasies of some of the ports, almost @@ -95,42 +95,50 @@ translates it to (or from) C<\015\012>, depending on whether you're reading or writing. Unix does the same thing on ttys in canonical mode. C<\015\012> is commonly referred to as CRLF. -To trim trailing newlines from text lines use C<chomp()>. With default -settings that function looks for a trailing C<\n> character and thus -trims in a portable way. +To trim trailing newlines from text lines use +L<C<chomp>|perlfunc/chomp VARIABLE>. With default settings that function +looks for a trailing C<\n> character and thus trims in a portable way. 
When dealing with binary files (or text files in binary mode) be sure -to explicitly set $/ to the appropriate value for your file format -before using C<chomp()>. - -Because of the "text" mode translation, DOSish perls have limitations -in using C<seek> and C<tell> on a file accessed in "text" mode. -Stick to C<seek>-ing to locations you got from C<tell> (and no -others), and you are usually free to use C<seek> and C<tell> even -in "text" mode. Using C<seek> or C<tell> or other file operations -may be non-portable. If you use C<binmode> on a file, however, you -can usually C<seek> and C<tell> with arbitrary values safely. +to explicitly set L<C<$E<sol>>|perlvar/$E<sol>> to the appropriate value for +your file format before using L<C<chomp>|perlfunc/chomp VARIABLE>. + +Because of the "text" mode translation, DOSish perls have limitations in +using L<C<seek>|perlfunc/seek FILEHANDLE,POSITION,WHENCE> and +L<C<tell>|perlfunc/tell FILEHANDLE> on a file accessed in "text" mode. +Stick to L<C<seek>|perlfunc/seek FILEHANDLE,POSITION,WHENCE>-ing to +locations you got from L<C<tell>|perlfunc/tell FILEHANDLE> (and no +others), and you are usually free to use +L<C<seek>|perlfunc/seek FILEHANDLE,POSITION,WHENCE> and +L<C<tell>|perlfunc/tell FILEHANDLE> even in "text" mode. Using +L<C<seek>|perlfunc/seek FILEHANDLE,POSITION,WHENCE> or +L<C<tell>|perlfunc/tell FILEHANDLE> or other file operations may be +non-portable. If you use L<C<binmode>|perlfunc/binmode FILEHANDLE> on a +file, however, you can usually +L<C<seek>|perlfunc/seek FILEHANDLE,POSITION,WHENCE> and +L<C<tell>|perlfunc/tell FILEHANDLE> with arbitrary values safely. A common misconception in socket programming is that S<C<\n eq \012>> everywhere. When using protocols such as common Internet protocols, C<\012> and C<\015> are called for specifically, and the values of the logical C<\n> and C<\r> (carriage return) are not reliable. 
- print SOCKET "Hi there, client!\r\n"; # WRONG - print SOCKET "Hi there, client!\015\012"; # RIGHT + print $socket "Hi there, client!\r\n"; # WRONG + print $socket "Hi there, client!\015\012"; # RIGHT However, using C<\015\012> (or C<\cM\cJ>, or C<\x0D\x0A>) can be tedious and unsightly, as well as confusing to those maintaining the code. As -such, the C<Socket> module supplies the Right Thing for those who want it. +such, the L<C<Socket>|Socket> module supplies the Right Thing for those +who want it. use Socket qw(:DEFAULT :crlf); - print SOCKET "Hi there, client!$CRLF" # RIGHT + print $socket "Hi there, client!$CRLF" # RIGHT When reading from a socket, remember that the default input record -separator C<$/> is C<\n>, but robust socket code will recognize as -either C<\012> or C<\015\012> as end of line: +separator L<C<$E<sol>>|perlvar/$E<sol>> is C<\n>, but robust socket code +will recognize either C<\012> or C<\015\012> as end of line: - while (<SOCKET>) { # NOT ADVISABLE! + while (<$socket>) { # NOT ADVISABLE! # ... } @@ -140,7 +148,7 @@ be set to LF and any CR stripped later. Better to write: use Socket qw(:DEFAULT :crlf); local($/) = LF; # not needed if $/ is already \012 - while (<SOCKET>) { + while (<$socket>) { s/$CR?$LF/\n/; # not sure if socket uses LF or CRLF, OK # s/\015?\012/\n/; # same thing } @@ -210,7 +218,8 @@ decimal), a big-endian host (Motorola, Sparc, PA) reads it as 0x78563412 (2018915346 in decimal). Alpha and MIPS can be either: Digital/Compaq used/uses them in little-endian mode; SGI/Cray uses them in big-endian mode. To avoid this problem in network (socket) -connections use the C<pack> and C<unpack> formats C<n> and C<N>, the +connections use the L<C<pack>|perlfunc/pack TEMPLATE,LIST> and +L<C<unpack>|perlfunc/unpack TEMPLATE,EXPR> formats C<n> and C<N>, the "network" orders. These are guaranteed to be portable.
As of Perl 5.10.0, you can also use the C<E<gt>> and C<E<lt>> modifiers @@ -237,10 +246,9 @@ transferring or storing raw binary numbers. One can circumnavigate both these problems in two ways. Either transfer and store numbers always in text format, instead of raw -binary, or else consider using modules like C<Data::Dumper> and -C<Storable> -(included as of Perl 5.8). Keeping all data as text significantly -simplifies matters. +binary, or else consider using modules like +L<C<Data::Dumper>|Data::Dumper> and L<C<Storable>|Storable> (included as +of Perl 5.8). Keeping all data as text significantly simplifies matters. =head2 Files and Filesystems @@ -261,16 +269,20 @@ and LPT:). S<Mac OS> 9 and earlier used C<:> as a path separator instead of C</>. -The filesystem may support neither hard links (C<link>) nor -symbolic links (C<symlink>, C<readlink>, C<lstat>). +The filesystem may support neither hard links +(L<C<link>|perlfunc/link OLDFILE,NEWFILE>) nor symbolic links +(L<C<symlink>|perlfunc/symlink OLDFILE,NEWFILE>, +L<C<readlink>|perlfunc/readlink EXPR>, +L<C<lstat>|perlfunc/lstat FILEHANDLE>). The filesystem may support neither access timestamp nor change timestamp (meaning that about the only portable timestamp is the modification timestamp), or one second granularity of any timestamps (e.g. the FAT filesystem limits the time granularity to two seconds). -The "inode change timestamp" (the C<-C> filetest) may really be the -"creation timestamp" (which it is not in Unix). +The "inode change timestamp" (the L<C<-C>|perlfunc/-X FILEHANDLE> +filetest) may really be the "creation timestamp" (which it is not in +Unix). VOS perl can emulate Unix filenames with C</> as path separator. The native pathname characters greater-than, less-than, number-sign, and @@ -282,19 +294,19 @@ signal filesystems and disk names. 
Don't assume Unix filesystem access semantics: that read, write, and execute are all the permissions there are, and even if they exist, -that their semantics (for example what do C<"r">, C<"w">, and C<"x"> mean on +that their semantics (for example what do C<r>, C<w>, and C<x> mean on a directory) are the Unix ones. The various Unix/POSIX compatibility -layers usually try to make interfaces like C<chmod()> work, but sometimes -there simply is no good mapping. +layers usually try to make interfaces like L<C<chmod>|perlfunc/chmod LIST> +work, but sometimes there simply is no good mapping. -The C<File::Spec> modules provide methods to manipulate path +The L<C<File::Spec>|File::Spec> modules provide methods to manipulate path specifications and return the results in native format for each platform. This is often unnecessary as Unix-style paths are understood by Perl on every supported platform, but if you need to produce native paths for a native utility that does not understand Unix syntax, or if you are operating on paths or path components -in unknown (and thus possibly native) syntax, C<File::Spec> is -your friend. Here are two brief examples: +in unknown (and thus possibly native) syntax, L<C<File::Spec>|File::Spec> +is your friend. Here are two brief examples: use File::Spec::Functions; chdir(updir()); # go up one directory @@ -313,9 +325,9 @@ machines. This is especially noticeable in scripts like Makefiles and test suites, which often assume C</> as a path separator for subdirectories. -Also of use is C<File::Basename> from the standard distribution, which -splits a pathname into pieces (base filename, full path to directory, -and file suffix). +Also of use is L<C<File::Basename>|File::Basename> from the standard +distribution, which splits a pathname into pieces (base filename, full +path to directory, and file suffix). 
Even when on a single platform (if you can call Unix a single platform), remember not to count on the existence or the contents of particular @@ -338,9 +350,9 @@ not to have non-word characters (except for C<.>) in the names, and keep them to the 8.3 convention, for maximum portability, onerous a burden though this may appear. -Likewise, when using the C<AutoSplit> module, try to keep your functions to -8.3 naming and case-insensitive conventions; or, at the least, -make it so the resulting files have a unique (case-insensitively) +Likewise, when using the L<C<AutoSplit>|AutoSplit> module, try to keep +your functions to 8.3 naming and case-insensitive conventions; or, at the +least, make it so the resulting files have a unique (case-insensitively) first 8 characters. Whitespace in filenames is tolerated on most systems, but not all, @@ -351,18 +363,16 @@ Many systems (DOS, VMS ODS-2) cannot have more than one C<.> in their filenames. Don't assume C<< > >> won't be the first character of a filename. -Always use C<< < >> explicitly to open a file for reading, or even -better, use the three-arg version of C<open>, unless you want the user to -be able to specify a pipe open. +Always use the three-arg version of +L<C<open>|perlfunc/open FILEHANDLE,EXPR>: open(my $fh, '<', $existing_file) or die $!; -If filenames might use strange characters, it is safest to open it -with C<sysopen> instead of C<open>. C<open> is magic and can -translate characters like C<< > >>, C<< < >>, and C<|>, which may -be the wrong thing to do. (Sometimes, though, it's the right thing.) -Three-arg open can also help protect against this translation in cases -where it is undesirable. +Two-arg L<C<open>|perlfunc/open FILEHANDLE,EXPR> is magic and can +translate characters like C<< > >>, C<< < >>, and C<|> in filenames, +which is usually the wrong thing to do. +L<C<sysopen>|perlfunc/sysopen FILEHANDLE,FILENAME,MODE> and three-arg +L<C<open>|perlfunc/open FILEHANDLE,EXPR> don't have this problem.
Don't use C<:> as a part of a filename since many systems use that for their own semantics (Mac OS Classic for separating pathname components, @@ -381,7 +391,7 @@ The I<portable filename characters> as defined by ANSI C are 0 1 2 3 4 5 6 7 8 9 . _ - -and the C<"-"> shouldn't be the first character. If you want to be +and C<-> shouldn't be the first character. If you want to be hypercorrect, stay case-insensitive and within the 8.3 naming convention (all the files and directories have to be unique within one directory if their names are lowercased and truncated to eight @@ -398,10 +408,14 @@ to deal with, so don't stay up late worrying about it. Some platforms can't delete or rename files held open by the system, this limitation may also apply to changing filesystem metainformation -like file permissions or owners. Remember to C<close> files when you -are done with them. Don't C<unlink> or C<rename> an open file. Don't -C<tie> or C<open> a file already tied or opened; C<untie> or C<close> -it first. +like file permissions or owners. Remember to +L<C<close>|perlfunc/close FILEHANDLE> files when you are done with them. +Don't L<C<unlink>|perlfunc/unlink LIST> or +L<C<rename>|perlfunc/rename OLDNAME,NEWNAME> an open file. Don't +L<C<tie>|perlfunc/tie VARIABLE,CLASSNAME,LIST> or +L<C<open>|perlfunc/open FILEHANDLE,EXPR> a file already tied or opened; +L<C<untie>|perlfunc/untie VARIABLE> or +L<C<close>|perlfunc/close FILEHANDLE> it first. Don't open the same file more than once at a time for writing, as some operating systems put mandatory locks on such files. @@ -413,84 +427,95 @@ permission also (or even just) in the file/directory itself. In some filesystems (AFS, DFS) the permission to add/delete directory entries is a completely separate permission. 
-Don't assume that a single C<unlink> completely gets rid of the file: -some filesystems (most notably the ones in VMS) have versioned -filesystems, and C<unlink()> removes only the most recent one (it doesn't -remove all the versions because by default the native tools on those -platforms remove just the most recent version, too). The portable -idiom to remove all the versions of a file is +Don't assume that a single L<C<unlink>|perlfunc/unlink LIST> completely +gets rid of the file: some filesystems (most notably the ones in VMS) have +versioned filesystems, and L<C<unlink>|perlfunc/unlink LIST> removes only +the most recent one (it doesn't remove all the versions because by default +the native tools on those platforms remove just the most recent version, +too). The portable idiom to remove all the versions of a file is 1 while unlink "file"; -This will terminate if the file is undeleteable for some reason +This will terminate if the file is undeletable for some reason (protected, not there, and so on). -Don't count on a specific environment variable existing in C<%ENV>. -Don't count on C<%ENV> entries being case-sensitive, or even -case-preserving. Don't try to clear C<%ENV> by saying C<%ENV = ();>, or, -if you really have to, make it conditional on C<$^O ne 'VMS'> since in -VMS the C<%ENV> table is much more than a per-process key-value string -table. - -On VMS, some entries in the C<%ENV> hash are dynamically created when -their key is used on a read if they did not previously exist. The -values for C<$ENV{HOME}>, C<$ENV{TERM}>, C<$ENV{PATH}>, and C<$ENV{USER}>, -are known to be dynamically generated. The specific names that are -dynamically generated may vary with the version of the C library on VMS, -and more may exist than are documented. - -On VMS by default, changes to the %ENV hash persist after perl exits. -Subsequent invocations of perl in the same process can inadvertently -inherit environment settings that were meant to be temporary. 
- -Don't count on signals or C<%SIG> for anything. - -Don't count on filename globbing. Use C<opendir>, C<readdir>, and -C<closedir> instead. +Don't count on a specific environment variable existing in +L<C<%ENV>|perlvar/%ENV>. Don't count on L<C<%ENV>|perlvar/%ENV> entries +being case-sensitive, or even case-preserving. Don't try to clear +L<C<%ENV>|perlvar/%ENV> by saying C<%ENV = ();>, or, if you really have +to, make it conditional on C<$^O ne 'VMS'> since in VMS the +L<C<%ENV>|perlvar/%ENV> table is much more than a per-process key-value +string table. + +On VMS, some entries in the L<C<%ENV>|perlvar/%ENV> hash are dynamically +created when their key is used on a read if they did not previously +exist. The values for C<$ENV{HOME}>, C<$ENV{TERM}>, C<$ENV{PATH}>, and +C<$ENV{USER}>, are known to be dynamically generated. The specific names +that are dynamically generated may vary with the version of the C library +on VMS, and more may exist than are documented. + +On VMS by default, changes to the L<C<%ENV>|perlvar/%ENV> hash persist +after perl exits. Subsequent invocations of perl in the same process can +inadvertently inherit environment settings that were meant to be +temporary. + +Don't count on signals or L<C<%SIG>|perlvar/%SIG> for anything. + +Don't count on filename globbing. Use +L<C<opendir>|perlfunc/opendir DIRHANDLE,EXPR>, +L<C<readdir>|perlfunc/readdir DIRHANDLE>, and +L<C<closedir>|perlfunc/closedir DIRHANDLE> instead. Don't count on per-program environment variables, or per-program current directories. -Don't count on specific values of C<$!>, neither numeric nor +Don't count on specific values of L<C<$!>|perlvar/$!>, neither numeric nor especially the string values. Users may switch their locales causing error messages to be translated into their languages. If you can trust a POSIXish environment, you can portably use the symbols defined -by the C<Errno> module, like C<ENOENT>. 
And don't trust on the values of C<$!> -at all except immediately after a failed system call. +by the L<C<Errno>|Errno> module, like C<ENOENT>. And don't trust the +values of L<C<$!>|perlvar/$!> at all except immediately after a failed +system call. =head2 Command names versus file pathnames Don't assume that the name used to invoke a command or program with -C<system> or C<exec> can also be used to test for the existence of the -file that holds the executable code for that command or program. +L<C<system>|perlfunc/system LIST> or L<C<exec>|perlfunc/exec LIST> can +also be used to test for the existence of the file that holds the +executable code for that command or program. First, many systems have "internal" commands that are built-in to the shell or OS and while these commands can be invoked, there is no corresponding file. Second, some operating systems (e.g., Cygwin, DJGPP, OS/2, and VOS) have required suffixes for executable files; these suffixes are generally permitted on the command name but are not -required. Thus, a command like F<"perl"> might exist in a file named -F<"perl">, F<"perl.exe">, or F<"perl.pm">, depending on the operating system. -The variable C<"_exe"> in the C<Config> module holds the executable suffix, -if any. Third, the VMS port carefully sets up C<$^X> and -C<$Config{perlpath}> so that no further processing is required. This is -just as well, because the matching regular expression used below would -then have to deal with a possible trailing version number in the VMS -file name. - -To convert C<$^X> to a file pathname, taking account of the requirements -of the various operating system possibilities, say: +required. Thus, a command like C<perl> might exist in a file named +F<perl>, F<perl.exe>, or F<perl.pm>, depending on the operating system. +The variable L<C<$Config{_exe}>|Config/C<_exe>> in the +L<C<Config>|Config> module holds the executable suffix, if any.
Third, +the VMS port carefully sets up L<C<$^X>|perlvar/$^X> and +L<C<$Config{perlpath}>|Config/C<perlpath>> so that no further processing +is required. This is just as well, because the matching regular +expression used below would then have to deal with a possible trailing +version number in the VMS file name. + +To convert L<C<$^X>|perlvar/$^X> to a file pathname, taking account of +the requirements of the various operating system possibilities, say: use Config; my $thisperl = $^X; - if ($^O ne 'VMS') - {$thisperl .= $Config{_exe} unless $thisperl =~ m/$Config{_exe}$/i;} + if ($^O ne 'VMS') { + $thisperl .= $Config{_exe} + unless $thisperl =~ m/\Q$Config{_exe}\E$/i; + } -To convert C<$Config{perlpath}> to a file pathname, say: +To convert L<C<$Config{perlpath}>|Config/C<perlpath>> to a file pathname, say: use Config; my $thisperl = $Config{perlpath}; - if ($^O ne 'VMS') - {$thisperl .= $Config{_exe} unless $thisperl =~ m/$Config{_exe}$/i;} + if ($^O ne 'VMS') { + $thisperl .= $Config{_exe} + unless $thisperl =~ m/\Q$Config{_exe}\E$/i; + } =head2 Networking @@ -512,17 +537,18 @@ can't bind to many virtual IP addresses. Don't assume a particular network device name. -Don't assume a particular set of C<ioctl()>s will work. +Don't assume a particular set of +L<C<ioctl>|perlfunc/ioctl FILEHANDLE,FUNCTION,SCALAR>s will work. Don't assume that you can ping hosts and get replies. Don't assume that any particular port (service) will respond. -Don't assume that C<Sys::Hostname> (or any other API or command) returns -either a fully qualified hostname or a non-qualified hostname: it all -depends on how the system had been configured. Also remember that for -things such as DHCP and NAT, the hostname you get back might not be -very useful. +Don't assume that L<C<Sys::Hostname>|Sys::Hostname> (or any other API or +command) returns either a fully qualified hostname or a non-qualified +hostname: it all depends on how the system had been configured. 
Also +remember that for things such as DHCP and NAT, the hostname you get back +might not be very useful. All the above I<don't>s may look daunting, and they are, but the key is to degrade gracefully if one cannot reach the particular network @@ -531,9 +557,12 @@ service one wants. Croaking or hanging do not look very professional. =head2 Interprocess Communication (IPC) In general, don't directly access the system in code meant to be -portable. That means, no C<system>, C<exec>, C<fork>, C<pipe>, -C<``>, C<qx//>, C<open> with a C<|>, nor any of the other things -that makes being a Perl hacker worth being. +portable. That means, no L<C<system>|perlfunc/system LIST>, +L<C<exec>|perlfunc/exec LIST>, L<C<fork>|perlfunc/fork>, +L<C<pipe>|perlfunc/pipe READHANDLE,WRITEHANDLE>, +L<C<``> or C<qxE<sol>E<sol>>|perlop/C<qxE<sol>I<STRING>E<sol>>>, +L<C<open>|perlfunc/open FILEHANDLE,EXPR> with a C<|>, nor any of the other +things that make being a Perl hacker worth being. Commands that launch external processes are generally supported on most platforms (though many of them do not support any type of @@ -542,22 +571,23 @@ them on. External tools are often named differently on different platforms, may not be available in the same location, might accept different arguments, can behave differently, and often present their results in a platform-dependent way. Thus, you should seldom depend -on them to produce consistent results. (Then again, if you're calling -I<netstat -a>, you probably don't expect it to run on both Unix and CP/M.) +on them to produce consistent results. (Then again, if you're calling +C<netstat -a>, you probably don't expect it to run on both Unix and CP/M.) One especially common bit of Perl code is opening a pipe to B<sendmail>: - open(MAIL, '|/usr/lib/sendmail -t') + open(my $mail, '|-', '/usr/lib/sendmail -t') or die "cannot fork sendmail: $!"; This is fine for systems programming when sendmail is known to be available.
But it is not fine for many non-Unix systems, and even some Unix systems that may not have sendmail installed. If a portable solution is needed, see the various distributions on CPAN that deal -with it. C<Mail::Mailer> and C<Mail::Send> in the C<MailTools> distribution are -commonly used, and provide several mailing methods, including C<mail>, -C<sendmail>, and direct SMTP (via C<Net::SMTP>) if a mail transfer agent is -not available. C<Mail::Sendmail> is a standalone module that provides +with it. L<C<Mail::Mailer>|Mail::Mailer> and L<C<Mail::Send>|Mail::Send> +in the C<MailTools> distribution are commonly used, and provide several +mailing methods, including C<mail>, C<sendmail>, and direct SMTP (via +L<C<Net::SMTP>|Net::SMTP>) if a mail transfer agent is not available. +L<C<Mail::Sendmail>|Mail::Sendmail> is a standalone module that provides simple, platform-independent mailing. The Unix System V IPC (C<msg*(), sem*(), shm*()>) is not available @@ -568,8 +598,10 @@ bare v-strings (such as C<v10.20.30.40>) to represent IPv4 addresses: both forms just pack the four bytes into network order. That this would be equal to the C language C<in_addr> struct (which is what the socket code internally uses) is not guaranteed. To be portable use -the routines of the C<Socket> extension, such as C<inet_aton()>, -C<inet_ntoa()>, and C<sockaddr_in()>. +the routines of the L<C<Socket>|Socket> module, such as +L<C<inet_aton>|Socket/$ip_address = inet_aton $string>, +L<C<inet_ntoa>|Socket/$string = inet_ntoa $ip_address>, and +L<C<sockaddr_in>|Socket/$sockaddr = sockaddr_in $port, $ip_address>. The rule of thumb for portable code is: Do it all in portable Perl, or use a module (that may internally implement it with platform-specific @@ -592,19 +624,20 @@ achieve portability. =head2 Standard Modules In general, the standard modules work across platforms. 
Notable -exceptions are the C<CPAN> module (which currently makes connections to external -programs that may not be available), platform-specific modules (like -C<ExtUtils::MM_VMS>), and DBM modules. +exceptions are the L<C<CPAN>|CPAN> module (which currently makes +connections to external programs that may not be available), +platform-specific modules (like L<C<ExtUtils::MM_VMS>|ExtUtils::MM_VMS>), +and DBM modules. There is no one DBM module available on all platforms. -C<SDBM_File> and the others are generally available on all Unix and DOSish -ports, but not in MacPerl, where only C<NDBM_File> and C<DB_File> are -available. +L<C<SDBM_File>|SDBM_File> and the others are generally available on all +Unix and DOSish ports, but not in MacPerl, where only +L<C<NDBM_File>|NDBM_File> and L<C<DB_File>|DB_File> are available. The good news is that at least some DBM module should be available, and -C<AnyDBM_File> will use whichever module it can find. Of course, then -the code needs to be fairly strict, dropping to the greatest common -factor (e.g., not exceeding 1K for each record), so that it will +L<C<AnyDBM_File>|AnyDBM_File> will use whichever module it can find. Of +course, then the code needs to be fairly strict, dropping to the greatest +common factor (e.g., not exceeding 1K for each record), so that it will work with any DBM module. See L<AnyDBM_File> for more details. =head2 Time and Date @@ -627,15 +660,17 @@ defines YYYY-MM-DD as the date format, or YYYY-MM-DDTHH:MM:SS Please do use the ISO 8601 instead of making us guess what date 02/03/04 might be. ISO 8601 even sorts nicely as-is. A text representation (like "1987-12-18") can be easily converted -into an OS-specific value using a module like C<Date::Parse>. -An array of values, such as those returned by C<localtime>, can be -converted to an OS-specific representation using C<Time::Local>. 
+into an OS-specific value using a module like +L<C<Time::Piece>|Time::Piece> (see L<Time::Piece/Date Parsing>) or +L<C<Date::Parse>|Date::Parse>. An array of values, such as those +returned by L<C<localtime>|perlfunc/localtime EXPR>, can be converted to an OS-specific +representation using L<C<Time::Local>|Time::Local>. When calculating specific times, such as for tests in time or date modules, it may be appropriate to calculate an offset for the epoch. - require Time::Local; - my $offset = Time::Local::timegm(0, 0, 0, 1, 0, 70); + use Time::Local qw(timegm); + my $offset = timegm(0, 0, 0, 1, 0, 70); The value for C<$offset> in Unix will be C<0>, but in Mac OS Classic will be some large number. C<$offset> can then be added to a Unix time @@ -645,20 +680,25 @@ value to get what should be the proper value on any system. Assume very little about character sets. -Assume nothing about numerical values (C<ord>, C<chr>) of characters. +Assume nothing about numerical values (L<C<ord>|perlfunc/ord EXPR>, +L<C<chr>|perlfunc/chr NUMBER>) of characters. Do not use explicit code point ranges (like C<\xHH-\xHH>). However, starting in Perl v5.22, regular expression pattern bracketed character class ranges specified like C<qr/[\N{U+HH}-\N{U+HH}]/> are portable, -and starting in Perl v5.24, the same ranges are portable in C<tr///>. +and starting in Perl v5.24, the same ranges are portable in +L<C<trE<sol>E<sol>E<sol>>|perlop/C<trE<sol>I<SEARCHLIST>E<sol>I<REPLACEMENTLIST>E<sol>cdsr>>. You can portably use symbolic character classes like C<[:print:]>. Do not assume that the alphabetic characters are encoded contiguously (in the numeric sense). There may be gaps. Special coding in Perl, however, guarantees that all subsets of C<qr/[A-Z]/>, C<qr/[a-z]/>, and -C<qr/[0-9]/> behave as expected. C<tr///> behaves the same for these -ranges.
In patterns, any ranges specified with end points using the -C<\N{...}> notations ensures character set portability, but it is a bug -in Perl v5.22, that this isn't true of C<tr///>, fixed in v5.24. +C<qr/[0-9]/> behave as expected. +L<C<trE<sol>E<sol>E<sol>>|perlop/C<trE<sol>I<SEARCHLIST>E<sol>I<REPLACEMENTLIST>E<sol>cdsr>> +behaves the same for these ranges. In patterns, any ranges specified with +end points using the C<\N{...}> notations ensures character set +portability, but it is a bug in Perl v5.22 that this isn't true of +L<C<trE<sol>E<sol>E<sol>>|perlop/C<trE<sol>I<SEARCHLIST>E<sol>I<REPLACEMENTLIST>E<sol>cdsr>>, +fixed in v5.24. Do not assume anything about the ordering of the characters. The lowercase letters may come before or after the uppercase letters; @@ -679,18 +719,13 @@ and time formatting--amongst other things. If you really want to be international, you should consider Unicode. See L<perluniintro> and L<perlunicode> for more information. -If you want to use non-ASCII bytes (outside the bytes 0x00..0x7f) in -the "source code" of your code, to be portable you have to be explicit -about what bytes they are. Someone might for example be using your -code under a UTF-8 locale, in which case random native bytes might be -illegal ("Malformed UTF-8 ...") This means that for example embedding -ISO 8859-1 bytes beyond 0x7f into your strings might cause trouble -later. If the bytes are native 8-bit bytes, you can use the C<bytes> -pragma. If the bytes are in a string (regular expressions being -curious strings), you can often also use the C<\xHH> or more portably, -the C<\N{U+HH}> notations instead -of embedding the bytes as-is. If you want to write your code in UTF-8, -you can use L<utf8>. +By default Perl assumes your source code is written in an 8-bit ASCII +superset. To embed Unicode characters in your strings and regexes, you can +use the L<C<\x{HH}> or (more portably) C<\N{U+HH}> +notations|perlop/Quote and Quote-like Operators>. 
You can also use the +L<C<utf8>|utf8> pragma and write your code in UTF-8, which lets you use +Unicode characters directly (not just in quoted constructs but also in +identifiers). =head2 System Resources @@ -731,19 +766,20 @@ permissions between the permissions check and the actual operation. Just try the operation.) Don't assume the Unix user and group semantics: especially, don't -expect C<< $< >> and C<< $> >> (or C<$(> and C<$)>) to work -for switching identities (or memberships). +expect L<C<< $< >>|perlvar/$E<lt>> and L<C<< $> >>|perlvar/$E<gt>> (or +L<C<$(>|perlvar/$(> and L<C<$)>|perlvar/$)>) to work for switching +identities (or memberships). -Don't assume set-uid and set-gid semantics. (And even if you do, +Don't assume set-uid and set-gid semantics. (And even if you do, think twice: set-uid and set-gid are a known can of security worms.) =head2 Style For those times when it is necessary to have platform-specific code, consider keeping the platform-specific code in one place, making porting -to other platforms easier. Use the C<Config> module and the special -variable C<$^O> to differentiate platforms, as described in -L<"PLATFORMS">. +to other platforms easier. Use the L<C<Config>|Config> module and the +special variable L<C<$^O>|perlvar/$^O> to differentiate platforms, as +described in L</"PLATFORMS">. Beware of the "else syndrome": @@ -762,12 +798,12 @@ often happens when tests spawn off other processes or call external programs to aid in the testing, or when (as noted above) the tests assume certain things about the filesystem and paths. Be careful not to depend on a specific output style for errors, such as when checking -C<$!> after a failed system call. Using C<$!> for anything else than -displaying it as output is doubtful (though see the C<Errno> module for -testing reasonably portably for error value). Some platforms expect -a certain output format, and Perl on those platforms may have been -adjusted accordingly. 
Most specifically, don't anchor a regex when -testing an error value. +L<C<$!>|perlvar/$!> after a failed system call. Using +L<C<$!>|perlvar/$!> for anything else than displaying it as output is +doubtful (though see the L<C<Errno>|Errno> module for testing reasonably +portably for error value). Some platforms expect a certain output format, +and Perl on those platforms may have been adjusted accordingly. Most +specifically, don't anchor a regex when testing an error value. =head1 CPAN Testers @@ -797,30 +833,31 @@ Testing results: L<http://www.cpantesters.org/> =head1 PLATFORMS -Perl is built with a C<$^O> variable that indicates the operating -system it was built on. This was implemented +Perl is built with a L<C<$^O>|perlvar/$^O> variable that indicates the +operating system it was built on. This was implemented to help speed up code that would otherwise have to C<use Config> -and use the value of C<$Config{osname}>. Of course, to get more -detailed information about the system, looking into C<%Config> is -certainly recommended. +and use the value of L<C<$Config{osname}>|Config/C<osname>>. Of course, +to get more detailed information about the system, looking into +L<C<%Config>|Config/DESCRIPTION> is certainly recommended. -C<%Config> cannot always be trusted, however, because it was built -at compile time. If perl was built in one place, then transferred -elsewhere, some values may be wrong. The values may even have been -edited after the fact. +L<C<%Config>|Config/DESCRIPTION> cannot always be trusted, however, +because it was built at compile time. If perl was built in one place, +then transferred elsewhere, some values may be wrong. The values may +even have been edited after the fact. =head2 Unix Perl works on a bewildering variety of Unix and Unix-like platforms (see e.g. most of the files in the F<hints/> directory in the source code kit). 
-On most of these systems, the value of C<$^O> (hence C<$Config{'osname'}>, -too) is determined either by lowercasing and stripping punctuation from the -first field of the string returned by typing C<uname -a> (or a similar command) -at the shell prompt or by testing the file system for the presence of -uniquely named files such as a kernel or header file. Here, for example, -are a few of the more popular Unix flavors: - - uname $^O $Config{'archname'} +On most of these systems, the value of L<C<$^O>|perlvar/$^O> (hence +L<C<$Config{osname}>|Config/C<osname>>, too) is determined either by +lowercasing and stripping punctuation from the first field of the string +returned by typing C<uname -a> (or a similar command) at the shell prompt +or by testing the file system for the presence of uniquely named files +such as a kernel or header file. Here, for example, are a few of the +more popular Unix flavors: + + uname $^O $Config{archname} -------------------------------------------- AIX aix aix BSD/OS bsdos i386-bsdos @@ -850,8 +887,9 @@ are a few of the more popular Unix flavors: SunOS solaris i86pc-solaris SunOS4 sunos sun4-sunos -Because the value of C<$Config{archname}> may depend on the -hardware architecture, it can vary more than the value of C<$^O>. +Because the value of L<C<$Config{archname}>|Config/C<archname>> may +depend on the hardware architecture, it can vary more than the value of +L<C<$^O>|perlvar/$^O>. =head2 DOS and Derivatives @@ -878,69 +916,97 @@ not to. The DOS FAT filesystem can accommodate only "8.3" style filenames. Under the "case-insensitive, but case-preserving" HPFS (OS/2) and NTFS (NT) filesystems you may have to be careful about case returned with functions -like C<readdir> or used with functions like C<open> or C<opendir>. +like L<C<readdir>|perlfunc/readdir DIRHANDLE> or used with functions like +L<C<open>|perlfunc/open FILEHANDLE,EXPR> or +L<C<opendir>|perlfunc/opendir DIRHANDLE,EXPR>. 
-DOS also treats several filenames as special, such as AUX, PRN, -NUL, CON, COM1, LPT1, LPT2, etc. Unfortunately, sometimes these -filenames won't even work if you include an explicit directory -prefix. It is best to avoid such filenames, if you want your code -to be portable to DOS and its derivatives. It's hard to know what -these all are, unfortunately. +DOS also treats several filenames as special, such as F<AUX>, F<PRN>, +F<NUL>, F<CON>, F<COM1>, F<LPT1>, F<LPT2>, etc. Unfortunately, sometimes +these filenames won't even work if you include an explicit directory +prefix. It is best to avoid such filenames, if you want your code to be +portable to DOS and its derivatives. It's hard to know what these all +are, unfortunately. Users of these operating systems may also wish to make use of -scripts such as I<pl2bat.bat> or I<pl2cmd> to -put wrappers around your scripts. - -Newline (C<\n>) is translated as C<\015\012> by STDIO when reading from -and writing to files (see L<"Newlines">). C<binmode(FILEHANDLE)> -will keep C<\n> translated as C<\012> for that filehandle. Since it is a -no-op on other systems, C<binmode> should be used for cross-platform code -that deals with binary data. That's assuming you realize in advance -that your data is in binary. General-purpose programs should -often assume nothing about their data. - -The C<$^O> variable and the C<$Config{archname}> values for various -DOSish perls are as follows: - - OS $^O $Config{archname} ID Version - -------------------------------------------------------- - MS-DOS dos ? - PC-DOS dos ? - OS/2 os2 ? - Windows 3.1 ? ? 0 3 01 - Windows 95 MSWin32 MSWin32-x86 1 4 00 - Windows 98 MSWin32 MSWin32-x86 1 4 10 - Windows ME MSWin32 MSWin32-x86 1 ? 
- Windows NT MSWin32 MSWin32-x86 2 4 xx - Windows NT MSWin32 MSWin32-ALPHA 2 4 xx - Windows NT MSWin32 MSWin32-ppc 2 4 xx - Windows 2000 MSWin32 MSWin32-x86 2 5 00 - Windows XP MSWin32 MSWin32-x86 2 5 01 - Windows 2003 MSWin32 MSWin32-x86 2 5 02 - Windows Vista MSWin32 MSWin32-x86 2 6 00 - Windows 7 MSWin32 MSWin32-x86 2 6 01 - Windows 7 MSWin32 MSWin32-x64 2 6 01 - Windows 2008 MSWin32 MSWin32-x86 2 6 01 - Windows 2008 MSWin32 MSWin32-x64 2 6 01 - Windows CE MSWin32 ? 3 - Cygwin cygwin cygwin +scripts such as F<pl2bat.bat> to put wrappers around your scripts. + +Newline (C<\n>) is translated as C<\015\012> by the I/O system when +reading from and writing to files (see L</"Newlines">). +C<binmode($filehandle)> will keep C<\n> translated as C<\012> for that +filehandle. +L<C<binmode>|perlfunc/binmode FILEHANDLE> should always be used for code +that deals with binary data. That's assuming you realize in advance that +your data is in binary. General-purpose programs should often assume +nothing about their data. + +The L<C<$^O>|perlvar/$^O> variable and the +L<C<$Config{archname}>|Config/C<archname>> values for various DOSish +perls are as follows: + + OS $^O $Config{archname} ID Version + --------------------------------------------------------- + MS-DOS dos ? + PC-DOS dos ? + OS/2 os2 ? + Windows 3.1 ? ? 0 3 01 + Windows 95 MSWin32 MSWin32-x86 1 4 00 + Windows 98 MSWin32 MSWin32-x86 1 4 10 + Windows ME MSWin32 MSWin32-x86 1 ? + Windows NT MSWin32 MSWin32-x86 2 4 xx + Windows NT MSWin32 MSWin32-ALPHA 2 4 xx + Windows NT MSWin32 MSWin32-ppc 2 4 xx + Windows 2000 MSWin32 MSWin32-x86 2 5 00 + Windows XP MSWin32 MSWin32-x86 2 5 01 + Windows 2003 MSWin32 MSWin32-x86 2 5 02 + Windows Vista MSWin32 MSWin32-x86 2 6 00 + Windows 7 MSWin32 MSWin32-x86 2 6 01 + Windows 7 MSWin32 MSWin32-x64 2 6 01 + Windows 2008 MSWin32 MSWin32-x86 2 6 01 + Windows 2008 MSWin32 MSWin32-x64 2 6 01 + Windows CE MSWin32 ? 
3 + Cygwin cygwin cygwin The various MSWin32 Perls can distinguish the OS they are running on via the value of the fifth element of the list returned from -C<Win32::GetOSVersion()>. For example: +L<C<Win32::GetOSVersion()>|Win32/Win32::GetOSVersion()>. For example: if ($^O eq 'MSWin32') { my @os_version_info = Win32::GetOSVersion(); print +('3.1','95','NT')[$os_version_info[4]],"\n"; } -There are also C<Win32::IsWinNT()> and C<Win32::IsWin95()>; try C<perldoc Win32>, -and as of libwin32 0.19 (not part of the core Perl distribution) -C<Win32::GetOSName()>. The very portable C<POSIX::uname()> will work too: +There are also L<C<Win32::IsWinNT()>|Win32/Win32::IsWinNT()>, +L<C<Win32::IsWin95()>|Win32/Win32::IsWin95()>, and +L<C<Win32::GetOSName()>|Win32/Win32::GetOSName()>; try +L<C<perldoc Win32>|Win32>. +The very portable L<C<POSIX::uname()>|POSIX/C<uname>> will work too: c:\> perl -MPOSIX -we "print join '|', uname" Windows NT|moonru|5.0|Build 2195 (Service Pack 2)|x86 +Errors set by Winsock functions are now put directly into C<$^E>, +and the relevant C<WSAE*> error codes are now exported from the +L<Errno> and L<POSIX> modules for testing this against. + +The previous behavior of putting the errors (converted to POSIX-style +C<E*> error codes since Perl 5.20.0) into C<$!> was buggy due to +the non-equivalence of like-named Winsock and POSIX error constants, +a relationship between which has unfortunately been established +in one way or another since Perl 5.8.0. + +The new behavior provides a much more robust solution for checking +Winsock errors in portable software without accidentally matching +POSIX tests that were intended for other OSes and may have different +meanings for Winsock. + +The old behavior is currently retained, warts and all, for backwards +compatibility, but users are encouraged to change any code that +tests C<$!> against C<E*> constants for Winsock errors to instead +test C<$^E> against C<WSAE*> constants.
After a suitable deprecation +period, which started with Perl 5.24, the old behavior may be +removed, leaving C<$!> unchanged after Winsock function calls, to +avoid any possible confusion over which error variable to check. + Also see: =over 4 @@ -1036,32 +1102,34 @@ but not a mixture of both as in: In general, the easiest path to portability is always to specify filenames in Unix format unless they will need to be processed by native commands or utilities. Because of this latter consideration, the -File::Spec module by default returns native format specifications +L<File::Spec> module by default returns native format specifications regardless of input format. This default may be reversed so that filenames are always reported in Unix format by specifying the C<DECC$FILENAME_UNIX_REPORT> feature logical in the environment. The file type, or extension, is always present in a VMS-format file specification even if it's zero-length. This means that, by default, -C<readdir> will return a trailing dot on a file with no extension, so -where you would see C<"a"> on Unix you'll see C<"a."> on VMS. However, -the trailing dot may be suppressed by enabling the -C<DECC$READDIR_DROPDOTNOTYPE> feature in the environment (see the CRTL +L<C<readdir>|perlfunc/readdir DIRHANDLE> will return a trailing dot on a +file with no extension, so where you would see C<"a"> on Unix you'll see +C<"a."> on VMS. However, the trailing dot may be suppressed by enabling +the C<DECC$READDIR_DROPDOTNOTYPE> feature in the environment (see the CRTL documentation on feature logical names). What C<\n> represents depends on the type of file opened. It usually represents C<\012> but it could also be C<\015>, C<\012>, C<\015\012>, C<\000>, C<\040>, or nothing depending on the file organization and -record format. The C<VMS::Stdio> module provides access to the -special C<fopen()> requirements of files with unusual attributes on VMS. +record format. 
The L<C<VMS::Stdio>|VMS::Stdio> module provides access to +the special C<fopen()> requirements of files with unusual attributes on +VMS. -The value of C<$^O> on OpenVMS is "VMS". To determine the architecture -that you are running on refer to C<$Config{'archname'}>. +The value of L<C<$^O>|perlvar/$^O> on OpenVMS is "VMS". To determine the +architecture that you are running on refer to +L<C<$Config{archname}>|Config/C<archname>>. On VMS, perl determines the UTC offset from the C<SYS$TIMEZONE_DIFFERENTIAL> logical name. Although the VMS epoch began at 17-NOV-1858 00:00:00.00, -calls to C<localtime> are adjusted to count offsets from -01-JAN-1970 00:00:00.00, just like Unix. +calls to L<C<localtime>|perlfunc/localtime EXPR> are adjusted to count +offsets from 01-JAN-1970 00:00:00.00, just like Unix. Also see: @@ -1108,13 +1176,13 @@ must be renamed before they can be processed by Perl. Older releases of VOS (prior to OpenVOS Release 17.0) limit file names to 32 or fewer characters, prohibit file names from starting with a C<-> character, and prohibit file names from -containing any character matching C<< tr/ !#%&'()*;<=>?// >>. +containing C< > (space) or any character from the set C<< !#%&'()*;<=>? >>. Newer releases of VOS (OpenVOS Release 17.0 or later) support a feature known as extended names. On these releases, file names can contain up to 255 characters, are prohibited from starting with a C<-> character, and the set of prohibited characters is -reduced to any character matching C<< tr/#%*<>?// >>. There are +reduced to C<< #%*<>? >>. There are restrictions involving spaces and apostrophes: these characters must not begin or end a name, nor can they immediately precede or follow a period. Additionally, a space must not immediately @@ -1126,17 +1194,9 @@ trailing apostrophe. Although an extended file name is limited to 255 characters, a path name is still limited to 256 characters. -The value of C<$^O> on VOS is "vos". 
To determine the -architecture that you are running on without resorting to loading -all of C<%Config> you can examine the content of the C<@INC> array -like so: - - if ($^O =~ /vos/) { - print "I'm on a Stratus box!\n"; - } else { - print "I'm not on a Stratus box!\n"; - die; - } +The value of L<C<$^O>|perlvar/$^O> on VOS is "vos". To determine the +architecture that you are running on refer to +L<C<$Config{archname}>|Config/C<archname>>. Also see: @@ -1170,9 +1230,8 @@ VOS Open-Source Software on the web at L<http://ftp.stratus.com/pub/vos/vos.html v5.22 core Perl runs on z/OS (formerly OS/390). Theoretically it could run on the successors of OS/400 on AS/400 minicomputers as well as VM/ESA, and BS2000 for S/390 Mainframes. Such computers use EBCDIC -character sets internally (usually -Character Code Set ID 0037 for OS/400 and either 1047 or POSIX-BC for S/390 -systems). +character sets internally (usually Character Code Set ID 0037 for OS/400 +and either 1047 or POSIX-BC for S/390 systems). The rest of this section may need updating, but we don't know what it should say. Please email comments to @@ -1198,8 +1257,8 @@ similar to the following simple script: print "Hello from perl!\n"; OS/390 will support the C<#!> shebang trick in release 2.8 and beyond. -Calls to C<system> and backticks can use POSIX shell syntax on all -S/390 systems. +Calls to L<C<system>|perlfunc/system LIST> and backticks can use POSIX +shell syntax on all S/390 systems. On the AS/400, if PERL5 is in your library list, you may need to wrap your Perl scripts in a CL procedure to invoke them like so: @@ -1209,15 +1268,20 @@ to wrap your Perl scripts in a CL procedure to invoke them like so: ENDPGM This will invoke the Perl script F<hello.pl> in the root of the -QOpenSys file system. On the AS/400 calls to C<system> or backticks -must use CL syntax. +QOpenSys file system. On the AS/400 calls to +L<C<system>|perlfunc/system LIST> or backticks must use CL syntax. 
On these platforms, bear in mind that the EBCDIC character set may have -an effect on what happens with some Perl functions (such as C<chr>, -C<pack>, C<print>, C<printf>, C<ord>, C<sort>, C<sprintf>, C<unpack>), as -well as bit-fiddling with ASCII constants using operators like C<^>, C<&> -and C<|>, not to mention dealing with socket interfaces to ASCII computers -(see L<"Newlines">). +an effect on what happens with some Perl functions (such as +L<C<chr>|perlfunc/chr NUMBER>, L<C<pack>|perlfunc/pack TEMPLATE,LIST>, +L<C<print>|perlfunc/print FILEHANDLE LIST>, +L<C<printf>|perlfunc/printf FILEHANDLE FORMAT, LIST>, +L<C<ord>|perlfunc/ord EXPR>, L<C<sort>|perlfunc/sort SUBNAME LIST>, +L<C<sprintf>|perlfunc/sprintf FORMAT, LIST>, +L<C<unpack>|perlfunc/unpack TEMPLATE,EXPR>), as +well as bit-fiddling with ASCII constants using operators like +L<C<^>, C<&> and C<|>|perlop/Bitwise String Operators>, not to mention +dealing with socket interfaces to ASCII computers (see L</"Newlines">). Fortunately, most web servers for the mainframe will correctly translate the C<\n> in the following statement to its ASCII equivalent @@ -1225,9 +1289,9 @@ translate the C<\n> in the following statement to its ASCII equivalent print "Content-type: text/html\r\n\r\n"; -The values of C<$^O> on some of these platforms includes: +The values of L<C<$^O>|perlvar/$^O> on some of these platforms include: - uname $^O $Config{'archname'} + uname $^O $Config{archname} -------------------------------------------- OS/390 os390 os390 OS400 os400 os400 @@ -1236,7 +1300,7 @@ The values of C<$^O> on some of these platforms includes: Some simple tricks for determining if you are running on an EBCDIC platform could include any of the following (perhaps all): - if ("\t" eq "\005") { print "EBCDIC may be spoken here!\n"; } + if ("\t" eq "\005") { print "EBCDIC may be spoken here!\n"; } if (ord('A') == 193) { print "EBCDIC may be spoken here!\n"; } @@ -1297,11 +1361,12 @@ where ^ is the parent directory 
Directory and File =~ m|[^\0- "\.\$\%\&:\@\\^\|\177]+| -The default filename translation is roughly C<tr|/.|./|;> +The default filename translation is roughly C<tr|/.|./|>, swapping dots +and slashes. Note that C<"ADFS::HardDisk.$.File" ne 'ADFS::HardDisk.$.File'> and that the second stage of C<$> interpolation in regular expressions will fall -foul of the C<$.> if scripts are not careful. +foul of the L<C<$.>|perlvar/$.> variable if scripts are not careful. Logical paths specified by system variables containing comma-separated search lists are also allowed; hence C<System:Modules> is a valid @@ -1312,8 +1377,9 @@ C<System$Path> contains a single item list. The filesystem will also expand system variables in filenames if enclosed in angle brackets, so C<< <System$Dir>.Modules >> would look for the file S<C<$ENV{'System$Dir'} . 'Modules'>>. The obvious implication of this is -that B<fully qualified filenames can start with C<< <> >>> and should -be protected when C<open> is used for input. +that B<fully qualified filenames can start with C<< <> >>> and the +three-argument form of L<C<open>|perlfunc/open FILEHANDLE,EXPR> should +always be used. Because C<.> was in use as a directory separator and filenames could not be assumed to be unique after 10 characters, Acorn implemented the C @@ -1332,13 +1398,15 @@ The Unix emulation library's translation of filenames to native assumes that this sort of translation is required, and it allows a user-defined list of known suffixes that it will transpose in this fashion. This may seem transparent, but consider that with these rules F<foo/bar/baz.h> -and F<foo/bar/h/baz> both map to F<foo.bar.h.baz>, and that C<readdir> and -C<glob> cannot and do not attempt to emulate the reverse mapping. Other +and F<foo/bar/h/baz> both map to F<foo.bar.h.baz>, and that +L<C<readdir>|perlfunc/readdir DIRHANDLE> and L<C<glob>|perlfunc/glob EXPR> +cannot and do not attempt to emulate the reverse mapping. 
Other C<.>'s in filenames are translated to C</>. -As implied above, the environment accessed through C<%ENV> is global, and -the convention is that program specific environment variables are of the -form C<Program$Name>. Each filesystem maintains a current directory, +As implied above, the environment accessed through +L<C<%ENV>|perlvar/%ENV> is global, and the convention is that program +specific environment variables are of the form C<Program$Name>. +Each filesystem maintains a current directory, and the current filesystem's current directory is the B<global> current directory. Consequently, sociable programs don't change the current directory but rely on full pathnames, and programs (and Makefiles) cannot @@ -1353,9 +1421,9 @@ passing C<STDIN>, C<STDOUT>, or C<STDERR> to your children. The desire of users to express filenames of the form C<< <Foo$Dir>.Bar >> on the command line unquoted causes problems, -too: C<``> command output capture has to perform a guessing game. It -assumes that a string C<< <[^<>]+\$[^<>]> >> is a -reference to an environment variable, whereas anything else involving +too: L<C<``>|perlop/C<qxE<sol>I<STRING>E<sol>>> command output capture has +to perform a guessing game. It assumes that a string C<< <[^<>]+\$[^<>]> >> +is a reference to an environment variable, whereas anything else involving C<< < >> or C<< > >> is redirection, and generally manages to be 99% right. Of course, the problem remains that scripts cannot rely on any Unix tools being available, or that any tools found have Unix-like command @@ -1366,11 +1434,11 @@ tools. In practice, many don't, as users of the Acorn platform are used to binary distributions. MakeMaker does run, but no available make currently copes with MakeMaker's makefiles; even if and when this should be fixed, the lack of a Unix-like shell will cause -problems with makefile rules, especially lines of the form C<cd -sdbm && make all>, and anything using quoting. 
+problems with makefile rules, especially lines of the form +C<cd sdbm && make all>, and anything using quoting. -"S<RISC OS>" is the proper name for the operating system, but the value -in C<$^O> is "riscos" (because we don't like shouting). +S<"RISC OS"> is the proper name for the operating system, but the value +in L<C<$^O>|perlvar/$^O> is "riscos" (because we don't like shouting). =head2 Other perls @@ -1383,10 +1451,10 @@ aos, Atari ST, lynxos, riscos, Novell Netware, Tandem Guardian, I<etc.> (Yes, we know that some of these OSes may fall under the Unix category, but we are not a standards body.) -Some approximate operating system names and their C<$^O> values -in the "OTHER" category include: +Some approximate operating system names and their L<C<$^O>|perlvar/$^O> +values in the "OTHER" category include: - OS $^O $Config{'archname'} + OS $^O $Config{archname} ------------------------------------------ Amiga DOS amigaos m68k-amigos @@ -1414,7 +1482,7 @@ S<Plan 9>, F<README.plan9> Listed below are functions that are either completely unimplemented or else have been implemented differently on various platforms. -Following each description will be, in parentheses, a list of +Preceding each description will be, in parentheses, a list of platforms that the description applies to. The list may well be incomplete, or even wrong in some places. When @@ -1424,10 +1492,11 @@ a given port. Be aware, moreover, that even among Unix-ish systems there are variations. -For many functions, you can also query C<%Config>, exported by -default from the C<Config> module. For example, to check whether the -platform has the C<lstat> call, check C<$Config{d_lstat}>. See -L<Config> for a full description of available variables. +For many functions, you can also query L<C<%Config>|Config/DESCRIPTION>, +exported by default from the L<C<Config>|Config> module. 
For example, to +check whether the platform has the L<C<lstat>|perlfunc/lstat FILEHANDLE> +call, check L<C<$Config{d_lstat}>|Config/C<d_lstat>>. See L<Config> for a +full description of available variables. =head2 Alphabetical Listing of Perl Functions @@ -1435,362 +1504,447 @@ L<Config> for a full description of available variables. =item -X +(Win32) C<-w> only inspects the read-only file attribute (FILE_ATTRIBUTE_READONLY), which determines whether the directory can be deleted, not whether it can be written to. Directories always have read and write access unless denied -by discretionary access control lists (DACLs). (S<Win32>) +by discretionary access control lists (DACLs). +(VMS) C<-r>, C<-w>, C<-x>, and C<-o> tell whether the file is accessible, -which may not reflect UIC-based file protections. (VMS) +which may not reflect UIC-based file protections. +(S<RISC OS>) C<-s> by name on an open file will return the space reserved on disk, rather than the current extent. C<-s> on an open filehandle returns the -current size. (S<RISC OS>) +current size. +(Win32, VMS, S<RISC OS>) C<-R>, C<-W>, C<-X>, C<-O> are indistinguishable from C<-r>, C<-w>, -C<-x>, C<-o>. (Win32, VMS, S<RISC OS>) +C<-x>, C<-o>. -C<-g>, C<-k>, C<-l>, C<-u>, C<-A> are not particularly meaningful. (Win32, VMS, S<RISC OS>) +C<-g>, C<-k>, C<-l>, C<-u>, C<-A> are not particularly meaningful. -C<-p> is not particularly meaningful. (VMS, S<RISC OS>) +(VMS, S<RISC OS>) +C<-p> is not particularly meaningful. -C<-d> is true if passed a device spec without an explicit directory. (VMS) +C<-d> is true if passed a device spec without an explicit directory. +(Win32) C<-x> (or C<-X>) determine if a file ends in one of the executable -suffixes. C<-S> is meaningless. (Win32) +suffixes. C<-S> is meaningless. -C<-x> (or C<-X>) determine if a file has an executable file type. (S<RISC OS>) +C<-x> (or C<-X>) determine if a file has an executable file type. 
=item alarm +(Win32) Emulated using timers that must be explicitly polled whenever Perl wants to dispatch "safe signals" and therefore cannot interrupt -blocking system calls. (Win32) +blocking system calls. =item atan2 +(Tru64, HP-UX 10.20) Due to issues with various CPUs, math libraries, compilers, and standards, -results for C<atan2()> may vary depending on any combination of the above. +results for C<atan2> may vary depending on any combination of the above. Perl attempts to conform to the Open Group/IEEE standards for the results -returned from C<atan2()>, but cannot force the issue if the system Perl is -run on does not allow it. (Tru64, HP-UX 10.20) +returned from C<atan2>, but cannot force the issue if the system Perl is +run on does not allow it. -The current version of the standards for C<atan2()> is available at +The current version of the standards for C<atan2> is available at L<http://www.opengroup.org/onlinepubs/009695399/functions/atan2.html>. =item binmode -Meaningless. (S<RISC OS>) +(S<RISC OS>) +Meaningless. +(VMS) Reopens file and restores pointer; if function fails, underlying filehandle may be closed, or pointer may be in a different position. -(VMS) -The value returned by C<tell> may be affected after the call, and -the filehandle may be flushed. (Win32) +(Win32) +The value returned by L<C<tell>|perlfunc/tell FILEHANDLE> may be affected +after the call, and the filehandle may be flushed. =item chmod -Only good for changing "owner" read-write access, "group", and "other" -bits are meaningless. (Win32) +(Win32) +Only good for changing "owner" read-write access; "group" and "other" +bits are meaningless. -Only good for changing "owner" and "other" read-write access. (S<RISC OS>) +(S<RISC OS>) +Only good for changing "owner" and "other" read-write access. -Access permissions are mapped onto VOS access-control list changes. (VOS) +(VOS) +Access permissions are mapped onto VOS access-control list changes. 
-The actual permissions set depend on the value of the C<CYGWIN> -in the SYSTEM environment settings. (Cygwin) +(Cygwin) +The actual permissions set depend on the value of the C<CYGWIN> variable +in the SYSTEM environment settings. +(Android) Setting the exec bit on some locations (generally F</sdcard>) will return true -but not actually set the bit. (Android) +but not actually set the bit. + +(VMS) +A mode argument of zero sets permissions to the user's default permission mask +rather than disabling all permissions. =item chown -Not implemented. (Win32, S<Plan 9>, S<RISC OS>) +(S<Plan 9>, S<RISC OS>) +Not implemented. -Does nothing, but won't fail. (Win32) +(Win32) +Does nothing, but won't fail. -A little funky, because VOS's notion of ownership is a little funky (VOS). +(VOS) +A little funky, because VOS's notion of ownership is a little funky. =item chroot -Not implemented. (Win32, VMS, S<Plan 9>, S<RISC OS>, VOS) +(Win32, VMS, S<Plan 9>, S<RISC OS>, VOS) +Not implemented. =item crypt +(Win32) May not be available if library or source was not provided when building -perl. (Win32) +perl. -Not implemented. (Android) +(Android) +Not implemented. =item dbmclose -Not implemented. (VMS, S<Plan 9>, VOS) +(VMS, S<Plan 9>, VOS) +Not implemented. =item dbmopen -Not implemented. (VMS, S<Plan 9>, VOS) +(VMS, S<Plan 9>, VOS) +Not implemented. =item dump -Not useful. (S<RISC OS>) +(S<RISC OS>) +Not useful. -Not supported. (Cygwin, Win32) +(Cygwin, Win32) +Not supported. -Invokes VMS debugger. (VMS) +(VMS) +Invokes VMS debugger. =item exec +(Win32) C<exec LIST> without the use of indirect object syntax (C<exec PROGRAM LIST>) -may fall back to trying the shell if the first C<spawn()> fails. (Win32) +may fall back to trying the shell if the first C<spawn()> fails. -Does not automatically flush output handles on some platforms. (SunOS, Solaris, HP-UX) +Does not automatically flush output handles on some platforms. -Not supported. (Symbian OS) +(Symbian OS) +Not supported. 
=item exit -Emulates Unix C<exit()> (which considers C<exit 1> to indicate an error) by +(VMS) +Emulates Unix C<exit> (which considers C<exit 1> to indicate an error) by mapping the C<1> to C<SS$_ABORT> (C<44>). This behavior may be overridden -with the pragma C<use vmsish 'exit'>. As with the CRTL's C<exit()> -function, C<exit 0> is also mapped to an exit status of C<SS$_NORMAL> -(C<1>); this mapping cannot be overridden. Any other argument to -C<exit()> +with the pragma L<C<use vmsish 'exit'>|vmsish/C<vmsish exit>>. As with +the CRTL's C<exit()> function, C<exit 0> is also mapped to an exit status +of C<SS$_NORMAL> (C<1>); this mapping cannot be overridden. Any other +argument to C<exit> is used directly as Perl's exit status. On VMS, unless the future POSIX_EXIT mode is enabled, the exit code should always be a valid VMS exit code and not a generic number. When the POSIX_EXIT mode is enabled, a generic number will be encoded in a method compatible with the C library _POSIX_EXIT macro so that it can be decoded by other -programs, particularly ones written in C, like the GNV package. (VMS) +programs, particularly ones written in C, like the GNV package. -C<exit()> resets file pointers, which is a problem when called -from a child process (created by C<fork()>) in C<BEGIN>. -A workaround is to use C<POSIX::_exit>. (Solaris) +(Solaris) +C<exit> resets file pointers, which is a problem when called +from a child process (created by L<C<fork>|perlfunc/fork>) in +L<C<BEGIN>|perlmod/BEGIN, UNITCHECK, CHECK, INIT and END>. +A workaround is to use L<C<POSIX::_exit>|POSIX/C<_exit>>. exit unless $Config{archname} =~ /\bsolaris\b/; - require POSIX and POSIX::_exit(0); + require POSIX; + POSIX::_exit(0); =item fcntl -Not implemented. (Win32) +(Win32) +Not implemented. -Some functions available based on the version of VMS. (VMS) +(VMS) +Some functions available based on the version of VMS. =item flock -Not implemented (VMS, S<RISC OS>, VOS). 
+(VMS, S<RISC OS>, VOS) +Not implemented. =item fork -Not implemented. (AmigaOS, S<RISC OS>, VMS) +(AmigaOS, S<RISC OS>, VMS) +Not implemented. -Emulated using multiple interpreters. See L<perlfork>. (Win32) +(Win32) +Emulated using multiple interpreters. See L<perlfork>. -Does not automatically flush output handles on some platforms. (SunOS, Solaris, HP-UX) +Does not automatically flush output handles on some platforms. =item getlogin -Not implemented. (S<RISC OS>) +(S<RISC OS>) +Not implemented. =item getpgrp -Not implemented. (Win32, VMS, S<RISC OS>) +(Win32, VMS, S<RISC OS>) +Not implemented. =item getppid -Not implemented. (Win32, S<RISC OS>) +(Win32, S<RISC OS>) +Not implemented. =item getpriority -Not implemented. (Win32, VMS, S<RISC OS>, VOS) +(Win32, VMS, S<RISC OS>, VOS) +Not implemented. =item getpwnam -Not implemented. (Win32) +(Win32) +Not implemented. -Not useful. (S<RISC OS>) +(S<RISC OS>) +Not useful. =item getgrnam -Not implemented. (Win32, VMS, S<RISC OS>) +(Win32, VMS, S<RISC OS>) +Not implemented. =item getnetbyname -Not implemented. (Android, Win32, S<Plan 9>) +(Android, Win32, S<Plan 9>) +Not implemented. =item getpwuid -Not implemented. (Win32) +(Win32) +Not implemented. -Not useful. (S<RISC OS>) +(S<RISC OS>) +Not useful. =item getgrgid -Not implemented. (Win32, VMS, S<RISC OS>) +(Win32, VMS, S<RISC OS>) +Not implemented. =item getnetbyaddr -Not implemented. (Android, Win32, S<Plan 9>) +(Android, Win32, S<Plan 9>) +Not implemented. =item getprotobynumber -Not implemented. (Android) - -=item getservbyport +(Android) +Not implemented. =item getpwent -Not implemented. (Android, Win32) +(Android, Win32) +Not implemented. =item getgrent -Not implemented. (Android, Win32, VMS) +(Android, Win32, VMS) +Not implemented. =item gethostbyname +(S<Irix 5>) C<gethostbyname('localhost')> does not work everywhere: you may have -to use C<gethostbyname('127.0.0.1')>. (S<Irix 5>) +to use C<gethostbyname('127.0.0.1')>. =item gethostent -Not implemented. 
(Win32) +(Win32) +Not implemented. =item getnetent -Not implemented. (Android, Win32, S<Plan 9>) +(Android, Win32, S<Plan 9>) +Not implemented. =item getprotoent -Not implemented. (Android, Win32, S<Plan 9>) +(Android, Win32, S<Plan 9>) +Not implemented. =item getservent -Not implemented. (Win32, S<Plan 9>) +(Win32, S<Plan 9>) +Not implemented. =item seekdir -Not implemented. (Android) +(Android) +Not implemented. =item sethostent -Not implemented. (Android, Win32, S<Plan 9>, S<RISC OS>) +(Android, Win32, S<Plan 9>, S<RISC OS>) +Not implemented. =item setnetent -Not implemented. (Win32, S<Plan 9>, S<RISC OS>) +(Win32, S<Plan 9>, S<RISC OS>) +Not implemented. =item setprotoent -Not implemented. (Android, Win32, S<Plan 9>, S<RISC OS>) +(Android, Win32, S<Plan 9>, S<RISC OS>) +Not implemented. =item setservent -Not implemented. (S<Plan 9>, Win32, S<RISC OS>) +(S<Plan 9>, Win32, S<RISC OS>) +Not implemented. =item endpwent -Not implemented. (Win32) +(Win32) +Not implemented. -Either not implemented or a no-op. (Android) +(Android) +Either not implemented or a no-op. =item endgrent -Not implemented. (Android, S<RISC OS>, VMS, Win32) +(Android, S<RISC OS>, VMS, Win32) +Not implemented. =item endhostent -Not implemented. (Android, Win32) +(Android, Win32) +Not implemented. =item endnetent -Not implemented. (Android, Win32, S<Plan 9>) +(Android, Win32, S<Plan 9>) +Not implemented. =item endprotoent -Not implemented. (Android, Win32, S<Plan 9>) +(Android, Win32, S<Plan 9>) +Not implemented. =item endservent -Not implemented. (S<Plan 9>, Win32) +(S<Plan 9>, Win32) +Not implemented. -=item getsockopt SOCKET,LEVEL,OPTNAME +=item getsockopt -Not implemented. (S<Plan 9>) +(S<Plan 9>) +Not implemented. =item glob -This operator is implemented via the C<File::Glob> extension on most -platforms. See L<File::Glob> for portability information. +This operator is implemented via the L<C<File::Glob>|File::Glob> extension +on most platforms. See L<File::Glob> for portability information. 
=item gmtime -In theory, C<gmtime()> is reliable from -2**63 to 2**63-1. However, -because work arounds in the implementation use floating point numbers, +In theory, C<gmtime> is reliable from -2**63 to 2**63-1. However, +because work-arounds in the implementation use floating point numbers, it will become inaccurate as the time gets larger. This is a bug and will be fixed in the future. -On VOS, time values are 32-bit quantities. +(VOS) +Time values are 32-bit quantities. -=item ioctl FILEHANDLE,FUNCTION,SCALAR +=item ioctl -Not implemented. (VMS) +(VMS) +Not implemented. +(Win32) Available only for socket handles, and it does what the C<ioctlsocket()> call -in the Winsock API does. (Win32) +in the Winsock API does. -Available only for socket handles. (S<RISC OS>) +(S<RISC OS>) +Available only for socket handles. =item kill -Not implemented, hence not useful for taint checking. (S<RISC OS>) +(S<RISC OS>) +Not implemented, hence not useful for taint checking. -C<kill()> doesn't have the semantics of C<raise()>, i.e. it doesn't send -a signal to the identified process like it does on Unix platforms. -Instead C<kill($sig, $pid)> terminates the process identified by C<$pid>, -and makes it exit immediately with exit status $sig. As in Unix, if -$sig is 0 and the specified process exists, it returns true without -actually terminating it. (Win32) +(Win32) +C<kill> doesn't send a signal to the identified process like it does on +Unix platforms. Instead C<kill($sig, $pid)> terminates the process +identified by C<$pid>, and makes it exit immediately with exit status +C<$sig>. As in Unix, if C<$sig> is 0 and the specified process exists, it +returns true without actually terminating it. +(Win32) C<kill(-9, $pid)> will terminate the process specified by C<$pid> and recursively all child processes owned by it. This is different from the Unix semantics, where the signal will be delivered to all processes in the same process group as the process specified by -$pid. 
(Win32) +C<$pid>. +(VMS) A pid of -1 indicating all processes on the system is not currently -supported. (VMS) +supported. =item link -Not implemented. (S<RISC OS>, VOS) +(S<RISC OS>, VOS) +Not implemented. +(AmigaOS) Link count not updated because hard links are not quite that hard -(They are sort of half-way between hard and soft links). (AmigaOS) +(They are sort of half-way between hard and soft links). +(Win32) Hard links are implemented on Win32 under NTFS only. They are natively supported on Windows 2000 and later. On Windows NT they are implemented using the Windows POSIX subsystem support and the Perl process will need Administrator or Backup Operator privileges to create hard links. -Available on 64 bit OpenVMS 8.2 and later. (VMS) +(VMS) +Available on 64 bit OpenVMS 8.2 and later. =item localtime -localtime() has the same range as L</gmtime>, but because time zone -rules change its accuracy for historical and future times may degrade +C<localtime> has the same range as L</gmtime>, but because time zone +rules change, its accuracy for historical and future times may degrade but usually by no more than an hour. =item lstat -Not implemented. (S<RISC OS>) +(S<RISC OS>) +Not implemented. -Return values (especially for device and inode) may be bogus. (Win32) +(Win32) +Return values (especially for device and inode) may be bogus. =item msgctl @@ -1800,36 +1954,45 @@ Return values (especially for device and inode) may be bogus. (Win32) =item msgrcv -Not implemented. (Android, Win32, VMS, S<Plan 9>, S<RISC OS>, VOS) +(Android, Win32, VMS, S<Plan 9>, S<RISC OS>, VOS) +Not implemented. =item open -open to C<|-> and C<-|> are unsupported. (Win32, S<RISC OS>) +(Win32, S<RISC OS>) +Open modes C<|-> and C<-|> are unsupported. +(SunOS, Solaris, HP-UX) Opening a process does not automatically flush output handles on some -platforms. (SunOS, Solaris, HP-UX) +platforms. =item readlink -Not implemented. (Win32, VMS, S<RISC OS>) +(Win32, VMS, S<RISC OS>) +Not implemented. 
=item rename -Can't move directories between directories on different logical volumes. (Win32) +(Win32) +Can't move directories between directories on different logical volumes. =item rewinddir -Will not cause C<readdir()> to re-read the directory stream. The entries -already read before the C<rewinddir()> call will just be returned again -from a cache buffer. (Win32) +(Win32) +Will not cause L<C<readdir>|perlfunc/readdir DIRHANDLE> to re-read the +directory stream. The entries already read before the C<rewinddir> call +will just be returned again from a cache buffer. =item select -Only implemented on sockets. (Win32, VMS) +(Win32, VMS) +Only implemented on sockets. -Only reliable on sockets. (S<RISC OS>) +(S<RISC OS>) +Only reliable on sockets. -Note that the C<select FILEHANDLE> form is generally portable. +Note that the L<C<select FILEHANDLE>|perlfunc/select FILEHANDLE> form is +generally portable. =item semctl @@ -1837,27 +2000,33 @@ Note that the C<select FILEHANDLE> form is generally portable. =item semop -Not implemented. (Android, Win32, VMS, S<RISC OS>) +(Android, Win32, VMS, S<RISC OS>) +Not implemented. =item setgrent -Not implemented. (Android, VMS, Win32, S<RISC OS>) +(Android, VMS, Win32, S<RISC OS>) +Not implemented. =item setpgrp -Not implemented. (Win32, VMS, S<RISC OS>, VOS) +(Win32, VMS, S<RISC OS>, VOS) +Not implemented. =item setpriority -Not implemented. (Win32, VMS, S<RISC OS>, VOS) +(Win32, VMS, S<RISC OS>, VOS) +Not implemented. =item setpwent -Not implemented. (Android, Win32, S<RISC OS>) +(Android, Win32, S<RISC OS>) +Not implemented. =item setsockopt -Not implemented. (S<Plan 9>) +(S<Plan 9>) +Not implemented. =item shmctl @@ -1867,154 +2036,186 @@ Not implemented. (S<Plan 9>) =item shmwrite -Not implemented. (Android, Win32, VMS, S<RISC OS>) +(Android, Win32, VMS, S<RISC OS>) +Not implemented. 
=item sleep +(Win32) Emulated using synchronization functions such that it can be -interrupted by C<alarm()>, and limited to a maximum of 4294967 seconds, -approximately 49 days. (Win32) - -=item sockatmark - -A relatively recent addition to socket functions, may not -be implemented even in Unix platforms. +interrupted by L<C<alarm>|perlfunc/alarm SECONDS>, and limited to a +maximum of 4294967 seconds, approximately 49 days. =item socketpair -Not implemented. (S<RISC OS>) +(S<RISC OS>) +Not implemented. -Available on 64 bit OpenVMS 8.2 and later. (VMS) +(VMS) +Available on 64 bit OpenVMS 8.2 and later. =item stat -Platforms that do not have rdev, blksize, or blocks will return these -as '', so numeric comparison or manipulation of these fields may cause -'not numeric' warnings. +Platforms that do not have C<rdev>, C<blksize>, or C<blocks> will return +these as C<''>, so numeric comparison or manipulation of these fields may +cause 'not numeric' warnings. -ctime not supported on UFS (S<Mac OS X>). +(S<Mac OS X>) +C<ctime> not supported on UFS. -ctime is creation time instead of inode change time (Win32). +(Win32) +C<ctime> is creation time instead of inode change time. -device and inode are not meaningful. (Win32) +(Win32) +C<dev> and C<ino> are not meaningful. -device and inode are not necessarily reliable. (VMS) +(VMS) +C<dev> and C<ino> are not necessarily reliable. -mtime, atime and ctime all return the last modification time. Device and -inode are not necessarily reliable. (S<RISC OS>) +(S<RISC OS>) +C<mtime>, C<atime> and C<ctime> all return the last modification time. +C<dev> and C<ino> are not necessarily reliable. -dev, rdev, blksize, and blocks are not available. inode is not -meaningful and will differ between stat calls on the same file. (os2) +(OS/2) +C<dev>, C<rdev>, C<blksize>, and C<blocks> are not available. C<ino> is not +meaningful and will differ between stat calls on the same file. 
-some versions of cygwin when doing a C<stat("foo")> and if not finding it -may then attempt to C<stat("foo.exe")> (Cygwin) +(Cygwin) +Some versions of cygwin when doing a C<stat("foo")> and not finding it +may then attempt to C<stat("foo.exe")>. -On Win32 C<stat()> needs to open the file to determine the link count +(Win32) +C<stat> needs to open the file to determine the link count and update attributes that may have been changed through hard links. -Setting C<${^WIN32_SLOPPY_STAT}> to a true value speeds up C<stat()> by -not performing this operation. (Win32) +Setting L<C<${^WIN32_SLOPPY_STAT}>|perlvar/${^WIN32_SLOPPY_STAT}> to a +true value speeds up C<stat> by not performing this operation. =item symlink -Not implemented. (Win32, S<RISC OS>) +(Win32, S<RISC OS>) +Not implemented. +(VMS) Implemented on 64 bit VMS 8.3. VMS requires the symbolic link to be in Unix syntax if it is intended to resolve to a valid path. =item syscall -Not implemented. (Win32, VMS, S<RISC OS>, VOS) +(Win32, VMS, S<RISC OS>, VOS) +Not implemented. =item sysopen -The traditional "0", "1", and "2" MODEs are implemented with different -numeric values on some systems. The flags exported by C<Fcntl> -(O_RDONLY, O_WRONLY, O_RDWR) should work everywhere though. (S<Mac -OS>, OS/390) +(S<Mac OS>, OS/390) +The traditional C<0>, C<1>, and C<2> MODEs are implemented with different +numeric values on some systems. The flags exported by L<C<Fcntl>|Fcntl> +(C<O_RDONLY>, C<O_WRONLY>, C<O_RDWR>) should work everywhere though. =item system +(Win32) As an optimization, may not call the command shell specified in C<$ENV{PERL5SHELL}>. C<system(1, @args)> spawns an external process and immediately returns its process designator, without waiting for it to terminate. Return value may be used subsequently -in C<wait> or C<waitpid>. Failure to C<spawn()> a subprocess is indicated -by setting C<$?> to S<C<"255 << 8">>. C<$?> is set in a way compatible with -Unix (i.e. 
the exitstatus of the subprocess is obtained by S<C<"$? >> 8">>, -as described in the documentation). (Win32) +in L<C<wait>|perlfunc/wait> or L<C<waitpid>|perlfunc/waitpid PID,FLAGS>. +Failure to C<spawn()> a subprocess is indicated by setting +L<C<$?>|perlvar/$?> to C<<< 255 << 8 >>>. L<C<$?>|perlvar/$?> is set in a +way compatible with Unix (i.e. the exit status of the subprocess is +obtained by C<<< $? >> 8 >>>, as described in the documentation). +(S<RISC OS>) There is no shell to process metacharacters, and the native standard is to pass a command line terminated by "\n" "\r" or "\0" to the spawned program. Redirection such as C<< > foo >> is performed (if at all) by -the run time library of the spawned program. C<system> I<list> will call -the Unix emulation library's C<exec> emulation, which attempts to provide -emulation of the stdin, stdout, stderr in force in the parent, providing -the child program uses a compatible version of the emulation library. -I<scalar> will call the native command line direct and no such emulation -of a child Unix program will exists. Mileage B<will> vary. (S<RISC OS>) - +the run time library of the spawned program. C<system LIST> will call +the Unix emulation library's L<C<exec>|perlfunc/exec LIST> emulation, +which attempts to provide emulation of the stdin, stdout, stderr in force +in the parent, provided the child program uses a compatible version of the +emulation library. C<system SCALAR> will call the native command line +directly and no such emulation of a child Unix program will occur. +Mileage B<will> vary. + +(Win32) C<system LIST> without the use of indirect object syntax (C<system PROGRAM LIST>) -may fall back to trying the shell if the first C<spawn()> fails. (Win32) +may fall back to trying the shell if the first C<spawn()> fails. -Does not automatically flush output handles on some platforms. (SunOS, Solaris, HP-UX) +Does not automatically flush output handles on some platforms. 
-The return value is POSIX-like (shifted up by 8 bits), which only allows -room for a made-up value derived from the severity bits of the native -32-bit condition code (unless overridden by C<use vmsish 'status'>). -If the native condition code is one that has a POSIX value encoded, the -POSIX value will be decoded to extract the expected exit value. -For more details see L<perlvms/$?>. (VMS) +(VMS) +As with Win32, C<system(1, @args)> spawns an external process and +immediately returns its process designator without waiting for the +process to terminate. In this case the return value may be used subsequently +in L<C<wait>|perlfunc/wait> or L<C<waitpid>|perlfunc/waitpid PID,FLAGS>. +Otherwise the return value is POSIX-like (shifted up by 8 bits), which only +allows room for a made-up value derived from the severity bits of the native +32-bit condition code (unless overridden by +L<C<use vmsish 'status'>|vmsish/C<vmsish status>>). If the native +condition code is one that has a POSIX value encoded, the POSIX value will +be decoded to extract the expected exit value. For more details see +L<perlvms/$?>. =item telldir -Not implemented. (Android) +(Android) +Not implemented. =item times -"cumulative" times will be bogus. On anything other than Windows NT +(Win32) +"Cumulative" times will be bogus. On anything other than Windows NT or Windows 2000, "system" time will be bogus, and "user" time is -actually the time returned by the C<clock()> function in the C runtime -library. (Win32) +actually the time returned by the L<C<clock()>|clock(3)> function in the C +runtime library. -Not useful. (S<RISC OS>) +(S<RISC OS>) +Not useful. =item truncate -Not implemented. (Older versions of VMS) +(Older versions of VMS) +Not implemented. -Truncation to same-or-shorter lengths only. (VOS) +(VOS) +Truncation to same-or-shorter lengths only. 
+(Win32) If a FILEHANDLE is supplied, it must be writable and opened in append -mode (i.e., use C<<< open(FH, '>>filename') >>> -or C<sysopen(FH,...,O_APPEND|O_RDWR)>. If a filename is supplied, it -should not be held open elsewhere. (Win32) +mode (i.e., use C<<< open(my $fh, '>>', 'filename') >>> +or C<sysopen(my $fh, ..., O_APPEND|O_RDWR)>. If a filename is supplied, it +should not be held open elsewhere. =item umask -Returns undef where unavailable. +Returns C<undef> where unavailable. +(AmigaOS) C<umask> works but the correct permissions are set only when the file -is finally closed. (AmigaOS) +is finally closed. =item utime -Only the modification time is updated. (VMS, S<RISC OS>) +(VMS, S<RISC OS>) +Only the modification time is updated. +(Win32) May not behave as expected. Behavior depends on the C runtime -library's implementation of C<utime()>, and the filesystem being -used. The FAT filesystem typically does not support an "access -time" field, and it may limit timestamps to a granularity of -two seconds. (Win32) +library's implementation of L<C<utime()>|utime(2)>, and the filesystem +being used. The FAT filesystem typically does not support an "access +time" field, and it may limit timestamps to a granularity of two seconds. =item wait =item waitpid +(Win32) Can only be applied to process handles returned for processes spawned -using C<system(1, ...)> or pseudo processes created with C<fork()>. (Win32) +using C<system(1, ...)> or pseudo processes created with +L<C<fork>|perlfunc/fork>. -Not useful. (S<RISC OS>) +(S<RISC OS>) +Not useful. =back @@ -2215,7 +2416,6 @@ available at L<http://www.cpan.org/src/> Open UNIX (Unixware) (since Perl 5.8.1/5.9.0) OS/2 OS/400 (using the PASE) (since Perl 5.8.1/5.9.0) - PowerUX POSIX-BC (formerly BS2000) QNX Solaris @@ -2326,7 +2526,7 @@ L<perlunicode>, L<perlvms>, L<perlvos>, L<perlwin32>, and L<Win32>. 
=head1 AUTHORS / CONTRIBUTORS -Abigail <abigail@foad.org>, +Abigail <abigail@abigail.be>, Charles Bailey <bailey@newman.upenn.edu>, Graham Barr <gbarr@pobox.com>, Tom Christiansen <tchrist@perl.com>, @@ -2344,6 +2544,7 @@ Nick Ing-Simmons <nick@ing-simmons.net>, Andreas J. KE<ouml>nig <a.koenig@mind.de>, Markus Laker <mlaker@contax.co.uk>, Andrew M. Langmead <aml@world.std.com>, +Lukas Mai <l.mai@web.de>, Larry Moore <ljmoore@freespace.net>, Paul Moore <Paul.Moore@uk.origin-it.com>, Chris Nandor <pudge@pobox.com>, diff --git a/gnu/usr.bin/perl/pod/perlre.pod b/gnu/usr.bin/perl/pod/perlre.pod index 094a87b8068..70c53f15367 100644 --- a/gnu/usr.bin/perl/pod/perlre.pod +++ b/gnu/usr.bin/perl/pod/perlre.pod @@ -7,27 +7,312 @@ perlre - Perl regular expressions This page describes the syntax of regular expressions in Perl. -If you haven't used regular expressions before, a quick-start -introduction is available in L<perlrequick>, and a longer tutorial -introduction is available in L<perlretut>. +If you haven't used regular expressions before, a tutorial introduction +is available in L<perlretut>. If you know just a little about them, +a quick-start introduction is available in L<perlrequick>. -For reference on how regular expressions are used in matching -operations, plus various examples of the same, see discussions of -C<m//>, C<s///>, C<qr//> and C<"??"> in L<perlop/"Regexp Quote-Like -Operators">. +Except for L</The Basics> section, this page assumes you are familiar +with regular expression basics, like what is a "pattern", what does it +look like, and how it is basically used. For a reference on how they +are used, plus various examples of the same, see discussions of C<m//>, +C<s///>, C<qr//> and C<"??"> in L<perlop/"Regexp Quote-Like Operators">. New in v5.22, L<C<use re 'strict'>|re/'strict' mode> applies stricter rules than otherwise when compiling regular expression patterns. It can find things that, while legal, may not be what you intended. 
+=head2 The Basics +X<regular expression, version 8> X<regex, version 8> X<regexp, version 8> + +Regular expressions are strings with the very particular syntax and +meaning described in this document and auxiliary documents referred to +by this one. The strings are called "patterns". Patterns are used to +determine if some other string, called the "target", has (or doesn't +have) the characteristics specified by the pattern. We call this +"matching" the target string against the pattern. Usually the match is +done by having the target be the first operand, and the pattern be the +second operand, of one of the two binary operators C<=~> and C<!~>, +listed in L<perlop/Binding Operators>; and the pattern will have been +converted from an ordinary string by one of the operators in +L<perlop/"Regexp Quote-Like Operators">, like so: + + $foo =~ m/abc/ + +This evaluates to true if and only if the string in the variable C<$foo> +contains somewhere in it, the sequence of characters "a", "b", then "c". +(The C<=~ m>, or match operator, is described in +L<perlop/m/PATTERN/msixpodualngc>.) + +Patterns that aren't already stored in some variable must be delimited, +at both ends, by delimiter characters. These are often, as in the +example above, forward slashes, and the typical way a pattern is written +in documentation is with those slashes. In most cases, the delimiter +is the same character, fore and aft, but there are a few cases where a +character looks like it has a mirror-image mate, where the opening +version is the beginning delimiter, and the closing one is the ending +delimiter, like + + $foo =~ m<abc> + +Most times, the pattern is evaluated in double-quotish context, but it +is possible to choose delimiters to force single-quotish, like + + $foo =~ m'abc' + +If the pattern contains its delimiter within it, that delimiter must be +escaped. Prefixing it with a backslash (I<e.g.>, C<"/foo\/bar/">) +serves this purpose.
+ +Any single character in a pattern matches that same character in the +target string, unless the character is a I<metacharacter> with a special +meaning described in this document. A sequence of non-metacharacters +matches the same sequence in the target string, as we saw above with +C<m/abc/>. + +Only a few characters (all of them being ASCII punctuation characters) +are metacharacters. The most commonly used one is a dot C<".">, which +normally matches almost any character (including a dot itself). + +You can cause characters that normally function as metacharacters to be +interpreted literally by prefixing them with a C<"\">, just like the +pattern's delimiter must be escaped if it also occurs within the +pattern. Thus, C<"\."> matches just a literal dot, C<"."> instead of +its normal meaning. This means that the backslash is also a +metacharacter, so C<"\\"> matches a single C<"\">. And a sequence that +contains an escaped metacharacter matches the same sequence (but without +the escape) in the target string. So, the pattern C</blur\\fl/> would +match any target string that contains the sequence C<"blur\fl">. + +The metacharacter C<"|"> is used to match one thing or another. Thus + + $foo =~ m/this|that/ + +is TRUE if and only if C<$foo> contains either the sequence C<"this"> or +the sequence C<"that">. Like all metacharacters, prefixing the C<"|"> +with a backslash makes it match the plain punctuation character; in its +case, the VERTICAL LINE. + + $foo =~ m/this\|that/ + +is TRUE if and only if C<$foo> contains the sequence C<"this|that">. + +You aren't limited to just a single C<"|">. + + $foo =~ m/fee|fie|foe|fum/ + +is TRUE if and only if C<$foo> contains any of those 4 sequences from +the children's story "Jack and the Beanstalk". + +As you can see, the C<"|"> binds less tightly than a sequence of +ordinary characters. We can override this by using the grouping +metacharacters, the parentheses C<"("> and C<")">. 
+ + $foo =~ m/th(is|at) thing/ + +is TRUE if and only if C<$foo> contains either the sequence S<C<"this +thing">> or the sequence S<C<"that thing">>. The portions of the string +that match the portions of the pattern enclosed in parentheses are +normally made available separately for use later in the pattern, +substitution, or program. This is called "capturing", and it can get +complicated. See L</Capture groups>. + +The first alternative includes everything from the last pattern +delimiter (C<"(">, C<"(?:"> (described later), I<etc>. or the beginning +of the pattern) up to the first C<"|">, and the last alternative +contains everything from the last C<"|"> to the next closing pattern +delimiter. That's why it's common practice to include alternatives in +parentheses: to minimize confusion about where they start and end. + +Alternatives are tried from left to right, so the first +alternative found for which the entire expression matches, is the one that +is chosen. This means that alternatives are not necessarily greedy. For +example: when matching C<foo|foot> against C<"barefoot">, only the C<"foo"> +part will match, as that is the first alternative tried, and it successfully +matches the target string. (This might not seem important, but it is +important when you are capturing matched text using parentheses.) + +Besides taking away the special meaning of a metacharacter, a prefixed +backslash changes some letter and digit characters away from matching +just themselves to instead have special meaning. These are called +"escape sequences", and all such are described in L<perlrebackslash>. A +backslash sequence (of a letter or digit) that doesn't currently have +special meaning to Perl will raise a warning if warnings are enabled, +as those are reserved for potential future use. + +One such sequence is C<\b>, which matches a boundary of some sort. +C<\b{wb}> and a few others give specialized types of boundaries. 
+(They are all described in detail starting at +L<perlrebackslash/\b{}, \b, \B{}, \B>.) Note that these don't match +characters, but the zero-width spaces between characters. They are an +example of a L<zero-width assertion|/Assertions>. Consider again, + + $foo =~ m/fee|fie|foe|fum/ + +It evaluates to TRUE if, besides those 4 words, any of the sequences +"feed", "field", "Defoe", "fume", and many others are in C<$foo>. By +judicious use of C<\b> (or better (because it is designed to handle +natural language) C<\b{wb}>), we can make sure that only the Giant's +words are matched: + + $foo =~ m/\b(fee|fie|foe|fum)\b/ + $foo =~ m/\b{wb}(fee|fie|foe|fum)\b{wb}/ + +The final example shows that the characters C<"{"> and C<"}"> are +metacharacters. + +Another use for escape sequences is to specify characters that cannot +(or which you prefer not to) be written literally. These are described +in detail in L<perlrebackslash/Character Escapes>, but the next three +paragraphs briefly describe some of them. + +Various control characters can be written in C language style: C<"\n"> +matches a newline, C<"\t"> a tab, C<"\r"> a carriage return, C<"\f"> a +form feed, I<etc>. + +More generally, C<\I<nnn>>, where I<nnn> is a string of three octal +digits, matches the character whose native code point is I<nnn>. You +can easily run into trouble if you don't have exactly three digits. So +always use three, or since Perl 5.14, you can use C<\o{...}> to specify +any number of octal digits. + +Similarly, C<\xI<nn>>, where I<nn> are hexadecimal digits, matches the +character whose native ordinal is I<nn>. Again, not using exactly two +digits is a recipe for disaster, but you can use C<\x{...}> to specify +any number of hex digits. + +Besides being a metacharacter, the C<"."> is an example of a "character +class", something that can match any single character of a given set of +them. In its case, the set is just about all possible characters. 
Perl +predefines several character classes besides the C<".">; there is a +separate reference page about just these, L<perlrecharclass>. + +You can define your own custom character classes, by putting into your +pattern in the appropriate place(s), a list of all the characters you +want in the set. You do this by enclosing the list within C<[]> bracket +characters. These are called "bracketed character classes" when we are +being precise, but often the word "bracketed" is dropped. (Dropping it +usually doesn't cause confusion.) This means that the C<"["> character +is another metacharacter. It doesn't match anything just by itself; it +is used only to tell Perl that what follows it is a bracketed character +class. If you want to match a literal left square bracket, you must +escape it, like C<"\[">. The matching C<"]"> is also a metacharacter; +again it doesn't match anything by itself, but just marks the end of +your custom class to Perl. It is an example of a "sometimes +metacharacter". It isn't a metacharacter if there is no corresponding +C<"[">, and matches its literal self: + + print "]" =~ /]/; # prints 1 + +The list of characters within the character class gives the set of +characters matched by the class. C<"[abc]"> matches a single "a" or "b" +or "c". But if the first character after the C<"["> is C<"^">, the +class instead matches any character not in the list. Within a list, the +C<"-"> character specifies a range of characters, so that C<a-z> +represents all characters between "a" and "z", inclusive. If you want +either C<"-"> or C<"]"> itself to be a member of a class, put it at the +start of the list (possibly after a C<"^">), or escape it with a +backslash. C<"-"> is also taken literally when it is at the end of the +list, just before the closing C<"]">. (The following all specify the +same class of three characters: C<[-az]>, C<[az-]>, and C<[a\-z]>. 
All +are different from C<[a-z]>, which specifies a class containing +twenty-six characters, even on EBCDIC-based character sets.) + +There is lots more to bracketed character classes; full details are in +L<perlrecharclass/Bracketed Character Classes>. + +=head3 Metacharacters +X<metacharacter> +X<\> X<^> X<.> X<$> X<|> X<(> X<()> X<[> X<[]> + +L</The Basics> introduced some of the metacharacters. This section +gives them all. Most of them have the same meaning as in the I<egrep> +command. + +Only the C<"\"> is always a metacharacter. The others are metacharacters +just sometimes. The following table lists all of them, summarizes +their use, and gives the contexts where they are metacharacters. +Outside those contexts or if prefixed by a C<"\">, they match their +corresponding punctuation character. In some cases, their meaning +varies depending on various pattern modifiers that alter the default +behaviors. See L</Modifiers>. + + + PURPOSE WHERE + \ Escape the next character Always, except when + escaped by another \ + ^ Match the beginning of the string Not in [] + (or line, if /m is used) + ^ Complement the [] class At the beginning of [] + . Match any single character except newline Not in [] + (under /s, includes newline) + $ Match the end of the string Not in [], but can + (or before newline at the end of the mean interpolate a + string; or before any newline if /m is scalar + used) + | Alternation Not in [] + () Grouping Not in [] + [ Start Bracketed Character class Not in [] + ] End Bracketed Character class Only in [], and + not first + * Matches the preceding element 0 or more Not in [] + times + + Matches the preceding element 1 or more Not in [] + times + ?
Matches the preceding element 0 or 1 Not in [] + times + { Starts a sequence that gives number(s) Not in [] + of times the preceding element can be + matched + { when following certain escape sequences + starts a modifier to the meaning of the + sequence + } End sequence started by { + - Indicates a range Only in [] interior + # Beginning of comment, extends to line end Only with /x modifier + +Notice that most of the metacharacters lose their special meaning when +they occur in a bracketed character class, except C<"^"> has a different +meaning when it is at the beginning of such a class. And C<"-"> and C<"]"> +are metacharacters only at restricted positions within bracketed +character classes; while C<"}"> is a metacharacter only when closing a +special construct started by C<"{">. + +In double-quotish context, as is usually the case, you need to be +careful about C<"$"> and the non-metacharacter C<"@">. Those could +interpolate variables, which may or may not be what you intended. + +These rules were designed for compactness of expression, rather than +legibility and maintainability. The L</E<sol>x and E<sol>xx> pattern +modifiers allow you to insert white space to improve readability. And +use of S<C<L<re 'strict'|re/'strict' mode>>> adds extra checking to +catch some typos that might silently compile into something unintended. + +By default, the C<"^"> character is guaranteed to match only the +beginning of the string, the C<"$"> character only the end (or before the +newline at the end), and Perl does certain optimizations with the +assumption that the string contains only one line. Embedded newlines +will not be matched by C<"^"> or C<"$">. You may, however, wish to treat a +string as a multi-line buffer, such that the C<"^"> will match after any +newline within the string (except if the newline is the last character in +the string), and C<"$"> will match before any newline. 
At the +cost of a little more overhead, you can do this by using the +L</C<E<sol>m>> modifier on the pattern match operator. (Older programs +did this by setting C<$*>, but this option was removed in perl 5.10.) +X<^> X<$> X</m> + +To simplify multi-line substitutions, the C<"."> character never matches a +newline unless you use the L<C<E<sol>s>|/s> modifier, which in effect tells +Perl to pretend the string is a single line--even if it isn't. +X<.> X</s> + =head2 Modifiers =head3 Overview -Matching operations can have various modifiers. Modifiers -that relate to the interpretation of the regular expression inside -are listed below. Modifiers that alter the way a regular expression -is used by Perl are detailed in L<perlop/"Regexp Quote-Like Operators"> and +The default behavior for matching can be changed, using various +modifiers. Modifiers that relate to the interpretation of the pattern +are listed just below. Modifiers that alter the way a pattern is used +by Perl are detailed in L<perlop/"Regexp Quote-Like Operators"> and L<perlop/"Gory details of parsing quoted constructs">. =over 4 @@ -35,7 +320,7 @@ L<perlop/"Gory details of parsing quoted constructs">. =item B<C<m>> X</m> X<regex, multiline> X<regexp, multiline> X<regular expression, multiline> -Treat the string as multiple lines. That is, change C<"^"> and C<"$"> from matching +Treat the string being matched against as multiple lines. That is, change C<"^"> and C<"$"> from matching the start of the string's first line and the end of its last line to matching the start and end of each line within the string. @@ -85,11 +370,11 @@ inverted, which otherwise could be highly confusing. See L<perlrecharclass/Bracketed Character Classes>, and L<perlrecharclass/Negation>. -=item B<C<x>> +=item B<C<x>> and B<C<xx>> X</x> Extend your pattern's legibility by permitting whitespace and comments. 
-Details in L</"/x"> +Details in L</E<sol>x and E<sol>xx> =item B<C<p>> X</p> X<regex, preserve> X<regexp, preserve> @@ -105,7 +390,7 @@ after the match regardless of the modifier. X</a> X</d> X</l> X</u> These modifiers, all new in 5.14, affect which character-set rules -(Unicode, etc.) are used, as described below in +(Unicode, I<etc>.) are used, as described below in L</Character set modifiers>. =item B<C<n>> @@ -113,7 +398,7 @@ X</n> X<regex, non-capture> X<regexp, non-capture> X<regular expression, non-capture> Prevent the grouping metacharacters C<()> from capturing. This modifier, -new in 5.22, will stop C<$1>, C<$2>, etc... from being filled in. +new in 5.22, will stop C<$1>, C<$2>, I<etc>... from being filled in. "hello" =~ /(hi|hello)/; # $1 is "hello" "hello" =~ /(hi|hello)/n; # $1 is undef @@ -143,7 +428,6 @@ L<perlretut/"Using regular expressions in Perl"> are: g - globally match the pattern repeatedly in the string Substitution-specific modifiers described in - L<perlop/"s/PATTERN/REPLACEMENT/msixpodualngcer"> are: e - evaluate the right-hand side as an expression @@ -154,7 +438,7 @@ L<perlop/"s/PATTERN/REPLACEMENT/msixpodualngcer"> are: =back Regular expression modifiers are usually written in documentation -as e.g., "the C</x> modifier", even though the delimiter +as I<e.g.>, "the C</x> modifier", even though the delimiter in question might not really be a slash. The modifiers C</imnsxadlup> may also be embedded within the regular expression itself using the C<(?...)> construct, see L</Extended Patterns> below. @@ -164,12 +448,12 @@ the C<(?...)> construct, see L</Extended Patterns> below. Some of the modifiers require more explanation than given in the L</Overview> above. -=head4 /x +=head4 C</x> and C</xx> -C</x> tells +A single C</x> tells the regular expression parser to ignore most whitespace that is neither backslashed nor within a bracketed character class. 
You can use this to -break up your regular expression into (slightly) more readable parts. +break up your regular expression into more readable parts. Also, the C<"#"> character is treated as a metacharacter introducing a comment that runs up to the pattern's closing delimiter, or to the end of the current line if the pattern extends onto the next line. Hence, @@ -189,6 +473,24 @@ You can use L</(?#text)> to create a comment that ends earlier than the end of the current line, but C<text> also can't contain the closing delimiter unless escaped with a backslash. +A common pitfall is to forget that C<"#"> characters begin a comment under +C</x> and are not matched literally. Just keep that in mind when trying +to puzzle out why a particular C</x> pattern isn't working as expected. + +Starting in Perl v5.26, if the modifier has a second C<"x"> within it, +it does everything that a single C</x> does, but additionally +non-backslashed SPACE and TAB characters within bracketed character +classes are also generally ignored, and hence can be added to make the +classes more readable. + + / [d-e g-i 3-7]/xx + /[ ! @ " # $ % ^ & * () = ? <> ' ]/xx + +may be easier to grasp than the squashed equivalents + + /[d-eg-i3-7]/ + /[!@"#$%^&*()=?<>']/ + Taken together, these features go a long way towards making Perl's regular expressions more readable. Here's an example: @@ -204,7 +506,7 @@ a C<\Q...\E> stays unaffected by C</x>. And note that C</x> doesn't affect space interpretation within a single multi-character construct. For example in C<\x{...}>, regardless of the C</x> modifier, there can be no spaces. Same for a L<quantifier|/Quantifiers> such as C<{3}> or -C<{5,}>. Similarly, C<(?:...)> can't have a space between the C<"{">, +C<{5,}>. Similarly, C<(?:...)> can't have a space between the C<"(">, C<"?">, and C<":">. Within any delimiters for such a construct, allowed spaces are not affected by C</x>, and depend on the construct. 
For example, C<\x{...}> can't have spaces because hexadecimal @@ -404,6 +706,10 @@ the pattern uses a Unicode break (C<\b{...}> or C<\B{...}>); or the pattern uses L</C<(?[ ])>> +=item 8 + +the pattern uses L<C<(*script_run: ...)>|/Script Runs> + =back Another mnemonic for this modifier is "Depends", as the rules actually @@ -433,8 +739,8 @@ compatibilities. =head4 /a (and /aa) -This modifier stands for ASCII-restrict (or ASCII-safe). This modifier, -unlike the others, may be doubled-up to increase its effect. +This modifier stands for ASCII-restrict (or ASCII-safe). This modifier +may be doubled-up to increase its effect. When it appears singly, it causes the sequences C<\d>, C<\s>, C<\w>, and the Posix character classes to match only in the ASCII range. They thus @@ -473,7 +779,7 @@ comes to case-insensitive matching. To forbid ASCII/non-ASCII matches (like "k" with C<\N{KELVIN SIGN}>), specify the C<"a"> twice, for example C</aai> or C</aia>. (The first -occurrence of C<"a"> restricts the C<\d>, etc., and the second occurrence +occurrence of C<"a"> restricts the C<\d>, I<etc>., and the second occurrence adds the C</i> restrictions.) But, note that code points outside the ASCII range will use Unicode rules for C</i> matching, so the modifier doesn't really restrict things to just ASCII; it just forbids the @@ -520,7 +826,7 @@ sets the default to C</u>, overriding any plain C<use locale>.) Unlike the mechanisms mentioned above, these affect operations besides regular expressions pattern matching, and so give more consistent results with other operators, including using -C<\U>, C<\l>, etc. in substitution replacements. +C<\U>, C<\l>, I<etc>. in substitution replacements. If none of the above apply, for backwards compatibility reasons, the C</d> modifier is the one in effect by default. As this can lead to @@ -540,50 +846,12 @@ Unicode rules, and neither did all occurrences of C<\N{}>, until 5.12. 
=head2 Regular Expressions -=head3 Metacharacters - -The patterns used in Perl pattern matching evolved from those supplied in -the Version 8 regex routines. (The routines are derived -(distantly) from Henry Spencer's freely redistributable reimplementation -of the V8 routines.) See L<Version 8 Regular Expressions> for -details. - -In particular the following metacharacters have their standard I<egrep>-ish -meanings: -X<metacharacter> -X<\> X<^> X<.> X<$> X<|> X<(> X<()> X<[> X<[]> - - - \ Quote the next metacharacter - ^ Match the beginning of the line - . Match any character (except newline) - $ Match the end of the string (or before newline at the end - of the string) - | Alternation - () Grouping - [] Bracketed Character class - -By default, the C<"^"> character is guaranteed to match only the -beginning of the string, the C<"$"> character only the end (or before the -newline at the end), and Perl does certain optimizations with the -assumption that the string contains only one line. Embedded newlines -will not be matched by C<"^"> or C<"$">. You may, however, wish to treat a -string as a multi-line buffer, such that the C<"^"> will match after any -newline within the string (except if the newline is the last character in -the string), and C<"$"> will match before any newline. At the -cost of a little more overhead, you can do this by using the /m modifier -on the pattern match operator. (Older programs did this by setting C<$*>, -but this option was removed in perl 5.10.) -X<^> X<$> X</m> - -To simplify multi-line substitutions, the C<"."> character never matches a -newline unless you use the C</s> modifier, which in effect tells Perl to pretend -the string is a single line--even if it isn't. -X<.> X</s> - =head3 Quantifiers -The following standard quantifiers are recognized: +Quantifiers are used when a particular portion of a pattern needs to +match a certain number (or numbers) of times. 
If there isn't a +quantifier the number of times to match is exactly one. The following +standard quantifiers are recognized: X<metacharacter> X<quantifier> X<*> X<+> X<?> X<{n}> X<{n,}> X<{n,m}> * Match 0 or more times @@ -593,15 +861,15 @@ X<metacharacter> X<quantifier> X<*> X<+> X<?> X<{n}> X<{n,}> X<{n,m}> {n,} Match at least n times {n,m} Match at least n but not more than m times -(If a curly bracket occurs in a context other than one of the -quantifiers listed above, where it does not form part of a backslashed -sequence like C<\x{...}>, it is treated as a regular character. -However, a deprecation warning is raised for these -occurrences, and in Perl v5.26, literal uses of a curly bracket will be -required to be escaped, say by preceding them with a backslash (C<"\{">) -or enclosing them within square brackets (C<"[{]">). This change will -allow for future syntax extensions (like making the lower bound of a -quantifier optional), and better error checking of quantifiers.) +(If a non-escaped curly bracket occurs in a context other than one of +the quantifiers listed above, where it does not form part of a +backslashed sequence like C<\x{...}>, it is either a fatal syntax error, +or treated as a regular character, generally with a deprecation warning +raised. To escape it, you can precede it with a backslash (C<"\{">) or +enclose it within square brackets (C<"[{]">). +This change will allow for future syntax extensions (like making the +lower bound of a quantifier optional), and better error checking of +quantifiers). The C<"*"> quantifier is equivalent to C<{0,}>, the C<"+"> quantifier to C<{1,}>, and the C<"?"> quantifier to C<{0,1}>. I<n> and I<m> are limited @@ -642,7 +910,7 @@ For instance, 'aaaa' =~ /a++a/ -will never match, as the C<a++> will gobble up all the C<a>'s in the +will never match, as the C<a++> will gobble up all the C<"a">'s in the string and won't leave any for the remaining part of the pattern. 
This feature can be extremely useful to give perl hints about where it shouldn't backtrack. For instance, the typical "match a double-quoted @@ -775,20 +1043,21 @@ See L<perlrecharclass/Extended Bracketed Character Classes> for details. =head3 Assertions -Perl defines the following zero-width assertions: +Besides L<C<"^"> and C<"$">|/Metacharacters>, Perl defines the following +zero-width assertions: X<zero-width assertion> X<assertion> X<regex, zero-width assertion> X<regexp, zero-width assertion> X<regular expression, zero-width assertion> X<\b> X<\B> X<\A> X<\Z> X<\z> X<\G> - \b{} Match at Unicode boundary of specified type - \B{} Match where corresponding \b{} doesn't match - \b Match a word boundary - \B Match except at a word boundary - \A Match only at beginning of string - \Z Match only at end of string, or before newline at the end - \z Match only at end of string - \G Match only at pos() (e.g. at the end-of-match position + \b{} Match at Unicode boundary of specified type + \B{} Match where corresponding \b{} doesn't match + \b Match a \w\W or \W\w boundary + \B Match except at a \w\W or \W\w boundary + \A Match only at beginning of string + \Z Match only at end of string, or before newline at the end + \z Match only at end of string + \G Match only at pos() (e.g. at the end-of-match position of prior m//g) A Unicode boundary (C<\b{}>), available starting in v5.22, is a spot @@ -849,7 +1118,7 @@ string: =head3 Capture groups -The bracketing construct C<( ... )> creates capture groups (also referred to as +The grouping construct C<( ... )> creates capture groups (also referred to as capture buffers). To refer to the current contents of a group later on, within the same pattern, use C<\g1> (or C<\g{1}>) for the first, C<\g2> (or C<\g{2}>) for the second, and so on. 
@@ -863,11 +1132,11 @@ X<named capture buffer> X<regular expression, named capture buffer> X<named capture group> X<regular expression, named capture group> X<%+> X<$+{name}> X<< \k<name> >> There is no limit to the number of captured substrings that you may use. -Groups are numbered with the leftmost open parenthesis being number 1, etc. If +Groups are numbered with the leftmost open parenthesis being number 1, I<etc>. If a group did not match, the associated backreference won't match either. (This can happen if the group is optional, or in a different branch of an alternation.) -You can omit the C<"g">, and write C<"\1">, etc, but there are some issues with +You can omit the C<"g">, and write C<"\1">, I<etc>, but there are some issues with this form, described below. You can also refer to capture groups relatively, by using a negative number, so @@ -904,7 +1173,7 @@ Capture group contents are dynamically scoped and available to you outside the pattern until the end of the enclosing block or until the next successful match, whichever comes first. (See L<perlsyn/"Compound Statements">.) You can refer to them by absolute number (using C<"$1"> instead of C<"\g1">, -etc); or by name via the C<%+> hash, using C<"$+{I<name>}">. +I<etc>); or by name via the C<%+> hash, using C<"$+{I<name>}">. Braces are required in referring to named capture groups, but are optional for absolute or relative numbered ones. Braces are safer when creating a regex by @@ -915,7 +1184,7 @@ is probably not what you intended. The C<\g> and C<\k> notations were introduced in Perl 5.10.0. Prior to that there were no named nor relative numbered capture groups. Absolute numbered groups were referred to using C<\1>, -C<\2>, etc., and this notation is still +C<\2>, I<etc>., and this notation is still accepted (and likely always will be). 
But it leads to some ambiguities if there are more than 9 capture groups, as C<\10> could mean either the tenth capture group, or the character whose ordinal in octal is 010 (a backspace in @@ -977,7 +1246,7 @@ variable. X<$+> X<$^N> X<$&> X<$`> X<$'> These special variables, like the C<%+> hash and the numbered match variables -(C<$1>, C<$2>, C<$3>, etc.) are dynamically scoped +(C<$1>, C<$2>, C<$3>, I<etc>.) are dynamically scoped until the end of the enclosing block or until the next successful match, whichever comes first. (See L<perlsyn/"Compound Statements">.) X<$+> X<$^N> X<$&> X<$`> X<$'> @@ -992,7 +1261,7 @@ beware that once Perl sees that you need one of C<$&>, C<$`>, or C<$'> anywhere in the program, it has to provide them for every pattern match. This may substantially slow your program. -Perl uses the same mechanism to produce C<$1>, C<$2>, etc, so you also +Perl uses the same mechanism to produce C<$1>, C<$2>, I<etc>, so you also pay a price for each pattern that contains capturing parentheses. (To avoid this cost while retaining the grouping behaviour, use the extended regular expression C<(?: ... )> instead.) But if you never @@ -1056,12 +1325,6 @@ pair of parentheses with a question mark as the first thing within the parentheses. The character after the question mark indicates the extension. -The stability of these extensions varies widely. Some have been -part of the core language for many years. Others are experimental -and may change without warning or be completely removed. Check -the documentation on an individual feature to verify its current -status. - A question mark was chosen for this and for the minimal-matching construct because 1) question marks are rare in older regular expressions, and 2) whenever you see one, you should stop and @@ -1080,16 +1343,38 @@ a backslash if it appears in the comment. See L</E<sol>x> for another way to have comments in patterns. 
+Note that a comment can go just about anywhere, except in the middle of +an escape sequence. Examples: + + qr/foo(?#comment)bar/ # Matches 'foobar' + + # The pattern below matches 'abcd', 'abccd', or 'abcccd' + qr/abc(?#comment between literal and its quantifier){1,3}d/ + + # The pattern below generates a syntax error, because the '\p' must + # be followed immediately by a '{'. + qr/\p(?#comment between \p and its property name){Any}/ + + # The pattern below generates a syntax error, because the initial + # '\(' is a literal opening parenthesis, and so there is nothing + # for the closing ')' to match + qr/\(?#the backslash means this isn't a comment)p{Any}/ + + # Comments can be used to fold long patterns into multiple lines + qr/First part of a long regex(?# + )remaining part/ + =item C<(?adlupimnsx-imnsx)> =item C<(?^alupimnsx)> X<(?)> X<(?^)> One or more embedded pattern-match modifiers, to be turned on (or -turned off, if preceded by C<"-">) for the remainder of the pattern or +turned off if preceded by C<"-">) for the remainder of the pattern or the remainder of the enclosing pattern group (if any). -This is particularly useful for dynamic patterns, such as those read in from a +This is particularly useful for dynamically-generated patterns, +such as those read in from a configuration file, taken from an argument, or specified in a table somewhere. Consider the case where some patterns want to be case-sensitive and some do not: The case-insensitive ones merely need to @@ -1115,6 +1400,29 @@ These modifiers do not carry over into named subpatterns called in the enclosing group. In other words, a pattern such as C<((?i)(?&NAME))> does not change the case-sensitivity of the C<"NAME"> pattern. +A modifier is overridden by later occurrences of this construct in the +same scope containing the same modifier, so that + + /((?im)foo(?-m)bar)/ + +matches all of C<foobar> case insensitively, but uses C</m> rules for +only the C<foo> portion.
The C<"a"> flag overrides C<aa> as well; +likewise C<aa> overrides C<"a">. The same goes for C<"x"> and C<xx>. +Hence, in + + /(?-x)foo/xx + +both C</x> and C</xx> are turned off during matching C<foo>. And in + + /(?x)foo/x + +C</x> but NOT C</xx> is turned on for matching C<foo>. (One might +mistakenly think that since the inner C<(?x)> is already in the scope of +C</x>, that the result would effectively be the sum of them, yielding +C</xx>. It doesn't work that way.) Similarly, doing something like +C<(?xx-x)foo> turns off all C<"x"> behavior for matching C<foo>; it is not +that you subtract 1 C<"x"> from 2 to get 1 C<"x"> remaining. + Any of these modifiers can be set to apply globally to all regular expressions compiled within the scope of a C<use re>. See L<re/"'/flags' mode">. @@ -1124,15 +1432,15 @@ after the C<"?"> is a shorthand equivalent to C<d-imnsx>. Flags (except C<"d">) may follow the caret to override it. But a minus sign is not legal with it. -Note that the C<a>, C<d>, C<l>, C<p>, and C<u> modifiers are special in -that they can only be enabled, not disabled, and the C<a>, C<d>, C<l>, and -C<u> modifiers are mutually exclusive: specifying one de-specifies the -others, and a maximum of one (or two C<a>'s) may appear in the +Note that the C<"a">, C<"d">, C<"l">, C<"p">, and C<"u"> modifiers are special in +that they can only be enabled, not disabled, and the C<"a">, C<"d">, C<"l">, and +C<"u"> modifiers are mutually exclusive: specifying one de-specifies the +others, and a maximum of one (or two C<"a">'s) may appear in the construct. Thus, for example, C<(?-p)> will warn when compiled under C<use warnings>; C<(?-d:...)> and C<(?dl:...)> are fatal errors. -Note also that the C<p> modifier is special in that its presence +Note also that the C<"p"> modifier is special in that its presence anywhere in a pattern has a global effect. =item C<(?:pattern)> @@ -1148,11 +1456,13 @@ C<"()">, but doesn't make backreferences as C<"()"> does.
So @fields = split(/\b(?:a|b|c)\b/) -is like +matches the same field delimiters as @fields = split(/\b(a|b|c)\b/) -but doesn't spit out extra fields. It's also cheaper not to capture +but doesn't spit out the delimiters themselves as extra fields (even though +that's the behaviour of L<perlfunc/split> when its pattern contains capturing +groups). It's also cheaper not to capture characters if you don't need to. Any letters between C<"?"> and C<":"> act as flags modifiers as with @@ -1167,6 +1477,11 @@ is equivalent to the more verbose Note that any C<()> constructs enclosed within this one will still capture unless the C</n> modifier is in effect. +Like the L</(?adlupimnsx-imnsx)> construct, C<aa> and C<"a"> override each +other, as do C<xx> and C<"x">. They are not additive. So, doing +something like C<(?xx-x:foo)> turns off all C<"x"> behavior for matching +C<foo>. + Starting in Perl 5.14, a C<"^"> (caret or circumflex accent) immediately after the C<"?"> is a shorthand equivalent to C<d-imnsx>. Any positive flags (except C<"d">) may follow the caret, so @@ -1220,12 +1535,12 @@ Consider the following pattern. The numbers underneath show in which group the captured content will be stored. - # before ---------------branch-reset----------- after + # before ---------------branch-reset----------- after / ( a ) (?| x ( y ) z | (p (q) r) | (t) u (v) ) ( z ) /x - # 1 2 2 3 2 3 4 + # 1 2 2 3 2 3 4 -Be careful when using the branch reset pattern in combination with -named captures. Named captures are implemented as being aliases to +Be careful when using the branch reset pattern in combination with +named captures. Named captures are implemented as being aliases to numbered groups holding the captures, and that interferes with the implementation of the branch reset pattern. 
If you are using named captures in a branch reset pattern, it's best to use the same names, @@ -1237,8 +1552,8 @@ in the same order, in each of the alternations: Not doing so may lead to surprises: "12" =~ /(?| (?<a> \d+ ) | (?<b> \D+))/x; - say $+ {a}; # Prints '12' - say $+ {b}; # *Also* prints '12'. + say $+{a}; # Prints '12' + say $+{b}; # *Also* prints '12'. The problem here is that both the group named C<< a >> and the group named C<< b >> are aliases for the group belonging to C<< $1 >>. @@ -1255,13 +1570,30 @@ lookahead matches text following the current match position. =over 4 =item C<(?=pattern)> -X<(?=)> X<look-ahead, positive> X<lookahead, positive> + +=item C<(*pla:pattern)> + +=item C<(*positive_lookahead:pattern)> +X<(?=)> +X<(*pla> +X<(*positive_lookahead> +X<look-ahead, positive> X<lookahead, positive> A zero-width positive lookahead assertion. For example, C</\w+(?=\t)/> matches a word followed by a tab, without including the tab in C<$&>. +The alphabetic forms are experimental; using them yields a warning in the +C<experimental::alpha_assertions> category. + =item C<(?!pattern)> -X<(?!)> X<look-ahead, negative> X<lookahead, negative> + +=item C<(*nla:pattern)> + +=item C<(*negative_lookahead:pattern)> +X<(?!)> +X<(*nla> +X<(*negative_lookahead> +X<look-ahead, negative> X<lookahead, negative> A zero-width negative lookahead assertion. For example C</foo(?!bar)/> matches any occurrence of "foo" that isn't followed by "bar". Note @@ -1273,8 +1605,20 @@ will not do what you want. That's because the C<(?!foo)> is just saying that the next thing cannot be "foo"--and it's not, it's a "bar", so "foobar" will match. Use lookbehind instead (see below). -=item C<(?<=pattern)> C<\K> -X<(?<=)> X<look-behind, positive> X<lookbehind, positive> X<\K> +The alphabetic forms are experimental; using them yields a warning in the +C<experimental::alpha_assertions> category. 
+ +=item C<(?<=pattern)> + +=item C<\K> + +=item C<(*plb:pattern)> + +=item C<(*positive_lookbehind:pattern)> +X<(?<=)> +X<(*plb> +X<(*positive_lookbehind> +X<look-behind, positive> X<lookbehind, positive> X<\K> A zero-width positive lookbehind assertion. For example, C</(?<=\t)\w+/> matches a word that follows a tab, without including the tab in C<$&>. @@ -1298,18 +1642,31 @@ can be rewritten as the much more efficient s/foo\Kbar//g; +The alphabetic forms (not including C<\K>) are experimental; using them +yields a warning in the C<experimental::alpha_assertions> category. + =item C<(?<!pattern)> -X<(?<!)> X<look-behind, negative> X<lookbehind, negative> + +=item C<(*nlb:pattern)> + +=item C<(*negative_lookbehind:pattern)> +X<(?<!)> +X<(*nlb> +X<(*negative_lookbehind> +X<look-behind, negative> X<lookbehind, negative> A zero-width negative lookbehind assertion. For example C</(?<!bar)foo/> matches any occurrence of "foo" that does not follow "bar". Works only for fixed-width lookbehind. -=back +The alphabetic forms are experimental; using them yields a warning in the +C<experimental::alpha_assertions> category. -=item C<(?'NAME'pattern)> +=back =item C<< (?<NAME>pattern) >> + +=item C<(?'NAME'pattern)> X<< (?<NAME>) >> X<(?'NAME')> X<named capture> X<capture> A named capture group. Identical in every respect to normal capturing @@ -1319,7 +1676,7 @@ constructs (like C<\g{NAME}>) and can be accessed by name after a successful match via C<%+> or C<%->. See L<perlvar> for more details on the C<%+> and C<%-> hashes. -If multiple distinct capture groups have the same name then the +If multiple distinct capture groups have the same name, then C<$+{NAME}> will refer to the leftmost defined group in the match. The forms C<(?'NAME'pattern)> and C<< (?<NAME>pattern) >> are equivalent. @@ -1440,7 +1797,7 @@ similar localizing behaviours. So later code blocks within the same pattern will still see the values which were localized in earlier blocks.
These accumulated localizations are undone either at the end of a successful match, or if the assertion is backtracked (compare -L<"Backtracking">). For example, +L</"Backtracking">). For example, $_ = 'a' x 8; m< @@ -1473,7 +1830,7 @@ regular expression. The assignment to C<$^R> above is properly localized, so the old value of C<$^R> is restored if the assertion is backtracked; compare -L<"Backtracking">. +L</"Backtracking">. Note that the special variable C<$^N> is particularly useful with code blocks to capture the results of submatches in variables without having to @@ -1512,7 +1869,7 @@ pattern captures "A"; Note that this means that there is no way for the inner pattern to refer to a capture group defined outside. (The code block itself can use C<$1>, -etc., to refer to the enclosing pattern's capture groups.) Thus, although +I<etc>., to refer to the enclosing pattern's capture groups.) Thus, although ('a' x 100)=~/(??{'(.)' x 100})/ @@ -1535,9 +1892,10 @@ L<C<(?I<PARNO>)>|/(?PARNO) (?-PARNO) (?+PARNO) (?R) (?0)> for a different, more efficient way to accomplish the same task. -Executing a postponed regular expression 50 times without consuming any -input string will result in a fatal error. The maximum depth is compiled -into perl, so changing it requires a custom build. +Executing a postponed regular expression too many times without +consuming any input string will also result in a fatal error. The depth +at which that happens is compiled into perl, so it can be changed with a +custom build. =item C<(?I<PARNO>)> C<(?-I<PARNO>)> C<(?+I<PARNO>)> C<(?R)> C<(?0)> X<(?PARNO)> X<(?1)> X<(?R)> X<(?0)> X<(?-1)> X<(?+1)> X<(?-PARNO)> X<(?+PARNO)> @@ -1602,9 +1960,9 @@ the output produced should be the following: $3 = bar(baz)+baz(bop) If there is no corresponding capture group defined, then it is a -fatal error. Recursing deeper than 50 times without consuming any input -string will also result in a fatal error. 
The maximum depth is compiled -into perl, so changing it requires a custom build. +fatal error. Recursing deeply without consuming any input string will +also result in a fatal error. The depth at which that happens is +compiled into perl, so it can be changed with a custom build. The following shows how using negative indexing can make it easier to embed recursive patterns inside of a C<qr//> construct @@ -1663,7 +2021,7 @@ matched); =item the special symbol C<(R)> (true when evaluated inside of recursion or eval). Additionally the -C<R> may be +C<"R"> may be followed by a number, (which will be true when evaluated when recursing inside of the appropriate group), or by C<&NAME>, in which case it will be true only when evaluated during recursion in the named group. @@ -1677,23 +2035,28 @@ Here's a summary of the possible predicates: =item C<(1)> C<(2)> ... Checks if the numbered capturing group has matched something. +Full syntax: C<< (?(1)then|else) >> =item C<(E<lt>I<NAME>E<gt>)> C<('I<NAME>')> Checks if a group with the given name has matched something. +Full syntax: C<< (?(<name>)then|else) >> =item C<(?=...)> C<(?!...)> C<(?<=...)> C<(?<!...)> Checks whether the pattern matches (or does not match, for the C<"!"> variants). +Full syntax: C<< (?(?=lookahead)then|else) >> =item C<(?{ I<CODE> })> Treats the return value of the code block as the condition. +Full syntax: C<< (?(?{ code })then|else) >> =item C<(R)> Checks if the expression has been evaluated inside of recursion. +Full syntax: C<< (?(R)then|else) >> =item C<(R1)> C<(R2)> ... @@ -1704,18 +2067,22 @@ inside of the n-th capture group. This check is the regex equivalent of In other words, it does not check the full recursion stack. +Full syntax: C<< (?(R1)then|else) >> + =item C<(R&I<NAME>)> Similar to C<(R1)>, this predicate checks to see if we're executing directly inside of the leftmost group with a given name (this is the same logic used by C<(?&I<NAME>)> to disambiguate). 
It does not check the full stack, but only the name of the innermost active recursion. +Full syntax: C<< (?(R&name)then|else) >> =item C<(DEFINE)> In this case, the yes-pattern is never directly executed, and no no-pattern is allowed. Similar in spirit to C<(?{0})> but more efficient. See below for details. +Full syntax: C<< (?(DEFINE)definitions...) >> =back @@ -1769,22 +2136,26 @@ compile the definitions with the C<qr//> operator, and later interpolate them in another pattern. =item C<< (?>pattern) >> + +=item C<< (*atomic:pattern) >> +X<(?E<gt>pattern)> +X<(*atomic> X<backtrack> X<backtracking> X<atomic> X<possessive> An "independent" subexpression, one which matches the substring that a I<standalone> C<pattern> would match if anchored at the given position, and it matches I<nothing other than this substring>. This construct is useful for optimizations of what would otherwise be -"eternal" matches, because it will not backtrack (see L<"Backtracking">). +"eternal" matches, because it will not backtrack (see L</"Backtracking">). It may also be useful in places where the "grab all you can, and do not give anything back" semantic is desirable. For example: C<< ^(?>a*)ab >> will never match, since C<< (?>a*) >> (anchored at the beginning of string, as above) will match I<all> -characters C<a> at the beginning of string, leaving no C<a> for +characters C<"a"> at the beginning of string, leaving no C<"a"> for C<ab> to match. In contrast, C<a*ab> will match the same as C<a+b>, since the match of the subgroup C<a*> is influenced by the following -group C<ab> (see L<"Backtracking">). In particular, C<a*> inside +group C<ab> (see L</"Backtracking">). In particular, C<a*> inside C<a*ab> will match fewer characters than a standalone C<a*>, since this makes the tail match. @@ -1833,7 +2204,7 @@ hung. However, a tiny change to this pattern which uses C<< (?>...) 
>> matches exactly when the one above does (verifying this yourself would be a productive exercise), but finishes in a fourth -the time when used on a similar string with 1000000 C<a>s. Be aware, +the time when used on a similar string with 1000000 C<"a">s. Be aware, however, that, when this construct is followed by a quantifier, it currently triggers a warning message under the C<use warnings> pragma or B<-w> switch saying it @@ -1841,7 +2212,7 @@ C<"matches null string many times in regex">. On simple groups, such as the pattern C<< (?> [^()]+ ) >>, a comparable effect may be achieved by negative lookahead, as in C<[^()]+ (?! [^()] )>. -This was only 4 times slower on a string with 1000000 C<a>s. +This was only 4 times slower on a string with 1000000 C<"a">s. The "grab all you can, and do not give anything back" semantic is desirable in many situations where on the first sight a simple C<()*> looks like @@ -1877,12 +2248,333 @@ to inside of one of these constructs. The following equivalences apply: PAT?+ (?>PAT?) PAT{min,max}+ (?>PAT{min,max}) +Nested C<(?E<gt>...)> constructs are not no-ops, even if at first glance +they might seem to be. This is because the nested C<(?E<gt>...)> can +restrict internal backtracking that otherwise might occur. For example, + + "abc" =~ /(?>a[bc]*c)/ + +matches, but + + "abc" =~ /(?>a(?>[bc]*)c)/ + +does not. + +The alphabetic form (C<(*atomic:...)>) is experimental; using it +yields a warning in the C<experimental::alpha_assertions> category. + =item C<(?[ ])> See L<perlrecharclass/Extended Bracketed Character Classes>. +Note that this feature is currently L<experimental|perlpolicy/experimental>; +using it yields a warning in the C<experimental::regex_sets> category. + =back +=head2 Backtracking +X<backtrack> X<backtracking> + +NOTE: This section presents an abstract approximation of regular +expression behavior. 
For a more rigorous (and complicated) view of +the rules involved in selecting a match among possible alternatives, +see L</Combining RE Pieces>. + +A fundamental feature of regular expression matching involves the +notion called I<backtracking>, which is currently used (when needed) +by all regular non-possessive expression quantifiers, namely C<"*">, C<*?>, C<"+">, +C<+?>, C<{n,m}>, and C<{n,m}?>. Backtracking is often optimized +internally, but the general principle outlined here is valid. + +For a regular expression to match, the I<entire> regular expression must +match, not just part of it. So if the beginning of a pattern containing a +quantifier succeeds in a way that causes later parts in the pattern to +fail, the matching engine backs up and recalculates the beginning +part--that's why it's called backtracking. + +Here is an example of backtracking: Let's say you want to find the +word following "foo" in the string "Food is on the foo table.": + + $_ = "Food is on the foo table."; + if ( /\b(foo)\s+(\w+)/i ) { + print "$2 follows $1.\n"; + } + +When the match runs, the first part of the regular expression (C<\b(foo)>) +finds a possible match right at the beginning of the string, and loads up +C<$1> with "Foo". However, as soon as the matching engine sees that there's +no whitespace following the "Foo" that it had saved in C<$1>, it realizes its +mistake and starts over again one character after where it had the +tentative match. This time it goes all the way until the next occurrence +of "foo". The complete regular expression matches this time, and you get +the expected output of "table follows foo." + +Sometimes minimal matching can help a lot. Imagine you'd like to match +everything between "foo" and "bar". 
Initially, you write something +like this: + + $_ = "The food is under the bar in the barn."; + if ( /foo(.*)bar/ ) { + print "got <$1>\n"; + } + +Which perhaps unexpectedly yields: + + got <d is under the bar in the > + +That's because C<.*> was greedy, so you get everything between the +I<first> "foo" and the I<last> "bar". Here it's more effective +to use minimal matching to make sure you get the text between a "foo" +and the first "bar" thereafter. + + if ( /foo(.*?)bar/ ) { print "got <$1>\n" } + got <d is under the > + +Here's another example. Let's say you'd like to match a number at the end +of a string, and you also want to keep the preceding part of the match. +So you write this: + + $_ = "I have 2 numbers: 53147"; + if ( /(.*)(\d*)/ ) { # Wrong! + print "Beginning is <$1>, number is <$2>.\n"; + } + +That won't work at all, because C<.*> was greedy and gobbled up the +whole string. As C<\d*> can match on an empty string the complete +regular expression matched successfully. + + Beginning is <I have 2 numbers: 53147>, number is <>. + +Here are some variants, most of which don't work: + + $_ = "I have 2 numbers: 53147"; + @pats = qw{ + (.*)(\d*) + (.*)(\d+) + (.*?)(\d*) + (.*?)(\d+) + (.*)(\d+)$ + (.*?)(\d+)$ + (.*)\b(\d+)$ + (.*\D)(\d+)$ + }; + + for $pat (@pats) { + printf "%-12s ", $pat; + if ( /$pat/ ) { + print "<$1> <$2>\n"; + } else { + print "FAIL\n"; + } + } + +That will print out: + + (.*)(\d*) <I have 2 numbers: 53147> <> + (.*)(\d+) <I have 2 numbers: 5314> <7> + (.*?)(\d*) <> <> + (.*?)(\d+) <I have > <2> + (.*)(\d+)$ <I have 2 numbers: 5314> <7> + (.*?)(\d+)$ <I have 2 numbers: > <53147> + (.*)\b(\d+)$ <I have 2 numbers: > <53147> + (.*\D)(\d+)$ <I have 2 numbers: > <53147> + +As you see, this can be a bit tricky. It's important to realize that a +regular expression is merely a set of assertions that gives a definition +of success. There may be 0, 1, or several different ways that the +definition might succeed against a particular string. 
And if there are +multiple ways it might succeed, you need to understand backtracking to +know which variety of success you will achieve. + +When using lookahead assertions and negations, this can all get even +trickier. Imagine you'd like to find a sequence of non-digits not +followed by "123". You might try to write that as + + $_ = "ABC123"; + if ( /^\D*(?!123)/ ) { # Wrong! + print "Yup, no 123 in $_\n"; + } + +But that isn't going to match; at least, not the way you're hoping. It +claims that there is no 123 in the string. Here's a clearer picture of +why that pattern matches, contrary to popular expectations: + + $x = 'ABC123'; + $y = 'ABC445'; + + print "1: got $1\n" if $x =~ /^(ABC)(?!123)/; + print "2: got $1\n" if $y =~ /^(ABC)(?!123)/; + + print "3: got $1\n" if $x =~ /^(\D*)(?!123)/; + print "4: got $1\n" if $y =~ /^(\D*)(?!123)/; + +This prints + + 2: got ABC + 3: got AB + 4: got ABC + +You might have expected test 3 to fail because it seems to be a more +general purpose version of test 1. The important difference between +them is that test 3 contains a quantifier (C<\D*>) and so can use +backtracking, whereas test 1 will not. What's happening is +that you've asked "Is it true that at the start of C<$x>, following 0 or more +non-digits, you have something that's not 123?" If the pattern matcher had +let C<\D*> expand to "ABC", this would have caused the whole pattern to +fail. + +The search engine will initially match C<\D*> with "ABC". Then it will +try to match C<(?!123)> with "123", which fails. But because +a quantifier (C<\D*>) has been used in the regular expression, the +search engine can backtrack and retry the match differently +in the hope of matching the complete regular expression. + +The pattern really, I<really> wants to succeed, so it uses the +standard pattern back-off-and-retry and lets C<\D*> expand to just "AB" this +time. Now there's indeed something following "AB" that is not +"123". It's "C123", which suffices. 
+ +We can deal with this by using both an assertion and a negation. +We'll say that the first part in C<$1> must be followed both by a digit +and by something that's not "123". Remember that the lookaheads +are zero-width expressions--they only look, but don't consume any +of the string in their match. So rewriting this way produces what +you'd expect; that is, case 5 will fail, but case 6 succeeds: + + print "5: got $1\n" if $x =~ /^(\D*)(?=\d)(?!123)/; + print "6: got $1\n" if $y =~ /^(\D*)(?=\d)(?!123)/; + + 6: got ABC + +In other words, the two zero-width assertions next to each other work as though +they're ANDed together, just as you'd use any built-in assertions: C</^$/> +matches only if you're at the beginning of the line AND the end of the +line simultaneously. The deeper underlying truth is that juxtaposition in +regular expressions always means AND, except when you write an explicit OR +using the vertical bar. C</ab/> means match "a" AND (then) match "b", +although the attempted matches are made at different positions because "a" +is not a zero-width assertion, but a one-width assertion. + +B<WARNING>: Particularly complicated regular expressions can take +exponential time to solve because of the immense number of possible +ways they can use backtracking to try for a match. For example, without +internal optimizations done by the regular expression engine, this will +take a painfully long time to run: + + 'aaaaaaaaaaaa' =~ /((a{0,5}){0,5})*[c]/ + +And if you used C<"*">'s in the internal groups instead of limiting them +to 0 through 5 matches, then it would take forever--or until you ran +out of stack space. Moreover, these internal optimizations are not +always applicable. For example, if you put C<{0,5}> instead of C<"*"> +on the external group, no current optimization is applicable, and the +match takes a long time to finish. 
+ +A powerful tool for optimizing such beasts is what is known as an +"independent group", +which does not backtrack (see L</C<< (?>pattern) >>>). Note also that +zero-length lookahead/lookbehind assertions will not backtrack to make +the tail match, since they are in "logical" context: only +whether they match is considered relevant. For an example +where side-effects of lookahead I<might> have influenced the +following match, see L</C<< (?>pattern) >>>. + +=head2 Script Runs +X<(*script_run:...)> X<(sr:...)> +X<(*atomic_script_run:...)> X<(asr:...)> + +A script run is basically a sequence of characters, all from the same +Unicode script (see L<perlunicode/Scripts>), such as Latin or Greek. In +most places a single word would never be written in multiple scripts, +unless it is a spoofing attack. An infamous example is + + paypal.com + +Those letters could all be Latin (as in the example just above), or they +could be all Cyrillic (except for the dot), or they could be a mixture +of the two. In the case of an internet address the C<.com> would be in +Latin, and any Cyrillic ones would cause it to be a mixture, not a +script run. Someone clicking on such a link would not be directed to +the real Paypal website, but an attacker would craft a look-alike one to +attempt to gather sensitive information from the person. + +Starting in Perl 5.28, it is now easy to detect strings that aren't +script runs. Simply enclose just about any pattern like either of +these: + + (*script_run:pattern) + (*sr:pattern) + +What happens is that after I<pattern> succeeds in matching, it is +subjected to the additional criterion that every character in it must be +from the same script (see exceptions below). If this isn't true, +backtracking occurs until something all in the same script is found that +matches, or all possibilities are exhausted. 
This can cause a lot of +backtracking, but generally, only malicious input will result in this, +though the slow down could cause a denial of service attack. If your +needs permit, it is best to make the pattern atomic. This is so likely +to be what you want, that instead of writing this: + + (*script_run:(?>pattern)) + +you can write either of these: + + (*atomic_script_run:pattern) + (*asr:pattern) + +(See L</C<(?E<gt>pattern)>>.) + +In Taiwan, Japan, and Korea, it is common for text to have a mixture of +characters from their native scripts and base Chinese. Perl follows +Unicode's UTS 39 (L<http://unicode.org/reports/tr39/>) Unicode Security +Mechanisms in allowing such mixtures. + +The rules used for matching decimal digits are somewhat different. Many +scripts have their own sets of digits equivalent to the Western C<0> +through C<9> ones. A few, such as Arabic, have more than one set. For +a string to be considered a script run, all digits in it must come from +the same set, as determined by the first digit encountered. The ASCII +C<[0-9]> are accepted as being in any script, even those that have their +own set. This is because these are often used in commerce even in such +scripts. But any mixing of the ASCII and other digits will cause the +sequence to not be a script run, failing the match. As an example, + + qr/(*script_run: \d+ \b )/x + +guarantees that the digits matched will all be from the same set of 10. +You won't get a look-alike digit from a different script that has a +different value than what it appears to be. + +Unicode has three pseudo scripts that are handled specially. + +"Unknown" is applied to code points whose meaning has yet to be +determined. Perl currently will match as a script run, any single +character string consisting of one of these code points. But any string +longer than one code point containing one of these will not be +considered a script run. 
+ +"Inherited" is applied to characters that modify another, such as an +accent of some type. These are considered to be in the script of the +master character, and so never cause a script run to not match. + +The other one is "Common". This consists of mostly punctuation, emoji, +and characters used in mathematics and music, and the ASCII digits C<0> +through C<9>. These characters can appear intermixed in text in many of +the world's scripts. These also don't cause a script run to not match, +except any ASCII digits encountered have to obey the decimal digit rules +described above. + +This construct is non-capturing. You can add parentheses to I<pattern> +to capture, if desired. You will have to do this if you plan to use +L</(*ACCEPT) (*ACCEPT:arg)> and not have it bypass the script run +checking. + +This feature is experimental, and the exact syntax and details of +operation are subject to change; using it yields a warning in the +C<experimental::script_run> category. + +The C<Script_Extensions> property is used as the basis for this feature. + =head2 Special Backtracking Control Verbs These special patterns are generally of the form C<(*I<VERB>:I<ARG>)>. Unless @@ -1908,7 +2600,10 @@ C<(*MARK:NAME)> verb below for more details. B<NOTE:> C<$REGERROR> and C<$REGMARK> are not magic variables like C<$1> and most other regex-related variables. They are not local to a scope, nor readonly, but instead are volatile package variables similar to C<$AUTOLOAD>. -Use C<local> to localize changes to them to a specific scope if necessary. +They are set in the package containing the code that I<executed> the regex +(rather than the one that compiled it, where those differ). If necessary, you +can use C<local> to localize changes to these variables to a specific scope +before executing a regex. If a pattern does not contain a special backtracking verb that allows an argument, then C<$REGERROR> and C<$REGMARK> are not touched at all. 
@@ -1923,7 +2618,7 @@ argument, then C<$REGERROR> and C<$REGMARK> are not touched at all. X<(*PRUNE)> X<(*PRUNE:NAME)> This zero-width pattern prunes the backtracking tree at the current point -when backtracked into on failure. Consider the pattern C<I<A> (*PRUNE) I<B>>, +when backtracked into on failure. Consider the pattern C</I<A> (*PRUNE) I<B>/>, where I<A> and I<B> are complex patterns. Until the C<(*PRUNE)> verb is reached, I<A> may backtrack as necessary to match. Once it is reached, matching continues in I<B>, which may also backtrack as necessary; however, should B @@ -1964,7 +2659,8 @@ at each matching starting point like so: Any number of C<(*PRUNE)> assertions may be used in a pattern. -See also C<< (?>pattern) >> and possessive quantifiers for other ways to +See also C<<< L<< /(?>pattern) >> >>> and possessive quantifiers for +other ways to control backtracking. In some cases, the use of C<(*PRUNE)> can be replaced with a C<< (?>pattern) >> with no functional difference; however, C<(*PRUNE)> can be used to handle cases that cannot be expressed using a @@ -2118,9 +2814,9 @@ For instance: 'AB' =~ /(A (A|B(*ACCEPT)|C) D)(E)/x; -will match, and C<$1> will be C<AB> and C<$2> will be C<B>, C<$3> will not +will match, and C<$1> will be C<AB> and C<$2> will be C<"B">, C<$3> will not be set. If another branch in the inner parentheses was matched, such as in the -string 'ACDE', then the C<D> and C<E> would have to be matched as well. +string 'ACDE', then the C<"D"> and C<"E"> would have to be matched as well. You can provide an argument, which will be available in the var C<$REGMARK> after the match completes. @@ -2129,308 +2825,6 @@ C<$REGMARK> after the match completes. =back -=head2 Backtracking -X<backtrack> X<backtracking> - -NOTE: This section presents an abstract approximation of regular -expression behavior. For a more rigorous (and complicated) view of -the rules involved in selecting a match among possible alternatives, -see L<Combining RE Pieces>. 
- -A fundamental feature of regular expression matching involves the -notion called I<backtracking>, which is currently used (when needed) -by all regular non-possessive expression quantifiers, namely C<"*">, C<"*?">, C<"+">, -C<"+?">, C<{n,m}>, and C<{n,m}?>. Backtracking is often optimized -internally, but the general principle outlined here is valid. - -For a regular expression to match, the I<entire> regular expression must -match, not just part of it. So if the beginning of a pattern containing a -quantifier succeeds in a way that causes later parts in the pattern to -fail, the matching engine backs up and recalculates the beginning -part--that's why it's called backtracking. - -Here is an example of backtracking: Let's say you want to find the -word following "foo" in the string "Food is on the foo table.": - - $_ = "Food is on the foo table."; - if ( /\b(foo)\s+(\w+)/i ) { - print "$2 follows $1.\n"; - } - -When the match runs, the first part of the regular expression (C<\b(foo)>) -finds a possible match right at the beginning of the string, and loads up -C<$1> with "Foo". However, as soon as the matching engine sees that there's -no whitespace following the "Foo" that it had saved in C<$1>, it realizes its -mistake and starts over again one character after where it had the -tentative match. This time it goes all the way until the next occurrence -of "foo". The complete regular expression matches this time, and you get -the expected output of "table follows foo." - -Sometimes minimal matching can help a lot. Imagine you'd like to match -everything between "foo" and "bar". Initially, you write something -like this: - - $_ = "The food is under the bar in the barn."; - if ( /foo(.*)bar/ ) { - print "got <$1>\n"; - } - -Which perhaps unexpectedly yields: - - got <d is under the bar in the > - -That's because C<.*> was greedy, so you get everything between the -I<first> "foo" and the I<last> "bar". 
Here it's more effective -to use minimal matching to make sure you get the text between a "foo" -and the first "bar" thereafter. - - if ( /foo(.*?)bar/ ) { print "got <$1>\n" } - got <d is under the > - -Here's another example. Let's say you'd like to match a number at the end -of a string, and you also want to keep the preceding part of the match. -So you write this: - - $_ = "I have 2 numbers: 53147"; - if ( /(.*)(\d*)/ ) { # Wrong! - print "Beginning is <$1>, number is <$2>.\n"; - } - -That won't work at all, because C<.*> was greedy and gobbled up the -whole string. As C<\d*> can match on an empty string the complete -regular expression matched successfully. - - Beginning is <I have 2 numbers: 53147>, number is <>. - -Here are some variants, most of which don't work: - - $_ = "I have 2 numbers: 53147"; - @pats = qw{ - (.*)(\d*) - (.*)(\d+) - (.*?)(\d*) - (.*?)(\d+) - (.*)(\d+)$ - (.*?)(\d+)$ - (.*)\b(\d+)$ - (.*\D)(\d+)$ - }; - - for $pat (@pats) { - printf "%-12s ", $pat; - if ( /$pat/ ) { - print "<$1> <$2>\n"; - } else { - print "FAIL\n"; - } - } - -That will print out: - - (.*)(\d*) <I have 2 numbers: 53147> <> - (.*)(\d+) <I have 2 numbers: 5314> <7> - (.*?)(\d*) <> <> - (.*?)(\d+) <I have > <2> - (.*)(\d+)$ <I have 2 numbers: 5314> <7> - (.*?)(\d+)$ <I have 2 numbers: > <53147> - (.*)\b(\d+)$ <I have 2 numbers: > <53147> - (.*\D)(\d+)$ <I have 2 numbers: > <53147> - -As you see, this can be a bit tricky. It's important to realize that a -regular expression is merely a set of assertions that gives a definition -of success. There may be 0, 1, or several different ways that the -definition might succeed against a particular string. And if there are -multiple ways it might succeed, you need to understand backtracking to -know which variety of success you will achieve. - -When using lookahead assertions and negations, this can all get even -trickier. Imagine you'd like to find a sequence of non-digits not -followed by "123". 
You might try to write that as - - $_ = "ABC123"; - if ( /^\D*(?!123)/ ) { # Wrong! - print "Yup, no 123 in $_\n"; - } - -But that isn't going to match; at least, not the way you're hoping. It -claims that there is no 123 in the string. Here's a clearer picture of -why that pattern matches, contrary to popular expectations: - - $x = 'ABC123'; - $y = 'ABC445'; - - print "1: got $1\n" if $x =~ /^(ABC)(?!123)/; - print "2: got $1\n" if $y =~ /^(ABC)(?!123)/; - - print "3: got $1\n" if $x =~ /^(\D*)(?!123)/; - print "4: got $1\n" if $y =~ /^(\D*)(?!123)/; - -This prints - - 2: got ABC - 3: got AB - 4: got ABC - -You might have expected test 3 to fail because it seems to a more -general purpose version of test 1. The important difference between -them is that test 3 contains a quantifier (C<\D*>) and so can use -backtracking, whereas test 1 will not. What's happening is -that you've asked "Is it true that at the start of C<$x>, following 0 or more -non-digits, you have something that's not 123?" If the pattern matcher had -let C<\D*> expand to "ABC", this would have caused the whole pattern to -fail. - -The search engine will initially match C<\D*> with "ABC". Then it will -try to match C<(?!123)> with "123", which fails. But because -a quantifier (C<\D*>) has been used in the regular expression, the -search engine can backtrack and retry the match differently -in the hope of matching the complete regular expression. - -The pattern really, I<really> wants to succeed, so it uses the -standard pattern back-off-and-retry and lets C<\D*> expand to just "AB" this -time. Now there's indeed something following "AB" that is not -"123". It's "C123", which suffices. - -We can deal with this by using both an assertion and a negation. -We'll say that the first part in C<$1> must be followed both by a digit -and by something that's not "123". Remember that the lookaheads -are zero-width expressions--they only look, but don't consume any -of the string in their match. 
So rewriting this way produces what -you'd expect; that is, case 5 will fail, but case 6 succeeds: - - print "5: got $1\n" if $x =~ /^(\D*)(?=\d)(?!123)/; - print "6: got $1\n" if $y =~ /^(\D*)(?=\d)(?!123)/; - - 6: got ABC - -In other words, the two zero-width assertions next to each other work as though -they're ANDed together, just as you'd use any built-in assertions: C</^$/> -matches only if you're at the beginning of the line AND the end of the -line simultaneously. The deeper underlying truth is that juxtaposition in -regular expressions always means AND, except when you write an explicit OR -using the vertical bar. C</ab/> means match "a" AND (then) match "b", -although the attempted matches are made at different positions because "a" -is not a zero-width assertion, but a one-width assertion. - -B<WARNING>: Particularly complicated regular expressions can take -exponential time to solve because of the immense number of possible -ways they can use backtracking to try for a match. For example, without -internal optimizations done by the regular expression engine, this will -take a painfully long time to run: - - 'aaaaaaaaaaaa' =~ /((a{0,5}){0,5})*[c]/ - -And if you used C<"*">'s in the internal groups instead of limiting them -to 0 through 5 matches, then it would take forever--or until you ran -out of stack space. Moreover, these internal optimizations are not -always applicable. For example, if you put C<{0,5}> instead of C<"*"> -on the external group, no current optimization is applicable, and the -match takes a long time to finish. - -A powerful tool for optimizing such beasts is what is known as an -"independent group", -which does not backtrack (see L</C<< (?>pattern) >>>). Note also that -zero-length lookahead/lookbehind assertions will not backtrack to make -the tail match, since they are in "logical" context: only -whether they match is considered relevant. 
For an example -where side-effects of lookahead I<might> have influenced the -following match, see L</C<< (?>pattern) >>>. - -=head2 Version 8 Regular Expressions -X<regular expression, version 8> X<regex, version 8> X<regexp, version 8> - -In case you're not familiar with the "regular" Version 8 regex -routines, here are the pattern-matching rules not described above. - -Any single character matches itself, unless it is a I<metacharacter> -with a special meaning described here or above. You can cause -characters that normally function as metacharacters to be interpreted -literally by prefixing them with a C<"\"> (e.g., C<"\."> matches a C<".">, not any -character; "\\" matches a C<"\">). This escape mechanism is also required -for the character used as the pattern delimiter. - -A series of characters matches that series of characters in the target -string, so the pattern C<blurfl> would match "blurfl" in the target -string. - -You can specify a character class, by enclosing a list of characters -in C<[]>, which will match any character from the list. If the -first character after the C<"["> is C<"^">, the class matches any character not -in the list. Within a list, the C<"-"> character specifies a -range, so that C<a-z> represents all characters between "a" and "z", -inclusive. If you want either C<"-"> or C<"]"> itself to be a member of a -class, put it at the start of the list (possibly after a C<"^">), or -escape it with a backslash. C<"-"> is also taken literally when it is -at the end of the list, just before the closing C<"]">. (The -following all specify the same class of three characters: C<[-az]>, -C<[az-]>, and C<[a\-z]>. All are different from C<[a-z]>, which -specifies a class containing twenty-six characters, even on EBCDIC-based -character sets.) Also, if you try to use the character -classes C<\w>, C<\W>, C<\s>, C<\S>, C<\d>, or C<\D> as endpoints of -a range, the C<"-"> is understood literally. 
- -Note also that the whole range idea is rather unportable between -character sets, except for four situations that Perl handles specially. -Any subset of the ranges C<[A-Z]>, C<[a-z]>, and C<[0-9]> are guaranteed -to match the expected subset of ASCII characters, no matter what -character set the platform is running. The fourth portable way to -specify ranges is to use the C<\N{...}> syntax to specify either end -point of the range. For example, C<[\N{U+04}-\N{U+07}]> means to match -the Unicode code points C<\N{U+04}>, C<\N{U+05}>, C<\N{U+06}>, and -C<\N{U+07}>, whatever their native values may be on the platform. Under -L<use re 'strict'|re/'strict' mode> or within a L</C<(?[ ])>>, a warning -is raised, if enabled, and the other end point of a range which has a -C<\N{...}> endpoint is not portably specified. For example, - - [\N{U+00}-\x06] # Warning under "use re 'strict'". - -It is hard to understand without digging what exactly matches ranges -other than subsets of C<[A-Z]>, C<[a-z]>, and C<[0-9]>. A sound -principle is to use only ranges that begin from and end at either -alphabetics of equal case ([a-e], [A-E]), or digits ([0-9]). Anything -else is unsafe or unclear. If in doubt, spell out the range in full. - -Characters may be specified using a metacharacter syntax much like that -used in C: "\n" matches a newline, "\t" a tab, "\r" a carriage return, -"\f" a form feed, etc. More generally, \I<nnn>, where I<nnn> is a string -of three octal digits, matches the character whose coded character set value -is I<nnn>. Similarly, \xI<nn>, where I<nn> are hexadecimal digits, -matches the character whose ordinal is I<nn>. The expression \cI<x> -matches the character control-I<x>. Finally, the C<"."> metacharacter -matches any character except "\n" (unless you use C</s>). 
- -You can specify a series of alternatives for a pattern using C<"|"> to -separate them, so that C<fee|fie|foe> will match any of "fee", "fie", -or "foe" in the target string (as would C<f(e|i|o)e>). The -first alternative includes everything from the last pattern delimiter -(C<"(">, "(?:", etc. or the beginning of the pattern) up to the first C<"|">, and -the last alternative contains everything from the last C<"|"> to the next -closing pattern delimiter. That's why it's common practice to include -alternatives in parentheses: to minimize confusion about where they -start and end. - -Alternatives are tried from left to right, so the first -alternative found for which the entire expression matches, is the one that -is chosen. This means that alternatives are not necessarily greedy. For -example: when matching C<foo|foot> against "barefoot", only the "foo" -part will match, as that is the first alternative tried, and it successfully -matches the target string. (This might not seem important, but it is -important when you are capturing matched text using parentheses.) - -Also remember that C<"|"> is interpreted as a literal within square brackets, -so if you write C<[fee|fie|foe]> you're really only matching C<[feio|]>. - -Within a pattern, you may designate subpatterns for later reference -by enclosing them in parentheses, and you may refer back to the -I<n>th subpattern later in the pattern using the metacharacter -\I<n> or \gI<n>. Subpatterns are numbered based on the left to right order -of their opening parenthesis. A backreference matches whatever -actually matched the subpattern in the string being examined, not -the rules for that subpattern. Therefore, C<(0|0x)\d*\s\g1\d*> will -match "0x1234 0x4321", but not "0x1234 01234", because subpattern -1 matched "0x", even though the rule C<0|0x> could potentially match -the leading 0 in the second number. 
- =head2 Warning on C<\1> Instead of C<$1> Some people get too used to writing things like: @@ -2470,10 +2864,10 @@ loops using regular expressions, with something as innocuous as: 'foo' =~ m{ ( o? )* }x; -The C<o?> matches at the beginning of C<'foo'>, and since the position +The C<o?> matches at the beginning of "C<foo>", and since the position in the string is not moved by the match, C<o?> would match again and again because of the C<"*"> quantifier. Another common way to create a similar cycle -is with the looping modifier C<//g>: +is with the looping modifier C</g>: @matches = ( 'foo' =~ m{ o? }xg ); @@ -2533,7 +2927,7 @@ the C<"*">. The higher-level loops preserve an additional state between iterations: whether the last match was zero-length. To break the loop, the following match after a zero-length match is prohibited to have a length of zero. -This prohibition interacts with backtracking (see L<"Backtracking">), +This prohibition interacts with backtracking (see L</"Backtracking">), and so the I<second best> match is chosen if the I<best> match is of zero length. @@ -2561,13 +2955,13 @@ Each of the elementary pieces of regular expressions which were described before (such as C<ab> or C<\Z>) could match at most one substring at the given position of the input string. However, in a typical regular expression these elementary pieces are combined into more complicated -patterns using combining operators C<ST>, C<S|T>, C<S*> etc. -(in these examples C<S> and C<T> are regular subexpressions). +patterns using combining operators C<ST>, C<S|T>, C<S*> I<etc>. +(in these examples C<"S"> and C<"T"> are regular subexpressions). Such combinations can include alternatives, leading to a problem of choice: if we match a regular expression C<a|ab> against C<"abc">, will it match substring C<"a"> or C<"ab">? One way to describe which substring is -actually matched is the concept of backtracking (see L<"Backtracking">). 
+actually matched is the concept of backtracking (see L</"Backtracking">). However, this description is too low-level and makes you think in terms of a particular implementation. @@ -2580,28 +2974,28 @@ by the question of "which matches are better, and which are worse?". Again, for elementary pieces there is no such question, since at most one match at a given position is possible. This section describes the notion of better/worse for combining operators. In the description -below C<S> and C<T> are regular subexpressions. +below C<"S"> and C<"T"> are regular subexpressions. =over 4 =item C<ST> -Consider two possible matches, C<AB> and C<A'B'>, C<A> and C<A'> are -substrings which can be matched by C<S>, C<B> and C<B'> are substrings -which can be matched by C<T>. +Consider two possible matches, C<AB> and C<A'B'>, C<"A"> and C<A'> are +substrings which can be matched by C<"S">, C<"B"> and C<B'> are substrings +which can be matched by C<"T">. -If C<A> is a better match for C<S> than C<A'>, C<AB> is a better +If C<"A"> is a better match for C<"S"> than C<A'>, C<AB> is a better match than C<A'B'>. -If C<A> and C<A'> coincide: C<AB> is a better match than C<AB'> if -C<B> is a better match for C<T> than C<B'>. +If C<"A"> and C<A'> coincide: C<AB> is a better match than C<AB'> if +C<"B"> is a better match for C<"T"> than C<B'>. =item C<S|T> -When C<S> can match, it is a better match than when only C<T> can match. +When C<"S"> can match, it is a better match than when only C<"T"> can match. -Ordering of two matches for C<S> is the same as for C<S>. Similar for -two matches for C<T>. +Ordering of two matches for C<"S"> is the same as for C<"S">. Similar for +two matches for C<"T">. =item C<S{REPEAT_COUNT}> @@ -2625,18 +3019,18 @@ Same as C<S{0,1}?>, C<S{0,BIG_NUMBER}?>, C<S{1,BIG_NUMBER}?> respectively. =item C<< (?>S) >> -Matches the best match for C<S> and only that. +Matches the best match for C<"S"> and only that. 
=item C<(?=S)>, C<(?<=S)> -Only the best match for C<S> is considered. (This is important only if -C<S> has capturing parentheses, and backreferences are used somewhere +Only the best match for C<"S"> is considered. (This is important only if +C<"S"> has capturing parentheses, and backreferences are used somewhere else in the whole regular expression.) =item C<(?!S)>, C<(?<!S)> For this grouping operator there is no need to describe the ordering, since -only whether or not C<S> can match is important. +only whether or not C<"S"> can match is important. =item C<(??{ EXPR })>, C<(?I<PARNO>)> @@ -2698,7 +3092,7 @@ this: } Now C<use customre> enables the new escape in constant regular -expressions, i.e., those without any runtime variable interpolations. +expressions, I<i.e.>, those without any runtime variable interpolations. As documented in L<overload>, this conversion will work only over literal parts of regular expressions. For C<\Y|$re\Y|> the variable part of this regular expression needs to be converted explicitly @@ -2712,7 +3106,7 @@ part of this regular expression needs to be converted explicitly =head2 Embedded Code Execution Frequency -The exact rules for how often (??{}) and (?{}) are executed in a pattern +The exact rules for how often C<(??{})> and C<(?{})> are executed in a pattern are unspecified. In the case of a successful match you can assume that they DWIM and will be executed in left to right order the appropriate number of times in the accepting path of the pattern as would any other @@ -2770,7 +3164,7 @@ Subroutine call to a named capture group. Equivalent to C<< (?&NAME) >>. =head1 BUGS There are a number of issues with regard to case-insensitive matching -in Unicode rules. See C<i> under L</Modifiers> above. +in Unicode rules. See C<"i"> under L</Modifiers> above. This document varies from difficult to understand to completely and utterly opaque. The wandering prose riddled with jargon is @@ -2781,6 +3175,11 @@ from the reference content. 
=head1 SEE ALSO +The syntax of patterns used in Perl pattern matching evolved from those +supplied in the Bell Labs Research Unix 8th Edition (Version 8) regex +routines. (The code is actually derived (distantly) from Henry +Spencer's freely redistributable reimplementation of those V8 routines.) + L<perlrequick>. L<perlretut>. diff --git a/gnu/usr.bin/perl/pod/perlreapi.pod b/gnu/usr.bin/perl/pod/perlreapi.pod index c11ff9e52b3..2df337e21ad 100644 --- a/gnu/usr.bin/perl/pod/perlreapi.pod +++ b/gnu/usr.bin/perl/pod/perlreapi.pod @@ -624,7 +624,6 @@ values. * matching*/ U32 lastparen; /* highest close paren matched ($+) */ U32 lastcloseparen; /* last close paren matched ($^N) */ - regexp_paren_pair *swap; /* Swap copy of *offs */ regexp_paren_pair *offs; /* Array of offsets for (@-) and (@+) */ @@ -727,10 +726,6 @@ data structure. The Perl engine uses the C<regexp_internal> structure (see L<perlreguts/Base Structures>) but a custom engine should use something else. -=head2 C<swap> - -Unused. Left in for compatibility with Perl 5.10.0. - =head2 C<offs> A C<regexp_paren_pair> structure which defines offsets into the string being @@ -745,7 +740,7 @@ C<regexp_paren_pair> struct is defined as follows: If C<< ->offs[num].start >> or C<< ->offs[num].end >> is C<-1> then that capture group did not match. C<< ->offs[0].start/end >> represents C<$&> (or -C<${^MATCH}> under C<//p>) and C<< ->offs[paren].end >> matches C<$$paren> where +C<${^MATCH}> under C</p>) and C<< ->offs[paren].end >> matches C<$$paren> where C<$paren >= 1>. =head2 C<precomp> C<prelen> diff --git a/gnu/usr.bin/perl/pod/perlrebackslash.pod b/gnu/usr.bin/perl/pod/perlrebackslash.pod index 3df9bd2e9db..01226e6a6ed 100644 --- a/gnu/usr.bin/perl/pod/perlrebackslash.pod +++ b/gnu/usr.bin/perl/pod/perlrebackslash.pod @@ -69,8 +69,8 @@ as C<Not in [].> \b{}, \b Boundary. (\b is a backspace in []). \B{}, \B Not a boundary. Not in []. \cX Control-X. - \d Character class for digits. 
- \D Character class for non-digits. + \d Match any digit character. + \D Match any character that isn't a digit. \e Escape character. \E Turn off \Q, \L and \U processing. Not in []. \f Form feed. @@ -78,31 +78,31 @@ as C<Not in [].> \g{}, \g1 Named, absolute or relative backreference. Not in []. \G Pos assertion. Not in []. - \h Character class for horizontal whitespace. - \H Character class for non horizontal whitespace. + \h Match any horizontal whitespace character. + \H Match any character that isn't horizontal whitespace. \k{}, \k<>, \k'' Named backreference. Not in []. \K Keep the stuff left of \K. Not in []. \l Lowercase next character. Not in []. \L Lowercase till \E. Not in []. \n (Logical) newline character. - \N Any character but newline. Not in []. + \N Match any character but newline. Not in []. \N{} Named or numbered (Unicode) character or sequence. \o{} Octal escape sequence. - \p{}, \pP Character with the given Unicode property. - \P{}, \PP Character without the given Unicode property. + \p{}, \pP Match any character with the given Unicode property. + \P{}, \PP Match any character without the given property. \Q Quote (disable) pattern metacharacters till \E. Not in []. \r Return character. \R Generic new line. Not in []. - \s Character class for whitespace. - \S Character class for non whitespace. + \s Match any whitespace character. + \S Match any character that isn't a whitespace. \t Tab character. \u Titlecase next character. Not in []. \U Uppercase till \E. Not in []. - \v Character class for vertical whitespace. - \V Character class for non vertical whitespace. - \w Character class for word characters. - \W Character class for non-word characters. + \v Match any vertical whitespace character. + \V Match any character that isn't vertical whitespace. + \w Match any word character. + \W Match any character that isn't a word character. \x{}, \x00 Hexadecimal escape sequence. \X Unicode "extended grapheme cluster". Not in []. \z End of string.
Not in []. @@ -596,7 +596,7 @@ sentence boundary. C<\b{sb}> works with text designed for word-processors which wrap lines automatically for display, but hard-coded line boundaries are considered to be essentially the ends of text blocks (paragraphs really), and hence -the ends of sententces. C<\b{sb}> doesn't do well with text containing +the ends of sentences. C<\b{sb}> doesn't do well with text containing embedded newlines, like the source text of the document you are reading. Such text needs to be preprocessed to get rid of the line separators before looking for sentence boundaries. Some people view this as a bug @@ -622,8 +622,8 @@ space is immediately followed by something like U+0303, COMBINING TILDE. If the final space character in the span is a horizontal white space, it is broken out so that it attaches instead to the combining character. To be precise, if a span of white space that ends in a horizontal space -has the character immediately following it have either of the Word -Boundary property values "Extend" or "Format", the boundary between the +has the character immediately following it have any of the Word +Boundary property values "Extend", "Format" or "ZWJ", the boundary between the final horizontal space character and the rest of the span matches C<\b{wb}>. In all other cases the boundary between two white space characters matches C<\B{wb}>.) @@ -650,8 +650,8 @@ rule. It is also important to realize that these are default boundary definitions, and that implementations may wish to tailor the results for particular purposes and locales. For example, some languages, such as -Japanese and Thai, require dictionary lookup to determine word -boundaries. +Japanese and Thai, require dictionary lookup to accurately determine +word boundaries. Mnemonic: I<b>oundary. 
diff --git a/gnu/usr.bin/perl/pod/perlrecharclass.pod b/gnu/usr.bin/perl/pod/perlrecharclass.pod index a557cc0384c..3b5c5b12b19 100644 --- a/gnu/usr.bin/perl/pod/perlrecharclass.pod +++ b/gnu/usr.bin/perl/pod/perlrecharclass.pod @@ -27,9 +27,11 @@ to mean just the bracketed form. Certainly, most Perl documentation does that. The dot (or period), C<.> is probably the most used, and certainly the most well-known character class. By default, a dot matches any character, except for the newline. That default can be changed to -add matching the newline by using the I<single line> modifier: either +add matching the newline by using the I<single line> modifier: for the entire regular expression with the C</s> modifier, or -locally with C<(?s)>. (The C<L</\N>> backslash sequence, described +locally with C<(?s)> (and even globally within the scope of +L<C<use re '/s'>|re/'E<sol>flags' mode>). (The C<L</\N>> backslash +sequence, described below, matches any character except newline without regard to the I<single line> modifier.) @@ -176,7 +178,7 @@ are generally used to add auxiliary markings to letters. C<\w> matches the platform's native underscore character plus whatever the locale considers to be alphanumeric. -=item if Unicode rules are in effect ... +=item if, instead, Unicode rules are in effect ... C<\w> matches exactly what C<\p{Word}> matches. @@ -234,7 +236,7 @@ in the table below. C<\s> matches whatever the locale considers to be whitespace. -=item if Unicode rules are in effect ... +=item if, instead, Unicode rules are in effect ... C<\s> matches exactly the characters shown with an "s" column in the table below. @@ -498,10 +500,11 @@ consisting of the two characters matched against. Like the other instance where a bracketed class can match multiple characters, and for similar reasons, the class must not be inverted, and the named sequence may not appear in a range, even one where it is both endpoints. 
If -these happen, it is a fatal error if the character class is within an -extended L<C<(?[...])>|/Extended Bracketed Character Classes> -class; and only the first code point is used (with -a C<regexp>-type warning raised) otherwise. +these happen, it is a fatal error if the character class is within the +scope of L<C<use re 'strict'>|re/'strict' mode>, or within an extended +L<C<(?[...])>|/Extended Bracketed Character Classes> class; otherwise +only the first code point is used (with a C<regexp>-type warning +raised). =back @@ -512,7 +515,14 @@ is, characters that carry a special meaning like C<.>, C<*>, or C<(>) lose their special meaning and can be used inside a character class without the need to escape them. For instance, C<[()]> matches either an opening parenthesis, or a closing parenthesis, and the parens inside the character -class don't group or capture. +class don't group or capture. Be aware that, unless the pattern is +evaluated in single-quotish context, variable interpolation will take +place before the bracketed class is parsed: + + $, = "\t| "; + $a =~ m'[$,]'; # single-quotish: matches '$' or ',' + $a =~ q{[$,]}; # same + $a =~ m/[$,]/; # double-quotish: matches "\t", "|", or " " Characters that may carry a special meaning inside a character class are: C<\>, C<^>, C<->, C<[> and C<]>, and are discussed below. They can be @@ -569,6 +579,29 @@ Examples: # containing just [, and the character class is # followed by a ]. +=head3 Bracketed Character Classes and the C</xx> pattern modifier + +Normally SPACE and TAB characters have no special meaning inside a +bracketed character class; they are just added to the list of characters +matched by the class. But if the L<C</xx>|perlre/E<sol>x and E<sol>xx> +pattern modifier is in effect, they are generally ignored and can be +added to improve readability. They can't be added in the middle of a +single construct: + + / [ \x{10 FFFF} ] /xx # WRONG! + +The SPACE in the middle of the hex constant is illegal.
+ +To specify a literal SPACE character, you can escape it with a +backslash, like: + + /[ a e i o u \ ]/xx + +This matches the English vowels plus the SPACE character. + +For clarity, you should already have been using C<\t> to specify a +literal tab, and C<\t> is unaffected by C</xx>. + =head3 Character Ranges It is not uncommon to want to match a range of characters. Luckily, instead @@ -608,7 +641,7 @@ Examples: # even on an EBCDIC platform. [\N{U+27}-\N{U+3F}] # Same. (U+27 is "'", and U+3F is "?") -As the final two examples above show, you can achieve portablity to +As the final two examples above show, you can achieve portability to non-ASCII platforms by using the C<\N{...}> form for the range endpoints. These indicate that the specified range is to be interpreted using Unicode values, so C<[\N{U+27}-\N{U+3F}]> means to match @@ -916,7 +949,7 @@ just the platform's native tab and space characters. =back -=item if Unicode rules are in effect ... +=item if, instead, Unicode rules are in effect ... The POSIX class matches the same as the Full-range counterpart. @@ -1009,7 +1042,7 @@ We can extend the example above: This matches digits that are in either the Thai or Laotian scripts. Notice the white space in these examples. This construct always has -the C<E<sol>x> modifier turned on within it. +the C<E<sol>xx> modifier turned on within it. The available binary operators are: @@ -1054,24 +1087,18 @@ C<\N{...}>, etc.) This last example shows the use of this construct to specify an ordinary bracketed character class without additional set operations. Note the -white space within it; a limited version of C<E<sol>x> is turned on even -within bracketed character classes, with only the SPACE and TAB (C<\t>) -characters allowed, and no comments. Hence, - - (?[ [#] ]) - -matches the literal character "#". To specify a literal white space character, -you can escape it with a backslash, like: +white space within it. 
This is allowed because C<E<sol>xx> is +automatically turned on within this construct. - /(?[ [ a e i o u \ ] ])/ - -This matches the English vowels plus the SPACE character. All the other escapes accepted by normal bracketed character classes are -accepted here as well; but unrecognized escapes that generate warnings -in normal classes are fatal errors here. - -All warnings from these class elements are fatal, as well as some -practices that don't currently warn. For example you cannot say +accepted here as well. + +Because this construct compiles under +L<C<use re 'strict'>|re/'strict' mode>, unrecognized escapes that +generate warnings in normal classes are fatal errors here, as well as +all other warnings from these class elements, as well as some +practices that don't currently warn outside C<re 'strict'>. For example +you cannot say /(?[ [ \xF ] ])/ # Syntax error! diff --git a/gnu/usr.bin/perl/pod/perlref.pod b/gnu/usr.bin/perl/pod/perlref.pod index 8959ba5554a..cf3692212d8 100644 --- a/gnu/usr.bin/perl/pod/perlref.pod +++ b/gnu/usr.bin/perl/pod/perlref.pod @@ -718,7 +718,7 @@ outer() at the time outer is invoked. This has the interesting effect of creating a function local to another function, something not normally supported in Perl. -=head1 WARNING +=head1 WARNING: Don't use references as hash keys X<reference, string context> X<reference, use as hash key> You may not (usefully) use a reference as the key to a hash. It will be @@ -738,7 +738,7 @@ real refs, instead of the keys(), which won't. The standard Tie::RefHash module provides a convenient workaround to this. -=head1 Postfix Dereference Syntax +=head2 Postfix Dereference Syntax Beginning in v5.20.0, a postfix syntax for using references is available.
It behaves as described in L</Using References>, but instead @@ -798,7 +798,7 @@ As with postfix array, postfix value slice dereferencing I<can> be used in interpolating strings (double quotes or the C<qq> operator), but only if the C<postderef_qq> L<feature> is enabled. -=head1 Assigning to References +=head2 Assigning to References Beginning in v5.22.0, the referencing operator can be assigned to. It performs an aliasing operation, so that the variable name referenced on the @@ -868,7 +868,7 @@ Combining that form with C<local> and putting parentheses immediately around a hash are forbidden (because it is not clear what they should do): \local(@array) = foo(); # WRONG - \(%hash) = bar(); # wRONG + \(%hash) = bar(); # WRONG Assignment to references and non-references may be combined in lists and conditional ternary expressions, as long as the values on the right-hand @@ -909,6 +909,29 @@ will only be visible within that inner sub, and will not affect the outer subroutine where the variables are declared. This bizarre behavior is subject to change. +=head1 Declaring a Reference to a Variable + +Beginning in v5.26.0, the referencing operator can come after C<my>, +C<state>, C<our>, or C<local>. This syntax must be enabled with C<use +feature 'declared_refs'>. It is experimental, and will warn by default +unless C<no warnings 'experimental::declared_refs'> is in effect. + +This feature makes these: + + my \$x; + our \$y; + +equivalent to: + + \my $x; + \our $y; + +It is intended mainly for use in assignments to references (see +L</Assigning to References>, above). It also allows the backslash to be +used on just some items in a list of declared variables: + + my ($foo, \@bar, \%baz); # equivalent to: my $foo, \my(@bar, %baz); + =head1 SEE ALSO Besides the obvious documents, source code can be instructive.
diff --git a/gnu/usr.bin/perl/pod/perlreftut.pod b/gnu/usr.bin/perl/pod/perlreftut.pod index cd17c8bb890..94a96b0e1c3 100644 --- a/gnu/usr.bin/perl/pod/perlreftut.pod +++ b/gnu/usr.bin/perl/pod/perlreftut.pod @@ -63,14 +63,15 @@ references. A reference is a scalar value that I<refers to> an entire array or an entire hash (or to just about anything else). Names are one kind of -reference that you're already familiar with. Think of the President -of the United States: a messy, inconvenient bag of blood and bones. -But to talk about him, or to represent him in a computer program, all -you need is the easy, convenient scalar string "Barack Obama". +reference that you're already familiar with. Each human being is a +messy, inconvenient collection of cells. But to refer to a particular +human, for instance the first computer programmer, it isn't necessary to +describe each of their cells; all you need is the easy, convenient +scalar string "Ada Lovelace". References in Perl are like names for arrays and hashes. They're Perl's private, internal names, so you can be sure they're -unambiguous. Unlike "Barack Obama", a reference only refers to one +unambiguous. Unlike a human name, a reference only refers to one thing, and you always know what it refers to. If you have a reference to an array, you can recover the entire array from it. If you have a reference to a hash, you can recover the entire hash. But the @@ -502,7 +503,7 @@ to do with references. Author: Mark Jason Dominus, Plover Systems (C<mjd-perl-ref+@plover.com>) This article originally appeared in I<The Perl Journal> -( http://www.tpj.com/ ) volume 3, #2. Reprinted with permission. +( L<http://www.tpj.com/> ) volume 3, #2. Reprinted with permission. The original title was I<Understand References Today>. 
diff --git a/gnu/usr.bin/perl/pod/perlrequick.pod b/gnu/usr.bin/perl/pod/perlrequick.pod index d72bd2b5213..5c5030c24ce 100644 --- a/gnu/usr.bin/perl/pod/perlrequick.pod +++ b/gnu/usr.bin/perl/pod/perlrequick.pod @@ -10,6 +10,9 @@ using regular expressions ('regexes') in Perl. =head1 The Guide +This page assumes you already know things, like what a "pattern" is, and +the basic syntax of using them. If you don't, see L<perlretut>. + =head2 Simple word matching The simplest regex is simply a word, or more generally, a string of @@ -64,12 +67,13 @@ Perl will always match at the earliest possible point in the string: "That hat is red" =~ /hat/; # matches 'hat' in 'That' Not all characters can be used 'as is' in a match. Some characters, -called B<metacharacters>, are reserved for use in regex notation. -The metacharacters are +called B<metacharacters>, are considered special, and reserved for use +in regex notation. The metacharacters are {}[]()^$.|*+?\ -A metacharacter can be matched by putting a backslash before it: +A metacharacter can be matched literally by putting a backslash before +it: "2+2=4" =~ /2+2/; # doesn't match, + is a metacharacter "2+2=4" =~ /2\+2/; # matches, \+ is treated like an ordinary + @@ -79,6 +83,12 @@ A metacharacter can be matched by putting a backslash before it: In the last regex, the forward slash C<'/'> is also backslashed, because it is used to delimit the regex. +Most of the metacharacters aren't always special, and other characters +(such as the ones delimiting the pattern) become special under various +circumstances. This can be confusing and lead to unexpected results. +L<S<C<use re 'strict'>>|re/'strict' mode> can notify you of potential +pitfalls. + Non-printable ASCII characters are represented by B<escape sequences>. Common examples are C<\t> for a tab, C<\n> for a newline, and C<\r> for a carriage return.
Arbitrary bytes are represented by octal @@ -86,7 +96,7 @@ escape sequences, e.g., C<\033>, or hexadecimal escape sequences, e.g., C<\x1B>: "1000\t2000" =~ m(0\t2) # matches - "cat" =~ /\143\x61\x74/ # matches in ASCII, but + "cat" =~ /\143\x61\x74/ # matches in ASCII, but # a weird way to spell cat Regexes are treated mostly as double-quoted strings, so variable @@ -113,8 +123,13 @@ end of the string. Some examples: A B<character class> allows a set of possible characters, rather than just a single character, to match at a particular point in a regex. -Character classes are denoted by brackets C<[...]>, with the set of -characters to be possibly matched inside. Here are some examples: +There are a number of different types of character classes, but usually +when people use this term, they are referring to the type described in +this section, which are technically called "Bracketed character +classes", because they are denoted by brackets C<[...]>, with the set of +characters to be possibly matched inside. But we'll drop the "bracketed" +below to correspond with common usage. Here are some examples of +(bracketed) character classes: /cat/; # matches 'cat' /[bcr]at/; # matches 'bat', 'cat', or 'rat' @@ -378,9 +393,9 @@ no string left to it, so it matches 0 times. There are a few more things you might want to know about matching operators. -The global modifier C<//g> allows the matching operator to match +The global modifier C</g> allows the matching operator to match within a string as many times as possible. In scalar context, -successive matches against a string will have C<//g> jump from match +successive matches against a string will have C</g> jump from match to match, keeping track of position in the string as it goes along. You can get or set the position with the C<pos()> function. For example, @@ -398,9 +413,9 @@ prints A failed match or changing the target string resets the position. 
If you don't want the position reset after failure to match, add the -C<//c>, as in C</regex/gc>. +C</c>, as in C</regex/gc>. -In list context, C<//g> returns a list of matched groupings, or if +In list context, C</g> returns a list of matched groupings, or if there are no groupings, a list of matches to the whole regex. So @words = ($x =~ /(\w+)/g); # matches, diff --git a/gnu/usr.bin/perl/pod/perlreref.pod b/gnu/usr.bin/perl/pod/perlreref.pod index db7c173a586..a2fb8553065 100644 --- a/gnu/usr.bin/perl/pod/perlreref.pod +++ b/gnu/usr.bin/perl/pod/perlreref.pod @@ -60,7 +60,7 @@ with two additions: 'e' may be specified multiple times. 'replacement' is interpreted as a double quoted string unless a single-quote (C<'>) is the delimiter. -C<?pattern?> is like C<m/pattern/> but matches only once. No alternate +C<m?pattern?> is like C<m/pattern/> but matches only once. No alternate delimiters can be used. Must be reset with reset(). =head2 SYNTAX @@ -135,7 +135,7 @@ and L<perlunicode> for details. \W A non-word character \s A whitespace character \S A non-whitespace character - \h An horizontal whitespace + \h A horizontal whitespace \H A non horizontal whitespace \N A non newline (when not followed by '{NAME}';; not valid in a character class; equivalent to [^\n]; it's @@ -234,10 +234,15 @@ There is no quantifier C<{,n}>. That's interpreted as a literal string. (?:...) Groups subexpressions without capturing (cluster) (?pimsx-imsx:...) Enable/disable option (as per m// modifiers) (?=...) Zero-width positive lookahead assertion + (?*pla:...) Same; avail experimentally starting in 5.28 (?!...) Zero-width negative lookahead assertion + (?*nla:...) Same; avail experimentally starting in 5.28 (?<=...) Zero-width positive lookbehind assertion + (?*plb:...) Same; avail experimentally starting in 5.28 (?<!...) Zero-width negative lookbehind assertion + (?*nlb:...) Same; avail experimentally starting in 5.28 (?>...) Grab what we can, prohibit backtracking + (?*atomic:...) 
Same; avail experimentally starting in 5.28 (?|...) Branch reset (?<name>...) Named capture (?'name'...) Named capture @@ -308,7 +313,7 @@ Captured groups are numbered according to their I<opening> paren. pos Return or set current match position quotemeta Quote metacharacters - reset Reset ?pattern? status + reset Reset m?pattern? status study Analyze string for optimizing matching split Use a regex to split a string into parts @@ -393,7 +398,7 @@ for details on regexes and internationalisation. =item * I<Mastering Regular Expressions> by Jeffrey Friedl -(F<http://oreilly.com/catalog/9780596528126/>) for a thorough grounding and +(L<http://oreilly.com/catalog/9780596528126/>) for a thorough grounding and reference on the topic. =back diff --git a/gnu/usr.bin/perl/pod/perlretut.pod b/gnu/usr.bin/perl/pod/perlretut.pod index 516d86913cf..1e1cdd49b28 100644 --- a/gnu/usr.bin/perl/pod/perlretut.pod +++ b/gnu/usr.bin/perl/pod/perlretut.pod @@ -17,21 +17,42 @@ expressions display an efficiency and flexibility unknown in most other computer languages. Mastering even the basics of regular expressions will allow you to manipulate text with surprising ease. -What is a regular expression? A regular expression is simply a string -that describes a pattern. Patterns are in common use these days; +What is a regular expression? At its most basic, a regular expression +is a template that is used to determine if a string has certain +characteristics. The string is most often some text, such as a line, +sentence, web page, or even a whole book, but less commonly it could be +some binary data as well. +Suppose we want to determine if the text in a variable, C<$var>, contains +the sequence of characters S<C<m u s h r o o m>> +(blanks added for legibility). We can write in Perl + + $var =~ m/mushroom/ + +The value of this expression will be TRUE if C<$var> contains that +sequence of characters, and FALSE otherwise.
The portion enclosed in +C<'E<sol>'> characters denotes the characteristic we are looking for. +We use the term I<pattern> for it. The process of looking to see if the +pattern occurs in the string is called I<matching>, and the C<"=~"> +operator along with the C<m//> tell Perl to try to match the pattern +against the string. Note that the pattern is also a string, but a very +special kind of one, as we will see. Patterns are in common use these +days; examples are the patterns typed into a search engine to find web pages -and the patterns used to list files in a directory, e.g., C<ls *.txt> -or C<dir *.*>. In Perl, the patterns described by regular expressions -are used to search strings, extract desired parts of strings, and to -do search and replace operations. +and the patterns used to list files in a directory, I<e.g.>, "C<ls *.txt>" +or "C<dir *.*>". In Perl, the patterns described by regular expressions +are used not only to search strings, but to also extract desired parts +of strings, and to do search and replace operations. Regular expressions have the undeserved reputation of being abstract -and difficult to understand. Regular expressions are constructed using +and difficult to understand. This stems simply from the fact that the +notation used to express them tends to be terse and dense, and not +from any inherent complexity. We recommend using the C</x> regular +expression modifier (described below) along with plenty of white space +to make them less dense, and easier to read. Regular expressions are +constructed using simple concepts like conditionals and loops and are no more difficult to understand than the corresponding C<if> conditionals and C<while> -loops in the Perl language itself. In fact, the main challenge in -learning regular expressions is just getting used to the terse -notation used to express these concepts. +loops in the Perl language itself.
This tutorial flattens the learning curve by discussing regular expression concepts, along with their notation, one at a time and with @@ -43,7 +64,7 @@ comfortable with the basics and hungry for more power tools. It discusses the more advanced regular expression operators and introduces the latest cutting-edge innovations. -A note: to save time, 'regular expression' is often abbreviated as +A note: to save time, "regular expression" is often abbreviated as regexp or regex. Regexp is a more natural abbreviation than regex, but is harder to pronounce. The Perl pod documentation is evenly split on regexp vs regex; in Perl, there is more than one way to abbreviate it. @@ -58,7 +79,7 @@ find things that, while legal, may not be what you intended. =head2 Simple word matching The simplest regexp is simply a word, or more generally, a string of -characters. A regexp consisting of a word matches any string that +characters. A regexp consisting of just a word matches any string that contains that word: "Hello World" =~ /World/; # matches @@ -91,7 +112,7 @@ be reversed by using the C<!~> operator: The literal string in the regexp can be replaced by a variable: - $greeting = "World"; + my $greeting = "World"; if ("Hello World" =~ /$greeting/) { print "It matches\n"; } @@ -119,7 +140,7 @@ to arbitrary delimiters by putting an C<'m'> out front: # '/' becomes an ordinary char C</World/>, C<m!World!>, and C<m{World}> all represent the -same thing. When, e.g., the quote (C<">) is used as a delimiter, the forward +same thing. When, I<e.g.>, the quote (C<'"'>) is used as a delimiter, the forward slash C<'/'> becomes an ordinary character and can be used in this regexp without trouble. @@ -133,10 +154,10 @@ Let's consider how different regexps would match C<"Hello World">: The first regexp C<world> doesn't match because regexps are case-sensitive. The second regexp matches because the substring S<C<'o W'>> occurs in the string S<C<"Hello World">>. 
The space -character ' ' is treated like any other character in a regexp and is +character C<' '> is treated like any other character in a regexp and is needed to match in this case. The lack of a space character is the reason the third regexp C<'oW'> doesn't match. The fourth regexp -C<'World '> doesn't match because there is a space at the end of the +"C<World >" doesn't match because there is a space at the end of the regexp, but not at the end of the string. The lesson here is that regexps must match a part of the string I<exactly> in order for the statement to be true. @@ -148,15 +169,22 @@ always match at the earliest possible point in the string: "That hat is red" =~ /hat/; # matches 'hat' in 'That' With respect to character matching, there are a few more points you -need to know about. First of all, not all characters can be used 'as -is' in a match. Some characters, called I<metacharacters>, are reserved -for use in regexp notation. The metacharacters are +need to know about. First of all, not all characters can be used "as +is" in a match. Some characters, called I<metacharacters>, are +generally reserved for use in regexp notation. The metacharacters are - {}[]()^$.|*+?\ + {}[]()^$.|*+?-#\ + +This list is not as definitive as it may appear (or be claimed to be in +other documentation). For example, C<"#"> is a metacharacter only when +the C</x> pattern modifier (described below) is used, and both C<"}"> +and C<"]"> are metacharacters only when paired with opening C<"{"> or +C<"["> respectively; other gotchas apply. 
The significance of each of these will be explained in the rest of the tutorial, but for now, it is important only to know -that a metacharacter can be matched by putting a backslash before it: +that a metacharacter can be matched as-is by putting a backslash before +it: "2+2=4" =~ /2+2/; # doesn't match, + is a metacharacter "2+2=4" =~ /2\+2/; # matches, \+ is treated like an ordinary + @@ -176,13 +204,21 @@ be backslashed: 'C:\WIN32' =~ /C:\\WIN/; # matches +In situations where it doesn't make sense for a particular metacharacter +to mean what it normally does, it automatically loses its +metacharacter-ness and becomes an ordinary character that is to be +matched literally. For example, the C<'}'> is a metacharacter only when +it is the mate of a C<'{'> metacharacter. Otherwise it is treated as a +literal RIGHT CURLY BRACKET. This may lead to unexpected results. +L<C<use re 'strict'>|re/'strict' mode> can catch some of these. + In addition to the metacharacters, there are some ASCII characters which don't have printable character equivalents and are instead represented by I<escape sequences>. Common examples are C<\t> for a tab, C<\n> for a newline, C<\r> for a carriage return and C<\a> for a bell (or alert). If your string is better thought of as a sequence of arbitrary -bytes, the octal escape sequence, e.g., C<\033>, or hexadecimal escape -sequence, e.g., C<\x1B> may be a more natural representation for your +bytes, the octal escape sequence, I<e.g.>, C<\033>, or hexadecimal escape +sequence, I<e.g.>, C<\x1B> may be a more natural representation for your bytes. Here are some examples of escapes: "1000\t2000" =~ m(0\t2) # matches @@ -241,9 +277,9 @@ C</$regexp/> use the default variable C<$_> implicitly. With all of the regexps above, if the regexp matched anywhere in the string, it was considered a match. Sometimes, however, we'd like to specify I<where> in the string the regexp should try to match. 
To do -this, we would use the I<anchor> metacharacters C<^> and C<$>. The -anchor C<^> means match at the beginning of the string and the anchor -C<$> means match at the end of the string, or before a newline at the +this, we would use the I<anchor> metacharacters C<'^'> and C<'$'>. The +anchor C<'^'> means match at the beginning of the string and the anchor +C<'$'> means match at the end of the string, or before a newline at the end of the string. Here is how they are used: "housekeeper" =~ /keeper/; # matches @@ -251,13 +287,13 @@ end of the string. Here is how they are used: "housekeeper" =~ /keeper$/; # matches "housekeeper\n" =~ /keeper$/; # matches -The second regexp doesn't match because C<^> constrains C<keeper> to +The second regexp doesn't match because C<'^'> constrains C<keeper> to match only at the beginning of the string, but C<"housekeeper"> has keeper starting in the middle. The third regexp does match, since the -C<$> constrains C<keeper> to match only at the end of the string. +C<'$'> constrains C<keeper> to match only at the end of the string. -When both C<^> and C<$> are used at the same time, the regexp has to -match both the beginning and the end of the string, i.e., the regexp +When both C<'^'> and C<'$'> are used at the same time, the regexp has to +match both the beginning and the end of the string, I<i.e.>, the regexp matches the whole string. Consider "keeper" =~ /^keep$/; # doesn't match @@ -266,7 +302,7 @@ matches the whole string. Consider The first regexp doesn't match because the string has more to it than C<keep>. Since the second regexp is exactly the string, it -matches. Using both C<^> and C<$> in a regexp forces the complete +matches. Using both C<'^'> and C<'$'> in a regexp forces the complete string to match, so it gives you complete control over which strings match and which don't. Suppose you are looking for a fellow named bert, off in a string by himself: @@ -322,13 +358,13 @@ operation. 
We will meet other modifiers later in the tutorial. We saw in the section above that there were ordinary characters, which represented themselves, and special characters, which needed a -backslash C<\> to represent themselves. The same is true in a +backslash C<'\'> to represent themselves. The same is true in a character class, but the sets of ordinary and special characters inside a character class are different than those outside a character class. The special characters for a character class are C<-]\^$> (and the pattern delimiter, whatever it is). -C<]> is special because it denotes the end of a character class. C<$> is -special because it denotes a scalar variable. C<\> is special because +C<']'> is special because it denotes the end of a character class. C<'$'> is +special because it denotes a scalar variable. C<'\'> is special because it is used in escape sequences, just like above. Here is how the special characters C<]$\> are handled: @@ -339,7 +375,7 @@ special characters C<]$\> are handled: /[\\$x]at/; # matches '\at', 'bat, 'cat', or 'rat' The last two are a little tricky. In C<[\$x]>, the backslash protects -the dollar sign, so the character class has two members C<$> and C<x>. +the dollar sign, so the character class has two members C<'$'> and C<'x'>. In C<[\\$x]>, the backslash is protected, so C<$x> is treated as a variable and substituted in double quote fashion. @@ -359,7 +395,7 @@ If C<'-'> is the first or last character in a character class, it is treated as an ordinary character; C<[-ab]>, C<[ab-]> and C<[a\-b]> are all equivalent. -The special character C<^> in the first position of a character class +The special character C<'^'> in the first position of a character class denotes a I<negated character class>, which matches any character but those in the brackets. Both C<[...]> and C<[^...]> must match a character, or the match fails. Then @@ -372,7 +408,7 @@ character, or the match fails. 
Then Now, even C<[0-9]> can be a bother to write multiple times, so in the interest of saving keystrokes and making regexps more readable, Perl has several abbreviations for common character classes, as shown below. -Since the introduction of Unicode, unless the C<//a> modifier is in +Since the introduction of Unicode, unless the C</a> modifier is in effect, these character classes match more than just a few characters in the ASCII range. @@ -380,46 +416,46 @@ the ASCII range. =item * -\d matches a digit, not just [0-9] but also digits from non-roman scripts +C<\d> matches a digit, not just C<[0-9]> but also digits from non-roman scripts =item * -\s matches a whitespace character, the set [\ \t\r\n\f] and others +C<\s> matches a whitespace character, the set C<[\ \t\r\n\f]> and others =item * -\w matches a word character (alphanumeric or _), not just [0-9a-zA-Z_] +C<\w> matches a word character (alphanumeric or C<'_'>), not just C<[0-9a-zA-Z_]> but also digits and characters from non-roman scripts =item * -\D is a negated \d; it represents any other character than a digit, or [^\d] +C<\D> is a negated C<\d>; it represents any other character than a digit, or C<[^\d]> =item * -\S is a negated \s; it represents any non-whitespace character [^\s] +C<\S> is a negated C<\s>; it represents any non-whitespace character C<[^\s]> =item * -\W is a negated \w; it represents any non-word character [^\w] +C<\W> is a negated C<\w>; it represents any non-word character C<[^\w]> =item * -The period '.' matches any character but "\n" (unless the modifier C<//s> is +The period C<'.'> matches any character but C<"\n"> (unless the modifier C</s> is in effect, as explained below). =item * -\N, like the period, matches any character but "\n", but it does so -regardless of whether the modifier C<//s> is in effect. +C<\N>, like the period, matches any character but C<"\n">, but it does so +regardless of whether the modifier C</s> is in effect. 
=back -The C<//a> modifier, available starting in Perl 5.14, is used to -restrict the matches of \d, \s, and \w to just those in the ASCII range. +The C</a> modifier, available starting in Perl 5.14, is used to +restrict the matches of C<\d>, C<\s>, and C<\w> to just those in the ASCII range. It is useful to keep your program from being needlessly exposed to full Unicode (and its accompanying security considerations) when all you want -is to process English-like text. (The "a" may be doubled, C<//aa>, to +is to process English-like text. (The "a" may be doubled, C</aa>, to provide even more restrictions, preventing case-insensitive matching of ASCII with non-ASCII characters; otherwise a Unicode "Kelvin Sign" would caselessly match a "k" or "K".) @@ -481,48 +517,48 @@ of it as empty. Then This behavior is convenient, because we usually want to ignore newlines when we count and match characters in a line. Sometimes, -however, we want to keep track of newlines. We might even want C<^> -and C<$> to anchor at the beginning and end of lines within the +however, we want to keep track of newlines. We might even want C<'^'> +and C<'$'> to anchor at the beginning and end of lines within the string, rather than just the beginning and end of the string. Perl allows us to choose between ignoring and paying attention to newlines -by using the C<//s> and C<//m> modifiers. C<//s> and C<//m> stand for +by using the C</s> and C</m> modifiers. C</s> and C</m> stand for single line and multi-line and they determine whether a string is to be treated as one continuous string, or as a set of lines. The two modifiers affect two aspects of how the regexp is interpreted: 1) how -the C<'.'> character class is defined, and 2) where the anchors C<^> -and C<$> are able to match. Here are the four possible combinations: +the C<'.'> character class is defined, and 2) where the anchors C<'^'> +and C<'$'> are able to match. 
Here are the four possible combinations: =over 4 =item * -no modifiers (//): Default behavior. C<'.'> matches any character -except C<"\n">. C<^> matches only at the beginning of the string and -C<$> matches only at the end or before a newline at the end. +no modifiers: Default behavior. C<'.'> matches any character +except C<"\n">. C<'^'> matches only at the beginning of the string and +C<'$'> matches only at the end or before a newline at the end. =item * -s modifier (//s): Treat string as a single long line. C<'.'> matches -any character, even C<"\n">. C<^> matches only at the beginning of -the string and C<$> matches only at the end or before a newline at the +s modifier (C</s>): Treat string as a single long line. C<'.'> matches +any character, even C<"\n">. C<'^'> matches only at the beginning of +the string and C<'$'> matches only at the end or before a newline at the end. =item * -m modifier (//m): Treat string as a set of multiple lines. C<'.'> -matches any character except C<"\n">. C<^> and C<$> are able to match +m modifier (C</m>): Treat string as a set of multiple lines. C<'.'> +matches any character except C<"\n">. C<'^'> and C<'$'> are able to match at the start or end of I<any> line within the string. =item * -both s and m modifiers (//sm): Treat string as a single long line, but +both s and m modifiers (C</sm>): Treat string as a single long line, but detect multiple lines. C<'.'> matches any character, even -C<"\n">. C<^> and C<$>, however, are able to match at the start or end +C<"\n">. C<'^'> and C<'$'>, however, are able to match at the start or end of I<any> line within the string. =back -Here are examples of C<//s> and C<//m> in action: +Here are examples of C</s> and C</m> in action: $x = "There once was a girl\nWho programmed in Perl\n"; @@ -536,11 +572,11 @@ Here are examples of C<//s> and C<//m> in action: $x =~ /girl.Who/m; # doesn't match, "." doesn't match "\n" $x =~ /girl.Who/sm; # matches, "." 
matches "\n" -Most of the time, the default behavior is what is wanted, but C<//s> and -C<//m> are occasionally very useful. If C<//m> is being used, the start +Most of the time, the default behavior is what is wanted, but C</s> and +C</m> are occasionally very useful. If C</m> is being used, the start of the string can still be matched with C<\A> and the end of the string can still be matched with the anchors C<\Z> (matches both the end and -the newline before, like C<$>), and C<\z> (matches only the end): +the newline before, like C<'$'>), and C<\z> (matches only the end): $x =~ /^Who/m; # matches, "Who" at start of second line $x =~ /\AWho/m; # doesn't match, "Who" is not at start of string @@ -559,7 +595,7 @@ choices are described in the next section. Sometimes we would like our regexp to be able to match different possible words or character strings. This is accomplished by using -the I<alternation> metacharacter C<|>. To match C<dog> or C<cat>, we +the I<alternation> metacharacter C<'|'>. To match C<dog> or C<cat>, we form the regexp C<dog|cat>. As before, Perl will try to match the regexp at the earliest possible point in the string. At each character position, Perl will first try to match the first @@ -633,7 +669,7 @@ C<"20"> is two digits. The process of trying one alternative, seeing if it matches, and moving on to the next alternative, while going back in the string from where the previous alternative was tried, if it doesn't, is called -I<backtracking>. The term 'backtracking' comes from the idea that +I<backtracking>. The term "backtracking" comes from the idea that matching a regexp is like a walk in the woods. Successfully matching a regexp is like arriving at a destination. There are many possible trailheads, one for each string position, and each one is tried in @@ -651,62 +687,59 @@ of what Perl does when it tries to match the regexp =over 4 -=item Z<>0 - -Start with the first letter in the string 'a'. +=item Z<>0. 
Start with the first letter in the string C<'a'>. -=item Z<>1 +E<nbsp> -Try the first alternative in the first group 'abd'. +=item Z<>1. Try the first alternative in the first group C<'abd'>. -=item Z<>2 +E<nbsp> -Match 'a' followed by 'b'. So far so good. +=item Z<>2. Match C<'a'> followed by C<'b'>. So far so good. -=item Z<>3 +E<nbsp> -'d' in the regexp doesn't match 'c' in the string - a dead -end. So backtrack two characters and pick the second alternative in -the first group 'abc'. +=item Z<>3. C<'d'> in the regexp doesn't match C<'c'> in the string - a +dead end. So backtrack two characters and pick the second alternative +in the first group C<'abc'>. -=item Z<>4 +E<nbsp> -Match 'a' followed by 'b' followed by 'c'. We are on a roll -and have satisfied the first group. Set $1 to 'abc'. +=item Z<>4. Match C<'a'> followed by C<'b'> followed by C<'c'>. We are on a roll +and have satisfied the first group. Set C<$1> to C<'abc'>. -=item Z<>5 +E<nbsp> -Move on to the second group and pick the first alternative -'df'. +=item Z<>5. Move on to the second group and pick the first alternative C<'df'>. -=item Z<>6 +E<nbsp> -Match the 'd'. +=item Z<>6. Match the C<'d'>. -=item Z<>7 +E<nbsp> -'f' in the regexp doesn't match 'e' in the string, so a dead +=item Z<>7. C<'f'> in the regexp doesn't match C<'e'> in the string, so a dead end. Backtrack one character and pick the second alternative in the -second group 'd'. +second group C<'d'>. -=item Z<>8 +E<nbsp> -'d' matches. The second grouping is satisfied, so set $2 to -'d'. +=item Z<>8. C<'d'> matches. The second grouping is satisfied, so set +C<$2> to C<'d'>. -=item Z<>9 +E<nbsp> -We are at the end of the regexp, so we are done! We have -matched 'abcd' out of the string "abcde". +=item Z<>9. We are at the end of the regexp, so we are done! We have +matched C<'abcd'> out of the string C<"abcde">. =back There are a couple of things to note about this analysis.
First, the -third alternative in the second group 'de' also allows a match, but we +third alternative in the second group C<'de'> also allows a match, but we stopped before we got to it - at a given character position, leftmost wins. Second, we were able to get a match at the first character -position of the string 'a'. If there were no matches at the first -position, Perl would move to the second character position 'b' and +position of the string C<'a'>. If there were no matches at the first +position, Perl would move to the second character position C<'b'> and attempt the match all over again. Only when all possible paths at all possible character positions have been exhausted does Perl give up and declare S<C<$string =~ /(abd|abc)(df|d|de)/;>> to be false. @@ -723,7 +756,7 @@ The grouping metacharacters C<()> also serve another completely different function: they allow the extraction of the parts of a string that matched. This is very useful to find out what matched and for text processing in general. For each grouping, the part that matched -inside goes into the special variables C<$1>, C<$2>, etc. They can be +inside goes into the special variables C<$1>, C<$2>, I<etc>. They can be used just as ordinary variables: # extract hours, minutes, seconds @@ -743,7 +776,7 @@ C<($1,$2,$3)>. So we could write the code more compactly as If the groupings in a regexp are nested, C<$1> gets the group with the leftmost opening parenthesis, C<$2> the next opening parenthesis, -etc. Here is a regexp with nested groups: +I<etc>. Here is a regexp with nested groups: /(ab(cd|ef)((gi)|j))/; 1 2 34 @@ -755,7 +788,7 @@ or it remains undefined. For convenience, Perl sets C<$+> to the string held by the highest numbered C<$1>, C<$2>,... that got assigned (and, somewhat related, C<$^N> to the -value of the C<$1>, C<$2>,... most-recently assigned; i.e. the C<$1>, +value of the C<$1>, C<$2>,... most-recently assigned; I<i.e.> the C<$1>, C<$2>,... 
associated with the rightmost closing parenthesis used in the match). @@ -767,12 +800,12 @@ the I<backreferences> C<\g1>, C<\g2>,... Backreferences are simply matching variables that can be used I<inside> a regexp. This is a really nice feature; what matches later in a regexp is made to depend on what matched earlier in the regexp. Suppose we wanted to look -for doubled words in a text, like 'the the'. The following regexp finds +for doubled words in a text, like "the the". The following regexp finds all 3-letter doubles with a space in between: /\b(\w\w\w)\s\g1\b/; -The grouping assigns a value to \g1, so that the same 3-letter sequence +The grouping assigns a value to C<\g1>, so that the same 3-letter sequence is used for both parts. A similar task is to find words consisting of two identical parts: @@ -786,7 +819,7 @@ A similar task is to find words consisting of two identical parts: papa The regexp has a single grouping which considers 4-letter -combinations, then 3-letter combinations, etc., and uses C<\g1> to look for +combinations, then 3-letter combinations, I<etc>., and uses C<\g1> to look for a repeat. Although C<$1> and C<\g1> represent the same thing, care should be taken to use matched variables C<$1>, C<$2>,... only I<outside> a regexp and backreferences C<\g1>, C<\g2>,... only I<inside> a regexp; not doing @@ -840,14 +873,14 @@ capture group is accessible through the C<%+> hash. Assuming that we have to match calendar dates which may be given in one of the three formats yyyy-mm-dd, mm/dd/yyyy or dd.mm.yyyy, we can write -three suitable patterns where we use 'd', 'm' and 'y' respectively as the +three suitable patterns where we use C<'d'>, C<'m'> and C<'y'> respectively as the names of the groups capturing the pertaining components of a date. 
The matching operation combines the three patterns as alternatives: $fmt1 = '(?<y>\d\d\d\d)-(?<m>\d\d)-(?<d>\d\d)'; $fmt2 = '(?<m>\d\d)/(?<d>\d\d)/(?<y>\d\d\d\d)'; $fmt3 = '(?<d>\d\d)\.(?<m>\d\d)\.(?<y>\d\d\d\d)'; - for my $d qw( 2006-10-21 15.01.2007 10/31/2005 ){ + for my $d (qw(2006-10-21 15.01.2007 10/31/2005)) { if ( $d =~ m{$fmt1|$fmt2|$fmt3} ){ print "day=$+{d} month=$+{m} year=$+{y}\n"; } @@ -906,7 +939,7 @@ prints Even if there are no groupings in a regexp, it is still possible to find out what exactly matched in a string. If you use them, Perl will set C<$`> to the part of the string before the match, will set C<$&> -to the part of the string that matched, and will set C<$'> to the part +to the part of the string that matched, and will set C<$'> to the part of the string after the match. An example: $x = "the cat caught the mouse"; @@ -915,10 +948,10 @@ of the string after the match. An example: In the second match, C<$`> equals C<''> because the regexp matched at the first character position in the string and stopped; it never saw the -second 'the'. +second "the". If your code is to run on Perl versions earlier than -5.20, it is worthwhile to note that using C<$`> and C<$'> +5.20, it is worthwhile to note that using C<$`> and C<$'> slows down regexp matching quite a bit, while C<$&> slows it down to a lesser extent, because if they are used in one regexp in a program, they are generated for I<all> regexps in the program. So if raw @@ -935,7 +968,7 @@ variables may be used. These are only set if the C</p> modifier is present. Consequently they do not penalize the rest of the program. In Perl 5.20, C<${^PREMATCH}>, C<${^MATCH}> and C<${^POSTMATCH}> are available whether the C</p> has been used or not (the modifier is ignored), and -C<$`>, C<$'> and C<$&> do not cause any speed difference. +C<$`>, C<$'> and C<$&> do not cause any speed difference. =head2 Non-capturing groupings @@ -982,8 +1015,8 @@ less.
We'd like to be able to match words or, more generally, strings of any length, without writing out tedious alternatives like C<\w\w\w\w|\w\w\w|\w\w|\w>. -This is exactly the problem the I<quantifier> metacharacters C<?>, -C<*>, C<+>, and C<{}> were created for. They allow us to delimit the +This is exactly the problem the I<quantifier> metacharacters C<'?'>, +C<'*'>, C<'+'>, and C<{}> were created for. They allow us to delimit the number of repeats for a portion of a regexp we consider to be a match. Quantifiers are put immediately after the character, character class, or grouping that we want to specify. They have the following @@ -993,15 +1026,15 @@ meanings: =item * -C<a?> means: match 'a' 1 or 0 times +C<a?> means: match C<'a'> 1 or 0 times =item * -C<a*> means: match 'a' 0 or more times, i.e., any number of times +C<a*> means: match C<'a'> 0 or more times, I<i.e.>, any number of times =item * -C<a+> means: match 'a' 1 or more times, i.e., at least once +C<a+> means: match C<'a'> 1 or more times, I<i.e.>, at least once =item * @@ -1041,9 +1074,9 @@ Here are some examples: For all of these quantifiers, Perl will try to match as much of the string as possible, while still allowing the regexp to succeed. Thus -with C</a?.../>, Perl will first try to match the regexp with the C<a> +with C</a?.../>, Perl will first try to match the regexp with the C<'a'> present; if that fails, Perl will try to match the regexp without the -C<a> present. For the quantifier C<*>, we get the following: +C<'a'> present. For the quantifier C<'*'>, we get the following: $x = "the cat in the hat"; $x =~ /^(.*)(cat)(.*)$/; # matches, @@ -1090,7 +1123,7 @@ that allows a match for the whole regexp will be the one used. =item * -Principle 2: The maximal matching quantifiers C<?>, C<*>, C<+> and +Principle 2: The maximal matching quantifiers C<'?'>, C<'*'>, C<'+'> and C<{n,m}> will in general match as much of the string as possible while still allowing the whole regexp to match. 
@@ -1120,8 +1153,8 @@ Here is an example of these principles in action: # $3 = 'l' This regexp matches at the earliest string position, C<'T'>. One -might think that C<e>, being leftmost in the alternation, would be -matched, but C<r> produces the longest string in the first quantifier. +might think that C<'e'>, being leftmost in the alternation, would be +matched, but C<'r'> produces the longest string in the first quantifier. $x =~ /(m{1,2})(.*)$/; # matches, # $1 = 'mm' @@ -1146,7 +1179,7 @@ C<'m'> for the second quantifier C<m{1,2}>. Here, C<.?> eats its maximal one character at the earliest possible position in the string, C<'a'> in C<programming>, leaving C<m{1,2}> -the opportunity to match both C<m>'s. Finally, +the opportunity to match both C<'m'>'s. Finally, "aXXXb" =~ /(X*)/; # matches with $1 = '' @@ -1158,23 +1191,23 @@ Sometimes greed is not good. At times, we would like quantifiers to match a I<minimal> piece of string, rather than a maximal piece. For this purpose, Larry Wall created the I<minimal match> or I<non-greedy> quantifiers C<??>, C<*?>, C<+?>, and C<{}?>. These are -the usual quantifiers with a C<?> appended to them. They have the +the usual quantifiers with a C<'?'> appended to them. They have the following meanings: =over 4 =item * -C<a??> means: match 'a' 0 or 1 times. Try 0 first, then 1. +C<a??> means: match C<'a'> 0 or 1 times. Try 0 first, then 1. 
=item * -C<a*?> means: match 'a' 0 or more times, i.e., any number of times, +C<a*?> means: match C<'a'> 0 or more times, I<i.e.>, any number of times, but as few times as possible =item * -C<a+?> means: match 'a' 1 or more times, i.e., at least once, but +C<a+?> means: match C<'a'> 1 or more times, I<i.e.>, at least once, but as few times as possible =item * @@ -1203,9 +1236,9 @@ Let's look at the example above, but with minimal quantifiers: # $2 = 'e' # $3 = ' programming republic of Perl' -The minimal string that will allow both the start of the string C<^> +The minimal string that will allow both the start of the string C<'^'> and the alternation to match is C<Th>, with the alternation C<e|r> -matching C<e>. The second quantifier C<.*> is free to gobble up the +matching C<'e'>. The second quantifier C<.*> is free to gobble up the rest of the string. $x =~ /(m{1,2}?)(.*?)$/; # matches, @@ -1216,7 +1249,7 @@ The first string position that this regexp can match is at the first C<'m'> in C<programming>. At this position, the minimal C<m{1,2}?> matches just one C<'m'>. Although the second quantifier C<.*?> would prefer to match no characters, it is constrained by the end-of-string -anchor C<$> to match the rest of the string. +anchor C<'$'> to match the rest of the string. $x =~ /(.*?)(m{1,2}?)(.*)$/; # matches, # $1 = 'The progra' @@ -1224,12 +1257,12 @@ anchor C<$> to match the rest of the string. # $3 = 'ming republic of Perl' In this regexp, you might expect the first minimal quantifier C<.*?> -to match the empty string, because it is not constrained by a C<^> +to match the empty string, because it is not constrained by a C<'^'> anchor to match the beginning of the word. Principle 0 applies here, however. Because it is possible for the whole regexp to match at the start of the string, it I<will> match at the start of the string. Thus -the first quantifier has to match everything up to the first C<m>. 
The -second minimal quantifier matches just one C<m> and the third +the first quantifier has to match everything up to the first C<'m'>. The +second minimal quantifier matches just one C<'m'> and the third quantifier matches the rest of the string. $x =~ /(.??)(m{1,2})(.*)$/; # matches, @@ -1270,37 +1303,36 @@ backtracking. Here is a step-by-step analysis of the example =over 4 -=item Z<>0 +=item Z<>0. Start with the first letter in the string C<'t'>. -Start with the first letter in the string 't'. +E<nbsp> -=item Z<>1 +=item Z<>1. The first quantifier C<'.*'> starts out by matching the whole +string "C<the cat in the hat>". -The first quantifier '.*' starts out by matching the whole -string 'the cat in the hat'. +E<nbsp> -=item Z<>2 +=item Z<>2. C<'a'> in the regexp element C<'at'> doesn't match the end +of the string. Backtrack one character. -'a' in the regexp element 'at' doesn't match the end of the -string. Backtrack one character. +E<nbsp> -=item Z<>3 +=item Z<>3. C<'a'> in the regexp element C<'at'> still doesn't match +the last letter of the string C<'t'>, so backtrack one more character. -'a' in the regexp element 'at' still doesn't match the last -letter of the string 't', so backtrack one more character. +E<nbsp> -=item Z<>4 +=item Z<>4. Now we can match the C<'a'> and the C<'t'>. -Now we can match the 'a' and the 't'. +E<nbsp> -=item Z<>5 - -Move on to the third element '.*'. Since we are at the end of -the string and '.*' can match 0 times, assign it the empty string. +=item Z<>5. Move on to the third element C<'.*'>. Since we are at the +end of the string and C<'.*'> can match 0 times, assign it the empty +string. -=item Z<>6 +E<nbsp> -We are done! +=item Z<>6. We are done! =back @@ -1312,14 +1344,14 @@ string. A typical structure that blows up in your face is of the form /(a|b+)*/; The problem is the nested indeterminate quantifiers. 
There are many -different ways of partitioning a string of length n between the C<+> -and C<*>: one repetition with C<b+> of length n, two repetitions with +different ways of partitioning a string of length n between the C<'+'> +and C<'*'>: one repetition with C<b+> of length n, two repetitions with the first C<b+> length k and the second with length n-k, m repetitions -whose bits add up to length n, etc. In fact there are an exponential +whose bits add up to length n, I<etc>. In fact there are an exponential number of ways to partition a string as a function of its length. A regexp may get lucky and match early in the process, but if there is no match, Perl will try I<every> possibility before giving up. So be -careful with nested C<*>'s, C<{n,m}>'s, and C<+>'s. The book +careful with nested C<'*'>'s, C<{n,m}>'s, and C<'+'>'s. The book I<Mastering Regular Expressions> by Jeffrey Friedl gives a wonderful discussion of this and other efficiency issues. @@ -1334,15 +1366,15 @@ the simple pattern Whenever this is applied to a string which doesn't quite meet the pattern's expectations such as S<C<"abc ">> or S<C<"abc def ">>, -the regex engine will backtrack, approximately once for each character +the regexp engine will backtrack, approximately once for each character in the string. But we know that there is no way around taking I<all> of the initial word characters to match the first repetition, that I<all> spaces must be eaten by the middle part, and the same goes for the second word. With the introduction of the I<possessive quantifiers> in Perl 5.10, we -have a way of instructing the regex engine not to backtrack, with the -usual quantifiers with a C<+> appended to them. This makes them greedy as +have a way of instructing the regexp engine not to backtrack, with the +usual quantifiers with a C<'+'> appended to them. This makes them greedy as well as stingy; once they succeed they won't give anything back to permit another solution. 
They have the following meanings: @@ -1430,12 +1462,12 @@ Now consider floating point numbers with exponents. The key observation here is that I<both> integers and numbers with decimal points are allowed in front of an exponent. Then exponents, like the overall sign, are independent of whether we are matching numbers with -or without decimal points, and can be 'decoupled' from the +or without decimal points, and can be "decoupled" from the mantissa. The overall form of the regexp now becomes clear: /^(optional sign)(integer | f.p. mantissa)(optional exponent)$/; -The exponent is an C<e> or C<E>, followed by an integer. So the +The exponent is an C<'e'> or C<'E'>, followed by an integer. So the exponent regexp is /[eE][+-]?\d+/; # exponent @@ -1445,10 +1477,10 @@ Putting all the parts together, we get a regexp that matches numbers: /^[+-]?(\d+\.\d+|\d+\.|\.\d+|\d+)([eE][+-]?\d+)?$/; # Ta da! Long regexps like this may impress your friends, but can be hard to -decipher. In complex situations like this, the C<//x> modifier for a +decipher. In complex situations like this, the C</x> modifier for a match is invaluable. It allows one to put nearly arbitrary whitespace and comments into a regexp without affecting their meaning. Using it, -we can rewrite our 'extended' regexp in the more pleasing form +we can rewrite our "extended" regexp in the more pleasing form /^ [+-]? # first, match an optional sign @@ -1458,7 +1490,7 @@ we can rewrite our 'extended' regexp in the more pleasing form |\.\d+ # mantissa of the form .b |\d+ # integer of the form a ) - ([eE][+-]?\d+)? # finally, optionally match an exponent + ( [eE] [+-]? \d+ )? # finally, optionally match an exponent $/x; If whitespace is mostly irrelevant, how does one include space @@ -1476,7 +1508,7 @@ this to our regexp as follows: |\.\d+ # mantissa of the form .b |\d+ # integer of the form a ) - ([eE][+-]?\d+)? # finally, optionally match an exponent + ( [eE] [+-]? \d+ )? 
# finally, optionally match an exponent $/x; In this form, it is easier to see a way to simplify the @@ -1492,10 +1524,28 @@ could be factored out: )? # ? takes care of integers of the form a |\.\d+ # mantissa of the form .b ) - ([eE][+-]?\d+)? # finally, optionally match an exponent + ( [eE] [+-]? \d+ )? # finally, optionally match an exponent $/x; -or written in the compact form, +Starting in Perl v5.26, specifying C</xx> changes the square-bracketed +portions of a pattern to ignore tabs and space characters unless they +are escaped by preceding them with a backslash. So, we could write + + /^ + [ + - ]?\ * # first, match an optional sign + ( # then match integers or f.p. mantissas: + \d+ # start out with a ... + ( + \.\d* # mantissa of the form a.b or a. + )? # ? takes care of integers of the form a + |\.\d+ # mantissa of the form .b + ) + ( [ e E ] [ + - ]? \d+ )? # finally, optionally match an exponent + $/xx; + +This doesn't really improve the legibility of this example, but it's +available in case you want it. Squashing the pattern down to the +compact form, we have /^[+-]?\ *(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?$/; @@ -1539,8 +1589,8 @@ We have already introduced the matching operator in its default C</regexp/> and arbitrary delimiter C<m!regexp!> forms. We have used the binding operator C<=~> and its negation C<!~> to test for string matches. Associated with the matching operator, we have discussed the -single line C<//s>, multi-line C<//m>, case-insensitive C<//i> and -extended C<//x> modifiers. There are a few more things you might +single line C</s>, multi-line C</m>, case-insensitive C</i> and +extended C</x> modifiers. There are a few more things you might want to know about matching operators. =head3 Prohibiting substitution @@ -1555,7 +1605,7 @@ special delimiter C<m''>: } Similar to strings, C<m''> acts like apostrophes on a regexp; all other -C<m> delimiters act like quotes. 
If the regexp evaluates to the empty string, +C<'m'> delimiters act like quotes. If the regexp evaluates to the empty string, the regexp in the I<last successful match> is used instead. So we have "dog" =~ /d/; # 'd' matches @@ -1565,15 +1615,15 @@ the regexp in the I<last successful match> is used instead. So we have =head3 Global matching The final two modifiers we will discuss here, -C<//g> and C<//c>, concern multiple matches. -The modifier C<//g> stands for global matching and allows the +C</g> and C</c>, concern multiple matches. +The modifier C</g> stands for global matching and allows the matching operator to match within a string as many times as possible. In scalar context, successive invocations against a string will have -C<//g> jump from match to match, keeping track of position in the +C</g> jump from match to match, keeping track of position in the string as it goes along. You can get or set the position with the C<pos()> function. -The use of C<//g> is shown in the following example. Suppose we have +The use of C</g> is shown in the following example. Suppose we have a string that consists of words separated by spaces. If we know how many words there are in advance, we could extract the words using groupings: @@ -1585,7 +1635,7 @@ groupings: # $3 = 'house' But what if we had an indeterminate number of words? This is the sort -of task C<//g> was made for. To extract all words, form the simple +of task C</g> was made for. To extract all words, form the simple regexp C<(\w+)> and loop over all matches with C</(\w+)/g>: while ($x =~ /(\w+)/g) { @@ -1600,12 +1650,12 @@ prints A failed match or changing the target string resets the position. If you don't want the position reset after failure to match, add the -C<//c>, as in C</regexp/gc>. The current position in the string is +C</c>, as in C</regexp/gc>. The current position in the string is associated with the string, not the regexp. 
This means that different strings have different positions and their respective positions can be set or read independently. -In list context, C<//g> returns a list of matched groupings, or if +In list context, C</g> returns a list of matched groupings, or if there are no groupings, a list of matches to the whole regexp. So if we wanted just the words, we could use @@ -1614,8 +1664,8 @@ we wanted just the words, we could use # $words[1] = 'dog' # $words[2] = 'house' -Closely associated with the C<//g> modifier is the C<\G> anchor. The -C<\G> anchor matches at the point where the previous C<//g> match left +Closely associated with the C</g> modifier is the C<\G> anchor. The +C<\G> anchor matches at the point where the previous C</g> match left off. C<\G> allows us to easily do context-sensitive matching: $metric = 1; # use metric units @@ -1631,7 +1681,7 @@ off. C<\G> allows us to easily do context-sensitive matching: } $x =~ /\G\s+(widget|sprocket)/g; # continue processing -The combination of C<//g> and C<\G> allows us to process the string a +The combination of C</g> and C<\G> allows us to process the string a bit at a time and use arbitrary Perl logic to decide what to do next. Currently, the C<\G> anchor is only fully supported when used to anchor to the start of the pattern. @@ -1648,7 +1698,7 @@ naive regexp $dna =~ /TGA/; doesn't work; it may match a C<TGA>, but there is no guarantee that -the match is aligned with codon boundaries, e.g., the substring +the match is aligned with codon boundaries, I<e.g.>, the substring S<C<GTT GAA>> gives a match. A better solution is while ($dna =~ /(\w\w\w)*?TGA/g) { # note the minimal *? @@ -1681,7 +1731,7 @@ important not only to match what is desired, but to reject what is not desired. (There are other regexp modifiers that are available, such as -C<//o>, but their specialized uses are beyond the +C</o>, but their specialized uses are beyond the scope of this introduction. 
) =head3 Search and replace @@ -1691,7 +1741,7 @@ operations in Perl. Search and replace is accomplished with the C<s///> operator. The general form is C<s/regexp/replacement/modifiers>, with everything we know about regexps and modifiers applying in this case as well. The -C<replacement> is a Perl double-quoted string that replaces in the +I<replacement> is a Perl double-quoted string that replaces in the string whatever is matched with the C<regexp>. The operator C<=~> is also used here to associate a string with C<s///>. If matching against C<$_>, the S<C<$_ =~>> can be dropped. If there is a match, @@ -1709,7 +1759,7 @@ false. Here are a few examples: In the last example, the whole string was matched, but only the part inside the single quotes was grouped. With the C<s///> operator, the -matched variables C<$1>, C<$2>, etc. are immediately available for use +matched variables C<$1>, C<$2>, I<etc>. are immediately available for use in the replacement expression, so we use C<$1> to replace the quoted string with just what was quoted. With the global modifier, C<s///g> will search and replace all occurrences of the regexp in the string: @@ -1721,7 +1771,7 @@ will search and replace all occurrences of the regexp in the string: $x =~ s/4/four/g; # does it all: # $x contains "I batted four for four" -If you prefer 'regex' over 'regexp' in this tutorial, you could use +If you prefer "regex" over "regexp" in this tutorial, you could use the following program to replace it: % cat > simple_replace @@ -1799,7 +1849,7 @@ such as C<s!!!> and C<s{}{}>, and even C<s{}//>. If single quotes are used C<s'''>, then the regexp and replacement are treated as single-quoted strings and there are no variable substitutions. C<s///> in list context -returns the same thing as in scalar context, i.e., the number of +returns the same thing as in scalar context, I<i.e.>, the number of matches. =head3 The split function @@ -1834,7 +1884,7 @@ groupings as well. 
For instance, # $parts[5] = '/' # $parts[6] = 'perl' -Since the first character of $x matched the regexp, C<split> prepended +Since the first character of C<$x> matched the regexp, C<split> prepended an empty initial element to the list. If you have read this far, congratulations! You now have all the basic @@ -1893,7 +1943,7 @@ instance, $x = "\QThat !^*&%~& cat!"; $x =~ /\Q!^*&%~&\E/; # check for rough language -It does not protect C<$> or C<@>, so that variables can still be +It does not protect C<'$'> or C<'@'>, so that variables can still be substituted. C<\Q>, C<\L>, C<\l>, C<\U>, C<\u> and C<\E> are actually part of @@ -1915,8 +1965,9 @@ to know 1) how to represent Unicode characters in a regexp and 2) that a matching operation will treat the string to be searched as a sequence of characters, not bytes. The answer to 1) is that Unicode characters greater than C<chr(255)> are represented using the C<\x{hex}> notation, because -\x hex (without curly braces) doesn't go further than 255. (Starting in Perl -5.14, if you're an octal fan, you can also use C<\o{oct}>.) +C<\x>I<XY> (without curly braces and I<XY> are two hex digits) doesn't +go further than 255. (Starting in Perl 5.14, if you're an octal fan, +you can also use C<\o{oct}>.) /\x{263a}/; # match a Unicode smiley face :) @@ -1956,7 +2007,7 @@ L<http://www.unicode.org/standard/where>. The answer to requirement 2) is that a regexp (mostly) uses Unicode characters. The "mostly" is for messy backward -compatibility reasons, but starting in Perl 5.14, any regex compiled in +compatibility reasons, but starting in Perl 5.14, any regexp compiled in the scope of a C<use feature 'unicode_strings'> (which is automatically turned on within the scope of a C<use 5.012> or higher) will turn that "mostly" into "always". If you want to handle Unicode properly, you @@ -1967,10 +2018,9 @@ it is a sequence of characters, not bytes. See L<perlunitut> for a tutorial about that. 
Let us now discuss Unicode character classes, most usually called -"character properties". These are represented by the -C<\p{name}> escape sequence. Closely associated is the C<\P{name}> -property, which is the negation of the C<\p{name}> one. For -example, to match lower and uppercase characters, +"character properties". These are represented by the C<\p{I<name>}> +escape sequence. The negation of this is C<\P{I<name>}>. For example, +to match lower and uppercase characters, $x = "BOB"; $x =~ /^\p{IsUpper}/; # matches, uppercase char class @@ -1978,7 +2028,7 @@ example, to match lower and uppercase characters, $x =~ /^\p{IsLower}/; # doesn't match, lowercase char class $x =~ /^\P{IsLower}/; # matches, char class sans lowercase -(The "Is" is optional.) +(The "C<Is>" is optional.) There are many, many Unicode character properties. For the full list see L<perluniprops>. Most of them have synonyms with shorter names, @@ -1986,23 +2036,27 @@ also listed there. Some synonyms are a single character. For these, you can drop the braces. For instance, C<\pM> is the same thing as C<\p{Mark}>, meaning things like accent marks. -The Unicode C<\p{Script}> property is used to categorize every Unicode -character into the language script it is written in. For example, +The Unicode C<\p{Script}> and C<\p{Script_Extensions}> properties are +used to categorize every Unicode character into the language script it +is written in. (C<Script_Extensions> is an improved version of +C<Script>, which is retained for backward compatibility, and so you +should generally use C<Script_Extensions>.) +For example, English, French, and a bunch of other European languages are written in the Latin script. But there is also the Greek script, the Thai script, -the Katakana script, etc. You can test whether a character is in a -particular script with, for example C<\p{Latin}>, C<\p{Greek}>, -or C<\p{Katakana}>. To test if it isn't in the Balinese script, you -would use C<\P{Balinese}>. 
+the Katakana script, I<etc>. You can test whether a character is in a +particular script (based on C<Script_Extensions>) with, for example +C<\p{Latin}>, C<\p{Greek}>, or C<\p{Katakana}>. To test if it isn't in +the Balinese script, you would use C<\P{Balinese}>. What we have described so far is the single form of the C<\p{...}> character classes. There is also a compound form which you may run into. These -look like C<\p{name=value}> or C<\p{name:value}> (the equals sign and colon +look like C<\p{I<name>=I<value>}> or C<\p{I<name>:I<value>}> (the equals sign and colon can be used interchangeably). These are more general than the single form, and in fact most of the single forms are just Perl-defined shortcuts for common compound forms. For example, the script examples in the previous paragraph -could be written equivalently as C<\p{Script=Latin}>, C<\p{Script:Greek}>, -C<\p{script=katakana}>, and C<\P{script=balinese}> (case is irrelevant +could be written equivalently as C<\p{Script_Extensions=Latin}>, C<\p{Script_Extensions:Greek}>, +C<\p{script_extensions=katakana}>, and C<\P{script_extensions=balinese}> (case is irrelevant between the C<{}> braces). You may never have to use the compound forms, but sometimes it is necessary, and their use can make your code easier to understand. @@ -2010,28 +2064,28 @@ use can make your code easier to understand. C<\X> is an abbreviation for a character class that comprises a Unicode I<extended grapheme cluster>. This represents a "logical character": what appears to be a single character, but may be represented internally by more -than one. As an example, using the Unicode full names, e.g., S<C<A + COMBINING -RING>> is a grapheme cluster with base character C<A> and combining character -S<C<COMBINING RING>>, which translates in Danish to A with the circle atop it, +than one. 
As an example, using the Unicode full names, I<e.g.>, "S<A + COMBINING +RING>" is a grapheme cluster with base character "A" and combining character +"S<COMBINING RING>, which translates in Danish to "A" with the circle atop it, as in the word E<Aring>ngstrom. For the full and latest information about Unicode see the latest Unicode standard, or the Unicode Consortium's website L<http://www.unicode.org> As if all those classes weren't enough, Perl also defines POSIX-style -character classes. These have the form C<[:name:]>, with C<name> the +character classes. These have the form C<[:I<name>:]>, with I<name> the name of the POSIX class. The POSIX classes are C<alpha>, C<alnum>, C<ascii>, C<cntrl>, C<digit>, C<graph>, C<lower>, C<print>, C<punct>, C<space>, C<upper>, and C<xdigit>, and two extensions, C<word> (a Perl -extension to match C<\w>), and C<blank> (a GNU extension). The C<//a> +extension to match C<\w>), and C<blank> (a GNU extension). The C</a> modifier restricts these to matching just in the ASCII range; otherwise they can match the same as their corresponding Perl Unicode classes: -C<[:upper:]> is the same as C<\p{IsUpper}>, etc. (There are some +C<[:upper:]> is the same as C<\p{IsUpper}>, I<etc>. (There are some exceptions and gotchas with this; see L<perlrecharclass> for a full discussion.) The C<[:digit:]>, C<[:word:]>, and C<[:space:]> correspond to the familiar C<\d>, C<\w>, and C<\s> -character classes. To negate a POSIX class, put a C<^> in front of -the name, so that, e.g., C<[:^digit:]> corresponds to C<\D> and, under +character classes. To negate a POSIX class, put a C<'^'> in front of +the name, so that, I<e.g.>, C<[:^digit:]> corresponds to C<\D> and, under Unicode, C<\P{IsDigit}>. 
The Unicode and POSIX character classes can be used just like C<\d>, with the exception that POSIX character classes can only be used inside of a character class: @@ -2067,7 +2121,7 @@ C<$reg> can also be interpolated into a larger regexp: $x =~ /(abc)?$reg/; # still matches As with the matching operator, the regexp quote can use different -delimiters, e.g., C<qr!!>, C<qr{}> or C<qr~~>. Apostrophes +delimiters, I<e.g.>, C<qr!!>, C<qr{}> or C<qr~~>. Apostrophes as delimiters (C<qr''>) inhibit any interpolation. Pre-compiled regexps are useful for creating dynamic matches that @@ -2189,9 +2243,9 @@ example is /(?# Match an integer:)[+-]?\d+/; This style of commenting has been largely superseded by the raw, -freeform commenting that is allowed with the C<//x> modifier. +freeform commenting that is allowed with the C</x> modifier. -Most modifiers, such as C<//i>, C<//m>, C<//s> and C<//x> (or any +Most modifiers, such as C</i>, C</m>, C</s> and C</x> (or any combination thereof) can also be embedded in a regexp using C<(?i)>, C<(?m)>, C<(?s)>, and C<(?x)>. For instance, @@ -2204,7 +2258,7 @@ a regexp using C<(?i)>, C<(?m)>, C<(?s)>, and C<(?x)>. For instance, /x; Embedded modifiers can have two important advantages over the usual -modifiers. Embedded modifiers allow a custom set of modifiers to +modifiers. Embedded modifiers allow a custom set of modifiers for I<each> regexp pattern. This is great for matching an array of regexps that must have different modifiers: @@ -2217,7 +2271,7 @@ that must have different modifiers: } } -The second advantage is that embedded modifiers (except C<//p>, which +The second advantage is that embedded modifiers (except C</p>, which modifies the entire regexp) only affect the regexp inside the group the embedded modifier is contained in. So grouping can be used to localize the modifier's effects: @@ -2225,8 +2279,8 @@ can be used to localize the modifier's effects: /Answer: ((?i)yes)/; # matches 'Answer: yes', 'Answer: YES', etc. 
Embedded modifiers can also turn off any modifiers already present -by using, e.g., C<(?-i)>. Modifiers can also be combined into -a single expression, e.g., C<(?s-i)> turns on single line mode and +by using, I<e.g.>, C<(?-i)>. Modifiers can also be combined into +a single expression, I<e.g.>, C<(?s-i)> turns on single line mode and turns off case insensitivity. Embedded modifiers may also be added to a non-capturing grouping. @@ -2239,13 +2293,13 @@ case insensitively and turns off multi-line mode. This section concerns the lookahead and lookbehind assertions. First, a little background. -In Perl regular expressions, most regexp elements 'eat up' a certain +In Perl regular expressions, most regexp elements "eat up" a certain amount of string when they match. For instance, the regexp element C<[abc]> eats up one character of the string when it matches, in the sense that Perl moves to the next character position in the string after the match. There are some elements, however, that don't eat up characters (advance the character position) if they match. The examples -we have seen so far are the anchors. The anchor C<^> matches the +we have seen so far are the anchors. The anchor C<'^'> matches the beginning of the line, but doesn't eat any characters. Similarly, the word boundary anchor C<\b> matches wherever a character matching C<\w> is next to a character that doesn't, but it doesn't eat up any @@ -2259,8 +2313,8 @@ checks out, we can proceed forward. But if the local environment doesn't satisfy us, we must backtrack. Checking the environment entails either looking ahead on the trail, -looking behind, or both. C<^> looks behind, to see that there are no -characters before. C<$> looks ahead, to see that there are no +looking behind, or both. C<'^'> looks behind, to see that there are no +characters before. C<'$'> looks ahead, to see that there are no characters after. 
C<\b> looks both ahead and behind, to see if the characters on either side differ in their "word-ness". @@ -2284,7 +2338,7 @@ non-capturing, since these are zero-width assertions. Thus in the second regexp, the substrings captured are those of the whole regexp itself. Lookahead C<(?=regexp)> can match arbitrary regexps, but lookbehind C<< (?<=fixed-regexp) >> only works for regexps of fixed -width, i.e., a fixed number of characters long. Thus +width, I<i.e.>, a fixed number of characters long. Thus C<< (?<=(ab|bc)) >> is fine, but C<< (?<=(ab)*) >> is not. The negated versions of the lookahead and lookbehind assertions are denoted by C<(?!regexp)> and C<< (?<!fixed-regexp) >> respectively. @@ -2307,6 +2361,18 @@ by looking ahead and behind: | (?<=-) (?=\S) # a '-' followed by any non-space /x, $str; # @toks = qw(one two - - - 6 - 8) +Starting in Perl 5.28, experimentally, alphabetic equivalents to these +assertions are added, so you can use whichever is most memorable for +your tastes. + + (?=...) (*pla:...) or (*positive_lookahead:...) + (?!...) (*nla:...) or (*negative_lookahead:...) + (?<=...) (*plb:...) or (*positive_lookbehind:...) + (?<!...) (*nlb:...) or (*negative_lookbehind:...) + (?>...) (*atomic:...) + +Using any of these will raise (unless turned off) a warning in the +C<experimental::alpha_assertions> category. =head2 Using independent subexpressions to prevent backtracking @@ -2322,9 +2388,9 @@ considering an ordinary regexp: $x =~ /a*ab/; # matches This obviously matches, but in the process of matching, the -subexpression C<a*> first grabbed the C<a>. Doing so, however, +subexpression C<a*> first grabbed the C<'a'>. Doing so, however, wouldn't allow the whole regexp to match, so after backtracking, C<a*> -eventually gave back the C<a> and matched the empty string. Here, what +eventually gave back the C<'a'> and matched the empty string. Here, what C<a*> matched was I<dependent> on what the rest of the regexp matched. 
Contrast that with an independent subexpression: @@ -2332,17 +2398,17 @@ Contrast that with an independent subexpression: $x =~ /(?>a*)ab/; # doesn't match! The independent subexpression C<< (?>a*) >> doesn't care about the rest -of the regexp, so it sees an C<a> and grabs it. Then the rest of the +of the regexp, so it sees an C<'a'> and grabs it. Then the rest of the regexp C<ab> cannot match. Because C<< (?>a*) >> is independent, there is no backtracking and the independent subexpression does not give -up its C<a>. Thus the match of the regexp as a whole fails. A similar +up its C<'a'>. Thus the match of the regexp as a whole fails. A similar behavior occurs with completely independent regexps: $x = "ab"; $x =~ /a*/g; # matches, eats an 'a' $x =~ /\Gab/g; # doesn't match, no 'a' available -Here C<//g> and C<\G> create a 'tag team' handoff of the string from +Here C</g> and C<\G> create a "tag team" handoff of the string from one regexp to the other. Regexps with an independent subexpression are much like this, with a handoff of the string to the independent subexpression, and a handoff of the string back to the enclosing @@ -2354,7 +2420,7 @@ enclosed in parentheses up to two levels deep. Then the following regexp matches: $x = "abc(de(fg)h"; # unbalanced parentheses - $x =~ /\( ( [^()]+ | \([^()]*\) )+ \)/x; + $x =~ /\( ( [ ^ () ]+ | \( [ ^ () ]* \) )+ \)/xx; The regexp matches an open parenthesis, one or more copies of an alternation, and a close parenthesis. The alternation is two-way, with @@ -2368,7 +2434,7 @@ was no match possible. To prevent the exponential blowup, we need to prevent useless backtracking at some point. This can be done by enclosing the inner quantifier as an independent subexpression: - $x =~ /\( ( (?>[^()]+) | \([^()]*\) )+ \)/x; + $x =~ /\( ( (?> [ ^ () ]+ ) | \([ ^ () ]* \) )+ \)/xx; Here, C<< (?>[^()]+) >> breaks the degeneracy of string partitioning by gobbling up as much of the string as possible and keeping it. 
Then @@ -2380,26 +2446,27 @@ match failures fail much more quickly. A I<conditional expression> is a form of if-then-else statement that allows one to choose which patterns are to be matched, based on some condition. There are two types of conditional expression: -C<(?(condition)yes-regexp)> and -C<(?(condition)yes-regexp|no-regexp)>. C<(?(condition)yes-regexp)> is -like an S<C<'if () {}'>> statement in Perl. If the C<condition> is true, -the C<yes-regexp> will be matched. If the C<condition> is false, the -C<yes-regexp> will be skipped and Perl will move onto the next regexp +C<(?(I<condition>)I<yes-regexp>)> and +C<(?(I<condition>)I<yes-regexp>|I<no-regexp>)>. +C<(?(I<condition>)I<yes-regexp>)> is +like an S<C<'if () {}'>> statement in Perl. If the I<condition> is true, +the I<yes-regexp> will be matched. If the I<condition> is false, the +I<yes-regexp> will be skipped and Perl will move onto the next regexp element. The second form is like an S<C<'if () {} else {}'>> statement -in Perl. If the C<condition> is true, the C<yes-regexp> will be -matched, otherwise the C<no-regexp> will be matched. +in Perl. If the I<condition> is true, the I<yes-regexp> will be +matched, otherwise the I<no-regexp> will be matched. -The C<condition> can have several forms. The first form is simply an -integer in parentheses C<(integer)>. It is true if the corresponding -backreference C<\integer> matched earlier in the regexp. The same +The I<condition> can have several forms. The first form is simply an +integer in parentheses C<(I<integer>)>. It is true if the corresponding +backreference C<\I<integer>> matched earlier in the regexp. The same thing can be done with a name associated with a capture group, written -as C<< (<name>) >> or C<< ('name') >>. The second form is a bare +as C<<< (E<lt>I<name>E<gt>) >>> or C<< ('I<name>') >>. The second form is a bare zero-width assertion C<(?...)>, either a lookahead, a lookbehind, or a code assertion (discussed in the next section).
The third set of forms provides tests that return true if the expression is executed within a recursion (C<(R)>) or is being called from some capturing group, referenced either by number (C<(R1)>, C<(R2)>,...) or by name -(C<(R&name)>). +(C<(R&I<name>)>). The integer or name form of the C<condition> allows us to choose, with more flexibility, what to match based on what matched earlier in the @@ -2422,7 +2489,7 @@ match. For instance, /[ATGC]+(?(?<=AA)G|C)$/; matches a DNA sequence such that it either ends in C<AAG>, or some -other base pair combination and C<C>. Note that the form is +other base pair combination and C<'C'>. Note that the form is C<< (?(?<=AA)G|C) >> and not C<< (?((?<=AA))G|C) >>; for the lookahead, lookbehind or code assertions, the parentheses around the conditional are not needed. @@ -2434,13 +2501,13 @@ Some regular expressions use identical subpatterns in several places. Starting with Perl 5.10, it is possible to define named subpatterns in a section of the pattern so that they can be called up by name anywhere in the pattern. This syntactic pattern for this definition -group is C<< (?(DEFINE)(?<name>pattern)...) >>. An insertion -of a named pattern is written as C<(?&name)>. +group is C<< (?(DEFINE)(?<I<name>>I<pattern>)...) >>. An insertion +of a named pattern is written as C<(?&I<name>)>. The example below illustrates this feature using the pattern for floating point numbers that was presented earlier on. The three subpatterns that are used more than once are the optional sign, the -digit sequence for an integer and the decimal fraction. The DEFINE +digit sequence for an integer and the decimal fraction. The C<DEFINE> group at the end of the pattern contains their definition. Notice that the decimal fraction pattern is the first place where we can reuse the integer pattern. @@ -2460,7 +2527,7 @@ reuse the integer pattern. This feature (introduced in Perl 5.10) significantly extends the power of Perl's pattern matching. 
By referring to some other capture group anywhere in the pattern with the construct -C<(?group-ref)>, the I<pattern> within the referenced group is used +C<(?I<group-ref>)>, the I<pattern> within the referenced group is used as an independent subpattern in place of the group reference itself. Because the group reference may be contained I<within> the group it refers to, it is now possible to apply pattern matching to tasks that @@ -2486,7 +2553,7 @@ have the full pattern: In C<(?...)> both absolute and relative backreferences may be used. The entire pattern can be reinserted with C<(?R)> or C<(?0)>. -If you prefer to name your groups, you can use C<(?&name)> to +If you prefer to name your groups, you can use C<(?&I<name>)> to recurse into that group. @@ -2495,17 +2562,14 @@ recurse into that group. Normally, regexps are a part of Perl expressions. I<Code evaluation> expressions turn that around by allowing arbitrary Perl code to be a part of a regexp. A code evaluation -expression is denoted C<(?{code})>, with I<code> a string of Perl +expression is denoted C<(?{I<code>})>, with I<code> a string of Perl statements. -Be warned that this feature is considered experimental, and may be -changed without notice. - Code expressions are zero-width assertions, and the value they return depends on their environment. There are two possibilities: either the code expression is used as a conditional in a conditional expression -C<(?(condition)...)>, or it is not. If the code expression is a -conditional, the code is evaluated and the result (i.e., the result of +C<(?(I<condition>)...)>, or it is not. If the code expression is a +conditional, the code is evaluated and the result (I<i.e.>, the result of the last statement) is used to determine truth or falsehood. If the code expression is not used as a conditional, the assertion always evaluates true and the result is put into the special variable @@ -2533,12 +2597,12 @@ example: Hmm. What happened here? 
If you've been following along, you know that the above pattern should be effectively (almost) the same as the last one; -enclosing the C<d> in a character class isn't going to change what it +enclosing the C<'d'> in a character class isn't going to change what it matches. So why does the first not print while the second one does? -The answer lies in the optimizations the regex engine makes. In the first +The answer lies in the optimizations the regexp engine makes. In the first case, all the engine sees are plain old characters (aside from the -C<?{}> construct). It's smart enough to realize that the string 'ddd' +C<?{}> construct). It's smart enough to realize that the string C<'ddd'> doesn't occur in our target string before actually running the pattern through. But in the second case, we've tricked it into thinking that our pattern is more complicated. It takes a look, sees our @@ -2548,7 +2612,7 @@ running it hits the print statement before it discovers that we don't have a match. To take a closer look at how the engine does optimizations, see the -section L<"Pragmas and debugging"> below. +section L</"Pragmas and debugging"> below. More fun with C<?{}>: @@ -2564,7 +2628,7 @@ backtracks in the process of searching for a match. If the regexp backtracks over a code expression and if the variables used within are localized using C<local>, the changes in the variables produced by the code expression are undone! Thus, if we wanted to count how many times -a character got matched inside a group, we could use, e.g., +a character got matched inside a group, we could use, I<e.g.>, $x = "aaaa"; $count = 0; # initialize 'a' count @@ -2605,7 +2669,8 @@ The result C<$^R> is automatically localized, so that it will behave properly in the presence of backtracking. 
This example uses a code expression in a conditional to match a -definite article, either 'the' in English or 'der|die|das' in German: +definite article, either C<'the'> in English or C<'der|die|das'> in +German: $lang = 'DE'; # use German ... @@ -2619,8 +2684,8 @@ definite article, either 'the' in English or 'der|die|das' in German: ) /xi; -Note that the syntax here is C<(?(?{...})yes-regexp|no-regexp)>, not -C<(?((?{...}))yes-regexp|no-regexp)>. In other words, in the case of a +Note that the syntax here is C<(?(?{...})I<yes-regexp>|I<no-regexp>)>, not +C<(?((?{...}))I<yes-regexp>|I<no-regexp>)>. In other words, in the case of a code expression, we don't need the extra parentheses around the conditional. @@ -2678,7 +2743,7 @@ expression and matched immediately. A simple example is This final example contains both ordinary and pattern code expressions. It detects whether a binary string C<1101010010001...> has a -Fibonacci spacing 0,1,1,2,3,5,... of the C<1>'s: +Fibonacci spacing 0,1,1,2,3,5,... of the C<'1'>'s: $x = "1101010010001000001"; $z0 = ''; $z1 = '0'; # initial conditions @@ -2707,7 +2772,7 @@ expression. Rather, the whole code block is parsed as perl code at the same time as perl is compiling the code containing the literal regexp pattern. -The regexp without the C<//x> modifier is +This regexp without the C</x> modifier is /^1(?:((??{ $z0 }))1(?{ $z0 = $z1; $z1 .= $^N; }))+$/ @@ -2720,11 +2785,9 @@ regexps is almost necessary in creating and debugging regexps. Perl 5.10 introduced a number of control verbs intended to provide detailed control over the backtracking process, by directly influencing -the regexp engine and by providing monitoring techniques. As all -the features in this group are experimental and subject to change or -removal in a future version of Perl, the interested reader is -referred to L<perlre/"Special Backtracking Control Verbs"> for a -detailed description. +the regexp engine and by providing monitoring techniques. 
See +L<perlre/"Special Backtracking Control Verbs"> for a detailed +description. Below is just one example, illustrating the control verb C<(*FAIL)>, which may be abbreviated as C<(*F)>. If this is inserted in a regexp @@ -2838,7 +2901,7 @@ part describes the compilation stage. C<STAR(4)> means that there is a starred object, in this case C<'a'>, and if it matches, goto line 4, -i.e., C<PLUS(7)>. The middle lines describe some heuristics and +I<i.e.>, C<PLUS(7)>. The middle lines describe some heuristics and optimizations performed before a match: floating 'bc' at 0..2147483647 (checking floating) minlen 2 @@ -2896,11 +2959,6 @@ prints t2 Done at position 4 -=head1 BUGS - -Code expressions, conditional expressions, and independent expressions -are I<experimental>. Don't use them in production code. Yet. - =head1 SEE ALSO This is just a tutorial. For the full story on Perl regular @@ -2916,8 +2974,9 @@ Jeffrey Friedl (published by O'Reilly, ISBN 1556592-257-3). =head1 AUTHOR AND COPYRIGHT -Copyright (c) 2000 Mark Kvale +Copyright (c) 2000 Mark Kvale. All rights reserved. +Now maintained by Perl porters. This document may be distributed under the same terms as Perl itself. diff --git a/gnu/usr.bin/perl/pod/perlrun.pod b/gnu/usr.bin/perl/pod/perlrun.pod index 25ec5e6648a..8f15bc2c98c 100644 --- a/gnu/usr.bin/perl/pod/perlrun.pod +++ b/gnu/usr.bin/perl/pod/perlrun.pod @@ -34,7 +34,7 @@ Specified line by line via B<-e> or B<-E> switches on the command line. Contained in the file specified by the first filename on the command line. (Note that systems supporting the C<#!> notation invoke interpreters this -way. See L<Location of Perl>.) +way. See L</Location of Perl>.) =item 3. @@ -379,11 +379,14 @@ X<-D> X<DEBUGGING> X<-DDEBUGGING> =item B<-D>I<number> -sets debugging flags. To watch how it executes your program, use -B<-Dtls>. (This works only if debugging is compiled into your -Perl.) Another nice value is B<-Dx>, which lists your compiled -syntax tree. 
And B<-Dr> displays compiled regular expressions; -the format of the output is explained in L<perldebguts>. +sets debugging flags. This switch is enabled only if your perl binary has +been built with debugging enabled: normal production perls won't have +been. + +For example, to watch how perl executes your program, use B<-Dtls>. +Another nice value is B<-Dx>, which lists your compiled syntax tree, and +B<-Dr> displays compiled regular expressions; the format of the output is +explained in L<perldebguts>. As an alternative, specify a number instead of list of letters (e.g., B<-D14> is equivalent to B<-Dtls>): @@ -403,7 +406,6 @@ B<-D14> is equivalent to B<-Dtls>): 2048 u Tainting checks 4096 U Unofficial, User hacking (reserved for private, unreleased use) - 8192 H Hash dump -- usurps values() 16384 X Scratchpad allocation 32768 D Cleaning up 65536 S Op slab allocation @@ -429,8 +431,7 @@ All these flags require B<-DDEBUGGING> when you compile the Perl executable (but see C<:opd> in L<Devel::Peek> or L<re/'debug' mode> which may change this). See the F<INSTALL> file in the Perl source distribution -for how to do this. This flag is automatically set if you include B<-g> -option when C<Configure> asks you about optimizer/debugger flags. +for how to do this. If you're just trying to get a print out of each line of Perl code as it executes, the way that C<sh -x> provides for shell scripts, @@ -499,7 +500,7 @@ X<-F> specifies the pattern to split on for B<-a>. The pattern may be surrounded by C<//>, C<"">, or C<''>, otherwise it will be put in single -quotes. You can't use literal whitespace in the pattern. +quotes. You can't use literal whitespace or NUL characters in the pattern. B<-F> implicitly sets both B<-a> and B<-n>. @@ -662,14 +663,19 @@ X<-m> X<-M> =item B<-[mM]>[B<->]I<module=arg[,arg]...> B<-m>I<module> executes C<use> I<module> C<();> before executing your -program. +program. 
This loads the module, but does not call its C<import> method, +so does not import subroutines and does not give effect to a pragma. B<-M>I<module> executes C<use> I<module> C<;> before executing your -program. You can use quotes to add extra code after the module name, +program. This loads the module and calls its C<import> method, causing +the module to have its default effect, typically importing subroutines +or giving effect to a pragma. +You can use quotes to add extra code after the module name, e.g., C<'-MI<MODULE> qw(foo bar)'>. If the first character after the B<-M> or B<-m> is a dash (B<->) then the 'use' is replaced with 'no'. +This makes no difference for B<-m>. A little builtin syntactic sugar means you can also say B<-mI<MODULE>=foo,bar> or B<-MI<MODULE>=foo,bar> as a shortcut for @@ -679,7 +685,8 @@ C<use module split(/,/,q{foo,bar})>. Note that the C<=> form removes the distinction between B<-m> and B<-M>; that is, B<-mI<MODULE>=foo,bar> is the same as B<-MI<MODULE>=foo,bar>. -A consequence of this is that B<-MI<MODULE>=number> never does a version check, +A consequence of the C<split> formulation +is that B<-MI<MODULE>=number> never does a version check, unless C<I<MODULE>::import()> itself is set up to do a version check, which could happen for example if I<MODULE> inherits from L<Exporter>. @@ -993,9 +1000,9 @@ used. =item PERL5LIB X<PERL5LIB> -A list of directories in which to look for Perl library -files before looking in the standard library and the current -directory. Any architecture-specific and version-specific directories, +A list of directories in which to look for Perl library files before +looking in the standard library. +Any architecture-specific and version-specific directories, such as F<version/archname/>, F<version/>, or F<archname/> under the specified locations are automatically included if they exist, with this lookup done at interpreter startup time. 
In addition, any directories @@ -1119,7 +1126,7 @@ A pseudolayer that enables a flag in the layer below to tell Perl that output should be in utf8 and that input should be regarded as already in valid utf8 form. B<WARNING: It does not check for validity and as such should be handled with extreme caution for input, because security violations -can occur with non-shortest UTF-8 encodings, etc.> Generally C<:encoding(utf8)> is +can occur with non-shortest UTF-8 encodings, etc.> Generally C<:encoding(UTF-8)> is the best option when reading UTF-8 encoded data. =item :win32 @@ -1176,7 +1183,7 @@ support. X<PERLLIB> A list of directories in which to look for Perl library -files before looking in the standard library and the current directory. +files before looking in the standard library. If PERL5LIB is defined, PERLLIB is not used. The PERLLIB environment variable is completely ignored when Perl @@ -1377,11 +1384,32 @@ a boolean variable. Setting this to C<"1"> is not the right way to your shell before starting Perl). See the description of the B<-C> switch for more information. +=item PERL_USE_UNSAFE_INC +X<PERL_USE_UNSAFE_INC> + +If perl has been configured to not have the current directory in +L<C<@INC>|perlvar/@INC> by default, this variable can be set to C<"1"> +to reinstate it. It's primarily intended for use while building and +testing modules that have not been updated to deal with "." not being in +C<@INC> and should not be set in the environment for day-to-day use. + =item SYS$LOGIN (specific to the VMS port) X<SYS$LOGIN> Used if chdir has no argument and HOME and LOGDIR are not set. +=item PERL_INTERNAL_RAND_SEED +X<PERL_INTERNAL_RAND_SEED> + +Set to a non-negative integer to seed the random number generator used +internally by perl for a variety of purposes. + +Ignored if perl is run setuid or setgid. Used only for some limited +startup randomization (hash keys) if perl is started with tainting +enabled (C<-T> or C<-t>).
+ +Perl may be built to ignore this variable. + =back Perl also has environment variables that control how Perl handles data diff --git a/gnu/usr.bin/perl/pod/perlsec.pod b/gnu/usr.bin/perl/pod/perlsec.pod index 6eb1bd2115d..b210445685e 100644 --- a/gnu/usr.bin/perl/pod/perlsec.pod +++ b/gnu/usr.bin/perl/pod/perlsec.pod @@ -370,7 +370,7 @@ abusing perl bugs to make the host interpreter crash or behave in unpredictable ways. In any case it's better avoided completely if you're really concerned about security. -=head2 Security Bugs +=head2 Shebang Race Condition Beyond the obvious problems that stem from giving special privileges to systems as flexible as scripts, on many versions of Unix, set-id scripts @@ -380,39 +380,60 @@ see which interpreter to run and when the (now-set-id) interpreter turns around and reopens the file to interpret it, the file in question may have changed, especially if you have symbolic links on your system. -Fortunately, sometimes this kernel "feature" can be disabled. -Unfortunately, there are two ways to disable it. The system can simply -outlaw scripts with any set-id bit set, which doesn't help much. -Alternately, it can simply ignore the set-id bits on scripts. +Some Unixes, especially more recent ones, are free of this +inherent security bug. On such systems, when the kernel passes the name +of the set-id script to open to the interpreter, rather than using a +pathname subject to meddling, it instead passes I</dev/fd/3>. This is a +special file already opened on the script, so that there can be no race +condition for evil scripts to exploit. On these systems, Perl should be +compiled with C<-DSETUID_SCRIPTS_ARE_SECURE_NOW>. The F<Configure> +program that builds Perl tries to figure this out for itself, so you +should never have to specify this yourself. Most modern releases of +SysVr4 and BSD 4.4 use this approach to avoid the kernel race condition. 
-However, if the kernel set-id script feature isn't disabled, Perl will -complain loudly that your set-id script is insecure. You'll need to -either disable the kernel set-id script feature, or put a C wrapper around +If you don't have the safe version of set-id scripts, all is not lost. +Sometimes this kernel "feature" can be disabled, so that the kernel +either doesn't run set-id scripts with the set-id or doesn't run them +at all. Either way avoids the exploitability of the race condition, +but doesn't help in actually running scripts set-id. + +If the kernel set-id script feature isn't disabled, then any set-id +script provides an exploitable vulnerability. Perl can't avoid being +exploitable, but will point out vulnerable scripts where it can. If Perl +detects that it is being applied to a set-id script then it will complain +loudly that your set-id script is insecure, and won't run it. When Perl +complains, you need to remove the set-id bit from the script to eliminate +the vulnerability. Refusing to run the script doesn't in itself close +the vulnerability; it is just Perl's way of encouraging you to do this. + +To actually run a script set-id, if you don't have the safe version of +set-id scripts, you'll need to put a C wrapper around the script. A C wrapper is just a compiled program that does nothing except call your Perl program. Compiled programs are not subject to the kernel bug that plagues set-id scripts. Here's a simple wrapper, written in C: + #include <unistd.h> + #include <stdio.h> + #include <string.h> + #include <errno.h> + #define REAL_PATH "/path/to/script" - main(ac, av) - char **av; + + int main(int argc, char **argv) { - execv(REAL_PATH, av); + execv(REAL_PATH, argv); + fprintf(stderr, "%s: %s: %s\n", + argv[0], REAL_PATH, strerror(errno)); + return 127; } Compile this wrapper into a binary executable and then make I<it> rather -than your script setuid or setgid. 
- -In recent years, vendors have begun to supply systems free of this -inherent security bug. On such systems, when the kernel passes the name -of the set-id script to open to the interpreter, rather than using a -pathname subject to meddling, it instead passes I</dev/fd/3>. This is a -special file already opened on the script, so that there can be no race -condition for evil scripts to exploit. On these systems, Perl should be -compiled with C<-DSETUID_SCRIPTS_ARE_SECURE_NOW>. The F<Configure> -program that builds Perl tries to figure this out for itself, so you -should never have to specify this yourself. Most modern releases of -SysVr4 and BSD 4.4 use this approach to avoid the kernel race condition. +than your script setuid or setgid. Note that this wrapper isn't doing +anything to sanitise the execution environment other than ensuring +that a safe path to the script is used. It only avoids the shebang +race condition. It relies on Perl's own features, and on the script +itself being careful, to make it safe enough to run the script set-id. =head2 Protecting Your Programs @@ -566,16 +587,38 @@ Perl running out of memory. =item * Sorting - the quicksort algorithm used in Perls before 5.8.0 to -implement the sort() function is very easy to trick into misbehaving +implement the sort() function was very easy to trick into misbehaving so that it consumes a lot of time. Starting from Perl 5.8.0 a different sorting algorithm, mergesort, is used by default. Mergesort cannot misbehave on any input. =back -See L<http://www.cs.rice.edu/~scrosby/hash/> for more information, +See L<https://www.usenix.org/legacy/events/sec03/tech/full_papers/crosby/crosby.pdf> for more information, and any computer science textbook on algorithmic complexity. +=head2 Using Sudo + +The popular tool C<sudo> provides a controlled way for users to be able +to run programs as other users. 
It sanitises the execution environment +to some extent, and will avoid the L<shebang race condition|/"Shebang +Race Condition">. If you don't have the safe version of set-id scripts, +then C<sudo> may be a more convenient way of executing a script as +another user than writing a C wrapper would be. + +However, C<sudo> sets the real user or group ID to that of the target +identity, not just the effective ID as set-id bits do. As a result, Perl +can't detect that it is running under C<sudo>, and so won't automatically +take its own security precautions such as turning on taint mode. Where +C<sudo> configuration dictates exactly which command can be run, the +approved command may include a C<-T> option to perl to enable taint mode. + +In general, it is necessary to evaluate the suitability of a script to +run under C<sudo> specifically with that kind of execution environment +in mind. It is neither necessary nor sufficient for the same script to +be suitable to run in a traditional set-id arrangement, though many of +the issues overlap. + =head1 SEE ALSO L<perlrun> for its description of cleaning up environment variables. diff --git a/gnu/usr.bin/perl/pod/perlstyle.pod b/gnu/usr.bin/perl/pod/perlstyle.pod index 37dfaaf1413..5c2534581e9 100644 --- a/gnu/usr.bin/perl/pod/perlstyle.pod +++ b/gnu/usr.bin/perl/pod/perlstyle.pod @@ -210,8 +210,9 @@ function should not be used outside the package that defined it. =item * -If you have a really hairy regular expression, use the C</x> modifier and -put in some whitespace to make it look a little less like line noise. +If you have a really hairy regular expression, use the C</x> or C</xx> +modifiers and put in some whitespace to make it look a little less like +line noise. Don't use slash as a delimiter when your regexp has slashes or backslashes.
=item * diff --git a/gnu/usr.bin/perl/pod/perlsub.pod b/gnu/usr.bin/perl/pod/perlsub.pod index 78de2847338..a761e3d0784 100644 --- a/gnu/usr.bin/perl/pod/perlsub.pod +++ b/gnu/usr.bin/perl/pod/perlsub.pod @@ -15,20 +15,25 @@ X<subroutine, declaration> X<sub> sub NAME BLOCK # A declaration and a definition. sub NAME(PROTO) BLOCK # ditto, but with prototypes - sub NAME(SIG) BLOCK # with a signature instead sub NAME : ATTRS BLOCK # with attributes sub NAME(PROTO) : ATTRS BLOCK # with prototypes and attributes - sub NAME(SIG) : ATTRS BLOCK # with a signature and attributes + + use feature 'signatures'; + sub NAME(SIG) BLOCK # with signature + sub NAME :ATTRS (SIG) BLOCK # with signature, attributes + sub NAME :prototype(PROTO) (SIG) BLOCK # with signature, prototype To define an anonymous subroutine at runtime: X<subroutine, anonymous> $subref = sub BLOCK; # no proto $subref = sub (PROTO) BLOCK; # with proto - $subref = sub (SIG) BLOCK; # with signature $subref = sub : ATTRS BLOCK; # with attributes $subref = sub (PROTO) : ATTRS BLOCK; # with proto and attributes - $subref = sub (SIG) : ATTRS BLOCK; # with signature and attributes + + use feature 'signatures'; + $subref = sub (SIG) BLOCK; # with signature + $subref = sub : ATTRS(SIG) BLOCK; # with signature, attributes To import subroutines: X<import> @@ -98,8 +103,8 @@ Aside from an experimental facility (see L</Signatures> below), Perl does not have named formal parameters. In practice all you do is assign to a C<my()> list of these. Variables that aren't declared to be private are global variables. For gory details -on creating private variables, see L<"Private Variables via my()"> -and L<"Temporary Values via local()">. To create protected +on creating private variables, see L</"Private Variables via my()"> +and L</"Temporary Values via local()">. To create protected environments for a set of functions in a separate package (and probably a separate file), see L<perlmod/"Packages">. 
X<formal parameter> X<parameter, formal> @@ -192,7 +197,7 @@ Do not, however, be tempted to do this: Like the flattened incoming parameter list, the return list is also flattened on return. So all you have managed to do here is stored everything in C<@a> and made C<@b> empty. See -L<Pass by Reference> for alternatives. +L</Pass by Reference> for alternatives. A subroutine may be called using an explicit C<&> prefix. The C<&> is optional in modern Perl, as are parentheses if the @@ -317,10 +322,15 @@ a warning unless the "experimental::signatures" warnings category is disabled. The signature is part of a subroutine's body. Normally the body of a -subroutine is simply a braced block of code. When using a signature, -the signature is a parenthesised list that goes immediately after -the subroutine name (or, for anonymous subroutines, immediately after -the C<sub> keyword). The signature declares lexical variables that are +subroutine is simply a braced block of code, but when using a signature, +the signature is a parenthesised list that goes immediately before the +block, after any name or attributes. + +For example, + + sub foo :lvalue ($a, $b = 1, @c) { .... } + +The signature declares lexical variables that are in scope for the block. When the subroutine is called, the signature takes control first. It populates the signature variables from the list of arguments that were passed. If the argument list doesn't meet @@ -490,12 +500,13 @@ a signature. They do different jobs: the prototype affects compilation of calls to the subroutine, and the signature puts argument values into lexical variables at runtime. You can therefore write - sub foo ($left, $right) : prototype($$) { + sub foo :prototype($$) ($left, $right) { return $left + $right; } -The prototype attribute, and any other attributes, come after -the signature. +The prototype attribute, and any other attributes, must come before +the signature. 
The signature always immediately precedes the block of +the subroutine's body. =head2 Private Variables via my() X<my> X<variable, lexical> X<lexical> X<lexical variable> X<scope, lexical> @@ -736,10 +747,11 @@ And this example uses anonymous subroutines to create separate counters: Also, since C<$x> is lexical, it can't be reached or modified by any Perl code outside. -When combined with variable declaration, simple scalar assignment to C<state> +When combined with variable declaration, simple assignment to C<state> variables (as in C<state $x = 42>) is executed only the first time. When such statements are evaluated subsequent times, the assignment is ignored. The -behavior of this sort of assignment to non-scalar variables is undefined. +behavior of assignment to C<state> declarations where the left hand side +of the assignment involves any parentheses is currently undefined. =head3 Persistent variables with closures @@ -1056,42 +1068,63 @@ using the CPAN module Sentinel or something similar. =head2 Lexical Subroutines X<my sub> X<state sub> X<our sub> X<subroutine, lexical> -B<WARNING>: Lexical subroutines are still experimental. The feature may be -modified or removed in future versions of Perl. - -Lexical subroutines are only available under the C<use feature -'lexical_subs'> pragma, which produces a warning unless the -"experimental::lexical_subs" warnings category is disabled. - Beginning with Perl 5.18, you can declare a private subroutine with C<my> or C<state>. As with state variables, the C<state> keyword is only available under C<use feature 'state'> or C<use 5.010> or higher. +Prior to Perl 5.26, lexical subroutines were deemed experimental and were +available only under the C<use feature 'lexical_subs'> pragma. They also +produced a warning unless the "experimental::lexical_subs" warnings +category was disabled. 
+ These subroutines are only visible within the block in which they are declared, and only after that declaration: + # Include these two lines if your code is intended to run under Perl + # versions earlier than 5.26. no warnings "experimental::lexical_subs"; use feature 'lexical_subs'; - foo(); # calls the package/global subroutine + foo(); # calls the package/global subroutine state sub foo { - foo(); # also calls the package subroutine + foo(); # also calls the package subroutine } - foo(); # calls "state" sub - my $ref = \&foo; # take a reference to "state" sub + foo(); # calls "state" sub + my $ref = \&foo; # take a reference to "state" sub my sub bar { ... } - bar(); # calls "my" sub + bar(); # calls "my" sub -To use a lexical subroutine from inside the subroutine itself, you must -predeclare it. The C<sub foo {...}> subroutine definition syntax respects -any previous C<my sub;> or C<state sub;> declaration. +You can't (directly) write a recursive lexical subroutine: - my sub baz; # predeclaration - sub baz { # define the "my" sub - baz(); # recursive call + # WRONG + my sub baz { + baz(); } +This example fails because C<baz()> refers to the package/global subroutine +C<baz>, not the lexical subroutine currently being defined. + +The solution is to use L<C<__SUB__>|perlfunc/__SUB__>: + + my sub baz { + __SUB__->(); # calls itself + } + +It is possible to predeclare a lexical subroutine. The C<sub foo {...}> +subroutine definition syntax respects any previous C<my sub;> or C<state sub;> +declaration. Using this to define recursive subroutines is a bad idea, +however: + + my sub baz; # predeclaration + sub baz { # define the "my" sub + baz(); # WRONG: calls itself, but leaks memory + } + +Just like C<< my $f; $f = sub { $f->() } >>, this example leaks memory. The +name C<baz> is a reference to the subroutine, and the subroutine uses the name +C<baz>; they keep each other alive (see L<perlref/Circular References>). 
+ =head3 C<state sub> vs C<my sub> What is the difference between "state" subs and "my" subs? Each time that @@ -1102,9 +1135,6 @@ containing block to the next. So, in general, "state" subroutines are faster. But "my" subs are necessary if you want to create closures: - no warnings "experimental::lexical_subs"; - use feature 'lexical_subs'; - sub whatever { my $x = shift; my sub inner { @@ -1125,9 +1155,6 @@ subroutine of the same name. The two main uses for this are to switch back to using the package sub inside an inner scope: - no warnings "experimental::lexical_subs"; - use feature 'lexical_subs'; - sub foo { ... } sub bar { @@ -1143,9 +1170,6 @@ and to make a subroutine visible to other packages in the same scope: package MySneakyModule; - no warnings "experimental::lexical_subs"; - use feature 'lexical_subs'; - our sub do_something { ... } sub do_something_with_caller { @@ -1591,14 +1615,14 @@ and someone has been calling it with an array or expression returning a list: func(@foo); - func( split /:/ ); + func( $text =~ /\w+/g ); Then you've just supplied an automatic C<scalar> in front of their argument, which can be more than a bit surprising. The old C<@foo> which used to hold one thing doesn't get passed in. Instead, C<func()> now gets passed in a C<1>; that is, the number of elements -in C<@foo>. And the C<split> gets called in scalar context so it -starts scribbling on your C<@_> parameter list. Ouch! +in C<@foo>. And the C<m//g> gets called in scalar context so instead of a +list of words it returns a boolean result and advances C<pos($text)>. Ouch! If a sub has both a PROTO and a BLOCK, the prototype is not applied until after the BLOCK is completely defined. 
This means that a recursive diff --git a/gnu/usr.bin/perl/pod/perlsyn.pod b/gnu/usr.bin/perl/pod/perlsyn.pod index 09cfd13b98b..d63108f2752 100644 --- a/gnu/usr.bin/perl/pod/perlsyn.pod +++ b/gnu/usr.bin/perl/pod/perlsyn.pod @@ -116,16 +116,6 @@ C<do {}> that I<look> like compound statements, but aren't--they're just TERMs in an expression--and thus need an explicit termination when used as the last item in a statement. -=head2 Truth and Falsehood -X<truth> X<falsehood> X<true> X<false> X<!> X<not> X<negation> X<0> - -The number 0, the strings C<'0'> and C<"">, the empty list C<()>, and -C<undef> are all false in a boolean context. All other values are true. -Negation of a true value by C<!> or C<not> returns a special false value. -When evaluated as a string it is treated as C<"">, but as a number, it -is treated as 0. Most Perl operators -that return true or false behave this way. - =head2 Statement Modifiers X<statement modifier> X<modifier> X<if> X<unless> X<while> X<until> X<when> X<foreach> X<for> @@ -158,6 +148,8 @@ for each item in the LIST (with C<$_> aliased to each item in turn). print "Hello $_!\n" for qw(world Dolly nurse); C<while> repeats the statement I<while> the condition is true. +Postfix C<while> has the same magic treatment of some kinds of condition +that prefix C<while> has. C<until> does the opposite, it repeats the statement I<until> the condition is true (or while the condition is false): @@ -231,8 +223,8 @@ a C<next> from inside a C<foreach> and C<break> from inside a C<given>. Under the current implementation, the C<foreach> loop can be anywhere within the C<when> modifier's dynamic scope, but must be -within the C<given> block's lexical scope. This restricted may -be relaxed in a future release. See L<"Switch Statements"> below. +within the C<given> block's lexical scope. This restriction may +be relaxed in a future release. See L</"Switch Statements"> below. 
=head2 Compound Statements X<statement, compound> X<block> X<bracket, curly> X<curly bracket> X<brace> @@ -243,8 +235,16 @@ Sometimes a block is delimited by the file containing it (in the case of a required file, or the program as a whole), and sometimes a block is delimited by the extent of a string (in the case of an eval). -But generally, a block is delimited by curly brackets, also known as braces. -We will call this syntactic construct a BLOCK. +But generally, a block is delimited by curly brackets, also known as +braces. We will call this syntactic construct a BLOCK. Because enclosing +braces are also the syntax for hash reference constructor expressions +(see L<perlref>), you may occasionally need to disambiguate by placing a +C<;> immediately after an opening brace so that Perl realises the brace +is the start of a block. You will more frequently need to disambiguate +the other way, by placing a C<+> immediately before an opening brace to +force it to be interpreted as a hash reference constructor expression. +It is considered good style to use these disambiguating mechanisms +liberally, not only when Perl would otherwise guess incorrectly. The following compound statements may be used to control flow: @@ -304,7 +304,7 @@ language construct, as everyone reading your code will have to think at least twice before they can understand what's going on. The C<while> statement executes the block as long as the expression is -L<true|/"Truth and Falsehood">. +true. The C<until> statement executes the block as long as the expression is false. The LABEL is optional, and if present, consists of an identifier followed @@ -316,6 +316,20 @@ looking back your call-stack at run time to find the LABEL. Such desperate behavior triggers a warning if you use the C<use warnings> pragma or the B<-w> flag. +If the condition expression of a C<while> statement is based +on any of a group of iterative expression types then it gets +some magic treatment. 
The affected iterative expression types +are L<C<readline>|perlfunc/readline EXPR>, the L<C<< <FILEHANDLE> +>>|perlop/"I/O Operators"> input operator, L<C<readdir>|perlfunc/readdir +DIRHANDLE>, L<C<glob>|perlfunc/glob EXPR>, the L<C<< <PATTERN> +>>|perlop/"I/O Operators"> globbing operator, and L<C<each>|perlfunc/each +HASH>. If the condition expression is one of these expression types, then +the value yielded by the iterative operator will be implicitly assigned +to C<$_>. If the condition expression is one of these expression types +or an explicit assignment of one of them to a scalar, then the condition +actually tests for definedness of the expression's value, not for its +regular truth value. + If there is a C<continue> BLOCK, it is always executed just before the conditional is about to be evaluated again. Thus it can be used to increment a loop variable, even when the loop has been continued via @@ -411,7 +425,7 @@ they aren't loops. You can double the braces to make them such, though. }} This is caused by the fact that a block by itself acts as a loop that -executes once, see L<"Basic BLOCKs">. +executes once, see L</"Basic BLOCKs">. The form C<while/if BLOCK BLOCK>, available in Perl 4, is no longer available. Replace any occurrence of C<if BLOCK> by C<if (do BLOCK)>. @@ -469,14 +483,8 @@ X<eof> X<end-of-file> X<end of file> # do something } -Using C<readline> (or the operator form, C<< <EXPR> >>) as the -conditional of a C<for> loop is shorthand for the following. This -behaviour is the same as a C<while> loop conditional. -X<readline> X<< <> >> - - for ( prompt(); defined( $_ = <STDIN> ); prompt() ) { - # do something - } +The condition expression of a C<for> loop gets the same magic treatment of +C<readline> et al that the condition expression of a C<while> loop gets. 
=head2 Foreach Loops X<for> X<foreach> @@ -638,8 +646,8 @@ Starting from Perl 5.16, one can prefix the switch keywords with C<CORE::> to access the feature without a C<use feature> statement. The keywords C<given> and C<when> are analogous to C<switch> and -C<case> in other languages, so the code in the previous section could be -rewritten as +C<case> in other languages -- though C<continue> is not -- so the code +in the previous section could be rewritten as use v5.10.1; for ($var) { @@ -754,12 +762,7 @@ X<whatever operator> X<triple-dot operator> Beginning in Perl 5.12, Perl accepts an ellipsis, "C<...>", as a -placeholder for code that you haven't implemented yet. This form of -ellipsis, the unimplemented statement, should not be confused with the -binary flip-flop C<...> operator. One is a statement and the other an -operator. (Perl doesn't usually confuse them because usually Perl can tell -whether it wants an operator or a statement, but see below for exceptions.) - +placeholder for code that you haven't implemented yet. When Perl 5.12 or later encounters an ellipsis statement, it parses this without error, but if and when you should actually try to execute it, Perl throws an exception with the text C<Unimplemented>: @@ -771,8 +774,11 @@ throws an exception with the text C<Unimplemented>: say "I found an ellipsis!"; } -You can only use the elliptical statement to stand in for a -complete statement. These examples of how the ellipsis works: +You can only use the elliptical statement to stand in for a complete +statement. Syntactically, "C<...;>" is a complete statement, but, +as with other kinds of semicolon-terminated statement, the semicolon +may be omitted if "C<...>" appears immediately before a closing brace. +These examples show how the ellipsis works: use v5.12; { ... } @@ -791,9 +797,7 @@ complete statement. 
These examples of how the ellipsis works: }; The elliptical statement cannot stand in for an expression that -is part of a larger statement, since the C<...> is also the three-dot -version of the flip-flop operator (see L<perlop/"Range Operators">). - +is part of a larger statement. These examples of attempts to use an ellipsis are syntax errors: use v5.12; @@ -801,22 +805,17 @@ These examples of attempts to use an ellipsis are syntax errors: print ...; open(my $fh, ">", "/dev/passwd") or ...; if ($condition && ... ) { say "Howdy" }; + ... if $a > $b; + say "Cromulent" if ...; + $flub = 5 + ...; There are some cases where Perl can't immediately tell the difference between an expression and a statement. For instance, the syntax for a block and an anonymous hash reference constructor look the same unless there's something in the braces to give Perl a hint. The ellipsis is a -syntax error if Perl doesn't guess that the C<{ ... }> is a block. In that -case, it doesn't think the C<...> is an ellipsis because it's expecting an -expression instead of a statement: - - @transformed = map { ... } @input; # syntax error - +syntax error if Perl doesn't guess that the C<{ ... }> is a block. Inside your block, you can use a C<;> before the ellipsis to denote that the -C<{ ... }> is a block and not a hash reference constructor. Now the ellipsis -works: - - @transformed = map {; ... } @input; # ';' disambiguates +C<{ ... }> is a block and not a hash reference constructor. Note: Some folks colloquially refer to this bit of punctuation as a "yada-yada" or "triple-dot", but its true name @@ -993,7 +992,7 @@ the form C<!/REGEX/>, C<$foo !~ /REGEX/>, or C<$foo !~ EXPR>. A smart match that uses an explicit C<~~> operator, such as C<EXPR ~~ EXPR>. B<NOTE:> You will often have to use C<$c ~~ $_> because the default case -uses C<$_ ~~ $c> , which is frequentlythe opposite of what you want. +uses C<$_ ~~ $c> , which is frequently the opposite of what you want. =item Z<>4. 
@@ -1118,7 +1117,7 @@ a C<break>. =head3 Fall-through You can use the C<continue> keyword to fall through from one -case to the next: +case to the next immediate C<when> or C<default>: given($foo) { when (/x/) { say '$foo contains an x'; continue } diff --git a/gnu/usr.bin/perl/pod/perlthrtut.pod b/gnu/usr.bin/perl/pod/perlthrtut.pod index f5e35a3a5ec..956214fbd06 100644 --- a/gnu/usr.bin/perl/pod/perlthrtut.pod +++ b/gnu/usr.bin/perl/pod/perlthrtut.pod @@ -1104,7 +1104,7 @@ Here's a short bibliography courtesy of Jürgen Christoffel: Birrell, Andrew D. An Introduction to Programming with Threads. Digital Equipment Corporation, 1989, DEC-SRC Research Report #35 online as -ftp://ftp.dec.com/pub/DEC/SRC/research-reports/SRC-035.pdf +L<ftp://ftp.dec.com/pub/DEC/SRC/research-reports/SRC-035.pdf> (highly recommended) Robbins, Kay. A., and Steven Robbins. Practical Unix Programming: A diff --git a/gnu/usr.bin/perl/pod/perltie.pod b/gnu/usr.bin/perl/pod/perltie.pod index 7b89f570adc..5ee19c09ec0 100644 --- a/gnu/usr.bin/perl/pod/perltie.pod +++ b/gnu/usr.bin/perl/pod/perltie.pod @@ -504,8 +504,9 @@ reports whether a key is present in the hash, and DELETE deletes one. CLEAR empties the hash by deleting all the key and value pairs. FIRSTKEY and NEXTKEY implement the keys() and each() functions to iterate over all the keys. SCALAR is triggered when the tied hash is evaluated in scalar -context. UNTIE is called when C<untie> happens, and DESTROY is called when -the tied variable is garbage collected. +context, and in 5.28 onwards, by C<keys> in boolean context. UNTIE is +called when C<untie> happens, and DESTROY is called when the tied variable +is garbage collected. If this seems like a lot, then feel free to inherit from merely the standard Tie::StdHash module for most of your methods, redefining only the @@ -805,9 +806,10 @@ thing, but we'll have to go through the LIST field indirectly. 
=item SCALAR this X<SCALAR> -This is called when the hash is evaluated in scalar context. In order -to mimic the behaviour of untied hashes, this method should return a -false value when the tied hash is considered empty. If this method does +This is called when the hash is evaluated in scalar context, and in 5.28 +onwards, by C<keys> in boolean context. In order to mimic the behaviour of +untied hashes, this method must return a value which, when used as a boolean, +indicates whether the tied hash is considered empty. If this method does not exist, perl will make some educated guesses and return true when the hash is inside an iteration. If this isn't the case, FIRSTKEY is called, and the result will be a false value if FIRSTKEY returns the empty @@ -828,6 +830,11 @@ referenced by C<$self-E<gt>{LIST}>: return scalar %{ $self->{LIST} } } +NOTE: In perl 5.25 the behavior of scalar %hash on an untied hash changed +to return the count of keys. Prior to this it returned a string containing +information about the bucket setup of the hash. See +L<Hash::Util/bucket_ratio> for a backwards compatibility path. + =item UNTIE this X<UNTIE> @@ -1196,10 +1203,11 @@ modules L<Tie::Scalar>, L<Tie::Array>, L<Tie::Hash>, or L<Tie::Handle>. =head1 BUGS -The bucket usage information provided by C<scalar(%hash)> is not +The normal return provided by C<scalar(%hash)> is not available. What this means is that using %tied_hash in boolean context doesn't work right (currently this always tests false, regardless of whether the hash is empty or has elements). +[ This paragraph needs review in light of changes in 5.25 ] Localizing tied arrays or hashes does not work. After exiting the scope the arrays or the hashes are not restored.
diff --git a/gnu/usr.bin/perl/pod/perlunicode.pod b/gnu/usr.bin/perl/pod/perlunicode.pod index 4222c43e0df..9c9111dce03 100644 --- a/gnu/usr.bin/perl/pod/perlunicode.pod +++ b/gnu/usr.bin/perl/pod/perlunicode.pod @@ -36,8 +36,8 @@ Unicode support is an extensive requirement. While Perl does not implement the Unicode standard or the accompanying technical reports from cover to cover, Perl does support many Unicode features. -Also, the use of Unicode may present security issues that aren't obvious. -Read L<Unicode Security Considerations|http://www.unicode.org/reports/tr36>. +Also, the use of Unicode may present security issues that aren't +obvious, see L</Security Implications of Unicode>. =over 4 @@ -60,10 +60,11 @@ filenames. Use the C<:encoding(...)> layer to read from and write to filehandles using the specified encoding. (See L<open>.) -=item You should convert your non-ASCII, non-UTF-8 Perl scripts to be +=item You must convert your non-ASCII, non-UTF-8 Perl scripts to be UTF-8. -See L<encoding>. +The L<encoding> module has been deprecated since perl 5.18 and the +perl internals it requires have been removed with perl 5.26. =item C<use utf8> still needed to enable L<UTF-8|/Unicode Encodings> in scripts @@ -73,14 +74,16 @@ recognition of that (in string or regular expression literals, or in identifier names). B<This is the only time when an explicit S<C<use utf8>> is needed.> (See L<utf8>). -=item C<BOM>-marked scripts and L<UTF-16|/Unicode Encodings> scripts autodetected +If a Perl script begins with the bytes that form the UTF-8 encoding of +the Unicode BYTE ORDER MARK (C<BOM>, see L</Unicode Encodings>), those +bytes are completely ignored. 
-However, if a Perl script begins with the Unicode C<BOM> (UTF-16LE, -UTF16-BE, or UTF-8), or if the script looks like non-C<BOM>-marked +=item L<UTF-16|/Unicode Encodings> scripts autodetected + +If a Perl script begins with the Unicode C<BOM> (UTF-16LE, +UTF-16BE), or if the script looks like non-C<BOM>-marked UTF-16 of either endianness, Perl will correctly read in the script as -the appropriate Unicode encoding. (C<BOM>-less UTF-8 cannot be -effectively recognized or differentiated from ISO 8859-1 or other -eight-bit encodings.) +the appropriate Unicode encoding. =back @@ -162,7 +165,7 @@ contain characters that have ordinal values larger than 255. If you use a Unicode editor to edit your program, Unicode characters may occur directly within the literal strings in UTF-8 encoding, or UTF-16. -(The former requires a C<BOM> or C<use utf8>, the latter requires a C<BOM>.) +(The former requires a C<use utf8>, the latter may require a C<BOM>.) L<perluniintro/Creating Unicode> gives other ways to place non-ASCII characters in your strings. @@ -189,11 +192,12 @@ C<scalar reverse()> reverses by character rather than by byte. =item * The bit string operators, C<& | ^ ~> and (starting in v5.22) -C<&. |. ^. ~.> can operate on characters that don't fit into a byte. -However, the current behavior is likely to change. You should not use -these operators on strings that are encoded in UTF-8. If you're not -sure about the encoding of a string, downgrade it before using any of -these operators; you can use +C<&. |. ^. ~.> can operate on bit strings encoded in UTF-8, but this +can give unexpected results if any of the strings contain code points +above 0xFF. Starting in v5.28, it is a fatal error to have such an +operand. Otherwise, the operation is performed on a non-UTF-8 copy of +the operand. If you're not sure about the encoding of a string, +downgrade it before using any of these operators; you can use L<C<utf8::utf8_downgrade()>|utf8/Utility functions>.
=back @@ -206,7 +210,8 @@ Semantics". Before Unicode, when a character was a byte was a character, Perl knew only about the 128 characters defined by ASCII, code points 0 -through 127 (except for under S<C<use locale>>). That left the code +through 127 (except for under L<S<C<use locale>>|perllocale>). That +left the code points 128 to 255 as unassigned, and available for whatever use a program might want. The only semantics they have is their ordinal numbers, and that they are members of none of the non-negative character @@ -229,7 +234,7 @@ Unicode: Within the scope of S<C<use utf8>> If the whole program is Unicode (signified by using 8-bit B<U>nicode -B<T>ransformation B<F>ormat), then all strings within it must be +B<T>ransformation B<F>ormat), then all literal strings within it must be Unicode. =item * @@ -389,7 +394,7 @@ other. You may be presented with strings in any of these equivalent forms. There is currently nothing in Perl 5 that ignores the differences. So -you'll have to specially hanlde it. The usual advice is to convert your +you'll have to specially handle it. The usual advice is to convert your inputs to C<NFD> before processing further. For more detailed information, see L<http://unicode.org/reports/tr15/>. @@ -602,16 +607,19 @@ The world's languages are written in many different scripts. This sentence written in Cyrillic, and Greek is written in, well, Greek; Japanese mainly in Hiragana or Katakana. There are many more. -The Unicode C<Script> and C<Script_Extensions> properties give what script a -given character is in. Either property can be specified with the -compound form like +The Unicode C<Script> and C<Script_Extensions> properties give what +script a given character is in. The C<Script_Extensions> property is an +improved version of C<Script>, as demonstrated below. Either property +can be specified with the compound form like C<\p{Script=Hebrew}> (short: C<\p{sc=hebr}>), or C<\p{Script_Extensions=Javanese}> (short: C<\p{scx=java}>). 
In addition, Perl furnishes shortcuts for all -C<Script> property names. You can omit everything up through the equals -(or colon), and simply write C<\p{Latin}> or C<\P{Cyrillic}>. -(This is not true for C<Script_Extensions>, which is required to be -written in the compound form.) +C<Script_Extensions> property names. You can omit everything up through +the equals (or colon), and simply write C<\p{Latin}> or C<\P{Cyrillic}>. +(This is not true for C<Script>, which is required to be +written in the compound form. Prior to Perl v5.26, the single form +returned the plain old C<Script> version, but was changed because +C<Script_Extensions> gives better results.) The difference between these two properties involves characters that are used in multiple scripts. For example the digits '0' through '9' are @@ -645,7 +653,11 @@ fewer characters in the C<Common> script, and correspondingly more in other scripts. It is new in Unicode version 6.0, and its data are likely to change significantly in later releases, as things get sorted out. New code should probably be using C<Script_Extensions> and not plain -C<Script>. +C<Script>. If you compile perl with a Unicode release that doesn't have +C<Script_Extensions>, the single form Perl extensions will instead refer +to the plain C<Script> property. If you compile with a version of +Unicode that doesn't have the C<Script> property, these extensions will +not be defined at all. (Actually, besides C<Common>, the C<Inherited> script, contains characters that are used in multiple scripts. These are modifier @@ -658,15 +670,18 @@ C<Script>, but not in C<Script_Extensions>.) It is worth stressing that there are several different sets of digits in Unicode that are equivalent to 0-9 and are matchable by C<\d> in a regular expression. If they are used in a single language only, they -are in that language's C<Script> and C<Script_Extension>. If they are +are in that language's C<Script> and C<Script_Extensions>. 
If they are used in more than one script, they will be in C<sc=Common>, but only if they are used in many scripts should they be in C<scx=Common>. +The explanation above has omitted some detail; refer to UAX#24 "Unicode +Script Property": L<http://www.unicode.org/reports/tr24>. + A complete list of scripts and their shortcuts is in L<perluniprops>. =head3 B<Use of the C<"Is"> Prefix> -For backward compatibility (with Perl 5.6), all properties writable +For backward compatibility (with ancient Perl 5.6), all properties writable without using the compound form mentioned so far may have C<Is> or C<Is_> prepended to their name, so C<\P{Is_Lu}>, for example, is equal to C<\P{Lu}>, and C<\p{IsScript:Arabic}> is equal to @@ -690,7 +705,7 @@ C<Common> script. For more about scripts versus blocks, see UAX#24 "Unicode Script Property": L<http://www.unicode.org/reports/tr24> -The C<Script> or C<Script_Extensions> properties are likely to be the +The C<Script_Extensions> or C<Script> properties are likely to be the ones you want to use when processing natural language; the C<Block> property may occasionally be useful in working with the nuts and bolts of Unicode. @@ -711,10 +726,11 @@ longer work. The extensions are mentioned here for completeness: Take the block name and prefix it with one of: C<In> (for example C<\p{Blk=Arrows}> can currently be written as C<\p{In_Arrows}>); or sometimes C<Is> (like C<\p{Is_Arrows}>); or sometimes no prefix at all -(C<\p{Arrows}>). As of this writing (Unicode 8.0) there are no +(C<\p{Arrows}>). As of this writing (Unicode 9.0) there are no conflicts with using the C<In_> prefix, but there are plenty with the other two forms. For example, C<\p{Is_Hebrew}> and C<\p{Hebrew}> mean -C<\p{Script=Hebrew}> which is NOT the same thing as C<\p{Blk=Hebrew}>. Our +C<\p{Script_Extensions=Hebrew}> which is NOT the same thing as +C<\p{Blk=Hebrew}>. Our advice used to be to use the C<In_> prefix as a single form way of specifying a block. 
But Unicode 8.0 added properties whose names begin with C<In>, and it's now clear that it's only luck that's so far @@ -1069,38 +1085,40 @@ See L<Encode>. =head2 Unicode Regular Expression Support Level The following list of Unicode supported features for regular expressions describes -all features currently directly supported by core Perl. The references to "Level N" -and the section numbers refer to the Unicode Technical Standard #18, -"Unicode Regular Expressions", version 13, from August 2008. - -=over 4 - -=item * - -Level 1 - Basic Unicode Support - - RL1.1 Hex Notation - done [1] - RL1.2 Properties - done [2][3] - RL1.2a Compatibility Properties - done [4] - RL1.3 Subtraction and Intersection - experimental [5] - RL1.4 Simple Word Boundaries - done [6] - RL1.5 Simple Loose Matches - done [7] - RL1.6 Line Boundaries - MISSING [8][9] - RL1.7 Supplementary Code Points - done [10] +all features currently directly supported by core Perl. The references +to "Level I<N>" and the section numbers refer to +L<UTS#18 "Unicode Regular Expressions"|http://www.unicode.org/reports/tr18>, +version 13, November 2013. + +=head3 Level 1 - Basic Unicode Support + + RL1.1 Hex Notation - Done [1] + RL1.2 Properties - Done [2] + RL1.2a Compatibility Properties - Done [3] + RL1.3 Subtraction and Intersection - Experimental [4] + RL1.4 Simple Word Boundaries - Done [5] + RL1.5 Simple Loose Matches - Done [6] + RL1.6 Line Boundaries - Partial [7] + RL1.7 Supplementary Code Points - Done [8] =over 4 =item [1] C<\N{U+...}> and C<\x{...}> -=item [2] C<\p{...}> C<\P{...}> +=item [2] +C<\p{...}> C<\P{...}>. This requirement is for a minimal list of +properties. Perl supports these and all other Unicode character +properties, as R2.7 asks (see L</"Unicode Character Properties"> above). 
-=item [3] supports not only minimal list, but all Unicode character -properties (see Unicode Character Properties above) +=item [3] +Perl has C<\d> C<\D> C<\s> C<\S> C<\w> C<\W> C<\X> C<[:I<prop>:]> +C<[:^I<prop>:]>, plus all the properties specified by +L<http://www.unicode.org/reports/tr18/#Compatibility_Properties>. These +are described above in L</Other Properties> -=item [4] C<\d> C<\D> C<\s> C<\S> C<\w> C<\W> C<\X> C<[:I<prop>:]> -C<[:^I<prop>:]> +=item [4] -=item [5] The experimental feature starting in v5.18 C<"(?[...])"> accomplishes +The experimental feature C<"(?[...])"> starting in v5.18 accomplishes this. See L<perlre/(?[ ])>. If you don't want to use an experimental @@ -1109,7 +1127,6 @@ feature, you can use one of the following: =over 4 =item * - Regular expression lookahead You can mimic class subtraction using lookahead. @@ -1143,9 +1160,12 @@ C<"+"> for union, C<"-"> for removal (set-difference), C<"&"> for intersection =back -=item [6] C<\b> C<\B> +=item [5] +C<\b> C<\B> meet most, but not all, the details of this requirement, but +C<\b{wb}> and C<\B{wb}> do, as well as the stricter R2.3. + +=item [6] -=item [7] Note that Perl does Full case-folding in matching, not Simple: For example C<U+1F88> is equivalent to C<U+1F00 U+03B9>, instead of just @@ -1154,9 +1174,18 @@ letters with certain modifiers: the Full case-folding decomposes the letter, while the Simple case-folding would map it to a single character. -=item [8] -Perl treats C<\n> as the start- and end-line delimiter. Unicode -specifies more characters that should be so-interpreted. +=item [7] + +The reason this is considered to be only partially implemented is that +Perl has L<C<qrE<sol>\b{lb}E<sol>>|perlrebackslash/\b{lb}> and +C<L<Unicode::LineBreak>> that are conformant with +L<UAX#14 "Unicode Line Breaking Algorithm"|http://www.unicode.org/reports/tr14>. +The regular expression construct provides default behavior, while the +heavier-weight module provides customizable line breaking. 
+ +But Perl treats C<\n> as the start- and end-line +delimiter, whereas Unicode specifies more characters that should be +so-interpreted. These are: @@ -1176,63 +1205,66 @@ Also, lines should not be split within C<CRLF> (i.e. there is no empty line between C<\r> and C<\n>). For C<CRLF>, try the C<:crlf> layer (see L<PerlIO>). -=item [9] But C<qr/\b{lb}/> and C<L<Unicode::LineBreak>> are available. - -L<C<qrE<sol>\b{lb}E<sol>>|perlrebackslash/\b{lb}> supplies default line -breaking conformant with -L<UAX#14 "Unicode Line Breaking Algorithm"|http://www.unicode.org/reports/tr14>. - -And, the module C<L<Unicode::LineBreak>> also conformant with UAX#14, -provides customizable line breaking. - -=item [10] +=item [8] UTF-8/UTF-EBCDIC used in Perl allows not only C<U+10000> to C<U+10FFFF> but also beyond C<U+10FFFF> =back -=item * +=head3 Level 2 - Extended Unicode Support -Level 2 - Extended Unicode Support + RL2.1 Canonical Equivalents - Retracted [9] + by Unicode + RL2.2 Extended Grapheme Clusters - Partial [10] + RL2.3 Default Word Boundaries - Done [11] + RL2.4 Default Case Conversion - Done + RL2.5 Name Properties - Done + RL2.6 Wildcard Properties - Missing + RL2.7 Full Properties - Done - RL2.1 Canonical Equivalents - MISSING [10][11] - RL2.2 Default Grapheme Clusters - MISSING [12] - RL2.3 Default Word Boundaries - DONE [14] - RL2.4 Default Loose Matches - MISSING [15] - RL2.5 Name Properties - DONE - RL2.6 Wildcard Properties - MISSING +=over 4 - [10] see UAX#15 "Unicode Normalization Forms" - [11] have Unicode::Normalize but not integrated to regexes - [12] have \X and \b{gcb} but we don't have a "Grapheme Cluster - Mode" - [14] see UAX#29, Word Boundaries - [15] This is covered in Chapter 3.13 (in Unicode 6.0) +=item [9] +Unicode has rewritten this portion of UTS#18 to say that getting +canonical equivalence (see UAX#15 +L<"Unicode Normalization Forms"|http://www.unicode.org/reports/tr15>) +is basically to be done at the programmer level.
Use NFD to write +both your regular expressions and text to match them against (you +can use L<Unicode::Normalize>). -=item * +=item [10] +Perl has C<\X> and C<\b{gcb}> but we don't have a "Grapheme Cluster Mode". + +=item [11] see +L<UAX#29 "Unicode Text Segmentation"|http://www.unicode.org/reports/tr29>. + +=back + +=head3 Level 3 - Tailored Support + + RL3.1 Tailored Punctuation - Missing + RL3.2 Tailored Grapheme Clusters - Missing [12] + RL3.3 Tailored Word Boundaries - Missing + RL3.4 Tailored Loose Matches - Retracted by Unicode + RL3.5 Tailored Ranges - Retracted by Unicode + RL3.6 Context Matching - Missing [13] + RL3.7 Incremental Matches - Missing + RL3.8 Unicode Set Sharing - Unicode is proposing + to retract this + RL3.9 Possible Match Sets - Missing + RL3.10 Folded Matching - Retracted by Unicode + RL3.11 Submatchers - Missing + +=over 4 + +=item [12] +Perl has L<Unicode::Collate>, but it isn't integrated with regular +expressions. See +L<UTS#10 "Unicode Collation Algorithms"|http://www.unicode.org/reports/tr10>.
-Level 3 - Tailored Support - - RL3.1 Tailored Punctuation - MISSING - RL3.2 Tailored Grapheme Clusters - MISSING [17][18] - RL3.3 Tailored Word Boundaries - MISSING - RL3.4 Tailored Loose Matches - MISSING - RL3.5 Tailored Ranges - MISSING - RL3.6 Context Matching - MISSING [19] - RL3.7 Incremental Matches - MISSING - ( RL3.8 Unicode Set Sharing ) - RL3.9 Possible Match Sets - MISSING - RL3.10 Folded Matching - MISSING [20] - RL3.11 Submatchers - MISSING - - [17] see UAX#10 "Unicode Collation Algorithms" - [18] have Unicode::Collate but not integrated to regexes - [19] have (?<=x) and (?=x), but lookaheads or lookbehinds - should see outside of the target substring - [20] need insensitive matching for linguistic features other - than case; for example, hiragana to katakana, wide and - narrow, simplified Han to traditional Han (see UTR#30 - "Character Foldings") +=item [13] +Perl has C<(?<=x)> and C<(?=x)>, but lookaheads or lookbehinds should +see outside of the target substring =back @@ -1507,7 +1539,7 @@ became generally reliable) through v5.18. The difference is that Perl treated all C<\p{}> matches as failing, but all C<\P{}> matches as succeeding. -One problem with this is that it leads to unexpected, and confusting +One problem with this is that it leads to unexpected, and confusing results in some cases: chr(0x110000) =~ \p{ASCII_Hex_Digit=True} # Failed on <= v5.18 @@ -1603,15 +1635,23 @@ Also, note the following: Malformed UTF-8 -Unfortunately, the original specification of UTF-8 leaves some room for -interpretation of how many bytes of encoded output one should generate -from one input Unicode character. Strictly speaking, the shortest -possible sequence of UTF-8 bytes should be generated, -because otherwise there is potential for an input buffer overflow at -the receiving end of a UTF-8 connection. 
Perl always generates the -shortest length UTF-8, and with warnings on, Perl will warn about -non-shortest length UTF-8 along with other malformations, such as the -surrogates, which are not Unicode code points valid for interchange. +UTF-8 is very structured, so many combinations of bytes are invalid. In +the past, Perl tried to soldier on and make some sense of invalid +combinations, but this can lead to security holes, so now, if the Perl +core needs to process an invalid combination, it will either raise a +fatal error, or will replace those bytes by the sequence that forms the +Unicode REPLACEMENT CHARACTER, for which purpose Unicode created it. + +Every code point can be represented by more than one possible +syntactically valid UTF-8 sequence. Early on, both Unicode and Perl +considered any of these to be valid, but now, all sequences longer +than the shortest possible one are considered to be malformed. + +Unicode considers many code points to be illegal, or to be avoided. +Perl generally accepts them, once they have passed through any input +filters that may try to exclude them. These have been discussed above +(see "Surrogates" under UTF-16 in L</Unicode Encodings>, +L</Noncharacter code points>, and L</Beyond Unicode code points>). =item * @@ -1639,7 +1679,7 @@ See L<perlebcdic/Unicode and UTF>. Because UTF-EBCDIC is so similar to UTF-8, the differences are mostly hidden from you; S<C<use utf8>> (and NOT something like -S<C<use utfebcdic>>) declares the the script is in the platform's +S<C<use utfebcdic>>) declares the script is in the platform's "native" 8-bit encoding of Unicode. (Similarly for the C<":utf8"> layer.) @@ -1719,7 +1759,7 @@ it, which changes the rules from ASCII to Unicode. 
As an example, consider the following program and its output: $ perl -le' - no feature 'unicode_strings'; + no feature "unicode_strings"; $s1 = "\xC2"; $s2 = "\x{2660}"; for ($s1, $s2, $s1.$s2) { @@ -1787,6 +1827,27 @@ Prior to that, or outside its scope, no code points above 127 are quoted in UTF-8 encoded strings, but in byte encoded strings, code points between 128-255 are always quoted. +=item * + +In the C<..> or L<range|perlop/Range Operators> operator. + +Starting in Perl 5.26.0, the range operator on strings treats their lengths +consistently within the scope of C<unicode_strings>. Prior to that, or +outside its scope, it could produce strings whose length in characters +exceeded that of the right-hand side, where the right-hand side took up more +bytes than the correct range endpoint. + +=item * + +In L<< C<split>'s special-case whitespace splitting|perlfunc/split >>. + +Starting in Perl 5.28.0, the C<split> function with a pattern specified as +a string containing a single space handles whitespace characters consistently +within the scope of C<unicode_strings>. Prior to that, or outside its scope, +characters that are whitespace according to Unicode rules but not according to +ASCII rules were treated as field contents rather than field separators when +they appear in byte-encoded strings. + =back You can see from the above that the effect of C<unicode_strings> @@ -1852,7 +1913,7 @@ work under 5.6, so you should be safe to try them out. A filehandle that should read or write UTF-8 if ($] > 5.008) { - binmode $fh, ":encoding(utf8)"; + binmode $fh, ":encoding(UTF-8)"; } =item * @@ -1867,7 +1928,7 @@ check the documentation to verify if this is still true. 
if ($] > 5.008) { require Encode; - $val = Encode::encode_utf8($val); # make octets + $val = Encode::encode("UTF-8", $val); # make octets } =item * @@ -1879,7 +1940,7 @@ want the UTF8 flag restored: if ($] > 5.008) { require Encode; - $val = Encode::decode_utf8($val); + $val = Encode::decode("UTF-8", $val); } =item * @@ -1980,8 +2041,8 @@ Perl's internal representation like so: sub my_escape_html ($) { my($what) = shift; return unless defined $what; - Encode::decode_utf8(Foo::Bar::escape_html( - Encode::encode_utf8($what))); + Encode::decode("UTF-8", Foo::Bar::escape_html( + Encode::encode("UTF-8", $what))); } Sometimes, when the extension does not convert data but just stores diff --git a/gnu/usr.bin/perl/pod/perlunicook.pod b/gnu/usr.bin/perl/pod/perlunicook.pod index e1693cd6b71..eb395f795e4 100644 --- a/gnu/usr.bin/perl/pod/perlunicook.pod +++ b/gnu/usr.bin/perl/pod/perlunicook.pod @@ -26,7 +26,7 @@ to work correctly, with the C<#!> adjusted to work on your system: use strict; # quote strings, declare variables use warnings; # on by default use warnings qw(FATAL utf8); # fatalize encoding glitches - use open qw(:std :utf8); # undeclared streams in UTF-8 + use open qw(:std :encoding(UTF-8)); # undeclared streams in UTF-8 use charnames qw(:full :short); # unneeded in v5.16 This I<does> make even Unix programmers C<binmode> your binary streams, @@ -234,8 +234,8 @@ C<binmode> as described later below. 
or $ export PERL_UNICODE=A or - use Encode qw(decode_utf8); - @ARGV = map { decode_utf8($_, 1) } @ARGV; + use Encode qw(decode); + @ARGV = map { decode('UTF-8', $_, 1) } @ARGV; =head2 ℞ 14: Decode program arguments as locale encoding @@ -255,9 +255,9 @@ call C<binmode> explicitly: or $ export PERL_UNICODE=S or - use open qw(:std :utf8); + use open qw(:std :encoding(UTF-8)); or - binmode(STDIN, ":utf8"); + binmode(STDIN, ":encoding(UTF-8)"); binmode(STDOUT, ":utf8"); binmode(STDERR, ":utf8"); @@ -280,7 +280,7 @@ Files opened without an encoding argument will be in UTF-8: or $ export PERL_UNICODE=D or - use open qw(:utf8); + use open qw(:encoding(UTF-8)); =head2 ℞ 18: Make all I/O and args default to utf8 @@ -288,9 +288,9 @@ Files opened without an encoding argument will be in UTF-8: or $ export PERL_UNICODE=SDA or - use open qw(:std :utf8); - use Encode qw(decode_utf8); - @ARGV = map { decode_utf8($_, 1) } @ARGV; + use open qw(:std :encoding(UTF-8)); + use Encode qw(decode); + @ARGV = map { decode('UTF-8', $_, 1) } @ARGV; =head2 ℞ 19: Open file with specific encoding @@ -391,7 +391,7 @@ one codepoint lacking that property. \p{Sk}, \p{Ps}, \p{Lt} \p{alpha}, \p{upper}, \p{lower} \p{Latin}, \p{Greek} - \p{script=Latin}, \p{script=Greek} + \p{script_extensions=Latin}, \p{scx=Greek} \p{East_Asian_Width=Wide}, \p{EA=W} \p{Line_Break=Hyphen}, \p{LB=HY} \p{Numeric_Value=4}, \p{NV=4} @@ -701,7 +701,7 @@ Here's that program; tested on v5.14. 
use strict; use warnings; use warnings qw(FATAL utf8); # fatalize encoding faults - use open qw(:std :utf8); # undeclared streams in UTF-8 + use open qw(:std :encoding(UTF-8)); # undeclared streams in UTF-8 use charnames qw(:full :short); # unneeded in v5.16 # std modules diff --git a/gnu/usr.bin/perl/pod/perlunifaq.pod b/gnu/usr.bin/perl/pod/perlunifaq.pod index 4135fbaeb23..ba391d423f6 100644 --- a/gnu/usr.bin/perl/pod/perlunifaq.pod +++ b/gnu/usr.bin/perl/pod/perlunifaq.pod @@ -199,7 +199,9 @@ or by letting automatic decoding and encoding do all the work: =head2 What are C<decode_utf8> and C<encode_utf8>? These are alternate syntaxes for C<decode('utf8', ...)> and C<encode('utf8', -...)>. +...)>. Do not use these functions for data exchange. Instead use +C<decode('UTF-8', ...)> and C<encode('UTF-8', ...)>; see +L</What's the difference between UTF-8 and utf8?> below. =head2 What is a "wide character"? @@ -283,7 +285,7 @@ C<UTF-8> is the official standard. C<utf8> is Perl's way of being liberal in what it accepts. If you have to communicate with things that aren't so liberal, you may want to consider using C<UTF-8>. If you have to communicate with things that are too liberal, you may have to use C<utf8>. The full explanation is in -L<Encode>. +L<Encode/"UTF-8 vs. utf8 vs. UTF8">. C<UTF-8> is internally known as C<utf-8-strict>. The tutorial uses UTF-8 consistently, even where utf8 is actually used internally, because the diff --git a/gnu/usr.bin/perl/pod/perluniintro.pod b/gnu/usr.bin/perl/pod/perluniintro.pod index 7ddf77c1729..5e263b4e635 100644 --- a/gnu/usr.bin/perl/pod/perluniintro.pod +++ b/gnu/usr.bin/perl/pod/perluniintro.pod @@ -112,7 +112,7 @@ unallocated, for future growth. But there have been occasions when a later release needed more code points than the available extras, and a new block had to be allocated somewhere else, not contiguous to the initial one, to handle the overflow. 
Thus, it became apparent early on that -"block" wasn't an adequate organizing principal, and so the C<Script> +"block" wasn't an adequate organizing principle, and so the C<Script> property was created. (Later an improved script property was added as well, the C<Script_Extensions> property.) Those code points that are in overflow blocks can still @@ -151,9 +151,14 @@ serious Unicode work. The maintenance release 5.6.1 fixed many of the problems of the initial Unicode implementation, but for example regular expressions still do not work with Unicode in 5.6.1. Perl v5.14.0 is the first release where Unicode support is -(almost) seamlessly integrable without some gotchas (the exception being -some differences in L<quotemeta|perlfunc/quotemeta>, and that is fixed -starting in Perl 5.16.0). To enable this +(almost) seamlessly integrable without some gotchas. (There are a few +exceptions. Firstly, some differences in L<quotemeta|perlfunc/quotemeta> +were fixed starting in Perl 5.16.0. Secondly, some differences in +L<the range operator|perlop/Range Operators> were fixed starting in +Perl 5.26.0. Thirdly, some differences in L<split|perlfunc/split> were fixed +starting in Perl 5.28.0.) + +To enable this seamless support, you should C<use feature 'unicode_strings'> (which is automatically selected if you C<use 5.012> or higher). See L<feature>. (5.14 also fixes a number of bugs and departures from the Unicode @@ -354,7 +359,7 @@ The C<Encode> module knows about many encodings and has interfaces for doing conversions between those encodings: use Encode 'decode'; - $data = decode("iso-8859-3", $data); # convert from legacy to utf-8 + $data = decode("iso-8859-3", $data); # convert from legacy =head2 Unicode I/O @@ -389,7 +394,7 @@ many encodings have several aliases. Note that the C<:utf8> layer must always be specified exactly like that; it is I<not> subject to the loose matching of encoding names. 
Also note that currently C<:utf8> is unsafe for input, because it accepts the data without validating that it is indeed valid -UTF-8; you should instead use C<:encoding(utf-8)> (with or without a +UTF-8; you should instead use C<:encoding(UTF-8)> (with or without a hyphen). See L<PerlIO> for the C<:utf8> layer, L<PerlIO::encoding> and @@ -402,7 +407,7 @@ Unicode or legacy encodings does not magically turn the data into Unicode in Perl's eyes. To do that, specify the appropriate layer when opening files - open(my $fh,'<:encoding(utf8)', 'anything'); + open(my $fh,'<:encoding(UTF-8)', 'anything'); my $line_of_unicode = <$fh>; open(my $fh,'<:encoding(Big5)', 'anything'); @@ -411,8 +416,8 @@ layer when opening files The I/O layers can also be specified more flexibly with the C<open> pragma. See L<open>, or look at the following example. - use open ':encoding(utf8)'; # input/output default encoding will be - # UTF-8 + use open ':encoding(UTF-8)'; # input/output default encoding will be + # UTF-8 open X, ">file"; print X chr(0x100), "\n"; close X; @@ -468,8 +473,11 @@ standardisation organisations are recognised; for a more detailed list see L<Encode::Supported>. C<read()> reads characters and returns the number of characters. -C<seek()> and C<tell()> operate on byte counts, as do C<sysread()> -and C<sysseek()>. +C<seek()> and C<tell()> operate on byte counts, as does C<sysseek()>. + +C<sysread()> and C<syswrite()> should not be used on file handles with +character encoding layers, they behave badly, and that behaviour has +been deprecated since perl 5.24. 
Notice that because of the default behaviour of not doing any conversion upon input if there is no default layer, @@ -481,12 +489,12 @@ by repeatedly encoding the data: local $/; ## read in the whole file of 8-bit characters $t = <F>; close F; - open F, ">:encoding(utf8)", "file"; + open F, ">:encoding(UTF-8)", "file"; print F $t; ## convert to UTF-8 on output close F; If you run this code twice, the contents of the F<file> will be twice -UTF-8 encoded. A C<use open ':encoding(utf8)'> would have avoided the +UTF-8 encoded. A C<use open ':encoding(UTF-8)'> would have avoided the bug, or explicitly opening also the F<file> for input as UTF-8. B<NOTE>: the C<:utf8> and C<:encoding> features work only if your @@ -530,15 +538,17 @@ you want to see what the native values are.) =item * -Bit Complement Operator ~ And vec() +Starting in Perl 5.28, it is illegal for bit operators, like C<~>, to +operate on strings containing code points above 255. + +=item * -The bit complement operator C<~> may produce surprising results if +The vec() function may produce surprising results if used on strings containing characters with ordinal values above 255. In such a case, the results are consistent with the internal encoding of the characters, but not with much else. So don't do -that. Similarly for C<vec()>: you will be operating on the -internally-encoded bit patterns of the Unicode characters, not on -the code point values, which is very probably not what you want. +that, and starting in Perl 5.28, a deprecation message is issued if you +do so, becoming illegal in Perl 5.32. =item * @@ -641,13 +651,12 @@ Starting in v5.22, you can use Unicode code points as the end points of regular expression pattern character ranges, and the range will include all Unicode code points that lie between those end points, inclusive. - qr/ [\N{U+03]-\N{U+20}] /x + qr/ [ \N{U+03} - \N{U+20} ] /xx includes the code points C<\N{U+03}>, C<\N{U+04}>, ..., C<\N{U+20}>. 
-(It is planned to extend this behavior to ranges in C<tr///> in Perl -v5.24.) +This also works for ranges in C<tr///> starting in Perl v5.24. =item * @@ -726,16 +735,13 @@ the output string will be UTF-8-encoded C<ab\x80c = \x{100}\n>, but C<$a> will stay byte-encoded. Sometimes you might really need to know the byte length of a string -instead of the character length. For that use either the -C<Encode::encode_utf8()> function or the C<bytes> pragma +instead of the character length. For that use the C<bytes> pragma and the C<length()> function: my $unicode = chr(0x100); print length($unicode), "\n"; # will print 1 - require Encode; - print length(Encode::encode_utf8($unicode)),"\n"; # will print 2 use bytes; - print length($unicode), "\n"; # will also print 2 + print length($unicode), "\n"; # will print 2 # (the 0xC4 0x80 of the UTF-8) no bytes; @@ -752,12 +758,12 @@ How Do I Detect Data That's Not Valid In a Particular Encoding? Use the C<Encode> package to try converting it. For example, - use Encode 'decode_utf8'; + use Encode 'decode'; - if (eval { decode_utf8($string, Encode::FB_CROAK); 1 }) { - # $string is valid utf8 + if (eval { decode('UTF-8', $string, Encode::FB_CROAK); 1 }) { + # $string is valid UTF-8 } else { - # $string is not valid utf8 + # $string is not valid UTF-8 } Or use C<unpack> to try decoding it: @@ -788,7 +794,7 @@ If you have a raw sequence of bytes that you know should be interpreted via a particular encoding, you can use C<Encode>: use Encode 'from_to'; - from_to($data, "iso-8859-1", "utf-8"); # from latin-1 to utf-8 + from_to($data, "iso-8859-1", "UTF-8"); # from latin-1 to UTF-8 The call to C<from_to()> changes the bytes in C<$data>, but nothing material about the nature of the string has changed as far as Perl is @@ -817,8 +823,8 @@ pack/unpack to convert to/from Unicode. 
If you have a sequence of bytes you B<know> is valid UTF-8, but Perl doesn't know it yet, you can make Perl a believer, too: - use Encode 'decode_utf8'; - $Unicode = decode_utf8($bytes); + $Unicode = $bytes; + utf8::decode($Unicode); or: @@ -843,9 +849,13 @@ L<http://www.cl.cam.ac.uk/~mgk25/unicode.html> How Does Unicode Work With Traditional Locales? -If your locale is a UTF-8 locale, starting in Perl v5.20, Perl works -well for all categories except C<LC_COLLATE> dealing with sorting and -the C<cmp> operator. +If your locale is a UTF-8 locale, starting in Perl v5.26, Perl works +well for all categories; before this, starting with Perl v5.20, it works +for all categories but C<LC_COLLATE>, which deals with +sorting and the C<cmp> operator. But note that the standard +C<L<Unicode::Collate>> and C<L<Unicode::Collate::Locale>> modules offer +much more powerful solutions to collation issues, and work on earlier +releases. For other locales, starting in Perl 5.16, you can specify diff --git a/gnu/usr.bin/perl/pod/perlutil.pod b/gnu/usr.bin/perl/pod/perlutil.pod index 4047b42e852..b41a6118305 100644 --- a/gnu/usr.bin/perl/pod/perlutil.pod +++ b/gnu/usr.bin/perl/pod/perlutil.pod @@ -126,13 +126,6 @@ around - the F<.ph> file should be created by running F<h2ph> on the corresponding F<.h> file. See the F<h2ph> documentation for more on how to convert a whole bunch of header files at once. -=item L<c2ph|c2ph> and L<pstruct|pstruct> - -F<c2ph> and F<pstruct>, which are actually the same program but behave -differently depending on how they are called, provide another way of -getting at C with Perl - they'll convert C structures and union declarations -to Perl code. This is deprecated in favour of F<h2xs> these days. 
- =item L<h2xs|h2xs> F<h2xs> converts C header files into XS modules, and will try and write @@ -238,7 +231,7 @@ L<perldoc|perldoc>, L<pod2man|pod2man>, L<perlpod>, L<pod2html|pod2html>, L<pod2usage|pod2usage>, L<podselect|podselect>, L<podchecker|podchecker>, L<splain|splain>, L<perldiag>, C<roffitall|roffitall>, L<File::Find|File::Find>, L<pl2pm|pl2pm>, -L<perlbug|perlbug>, L<h2ph|h2ph>, L<c2ph|c2ph>, L<h2xs|h2xs>, L<enc2xs>, +L<perlbug|perlbug>, L<h2ph|h2ph>, L<h2xs|h2xs>, L<enc2xs>, L<xsubpp>, L<cpan>, L<instmodsh>, L<piconv>, L<prove>, L<corelist>, L<ptar>, L<ptardiff>, L<shasum>, L<zipdetails> diff --git a/gnu/usr.bin/perl/pod/perlvar.pod b/gnu/usr.bin/perl/pod/perlvar.pod index 1821b95d1c1..c7b77120efe 100644 --- a/gnu/usr.bin/perl/pod/perlvar.pod +++ b/gnu/usr.bin/perl/pod/perlvar.pod @@ -142,19 +142,12 @@ test. Outside a C<while> test, this will not happen. =back -C<$_> is by default a global variable. However, as -of perl v5.10.0, you can use a lexical version of -C<$_> by declaring it in a file or in a block with C<my>. Moreover, -declaring C<our $_> restores the global C<$_> in the current scope. Though -this seemed like a good idea at the time it was introduced, lexical C<$_> -actually causes more problems than it solves. If you call a function that -expects to be passed information via C<$_>, it may or may not work, -depending on how the function is written, there not being any easy way to -solve this. Just avoid lexical C<$_>, unless you are feeling particularly -masochistic. For this reason lexical C<$_> is still experimental and will -produce a warning unless warnings have been disabled. As with other -experimental features, the behavior of lexical C<$_> is subject to change -without notice, including change into a fatal error. +C<$_> is a global variable. + +However, between perl v5.10.0 and v5.24.0, it could be used lexically by +writing C<my $_>. Making C<$_> refer to the global C<$_> in the same scope +was then possible with C<our $_>. 
This experimental feature was removed and is +now a fatal error, but you may encounter it in older code. Mnemonic: underline is understood in certain operations. @@ -503,9 +496,10 @@ initially consists of the arguments to any B<-I> command-line switches, followed by the default Perl library, probably F</usr/local/lib/perl>, followed by ".", to represent the current directory. ("." will not be appended if taint checks are enabled, -either by C<-T> or by C<-t>.) If you need to modify this at runtime, -you should use the C<use lib> pragma to get the machine-dependent -library properly loaded also: +either by C<-T> or by C<-t>, or if configured not to do so by the +C<-Ddefault_inc_excludes_dot> compile time option.) If you need to +modify this at runtime, you should use the C<use lib> pragma to get +the machine-dependent library properly loaded also: use lib '/mypath/libdir/'; use SomeMod; @@ -540,6 +534,19 @@ inplace editing. Mnemonic: value of B<-i> switch. +=item @ISA +X<@ISA> + +Each package contains a special array called C<@ISA> which contains a list +of that class's parent classes, if any. This array is simply a list of +scalars, each of which is a string that corresponds to a package name. The +array is examined when Perl does method resolution, which is covered in +L<perlobj>. + +To load packages while adding them to C<@ISA>, see the L<parent> pragma. The +discouraged L<base> pragma does this as well, but should not be used except +when compatibility with the discouraged L<fields> pragma is required. + =item $^M X<$^M> @@ -646,13 +653,12 @@ or a C<die()>. The C<__DIE__> handler is explicitly disabled during the call, so that you can die from a C<__DIE__> handler. Similarly for C<__WARN__>. -Due to an implementation glitch, the C<$SIG{__DIE__}> hook is called -even inside an C<eval()>. Do not use this to rewrite a pending -exception in C<$@>, or as a bizarre substitute for overriding -C<CORE::GLOBAL::die()>. 
This strange action at a distance may be fixed -in a future release so that C<$SIG{__DIE__}> is only called if your -program is about to exit, as was the original intent. Any other use is -deprecated. +The C<$SIG{__DIE__}> hook is called even inside an C<eval()>. It was +never intended to happen this way, but an implementation glitch made +this possible. This used to be deprecated, as it allowed strange action +at a distance like rewriting a pending exception in C<$@>. Plans to +rectify this have been scrapped, as users found that rewriting a +pending exception is actually a useful feature, and not a bug. C<__DIE__>/C<__WARN__> handlers are very special in one respect: they may be called to report (probable) errors found by the parser. In such @@ -715,13 +721,14 @@ conversion, which works for both v-strings or version objects: See the documentation of C<use VERSION> and C<require VERSION> for a convenient way to fail if the running Perl interpreter is too old. -See also C<$]> for a decimal representation of the Perl version. +See also C<L</$]>> for a decimal representation of the Perl version. The main advantage of C<$^V> over C<$]> is that, for Perl v5.10.0 or later, it overloads operators, allowing easy comparison against other version representations (e.g. decimal, literal v-string, "v1.2.3", or objects). The disadvantage is that prior to v5.10.0, it was only a -literal v-string, which can't be easily printed or compared. +literal v-string, which can't be easily printed or compared, whereas +the behavior of C<$]> is unchanged on all versions of Perl. Mnemonic: use ^V for a version object. @@ -860,9 +867,9 @@ this: $str =~ /pattern/; - print $`, $&, $'; # bad: perfomance hit + print $`, $&, $'; # bad: performance hit - print # good: no perfomance hit + print # good: no performance hit substr($str, 0, $-[0]), substr($str, $-[0], $+[0]-$-[0]), substr($str, $+[0]); @@ -894,16 +901,43 @@ find uses of these problematic match variables in your code. 
=over 8 =item $<I<digits>> ($1, $2, ...) -X<$1> X<$2> X<$3> +X<$1> X<$2> X<$3> X<$I<digits>> Contains the subpattern from the corresponding set of capturing parentheses from the last successful pattern match, not counting patterns matched in nested blocks that have been exited already. +Note there is a distinction between a capture buffer which matches +the empty string and a capture buffer which is optional. E.g., C<(x?)> and +C<(x)?>. The latter may be undef, the former not. + These variables are read-only and dynamically-scoped. Mnemonic: like \digits. +=item @{^CAPTURE} +X<@{^CAPTURE}> X<@^CAPTURE> + +An array which exposes the contents of the capture buffers, if any, of +the last successful pattern match, not counting patterns matched +in nested blocks that have been exited already. + +Note that the 0 index of @{^CAPTURE} is equivalent to $1, the 1 index +is equivalent to $2, etc. + + if ("foal"=~/(.)(.)(.)(.)/) { + print join "-", @{^CAPTURE}; + } + +should output "f-o-a-l". + +See also L</$I<digits>>, L</%{^CAPTURE}> and L</%{^CAPTURE_ALL}>. + +Note that unlike most other regex magic variables there is no single +letter equivalent to C<@{^CAPTURE}>. + +This variable was added in 5.25.7. + =item $MATCH =item $& @@ -1062,10 +1096,12 @@ examples given for the C<@-> variable. This variable was added in Perl v5.6.0. +=item %{^CAPTURE} + =item %LAST_PAREN_MATCH =item %+ -X<%+> X<%LAST_PAREN_MATCH> +X<%+> X<%LAST_PAREN_MATCH> X<%{^CAPTURE}> Similar to C<@+>, the C<%+> hash allows access to the named capture buffers, should they exist, in the last successful match in the @@ -1078,6 +1114,9 @@ For example, C<$+{foo}> is equivalent to C<$1> after the following match: The keys of the C<%+> hash list only the names of buffers that have captured (and that are thus associated to defined values). +If multiple distinct capture groups have the same name, then +C<$+{NAME}> will refer to the leftmost defined group in the match. 
+ The underlying behaviour of C<%+> is provided by the L<Tie::Hash::NamedCapture> module. @@ -1087,7 +1126,8 @@ iterative access to them via C<each> may have unpredictable results. Likewise, if the last successful match changes, then the results may be surprising. -This variable was added in Perl v5.10.0. +This variable was added in Perl v5.10.0. The C<%{^CAPTURE}> alias was +added in 5.25.7. This variable is read-only and dynamically-scoped. @@ -1135,6 +1175,9 @@ After a match against some variable C<$var>: This variable was added in Perl v5.6.0. +=item %{^CAPTURE_ALL} +X<%{^CAPTURE_ALL}> + =item %- X<%-> @@ -1179,7 +1222,8 @@ iterative access to them via C<each> may have unpredictable results. Likewise, if the last successful match changes, then the results may be surprising. -This variable was added in Perl v5.10.0. +This variable was added in Perl v5.10.0. The C<%{^CAPTURE_ALL}> alias was +added in 5.25.7. This variable is read-only and dynamically-scoped. @@ -1917,24 +1961,19 @@ Mnemonic: value of B<-D> switch. =item ${^ENCODING} X<${^ENCODING}> -DEPRECATED!!! +This variable is no longer supported. -The I<object reference> to the C<Encode> object that is used to convert -the source code to Unicode. Thanks to this variable your Perl script -does not have to be written in UTF-8. Default is C<undef>. +It used to hold the I<object reference> to the C<Encode> object that was +used to convert the source code to Unicode. -Setting this variable to any other value than C<undef> is deprecated due -to fundamental defects in its design and implementation. It is planned -to remove it from a future Perl version. Its purpose was to allow your -non-ASCII Perl scripts to not have to be written in UTF-8; this was +Its purpose was to allow your non-ASCII Perl +scripts not to have to be written in UTF-8; this was useful before editors that worked on UTF-8 encoded text were common, but -that was long ago. 
It causes problems, such as affecting the operation -of other modules that aren't expecting it, causing general mayhem. Its -use can lead to segfaults. +that was long ago. It caused problems, such as affecting the operation +of other modules that weren't expecting it, causing general mayhem. -If you need something like this functionality, you should use the -L<encoding> pragma, which is also deprecated, but has fewer nasty side -effects. +If you need something like this functionality, it is recommended that you +use a simple source filter, such as L<Filter::Encoding>. If you are coming here because code of yours is being adversely affected by someone's use of this variable, you can usually work around it by @@ -1946,7 +1985,8 @@ near the beginning of the functions that are getting broken. This undefines the variable during the scope of execution of the including function. -This variable was added in Perl 5.8.2. +This variable was added in Perl 5.8.2 and removed in 5.26.0. +Setting it to anything other than C<undef> was made fatal in Perl 5.28.0. =item ${^GLOBAL_PHASE} X<${^GLOBAL_PHASE}> @@ -2212,6 +2252,21 @@ This variable is read-only. This variable was added in Perl v5.8.0. +=item ${^SAFE_LOCALES} +X<${^SAFE_LOCALES}> + +Reflects if safe locale operations are available to this perl (when the +value is 1) or not (the value is 0). This variable is always 1 if the +perl has been compiled without threads. It is also 1 if this perl is +using thread-safe locale operations. Note that an individual thread may +choose to use the global locale (generally unsafe) by calling +L<perlapi/switch_to_global_locale>. This variable currently is still +set to 1 in such threads. + +This variable is read-only. + +This variable was added in Perl v5.28.0. + =item ${^UNICODE} X<${^UNICODE}> @@ -2316,6 +2371,8 @@ Mnemonic: [ begins subscripts. Deprecated in Perl v5.12.0. +Assigning a non-zero value will be fatal in Perl v5.30.0. 
+ =back =cut diff --git a/gnu/usr.bin/perl/pod/perlvms.pod b/gnu/usr.bin/perl/pod/perlvms.pod index 22082570399..e0d98039170 100644 --- a/gnu/usr.bin/perl/pod/perlvms.pod +++ b/gnu/usr.bin/perl/pod/perlvms.pod @@ -326,8 +326,10 @@ improve performance at the expense of the BYTLM UAF quota. =head1 PERL5LIB and PERLLIB -The PERL5LIB and PERLLIB logical names work as documented in L<perl>, -except that the element separator is '|' instead of ':'. The +The PERL5LIB and PERLLIB environment elements work as documented in L<perl>, +except that the element separator is, by default, '|' instead of ':'. +However, when running under a Unix shell as determined by the logical +name C<GNV$UNIX_SHELL>, the separator will be ':' as on Unix systems. The directory specifications may use either VMS or Unix syntax. =head1 The Perl Forked Debugger diff --git a/gnu/usr.bin/perl/pod/splitpod b/gnu/usr.bin/perl/pod/splitpod index 10fd6afb111..1bc91bd1838 100644 --- a/gnu/usr.bin/perl/pod/splitpod +++ b/gnu/usr.bin/perl/pod/splitpod @@ -46,7 +46,7 @@ for $f ( keys %syn ) { my $has_back = $body =~ /^=back/; $body =~ s/^=over\s*//m if $has_over and !$has_back; $body =~ s/^=back\s*//m if $has_back and !$has_over; - open (POD, "> $name.pod") || die "can't open $name.pod: $!"; + open (POD, '>', "$name.pod") || die "can't open $name.pod: $!"; print POD <<EOF; \=head1 NAME |