summaryrefslogtreecommitdiffstats
path: root/gnu/usr.bin/perl/cpan/Unicode-Normalize/Normalize.xs
diff options
context:
space:
mode:
Diffstat (limited to 'gnu/usr.bin/perl/cpan/Unicode-Normalize/Normalize.xs')
-rw-r--r--gnu/usr.bin/perl/cpan/Unicode-Normalize/Normalize.xs925
1 files changed, 0 insertions, 925 deletions
diff --git a/gnu/usr.bin/perl/cpan/Unicode-Normalize/Normalize.xs b/gnu/usr.bin/perl/cpan/Unicode-Normalize/Normalize.xs
deleted file mode 100644
index 4acff7fe490..00000000000
--- a/gnu/usr.bin/perl/cpan/Unicode-Normalize/Normalize.xs
+++ /dev/null
@@ -1,925 +0,0 @@
-
-#define PERL_NO_GET_CONTEXT /* we want efficiency */
-
-/* private functions which need pTHX_ and aTHX_
- pv_cat_decompHangul
- sv_2pvunicode
- pv_utf8_decompose
- pv_utf8_reorder
- pv_utf8_compose
-*/
-
-#include "EXTERN.h"
-#include "perl.h"
-#include "XSUB.h"
-
-/* These 5 files are prepared by mkheader */
-#include "unfcmb.h"
-#include "unfcan.h"
-#include "unfcpt.h"
-#include "unfcmp.h"
-#include "unfexc.h"
-
-/* The generated normalization tables since v5.20 are in native character set
- * terms. Prior to that, they were in Unicode terms. So we use 'uvchr' for
- * later perls, and redefine that to be 'uvuni' for earlier ones */
-#if PERL_VERSION < 20
-# undef uvchr_to_utf8
-# ifdef uvuni_to_utf8
-# define uvchr_to_utf8 uvuni_to_utf8
-# else /* Perl 5.6.1 */
-# define uvchr_to_utf8 uv_to_utf8
-# endif
-
-# undef utf8n_to_uvchr
-# ifdef utf8n_to_uvuni
-# define utf8n_to_uvchr utf8n_to_uvuni
-# else /* Perl 5.6.1 */
-# define utf8n_to_uvchr utf8_to_uv
-# endif
-#endif
-
-/* UTF8_ALLOW_BOM is used before Perl 5.8.0 */
-#ifndef UTF8_ALLOW_BOM
-#define UTF8_ALLOW_BOM (0)
-#endif /* UTF8_ALLOW_BOM */
-
-#ifndef UTF8_ALLOW_SURROGATE
-#define UTF8_ALLOW_SURROGATE (0)
-#endif /* UTF8_ALLOW_SURROGATE */
-
-#ifndef UTF8_ALLOW_FE_FF
-#define UTF8_ALLOW_FE_FF (0)
-#endif /* UTF8_ALLOW_FE_FF */
-
-#ifndef UTF8_ALLOW_FFFF
-#define UTF8_ALLOW_FFFF (0)
-#endif /* UTF8_ALLOW_FFFF */
-
-#ifndef PERL_UNUSED_VAR
-# define PERL_UNUSED_VAR(x) ((void)sizeof(x))
-#endif
-
-#define AllowAnyUTF (UTF8_ALLOW_SURROGATE|UTF8_ALLOW_BOM|UTF8_ALLOW_FE_FF|UTF8_ALLOW_FFFF)
-
-/* Grow the output buffer before uvchr_to_utf8() when its capacity dlen cannot hold `need` more bytes. */
-/* dstart, d and dlen must be in scope; expands to one self-contained block (no trailing ';' needed). */
-#define Renew_d_if_not_enough_to(need) { STRLEN curlen = d - dstart; \
- if (dlen < curlen + (need)) { \
- dlen += (need); \
- Renew(dstart, dlen+1, U8); \
- d = dstart + curlen; \
- } }
-
-/* if utf8n_to_uvchr() sets retlen to 0 (if broken?) */
-#define ErrRetlenIsZero "panic (Unicode::Normalize %s): zero-length character"
-
-/* utf8_hop() hops back before start. Maybe broken UTF-8 */
-#define ErrHopBeforeStart "panic (Unicode::Normalize): hopping before start"
-
-/* At present, char > 0x10ffff are unaffected without complaint, right? */
-#define VALID_UTF_MAX (0x10ffff)
-#define OVER_UTF_MAX(uv) (VALID_UTF_MAX < (uv))
-
-/* size of array for combining characters */
-/* enough as an initial value? */
-#define CC_SEQ_SIZE (10)
-#define CC_SEQ_STEP (5)
-
-/* HANGUL begin */
-#define Hangul_SBase 0xAC00
-#define Hangul_SFinal 0xD7A3
-#define Hangul_SCount 11172
-
-#define Hangul_NCount 588
-
-#define Hangul_LBase 0x1100
-#define Hangul_LFinal 0x1112
-#define Hangul_LCount 19
-
-#define Hangul_VBase 0x1161
-#define Hangul_VFinal 0x1175
-#define Hangul_VCount 21
-
-#define Hangul_TBase 0x11A7
-#define Hangul_TFinal 0x11C2
-#define Hangul_TCount 28
-
-#define Hangul_IsS(u) ((Hangul_SBase <= (u)) && ((u) <= Hangul_SFinal))
-#define Hangul_IsN(u) (((u) - Hangul_SBase) % Hangul_TCount == 0)
-#define Hangul_IsLV(u) (Hangul_IsS(u) && Hangul_IsN(u))
-#define Hangul_IsL(u) ((Hangul_LBase <= (u)) && ((u) <= Hangul_LFinal))
-#define Hangul_IsV(u) ((Hangul_VBase <= (u)) && ((u) <= Hangul_VFinal))
-#define Hangul_IsT(u) ((Hangul_TBase < (u)) && ((u) <= Hangul_TFinal))
-/* HANGUL end */
-
-/* One pending combining character (c.c.); arrays of these are sorted with compare_cc for canonical ordering. */
-typedef struct {
- U8 cc; /* combining class, as returned by getCombinClass() */
- UV uv; /* code point of the character */
- STRLEN pos; /* index within the current run; tie-breaker that keeps the sort stable */
-} UNF_cc;
-
-/* qsort() comparator for UNF_cc: order by combining class, ties by original position. */
-static int compare_cc(const void *a, const void *b)
-{
-    const UNF_cc *lhs = (const UNF_cc *) a;
-    const UNF_cc *rhs = (const UNF_cc *) b;
-
-    if (lhs->cc != rhs->cc)
-        return (int) lhs->cc - (int) rhs->cc;
-    return (lhs->pos > rhs->pos) - (lhs->pos < rhs->pos);
-}
-
-/* Look up the canonical decomposition string of uv in the plane/row table
- * UNF_canon; returns NULL when uv is out of range or has no entry. */
-static U8* dec_canonical(UV uv)
-{
-    U8 ***plane;
-    U8 **row = NULL;
-
-    if (!OVER_UTF_MAX(uv) && (plane = (U8***)UNF_canon[uv >> 16]) != NULL)
-        row = plane[(uv >> 8) & 0xff];
-    return (row != NULL) ? row[uv & 0xff] : NULL;
-}
-
-/* Look up the compatibility decomposition string of uv in the plane/row
- * table UNF_compat; returns NULL when uv is out of range or has no entry. */
-static U8* dec_compat(UV uv)
-{
-    U8 ***plane;
-    U8 **row = NULL;
-
-    if (!OVER_UTF_MAX(uv) && (plane = (U8***)UNF_compat[uv >> 16]) != NULL)
-        row = plane[(uv >> 8) & 0xff];
-    return (row != NULL) ? row[uv & 0xff] : NULL;
-}
-
-/* Return the code point that uv followed by uv2 composes to, or 0 if none.
- * Hangul L+V and LV+T pairs are computed; others come from UNF_compos. */
-static UV composite_uv(UV uv, UV uv2)
-{
-    UNF_complist ***plane, **row, *entry;
-
-    if (uv2 == 0 || OVER_UTF_MAX(uv) || OVER_UTF_MAX(uv2))
-        return 0;
-
-    /* Hangul: leading consonant + vowel -> LV syllable */
-    if (Hangul_IsL(uv) && Hangul_IsV(uv2)) {
-        const UV l_off = uv - Hangul_LBase;
-        const UV v_off = uv2 - Hangul_VBase;
-        return Hangul_SBase + (l_off * Hangul_VCount + v_off) * Hangul_TCount;
-    }
-    /* Hangul: LV syllable + trailing consonant -> LVT syllable */
-    if (Hangul_IsLV(uv) && Hangul_IsT(uv2))
-        return uv + (uv2 - Hangul_TBase);
-
-    /* table lookup: plane -> row -> list terminated by a zero nextchar */
-    plane = UNF_compos[uv >> 16];
-    row = plane ? plane[(uv >> 8) & 0xff] : NULL;
-    entry = row ? row[uv & 0xff] : NULL;
-    if (entry == NULL)
-        return 0;
-    while (entry->nextchar) {
-        if (entry->nextchar == uv2)
-            return entry->composite;
-        entry++;
-    }
-    return 0;
-}
-
-/* Combining class of uv, read from the plane/row table UNF_combin;
- * 0 when uv is beyond VALID_UTF_MAX or the table has no entry for it. */
-static U8 getCombinClass(UV uv)
-{
-    U8 **plane;
-    U8 *row = NULL;
-
-    if (!OVER_UTF_MAX(uv) && (plane = (U8**)UNF_combin[uv >> 16]) != NULL)
-        row = plane[(uv >> 8) & 0xff];
-    return (row != NULL) ? row[uv & 0xff] : 0;
-}
-
-/* Append the jamo decomposition (L, V and optional T) of the Hangul syllable
- * uv at d as UTF-8; returns the new end, or d unchanged for non-syllables. */
-static U8* pv_cat_decompHangul(pTHX_ U8* d, UV uv)
-{
-    UV s_off, t_off;
-    if (! Hangul_IsS(uv))
-        return d;
-    s_off = uv - Hangul_SBase;
-    t_off = s_off % Hangul_TCount;
-
-    d = uvchr_to_utf8(d, (s_off / Hangul_NCount + Hangul_LBase));
-    d = uvchr_to_utf8(d, ((s_off % Hangul_NCount) / Hangul_TCount + Hangul_VBase));
-    if (t_off != 0)
-        d = uvchr_to_utf8(d, (t_off + Hangul_TBase));
-    return d;
-}
-
-/* UTF-8 view of sv's string (an upgraded mortal copy if sv is not UTF-8); byte length goes to *lp. */
-static char* sv_2pvunicode(pTHX_ SV *sv, STRLEN *lp)
-{
-    STRLEN bytes;
-    char *str = SvPV(sv, bytes);
-    if (!SvUTF8(sv)) {
-        SV* copy = sv_2mortal(newSVpvn(str, bytes));
-        if (!SvPOK(copy))
-            str = SvPV_force(copy, bytes);
-        sv_utf8_upgrade(copy);
-        str = SvPV(copy, bytes);
-    }
-    if (lp != NULL)
-        *lp = bytes;
-    return str;
-}
-
-/* Decompose the UTF-8 string s (slen bytes) into the growable buffer *dp of
- * capacity dlen; iscompat selects compatibility over canonical mappings.
- * Stores the (possibly reallocated) buffer in *dp and returns the output end. */
-static
-U8* pv_utf8_decompose(pTHX_ U8* s, STRLEN slen, U8** dp, STRLEN dlen, bool iscompat)
-{
-    U8* const e = s + slen;
-    U8* p;
-    U8* dstart = *dp;
-    U8* d = dstart;
-
-    for (p = s; p < e; ) {
-        U8* r;
-        STRLEN retlen;
-        UV uv = utf8n_to_uvchr(p, e - p, &retlen, AllowAnyUTF);
-        if (retlen == 0)
-            croak(ErrRetlenIsZero, "decompose");
-        p += retlen;
-
-        if (Hangul_IsS(uv)) {       /* syllable: at most three jamo */
-            Renew_d_if_not_enough_to(UTF8_MAXLEN * 3)
-            d = pv_cat_decompHangul(aTHX_ d, uv);
-        }
-        else if ((r = iscompat ? dec_compat(uv) : dec_canonical(uv)) != NULL) {
-            STRLEN len = (STRLEN)strlen((char *)r);  /* mapping is NUL-terminated */
-            Renew_d_if_not_enough_to(len)
-            for (; len > 0; len--)
-                *d++ = *r++;
-        }
-        else {                      /* no mapping: copy the character through */
-            Renew_d_if_not_enough_to(UTF8_MAXLEN)
-            d = uvchr_to_utf8(d, uv);
-        }
-    }
-    *dp = dstart;
-    return d;
-}
-
-static /* Copy s (slen bytes of UTF-8) to *dp, sorting each run of characters with non-zero combining class by class. */
-U8* pv_utf8_reorder(pTHX_ U8* s, STRLEN slen, U8** dp, STRLEN dlen) /* dlen: capacity of *dp; returns the end of the written output */
-{
- U8* p = s; /* read cursor */
- U8* e = s + slen; /* end of input */
- U8* dstart = *dp; /* output buffer; Renew_d_if_not_enough_to may move it */
- U8* d = dstart; /* write cursor */
-
- UNF_cc seq_ary[CC_SEQ_SIZE]; /* on-stack storage for the pending run */
- UNF_cc* seq_ptr = seq_ary; /* use array at the beginning */
- UNF_cc* seq_ext = NULL; /* heap storage, allocated only when a run outgrows seq_ary */
- STRLEN seq_max = CC_SEQ_SIZE; /* capacity of whatever seq_ptr points at */
- STRLEN cc_pos = 0; /* number of characters in the pending run */
-
- while (p < e) {
- U8 curCC;
- STRLEN retlen;
- UV uv = utf8n_to_uvchr(p, e - p, &retlen, AllowAnyUTF);
- if (!retlen)
- croak(ErrRetlenIsZero, "reorder");
- p += retlen;
-
- curCC = getCombinClass(uv);
-
- if (curCC != 0) { /* non-starter: queue it instead of writing it */
- if (seq_max < cc_pos + 1) { /* extend if need */
- seq_max = cc_pos + CC_SEQ_STEP; /* new size */
- if (CC_SEQ_SIZE == cc_pos) { /* seq_ary full */
- STRLEN i;
- New(0, seq_ext, seq_max, UNF_cc);
- for (i = 0; i < cc_pos; i++) /* carry over what was queued on the stack */
- seq_ext[i] = seq_ary[i];
- }
- else {
- Renew(seq_ext, seq_max, UNF_cc);
- }
- seq_ptr = seq_ext; /* use seq_ext from now */
- }
-
- seq_ptr[cc_pos].cc = curCC;
- seq_ptr[cc_pos].uv = uv;
- seq_ptr[cc_pos].pos = cc_pos; /* original order, used by compare_cc for ties */
- ++cc_pos;
-
- if (p < e) /* more input: keep collecting; at end of input fall through and flush */
- continue;
- }
-
- /* output */
- if (cc_pos) { /* flush the pending run in sorted order */
- STRLEN i;
-
- if (cc_pos > 1) /* reordered if there are two c.c.'s */
- qsort((void*)seq_ptr, cc_pos, sizeof(UNF_cc), compare_cc);
-
- for (i = 0; i < cc_pos; i++) {
- Renew_d_if_not_enough_to(UTF8_MAXLEN)
- d = uvchr_to_utf8(d, seq_ptr[i].uv);
- }
- cc_pos = 0;
- }
-
- if (curCC == 0) { /* the class-0 character itself is written after the flushed run */
- Renew_d_if_not_enough_to(UTF8_MAXLEN)
- d = uvchr_to_utf8(d, uv);
- }
- }
- if (seq_ext)
- Safefree(seq_ext);
- *dp = dstart; /* hand the possibly reallocated buffer back to the caller */
- return d;
-}
-
-static /* Compose s (slen bytes of UTF-8) into *dp by combining each starter with following characters via composite_uv(). */
-U8* pv_utf8_compose(pTHX_ U8* s, STRLEN slen, U8** dp, STRLEN dlen, bool iscontig) /* iscontig: stop combining once any character is pending; returns the output end */
-{
- U8* p = s; /* read cursor */
- U8* e = s + slen; /* end of input */
- U8* dstart = *dp; /* output buffer; Renew_d_if_not_enough_to may move it */
- U8* d = dstart; /* write cursor */
-
- UV uvS = 0; /* code point of the starter */
- bool valid_uvS = FALSE; /* if FALSE, uvS isn't initialized yet */
- U8 preCC = 0; /* combining class of the last character that was not composed */
-
- UV seq_ary[CC_SEQ_SIZE]; /* on-stack storage for characters left uncomposed after the starter */
- UV* seq_ptr = seq_ary; /* use array at the beginning */
- UV* seq_ext = NULL; /* extend if need */
- STRLEN seq_max = CC_SEQ_SIZE; /* capacity of whatever seq_ptr points at */
- STRLEN cc_pos = 0; /* number of pending uncomposed characters */
-
- while (p < e) {
- U8 curCC;
- STRLEN retlen;
- UV uv = utf8n_to_uvchr(p, e - p, &retlen, AllowAnyUTF);
- if (!retlen)
- croak(ErrRetlenIsZero, "compose");
- p += retlen;
-
- curCC = getCombinClass(uv);
-
- if (!valid_uvS) {
- if (curCC == 0) {
- uvS = uv; /* the first Starter is found */
- valid_uvS = TRUE;
- if (p < e) /* at end of input fall through so the lone starter is written */
- continue;
- }
- else { /* class != 0 before any starter: written through unchanged */
- Renew_d_if_not_enough_to(UTF8_MAXLEN)
- d = uvchr_to_utf8(d, uv);
- continue;
- }
- }
- else {
- bool composed;
-
- /* blocked */
- if ((iscontig && cc_pos) || /* discontiguous combination */
- (curCC != 0 && preCC == curCC) || /* blocked by same CC */
- (preCC > curCC)) /* blocked by higher CC: revised D2 */
- composed = FALSE;
-
- /* not blocked:
- iscontig && cc_pos == 0 -- contiguous combination
- curCC == 0 && preCC == 0 -- starter + starter
- curCC != 0 && preCC < curCC -- lower CC */
- else {
- /* try composition */
- UV uvComp = composite_uv(uvS, uv);
-
- if (uvComp && !isExclusion(uvComp)) {
- uvS = uvComp; /* the composite replaces the starter; uv is consumed */
- composed = TRUE;
-
- /* preCC should not be changed to curCC */
- /* e.g. 1E14 = 0045 0304 0300 where CC(0304) == CC(0300) */
- if (p < e)
- continue;
- }
- else
- composed = FALSE;
- }
-
- if (!composed) {
- preCC = curCC;
- if (curCC != 0 || !(p < e)) { /* queue a non-starter, or whatever is the last input character */
- if (seq_max < cc_pos + 1) { /* extend if need */
- seq_max = cc_pos + CC_SEQ_STEP; /* new size */
- if (CC_SEQ_SIZE == cc_pos) { /* seq_ary full */
- New(0, seq_ext, seq_max, UV);
- Copy(seq_ary, seq_ext, cc_pos, UV);
- }
- else {
- Renew(seq_ext, seq_max, UV);
- }
- seq_ptr = seq_ext; /* use seq_ext from now */
- }
- seq_ptr[cc_pos] = uv;
- ++cc_pos;
- }
- if (curCC != 0 && p < e) /* keep scanning for characters that may still combine with uvS */
- continue;
- }
- }
-
- /* output */
- {
- Renew_d_if_not_enough_to(UTF8_MAXLEN)
- d = uvchr_to_utf8(d, uvS); /* starter (composed or not) */
- }
-
- if (cc_pos) { /* then the characters that stayed uncomposed, in input order */
- STRLEN i;
-
- for (i = 0; i < cc_pos; i++) {
- Renew_d_if_not_enough_to(UTF8_MAXLEN)
- d = uvchr_to_utf8(d, seq_ptr[i]);
- }
- cc_pos = 0;
- }
-
- uvS = uv; /* reached with an uncomposed class-0 uv or at end of input; uv is the next starter */
- }
- if (seq_ext)
- Safefree(seq_ext);
- *dp = dstart; /* hand the possibly reallocated buffer back to the caller */
- return d;
-}
-
-MODULE = Unicode::Normalize PACKAGE = Unicode::Normalize
-
-SV*
-decompose(src, compat = &PL_sv_no)
- SV * src
- SV * compat
- PROTOTYPE: $;$
- PREINIT:
- SV* dst; /* returned string: the decomposition of src, flagged UTF-8 */
- U8 *s, *d, *dend; /* input bytes, work buffer start, end of the data written */
- STRLEN slen, dlen;
- CODE:
- s = (U8*)sv_2pvunicode(aTHX_ src,&slen);
- dst = newSVpvn("", 0);
- dlen = slen; /* initial capacity only; pv_utf8_decompose grows the buffer on demand */
- New(0, d, dlen+1, U8);
- dend = pv_utf8_decompose(aTHX_ s, slen, &d, dlen, (bool)SvTRUE(compat)); /* a true compat selects dec_compat() mappings */
- sv_setpvn(dst, (char *)d, dend - d); /* copy the result into dst ... */
- SvUTF8_on(dst);
- Safefree(d); /* ... so the work buffer can be released */
- RETVAL = dst;
- OUTPUT:
- RETVAL
-
-
-SV*
-reorder(src)
- SV * src
- PROTOTYPE: $
- PREINIT:
- SV* dst; /* returned string: src with combining characters reordered, flagged UTF-8 */
- U8 *s, *d, *dend; /* input bytes, work buffer start, end of the data written */
- STRLEN slen, dlen;
- CODE:
- s = (U8*)sv_2pvunicode(aTHX_ src,&slen);
- dst = newSVpvn("", 0);
- dlen = slen; /* initial capacity only; pv_utf8_reorder grows the buffer on demand */
- New(0, d, dlen+1, U8);
- dend = pv_utf8_reorder(aTHX_ s, slen, &d, dlen);
- sv_setpvn(dst, (char *)d, dend - d); /* copy the result into dst ... */
- SvUTF8_on(dst);
- Safefree(d); /* ... so the work buffer can be released */
- RETVAL = dst;
- OUTPUT:
- RETVAL
-
-
-SV*
-compose(src)
- SV * src
- PROTOTYPE: $
- ALIAS:
- composeContiguous = 1
- PREINIT:
- SV* dst; /* returned string: the composition of src, flagged UTF-8 */
- U8 *s, *d, *dend; /* input bytes, work buffer start, end of the data written */
- STRLEN slen, dlen;
- CODE:
- s = (U8*)sv_2pvunicode(aTHX_ src,&slen);
- dst = newSVpvn("", 0);
- dlen = slen; /* initial capacity only; pv_utf8_compose grows the buffer on demand */
- New(0, d, dlen+1, U8);
- dend = pv_utf8_compose(aTHX_ s, slen, &d, dlen, (bool)ix); /* ix is 1 under the composeContiguous alias, turning on iscontig */
- sv_setpvn(dst, (char *)d, dend - d); /* copy the result into dst ... */
- SvUTF8_on(dst);
- Safefree(d); /* ... so the work buffer can be released */
- RETVAL = dst;
- OUTPUT:
- RETVAL
-
-
-SV*
-NFD(src)
- SV * src
- PROTOTYPE: $
- ALIAS:
- NFKD = 1
- PREINIT:
- SV *dst; /* returned string: src decomposed and then reordered, flagged UTF-8 */
- U8 *s, *t, *tend, *d, *dend; /* s: input; t..tend: decomposed text; d..dend: reordered text */
- STRLEN slen, tlen, dlen;
- CODE:
- s = (U8*)sv_2pvunicode(aTHX_ src,&slen);
-
- /* decompose */
- tlen = slen;
- New(0, t, tlen+1, U8);
- tend = pv_utf8_decompose(aTHX_ s, slen, &t, tlen, (bool)(ix==1)); /* ix==1 (NFKD alias) selects compatibility mappings */
- *tend = '\0';
- tlen = tend - t; /* no longer know real size of t */
-
- /* reorder */
- dlen = tlen;
- New(0, d, dlen+1, U8);
- dend = pv_utf8_reorder(aTHX_ t, tlen, &d, dlen);
- *dend = '\0';
- dlen = dend - d; /* no longer know real size of d */
-
- /* return */
- dst = newSVpvn("", 0);
- sv_setpvn(dst, (char *)d, dlen); /* dst receives a copy, so both work buffers are freed below */
- SvUTF8_on(dst);
-
- Safefree(t);
- Safefree(d);
- RETVAL = dst;
- OUTPUT:
- RETVAL
-
-
-SV*
-NFC(src)
- SV * src
- PROTOTYPE: $
- ALIAS:
- NFKC = 1
- FCC = 2
- PREINIT:
- SV *dst; /* returned string: src decomposed, reordered and then composed, flagged UTF-8 */
- U8 *s, *t, *tend, *u, *uend, *d, *dend; /* s: input; t: decomposed; u: reordered; d: composed */
- STRLEN slen, tlen, ulen, dlen;
- CODE:
- s = (U8*)sv_2pvunicode(aTHX_ src,&slen);
-
- /* decompose */
- tlen = slen;
- New(0, t, tlen+1, U8);
- tend = pv_utf8_decompose(aTHX_ s, slen, &t, tlen, (bool)(ix==1)); /* ix==1 (NFKC alias) selects compatibility mappings */
- *tend = '\0';
- tlen = tend - t; /* no longer know real size of t */
-
- /* reorder */
- ulen = tlen;
- New(0, u, ulen+1, U8);
- uend = pv_utf8_reorder(aTHX_ t, tlen, &u, ulen);
- *uend = '\0';
- ulen = uend - u; /* no longer know real size of u */
-
- /* compose */
- dlen = ulen;
- New(0, d, dlen+1, U8);
- dend = pv_utf8_compose(aTHX_ u, ulen, &d, dlen, (bool)(ix==2)); /* ix==2 (FCC alias) requests contiguous composition */
- *dend = '\0';
- dlen = dend - d; /* no longer know real size of d */
-
- /* return */
- dst = newSVpvn("", 0);
- sv_setpvn(dst, (char *)d, dlen); /* dst receives a copy, so all three work buffers are freed below */
- SvUTF8_on(dst);
-
- Safefree(t);
- Safefree(u);
- Safefree(d);
- RETVAL = dst;
- OUTPUT:
- RETVAL
-
-
-SV*
-checkNFD(src)
- SV * src
- PROTOTYPE: $
- ALIAS:
- checkNFKD = 1
- PREINIT:
- STRLEN srclen, retlen;
- U8 *s, *e, *p, curCC, preCC; /* s..e: input range, p: cursor; curCC/preCC: classes of this and the previous character */
- bool result = TRUE; /* stays TRUE unless a character below disqualifies the string */
- CODE:
- s = (U8*)sv_2pvunicode(aTHX_ src,&srclen);
- e = s + srclen;
-
- preCC = 0;
- for (p = s; p < e; p += retlen) {
- UV uv = utf8n_to_uvchr(p, e - p, &retlen, AllowAnyUTF);
- if (!retlen)
- croak(ErrRetlenIsZero, "checkNFD or -NFKD");
-
- curCC = getCombinClass(uv);
- if (preCC > curCC && curCC != 0) { /* canonical ordering violated */
- result = FALSE;
- break;
- }
- if (Hangul_IsS(uv) || (ix ? dec_compat(uv) : dec_canonical(uv))) { /* character still has a decomposition (ix set: compatibility table) */
- result = FALSE;
- break;
- }
- preCC = curCC;
- }
- RETVAL = boolSV(result);
- OUTPUT:
- RETVAL
-
-
-SV*
-checkNFC(src)
- SV * src
- PROTOTYPE: $
- ALIAS:
- checkNFKC = 1
- PREINIT:
- STRLEN srclen, retlen;
- U8 *s, *e, *p, curCC, preCC; /* s..e: input range, p: cursor; curCC/preCC: classes of this and the previous character */
- bool result = TRUE; /* FALSE once a disqualifying character is seen */
- bool isMAYBE = FALSE; /* set when a character satisfies isComp2nd(); yields undef instead of true */
- CODE:
- s = (U8*)sv_2pvunicode(aTHX_ src,&srclen);
- e = s + srclen;
-
- preCC = 0;
- for (p = s; p < e; p += retlen) {
- UV uv = utf8n_to_uvchr(p, e - p, &retlen, AllowAnyUTF);
- if (!retlen)
- croak(ErrRetlenIsZero, "checkNFC or -NFKC");
-
- curCC = getCombinClass(uv);
- if (preCC > curCC && curCC != 0) { /* canonical ordering violated */
- result = FALSE;
- break;
- }
-
- /* get NFC/NFKC property */
- if (Hangul_IsS(uv)) /* Hangul syllables are canonical composites */
- ; /* YES */
- else if (isExclusion(uv) || isSingleton(uv) || isNonStDecomp(uv)) {
- result = FALSE;
- break;
- }
- else if (isComp2nd(uv))
- isMAYBE = TRUE;
- else if (ix) { /* checkNFKC only */
- char *canon, *compat;
- /* NFKC_NO when having compatibility mapping. */
- canon = (char *) dec_canonical(uv);
- compat = (char *) dec_compat(uv);
- if (compat && !(canon && strEQ(canon, compat))) { /* compat mapping exists and differs from the canonical one */
- result = FALSE;
- break;
- }
- } /* end of get NFC/NFKC property */
-
- preCC = curCC;
- }
- if (isMAYBE && result) /* NO precedes MAYBE */
- XSRETURN_UNDEF;
- RETVAL = boolSV(result);
- OUTPUT:
- RETVAL
-
-
-SV*
-checkFCD(src)
- SV * src
- PROTOTYPE: $
- ALIAS:
- checkFCC = 1
- PREINIT:
- STRLEN srclen, retlen;
- U8 *s, *e, *p, curCC, preCC; /* curCC: class of this character's leading code point; preCC: class of the previous trailing one */
- bool result = TRUE; /* FALSE once a disqualifying character is seen */
- bool isMAYBE = FALSE; /* only ever set under the checkFCC alias (ix true) */
- CODE:
- s = (U8*)sv_2pvunicode(aTHX_ src,&srclen);
- e = s + srclen;
- preCC = 0;
- for (p = s; p < e; p += retlen) {
- U8 *sCan; /* canonical decomposition of uv, or NULL */
- UV uvLead; /* first code point of that decomposition, or uv itself */
- STRLEN canlen = 0;
- UV uv = utf8n_to_uvchr(p, e - p, &retlen, AllowAnyUTF);
- if (!retlen)
- croak(ErrRetlenIsZero, "checkFCD or -FCC");
-
- sCan = (U8*) dec_canonical(uv);
-
- if (sCan) {
- STRLEN canret;
- canlen = (STRLEN)strlen((char *) sCan);
- uvLead = utf8n_to_uvchr(sCan, canlen, &canret, AllowAnyUTF);
- if (!canret)
- croak(ErrRetlenIsZero, "checkFCD or -FCC");
- }
- else {
- uvLead = uv;
- }
-
- curCC = getCombinClass(uvLead);
-
- if (curCC != 0 && curCC < preCC) { /* canonical ordering violated */
- result = FALSE;
- break;
- }
-
- if (ix) { /* checkFCC only: same NO/MAYBE tests as checkNFC applies */
- if (isExclusion(uv) || isSingleton(uv) || isNonStDecomp(uv)) {
- result = FALSE;
- break;
- }
- else if (isComp2nd(uv))
- isMAYBE = TRUE;
- }
-
- if (sCan) { /* next iteration compares against the class of the decomposition's last code point */
- STRLEN canret;
- UV uvTrail;
- U8* eCan = sCan + canlen;
- U8* pCan = utf8_hop(eCan, -1); /* step back one character from the end */
- if (pCan < sCan)
- croak(ErrHopBeforeStart);
- uvTrail = utf8n_to_uvchr(pCan, eCan - pCan, &canret, AllowAnyUTF);
- if (!canret)
- croak(ErrRetlenIsZero, "checkFCD or -FCC");
- preCC = getCombinClass(uvTrail);
- }
- else {
- preCC = curCC;
- }
- }
- if (isMAYBE && result) /* NO precedes MAYBE */
- XSRETURN_UNDEF;
- RETVAL = boolSV(result);
- OUTPUT:
- RETVAL
-
-
-U8
-getCombinClass(uv)
- UV uv
- PROTOTYPE: $
-
-bool
-isExclusion(uv)
- UV uv
- PROTOTYPE: $
-
-bool
-isSingleton(uv)
- UV uv
- PROTOTYPE: $
-
-bool
-isNonStDecomp(uv)
- UV uv
- PROTOTYPE: $
-
-bool
-isComp2nd(uv)
- UV uv
- PROTOTYPE: $
- ALIAS:
- isNFC_MAYBE = 1
- isNFKC_MAYBE = 2
- INIT:
- PERL_UNUSED_VAR(ix); /* the aliases all run the same code, so the alias index is deliberately unused */
-
-SV*
-isNFD_NO(uv)
- UV uv
- PROTOTYPE: $
- ALIAS:
- isNFKD_NO = 1
- PREINIT:
- bool result = FALSE; /* becomes TRUE when uv is a Hangul syllable or has a decomposition entry */
- CODE:
- if (Hangul_IsS(uv) || (ix ? dec_compat(uv) : dec_canonical(uv))) /* ix set (isNFKD_NO): consult the compatibility table */
- result = TRUE; /* NFD_NO or NFKD_NO */
- RETVAL = boolSV(result);
- OUTPUT:
- RETVAL
-
-
-SV*
-isComp_Ex(uv)
- UV uv
- PROTOTYPE: $
- ALIAS:
- isNFC_NO = 0
- isNFKC_NO = 1
- PREINIT:
- bool result = FALSE; /* becomes TRUE when one of the tests below matches uv */
- CODE:
- if (isExclusion(uv) || isSingleton(uv) || isNonStDecomp(uv))
- result = TRUE; /* NFC_NO or NFKC_NO */
- else if (ix) { /* isNFKC_NO only */
- char *canon, *compat;
- canon = (char *) dec_canonical(uv);
- compat = (char *) dec_compat(uv);
- if (compat && (!canon || strNE(canon, compat))) /* compat mapping exists and differs from the canonical one */
- result = TRUE; /* NFC_NO or NFKC_NO */
- }
- RETVAL = boolSV(result);
- OUTPUT:
- RETVAL
-
-SV*
-getComposite(uv, uv2)
- UV uv
- UV uv2
- PROTOTYPE: $$
- PREINIT:
- UV composite; /* result of composite_uv(); 0 means the pair does not compose */
- CODE:
- composite = composite_uv(uv, uv2);
- RETVAL = composite ? newSVuv(composite) : &PL_sv_undef; /* undef rather than 0 when there is no composite */
- OUTPUT:
- RETVAL
-
-
-
-SV*
-getCanon(uv)
- UV uv
- PROTOTYPE: $
- ALIAS:
- getCompat = 1
- CODE:
- if (Hangul_IsS(uv)) { /* Hangul syllables are decomposed arithmetically, not via the tables */
- U8 tmp[3 * UTF8_MAXLEN + 1]; /* pv_cat_decompHangul writes at most three characters */
- U8 *t = tmp;
- U8 *e = pv_cat_decompHangul(aTHX_ t, uv);
- RETVAL = newSVpvn((char *)t, e - t);
- } else {
- U8* rstr = ix ? dec_compat(uv) : dec_canonical(uv); /* ix set (getCompat): compatibility table */
- if (!rstr) /* no mapping for uv: return undef */
- XSRETURN_UNDEF;
- RETVAL = newSVpvn((char *)rstr, strlen((char *)rstr));
- }
- SvUTF8_on(RETVAL);
- OUTPUT:
- RETVAL
-
-
-void
-splitOnLastStarter(src)
- SV * src
- PREINIT:
- SV *svp; /* each of the two mortal strings pushed as the return list */
- STRLEN srclen;
- U8 *s, *e, *p; /* s..e: input range; p: split point, moved backwards from e */
- PPCODE:
- s = (U8*)sv_2pvunicode(aTHX_ src,&srclen);
- e = s + srclen;
- p = e;
- while (s < p) { /* walk backwards one character at a time */
- UV uv;
- p = utf8_hop(p, -1);
- if (p < s)
- croak(ErrHopBeforeStart);
- uv = utf8n_to_uvchr(p, e - p, NULL, AllowAnyUTF);
- if (getCombinClass(uv) == 0) /* Last Starter found */
- break;
- }
-
- svp = sv_2mortal(newSVpvn((char*)s, p - s)); /* first value: everything before the split point */
- SvUTF8_on(svp);
- XPUSHs(svp);
-
- svp = sv_2mortal(newSVpvn((char*)p, e - p)); /* second value: from the split point to the end (all of src if no class-0 character was found) */
- SvUTF8_on(svp);
- XPUSHs(svp);
-