#include "EXTERN.h"
#include "perl.h"
#include "XSUB.h"

/* These 5 files are prepared by mkheader */
#include "unfcmb.h"
#include "unfcan.h"
#include "unfcpt.h"
#include "unfcmp.h"
#include "unfexc.h"

/* Perl 5.6.1 ? */
#ifndef uvuni_to_utf8
#define uvuni_to_utf8   uv_to_utf8
#endif /* uvuni_to_utf8 */

/* Perl 5.6.1 ? */
#ifndef utf8n_to_uvuni
#define utf8n_to_uvuni  utf8_to_uv
#endif /* utf8n_to_uvuni */

/* UTF8_ALLOW_BOM is used before Perl 5.8.0 */
#ifdef UTF8_ALLOW_BOM
#define AllowAnyUTF (UTF8_ALLOW_SURROGATE|UTF8_ALLOW_BOM|UTF8_ALLOW_FFFF)
#else
#define AllowAnyUTF (UTF8_ALLOW_SURROGATE|UTF8_ALLOW_FFFF)
#endif

/* if utf8n_to_uvuni() sets retlen to 0 (?) */
#define ErrRetlenIsZero "panic (Unicode::Normalize): zero-length character"

/* utf8_hop() hops back before start. Maybe broken UTF-8 */
#define ErrHopBeforeStart "panic (Unicode::Normalize): hopping before start"

/* At present, char > 0x10ffff are unaffected without complaint, right? */
#define VALID_UTF_MAX    (0x10ffff)
#define OVER_UTF_MAX(uv) (VALID_UTF_MAX < (uv))

/* HANGUL_H */
#define Hangul_SBase  0xAC00
#define Hangul_SFinal 0xD7A3
#define Hangul_SCount  11172

#define Hangul_NCount    588

#define Hangul_LBase  0x1100
#define Hangul_LFinal 0x1112
#define Hangul_LCount     19

#define Hangul_VBase  0x1161
#define Hangul_VFinal 0x1175
#define Hangul_VCount     21

/* TBase itself stands for "no trailing consonant", hence the strict '<'
 * in Hangul_IsT below. */
#define Hangul_TBase  0x11A7
#define Hangul_TFinal 0x11C2
#define Hangul_TCount     28

#define Hangul_IsS(u)  ((Hangul_SBase <= (u)) && ((u) <= Hangul_SFinal))
#define Hangul_IsN(u)  (((u) - Hangul_SBase) % Hangul_TCount == 0)
#define Hangul_IsLV(u) (Hangul_IsS(u) && Hangul_IsN(u))
#define Hangul_IsL(u)  ((Hangul_LBase <= (u)) && ((u) <= Hangul_LFinal))
#define Hangul_IsV(u)  ((Hangul_VBase <= (u)) && ((u) <= Hangul_VFinal))
#define Hangul_IsT(u)  ((Hangul_TBase  < (u)) && ((u) <= Hangul_TFinal))
/* HANGUL_H */

/*
 * One entry of the scratch array used for canonical ordering of
 * combining characters (c.c.).
 */
typedef struct {
    U8 cc;      /* combining class */
    UV uv;      /* codepoint */
    STRLEN pos; /* position within the current run of combining characters */
} UNF_cc;

/*
 * qsort() comparator for UNF_cc entries.
 *
 * Orders by combining class first; entries with equal class are ordered by
 * their original position, which makes the overall sort stable even though
 * qsort() itself gives no such guarantee.
 */
static int
compare_cc (const void *a, const void *b)
{
    const UNF_cc *lhs = (const UNF_cc *) a;
    const UNF_cc *rhs = (const UNF_cc *) b;
    int ret_cc = lhs->cc - rhs->cc;

    if (ret_cc)
        return ret_cc;
    return (lhs->pos > rhs->pos) - (lhs->pos < rhs->pos);
}

/*
 * Look up the canonical decomposition of uv in the three-level UNF_canon
 * table (plane = uv >> 16, row = bits 8..15, cell = low 8 bits).
 *
 * Returns the table entry, which callers treat as a NUL-terminated UTF-8
 * string, or NULL when uv is above VALID_UTF_MAX or has no entry.
 */
static U8*
dec_canonical (UV uv)
{
    U8 ***plane, **row;

    if (OVER_UTF_MAX(uv))
        return NULL;
    plane = (U8***)UNF_canon[uv >> 16];
    if (! plane)
        return NULL;
    row = plane[(uv >> 8) & 0xff];
    return row ? row[uv & 0xff] : NULL;
}

/*
 * Same lookup as dec_canonical(), but against the UNF_compat table
 * (compatibility decomposition).
 */
static U8*
dec_compat (UV uv)
{
    U8 ***plane, **row;

    if (OVER_UTF_MAX(uv))
        return NULL;
    plane = (U8***)UNF_compat[uv >> 16];
    if (! plane)
        return NULL;
    row = plane[(uv >> 8) & 0xff];
    return row ? row[uv & 0xff] : NULL;
}

/*
 * Return the code point obtained by composing uv followed by uv2, or 0 when
 * the pair does not compose (or either argument is out of range, or uv2 is
 * zero).
 *
 * Hangul L+V and LV+T pairs are composed arithmetically; everything else is
 * looked up in UNF_compos, whose cells are arrays of (nextchar, composite)
 * pairs terminated by an entry with nextchar == 0.
 */
static UV
composite_uv (UV uv, UV uv2)
{
    UNF_complist ***plane, **row, *cell, *i;

    if (! uv2 || OVER_UTF_MAX(uv) || OVER_UTF_MAX(uv2))
        return 0;

    if (Hangul_IsL(uv) && Hangul_IsV(uv2)) {
        uv  -= Hangul_LBase; /* lindex */
        uv2 -= Hangul_VBase; /* vindex */
        return(Hangul_SBase + (uv * Hangul_VCount + uv2) * Hangul_TCount);
    }
    if (Hangul_IsLV(uv) && Hangul_IsT(uv2)) {
        uv2 -= Hangul_TBase; /* tindex */
        return(uv + uv2);
    }
    plane = UNF_compos[uv >> 16];
    if (! plane)
        return 0;
    row = plane[(uv >> 8) & 0xff];
    if (! row)
        return 0;
    cell = row[uv & 0xff];
    if (! cell)
        return 0;
    for (i = cell; i->nextchar; i++) {
        if (uv2 == i->nextchar)
            return i->composite;
    }
    return 0;
}

/*
 * Return the canonical combining class of uv from the UNF_combin table,
 * or 0 when uv is above VALID_UTF_MAX or has no entry.
 */
static U8
getCombinClass (UV uv)
{
    U8 **plane, *row;

    if (OVER_UTF_MAX(uv))
        return 0;
    plane = (U8**)UNF_combin[uv >> 16];
    if (! plane)
        return 0;
    row = plane[(uv >> 8) & 0xff];
    return row ? row[uv & 0xff] : 0;
}

/*
 * Append the jamo decomposition (L, V and, if present, T) of the Hangul
 * syllable uv to sv as UTF-8.  Does nothing when uv is not a Hangul
 * syllable.
 */
static void
sv_cat_decompHangul (SV* sv, UV uv)
{
    UV sindex, lindex, vindex, tindex;
    U8 *t, tmp[3 * UTF8_MAXLEN + 1];

    if (! Hangul_IsS(uv))
        return;

    sindex =  uv - Hangul_SBase;
    lindex =  sindex / Hangul_NCount;
    vindex = (sindex % Hangul_NCount) / Hangul_TCount;
    tindex =  sindex % Hangul_TCount;

    t = tmp;
    t = uvuni_to_utf8(t, (lindex + Hangul_LBase));
    t = uvuni_to_utf8(t, (vindex + Hangul_VBase));
    if (tindex) /* tindex 0 means "no trailing consonant" */
        t = uvuni_to_utf8(t, (tindex + Hangul_TBase));
    *t = '\0';
    sv_catpvn(sv, (char *)tmp, t - tmp);
}

/*
 * Append the UTF-8 encoding of the single code point uv to sv.
 */
static void
sv_cat_uvuni (SV* sv, UV uv)
{
    U8 *t, tmp[UTF8_MAXLEN + 1];

    t = tmp;
    t = uvuni_to_utf8(t, uv);
    *t = '\0';
    sv_catpvn(sv, (char *)tmp, t - tmp);
}

/*
 * Return a pointer to the string value of sv encoded as UTF-8 and store its
 * byte length in *lp.
 *
 * When sv does not carry the UTF8 flag the original is left untouched: a
 * mortal copy is made, forced to a string if necessary, upgraded with
 * sv_utf8_upgrade(), and the copy's buffer is returned instead.
 */
static char *
sv_2pvunicode(SV *sv, STRLEN *lp)
{
    char *s;
    STRLEN len;

    s = (char*)SvPV(sv,len);
    if (!SvUTF8(sv)) {
        SV* tmpsv = sv_mortalcopy(sv);
        if (!SvPOK(tmpsv))
            (void)sv_pvn_force(tmpsv,&len);
        sv_utf8_upgrade(tmpsv);
        s = (char*)SvPV(tmpsv,len);
    }
    *lp = len;
    return s;
}

MODULE = Unicode::Normalize	PACKAGE = Unicode::Normalize

SV*
decompose(src, compat = &PL_sv_no)
    SV * src
    SV * compat
  PROTOTYPE: $;$
  PREINIT:
    SV *dst;
    STRLEN srclen, retlen;
    U8 *s, *e, *p, *r;
    UV uv;
    bool iscompat;
  CODE:
    /* Returns a new UTF-8 string in which every character of src is
     * replaced by its decomposition: compatibility if compat is true,
     * canonical otherwise.  Hangul syllables are decomposed
     * arithmetically; characters without a mapping are copied through. */
    iscompat = SvTRUE(compat);
    s = (U8*)sv_2pvunicode(src,&srclen);
    e = s + srclen;
    /* start from a real (NUL-terminated) empty string so that an empty
     * result is still a well-formed PV */
    dst = newSVpvn("", 0);
    SvUTF8_on(dst);
    for (p = s; p < e; p += retlen) {
        uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
        if (!retlen) {
            SvREFCNT_dec(dst); /* not yet mortal: release before croaking */
            croak(ErrRetlenIsZero);
        }
        if (Hangul_IsS(uv))
            sv_cat_decompHangul(dst, uv);
        else {
            r = iscompat ? dec_compat(uv) : dec_canonical(uv);
            if (r)
                sv_catpv(dst, (char *)r);
            else
                sv_cat_uvuni(dst, uv);
        }
    }
    RETVAL = dst;
  OUTPUT:
    RETVAL

SV*
reorder(src)
    SV * src
  PROTOTYPE: $
  PREINIT:
    SV *dst;
    STRLEN srclen, dstlen, retlen, stk_cc_max;
    U8 *s, *e, *p, *d, curCC;
    UV uv, uvlast = 0;
    UNF_cc * stk_cc;
    STRLEN i, cc_pos;
    bool valid_uvlast;
  CODE:
    /* Returns a new UTF-8 string in which every run of characters with a
     * non-zero combining class is sorted by combining class, keeping the
     * original order among characters of equal class. */
    s = (U8*)sv_2pvunicode(src,&srclen);
    e = s + srclen;
    dstlen = srclen + 1;
    dst = newSV(dstlen);
    (void)SvPOK_only(dst);
    SvUTF8_on(dst);
    d = (U8*)SvPVX(dst);
    stk_cc_max = 10; /* enough as an initial value? */
    New(0, stk_cc, stk_cc_max, UNF_cc);
    for (p = s; p < e;) {
        uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
        if (!retlen) {
            Safefree(stk_cc);
            SvREFCNT_dec(dst);
            croak(ErrRetlenIsZero);
        }
        p += retlen;
        curCC = getCombinClass(uv);
        if (curCC == 0) {
            d = uvuni_to_utf8(d, uv);
            continue;
        }
        /* first combining character of a run */
        cc_pos = 0;
        stk_cc[cc_pos].cc  = curCC;
        stk_cc[cc_pos].uv  = uv;
        stk_cc[cc_pos].pos = cc_pos;
        valid_uvlast = FALSE;
        /* collect the rest of the run, up to the next class-0 character */
        while (p < e) {
            uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
            if (!retlen) {
                Safefree(stk_cc);
                SvREFCNT_dec(dst);
                croak(ErrRetlenIsZero);
            }
            p += retlen;
            curCC = getCombinClass(uv);
            if (curCC == 0) {
                uvlast = uv;
                valid_uvlast = TRUE;
                break;
            }
            cc_pos++;
            if (stk_cc_max <= cc_pos) { /* extend if need */
                /* cc_pos >= stk_cc_max >= 10 here, so doubling always
                 * leaves room for index cc_pos and keeps the number of
                 * reallocations logarithmic in the run length */
                stk_cc_max = 2 * cc_pos;
                Renew(stk_cc, stk_cc_max, UNF_cc);
            }
            stk_cc[cc_pos].cc  = curCC;
            stk_cc[cc_pos].uv  = uv;
            stk_cc[cc_pos].pos = cc_pos;
        }
        /* reordered if there are two c.c.'s */
        if (cc_pos) {
            qsort((void*)stk_cc, cc_pos + 1, sizeof(UNF_cc), compare_cc);
        }
        for (i = 0; i <= cc_pos; i++) {
            d = uvuni_to_utf8(d, stk_cc[i].uv);
        }
        /* the class-0 character that terminated the run, if any */
        if (valid_uvlast) {
            d = uvuni_to_utf8(d, uvlast);
        }
    }
    *d = '\0';
    SvCUR_set(dst, d - (U8*)SvPVX(dst));
    Safefree(stk_cc);
    RETVAL = dst;
  OUTPUT:
    RETVAL

SV*
compose(src)
    SV * src
  PROTOTYPE: $
  ALIAS:
    composeContiguous = 1
  PREINIT:
    SV  *dst, *tmp;
    U8  *s, *p, *e, *d, *t, *tmp_start, curCC, preCC;
    UV uv = 0, uvS = 0, uvComp;
    STRLEN srclen, dstlen, tmplen, retlen;
    bool beginning = TRUE;
  CODE:
    /* Returns a new UTF-8 string in which each starter is combined with
     * the following characters wherever composite_uv() yields a
     * non-excluded composite.  Under the composeContiguous alias (ix != 0)
     * a character is only combined while no uncomposed character has been
     * set aside since the current starter. */
    s = (U8*)sv_2pvunicode(src,&srclen);
    e = s + srclen;
    dstlen = srclen + 1;
    dst = newSV(dstlen);
    (void)SvPOK_only(dst);
    SvUTF8_on(dst);
    d = (U8*)SvPVX(dst);
    /* for uncomposed combining char */
    tmp = sv_2mortal(newSV(dstlen));
    (void)SvPOK_only(tmp);
    SvUTF8_on(tmp);
    for (p = s; p < e;) {
        if (beginning) {
            uvS = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
            if (!retlen) {
                SvREFCNT_dec(dst);
                croak(ErrRetlenIsZero);
            }
            p += retlen;
            if (getCombinClass(uvS)) { /* no Starter found yet */
                d = uvuni_to_utf8(d, uvS);
                continue;
            }
            beginning = FALSE;
        }
        /* Starter */
        t = tmp_start = (U8*)SvPVX(tmp);
        preCC = 0;
        /* to the next Starter */
        while (p < e) {
            uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
            if (!retlen) {
                SvREFCNT_dec(dst);
                croak(ErrRetlenIsZero);
            }
            p += retlen;
            curCC = getCombinClass(uv);
            if (preCC && preCC == curCC) {
                /* same class as the previous uncomposed mark: blocked */
                preCC = curCC;
                t = uvuni_to_utf8(t, uv);
            } else {
                uvComp = composite_uv(uvS, uv);
                if (uvComp && ! isExclusion(uvComp) &&
                    (ix ? (t == tmp_start) : (preCC <= curCC))) {
                    STRLEN leftcur, rightcur, dstcur;
                    leftcur  = UNISKIP(uvComp);
                    rightcur = UNISKIP(uvS) + UNISKIP(uv);
                    /* the composite may need more bytes than the pair it
                     * replaces; grow dst and re-derive d, since SvGROW may
                     * move the buffer */
                    if (leftcur > rightcur) {
                        dstcur = d - (U8*)SvPVX(dst);
                        dstlen += leftcur - rightcur;
                        d = (U8*)SvGROW(dst,dstlen) + dstcur;
                    }
                    /* preCC not changed to curCC */
                    uvS = uvComp;
                } else if (! curCC && p < e) { /* blocked */
                    break;
                } else {
                    preCC = curCC;
                    t = uvuni_to_utf8(t, uv);
                }
            }
        }
        d = uvuni_to_utf8(d, uvS); /* starter (composed or not) */
        tmplen = t - tmp_start;
        if (tmplen) { /* uncomposed combining char */
            Copy(tmp_start, d, tmplen, U8);
            d += tmplen;
        }
        /* The character that ended the inner loop becomes the next
         * starter.  When the inner loop did not run, the outer loop is
         * about to end and this value is never used. */
        uvS = uv;
    } /* for */
    *d = '\0';
    SvCUR_set(dst, d - (U8*)SvPVX(dst));
    RETVAL = dst;
  OUTPUT:
    RETVAL

void
checkNFD(src)
    SV * src
  PROTOTYPE: $
  ALIAS:
    checkNFKD = 1
  PREINIT:
    STRLEN srclen, retlen;
    U8 *s, *e, *p, curCC, preCC;
    UV uv;
  CODE:
    /* Returns YES when src contains no canonical-ordering violation, no
     * Hangul syllable and no character that has a decomposition (canonical
     * for checkNFD, compatibility for checkNFKD); NO otherwise. */
    s = (U8*)sv_2pvunicode(src,&srclen);
    e = s + srclen;
    preCC = 0;
    for (p = s; p < e; p += retlen) {
        uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
        if (!retlen)
            croak(ErrRetlenIsZero);
        curCC = getCombinClass(uv);
        if (preCC > curCC && curCC != 0) /* canonical ordering violated */
            XSRETURN_NO;
        if (Hangul_IsS(uv) || (ix ? dec_compat(uv) : dec_canonical(uv)))
            XSRETURN_NO;
        preCC = curCC;
    }
    XSRETURN_YES;

void
checkNFC(src)
    SV * src
  PROTOTYPE: $
  ALIAS:
    checkNFKC = 1
  PREINIT:
    STRLEN srclen, retlen;
    U8 *s, *e, *p, curCC, preCC;
    UV uv;
    bool isMAYBE;
  CODE:
    /* Quick check for NFC (checkNFC) or NFKC (checkNFKC).  Returns NO as
     * soon as a definite violation is seen, undef ("maybe") when only
     * isComp2nd() characters were seen, and YES otherwise. */
    s = (U8*)sv_2pvunicode(src,&srclen);
    e = s + srclen;
    preCC = 0;
    isMAYBE = FALSE;
    for (p = s; p < e; p += retlen) {
        uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
        if (!retlen)
            croak(ErrRetlenIsZero);
        curCC = getCombinClass(uv);
        if (preCC > curCC && curCC != 0) /* canonical ordering violated */
            XSRETURN_NO;
        /* get NFC/NFKC property */
        if (Hangul_IsS(uv)) /* Hangul syllables are canonical composites */
            ; /* YES */
        else if (isExclusion(uv) || isSingleton(uv) || isNonStDecomp(uv))
            XSRETURN_NO;
        else if (isComp2nd(uv))
            isMAYBE = TRUE;
        else if (ix) {
            char *canon, *compat;
            /* NFKC_NO when having compatibility mapping. */
            canon  = (char *) dec_canonical(uv);
            compat = (char *) dec_compat(uv);
            if (compat && !(canon && strEQ(canon, compat)))
                XSRETURN_NO;
        } /* end of get NFC/NFKC property */
        preCC = curCC;
    }
    if (isMAYBE)
        XSRETURN_UNDEF;
    else
        XSRETURN_YES;

void
checkFCD(src)
    SV * src
  PROTOTYPE: $
  ALIAS:
    checkFCC = 1
  PREINIT:
    STRLEN srclen, retlen, canlen, canret;
    U8 *s, *e, *p, curCC, preCC;
    UV uv, uvLead, uvTrail;
    U8 *sCan, *pCan, *eCan;
    bool isMAYBE;
  CODE:
    /* For every character, compares the combining class of the first
     * character of its canonical decomposition with the combining class
     * of the last character of the previous character's decomposition and
     * returns NO on an ordering violation.  Under the checkFCC alias
     * (ix != 0) the composition properties are consulted as well, which
     * can yield NO or undef ("maybe"). */
    s = (U8*)sv_2pvunicode(src,&srclen);
    e = s + srclen;
    preCC = 0;
    isMAYBE = FALSE;
    for (p = s; p < e; p += retlen) {
        uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
        if (!retlen)
            croak(ErrRetlenIsZero);
        sCan = (U8*) dec_canonical(uv);
        if (sCan) {
            canlen = (STRLEN)strlen((char *) sCan);
            uvLead = utf8n_to_uvuni(sCan, canlen, &canret, AllowAnyUTF);
        }
        else {
            uvLead = uv;
        }
        curCC = getCombinClass(uvLead);
        if (curCC != 0 && curCC < preCC) /* canonical ordering violated */
            XSRETURN_NO;
        if (ix) {
            if (isExclusion(uv) || isSingleton(uv) || isNonStDecomp(uv))
                XSRETURN_NO;
            else if (isComp2nd(uv))
                isMAYBE = TRUE;
        }
        if (sCan) {
            /* canlen was set above, in the same sCan != NULL case */
            eCan = sCan + canlen;
            pCan = utf8_hop(eCan, -1);
            if (pCan < sCan)
                croak(ErrHopBeforeStart);
            uvTrail = utf8n_to_uvuni(pCan, eCan - pCan, &canret, AllowAnyUTF);
            preCC = getCombinClass(uvTrail);
        }
        else {
            preCC = curCC;
        }
    }
    if (isMAYBE)
        XSRETURN_UNDEF;
    else
        XSRETURN_YES;

# The five XSUBs below have no CODE section, so xsubpp generates a call to
# the C function of the same name for each of them.

U8
getCombinClass(uv)
    UV uv
  PROTOTYPE: $

bool
isExclusion(uv)
    UV uv
  PROTOTYPE: $

bool
isSingleton(uv)
    UV uv
  PROTOTYPE: $

bool
isNonStDecomp(uv)
    UV uv
  PROTOTYPE: $

bool
isComp2nd(uv)
    UV uv
  PROTOTYPE: $
  ALIAS:
    isNFC_MAYBE = 1
    isNFKC_MAYBE = 2

void
isNFD_NO(uv)
    UV uv
  PROTOTYPE: $
  ALIAS:
    isNFKD_NO = 1
  CODE:
    /* YES when uv is a Hangul syllable or has a decomposition (canonical
     * for isNFD_NO, compatibility for isNFKD_NO). */
    if (Hangul_IsS(uv) || (ix ? dec_compat(uv) : dec_canonical(uv)))
        XSRETURN_YES; /* NFD_NO or NFKD_NO */
    else
        XSRETURN_NO;

void
isComp_Ex(uv)
    UV uv
  PROTOTYPE: $
  ALIAS:
    isNFC_NO = 0
    isNFKC_NO = 1
  CODE:
    /* YES when uv is excluded from composition; under isNFKC_NO also YES
     * when uv has a compatibility mapping that differs from its canonical
     * mapping (or has no canonical mapping at all). */
    if (isExclusion(uv) || isSingleton(uv) || isNonStDecomp(uv))
        XSRETURN_YES; /* NFC_NO or NFKC_NO */
    else if (ix) {
        char *canon, *compat;
        canon  = (char *) dec_canonical(uv);
        compat = (char *) dec_compat(uv);
        if (compat && (!canon || strNE(canon, compat)))
            XSRETURN_YES; /* NFC_NO or NFKC_NO */
        else
            XSRETURN_NO;
    }
    else
        XSRETURN_NO;

SV*
getComposite(uv, uv2)
    UV uv
    UV uv2
  PROTOTYPE: $$
  PREINIT:
    UV composite;
  CODE:
    /* The composite of uv followed by uv2 as an unsigned integer, or
     * undef when the pair does not compose. */
    composite = composite_uv(uv, uv2);
    RETVAL = composite ? newSVuv(composite) : &PL_sv_undef;
  OUTPUT:
    RETVAL

SV*
getCanon(uv)
    UV uv
  PROTOTYPE: $
  ALIAS:
    getCompat = 1
  PREINIT:
    U8 * rstr;
  CODE:
    /* The decomposition of uv as a UTF-8 string (canonical for getCanon,
     * compatibility for getCompat), or undef when uv has none. */
    if (Hangul_IsS(uv)) {
        SV * dst;
        /* start from a real (NUL-terminated) empty string */
        dst = newSVpvn("", 0);
        sv_cat_decompHangul(dst, uv);
        RETVAL = dst;
    } else {
        rstr = ix ? dec_compat(uv) : dec_canonical(uv);
        if (!rstr)
            XSRETURN_UNDEF;
        RETVAL = newSVpvn((char *)rstr, strlen((char *)rstr));
    }
    SvUTF8_on(RETVAL);
  OUTPUT:
    RETVAL

void
splitOnLastStarter(src)
    SV * src
  PREINIT:
    SV *svp;
    STRLEN srclen, retlen;
    U8 *s, *e, *p;
    UV uv;
  PPCODE:
    /* Pushes two UTF-8 strings: the part of src before its last
     * character of combining class 0, and the part from that character to
     * the end.  When src has no such character the first string is
     * empty. */
    s = (U8*)sv_2pvunicode(src,&srclen);
    e = s + srclen;
    /* scan backwards, one character at a time */
    for (p = e; s < p; ) {
        p = utf8_hop(p, -1);
        if (p < s)
            croak(ErrHopBeforeStart);
        uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
        if (getCombinClass(uv) == 0) /* Last Starter found */
            break;
    }
    svp = sv_2mortal(newSVpvn((char*)s, p - s));
    SvUTF8_on(svp);
    XPUSHs(svp);
    svp = sv_2mortal(newSVpvn((char*)p, e - p));
    SvUTF8_on(svp);
    XPUSHs(svp);