#include "EXTERN.h"
#include "perl.h"
#include "XSUB.h"

/* These 5 files are prepared by mkheader */
#include "unfcmb.h"
#include "unfcan.h"
#include "unfcpt.h"
#include "unfcmp.h"
#include "unfexc.h"

/* Fallback for perls that only provide uv_to_utf8 (Perl 5.6.1 ?) */
#ifndef uvuni_to_utf8
#define uvuni_to_utf8   uv_to_utf8
#endif /* uvuni_to_utf8 */

/* Fallback for perls that only provide utf8_to_uv (Perl 5.6.1 ?) */
#ifndef utf8n_to_uvuni
#define utf8n_to_uvuni  utf8_to_uv
#endif /* utf8n_to_uvuni */

/*
 * Flags handed to every utf8n_to_uvuni() call in this file.
 * UTF8_ALLOW_BOM is used before Perl 5.8.0.
 */
#ifdef UTF8_ALLOW_BOM
#define AllowAnyUTF (UTF8_ALLOW_SURROGATE|UTF8_ALLOW_BOM|UTF8_ALLOW_FFFF)
#else
#define AllowAnyUTF (UTF8_ALLOW_SURROGATE|UTF8_ALLOW_FFFF)
#endif

/* if utf8n_to_uvuni() sets retlen to 0 (?) */
#define ErrRetlenIsZero "panic (Unicode::Normalize): zero-length character"

/* utf8_hop() hops back before start. Maybe broken UTF-8 */
#define ErrHopBeforeStart "panic (Unicode::Normalize): hopping before start"

/*
 * Code points above 0x10ffff are never looked up in the tables; the lookup
 * helpers below simply report "no data" for them, so such characters pass
 * through unchanged and without complaint.
 */
#define VALID_UTF_MAX    (0x10ffff)
#define OVER_UTF_MAX(uv) (VALID_UTF_MAX < (uv))

/* HANGUL_H */
#define Hangul_SBase  0xAC00
#define Hangul_SFinal 0xD7A3
#define Hangul_SCount  11172

#define Hangul_NCount    588

#define Hangul_LBase  0x1100
#define Hangul_LFinal 0x1112
#define Hangul_LCount     19

#define Hangul_VBase  0x1161
#define Hangul_VFinal 0x1175
#define Hangul_VCount     21

#define Hangul_TBase  0x11A7
#define Hangul_TFinal 0x11C2
#define Hangul_TCount     28

#define Hangul_IsS(u)  ((Hangul_SBase <= (u)) && ((u) <= Hangul_SFinal))
#define Hangul_IsN(u)  (((u) - Hangul_SBase) % Hangul_TCount == 0)
#define Hangul_IsLV(u) (Hangul_IsS(u) && Hangul_IsN(u))
#define Hangul_IsL(u)  ((Hangul_LBase <= (u)) && ((u) <= Hangul_LFinal))
#define Hangul_IsV(u)  ((Hangul_VBase <= (u)) && ((u) <= Hangul_VFinal))
/* strict '<': Hangul_TBase itself is index 0, i.e. "no trailing jamo" */
#define Hangul_IsT(u)  ((Hangul_TBase  < (u)) && ((u) <= Hangul_TFinal))
/* HANGUL_H */

/* this is used for canonical ordering of combining characters (c.c.). */
typedef struct {
    U8 cc;	/* combining class */
    UV uv;	/* codepoint */
    STRLEN pos; /* position */
} UNF_cc;

/*
 * qsort() comparator for UNF_cc entries: orders by combining class and,
 * for equal classes, by original position, so the sort behaves as a
 * stable sort on the combining class.
 */
static int compare_cc (const void *a, const void *b)
{
    const UNF_cc *lhs = (const UNF_cc *) a;
    const UNF_cc *rhs = (const UNF_cc *) b;
    int ret_cc;

    ret_cc = lhs->cc - rhs->cc;
    if (ret_cc)
	return ret_cc;

    return (lhs->pos > rhs->pos) - (lhs->pos < rhs->pos);
}

/*
 * Look up uv in the three-level (plane / row / cell) UNF_canon table.
 * Returns the stored canonical decomposition (used by callers as a
 * NUL-terminated UTF-8 string), or NULL when uv is above VALID_UTF_MAX
 * or the table has no entry for it.
 */
static U8* dec_canonical (UV uv)
{
    U8 ***plane, **row;
    if (OVER_UTF_MAX(uv))
	return NULL;
    plane = (U8***)UNF_canon[uv >> 16];
    if (! plane)
	return NULL;
    row = plane[(uv >> 8) & 0xff];
    return row ? row[uv & 0xff] : NULL;
}

/*
 * Same lookup as dec_canonical(), but against the UNF_compat table
 * (compatibility decomposition).  Returns NULL when there is no entry.
 */
static U8* dec_compat (UV uv)
{
    U8 ***plane, **row;
    if (OVER_UTF_MAX(uv))
	return NULL;
    plane = (U8***)UNF_compat[uv >> 16];
    if (! plane)
	return NULL;
    row = plane[(uv >> 8) & 0xff];
    return row ? row[uv & 0xff] : NULL;
}

/*
 * Return the code point obtained by composing uv followed by uv2, or 0
 * when the pair does not compose (also when uv2 is 0 or either value is
 * above VALID_UTF_MAX).
 *
 * Hangul L+V and LV+T pairs are computed arithmetically; every other pair
 * is searched in the UNF_compos table, whose cells are lists terminated by
 * an entry with nextchar == 0.
 */
static UV composite_uv (UV uv, UV uv2)
{
    UNF_complist ***plane, **row, *cell, *i;

    if (! uv2 || OVER_UTF_MAX(uv) || OVER_UTF_MAX(uv2))
	return 0;

    if (Hangul_IsL(uv) && Hangul_IsV(uv2)) {
	uv  -= Hangul_LBase; /* lindex */
	uv2 -= Hangul_VBase; /* vindex */
	return(Hangul_SBase + (uv * Hangul_VCount + uv2) * Hangul_TCount);
    }
    if (Hangul_IsLV(uv) && Hangul_IsT(uv2)) {
	uv2 -= Hangul_TBase; /* tindex */
	return(uv + uv2);
    }
    plane = UNF_compos[uv >> 16];
    if (! plane)
	return 0;
    row = plane[(uv >> 8) & 0xff];
    if (! row)
	return 0;
    cell = row[uv & 0xff];
    if (! cell)
	return 0;
    for (i = cell; i->nextchar; i++) {
	if (uv2 == i->nextchar)
	    return i->composite;
    }
    return 0;
}

/*
 * Return the combining class stored for uv in the UNF_combin table, or 0
 * when uv is above VALID_UTF_MAX or has no entry.
 */
static U8 getCombinClass (UV uv)
{
    U8 **plane, *row;
    if (OVER_UTF_MAX(uv))
	return 0;
    plane = (U8**)UNF_combin[uv >> 16];
    if (! plane)
	return 0;
    row = plane[(uv >> 8) & 0xff];
    return row ? row[uv & 0xff] : 0;
}

/*
 * Append to sv the jamo sequence (L, V and, when the trailing index is
 * non-zero, T) for the Hangul syllable uv.  Does nothing when uv is not
 * in the Hangul syllable range.
 */
static void sv_cat_decompHangul (SV* sv, UV uv)
{
    UV sindex, lindex, vindex, tindex;
    U8 *t, tmp[3 * UTF8_MAXLEN + 1];

    if (! Hangul_IsS(uv))
	return;

    sindex =  uv - Hangul_SBase;
    lindex =  sindex / Hangul_NCount;
    vindex = (sindex % Hangul_NCount) / Hangul_TCount;
    tindex =  sindex % Hangul_TCount;

    t = tmp;
    t = uvuni_to_utf8(t, (lindex + Hangul_LBase));
    t = uvuni_to_utf8(t, (vindex + Hangul_VBase));
    if (tindex)
	t = uvuni_to_utf8(t, (tindex + Hangul_TBase));
    *t = '\0';
    sv_catpvn(sv, (char *)tmp, t - tmp);
}

/* Append the UTF-8 encoding of the single code point uv to sv. */
static void sv_cat_uvuni (SV* sv, UV uv)
{
    U8 *t, tmp[UTF8_MAXLEN + 1];

    t = tmp;
    t = uvuni_to_utf8(t, uv);
    *t = '\0';
    sv_catpvn(sv, (char *)tmp, t - tmp);
}

/*
 * Return the string buffer of sv as UTF-8 and store its byte length in *lp.
 * When sv is not flagged UTF-8, a mortal copy is upgraded and its buffer is
 * returned instead, so sv itself is left untouched.
 */
static char * sv_2pvunicode(SV *sv, STRLEN *lp)
{
    char *s;
    STRLEN len;
    s = (char*)SvPV(sv,len);
    if (!SvUTF8(sv)) {
	SV* tmpsv = sv_mortalcopy(sv);
	if (!SvPOK(tmpsv))
	    (void)sv_pvn_force(tmpsv,&len);
	sv_utf8_upgrade(tmpsv);
	s = (char*)SvPV(tmpsv,len);
    }
    *lp = len;
    return s;
}

MODULE = Unicode::Normalize	PACKAGE = Unicode::Normalize

SV*
decompose(src, compat = &PL_sv_no)
    SV * src
    SV * compat
  PROTOTYPE: $;$
  PREINIT:
    /*
     * Returns a new UTF-8 string in which every character of src is
     * replaced by its decomposition (compatibility when compat is true,
     * canonical otherwise).  Characters without a mapping are copied.
     */
    SV *dst;
    STRLEN srclen, retlen;
    U8 *s, *e, *p, *r;
    UV uv;
    bool iscompat;
  CODE:
    iscompat = SvTRUE(compat);
    s = (U8*)sv_2pvunicode(src,&srclen);
    e = s + srclen;

    dst = newSV(1);
    (void)SvPOK_only(dst);
    SvUTF8_on(dst);

    for (p = s; p < e; p += retlen) {
	uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
	if (!retlen) {
	    SvREFCNT_dec(dst); /* dst is not mortal yet: release it */
	    croak(ErrRetlenIsZero);
	}

	if (Hangul_IsS(uv))
	    sv_cat_decompHangul(dst, uv);
	else {
	    r = iscompat ? dec_compat(uv) : dec_canonical(uv);
	    if (r)
		sv_catpv(dst, (char *)r);
	    else
		sv_cat_uvuni(dst, uv);
	}
    }
    RETVAL = dst;
  OUTPUT:
    RETVAL



SV*
reorder(src)
    SV * src
  PROTOTYPE: $
  PREINIT:
    /*
     * Returns a new UTF-8 string in which every run of characters with a
     * non-zero combining class is sorted by combining class, keeping the
     * original order among characters of the same class.
     */
    SV *dst;
    STRLEN srclen, dstlen, retlen, stk_cc_max;
    U8 *s, *e, *p, *d, curCC;
    UV uv, uvlast;
    UNF_cc * stk_cc;
    STRLEN i, cc_pos;
    bool valid_uvlast;
  CODE:
    s = (U8*)sv_2pvunicode(src,&srclen);
    e = s + srclen;

    dstlen = srclen + 1;
    dst = newSV(dstlen);
    (void)SvPOK_only(dst);
    SvUTF8_on(dst);
    d = (U8*)SvPVX(dst);

    stk_cc_max = 10; /* initial capacity; doubled on demand below */
    New(0, stk_cc, stk_cc_max, UNF_cc);

    for (p = s; p < e;) {
	uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
	if (!retlen) {
	    Safefree(stk_cc);
	    SvREFCNT_dec(dst);
	    croak(ErrRetlenIsZero);
	}
	p += retlen;

	curCC = getCombinClass(uv);
	if (curCC == 0) {
	    d = uvuni_to_utf8(d, uv);
	    continue;
	}

	/* first combining character of a run */
	cc_pos = 0;
	stk_cc[cc_pos].cc  = curCC;
	stk_cc[cc_pos].uv  = uv;
	stk_cc[cc_pos].pos = cc_pos;

	valid_uvlast = FALSE;
	while (p < e) {
	    uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
	    if (!retlen) {
		Safefree(stk_cc);
		SvREFCNT_dec(dst);
		croak(ErrRetlenIsZero);
	    }
	    p += retlen;

	    curCC = getCombinClass(uv);
	    if (curCC == 0) {
		/* the run ends here; emit this character after the run */
		uvlast = uv;
		valid_uvlast = TRUE;
		break;
	    }

	    cc_pos++;
	    if (stk_cc_max <= cc_pos) { /* extend if need */
		/* double rather than grow by one: avoids one Renew per mark */
		stk_cc_max *= 2;
		Renew(stk_cc, stk_cc_max, UNF_cc);
	    }
	    stk_cc[cc_pos].cc  = curCC;
	    stk_cc[cc_pos].uv  = uv;
	    stk_cc[cc_pos].pos = cc_pos;
	}

	/* reordered if there are two c.c.'s */
	if (cc_pos) {
	    qsort((void*)stk_cc, cc_pos + 1, sizeof(UNF_cc), compare_cc);
	}

	for (i = 0; i <= cc_pos; i++) {
	    d = uvuni_to_utf8(d, stk_cc[i].uv);
	}
	if (valid_uvlast)
	{
	    d = uvuni_to_utf8(d, uvlast);
	}
    }
    *d = '\0';
    SvCUR_set(dst, d - (U8*)SvPVX(dst));
    Safefree(stk_cc);
    RETVAL = dst;
  OUTPUT:
    RETVAL



SV*
compose(src)
    SV * src
  PROTOTYPE: $
  ALIAS:
    composeContiguous = 1
  PREINIT:
    /*
     * Returns a new UTF-8 string in which each starter is combined with
     * the following characters wherever composite_uv() yields a composite
     * that is not a composition exclusion.  Characters that do not compose
     * are buffered in tmp and written after their starter.
     *
     * uv starts at 0 because "uvS = uv" at the bottom of the outer loop is
     * reached even when the inner loop never ran (e.g. one-character input).
     */
    SV *dst, *tmp;
    U8 *s, *p, *e, *d, *t, *tmp_start, curCC, preCC;
    UV uv = 0, uvS, uvComp;
    STRLEN srclen, dstlen, tmplen, retlen;
    bool beginning = TRUE;
  CODE:
    s = (U8*)sv_2pvunicode(src,&srclen);
    e = s + srclen;

    dstlen = srclen + 1;
    dst = newSV(dstlen);
    (void)SvPOK_only(dst);
    SvUTF8_on(dst);
    d = (U8*)SvPVX(dst);

  /* for uncomposed combining char */
    tmp = sv_2mortal(newSV(dstlen));
    (void)SvPOK_only(tmp);
    SvUTF8_on(tmp);

    for (p = s; p < e;) {
	if (beginning) {
	    uvS = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
	    if (!retlen) {
		SvREFCNT_dec(dst); /* tmp is mortal, dst is not */
		croak(ErrRetlenIsZero);
	    }
	    p += retlen;

	    if (getCombinClass(uvS)) { /* no Starter found yet */
		d = uvuni_to_utf8(d, uvS);
		continue;
	    }
	    beginning = FALSE;
	}

    /* Starter */
	t = tmp_start = (U8*)SvPVX(tmp);
	preCC = 0;

    /* to the next Starter */
	while (p < e) {
	    uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
	    if (!retlen) {
		SvREFCNT_dec(dst);
		croak(ErrRetlenIsZero);
	    }
	    p += retlen;

	    curCC = getCombinClass(uv);

	    if (preCC && preCC == curCC) {
		/* same non-zero class as the previous buffered character */
		preCC = curCC;
		t = uvuni_to_utf8(t, uv);
	    } else {
		uvComp = composite_uv(uvS, uv);

		/*
		 * composeContiguous (ix != 0) composes only while nothing
		 * has been buffered; compose requires preCC <= curCC.
		 */
		if (uvComp && ! isExclusion(uvComp) &&
			(ix ? (t == tmp_start) : (preCC <= curCC))) {
		    STRLEN leftcur, rightcur, dstcur;
		    leftcur  = UNISKIP(uvComp);
		    rightcur = UNISKIP(uvS) + UNISKIP(uv);

		    /* composite encodes longer than its parts: grow dst */
		    if (leftcur > rightcur) {
			dstcur = d - (U8*)SvPVX(dst);
			dstlen += leftcur - rightcur;
			d = (U8*)SvGROW(dst,dstlen) + dstcur;
		    }
		    /* preCC not changed to curCC */
		    uvS = uvComp;
		} else if (! curCC && p < e) { /* blocked */
		    break;
		} else {
		    preCC = curCC;
		    t = uvuni_to_utf8(t, uv);
		}
	    }
	}
	d = uvuni_to_utf8(d, uvS); /* starter (composed or not) */
	tmplen = t - tmp_start;
	if (tmplen) { /* uncomposed combining char */
	    t = (U8*)SvPVX(tmp);
	    while (tmplen--)
		*d++ = *t++;
	}
	uvS = uv; /* the character that ended the inner loop is next starter */
    } /* for */
    *d = '\0';
    SvCUR_set(dst, d - (U8*)SvPVX(dst));
    RETVAL = dst;
  OUTPUT:
    RETVAL


void
checkNFD(src)
    SV * src
  PROTOTYPE: $
  ALIAS:
    checkNFKD = 1
  PREINIT:
    /*
     * Returns YES unless src contains a combining-class ordering violation,
     * a Hangul syllable, or a character with a decomposition (compatibility
     * table for checkNFKD, canonical table for checkNFD); otherwise NO.
     */
    STRLEN srclen, retlen;
    U8 *s, *e, *p, curCC, preCC;
    UV uv;
  CODE:
    s = (U8*)sv_2pvunicode(src,&srclen);
    e = s + srclen;

    preCC = 0;
    for (p = s; p < e; p += retlen) {
	uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
	if (!retlen)
	    croak(ErrRetlenIsZero);

	curCC = getCombinClass(uv);
	if (preCC > curCC && curCC != 0) /* canonical ordering violated */
	    XSRETURN_NO;
	if (Hangul_IsS(uv) || (ix ? dec_compat(uv) : dec_canonical(uv)))
	    XSRETURN_NO;
	preCC = curCC;
    }
    XSRETURN_YES;



void
checkNFC(src)
    SV * src
  PROTOTYPE: $
  ALIAS:
    checkNFKC = 1
  PREINIT:
    /*
     * Returns NO on the first disqualifying character, undef ("maybe") when
     * no character disqualifies but at least one satisfies isComp2nd(),
     * and YES otherwise.
     */
    STRLEN srclen, retlen;
    U8 *s, *e, *p, curCC, preCC;
    UV uv;
    bool isMAYBE;
  CODE:
    s = (U8*)sv_2pvunicode(src,&srclen);
    e = s + srclen;

    preCC = 0;
    isMAYBE = FALSE;
    for (p = s; p < e; p += retlen) {
	uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
	if (!retlen)
	    croak(ErrRetlenIsZero);

	curCC = getCombinClass(uv);

	if (preCC > curCC && curCC != 0) /* canonical ordering violated */
	    XSRETURN_NO;

	/* get NFC/NFKC property */
	if (Hangul_IsS(uv)) /* Hangul syllables are canonical composites */
	    ; /* YES */
	else if (isExclusion(uv) || isSingleton(uv) || isNonStDecomp(uv))
	    XSRETURN_NO;
	else if (isComp2nd(uv))
	    isMAYBE = TRUE;
	else if (ix) {
	    char *canon, *compat;
	  /* NFKC_NO when having compatibility mapping. */
	    canon  = (char *) dec_canonical(uv);
	    compat = (char *) dec_compat(uv);
	    if (compat && !(canon && strEQ(canon, compat)))
		XSRETURN_NO;
	} /* end of get NFC/NFKC property */

	preCC = curCC;
    }
    if (isMAYBE)
	XSRETURN_UNDEF;
    else
	XSRETURN_YES;



void
checkFCD(src)
    SV * src
  PROTOTYPE: $
  ALIAS:
    checkFCC = 1
  PREINIT:
    /*
     * For every character, compares the combining class of the first code
     * point of its canonical decomposition with the class of the last code
     * point of the previous character's decomposition.  Returns NO on an
     * ordering violation; checkFCC (ix != 0) additionally applies the
     * exclusion / singleton / non-starter tests and may return undef.
     */
    STRLEN srclen, retlen, canlen, canret;
    U8 *s, *e, *p, curCC, preCC;
    UV uv, uvLead, uvTrail;
    U8 *sCan, *pCan, *eCan;
    bool isMAYBE;
  CODE:
    s = (U8*)sv_2pvunicode(src,&srclen);
    e = s + srclen;

    preCC = 0;
    isMAYBE = FALSE;
    for (p = s; p < e; p += retlen) {
	uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
	if (!retlen)
	    croak(ErrRetlenIsZero);

	sCan = (U8*) dec_canonical(uv);

	if (sCan) {
	    canlen = (STRLEN)strlen((char *) sCan);
	    uvLead = utf8n_to_uvuni(sCan, canlen, &canret, AllowAnyUTF);
	}
	else {
	    uvLead = uv;
	}

	curCC = getCombinClass(uvLead);

	if (curCC != 0 && curCC < preCC) /* canonical ordering violated */
	    XSRETURN_NO;

	if (ix) {
	    if (isExclusion(uv) || isSingleton(uv) || isNonStDecomp(uv))
		XSRETURN_NO;
	    else if (isComp2nd(uv))
		isMAYBE = TRUE;
	}

	if (sCan) {
	    /* class of the last code point of the decomposition */
	    eCan = sCan + canlen;
	    pCan = utf8_hop(eCan, -1);
	    if (pCan < sCan)
		croak(ErrHopBeforeStart);
	    uvTrail = utf8n_to_uvuni(pCan, eCan - pCan, &canret, AllowAnyUTF);
	    preCC = getCombinClass(uvTrail);
	}
	else {
	    preCC = curCC;
	}
    }
    if (isMAYBE)
	XSRETURN_UNDEF;
    else
	XSRETURN_YES;



U8
getCombinClass(uv)
    UV uv
  PROTOTYPE: $

bool
isExclusion(uv)
    UV uv
  PROTOTYPE: $

bool
isSingleton(uv)
    UV uv
  PROTOTYPE: $

bool
isNonStDecomp(uv)
    UV uv
  PROTOTYPE: $

bool
isComp2nd(uv)
    UV uv
  PROTOTYPE: $
  ALIAS:
    isNFC_MAYBE = 1
    isNFKC_MAYBE = 2



void
isNFD_NO(uv)
    UV uv
  PROTOTYPE: $
  ALIAS:
    isNFKD_NO = 1
  CODE:
    /* YES for Hangul syllables and for characters with a decomposition */
    if (Hangul_IsS(uv) || (ix ? dec_compat(uv) : dec_canonical(uv)))
	XSRETURN_YES; /* NFD_NO or NFKD_NO */
    else
	XSRETURN_NO;



void
isComp_Ex(uv)
    UV uv
  PROTOTYPE: $
  ALIAS:
    isNFC_NO = 0
    isNFKC_NO = 1
  CODE:
    if (isExclusion(uv) || isSingleton(uv) || isNonStDecomp(uv))
	XSRETURN_YES; /* NFC_NO or NFKC_NO */
    else if (ix) {
	char *canon, *compat;
	canon  = (char *) dec_canonical(uv);
	compat = (char *) dec_compat(uv);
	/* a compatibility mapping that differs from the canonical one */
	if (compat && (!canon || strNE(canon, compat)))
	    XSRETURN_YES; /* NFC_NO or NFKC_NO */
	else
	    XSRETURN_NO;
    }
    else
	XSRETURN_NO;



SV*
getComposite(uv, uv2)
    UV uv
    UV uv2
  PROTOTYPE: $$
  PREINIT:
    /* Returns composite_uv(uv, uv2) as an unsigned value, or undef if 0. */
    UV composite;
  CODE:
    composite = composite_uv(uv, uv2);
    RETVAL = composite ? newSVuv(composite) : &PL_sv_undef;
  OUTPUT:
    RETVAL



SV*
getCanon(uv)
    UV uv
  PROTOTYPE: $
  ALIAS:
    getCompat = 1
  PREINIT:
    /*
     * Returns the decomposition of uv as a UTF-8 string (compatibility
     * table for getCompat, canonical table for getCanon; Hangul syllables
     * are decomposed arithmetically), or undef when there is none.
     */
    U8 * rstr;
  CODE:
    if (Hangul_IsS(uv)) {
	SV * dst;
	dst = newSV(1);
	(void)SvPOK_only(dst);
	sv_cat_decompHangul(dst, uv);
	RETVAL = dst;
    } else {
	rstr = ix ? dec_compat(uv) : dec_canonical(uv);
	if (!rstr)
	    XSRETURN_UNDEF;
	RETVAL = newSVpvn((char *)rstr, strlen((char *)rstr));
    }
    SvUTF8_on(RETVAL);
  OUTPUT:
    RETVAL


void
splitOnLastStarter(src)
    SV * src
  PREINIT:
    /*
     * Pushes two UTF-8 strings: the part of src before the last character
     * whose combining class is 0, and the part from that character on.
     * When no such character exists the first string is empty.
     */
    SV *svp;
    STRLEN srclen, retlen;
    U8 *s, *e, *p;
    UV uv;
  PPCODE:
    s = (U8*)sv_2pvunicode(src,&srclen);
    e = s + srclen;

    /* walk backwards one character at a time */
    for (p = e; s < p; ) {
	p = utf8_hop(p, -1);
	if (p < s)
	    croak(ErrHopBeforeStart);
	uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
	if (getCombinClass(uv) == 0) /* Last Starter found */
	    break;
    }

    svp = sv_2mortal(newSVpvn((char*)s, p - s));
    SvUTF8_on(svp);
    XPUSHs(svp);

    svp = sv_2mortal(newSVpvn((char*)p, e - p));
    SvUTF8_on(svp);
    XPUSHs(svp);