1 /* 2 $Id: Unicode.xs,v 2.1 2004/10/24 13:00:29 dankogai Exp $ 3 */ 4 5 #define PERL_NO_GET_CONTEXT 6 #include "EXTERN.h" 7 #include "perl.h" 8 #include "XSUB.h" 9 #define U8 U8 10 #include "../Encode/encode.h" 11 12 #define FBCHAR 0xFFFd 13 #define BOM_BE 0xFeFF 14 #define BOM16LE 0xFFFe 15 #define BOM32LE 0xFFFe0000 16 #define issurrogate(x) (0xD800 <= (x) && (x) <= 0xDFFF ) 17 #define isHiSurrogate(x) (0xD800 <= (x) && (x) < 0xDC00 ) 18 #define isLoSurrogate(x) (0xDC00 <= (x) && (x) <= 0xDFFF ) 19 #define invalid_ucs2(x) ( issurrogate(x) || 0xFFFF < (x) ) 20 21 static UV 22 enc_unpack(pTHX_ U8 **sp,U8 *e,STRLEN size,U8 endian) 23 1638157 { 24 1638157 U8 *s = *sp; 25 1638157 UV v = 0; 26 1638157 if (s+size > e) { 27 ###### croak("Partial character %c",(char) endian); 28 } 29 1638157 switch(endian) { 30 case 'N': 31 394304 v = *s++; 32 394304 v = (v << 8) | *s++; 33 case 'n': 34 849544 v = (v << 8) | *s++; 35 849544 v = (v << 8) | *s++; 36 849544 break; 37 case 'V': 38 case 'v': 39 788613 v |= *s++; 40 788613 v |= (*s++ << 8); 41 788613 if (endian == 'v') 42 394313 break; 43 394300 v |= (*s++ << 16); 44 394300 v |= (*s++ << 24); 45 394300 break; 46 default: 47 ###### croak("Unknown endian %c",(char) endian); 48 1638157 break; 49 } 50 1638157 *sp = s; 51 1638157 return v; 52 } 53 54 void 55 enc_pack(pTHX_ SV *result,STRLEN size,U8 endian,UV value) 56 1675986 { 57 1675986 U8 *d = (U8 *)SvGROW(result,SvCUR(result)+size+1); 58 1675986 switch(endian) { 59 case 'v': 60 case 'V': 61 788736 d += SvCUR(result); 62 788736 SvCUR_set(result,SvCUR(result)+size); 63 3154898 while (size--) { 64 2366162 *d++ = (U8)(value & 0xFF); 65 2366162 value >>= 8; 66 } 67 887250 break; 68 case 'n': 69 case 'N': 70 887250 SvCUR_set(result,SvCUR(result)+size); 71 887250 d += SvCUR(result); 72 3488030 while (size--) { 73 2600780 *--d = (U8)(value & 0xFF); 74 2600780 value >>= 8; 75 } 76 ###### break; 77 default: 78 ###### croak("Unknown endian %c",(char) endian); 79 1675986 break; 80 } 81 } 82 83 MODULE = Encode::Unicode PACKAGE = Encode::Unicode 84 85 PROTOTYPES: DISABLE 86 87 #define attr(k, l) (hv_exists((HV *)SvRV(obj),k,l) ? \ 88 *hv_fetch((HV *)SvRV(obj),k,l,0) : &PL_sv_undef) 89 90 void 91 decode_xs(obj, str, check = 0) 92 SV * obj 93 SV * str 94 IV check 95 CODE: 96 { 97 2232 U8 endian = *((U8 *)SvPV_nolen(attr("endian", 6))); 98 2232 int size = SvIV(attr("size", 4)); 99 2232 int ucs2 = SvTRUE(attr("ucs2", 4)); 100 2232 int renewed = SvTRUE(attr("renewed", 7)); 101 2232 SV *result = newSVpvn("",0); 102 2232 STRLEN ulen; 103 2232 U8 *s = (U8 *)SvPVbyte(str,ulen); 104 2232 U8 *e = (U8 *)SvEND(str); 105 2232 ST(0) = sv_2mortal(result); 106 2232 SvUTF8_on(result); 107 108 2232 if (!endian && s+size <= e) { 109 8 UV bom; 110 8 endian = (size == 4) ? 'N' : 'n'; 111 8 bom = enc_unpack(aTHX_ &s,e,size,endian); 112 8 if (bom != BOM_BE) { 113 4 if (bom == BOM16LE) { 114 2 endian = 'v'; 115 } 116 2 else if (bom == BOM32LE) { 117 2 endian = 'V'; 118 } 119 else { 120 ###### croak("%"SVf":Unrecognised BOM %"UVxf, 121 *hv_fetch((HV *)SvRV(obj),"Name",4,0), 122 bom); 123 } 124 } 125 #if 1 126 /* Update endian for next sequence */ 127 8 if (renewed) { 128 4 hv_store((HV *)SvRV(obj),"endian",6,newSVpv((char *)&endian,1),0); 129 } 130 #endif 131 } 132 1640370 while (s < e && s+size <= e) { 133 1638140 UV ord = enc_unpack(aTHX_ &s,e,size,endian); 134 1638140 U8 *d; 135 1638140 if (size != 4 && invalid_ucs2(ord)) { 136 11 if (ucs2) { 137 4 if (check) { 138 2 croak("%"SVf":no surrogates allowed %"UVxf, 139 *hv_fetch((HV *)SvRV(obj),"Name",4,0), 140 ord); 141 } 142 2 if (s+size <= e) { 143 /* skip the next one as well */ 144 2 enc_unpack(aTHX_ &s,e,size,endian); 145 } 146 2 ord = FBCHAR; 147 } 148 else { 149 7 UV lo; 150 7 if (!isHiSurrogate(ord)) { 151 ###### croak("%"SVf":Malformed HI surrogate %"UVxf, 152 *hv_fetch((HV *)SvRV(obj),"Name",4,0), 153 ord); 154 } 155 7 if (s+size > e) { 156 /* Partial character */ 157 ###### s -= size; /* back up to 1st half */ 158 ###### break; /* And exit loop */ 159 } 160 7 lo = enc_unpack(aTHX_ &s,e,size,endian); 161 7 if (!isLoSurrogate(lo)){ 162 ###### croak("%"SVf":Malformed LO surrogate %"UVxf, 163 *hv_fetch((HV *)SvRV(obj),"Name",4,0), 164 ord); 165 } 166 7 ord = 0x10000 + ((ord - 0xD800) << 10) + (lo - 0xDC00); 167 } 168 } 169 1638138 d = (U8 *) SvGROW(result,SvCUR(result)+UTF8_MAXLEN+1); 170 1638138 d = uvuni_to_utf8_flags(d+SvCUR(result), ord, 0); 171 1638138 SvCUR_set(result,d - (U8 *)SvPVX(result)); 172 } 173 2230 if (s < e) { 174 /* unlikely to happen because it's fixed-length -- dankogai */ 175 ###### if (check & ENCODE_WARN_ON_ERR){ 176 ###### Perl_warner(aTHX_ packWARN(WARN_UTF8),"%"SVf":Partial character", 177 *hv_fetch((HV *)SvRV(obj),"Name",4,0)); 178 } 179 } 180 2230 if (check && !(check & ENCODE_LEAVE_SRC)){ 181 112 if (s < e) { 182 ###### Move(s,SvPVX(str),e-s,U8); 183 ###### SvCUR_set(str,(e-s)); 184 } 185 else { 186 112 SvCUR_set(str,0); 187 } 188 112 *SvEND(str) = '\0'; 189 } 190 2230 XSRETURN(1); 191 } 192 193 void 194 encode_xs(obj, utf8, check = 0) 195 SV * obj 196 SV * utf8 197 IV check 198 CODE: 199 { 200 2190 U8 endian = *((U8 *)SvPV_nolen(attr("endian", 6))); 201 2190 int size = SvIV(attr("size", 4)); 202 2190 int ucs2 = SvTRUE(attr("ucs2", 4)); 203 2190 int renewed = SvTRUE(attr("renewed", 7)); 204 2190 SV *result = newSVpvn("",0); 205 2190 STRLEN ulen; 206 2190 U8 *s = (U8 *)SvPVutf8(utf8,ulen); 207 2190 U8 *e = (U8 *)SvEND(utf8); 208 2190 ST(0) = sv_2mortal(result); 209 2190 if (!endian) { 210 6 endian = (size == 4) ? 'N' : 'n'; 211 6 enc_pack(aTHX_ result,size,endian,BOM_BE); 212 #if 1 213 /* Update endian for next sequence */ 214 6 if (renewed){ 215 2 hv_store((HV *)SvRV(obj),"endian",6,newSVpv((char *)&endian,1),0); 216 } 217 #endif 218 } 219 1678164 while (s < e && s+UTF8SKIP(s) <= e) { 220 1675976 STRLEN len; 221 1675976 UV ord = utf8n_to_uvuni(s, e-s, &len, 0); 222 1675976 s += len; 223 1675976 if (size != 4 && invalid_ucs2(ord)) { 224 10 if (!issurrogate(ord)){ 225 10 if (ucs2) { 226 4 if (check) { 227 2 croak("%"SVf":code point \"\\x{%"UVxf"}\" too high", 228 *hv_fetch((HV *)SvRV(obj),"Name",4,0),ord); 229 } 230 2 enc_pack(aTHX_ result,size,endian,FBCHAR); 231 }else{ 232 6 UV hi = ((ord - 0x10000) >> 10) + 0xD800; 233 6 UV lo = ((ord - 0x10000) & 0x3FF) + 0xDC00; 234 6 enc_pack(aTHX_ result,size,endian,hi); 235 6 enc_pack(aTHX_ result,size,endian,lo); 236 } 237 } 238 else { 239 /* not supposed to happen */ 240 ###### enc_pack(aTHX_ result,size,endian,FBCHAR); 241 } 242 } 243 else { 244 1675966 enc_pack(aTHX_ result,size,endian,ord); 245 } 246 } 247 2188 if (s < e) { 248 /* UTF-8 partial char happens often on PerlIO. 249 Since this is okay and normal, we do not warn. 250 But this is critical when you choose to LEAVE_SRC 251 in which case we die */ 252 32 if (check & (ENCODE_DIE_ON_ERR|ENCODE_LEAVE_SRC)){ 253 ###### Perl_croak(aTHX_ "%"SVf":partial character is not allowed " 254 "when CHECK = 0x%" UVuf, 255 *hv_fetch((HV *)SvRV(obj),"Name",4,0), check); 256 } 257 258 } 259 2188 if (check && !(check & ENCODE_LEAVE_SRC)){ 260 58 if (s < e) { 261 32 Move(s,SvPVX(utf8),e-s,U8); 262 32 SvCUR_set(utf8,(e-s)); 263 } 264 else { 265 26 SvCUR_set(utf8,0); 266 } 267 58 *SvEND(utf8) = '\0'; 268 } 269 2188 XSRETURN(1); 270 } 271