     1			/*    utf8.c
     2			 *
     3			 *    Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005 by Larry Wall and
     4			 *    others
     5			 *
     6			 *    You may distribute under the terms of either the GNU General Public
     7			 *    License or the Artistic License, as specified in the README file.
     8			 *
     9			 */
    10			
    11			/*
    12			 * 'What a fix!' said Sam. 'That's the one place in all the lands we've ever
    13			 * heard of that we don't want to see any closer; and that's the one place
    14			 * we're trying to get to!  And that's just where we can't get, nohow.'
    15			 *
    16			 * 'Well do I understand your speech,' he answered in the same language;
    17			 * 'yet few strangers do so.  Why then do you not speak in the Common Tongue,
    18			 * as is the custom in the West, if you wish to be answered?'
    19			 *
    20			 * ...the travellers perceived that the floor was paved with stones of many
    21			 * hues; branching runes and strange devices intertwined beneath their feet.
    22			 */
    23			
    24			#include "EXTERN.h"
    25			#define PERL_IN_UTF8_C
    26			#include "perl.h"
    27			
    28			static const char unees[] =
    29			    "Malformed UTF-8 character (unexpected end of string)";
    30			
    31			/* 
    32			=head1 Unicode Support
    33			
    34			This file contains various utility functions for manipulating UTF8-encoded
    35			strings. For the uninitiated, this is a method of representing arbitrary
    36			Unicode characters as a variable number of bytes, in such a way that
    37			characters in the ASCII range are unmodified, and a zero byte never appears
    38			within non-zero characters.
    39			
    40			=for apidoc A|U8 *|uvuni_to_utf8_flags|U8 *d|UV uv|UV flags
    41			
    42			Adds the UTF-8 representation of the Unicode codepoint C<uv> to the end
    43			of the string C<d>; C<d> should be have at least C<UTF8_MAXBYTES+1> free
    44			bytes available. The return value is the pointer to the byte after the
    45			end of the new character. In other words,
    46			
    47			    d = uvuni_to_utf8_flags(d, uv, flags);
    48			
    49			or, in most cases,
    50			
    51			    d = uvuni_to_utf8(d, uv);
    52			
    53			(which is equivalent to)
    54			
    55			    d = uvuni_to_utf8_flags(d, uv, 0);
    56			
    57			is the recommended Unicode-aware way of saying
    58			
    59			    *(d++) = uv;
    60			
    61			=cut
    62			*/
    63			
    64			U8 *
    65			Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
    66	     2768713    {
    67	     2768713        if (ckWARN(WARN_UTF8)) {
    68	      267915    	 if (UNICODE_IS_SURROGATE(uv) &&
    69				     !(flags & UNICODE_ALLOW_SURROGATE))
    70	           6    	      Perl_warner(aTHX_ packWARN(WARN_UTF8), "UTF-16 surrogate 0x%04"UVxf, uv);
    71	      267909    	 else if (
    72					  ((uv >= 0xFDD0 && uv <= 0xFDEF &&
    73					    !(flags & UNICODE_ALLOW_FDD0))
    74					   ||
    75					   ((uv & 0xFFFE) == 0xFFFE && /* Either FFFE or FFFF. */
    76					    !(flags & UNICODE_ALLOW_FFFF))) &&
    77					  /* UNICODE_ALLOW_SUPER includes
    78					   * FFFEs and FFFFs beyond 0x10FFFF. */
    79					  ((uv <= PERL_UNICODE_MAX) ||
    80					   !(flags & UNICODE_ALLOW_SUPER))
    81					  )
    82	          12    	      Perl_warner(aTHX_ packWARN(WARN_UTF8),
    83						 "Unicode character 0x%04"UVxf" is illegal", uv);
    84			    }
    85	     2768713        if (UNI_IS_INVARIANT(uv)) {
    86	     1725406    	*d++ = (U8)UTF_TO_NATIVE(uv);
    87	     1725406    	return d;
    88			    }
    89			#if defined(EBCDIC)
    90			    else {
    91				STRLEN len  = UNISKIP(uv);
    92				U8 *p = d+len-1;
    93				while (p > d) {
    94				    *p-- = (U8)UTF_TO_NATIVE((uv & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK);
    95				    uv >>= UTF_ACCUMULATION_SHIFT;
    96				}
    97				*p = (U8)UTF_TO_NATIVE((uv & UTF_START_MASK(len)) | UTF_START_MARK(len));
    98				return d+len;
    99			    }
   100			#else /* Non loop style */
   101	     1043307        if (uv < 0x800) {
   102	       74757    	*d++ = (U8)(( uv >>  6)         | 0xc0);
   103	       74757    	*d++ = (U8)(( uv        & 0x3f) | 0x80);
   104	       74757    	return d;
   105			    }
   106	      968550        if (uv < 0x10000) {
   107	      570861    	*d++ = (U8)(( uv >> 12)         | 0xe0);
   108	      570861    	*d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
   109	      570861    	*d++ = (U8)(( uv        & 0x3f) | 0x80);
   110	      570861    	return d;
   111			    }
   112	      397689        if (uv < 0x200000) {
   113	      359295    	*d++ = (U8)(( uv >> 18)         | 0xf0);
   114	      359295    	*d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
   115	      359295    	*d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
   116	      359295    	*d++ = (U8)(( uv        & 0x3f) | 0x80);
   117	      359295    	return d;
   118			    }
   119	       38394        if (uv < 0x4000000) {
   120	           6    	*d++ = (U8)(( uv >> 24)         | 0xf8);
   121	           6    	*d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
   122	           6    	*d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
   123	           6    	*d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
   124	           6    	*d++ = (U8)(( uv        & 0x3f) | 0x80);
   125	           6    	return d;
   126			    }
   127	       38388        if (uv < 0x80000000) {
   128	          28    	*d++ = (U8)(( uv >> 30)         | 0xfc);
   129	          28    	*d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
   130	          28    	*d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
   131	          28    	*d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
   132	          28    	*d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
   133	          28    	*d++ = (U8)(( uv        & 0x3f) | 0x80);
   134	          28    	return d;
   135			    }
   136			#ifdef HAS_QUAD
   137			    if (uv < UTF8_QUAD_MAX)
   138			#endif
   139			    {
   140	       38360    	*d++ =                            0xfe;	/* Can't match U+FEFF! */
   141	       38360    	*d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
   142	       38360    	*d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
   143	       38360    	*d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
   144	       38360    	*d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
   145	       38360    	*d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
   146	       38360    	*d++ = (U8)(( uv        & 0x3f) | 0x80);
   147	       38360    	return d;
   148			    }
   149			#ifdef HAS_QUAD
   150			    {
   151				*d++ =                            0xff;		/* Can't match U+FFFE! */
   152				*d++ =                            0x80;		/* 6 Reserved bits */
   153				*d++ = (U8)(((uv >> 60) & 0x0f) | 0x80);	/* 2 Reserved bits */
   154				*d++ = (U8)(((uv >> 54) & 0x3f) | 0x80);
   155				*d++ = (U8)(((uv >> 48) & 0x3f) | 0x80);
   156				*d++ = (U8)(((uv >> 42) & 0x3f) | 0x80);
   157				*d++ = (U8)(((uv >> 36) & 0x3f) | 0x80);
   158				*d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
   159				*d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
   160				*d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
   161				*d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
   162				*d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
   163				*d++ = (U8)(( uv        & 0x3f) | 0x80);
   164				return d;
   165			    }
   166			#endif
   167			#endif /* Loop style */
   168			}
   169			 
   170			U8 *
   171			Perl_uvuni_to_utf8(pTHX_ U8 *d, UV uv)
   172	      251674    {
   173	      251674        return Perl_uvuni_to_utf8_flags(aTHX_ d, uv, 0);
   174			}
   175			
   176			/*
   177			
   178			Tests if some arbitrary number of bytes begins in a valid UTF-8
   179			character.  Note that an INVARIANT (i.e. ASCII) character is a valid
   180			UTF-8 character.  The actual number of bytes in the UTF-8 character
   181			will be returned if it is valid, otherwise 0.
   182			
   183			This is the "slow" version as opposed to the "fast" version which is
   184			the "unrolled" IS_UTF8_CHAR().  E.g. for t/uni/class.t the speed
   185			difference is a factor of 2 to 3.  For lengths (UTF8SKIP(s)) of four
   186			or less you should use the IS_UTF8_CHAR(), for lengths of five or more
   187			you should use the _slow().  In practice this means that the _slow()
   188			will be used very rarely, since the maximum Unicode code point (as of
   189			Unicode 4.1) is U+10FFFF, which encodes in UTF-8 to four bytes.  Only
   190			the "Perl extended UTF-8" (the infamous 'v-strings') will encode into
   191			five bytes or more.
   192			
   193			=cut */
   194			STATIC STRLEN
   195			S_is_utf8_char_slow(pTHX_ const U8 *s, const STRLEN len)
   196	           1    {
   197	           1        U8 u = *s;
   198	           1        STRLEN slen;
   199	           1        UV uv, ouv;
   200			
   201	           1        if (UTF8_IS_INVARIANT(u))
   202	      ######    	return 1;
   203			
   204	           1        if (!UTF8_IS_START(u))
   205	           1    	return 0;
   206			
   207	      ######        if (len < 2 || !UTF8_IS_CONTINUATION(s[1]))
   208	      ######    	return 0;
   209			
   210	      ######        slen = len - 1;
   211	      ######        s++;
   212	      ######        u &= UTF_START_MASK(len);
   213	      ######        uv  = u;
   214	      ######        ouv = uv;
   215	      ######        while (slen--) {
   216	      ######    	if (!UTF8_IS_CONTINUATION(*s))
   217	      ######    	    return 0;
   218	      ######    	uv = UTF8_ACCUMULATE(uv, *s);
   219	      ######    	if (uv < ouv) 
   220	      ######    	    return 0;
   221	      ######    	ouv = uv;
   222	      ######    	s++;
   223			    }
   224			
   225	      ######        if ((STRLEN)UNISKIP(uv) < len)
   226	      ######    	return 0;
   227			
   228	      ######        return len;
   229			}
   230			
   231			/*
   232			=for apidoc A|STRLEN|is_utf8_char|const U8 *s
   233			
   234			Tests if some arbitrary number of bytes begins in a valid UTF-8
   235			character.  Note that an INVARIANT (i.e. ASCII) character is a valid
   236			UTF-8 character.  The actual number of bytes in the UTF-8 character
   237			will be returned if it is valid, otherwise 0.
   238			
   239			=cut */
   240			STRLEN
   241			Perl_is_utf8_char(pTHX_ const U8 *s)
   242	         209    {
   243	         209        STRLEN len = UTF8SKIP(s);
   244			#ifdef IS_UTF8_CHAR
   245	         209        if (IS_UTF8_CHAR_FAST(len))
   246	         208            return IS_UTF8_CHAR(s, len) ? len : 0;
   247			#endif /* #ifdef IS_UTF8_CHAR */
   248	           1        return is_utf8_char_slow(s, len);
   249			}
   250			
   251			/*
   252			=for apidoc A|bool|is_utf8_string|const U8 *s|STRLEN len
   253			
   254			Returns true if first C<len> bytes of the given string form a valid
   255			UTF-8 string, false otherwise.  Note that 'a valid UTF-8 string' does
   256			not mean 'a string that contains code points above 0x7F encoded in UTF-8'
   257			because a valid ASCII string is a valid UTF-8 string.
   258			
   259			See also is_utf8_string_loclen() and is_utf8_string_loc().
   260			
   261			=cut
   262			*/
   263			
   264			bool
   265			Perl_is_utf8_string(pTHX_ const U8 *s, STRLEN len)
   266	       77372    {
   267	       77372        const U8* x = s;
   268	       77372        const U8* send;
   269			
   270	       77372        if (!len && s)
   271	           7    	len = strlen((const char *)s);
   272	       77372        send = s + len;
   273			
   274	   366043500        while (x < send) {
   275	   365966132    	STRLEN c;
   276				 /* Inline the easy bits of is_utf8_char() here for speed... */
   277	   365966132    	 if (UTF8_IS_INVARIANT(*x))
   278	      255178    	      c = 1;
   279	   365710954    	 else if (!UTF8_IS_START(*x))
   280	   365710950    	     goto out;
   281				 else {
   282				      /* ... and call is_utf8_char() only if really needed. */
   283			#ifdef IS_UTF8_CHAR
   284	   365710950    	     c = UTF8SKIP(x);
   285	   365710950    	     if (IS_UTF8_CHAR_FAST(c)) {
   286	   365710950    	         if (!IS_UTF8_CHAR(x, c))
   287	      ######    		     goto out;
   288	      ######    	     } else if (!is_utf8_char_slow(x, c))
   289	      ######    	         goto out;
   290			#else
   291				     c = is_utf8_char(x);
   292			#endif /* #ifdef IS_UTF8_CHAR */
   293	   365710950    	      if (!c)
   294	      ######    		  goto out;
   295				 }
   296	   365966128            x += c;
   297			    }
   298			
   299			 out:
   300	       77372        if (x != send)
   301	           8    	return FALSE;
   302			
   303	       77364        return TRUE;
   304			}
   305			
   306			/*
   307			=for apidoc A|bool|is_utf8_string_loclen|const U8 *s|STRLEN len|const U8 **ep|const STRLEN *el
   308			
   309			Like is_ut8_string() but stores the location of the failure (in the
   310			case of "utf8ness failure") or the location s+len (in the case of
   311			"utf8ness success") in the C<ep>, and the number of UTF-8
   312			encoded characters in the C<el>.
   313			
   314			See also is_utf8_string_loc() and is_utf8_string().
   315			
   316			=cut
   317			*/
   318			
   319			bool
   320			Perl_is_utf8_string_loclen(pTHX_ const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
   321	         433    {
   322	         433        const U8* x = s;
   323	         433        const U8* send;
   324	         433        STRLEN c;
   325			
   326	         433        if (!len && s)
   327	      ######            len = strlen((const char *)s);
   328	         433        send = s + len;
   329	         433        if (el)
   330	           7            *el = 0;
   331			
   332	       11479        while (x < send) {
   333				 /* Inline the easy bits of is_utf8_char() here for speed... */
   334	       11048    	 if (UTF8_IS_INVARIANT(*x))
   335	        8948    	     c = 1;
   336	        2100    	 else if (!UTF8_IS_START(*x))
   337	        2100    	     goto out;
   338				 else {
   339				     /* ... and call is_utf8_char() only if really needed. */
   340			#ifdef IS_UTF8_CHAR
   341	        2100    	     c = UTF8SKIP(x);
   342	        2100    	     if (IS_UTF8_CHAR_FAST(c)) {
   343	        2100    	         if (!IS_UTF8_CHAR(x, c))
   344	           2    		     c = 0;
   345				     } else
   346	      ######    	         c = is_utf8_char_slow(x, c);
   347			#else
   348				     c = is_utf8_char(x);
   349			#endif /* #ifdef IS_UTF8_CHAR */
   350	        2100    	     if (!c)
   351	           2    	         goto out;
   352				 }
   353	       11046             x += c;
   354	       11046    	 if (el)
   355	          32    	     (*el)++;
   356			    }
   357			
   358			 out:
   359	         433        if (ep)
   360	         433            *ep = x;
   361	         433        if (x != send)
   362	           2    	return FALSE;
   363			
   364	         431        return TRUE;
   365			}
   366			
   367			/*
   368			=for apidoc A|bool|is_utf8_string_loc|const U8 *s|STRLEN len|const U8 **ep|const STRLEN *el
   369			
   370			Like is_ut8_string() but stores the location of the failure (in the
   371			case of "utf8ness failure") or the location s+len (in the case of
   372			"utf8ness success") in the C<ep>.
   373			
   374			See also is_utf8_string_loclen() and is_utf8_string().
   375			
   376			=cut
   377			*/
   378			
   379			bool
   380			Perl_is_utf8_string_loc(pTHX_ const U8 *s, STRLEN len, const U8 **ep)
   381	         426    {
   382	         426        return is_utf8_string_loclen(s, len, ep, 0);
   383			}
   384			
   385			/*
   386			=for apidoc A|UV|utf8n_to_uvuni|const U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
   387			
   388			Bottom level UTF-8 decode routine.
   389			Returns the unicode code point value of the first character in the string C<s>
   390			which is assumed to be in UTF-8 encoding and no longer than C<curlen>;
   391			C<retlen> will be set to the length, in bytes, of that character.
   392			
   393			If C<s> does not point to a well-formed UTF-8 character, the behaviour
   394			is dependent on the value of C<flags>: if it contains UTF8_CHECK_ONLY,
   395			it is assumed that the caller will raise a warning, and this function
   396			will silently just set C<retlen> to C<-1> and return zero.  If the
   397			C<flags> does not contain UTF8_CHECK_ONLY, warnings about
   398			malformations will be given, C<retlen> will be set to the expected
   399			length of the UTF-8 character in bytes, and zero will be returned.
   400			
   401			The C<flags> can also contain various flags to allow deviations from
   402			the strict UTF-8 encoding (see F<utf8.h>).
   403			
   404			Most code should use utf8_to_uvchr() rather than call this directly.
   405			
   406			=cut
   407			*/
   408			
   409			UV
   410			Perl_utf8n_to_uvuni(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
   411	     4967331    {
   412	     4967331        const U8 *s0 = s;
   413	     4967331        UV uv = *s, ouv = 0;
   414	     4967331        STRLEN len = 1;
   415	     4967331        const bool dowarn = ckWARN_d(WARN_UTF8);
   416	     4967331        const UV startbyte = *s;
   417	     4967331        STRLEN expectlen = 0;
   418	     4967331        U32 warning = 0;
   419			
   420			/* This list is a superset of the UTF8_ALLOW_XXX. */
   421			
   422			#define UTF8_WARN_EMPTY				 1
   423			#define UTF8_WARN_CONTINUATION			 2
   424			#define UTF8_WARN_NON_CONTINUATION	 	 3
   425			#define UTF8_WARN_FE_FF				 4
   426			#define UTF8_WARN_SHORT				 5
   427			#define UTF8_WARN_OVERFLOW			 6
   428			#define UTF8_WARN_SURROGATE			 7
   429			#define UTF8_WARN_LONG				 8
   430			#define UTF8_WARN_FFFF				 9 /* Also FFFE. */
   431			
   432	     4967331        if (curlen == 0 &&
   433				!(flags & UTF8_ALLOW_EMPTY)) {
   434	           1    	warning = UTF8_WARN_EMPTY;
   435	           1    	goto malformed;
   436			    }
   437			
   438	     4967330        if (UTF8_IS_INVARIANT(uv)) {
   439	     1986226    	if (retlen)
   440	     1837970    	    *retlen = 1;
   441	     1986226    	return (UV) (NATIVE_TO_UTF(*s));
   442			    }
   443			
   444	     2981104        if (UTF8_IS_CONTINUATION(uv) &&
   445				!(flags & UTF8_ALLOW_CONTINUATION)) {
   446	           1    	warning = UTF8_WARN_CONTINUATION;
   447	           1    	goto malformed;
   448			    }
   449			
   450	     2981103        if (UTF8_IS_START(uv) && curlen > 1 && !UTF8_IS_CONTINUATION(s[1]) &&
   451				!(flags & UTF8_ALLOW_NON_CONTINUATION)) {
   452	         304    	warning = UTF8_WARN_NON_CONTINUATION;
   453	         304    	goto malformed;
   454			    }
   455			
   456			#ifdef EBCDIC
   457			    uv = NATIVE_TO_UTF(uv);
   458			#else
   459	     2980799        if ((uv == 0xfe || uv == 0xff) &&
   460				!(flags & UTF8_ALLOW_FE_FF)) {
   461	           1    	warning = UTF8_WARN_FE_FF;
   462	           1    	goto malformed;
   463			    }
   464			#endif
   465			
   466	     2980798        if      (!(uv & 0x20))	{ len =  2; uv &= 0x1f; }
   467	     2866240        else if (!(uv & 0x10))	{ len =  3; uv &= 0x0f; }
   468	     1471583        else if (!(uv & 0x08))	{ len =  4; uv &= 0x07; }
   469	       34040        else if (!(uv & 0x04))	{ len =  5; uv &= 0x03; }
   470			#ifdef EBCDIC
   471			    else if (!(uv & 0x02))	{ len =  6; uv &= 0x01; }
   472			    else			{ len =  7; uv &= 0x01; }
   473			#else
   474	       34034        else if (!(uv & 0x02))	{ len =  6; uv &= 0x01; }
   475	       34005        else if (!(uv & 0x01))	{ len =  7; uv = 0; }
   476	      ######        else			{ len = 13; uv = 0; } /* whoa! */
   477			#endif
   478			
   479	     2980798        if (retlen)
   480	     2889946    	*retlen = len;
   481			
   482	     2980798        expectlen = len;
   483			
   484	     2980798        if ((curlen < expectlen) &&
   485				!(flags & UTF8_ALLOW_SHORT)) {
   486	           8    	warning = UTF8_WARN_SHORT;
   487	           8    	goto malformed;
   488			    }
   489			
   490	     2980790        len--;
   491	     2980790        s++;
   492	     2980790        ouv = uv;
   493			
   494	    10401468        while (len--) {
   495	     7420680    	if (!UTF8_IS_CONTINUATION(*s) &&
   496				    !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
   497	      ######    	    s--;
   498	      ######    	    warning = UTF8_WARN_NON_CONTINUATION;
   499	      ######    	    goto malformed;
   500				}
   501				else
   502	     7420680    	    uv = UTF8_ACCUMULATE(uv, *s);
   503	     7420680    	if (!(uv > ouv)) {
   504				    /* These cannot be allowed. */
   505	           2    	    if (uv == ouv) {
   506	           2    		if (expectlen != 13 && !(flags & UTF8_ALLOW_LONG)) {
   507	           2    		    warning = UTF8_WARN_LONG;
   508	           2    		    goto malformed;
   509					}
   510				    }
   511				    else { /* uv < ouv */
   512					/* This cannot be allowed. */
   513	      ######    		warning = UTF8_WARN_OVERFLOW;
   514	      ######    		goto malformed;
   515				    }
   516				}
   517	     7420678    	s++;
   518	     7420678    	ouv = uv;
   519			    }
   520			
   521	     2980788        if (UNICODE_IS_SURROGATE(uv) &&
   522				!(flags & UTF8_ALLOW_SURROGATE)) {
   523	           7    	warning = UTF8_WARN_SURROGATE;
   524	           7    	goto malformed;
   525	     2980781        } else if ((expectlen > (STRLEN)UNISKIP(uv)) &&
   526				       !(flags & UTF8_ALLOW_LONG)) {
   527	      ######    	warning = UTF8_WARN_LONG;
   528	      ######    	goto malformed;
   529	     2980781        } else if (UNICODE_IS_ILLEGAL(uv) &&
   530				       !(flags & UTF8_ALLOW_FFFF)) {
   531	           1    	warning = UTF8_WARN_FFFF;
   532	           1    	goto malformed;
   533			    }
   534			
   535	     2980780        return uv;
   536			
   537			malformed:
   538			
   539	         325        if (flags & UTF8_CHECK_ONLY) {
   540	         312    	if (retlen)
   541	         312    	    *retlen = -1;
   542	         312    	return 0;
   543			    }
   544			
   545	          13        if (dowarn) {
   546	           5    	SV* sv = sv_2mortal(newSVpv("Malformed UTF-8 character ", 0));
   547			
   548	           5    	switch (warning) {
   549	      ######    	case 0: /* Intentionally empty. */ break;
   550				case UTF8_WARN_EMPTY:
   551	      ######    	    Perl_sv_catpv(aTHX_ sv, "(empty string)");
   552	      ######    	    break;
   553				case UTF8_WARN_CONTINUATION:
   554	           1    	    Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf", with no preceding start byte)", uv);
   555	           1    	    break;
   556				case UTF8_WARN_NON_CONTINUATION:
   557	           2    	    if (s == s0)
   558	           2    	        Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", immediately after start byte 0x%02"UVxf")",
   559			                           (UV)s[1], startbyte);
   560				    else
   561	      ######    	        Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", %d byte%s after start byte 0x%02"UVxf", expected %d bytes)",
   562			                           (UV)s[1], s - s0, s - s0 > 1 ? "s" : "", startbyte, (int)expectlen);
   563				      
   564	      ######    	    break;
   565				case UTF8_WARN_FE_FF:
   566	           1    	    Perl_sv_catpvf(aTHX_ sv, "(byte 0x%02"UVxf")", uv);
   567	           1    	    break;
   568				case UTF8_WARN_SHORT:
   569	           1    	    Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
   570			                           (int)curlen, curlen == 1 ? "" : "s", (int)expectlen, startbyte);
   571	           1    	    expectlen = curlen;		/* distance for caller to skip */
   572	           1    	    break;
   573				case UTF8_WARN_OVERFLOW:
   574	      ######    	    Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x, after start byte 0x%02"UVxf")",
   575			                           ouv, *s, startbyte);
   576	      ######    	    break;
   577				case UTF8_WARN_SURROGATE:
   578	      ######    	    Perl_sv_catpvf(aTHX_ sv, "(UTF-16 surrogate 0x%04"UVxf")", uv);
   579	      ######    	    break;
   580				case UTF8_WARN_LONG:
   581	      ######    	    Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
   582						   (int)expectlen, expectlen == 1 ? "": "s", UNISKIP(uv), startbyte);
   583	      ######    	    break;
   584				case UTF8_WARN_FFFF:
   585	      ######    	    Perl_sv_catpvf(aTHX_ sv, "(character 0x%04"UVxf")", uv);
   586	      ######    	    break;
   587				default:
   588	      ######    	    Perl_sv_catpv(aTHX_ sv, "(unknown reason)");
   589	           5    	    break;
   590				}
   591				
   592	           5    	if (warning) {
   593	           5    	    const char *s = SvPVX_const(sv);
   594			
   595	           5    	    if (PL_op)
   596	           2    		Perl_warner(aTHX_ packWARN(WARN_UTF8),
   597						    "%s in %s", s,  OP_DESC(PL_op));
   598				    else
   599	           3    		Perl_warner(aTHX_ packWARN(WARN_UTF8), "%s", s);
   600				}
   601			    }
   602			
   603	          13        if (retlen)
   604	          13    	*retlen = expectlen ? expectlen : len;
   605			
   606	          13        return 0;
   607			}
   608			
   609			/*
   610			=for apidoc A|UV|utf8_to_uvchr|const U8 *s|STRLEN *retlen
   611			
   612			Returns the native character value of the first character in the string C<s>
   613			which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
   614			length, in bytes, of that character.
   615			
   616			If C<s> does not point to a well-formed UTF-8 character, zero is
   617			returned and retlen is set, if possible, to -1.
   618			
   619			=cut
   620			*/
   621			
   622			UV
   623			Perl_utf8_to_uvchr(pTHX_ const U8 *s, STRLEN *retlen)
   624	      449536    {
   625	      449536        return Perl_utf8n_to_uvchr(aTHX_ s, UTF8_MAXBYTES, retlen,
   626						       ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
   627			}
   628			
   629			/*
   630			=for apidoc A|UV|utf8_to_uvuni|const U8 *s|STRLEN *retlen
   631			
   632			Returns the Unicode code point of the first character in the string C<s>
   633			which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
   634			length, in bytes, of that character.
   635			
   636			This function should only be used when returned UV is considered
   637			an index into the Unicode semantic tables (e.g. swashes).
   638			
   639			If C<s> does not point to a well-formed UTF-8 character, zero is
   640			returned and retlen is set, if possible, to -1.
   641			
   642			=cut
   643			*/
   644			
   645			UV
   646			Perl_utf8_to_uvuni(pTHX_ const U8 *s, STRLEN *retlen)
   647	           2    {
   648			    /* Call the low level routine asking for checks */
   649	           2        return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXBYTES, retlen,
   650						       ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
   651			}
   652			
   653			/*
   654			=for apidoc A|STRLEN|utf8_length|const U8 *s|const U8 *e
   655			
   656			Return the length of the UTF-8 char encoded string C<s> in characters.
   657			Stops at C<e> (inclusive).  If C<e E<lt> s> or if the scan would end
   658			up past C<e>, croaks.
   659			
   660			=cut
   661			*/
   662			
   663			STRLEN
   664			Perl_utf8_length(pTHX_ const U8 *s, const U8 *e)
   665	      173750    {
   666	      173750        STRLEN len = 0;
   667			
   668			    /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
   669			     * the bitops (especially ~) can create illegal UTF-8.
   670			     * In other words: in Perl UTF-8 is not just for Unicode. */
   671			
   672	      173750        if (e < s)
   673	      ######    	goto warn_and_return;
   674	   253198496        while (s < e) {
   675	   253024747    	const U8 t = UTF8SKIP(s);
   676	   253024747    	if (e - s < t) {
   677				    warn_and_return:
   678	           1    	    if (ckWARN_d(WARN_UTF8)) {
   679	           1    	        if (PL_op)
   680	           1    		    Perl_warner(aTHX_ packWARN(WARN_UTF8),
   681						    "%s in %s", unees, OP_DESC(PL_op));
   682					else
   683	      ######    		    Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
   684				    }
   685	           1    	    return len;
   686				}
   687	   253024746    	s += t;
   688	   253024746    	len++;
   689			    }
   690			
   691	      173749        return len;
   692			}
   693			
   694			/*
   695			=for apidoc A|IV|utf8_distance|const U8 *a|const U8 *b
   696			
   697			Returns the number of UTF-8 characters between the UTF-8 pointers C<a>
   698			and C<b>.
   699			
   700			WARNING: use only if you *know* that the pointers point inside the
   701			same UTF-8 buffer.
   702			
   703			=cut
   704			*/
   705			
   706			IV
   707			Perl_utf8_distance(pTHX_ const U8 *a, const U8 *b)
   708	           7    {
   709	           7        IV off = 0;
   710			
   711			    /* Note: cannot use UTF8_IS_...() too eagerly here since  e.g.
   712			     * the bitops (especially ~) can create illegal UTF-8.
   713			     * In other words: in Perl UTF-8 is not just for Unicode. */
   714			
   715	           7        if (a < b) {
   716	      ######    	while (a < b) {
   717	      ######    	    const U8 c = UTF8SKIP(a);
   718	      ######    	    if (b - a < c)
   719	      ######    		goto warn_and_return;
   720	      ######    	    a += c;
   721	      ######    	    off--;
   722				}
   723			    }
   724			    else {
   725	          42    	while (b < a) {
   726	          35    	    const U8 c = UTF8SKIP(b);
   727			
   728	          35    	    if (a - b < c) {
   729					warn_and_return:
   730	      ######    	        if (ckWARN_d(WARN_UTF8)) {
   731	      ######    		    if (PL_op)
   732	      ######    		        Perl_warner(aTHX_ packWARN(WARN_UTF8),
   733							    "%s in %s", unees, OP_DESC(PL_op));
   734					    else
   735	      ######    		        Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
   736					}
   737	      ######    		return off;
   738				    }
   739	          35    	    b += c;
   740	          35    	    off++;
   741				}
   742			    }
   743			
   744	           7        return off;
   745			}
   746			
   747			/*
   748			=for apidoc A|U8 *|utf8_hop|U8 *s|I32 off
   749			
   750			Return the UTF-8 pointer C<s> displaced by C<off> characters, either
   751			forward or backward.
   752			
   753			WARNING: do not use the following unless you *know* C<off> is within
   754			the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned
   755			on the first byte of character or just after the last byte of a character.
   756			
   757			=cut
   758			*/
   759			
   760			U8 *
   761			Perl_utf8_hop(pTHX_ const U8 *s, I32 off)
   762	         961    {
   763			    /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
   764			     * the bitops (especially ~) can create illegal UTF-8.
   765			     * In other words: in Perl UTF-8 is not just for Unicode. */
   766			
   767	         961        if (off >= 0) {
   768	        8375    	while (off--)
   769	        7436    	    s += UTF8SKIP(s);
   770			    }
   771			    else {
   772	          44    	while (off++) {
   773	          22    	    s--;
   774	          44    	    while (UTF8_IS_CONTINUATION(*s))
   775	          22    		s--;
   776				}
   777			    }
   778	         961        return (U8 *)s;
   779			}
   780			
   781			/*
   782			=for apidoc A|U8 *|utf8_to_bytes|U8 *s|STRLEN *len
   783			
   784			Converts a string C<s> of length C<len> from UTF-8 into byte encoding.
   785			Unlike C<bytes_to_utf8>, this over-writes the original string, and
   786			updates len to contain the new length.
   787			Returns zero on failure, setting C<len> to -1.
   788			
   789			=cut
   790			*/
   791			
   792			U8 *
   793			Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len)
   794	        3052    {
   795	        3052        U8 *send;
   796	        3052        U8 *d;
   797	        3052        U8 *save = s;
   798			
   799			    /* ensure valid UTF-8 and chars < 256 before updating string */
   800	        6064        for (send = s + *len; s < send; ) {
   801	      224855            U8 c = *s++;
   802			
   803	      224855            if (!UTF8_IS_INVARIANT(c) &&
   804			            (!UTF8_IS_DOWNGRADEABLE_START(c) || (s >= send)
   805				     || !(c = *s++) || !UTF8_IS_CONTINUATION(c))) {
   806	          40                *len = -1;
   807	          40                return 0;
   808			        }
   809			    }
   810			
   811	        3012        d = s = save;
   812	      227810        while (s < send) {
   813	      224798            STRLEN ulen;
   814	      224798            *d++ = (U8)utf8_to_uvchr(s, &ulen);
   815	      224798            s += ulen;
   816			    }
   817	        3012        *d = '\0';
   818	        3012        *len = d - save;
   819	        3012        return save;
   820			}
   821			
   822			/*
   823			=for apidoc A|U8 *|bytes_from_utf8|const U8 *s|STRLEN *len|bool *is_utf8
   824			
   825			Converts a string C<s> of length C<len> from UTF-8 into byte encoding.
   826			Unlike C<utf8_to_bytes> but like C<bytes_to_utf8>, returns a pointer to
   827			the newly-created string, and updates C<len> to contain the new
   828			length.  Returns the original string if no conversion occurs, C<len>
   829			is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
   830			0 if C<s> is converted or contains all 7bit characters.
   831			
   832			=cut
   833			*/
   834			
   835			U8 *
   836			Perl_bytes_from_utf8(pTHX_ const U8 *s, STRLEN *len, bool *is_utf8)
   837	        7713    {
   838	        7713        U8 *d;
   839	        7713        const U8 *start = s;
   840	        7713        const U8 *send;
   841	        7713        I32 count = 0;
   842			
   843	        7713        if (!*is_utf8)
   844	      ######            return (U8 *)start;
   845			
   846			    /* ensure valid UTF-8 and chars < 256 before converting string */
   847	       10212        for (send = s + *len; s < send;) {
   848	     1577763            U8 c = *s++;
   849	     1577763    	if (!UTF8_IS_INVARIANT(c)) {
   850	        8641    	    if (UTF8_IS_DOWNGRADEABLE_START(c) && s < send &&
   851			                (c = *s++) && UTF8_IS_CONTINUATION(c))
   852	        3427    		count++;
   853				    else
   854	        5214                    return (U8 *)start;
   855				}
   856			    }
   857			
   858	        2499        *is_utf8 = 0;		
   859			
   860	        2499        Newz(801, d, (*len) - count + 1, U8);
   861	        2499        s = start; start = d;
   862	     1574972        while (s < send) {
   863	     1572473    	U8 c = *s++;
   864	     1572473    	if (!UTF8_IS_INVARIANT(c)) {
   865				    /* Then it is two-byte encoded */
   866	        3382    	    c = UTF8_ACCUMULATE(NATIVE_TO_UTF(c), *s++);
   867	     1572473    	    c = ASCII_TO_NATIVE(c);
   868				}
   869	     1572473    	*d++ = c;
   870			    }
   871	        2499        *d = '\0';
   872	        2499        *len = d - start;
   873	        2499        return (U8 *)start;
   874			}
   875			
   876			/*
   877			=for apidoc A|U8 *|bytes_to_utf8|const U8 *s|STRLEN *len
   878			
   879			Converts a string C<s> of length C<len> from ASCII into UTF-8 encoding.
   880			Returns a pointer to the newly-created string, and sets C<len> to
   881			reflect the new length.
   882			
   883			If you want to convert to UTF-8 from other encodings than ASCII,
   884			see sv_recode_to_utf8().
   885			
   886			=cut
   887			*/
   888			
   889			U8*
   890			Perl_bytes_to_utf8(pTHX_ const U8 *s, STRLEN *len)
   891	       12646    {
   892	       12646        const U8 * const send = s + (*len);
   893	       12646        U8 *d;
   894	       12646        U8 *dst;
   895			
   896	       12646        Newz(801, d, (*len) * 2 + 1, U8);
   897	       12646        dst = d;
   898			
   899	       43211        while (s < send) {
   900	       30565            const UV uv = NATIVE_TO_ASCII(*s++);
   901	       30565            if (UNI_IS_INVARIANT(uv))
   902	       13751                *d++ = (U8)UTF_TO_NATIVE(uv);
   903			        else {
   904	       16814                *d++ = (U8)UTF8_EIGHT_BIT_HI(uv);
   905	       16814                *d++ = (U8)UTF8_EIGHT_BIT_LO(uv);
   906			        }
   907			    }
   908	       12646        *d = '\0';
   909	       12646        *len = d-dst;
   910	       12646        return dst;
   911			}
   912			
   913			/*
   914			 * Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
   915			 *
   916			 * Destination must be pre-extended to 3/2 source.  Do not use in-place.
   917			 * We optimize for native, for obvious reasons. */
   918			
   919			U8*
   920			Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
   921	          21    {
   922	          21        U8* pend;
   923	          21        U8* dstart = d;
   924			
   925	          21        if (bytelen == 1 && p[0] == 0) { /* Be understanding. */
   926	           7    	 d[0] = 0;
   927	           7    	 *newlen = 1;
   928	           7    	 return d;
   929			    }
   930			
   931	          14        if (bytelen & 1)
   932	      ######    	Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen %"UVf, (UV)bytelen);
   933			
   934	          14        pend = p + bytelen;
   935			
   936	         107        while (p < pend) {
   937	          93    	UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
   938	          93    	p += 2;
   939	          93    	if (uv < 0x80) {
   940	          93    	    *d++ = (U8)uv;
   941	          93    	    continue;
   942				}
   943	      ######    	if (uv < 0x800) {
   944	      ######    	    *d++ = (U8)(( uv >>  6)         | 0xc0);
   945	      ######    	    *d++ = (U8)(( uv        & 0x3f) | 0x80);
   946	      ######    	    continue;
   947				}
   948	      ######    	if (uv >= 0xd800 && uv < 0xdbff) {	/* surrogates */
   949	      ######    	    UV low = (p[0] << 8) + p[1];
   950	      ######    	    p += 2;
   951	      ######    	    if (low < 0xdc00 || low >= 0xdfff)
   952	      ######    		Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
   953	      ######    	    uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000;
   954				}
   955	      ######    	if (uv < 0x10000) {
   956	      ######    	    *d++ = (U8)(( uv >> 12)         | 0xe0);
   957	      ######    	    *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
   958	      ######    	    *d++ = (U8)(( uv        & 0x3f) | 0x80);
   959	      ######    	    continue;
   960				}
   961				else {
   962	      ######    	    *d++ = (U8)(( uv >> 18)         | 0xf0);
   963	      ######    	    *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
   964	      ######    	    *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
   965	      ######    	    *d++ = (U8)(( uv        & 0x3f) | 0x80);
   966	      ######    	    continue;
   967				}
   968			    }
   969	          14        *newlen = d - dstart;
   970	          14        return d;
   971			}
   972			
   973			/* Note: this one is slightly destructive of the source. */
   974			
   975			U8*
   976			Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
   977	           7    {
   978	           7        U8* s = (U8*)p;
   979	           7        U8* send = s + bytelen;
   980	          50        while (s < send) {
   981	          43    	U8 tmp = s[0];
   982	          43    	s[0] = s[1];
   983	          43    	s[1] = tmp;
   984	          43    	s += 2;
   985			    }
   986	           7        return utf16_to_utf8(p, d, bytelen, newlen);
   987			}
   988			
   989			/* for now these are all defined (inefficiently) in terms of the utf8 versions */
   990			
   991			bool
   992			Perl_is_uni_alnum(pTHX_ UV c)
   993	          25    {
   994	          25        U8 tmpbuf[UTF8_MAXBYTES+1];
   995	          25        uvchr_to_utf8(tmpbuf, c);
   996	          25        return is_utf8_alnum(tmpbuf);
   997			}
   998			
   999			bool
  1000			Perl_is_uni_alnumc(pTHX_ UV c)
  1001	      ######    {
  1002	      ######        U8 tmpbuf[UTF8_MAXBYTES+1];
  1003	      ######        uvchr_to_utf8(tmpbuf, c);
  1004	      ######        return is_utf8_alnumc(tmpbuf);
  1005			}
  1006			
  1007			bool
  1008			Perl_is_uni_idfirst(pTHX_ UV c)
  1009	      ######    {
  1010	      ######        U8 tmpbuf[UTF8_MAXBYTES+1];
  1011	      ######        uvchr_to_utf8(tmpbuf, c);
  1012	      ######        return is_utf8_idfirst(tmpbuf);
  1013			}
  1014			
  1015			bool
  1016			Perl_is_uni_alpha(pTHX_ UV c)
  1017	      ######    {
  1018	      ######        U8 tmpbuf[UTF8_MAXBYTES+1];
  1019	      ######        uvchr_to_utf8(tmpbuf, c);
  1020	      ######        return is_utf8_alpha(tmpbuf);
  1021			}
  1022			
  1023			bool
  1024			Perl_is_uni_ascii(pTHX_ UV c)
  1025	      ######    {
  1026	      ######        U8 tmpbuf[UTF8_MAXBYTES+1];
  1027	      ######        uvchr_to_utf8(tmpbuf, c);
  1028	      ######        return is_utf8_ascii(tmpbuf);
  1029			}
  1030			
  1031			bool
  1032			Perl_is_uni_space(pTHX_ UV c)
  1033	      ######    {
  1034	      ######        U8 tmpbuf[UTF8_MAXBYTES+1];
  1035	      ######        uvchr_to_utf8(tmpbuf, c);
  1036	      ######        return is_utf8_space(tmpbuf);
  1037			}
  1038			
  1039			bool
  1040			Perl_is_uni_digit(pTHX_ UV c)
  1041	      ######    {
  1042	      ######        U8 tmpbuf[UTF8_MAXBYTES+1];
  1043	      ######        uvchr_to_utf8(tmpbuf, c);
  1044	      ######        return is_utf8_digit(tmpbuf);
  1045			}
  1046			
  1047			bool
  1048			Perl_is_uni_upper(pTHX_ UV c)
  1049	      ######    {
  1050	      ######        U8 tmpbuf[UTF8_MAXBYTES+1];
  1051	      ######        uvchr_to_utf8(tmpbuf, c);
  1052	      ######        return is_utf8_upper(tmpbuf);
  1053			}
  1054			
  1055			bool
  1056			Perl_is_uni_lower(pTHX_ UV c)
  1057	      ######    {
  1058	      ######        U8 tmpbuf[UTF8_MAXBYTES+1];
  1059	      ######        uvchr_to_utf8(tmpbuf, c);
  1060	      ######        return is_utf8_lower(tmpbuf);
  1061			}
  1062			
  1063			bool
  1064			Perl_is_uni_cntrl(pTHX_ UV c)
  1065	      ######    {
  1066	      ######        U8 tmpbuf[UTF8_MAXBYTES+1];
  1067	      ######        uvchr_to_utf8(tmpbuf, c);
  1068	      ######        return is_utf8_cntrl(tmpbuf);
  1069			}
  1070			
  1071			bool
  1072			Perl_is_uni_graph(pTHX_ UV c)
  1073	      ######    {
  1074	      ######        U8 tmpbuf[UTF8_MAXBYTES+1];
  1075	      ######        uvchr_to_utf8(tmpbuf, c);
  1076	      ######        return is_utf8_graph(tmpbuf);
  1077			}
  1078			
  1079			bool
  1080			Perl_is_uni_print(pTHX_ UV c)
  1081	      ######    {
  1082	      ######        U8 tmpbuf[UTF8_MAXBYTES+1];
  1083	      ######        uvchr_to_utf8(tmpbuf, c);
  1084	      ######        return is_utf8_print(tmpbuf);
  1085			}
  1086			
  1087			bool
  1088			Perl_is_uni_punct(pTHX_ UV c)
  1089	      ######    {
  1090	      ######        U8 tmpbuf[UTF8_MAXBYTES+1];
  1091	      ######        uvchr_to_utf8(tmpbuf, c);
  1092	      ######        return is_utf8_punct(tmpbuf);
  1093			}
  1094			
  1095			bool
  1096			Perl_is_uni_xdigit(pTHX_ UV c)
  1097	      ######    {
  1098	      ######        U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
  1099	      ######        uvchr_to_utf8(tmpbuf, c);
  1100	      ######        return is_utf8_xdigit(tmpbuf);
  1101			}
  1102			
  1103			UV
  1104			Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
  1105	      ######    {
  1106	      ######        uvchr_to_utf8(p, c);
  1107	      ######        return to_utf8_upper(p, p, lenp);
  1108			}
  1109			
  1110			UV
  1111			Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
  1112	      ######    {
  1113	      ######        uvchr_to_utf8(p, c);
  1114	      ######        return to_utf8_title(p, p, lenp);
  1115			}
  1116			
  1117			UV
  1118			Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
  1119	      ######    {
  1120	      ######        uvchr_to_utf8(p, c);
  1121	      ######        return to_utf8_lower(p, p, lenp);
  1122			}
  1123			
  1124			UV
  1125			Perl_to_uni_fold(pTHX_ UV c, U8* p, STRLEN *lenp)
  1126	       33212    {
  1127	       33212        uvchr_to_utf8(p, c);
  1128	       33212        return to_utf8_fold(p, p, lenp);
  1129			}
  1130			
  1131			/* for now these all assume no locale info available for Unicode > 255 */
  1132			
  1133			bool
  1134			Perl_is_uni_alnum_lc(pTHX_ UV c)
  1135	      ######    {
  1136	      ######        return is_uni_alnum(c);	/* XXX no locale support yet */
  1137			}
  1138			
  1139			bool
  1140			Perl_is_uni_alnumc_lc(pTHX_ UV c)
  1141	      ######    {
  1142	      ######        return is_uni_alnumc(c);	/* XXX no locale support yet */
  1143			}
  1144			
  1145			bool
  1146			Perl_is_uni_idfirst_lc(pTHX_ UV c)
  1147	      ######    {
  1148	      ######        return is_uni_idfirst(c);	/* XXX no locale support yet */
  1149			}
  1150			
  1151			bool
  1152			Perl_is_uni_alpha_lc(pTHX_ UV c)
  1153	      ######    {
  1154	      ######        return is_uni_alpha(c);	/* XXX no locale support yet */
  1155			}
  1156			
  1157			bool
  1158			Perl_is_uni_ascii_lc(pTHX_ UV c)
  1159	      ######    {
  1160	      ######        return is_uni_ascii(c);	/* XXX no locale support yet */
  1161			}
  1162			
  1163			bool
  1164			Perl_is_uni_space_lc(pTHX_ UV c)
  1165	      ######    {
  1166	      ######        return is_uni_space(c);	/* XXX no locale support yet */
  1167			}
  1168			
  1169			bool
  1170			Perl_is_uni_digit_lc(pTHX_ UV c)
  1171	      ######    {
  1172	      ######        return is_uni_digit(c);	/* XXX no locale support yet */
  1173			}
  1174			
  1175			bool
  1176			Perl_is_uni_upper_lc(pTHX_ UV c)
  1177	      ######    {
  1178	      ######        return is_uni_upper(c);	/* XXX no locale support yet */
  1179			}
  1180			
  1181			bool
  1182			Perl_is_uni_lower_lc(pTHX_ UV c)
  1183	      ######    {
  1184	      ######        return is_uni_lower(c);	/* XXX no locale support yet */
  1185			}
  1186			
  1187			bool
  1188			Perl_is_uni_cntrl_lc(pTHX_ UV c)
  1189	      ######    {
  1190	      ######        return is_uni_cntrl(c);	/* XXX no locale support yet */
  1191			}
  1192			
  1193			bool
  1194			Perl_is_uni_graph_lc(pTHX_ UV c)
  1195	      ######    {
  1196	      ######        return is_uni_graph(c);	/* XXX no locale support yet */
  1197			}
  1198			
  1199			bool
  1200			Perl_is_uni_print_lc(pTHX_ UV c)
  1201	      ######    {
  1202	      ######        return is_uni_print(c);	/* XXX no locale support yet */
  1203			}
  1204			
  1205			bool
  1206			Perl_is_uni_punct_lc(pTHX_ UV c)
  1207	      ######    {
  1208	      ######        return is_uni_punct(c);	/* XXX no locale support yet */
  1209			}
  1210			
  1211			bool
  1212			Perl_is_uni_xdigit_lc(pTHX_ UV c)
  1213	      ######    {
  1214	      ######        return is_uni_xdigit(c);	/* XXX no locale support yet */
  1215			}
  1216			
  1217			U32
  1218			Perl_to_uni_upper_lc(pTHX_ U32 c)
  1219	      ######    {
  1220			    /* XXX returns only the first character -- do not use XXX */
  1221			    /* XXX no locale support yet */
  1222	      ######        STRLEN len;
  1223	      ######        U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
  1224	      ######        return (U32)to_uni_upper(c, tmpbuf, &len);
  1225			}
  1226			
  1227			U32
  1228			Perl_to_uni_title_lc(pTHX_ U32 c)
  1229	      ######    {
  1230			    /* XXX returns only the first character XXX -- do not use XXX */
  1231			    /* XXX no locale support yet */
  1232	      ######        STRLEN len;
  1233	      ######        U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
  1234	      ######        return (U32)to_uni_title(c, tmpbuf, &len);
  1235			}
  1236			
  1237			U32
  1238			Perl_to_uni_lower_lc(pTHX_ U32 c)
  1239	      ######    {
  1240			    /* XXX returns only the first character -- do not use XXX */
  1241			    /* XXX no locale support yet */
  1242	      ######        STRLEN len;
  1243	      ######        U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
  1244	      ######        return (U32)to_uni_lower(c, tmpbuf, &len);
  1245			}
  1246			
  1247			bool
  1248			Perl_is_utf8_alnum(pTHX_ const U8 *p)
  1249	          96    {
  1250	          96        if (!is_utf8_char(p))
  1251	      ######    	return FALSE;
  1252	          96        if (!PL_utf8_alnum)
  1253				/* NOTE: "IsWord", not "IsAlnum", since Alnum is a true
  1254				 * descendant of isalnum(3), in other words, it doesn't
  1255				 * contain the '_'. --jhi */
  1256	          21    	PL_utf8_alnum = swash_init("utf8", "IsWord", &PL_sv_undef, 0, 0);
  1257	          96        return swash_fetch(PL_utf8_alnum, p, TRUE) != 0;
  1258			/*    return *p == '_' || is_utf8_alpha(p) || is_utf8_digit(p); */
  1259			#ifdef SURPRISINGLY_SLOWER  /* probably because alpha is usually true */
  1260			    if (!PL_utf8_alnum)
  1261				PL_utf8_alnum = swash_init("utf8", "",
  1262				    sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
  1263			    return swash_fetch(PL_utf8_alnum, p, TRUE) != 0;
  1264			#endif
  1265			}
  1266			
  1267			bool
  1268			Perl_is_utf8_alnumc(pTHX_ const U8 *p)
  1269	      ######    {
  1270	      ######        if (!is_utf8_char(p))
  1271	      ######    	return FALSE;
  1272	      ######        if (!PL_utf8_alnum)
  1273	      ######    	PL_utf8_alnum = swash_init("utf8", "IsAlnumC", &PL_sv_undef, 0, 0);
  1274	      ######        return swash_fetch(PL_utf8_alnum, p, TRUE) != 0;
  1275			/*    return is_utf8_alpha(p) || is_utf8_digit(p); */
  1276			#ifdef SURPRISINGLY_SLOWER  /* probably because alpha is usually true */
  1277			    if (!PL_utf8_alnum)
  1278				PL_utf8_alnum = swash_init("utf8", "",
  1279				    sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
  1280			    return swash_fetch(PL_utf8_alnum, p, TRUE) != 0;
  1281			#endif
  1282			}
  1283			
  1284			bool
  1285			Perl_is_utf8_idfirst(pTHX_ const U8 *p) /* The naming is historical. */
  1286	      ######    {
  1287	      ######        if (*p == '_')
  1288	      ######    	return TRUE;
  1289	      ######        if (!is_utf8_char(p))
  1290	      ######    	return FALSE;
  1291	      ######        if (!PL_utf8_idstart) /* is_utf8_idstart would be more logical. */
  1292	      ######    	PL_utf8_idstart = swash_init("utf8", "IdStart", &PL_sv_undef, 0, 0);
  1293	      ######        return swash_fetch(PL_utf8_idstart, p, TRUE) != 0;
  1294			}
  1295			
  1296			bool
  1297			Perl_is_utf8_idcont(pTHX_ const U8 *p)
  1298	          32    {
  1299	          32        if (*p == '_')
  1300	      ######    	return TRUE;
  1301	          32        if (!is_utf8_char(p))
  1302	           1    	return FALSE;
  1303	          31        if (!PL_utf8_idcont)
  1304	           5    	PL_utf8_idcont = swash_init("utf8", "IdContinue", &PL_sv_undef, 0, 0);
  1305	          31        return swash_fetch(PL_utf8_idcont, p, TRUE) != 0;
  1306			}
  1307			
  1308			bool
  1309			Perl_is_utf8_alpha(pTHX_ const U8 *p)
  1310	      ######    {
  1311	      ######        if (!is_utf8_char(p))
  1312	      ######    	return FALSE;
  1313	      ######        if (!PL_utf8_alpha)
  1314	      ######    	PL_utf8_alpha = swash_init("utf8", "IsAlpha", &PL_sv_undef, 0, 0);
  1315	      ######        return swash_fetch(PL_utf8_alpha, p, TRUE) != 0;
  1316			}
  1317			
  1318			bool
  1319			Perl_is_utf8_ascii(pTHX_ const U8 *p)
  1320	      ######    {
  1321	      ######        if (!is_utf8_char(p))
  1322	      ######    	return FALSE;
  1323	      ######        if (!PL_utf8_ascii)
  1324	      ######    	PL_utf8_ascii = swash_init("utf8", "IsAscii", &PL_sv_undef, 0, 0);
  1325	      ######        return swash_fetch(PL_utf8_ascii, p, TRUE) != 0;
  1326			}
  1327			
  1328			bool
  1329			Perl_is_utf8_space(pTHX_ const U8 *p)
  1330	          23    {
  1331	          23        if (!is_utf8_char(p))
  1332	      ######    	return FALSE;
  1333	          23        if (!PL_utf8_space)
  1334	           7    	PL_utf8_space = swash_init("utf8", "IsSpacePerl", &PL_sv_undef, 0, 0);
  1335	          23        return swash_fetch(PL_utf8_space, p, TRUE) != 0;
  1336			}
  1337			
  1338			bool
  1339			Perl_is_utf8_digit(pTHX_ const U8 *p)
  1340	          26    {
  1341	          26        if (!is_utf8_char(p))
  1342	      ######    	return FALSE;
  1343	          26        if (!PL_utf8_digit)
  1344	           6    	PL_utf8_digit = swash_init("utf8", "IsDigit", &PL_sv_undef, 0, 0);
  1345	          26        return swash_fetch(PL_utf8_digit, p, TRUE) != 0;
  1346			}
  1347			
  1348			bool
  1349			Perl_is_utf8_upper(pTHX_ const U8 *p)
  1350	      ######    {
  1351	      ######        if (!is_utf8_char(p))
  1352	      ######    	return FALSE;
  1353	      ######        if (!PL_utf8_upper)
  1354	      ######    	PL_utf8_upper = swash_init("utf8", "IsUppercase", &PL_sv_undef, 0, 0);
  1355	      ######        return swash_fetch(PL_utf8_upper, p, TRUE) != 0;
  1356			}
  1357			
  1358			bool
  1359			Perl_is_utf8_lower(pTHX_ const U8 *p)
  1360	      ######    {
  1361	      ######        if (!is_utf8_char(p))
  1362	      ######    	return FALSE;
  1363	      ######        if (!PL_utf8_lower)
  1364	      ######    	PL_utf8_lower = swash_init("utf8", "IsLowercase", &PL_sv_undef, 0, 0);
  1365	      ######        return swash_fetch(PL_utf8_lower, p, TRUE) != 0;
  1366			}
  1367			
  1368			bool
  1369			Perl_is_utf8_cntrl(pTHX_ const U8 *p)
  1370	      ######    {
  1371	      ######        if (!is_utf8_char(p))
  1372	      ######    	return FALSE;
  1373	      ######        if (!PL_utf8_cntrl)
  1374	      ######    	PL_utf8_cntrl = swash_init("utf8", "IsCntrl", &PL_sv_undef, 0, 0);
  1375	      ######        return swash_fetch(PL_utf8_cntrl, p, TRUE) != 0;
  1376			}
  1377			
  1378			bool
  1379			Perl_is_utf8_graph(pTHX_ const U8 *p)
  1380	      ######    {
  1381	      ######        if (!is_utf8_char(p))
  1382	      ######    	return FALSE;
  1383	      ######        if (!PL_utf8_graph)
  1384	      ######    	PL_utf8_graph = swash_init("utf8", "IsGraph", &PL_sv_undef, 0, 0);
  1385	      ######        return swash_fetch(PL_utf8_graph, p, TRUE) != 0;
  1386			}
  1387			
  1388			bool
  1389			Perl_is_utf8_print(pTHX_ const U8 *p)
  1390	      ######    {
  1391	      ######        if (!is_utf8_char(p))
  1392	      ######    	return FALSE;
  1393	      ######        if (!PL_utf8_print)
  1394	      ######    	PL_utf8_print = swash_init("utf8", "IsPrint", &PL_sv_undef, 0, 0);
  1395	      ######        return swash_fetch(PL_utf8_print, p, TRUE) != 0;
  1396			}
  1397			
  1398			bool
  1399			Perl_is_utf8_punct(pTHX_ const U8 *p)
  1400	      ######    {
  1401	      ######        if (!is_utf8_char(p))
  1402	      ######    	return FALSE;
  1403	      ######        if (!PL_utf8_punct)
  1404	      ######    	PL_utf8_punct = swash_init("utf8", "IsPunct", &PL_sv_undef, 0, 0);
  1405	      ######        return swash_fetch(PL_utf8_punct, p, TRUE) != 0;
  1406			}
  1407			
  1408			bool
  1409			Perl_is_utf8_xdigit(pTHX_ const U8 *p)
  1410	      ######    {
  1411	      ######        if (!is_utf8_char(p))
  1412	      ######    	return FALSE;
  1413	      ######        if (!PL_utf8_xdigit)
  1414	      ######    	PL_utf8_xdigit = swash_init("utf8", "IsXDigit", &PL_sv_undef, 0, 0);
  1415	      ######        return swash_fetch(PL_utf8_xdigit, p, TRUE) != 0;
  1416			}
  1417			
  1418			bool
  1419			Perl_is_utf8_mark(pTHX_ const U8 *p)
  1420	          32    {
  1421	          32        if (!is_utf8_char(p))
  1422	      ######    	return FALSE;
  1423	          32        if (!PL_utf8_mark)
  1424	           3    	PL_utf8_mark = swash_init("utf8", "IsM", &PL_sv_undef, 0, 0);
  1425	          32        return swash_fetch(PL_utf8_mark, p, TRUE) != 0;
  1426			}
  1427			
  1428			/*
  1429			=for apidoc A|UV|to_utf8_case|U8 *p|U8* ustrp|STRLEN *lenp|SV **swash|char *normal|char *special
  1430			
  1431			The "p" contains the pointer to the UTF-8 string encoding
  1432			the character that is being converted.
  1433			
  1434			The "ustrp" is a pointer to the character buffer to put the
  1435			conversion result to.  The "lenp" is a pointer to the length
  1436			of the result.
  1437			
  1438			The "swashp" is a pointer to the swash to use.
  1439			
  1440			Both the special and normal mappings are stored lib/unicore/To/Foo.pl,
  1441			and loaded by SWASHGET, using lib/utf8_heavy.pl.  The special (usually,
  1442			but not always, a multicharacter mapping), is tried first.
  1443			
  1444			The "special" is a string like "utf8::ToSpecLower", which means the
  1445			hash %utf8::ToSpecLower.  The access to the hash is through
  1446			Perl_to_utf8_case().
  1447			
  1448			The "normal" is a string like "ToLower" which means the swash
  1449			%utf8::ToLower.
  1450			
  1451			=cut */
  1452			
  1453			UV
  1454			Perl_to_utf8_case(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, SV **swashp, const char *normal, const char *special)
  1455	       79604    {
  1456	       79604        U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
  1457	       79604        STRLEN len = 0;
  1458			
  1459	       79604        const UV uv0 = utf8_to_uvchr(p, 0);
  1460			    /* The NATIVE_TO_UNI() and UNI_TO_NATIVE() mappings
  1461			     * are necessary in EBCDIC, they are redundant no-ops
  1462			     * in ASCII-ish platforms, and hopefully optimized away. */
  1463	       79604        const UV uv1 = NATIVE_TO_UNI(uv0);
  1464	       79604        uvuni_to_utf8(tmpbuf, uv1);
  1465			
  1466	       79604        if (!*swashp) /* load on-demand */
  1467	          22             *swashp = swash_init("utf8", normal, &PL_sv_undef, 4, 0);
  1468			
  1469			    /* The 0xDF is the only special casing Unicode code point below 0x100. */
  1470	       79604        if (special && (uv1 == 0xDF || uv1 > 0xFF)) {
  1471			         /* It might be "special" (sometimes, but not always,
  1472				  * a multicharacter mapping) */
  1473	       18348    	 HV *hv;
  1474	       18348    	 SV **svp;
  1475			
  1476	       18348    	 if ((hv  = get_hv(special, FALSE)) &&
  1477				     (svp = hv_fetch(hv, (const char*)tmpbuf, UNISKIP(uv1), FALSE)) &&
  1478				     (*svp)) {
  1479	        1164    	     const char *s;
  1480			
  1481	        1164    	      s = SvPV_const(*svp, len);
  1482	        1164    	      if (len == 1)
  1483	      ######    		   len = uvuni_to_utf8(ustrp, NATIVE_TO_UNI(*(U8*)s)) - ustrp;
  1484				      else {
  1485			#ifdef EBCDIC
  1486					   /* If we have EBCDIC we need to remap the characters
  1487					    * since any characters in the low 256 are Unicode
  1488					    * code points, not EBCDIC. */
  1489					   U8 *t = (U8*)s, *tend = t + len, *d;
  1490					
  1491					   d = tmpbuf;
  1492					   if (SvUTF8(*svp)) {
  1493						STRLEN tlen = 0;
  1494						
  1495						while (t < tend) {
  1496						     UV c = utf8_to_uvchr(t, &tlen);
  1497						     if (tlen > 0) {
  1498							  d = uvchr_to_utf8(d, UNI_TO_NATIVE(c));
  1499							  t += tlen;
  1500						     }
  1501						     else
  1502							  break;
  1503						}
  1504					   }
  1505					   else {
  1506						while (t < tend) {
  1507						     d = uvchr_to_utf8(d, UNI_TO_NATIVE(*t));
  1508						     t++;
  1509						}
  1510					   }
  1511					   len = d - tmpbuf;
  1512					   Copy(tmpbuf, ustrp, len, U8);
  1513			#else
  1514	        1164    		   Copy(s, ustrp, len, U8);
  1515			#endif
  1516				      }
  1517				 }
  1518			    }
  1519			
  1520	       79604        if (!len && *swashp) {
  1521	       78440    	 UV uv2 = swash_fetch(*swashp, tmpbuf, TRUE);
  1522				 
  1523	       78440    	 if (uv2) {
  1524				      /* It was "normal" (a single character mapping). */
  1525	       10872    	      UV uv3 = UNI_TO_NATIVE(uv2);
  1526				      
  1527	       10872    	      len = uvchr_to_utf8(ustrp, uv3) - ustrp;
  1528				 }
  1529			    }
  1530			
  1531	       79604        if (!len) /* Neither: just copy. */
  1532	       67568    	 len = uvchr_to_utf8(ustrp, uv0) - ustrp;
  1533			
  1534	       79604        if (lenp)
  1535	       79604    	 *lenp = len;
  1536			
  1537	       79604        return len ? utf8_to_uvchr(ustrp, 0) : 0;
  1538			}
  1539			
  1540			/*
  1541			=for apidoc A|UV|to_utf8_upper|const U8 *p|U8 *ustrp|STRLEN *lenp
  1542			
  1543			Convert the UTF-8 encoded character at p to its uppercase version and
  1544			store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
  1545			that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since
  1546			the uppercase version may be longer than the original character.
  1547			
  1548			The first character of the uppercased version is returned
  1549			(but note, as explained above, that there may be more.)
  1550			
  1551			=cut */
  1552			
  1553			UV
  1554			Perl_to_utf8_upper(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
  1555	        7034    {
  1556	        7034        return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
  1557			                             &PL_utf8_toupper, "ToUpper", "utf8::ToSpecUpper");
  1558			}
  1559			
  1560			/*
  1561			=for apidoc A|UV|to_utf8_title|const U8 *p|U8 *ustrp|STRLEN *lenp
  1562			
  1563			Convert the UTF-8 encoded character at p to its titlecase version and
  1564			store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
  1565			that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
  1566			titlecase version may be longer than the original character.
  1567			
  1568			The first character of the titlecased version is returned
  1569			(but note, as explained above, that there may be more.)
  1570			
  1571			=cut */
  1572			
  1573			UV
  1574			Perl_to_utf8_title(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
  1575	         936    {
  1576	         936        return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
  1577			                             &PL_utf8_totitle, "ToTitle", "utf8::ToSpecTitle");
  1578			}
  1579			
  1580			/*
  1581			=for apidoc A|UV|to_utf8_lower|const U8 *p|U8 *ustrp|STRLEN *lenp
  1582			
  1583			Convert the UTF-8 encoded character at p to its lowercase version and
  1584			store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
  1585			that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
  1586			lowercase version may be longer than the original character.
  1587			
  1588			The first character of the lowercased version is returned
  1589			(but note, as explained above, that there may be more.)
  1590			
  1591			=cut */
  1592			
  1593			UV
  1594			Perl_to_utf8_lower(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
  1595	        7279    {
  1596	        7279        return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
  1597			                             &PL_utf8_tolower, "ToLower", "utf8::ToSpecLower");
  1598			}
  1599			
  1600			/*
  1601			=for apidoc A|UV|to_utf8_fold|const U8 *p|U8 *ustrp|STRLEN *lenp
  1602			
  1603			Convert the UTF-8 encoded character at p to its foldcase version and
  1604			store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
  1605			that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
  1606			foldcase version may be longer than the original character (up to
  1607			three characters).
  1608			
  1609			The first character of the foldcased version is returned
  1610			(but note, as explained above, that there may be more.)
  1611			
  1612			=cut */
  1613			
  1614			UV
  1615			Perl_to_utf8_fold(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
  1616	       64355    {
  1617	       64355        return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
  1618			                             &PL_utf8_tofold, "ToFold", "utf8::ToSpecFold");
  1619			}
  1620			
  1621			/* a "swash" is a swatch hash */
  1622			
  1623			SV*
  1624			Perl_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 minbits, I32 none)
  1625	        8359    {
  1626			    dVAR;
  1627	        8359        SV* retval;
  1628	        8359        SV* tokenbufsv = sv_newmortal();
  1629	        8359        dSP;
  1630	        8359        const size_t pkg_len = strlen(pkg);
  1631	        8359        const size_t name_len = strlen(name);
  1632	        8359        HV *stash = gv_stashpvn(pkg, pkg_len, FALSE);
  1633	        8359        SV* errsv_save;
  1634			
  1635	        8359        PUSHSTACKi(PERLSI_MAGIC);
  1636	        8359        ENTER;
  1637	        8359        SAVEI32(PL_hints);
  1638	        8359        PL_hints = 0;
  1639	        8359        save_re_context();
  1640	        8359        if (!gv_fetchmeth(stash, "SWASHNEW", 8, -1)) {	/* demand load utf8 */
  1641	          54    	ENTER;
  1642	          54    	errsv_save = newSVsv(ERRSV);
  1643	          54    	Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpvn(pkg,pkg_len),
  1644						 Nullsv);
  1645	          54    	if (!SvTRUE(ERRSV))
  1646	          54    	    sv_setsv(ERRSV, errsv_save);
  1647	          54    	SvREFCNT_dec(errsv_save);
  1648	          54    	LEAVE;
  1649			    }
  1650	        8359        SPAGAIN;
  1651	        8359        PUSHMARK(SP);
  1652	        8359        EXTEND(SP,5);
  1653	        8359        PUSHs(sv_2mortal(newSVpvn(pkg, pkg_len)));
  1654	        8359        PUSHs(sv_2mortal(newSVpvn(name, name_len)));
  1655	        8359        PUSHs(listsv);
  1656	        8359        PUSHs(sv_2mortal(newSViv(minbits)));
  1657	        8359        PUSHs(sv_2mortal(newSViv(none)));
  1658	        8359        PUTBACK;
  1659	        8359        if (IN_PERL_COMPILETIME) {
  1660				/* XXX ought to be handled by lex_start */
  1661	          71    	SAVEI32(PL_in_my);
  1662	          71    	PL_in_my = 0;
  1663	          71    	sv_setpv(tokenbufsv, PL_tokenbuf);
  1664			    }
  1665	        8359        errsv_save = newSVsv(ERRSV);
  1666	        8359        if (call_method("SWASHNEW", G_SCALAR))
  1667	        8359    	retval = newSVsv(*PL_stack_sp--);
  1668			    else
  1669	      ######    	retval = &PL_sv_undef;
  1670	        8359        if (!SvTRUE(ERRSV))
  1671	        8359    	sv_setsv(ERRSV, errsv_save);
  1672	        8359        SvREFCNT_dec(errsv_save);
  1673	        8359        LEAVE;
  1674	        8359        POPSTACK;
  1675	        8359        if (IN_PERL_COMPILETIME) {
  1676	          71    	STRLEN len;
  1677	          71            const char* pv = SvPV_const(tokenbufsv, len);
  1678			
  1679	          71    	Copy(pv, PL_tokenbuf, len+1, char);
  1680	          71    	PL_curcop->op_private = (U8)(PL_hints & HINT_PRIVATE_MASK);
  1681			    }
  1682	        8359        if (!SvROK(retval) || SvTYPE(SvRV(retval)) != SVt_PVHV) {
  1683	      ######            if (SvPOK(retval))
  1684	      ######    	    Perl_croak(aTHX_ "Can't find Unicode property definition \"%"SVf"\"",
  1685					       retval);
  1686	      ######    	Perl_croak(aTHX_ "SWASHNEW didn't return an HV ref");
  1687			    }
  1688	        8359        return retval;
  1689			}
  1690			
  1691			
  1692			/* This API is wrong for special case conversions since we may need to
  1693			 * return several Unicode characters for a single Unicode character
  1694			 * (see lib/unicore/SpecCase.txt) The SWASHGET in lib/utf8_heavy.pl is
  1695			 * the lower-level routine, and it is similarly broken for returning
  1696			 * multiple values.  --jhi */
  1697			UV
  1698			Perl_swash_fetch(pTHX_ SV *sv, const U8 *ptr, bool do_utf8)
  1699	     2524542    {
  1700			    dVAR;
  1701	     2524542        HV* hv = (HV*)SvRV(sv);
  1702	     2524542        U32 klen;
  1703	     2524542        U32 off;
  1704	     2524542        STRLEN slen;
  1705	     2524542        STRLEN needents;
  1706	     2524542        const U8 *tmps = NULL;
  1707	     2524542        U32 bit;
  1708	     2524542        SV *retval;
  1709	     2524542        U8 tmputf8[2];
  1710	     2524542        UV c = NATIVE_TO_ASCII(*ptr);
  1711			
  1712	     2524542        if (!do_utf8 && !UNI_IS_INVARIANT(c)) {
  1713	        1324            tmputf8[0] = (U8)UTF8_EIGHT_BIT_HI(c);
  1714	        1324            tmputf8[1] = (U8)UTF8_EIGHT_BIT_LO(c);
  1715	        1324            ptr = tmputf8;
  1716			    }
  1717			    /* Given a UTF-X encoded char 0xAA..0xYY,0xZZ
  1718			     * then the "swatch" is a vec() for al the chars which start
  1719			     * with 0xAA..0xYY
  1720			     * So the key in the hash (klen) is length of encoded char -1
  1721			     */
  1722	     2524542        klen = UTF8SKIP(ptr) - 1;
  1723	     2524542        off  = ptr[klen];
  1724			
  1725	     2524542        if (klen == 0)
  1726			     {
  1727			      /* If char in invariant then swatch is for all the invariant chars
  1728			       * In both UTF-8 and UTF-8-MOD that happens to be UTF_CONTINUATION_MARK
  1729			       */
  1730	       87529          needents = UTF_CONTINUATION_MARK;
  1731	       87529          off      = NATIVE_TO_UTF(ptr[klen]);
  1732			     }
  1733			    else
  1734			     {
  1735			      /* If char is encoded then swatch is for the prefix */
  1736	     2437013          needents = (1 << UTF_ACCUMULATION_SHIFT);
  1737	     2437013          off      = NATIVE_TO_UTF(ptr[klen]) & UTF_CONTINUATION_MASK;
  1738			     }
  1739			
  1740			    /*
  1741			     * This single-entry cache saves about 1/3 of the utf8 overhead in test
  1742			     * suite.  (That is, only 7-8% overall over just a hash cache.  Still,
  1743			     * it's nothing to sniff at.)  Pity we usually come through at least
  1744			     * two function calls to get here...
  1745			     *
  1746			     * NB: this code assumes that swatches are never modified, once generated!
  1747			     */
  1748			
  1749	     2524542        if (hv   == PL_last_swash_hv &&
  1750				klen == PL_last_swash_klen &&
  1751				(!klen || memEQ((char *)ptr, (char *)PL_last_swash_key, klen)) )
  1752			    {
  1753	     2402559    	tmps = PL_last_swash_tmps;
  1754	     2402559    	slen = PL_last_swash_slen;
  1755			    }
  1756			    else {
  1757				/* Try our second-level swatch cache, kept in a hash. */
  1758	      121983    	SV** svp = hv_fetch(hv, (const char*)ptr, klen, FALSE);
  1759			
  1760				/* If not cached, generate it via utf8::SWASHGET */
  1761	      121983    	if (!svp || !SvPOK(*svp) || !(tmps = (const U8*)SvPV_const(*svp, slen))) {
  1762	       32163    	    dSP;
  1763				    /* We use utf8n_to_uvuni() as we want an index into
  1764				       Unicode tables, not a native character number.
  1765				     */
  1766	       32163    	    UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXBYTES, 0,
  1767								   ckWARN(WARN_UTF8) ?
  1768	       32163    					   0 : UTF8_ALLOW_ANY);
  1769	       32163    	    SV *errsv_save;
  1770	       32163    	    ENTER;
  1771	       32163    	    SAVETMPS;
  1772	       32163    	    save_re_context();
  1773	       32163    	    PUSHSTACKi(PERLSI_MAGIC);
  1774	       32163    	    PUSHMARK(SP);
  1775	       32163    	    EXTEND(SP,3);
  1776	       32163    	    PUSHs((SV*)sv);
  1777				    /* On EBCDIC & ~(0xA0-1) isn't a useful thing to do */
  1778				    PUSHs(sv_2mortal(newSViv((klen) ?
  1779	       32163    				     (code_point & ~(needents - 1)) : 0)));
  1780	       32163    	    PUSHs(sv_2mortal(newSViv(needents)));
  1781	       32163    	    PUTBACK;
  1782	       32163    	    errsv_save = newSVsv(ERRSV);
  1783	       32163    	    if (call_method("SWASHGET", G_SCALAR))
  1784	       32163    		retval = newSVsv(*PL_stack_sp--);
  1785				    else
  1786	      ######    		retval = &PL_sv_undef;
  1787	       32163    	    if (!SvTRUE(ERRSV))
  1788	       32101    		sv_setsv(ERRSV, errsv_save);
  1789	       32163    	    SvREFCNT_dec(errsv_save);
  1790	       32163    	    POPSTACK;
  1791	       32163    	    FREETMPS;
  1792	       32163    	    LEAVE;
  1793	       32163    	    if (IN_PERL_COMPILETIME)
  1794	          36    		PL_curcop->op_private = (U8)(PL_hints & HINT_PRIVATE_MASK);
  1795			
  1796	       32163    	    svp = hv_store(hv, (const char *)ptr, klen, retval, 0);
  1797			
  1798	       32163    	    if (!svp || !(tmps = (U8*)SvPV(*svp, slen)) || (slen << 3) < needents)
  1799	      ######    		Perl_croak(aTHX_ "SWASHGET didn't return result of proper length");
  1800				}
  1801			
  1802	      121983    	PL_last_swash_hv = hv;
  1803	      121983    	PL_last_swash_klen = klen;
  1804				/* FIXME change interpvar.h?  */
  1805	      121983    	PL_last_swash_tmps = (U8 *) tmps;
  1806	      121983    	PL_last_swash_slen = slen;
  1807	      121983    	if (klen)
  1808	       80335    	    Copy(ptr, PL_last_swash_key, klen, U8);
  1809			    }
  1810			
  1811	     2524542        switch ((int)((slen << 3) / needents)) {
  1812			    case 1:
  1813	     1812687    	bit = 1 << (off & 7);
  1814	     1812687    	off >>= 3;
  1815	     1812687    	return (tmps[off] & bit) != 0;
  1816			    case 8:
  1817	       29915    	return tmps[off];
  1818			    case 16:
  1819	      601885    	off <<= 1;
  1820	      601885    	return (tmps[off] << 8) + tmps[off + 1] ;
  1821			    case 32:
  1822	       80055    	off <<= 2;
  1823	       80055    	return (tmps[off] << 24) + (tmps[off+1] << 16) + (tmps[off+2] << 8) + tmps[off + 3] ;
  1824			    }
  1825	      ######        Perl_croak(aTHX_ "panic: swash_fetch");
  1826	     2524542        return 0;
  1827			}
  1828			
  1829			
  1830			/*
  1831			=for apidoc A|U8 *|uvchr_to_utf8|U8 *d|UV uv
  1832			
  1833			Adds the UTF-8 representation of the Native codepoint C<uv> to the end
  1834			of the string C<d>; C<d> should be have at least C<UTF8_MAXBYTES+1> free
  1835			bytes available. The return value is the pointer to the byte after the
  1836			end of the new character. In other words,
  1837			
  1838			    d = uvchr_to_utf8(d, uv);
  1839			
  1840			is the recommended wide native character-aware way of saying
  1841			
  1842			    *(d++) = uv;
  1843			
  1844			=cut
  1845			*/
  1846			
  1847			/* On ASCII machines this is normally a macro but we want a
  1848			   real function in case XS code wants it
  1849			*/
  1850			#undef Perl_uvchr_to_utf8
  1851			U8 *
  1852			Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
  1853	      ######    {
  1854	      ######        return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), 0);
  1855			}
  1856			
  1857			U8 *
  1858			Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
  1859	      854261    {
  1860	      854261        return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), flags);
  1861			}
  1862			
  1863			/*
  1864			=for apidoc A|UV|utf8n_to_uvchr|U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
  1865			
  1866			Returns the native character value of the first character in the string C<s>
  1867			which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
  1868			length, in bytes, of that character.
  1869			
  1870			Allows length and flags to be passed to low level routine.
  1871			
  1872			=cut
  1873			*/
  1874			/* On ASCII machines this is normally a macro but we want
  1875			   a real function in case XS code wants it
  1876			*/
  1877			#undef Perl_utf8n_to_uvchr
  1878			UV
  1879			Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
  1880	      ######    {
  1881	      ######        UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags);
  1882	      ######        return UNI_TO_NATIVE(uv);
  1883			}
  1884			
  1885			/*
  1886			=for apidoc A|char *|pv_uni_display|SV *dsv|U8 *spv|STRLEN len|STRLEN pvlim|UV flags
  1887			
  1888			Build to the scalar dsv a displayable version of the string spv,
  1889			length len, the displayable version being at most pvlim bytes long
  1890			(if longer, the rest is truncated and "..." will be appended).
  1891			
  1892			The flags argument can have UNI_DISPLAY_ISPRINT set to display
  1893			isPRINT()able characters as themselves, UNI_DISPLAY_BACKSLASH
  1894			to display the \\[nrfta\\] as the backslashed versions (like '\n')
  1895			(UNI_DISPLAY_BACKSLASH is preferred over UNI_DISPLAY_ISPRINT for \\).
  1896			UNI_DISPLAY_QQ (and its alias UNI_DISPLAY_REGEX) have both
  1897			UNI_DISPLAY_BACKSLASH and UNI_DISPLAY_ISPRINT turned on.
  1898			
  1899			The pointer to the PV of the dsv is returned.
  1900			
  1901			=cut */
  1902			char *
  1903			Perl_pv_uni_display(pTHX_ SV *dsv, const U8 *spv, STRLEN len, STRLEN pvlim, UV flags)
  1904	           5    {
  1905	           5        int truncated = 0;
  1906	           5        const char *s, *e;
  1907			
  1908	           5        sv_setpvn(dsv, "", 0);
  1909	          14        for (s = (const char *)spv, e = s + len; s < e; s += UTF8SKIP(s)) {
  1910	           9    	 UV u;
  1911				  /* This serves double duty as a flag and a character to print after
  1912				     a \ when flags & UNI_DISPLAY_BACKSLASH is true.
  1913				  */
  1914	           9    	 char ok = 0;
  1915			
  1916	           9    	 if (pvlim && SvCUR(dsv) >= pvlim) {
  1917	      ######    	      truncated++;
  1918	      ######    	      break;
  1919				 }
  1920	           9    	 u = utf8_to_uvchr((U8*)s, 0);
  1921	           9    	 if (u < 256) {
  1922	           1    	     const unsigned char c = (unsigned char)u & 0xFF;
  1923	           1    	     if (!ok && (flags & UNI_DISPLAY_BACKSLASH)) {
  1924	           1    	         switch (c) {
  1925					 case '\n':
  1926	      ######    		     ok = 'n'; break;
  1927					 case '\r':
  1928	      ######    		     ok = 'r'; break;
  1929					 case '\t':
  1930	      ######    		     ok = 't'; break;
  1931					 case '\f':
  1932	      ######    		     ok = 'f'; break;
  1933					 case '\a':
  1934	      ######    		     ok = 'a'; break;
  1935					 case '\\':
  1936	      ######    		     ok = '\\'; break;
  1937	           1    		 default: break;
  1938					 }
  1939	           1    		 if (ok) {
  1940	      ######    		     Perl_sv_catpvf(aTHX_ dsv, "\\%c", ok);
  1941					 }
  1942				     }
  1943				     /* isPRINT() is the locale-blind version. */
  1944	           1    	     if (!ok && (flags & UNI_DISPLAY_ISPRINT) && isPRINT(c)) {
  1945	      ######    	         Perl_sv_catpvf(aTHX_ dsv, "%c", c);
  1946	      ######    		 ok = 1;
  1947				     }
  1948				 }
  1949	           9    	 if (!ok)
  1950	           9    	     Perl_sv_catpvf(aTHX_ dsv, "\\x{%"UVxf"}", u);
  1951			    }
  1952	           5        if (truncated)
  1953	      ######    	 sv_catpvn(dsv, "...", 3);
  1954			    
  1955	           5        return SvPVX(dsv);
  1956			}
  1957			
  1958			/*
  1959			=for apidoc A|char *|sv_uni_display|SV *dsv|SV *ssv|STRLEN pvlim|UV flags
  1960			
  1961			Build to the scalar dsv a displayable version of the scalar sv,
  1962			the displayable version being at most pvlim bytes long
  1963			(if longer, the rest is truncated and "..." will be appended).
  1964			
  1965			The flags argument is as in pv_uni_display().
  1966			
  1967			The pointer to the PV of the dsv is returned.
  1968			
  1969			=cut */
  1970			char *
  1971			Perl_sv_uni_display(pTHX_ SV *dsv, SV *ssv, STRLEN pvlim, UV flags)
  1972	           5    {
  1973	           5         return Perl_pv_uni_display(aTHX_ dsv, (const U8*)SvPVX_const(ssv),
  1974							SvCUR(ssv), pvlim, flags);
  1975			}
  1976			
  1977			/*
  1978			=for apidoc A|I32|ibcmp_utf8|const char *s1|char **pe1|register UV l1|bool u1|const char *s2|char **pe2|register UV l2|bool u2
  1979			
  1980			Return true if the strings s1 and s2 differ case-insensitively, false
  1981			if not (if they are equal case-insensitively).  If u1 is true, the
  1982			string s1 is assumed to be in UTF-8-encoded Unicode.  If u2 is true,
  1983			the string s2 is assumed to be in UTF-8-encoded Unicode.  If u1 or u2
  1984			are false, the respective string is assumed to be in native 8-bit
  1985			encoding.
  1986			
  1987			If the pe1 and pe2 are non-NULL, the scanning pointers will be copied
  1988			in there (they will point at the beginning of the I<next> character).
  1989			If the pointers behind pe1 or pe2 are non-NULL, they are the end
  1990			pointers beyond which scanning will not continue under any
  1991			circumstances.  If the byte lengths l1 and l2 are non-zero, s1+l1 and
  1992			s2+l2 will be used as goal end pointers that will also stop the scan,
  1993			and which qualify towards defining a successful match: all the scans
  1994			that define an explicit length must reach their goal pointers for
  1995			a match to succeed).
  1996			
  1997			For case-insensitiveness, the "casefolding" of Unicode is used
  1998			instead of upper/lowercasing both the characters, see
  1999			http://www.unicode.org/unicode/reports/tr21/ (Case Mappings).
  2000			
  2001			=cut */
  2002			I32
  2003			Perl_ibcmp_utf8(pTHX_ const char *s1, char **pe1, register UV l1, bool u1, const char *s2, char **pe2, register UV l2, bool u2)
  2004	       11858    {
  2005	       11858         register const U8 *p1  = (const U8*)s1;
  2006	       11858         register const U8 *p2  = (const U8*)s2;
  2007	       11858         register const U8 *f1 = 0, *f2 = 0;
  2008	       11858         register U8 *e1 = 0, *q1 = 0;
  2009	       11858         register U8 *e2 = 0, *q2 = 0;
  2010	       11858         STRLEN n1 = 0, n2 = 0;
  2011	       11858         U8 foldbuf1[UTF8_MAXBYTES_CASE+1];
  2012	       11858         U8 foldbuf2[UTF8_MAXBYTES_CASE+1];
  2013	       11858         U8 natbuf[1+1];
  2014	       11858         STRLEN foldlen1, foldlen2;
  2015	       11858         bool match;
  2016			     
  2017	       11858         if (pe1)
  2018	      ######    	  e1 = *(U8**)pe1;
  2019	       11858         if (e1 == 0 || (l1 && l1 < (UV)(e1 - (const U8*)s1)))
  2020	       11858    	  f1 = (const U8*)s1 + l1;
  2021	       11858         if (pe2)
  2022	        8907    	  e2 = *(U8**)pe2;
  2023	       11858         if (e2 == 0 || (l2 && l2 < (UV)(e2 - (const U8*)s2)))
  2024	        2951    	  f2 = (const U8*)s2 + l2;
  2025			
  2026	       11858         if ((e1 == 0 && f1 == 0) || (e2 == 0 && f2 == 0) || (f1 == 0 && f2 == 0))
  2027	      ######    	  return 1; /* mismatch; possible infinite loop or false positive */
  2028			
  2029	       11858         if (!u1 || !u2)
  2030	          25    	  natbuf[1] = 0; /* Need to terminate the buffer. */
  2031			
  2032	       27022         while ((e1 == 0 || p1 < e1) &&
  2033				    (f1 == 0 || p1 < f1) &&
  2034				    (e2 == 0 || p2 < e2) &&
  2035				    (f2 == 0 || p2 < f2)) {
  2036	       15179    	  if (n1 == 0) {
  2037	       15179    	       if (u1)
  2038	       15147    		    to_utf8_fold(p1, foldbuf1, &foldlen1);
  2039				       else {
  2040	          32    		    natbuf[0] = *p1;
  2041	          32    		    to_utf8_fold(natbuf, foldbuf1, &foldlen1);
  2042				       }
  2043	       15179    	       q1 = foldbuf1;
  2044	       15179    	       n1 = foldlen1;
  2045				  }
  2046	       15179    	  if (n2 == 0) {
  2047	       14938    	       if (u2)
  2048	       14929    		    to_utf8_fold(p2, foldbuf2, &foldlen2);
  2049				       else {
  2050	           9    		    natbuf[0] = *p2;
  2051	           9    		    to_utf8_fold(natbuf, foldbuf2, &foldlen2);
  2052				       }
  2053	       14938    	       q2 = foldbuf2;
  2054	       14938    	       n2 = foldlen2;
  2055				  }
  2056	       30343    	  while (n1 && n2) {
  2057	       15179    	       if ( UTF8SKIP(q1) != UTF8SKIP(q2) ||
  2058					   (UTF8SKIP(q1) == 1 && *q1 != *q2) ||
  2059					    memNE((char*)q1, (char*)q2, UTF8SKIP(q1)) )
  2060	          15    		   return 1; /* mismatch */
  2061	       15164    	       n1 -= UTF8SKIP(q1);
  2062	       15164    	       q1 += UTF8SKIP(q1);
  2063	       15164    	       n2 -= UTF8SKIP(q2);
  2064	       15164    	       q2 += UTF8SKIP(q2);
  2065				  }
  2066	       15164    	  if (n1 == 0)
  2067	       15164    	       p1 += u1 ? UTF8SKIP(p1) : 1;
  2068	       15164    	  if (n2 == 0)
  2069	       14923    	       p2 += u2 ? UTF8SKIP(p2) : 1;
  2070			
  2071			     }
  2072			
  2073			     /* A match is defined by all the scans that specified
  2074			      * an explicit length reaching their final goals. */
  2075	       11843         match = (f1 == 0 || p1 == f1) && (f2 == 0 || p2 == f2);
  2076			
  2077	       11843         if (match) {
  2078	        8892    	  if (pe1)
  2079	      ######    	       *pe1 = (char*)p1;
  2080	        8892    	  if (pe2)
  2081	        8892    	       *pe2 = (char*)p2;
  2082			     }
  2083			
  2084	       11843         return match ? 0 : 1; /* 0 match, 1 mismatch */
  2085			}
  2086			
  2087			/*
  2088			 * Local variables:
  2089			 * c-indentation-style: bsd
  2090			 * c-basic-offset: 4
  2091			 * indent-tabs-mode: t
  2092			 * End:
  2093			 *
  2094			 * ex: set ts=8 sts=4 sw=4 noet:
  2095			 */
