		/*    utf8.c
		 *
		 *    Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005 by Larry Wall and
		 *    others
		 *
		 *    You may distribute under the terms of either the GNU General Public
		 *    License or the Artistic License, as specified in the README file.
		 *
		 */
		
		/*
		 * 'What a fix!' said Sam. 'That's the one place in all the lands we've ever
		 * heard of that we don't want to see any closer; and that's the one place
		 * we're trying to get to!  And that's just where we can't get, nohow.'
		 *
		 * 'Well do I understand your speech,' he answered in the same language;
		 * 'yet few strangers do so.  Why then do you not speak in the Common Tongue,
		 * as is the custom in the West, if you wish to be answered?'
		 *
		 * ...the travellers perceived that the floor was paved with stones of many
		 * hues; branching runes and strange devices intertwined beneath their feet.
		 */
		
		#include "EXTERN.h"
		#define PERL_IN_UTF8_C
		#include "perl.h"
		
		static const char unees[] =
		    "Malformed UTF-8 character (unexpected end of string)";
		
		/* 
		=head1 Unicode Support
		
		This file contains various utility functions for manipulating UTF8-encoded
		strings. For the uninitiated, this is a method of representing arbitrary
		Unicode characters as a variable number of bytes, in such a way that
		characters in the ASCII range are unmodified, and a zero byte never appears
		within non-zero characters.
		
		=for apidoc A|U8 *|uvuni_to_utf8_flags|U8 *d|UV uv|UV flags
		
		Adds the UTF-8 representation of the Unicode codepoint C<uv> to the end
		of the string C<d>; C<d> should be have at least C<UTF8_MAXBYTES+1> free
		bytes available. The return value is the pointer to the byte after the
		end of the new character. In other words,
		
		    d = uvuni_to_utf8_flags(d, uv, flags);
		
		or, in most cases,
		
		    d = uvuni_to_utf8(d, uv);
		
		(which is equivalent to)
		
		    d = uvuni_to_utf8_flags(d, uv, 0);
		
		is the recommended Unicode-aware way of saying
		
		    *(d++) = uv;
		
		=cut
		*/
		
		U8 *
		Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
     2768713    {
     2768713        if (ckWARN(WARN_UTF8)) {
      267915    	 if (UNICODE_IS_SURROGATE(uv) &&
			     !(flags & UNICODE_ALLOW_SURROGATE))
           6    	      Perl_warner(aTHX_ packWARN(WARN_UTF8), "UTF-16 surrogate 0x%04"UVxf, uv);
      267909    	 else if (
				  ((uv >= 0xFDD0 && uv <= 0xFDEF &&
				    !(flags & UNICODE_ALLOW_FDD0))
				   ||
				   ((uv & 0xFFFE) == 0xFFFE && /* Either FFFE or FFFF. */
				    !(flags & UNICODE_ALLOW_FFFF))) &&
				  /* UNICODE_ALLOW_SUPER includes
				   * FFFEs and FFFFs beyond 0x10FFFF. */
				  ((uv <= PERL_UNICODE_MAX) ||
				   !(flags & UNICODE_ALLOW_SUPER))
				  )
          12    	      Perl_warner(aTHX_ packWARN(WARN_UTF8),
					 "Unicode character 0x%04"UVxf" is illegal", uv);
		    }
     2768713        if (UNI_IS_INVARIANT(uv)) {
     1725406    	*d++ = (U8)UTF_TO_NATIVE(uv);
     1725406    	return d;
		    }
		#if defined(EBCDIC)
		    else {
			STRLEN len  = UNISKIP(uv);
			U8 *p = d+len-1;
			while (p > d) {
			    *p-- = (U8)UTF_TO_NATIVE((uv & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK);
			    uv >>= UTF_ACCUMULATION_SHIFT;
			}
			*p = (U8)UTF_TO_NATIVE((uv & UTF_START_MASK(len)) | UTF_START_MARK(len));
			return d+len;
		    }
		#else /* Non loop style */
     1043307        if (uv < 0x800) {
       74757    	*d++ = (U8)(( uv >>  6)         | 0xc0);
       74757    	*d++ = (U8)(( uv        & 0x3f) | 0x80);
       74757    	return d;
		    }
      968550        if (uv < 0x10000) {
      570861    	*d++ = (U8)(( uv >> 12)         | 0xe0);
      570861    	*d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
      570861    	*d++ = (U8)(( uv        & 0x3f) | 0x80);
      570861    	return d;
		    }
      397689        if (uv < 0x200000) {
      359295    	*d++ = (U8)(( uv >> 18)         | 0xf0);
      359295    	*d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
      359295    	*d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
      359295    	*d++ = (U8)(( uv        & 0x3f) | 0x80);
      359295    	return d;
		    }
       38394        if (uv < 0x4000000) {
           6    	*d++ = (U8)(( uv >> 24)         | 0xf8);
           6    	*d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
           6    	*d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
           6    	*d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
           6    	*d++ = (U8)(( uv        & 0x3f) | 0x80);
           6    	return d;
		    }
       38388        if (uv < 0x80000000) {
          28    	*d++ = (U8)(( uv >> 30)         | 0xfc);
          28    	*d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
          28    	*d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
          28    	*d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
          28    	*d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
          28    	*d++ = (U8)(( uv        & 0x3f) | 0x80);
          28    	return d;
		    }
		#ifdef HAS_QUAD
		    if (uv < UTF8_QUAD_MAX)
		#endif
		    {
       38360    	*d++ =                            0xfe;	/* Can't match U+FEFF! */
       38360    	*d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
       38360    	*d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
       38360    	*d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
       38360    	*d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
       38360    	*d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
       38360    	*d++ = (U8)(( uv        & 0x3f) | 0x80);
       38360    	return d;
		    }
		#ifdef HAS_QUAD
		    {
			*d++ =                            0xff;		/* Can't match U+FFFE! */
			*d++ =                            0x80;		/* 6 Reserved bits */
			*d++ = (U8)(((uv >> 60) & 0x0f) | 0x80);	/* 2 Reserved bits */
			*d++ = (U8)(((uv >> 54) & 0x3f) | 0x80);
			*d++ = (U8)(((uv >> 48) & 0x3f) | 0x80);
			*d++ = (U8)(((uv >> 42) & 0x3f) | 0x80);
			*d++ = (U8)(((uv >> 36) & 0x3f) | 0x80);
			*d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
			*d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
			*d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
			*d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
			*d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
			*d++ = (U8)(( uv        & 0x3f) | 0x80);
			return d;
		    }
		#endif
		#endif /* Loop style */
		}
		 
		U8 *
		Perl_uvuni_to_utf8(pTHX_ U8 *d, UV uv)
      251674    {
      251674        return Perl_uvuni_to_utf8_flags(aTHX_ d, uv, 0);
		}
		
		/*
		
		Tests if some arbitrary number of bytes begins in a valid UTF-8
		character.  Note that an INVARIANT (i.e. ASCII) character is a valid
		UTF-8 character.  The actual number of bytes in the UTF-8 character
		will be returned if it is valid, otherwise 0.
		
		This is the "slow" version as opposed to the "fast" version which is
		the "unrolled" IS_UTF8_CHAR().  E.g. for t/uni/class.t the speed
		difference is a factor of 2 to 3.  For lengths (UTF8SKIP(s)) of four
		or less you should use the IS_UTF8_CHAR(), for lengths of five or more
		you should use the _slow().  In practice this means that the _slow()
		will be used very rarely, since the maximum Unicode code point (as of
		Unicode 4.1) is U+10FFFF, which encodes in UTF-8 to four bytes.  Only
		the "Perl extended UTF-8" (the infamous 'v-strings') will encode into
		five bytes or more.
		
		=cut */
		STATIC STRLEN
		S_is_utf8_char_slow(pTHX_ const U8 *s, const STRLEN len)
           1    {
           1        U8 u = *s;
           1        STRLEN slen;
           1        UV uv, ouv;
		
           1        if (UTF8_IS_INVARIANT(u))
      ######    	return 1;
		
           1        if (!UTF8_IS_START(u))
           1    	return 0;
		
      ######        if (len < 2 || !UTF8_IS_CONTINUATION(s[1]))
      ######    	return 0;
		
      ######        slen = len - 1;
      ######        s++;
      ######        u &= UTF_START_MASK(len);
      ######        uv  = u;
      ######        ouv = uv;
      ######        while (slen--) {
      ######    	if (!UTF8_IS_CONTINUATION(*s))
      ######    	    return 0;
      ######    	uv = UTF8_ACCUMULATE(uv, *s);
      ######    	if (uv < ouv) 
      ######    	    return 0;
      ######    	ouv = uv;
      ######    	s++;
		    }
		
      ######        if ((STRLEN)UNISKIP(uv) < len)
      ######    	return 0;
		
      ######        return len;
		}
		
		/*
		=for apidoc A|STRLEN|is_utf8_char|const U8 *s
		
		Tests if some arbitrary number of bytes begins in a valid UTF-8
		character.  Note that an INVARIANT (i.e. ASCII) character is a valid
		UTF-8 character.  The actual number of bytes in the UTF-8 character
		will be returned if it is valid, otherwise 0.
		
		=cut */
		STRLEN
		Perl_is_utf8_char(pTHX_ const U8 *s)
         209    {
         209        STRLEN len = UTF8SKIP(s);
		#ifdef IS_UTF8_CHAR
         209        if (IS_UTF8_CHAR_FAST(len))
         208            return IS_UTF8_CHAR(s, len) ? len : 0;
		#endif /* #ifdef IS_UTF8_CHAR */
           1        return is_utf8_char_slow(s, len);
		}
		
		/*
		=for apidoc A|bool|is_utf8_string|const U8 *s|STRLEN len
		
		Returns true if first C<len> bytes of the given string form a valid
		UTF-8 string, false otherwise.  Note that 'a valid UTF-8 string' does
		not mean 'a string that contains code points above 0x7F encoded in UTF-8'
		because a valid ASCII string is a valid UTF-8 string.
		
		See also is_utf8_string_loclen() and is_utf8_string_loc().
		
		=cut
		*/
		
		bool
		Perl_is_utf8_string(pTHX_ const U8 *s, STRLEN len)
       77372    {
       77372        const U8* x = s;
       77372        const U8* send;
		
       77372        if (!len && s)
           7    	len = strlen((const char *)s);
       77372        send = s + len;
		
   366043500        while (x < send) {
   365966132    	STRLEN c;
			 /* Inline the easy bits of is_utf8_char() here for speed... */
   365966132    	 if (UTF8_IS_INVARIANT(*x))
      255178    	      c = 1;
   365710954    	 else if (!UTF8_IS_START(*x))
   365710950    	     goto out;
			 else {
			      /* ... and call is_utf8_char() only if really needed. */
		#ifdef IS_UTF8_CHAR
   365710950    	     c = UTF8SKIP(x);
   365710950    	     if (IS_UTF8_CHAR_FAST(c)) {
   365710950    	         if (!IS_UTF8_CHAR(x, c))
      ######    		     goto out;
      ######    	     } else if (!is_utf8_char_slow(x, c))
      ######    	         goto out;
		#else
			     c = is_utf8_char(x);
		#endif /* #ifdef IS_UTF8_CHAR */
   365710950    	      if (!c)
      ######    		  goto out;
			 }
   365966128            x += c;
		    }
		
		 out:
       77372        if (x != send)
           8    	return FALSE;
		
       77364        return TRUE;
		}
		
		/*
		=for apidoc A|bool|is_utf8_string_loclen|const U8 *s|STRLEN len|const U8 **ep|const STRLEN *el
		
		Like is_ut8_string() but stores the location of the failure (in the
		case of "utf8ness failure") or the location s+len (in the case of
		"utf8ness success") in the C<ep>, and the number of UTF-8
		encoded characters in the C<el>.
		
		See also is_utf8_string_loc() and is_utf8_string().
		
		=cut
		*/
		
		bool
		Perl_is_utf8_string_loclen(pTHX_ const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
         433    {
         433        const U8* x = s;
         433        const U8* send;
         433        STRLEN c;
		
         433        if (!len && s)
      ######            len = strlen((const char *)s);
         433        send = s + len;
         433        if (el)
           7            *el = 0;
		
       11479        while (x < send) {
			 /* Inline the easy bits of is_utf8_char() here for speed... */
       11048    	 if (UTF8_IS_INVARIANT(*x))
        8948    	     c = 1;
        2100    	 else if (!UTF8_IS_START(*x))
        2100    	     goto out;
			 else {
			     /* ... and call is_utf8_char() only if really needed. */
		#ifdef IS_UTF8_CHAR
        2100    	     c = UTF8SKIP(x);
        2100    	     if (IS_UTF8_CHAR_FAST(c)) {
        2100    	         if (!IS_UTF8_CHAR(x, c))
           2    		     c = 0;
			     } else
      ######    	         c = is_utf8_char_slow(x, c);
		#else
			     c = is_utf8_char(x);
		#endif /* #ifdef IS_UTF8_CHAR */
        2100    	     if (!c)
           2    	         goto out;
			 }
       11046             x += c;
       11046    	 if (el)
          32    	     (*el)++;
		    }
		
		 out:
         433        if (ep)
         433            *ep = x;
         433        if (x != send)
           2    	return FALSE;
		
         431        return TRUE;
		}
		
		/*
		=for apidoc A|bool|is_utf8_string_loc|const U8 *s|STRLEN len|const U8 **ep|const STRLEN *el
		
		Like is_ut8_string() but stores the location of the failure (in the
		case of "utf8ness failure") or the location s+len (in the case of
		"utf8ness success") in the C<ep>.
		
		See also is_utf8_string_loclen() and is_utf8_string().
		
		=cut
		*/
		
		bool
		Perl_is_utf8_string_loc(pTHX_ const U8 *s, STRLEN len, const U8 **ep)
         426    {
         426        return is_utf8_string_loclen(s, len, ep, 0);
		}
		
		/*
		=for apidoc A|UV|utf8n_to_uvuni|const U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
		
		Bottom level UTF-8 decode routine.
		Returns the unicode code point value of the first character in the string C<s>
		which is assumed to be in UTF-8 encoding and no longer than C<curlen>;
		C<retlen> will be set to the length, in bytes, of that character.
		
		If C<s> does not point to a well-formed UTF-8 character, the behaviour
		is dependent on the value of C<flags>: if it contains UTF8_CHECK_ONLY,
		it is assumed that the caller will raise a warning, and this function
		will silently just set C<retlen> to C<-1> and return zero.  If the
		C<flags> does not contain UTF8_CHECK_ONLY, warnings about
		malformations will be given, C<retlen> will be set to the expected
		length of the UTF-8 character in bytes, and zero will be returned.
		
		The C<flags> can also contain various flags to allow deviations from
		the strict UTF-8 encoding (see F<utf8.h>).
		
		Most code should use utf8_to_uvchr() rather than call this directly.
		
		=cut
		*/
		
		UV
		Perl_utf8n_to_uvuni(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
     4967331    {
     4967331        const U8 *s0 = s;
     4967331        UV uv = *s, ouv = 0;
     4967331        STRLEN len = 1;
     4967331        const bool dowarn = ckWARN_d(WARN_UTF8);
     4967331        const UV startbyte = *s;
     4967331        STRLEN expectlen = 0;
     4967331        U32 warning = 0;
		
		/* This list is a superset of the UTF8_ALLOW_XXX. */
		
		#define UTF8_WARN_EMPTY				 1
		#define UTF8_WARN_CONTINUATION			 2
		#define UTF8_WARN_NON_CONTINUATION	 	 3
		#define UTF8_WARN_FE_FF				 4
		#define UTF8_WARN_SHORT				 5
		#define UTF8_WARN_OVERFLOW			 6
		#define UTF8_WARN_SURROGATE			 7
		#define UTF8_WARN_LONG				 8
		#define UTF8_WARN_FFFF				 9 /* Also FFFE. */
		
     4967331        if (curlen == 0 &&
			!(flags & UTF8_ALLOW_EMPTY)) {
           1    	warning = UTF8_WARN_EMPTY;
           1    	goto malformed;
		    }
		
     4967330        if (UTF8_IS_INVARIANT(uv)) {
     1986226    	if (retlen)
     1837970    	    *retlen = 1;
     1986226    	return (UV) (NATIVE_TO_UTF(*s));
		    }
		
     2981104        if (UTF8_IS_CONTINUATION(uv) &&
			!(flags & UTF8_ALLOW_CONTINUATION)) {
           1    	warning = UTF8_WARN_CONTINUATION;
           1    	goto malformed;
		    }
		
     2981103        if (UTF8_IS_START(uv) && curlen > 1 && !UTF8_IS_CONTINUATION(s[1]) &&
			!(flags & UTF8_ALLOW_NON_CONTINUATION)) {
         304    	warning = UTF8_WARN_NON_CONTINUATION;
         304    	goto malformed;
		    }
		
		#ifdef EBCDIC
		    uv = NATIVE_TO_UTF(uv);
		#else
     2980799        if ((uv == 0xfe || uv == 0xff) &&
			!(flags & UTF8_ALLOW_FE_FF)) {
           1    	warning = UTF8_WARN_FE_FF;
           1    	goto malformed;
		    }
		#endif
		
     2980798        if      (!(uv & 0x20))	{ len =  2; uv &= 0x1f; }
     2866240        else if (!(uv & 0x10))	{ len =  3; uv &= 0x0f; }
     1471583        else if (!(uv & 0x08))	{ len =  4; uv &= 0x07; }
       34040        else if (!(uv & 0x04))	{ len =  5; uv &= 0x03; }
		#ifdef EBCDIC
		    else if (!(uv & 0x02))	{ len =  6; uv &= 0x01; }
		    else			{ len =  7; uv &= 0x01; }
		#else
       34034        else if (!(uv & 0x02))	{ len =  6; uv &= 0x01; }
       34005        else if (!(uv & 0x01))	{ len =  7; uv = 0; }
      ######        else			{ len = 13; uv = 0; } /* whoa! */
		#endif
		
     2980798        if (retlen)
     2889946    	*retlen = len;
		
     2980798        expectlen = len;
		
     2980798        if ((curlen < expectlen) &&
			!(flags & UTF8_ALLOW_SHORT)) {
           8    	warning = UTF8_WARN_SHORT;
           8    	goto malformed;
		    }
		
     2980790        len--;
     2980790        s++;
     2980790        ouv = uv;
		
    10401468        while (len--) {
     7420680    	if (!UTF8_IS_CONTINUATION(*s) &&
			    !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
      ######    	    s--;
      ######    	    warning = UTF8_WARN_NON_CONTINUATION;
      ######    	    goto malformed;
			}
			else
     7420680    	    uv = UTF8_ACCUMULATE(uv, *s);
     7420680    	if (!(uv > ouv)) {
			    /* These cannot be allowed. */
           2    	    if (uv == ouv) {
           2    		if (expectlen != 13 && !(flags & UTF8_ALLOW_LONG)) {
           2    		    warning = UTF8_WARN_LONG;
           2    		    goto malformed;
				}
			    }
			    else { /* uv < ouv */
				/* This cannot be allowed. */
      ######    		warning = UTF8_WARN_OVERFLOW;
      ######    		goto malformed;
			    }
			}
     7420678    	s++;
     7420678    	ouv = uv;
		    }
		
     2980788        if (UNICODE_IS_SURROGATE(uv) &&
			!(flags & UTF8_ALLOW_SURROGATE)) {
           7    	warning = UTF8_WARN_SURROGATE;
           7    	goto malformed;
     2980781        } else if ((expectlen > (STRLEN)UNISKIP(uv)) &&
			       !(flags & UTF8_ALLOW_LONG)) {
      ######    	warning = UTF8_WARN_LONG;
      ######    	goto malformed;
     2980781        } else if (UNICODE_IS_ILLEGAL(uv) &&
			       !(flags & UTF8_ALLOW_FFFF)) {
           1    	warning = UTF8_WARN_FFFF;
           1    	goto malformed;
		    }
		
     2980780        return uv;
		
		malformed:
		
         325        if (flags & UTF8_CHECK_ONLY) {
         312    	if (retlen)
         312    	    *retlen = -1;
         312    	return 0;
		    }
		
          13        if (dowarn) {
           5    	SV* sv = sv_2mortal(newSVpv("Malformed UTF-8 character ", 0));
		
           5    	switch (warning) {
      ######    	case 0: /* Intentionally empty. */ break;
			case UTF8_WARN_EMPTY:
      ######    	    Perl_sv_catpv(aTHX_ sv, "(empty string)");
      ######    	    break;
			case UTF8_WARN_CONTINUATION:
           1    	    Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf", with no preceding start byte)", uv);
           1    	    break;
			case UTF8_WARN_NON_CONTINUATION:
           2    	    if (s == s0)
           2    	        Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", immediately after start byte 0x%02"UVxf")",
		                           (UV)s[1], startbyte);
			    else
      ######    	        Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", %d byte%s after start byte 0x%02"UVxf", expected %d bytes)",
		                           (UV)s[1], s - s0, s - s0 > 1 ? "s" : "", startbyte, (int)expectlen);
			      
      ######    	    break;
			case UTF8_WARN_FE_FF:
           1    	    Perl_sv_catpvf(aTHX_ sv, "(byte 0x%02"UVxf")", uv);
           1    	    break;
			case UTF8_WARN_SHORT:
           1    	    Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
		                           (int)curlen, curlen == 1 ? "" : "s", (int)expectlen, startbyte);
           1    	    expectlen = curlen;		/* distance for caller to skip */
           1    	    break;
			case UTF8_WARN_OVERFLOW:
      ######    	    Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x, after start byte 0x%02"UVxf")",
		                           ouv, *s, startbyte);
      ######    	    break;
			case UTF8_WARN_SURROGATE:
      ######    	    Perl_sv_catpvf(aTHX_ sv, "(UTF-16 surrogate 0x%04"UVxf")", uv);
      ######    	    break;
			case UTF8_WARN_LONG:
      ######    	    Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
					   (int)expectlen, expectlen == 1 ? "": "s", UNISKIP(uv), startbyte);
      ######    	    break;
			case UTF8_WARN_FFFF:
      ######    	    Perl_sv_catpvf(aTHX_ sv, "(character 0x%04"UVxf")", uv);
      ######    	    break;
			default:
      ######    	    Perl_sv_catpv(aTHX_ sv, "(unknown reason)");
           5    	    break;
			}
			
           5    	if (warning) {
           5    	    const char *s = SvPVX_const(sv);
		
           5    	    if (PL_op)
           2    		Perl_warner(aTHX_ packWARN(WARN_UTF8),
					    "%s in %s", s,  OP_DESC(PL_op));
			    else
           3    		Perl_warner(aTHX_ packWARN(WARN_UTF8), "%s", s);
			}
		    }
		
          13        if (retlen)
          13    	*retlen = expectlen ? expectlen : len;
		
          13        return 0;
		}
		
		/*
		=for apidoc A|UV|utf8_to_uvchr|const U8 *s|STRLEN *retlen
		
		Returns the native character value of the first character in the string C<s>
		which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
		length, in bytes, of that character.
		
		If C<s> does not point to a well-formed UTF-8 character, zero is
		returned and retlen is set, if possible, to -1.
		
		=cut
		*/
		
		UV
		Perl_utf8_to_uvchr(pTHX_ const U8 *s, STRLEN *retlen)
      449536    {
      449536        return Perl_utf8n_to_uvchr(aTHX_ s, UTF8_MAXBYTES, retlen,
					       ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
		}
		
		/*
		=for apidoc A|UV|utf8_to_uvuni|const U8 *s|STRLEN *retlen
		
		Returns the Unicode code point of the first character in the string C<s>
		which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
		length, in bytes, of that character.
		
		This function should only be used when returned UV is considered
		an index into the Unicode semantic tables (e.g. swashes).
		
		If C<s> does not point to a well-formed UTF-8 character, zero is
		returned and retlen is set, if possible, to -1.
		
		=cut
		*/
		
		UV
		Perl_utf8_to_uvuni(pTHX_ const U8 *s, STRLEN *retlen)
           2    {
		    /* Call the low level routine asking for checks */
           2        return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXBYTES, retlen,
					       ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
		}
		
		/*
		=for apidoc A|STRLEN|utf8_length|const U8 *s|const U8 *e
		
		Return the length of the UTF-8 char encoded string C<s> in characters.
		Stops at C<e> (inclusive).  If C<e E<lt> s> or if the scan would end
		up past C<e>, croaks.
		
		=cut
		*/
		
		STRLEN
		Perl_utf8_length(pTHX_ const U8 *s, const U8 *e)
      173750    {
      173750        STRLEN len = 0;
		
		    /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
		     * the bitops (especially ~) can create illegal UTF-8.
		     * In other words: in Perl UTF-8 is not just for Unicode. */
		
      173750        if (e < s)
      ######    	goto warn_and_return;
   253198496        while (s < e) {
   253024747    	const U8 t = UTF8SKIP(s);
   253024747    	if (e - s < t) {
			    warn_and_return:
           1    	    if (ckWARN_d(WARN_UTF8)) {
           1    	        if (PL_op)
           1    		    Perl_warner(aTHX_ packWARN(WARN_UTF8),
					    "%s in %s", unees, OP_DESC(PL_op));
				else
      ######    		    Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
			    }
           1    	    return len;
			}
   253024746    	s += t;
   253024746    	len++;
		    }
		
      173749        return len;
		}
		
		/*
		=for apidoc A|IV|utf8_distance|const U8 *a|const U8 *b
		
		Returns the number of UTF-8 characters between the UTF-8 pointers C<a>
		and C<b>.
		
		WARNING: use only if you *know* that the pointers point inside the
		same UTF-8 buffer.
		
		=cut
		*/
		
		IV
		Perl_utf8_distance(pTHX_ const U8 *a, const U8 *b)
           7    {
           7        IV off = 0;
		
		    /* Note: cannot use UTF8_IS_...() too eagerly here since  e.g.
		     * the bitops (especially ~) can create illegal UTF-8.
		     * In other words: in Perl UTF-8 is not just for Unicode. */
		
           7        if (a < b) {
      ######    	while (a < b) {
      ######    	    const U8 c = UTF8SKIP(a);
      ######    	    if (b - a < c)
      ######    		goto warn_and_return;
      ######    	    a += c;
      ######    	    off--;
			}
		    }
		    else {
          42    	while (b < a) {
          35    	    const U8 c = UTF8SKIP(b);
		
          35    	    if (a - b < c) {
				warn_and_return:
      ######    	        if (ckWARN_d(WARN_UTF8)) {
      ######    		    if (PL_op)
      ######    		        Perl_warner(aTHX_ packWARN(WARN_UTF8),
						    "%s in %s", unees, OP_DESC(PL_op));
				    else
      ######    		        Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
				}
      ######    		return off;
			    }
          35    	    b += c;
          35    	    off++;
			}
		    }
		
           7        return off;
		}
		
		/*
		=for apidoc A|U8 *|utf8_hop|U8 *s|I32 off
		
		Return the UTF-8 pointer C<s> displaced by C<off> characters, either
		forward or backward.
		
		WARNING: do not use the following unless you *know* C<off> is within
		the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned
		on the first byte of character or just after the last byte of a character.
		
		=cut
		*/
		
		U8 *
		Perl_utf8_hop(pTHX_ const U8 *s, I32 off)
         961    {
		    /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
		     * the bitops (especially ~) can create illegal UTF-8.
		     * In other words: in Perl UTF-8 is not just for Unicode. */
		
         961        if (off >= 0) {
        8375    	while (off--)
        7436    	    s += UTF8SKIP(s);
		    }
		    else {
          44    	while (off++) {
          22    	    s--;
          44    	    while (UTF8_IS_CONTINUATION(*s))
          22    		s--;
			}
		    }
         961        return (U8 *)s;
		}
		
		/*
		=for apidoc A|U8 *|utf8_to_bytes|U8 *s|STRLEN *len
		
		Converts a string C<s> of length C<len> from UTF-8 into byte encoding.
		Unlike C<bytes_to_utf8>, this over-writes the original string, and
		updates len to contain the new length.
		Returns zero on failure, setting C<len> to -1.
		
		=cut
		*/
		
		U8 *
		Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len)
        3052    {
        3052        U8 *send;
        3052        U8 *d;
        3052        U8 *save = s;
		
		    /* ensure valid UTF-8 and chars < 256 before updating string */
        6064        for (send = s + *len; s < send; ) {
      224855            U8 c = *s++;
		
      224855            if (!UTF8_IS_INVARIANT(c) &&
		            (!UTF8_IS_DOWNGRADEABLE_START(c) || (s >= send)
			     || !(c = *s++) || !UTF8_IS_CONTINUATION(c))) {
          40                *len = -1;
          40                return 0;
		        }
		    }
		
        3012        d = s = save;
      227810        while (s < send) {
      224798            STRLEN ulen;
      224798            *d++ = (U8)utf8_to_uvchr(s, &ulen);
      224798            s += ulen;
		    }
        3012        *d = '\0';
        3012        *len = d - save;
        3012        return save;
		}
		
		/*
		=for apidoc A|U8 *|bytes_from_utf8|const U8 *s|STRLEN *len|bool *is_utf8
		
		Converts a string C<s> of length C<len> from UTF-8 into byte encoding.
		Unlike C<utf8_to_bytes> but like C<bytes_to_utf8>, returns a pointer to
		the newly-created string, and updates C<len> to contain the new
		length.  Returns the original string if no conversion occurs, C<len>
		is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
		0 if C<s> is converted or contains all 7bit characters.
		
		=cut
		*/
		
		U8 *
		Perl_bytes_from_utf8(pTHX_ const U8 *s, STRLEN *len, bool *is_utf8)
        7713    {
        7713        U8 *d;
        7713        const U8 *start = s;
        7713        const U8 *send;
        7713        I32 count = 0;
		
        7713        if (!*is_utf8)
      ######            return (U8 *)start;
		
		    /* ensure valid UTF-8 and chars < 256 before converting string */
       10212        for (send = s + *len; s < send;) {
     1577763            U8 c = *s++;
     1577763    	if (!UTF8_IS_INVARIANT(c)) {
        8641    	    if (UTF8_IS_DOWNGRADEABLE_START(c) && s < send &&
		                (c = *s++) && UTF8_IS_CONTINUATION(c))
        3427    		count++;
			    else
        5214                    return (U8 *)start;
			}
		    }
		
        2499        *is_utf8 = 0;		
		
        2499        Newz(801, d, (*len) - count + 1, U8);
        2499        s = start; start = d;
     1574972        while (s < send) {
     1572473    	U8 c = *s++;
     1572473    	if (!UTF8_IS_INVARIANT(c)) {
			    /* Then it is two-byte encoded */
        3382    	    c = UTF8_ACCUMULATE(NATIVE_TO_UTF(c), *s++);
     1572473    	    c = ASCII_TO_NATIVE(c);
			}
     1572473    	*d++ = c;
		    }
        2499        *d = '\0';
        2499        *len = d - start;
        2499        return (U8 *)start;
		}
		
		/*
		=for apidoc A|U8 *|bytes_to_utf8|const U8 *s|STRLEN *len
		
		Converts a string C<s> of length C<len> from ASCII into UTF-8 encoding.
		Returns a pointer to the newly-created string, and sets C<len> to
		reflect the new length.
		
		If you want to convert to UTF-8 from other encodings than ASCII,
		see sv_recode_to_utf8().
		
		=cut
		*/
		
		U8*
		Perl_bytes_to_utf8(pTHX_ const U8 *s, STRLEN *len)
       12646    {
       12646        const U8 * const send = s + (*len);
       12646        U8 *d;
       12646        U8 *dst;
		
       12646        Newz(801, d, (*len) * 2 + 1, U8);
       12646        dst = d;
		
       43211        while (s < send) {
       30565            const UV uv = NATIVE_TO_ASCII(*s++);
       30565            if (UNI_IS_INVARIANT(uv))
       13751                *d++ = (U8)UTF_TO_NATIVE(uv);
		        else {
       16814                *d++ = (U8)UTF8_EIGHT_BIT_HI(uv);
       16814                *d++ = (U8)UTF8_EIGHT_BIT_LO(uv);
		        }
		    }
       12646        *d = '\0';
       12646        *len = d-dst;
       12646        return dst;
		}
		
		/*
		 * Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
		 *
		 * Destination must be pre-extended to 3/2 source.  Do not use in-place.
		 * We optimize for native, for obvious reasons. */
		
		U8*
		Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
          21    {
          21        U8* pend;
          21        U8* dstart = d;
		
          21        if (bytelen == 1 && p[0] == 0) { /* Be understanding. */
           7    	 d[0] = 0;
           7    	 *newlen = 1;
           7    	 return d;
		    }
		
          14        if (bytelen & 1)
      ######    	Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen %"UVf, (UV)bytelen);
		
          14        pend = p + bytelen;
		
         107        while (p < pend) {
          93    	UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
          93    	p += 2;
          93    	if (uv < 0x80) {
          93    	    *d++ = (U8)uv;
          93    	    continue;
			}
      ######    	if (uv < 0x800) {
      ######    	    *d++ = (U8)(( uv >>  6)         | 0xc0);
      ######    	    *d++ = (U8)(( uv        & 0x3f) | 0x80);
      ######    	    continue;
			}
      ######    	if (uv >= 0xd800 && uv < 0xdbff) {	/* surrogates */
      ######    	    UV low = (p[0] << 8) + p[1];
      ######    	    p += 2;
      ######    	    if (low < 0xdc00 || low >= 0xdfff)
      ######    		Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
      ######    	    uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000;
			}
      ######    	if (uv < 0x10000) {
      ######    	    *d++ = (U8)(( uv >> 12)         | 0xe0);
      ######    	    *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
      ######    	    *d++ = (U8)(( uv        & 0x3f) | 0x80);
      ######    	    continue;
			}
			else {
      ######    	    *d++ = (U8)(( uv >> 18)         | 0xf0);
      ######    	    *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
      ######    	    *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
      ######    	    *d++ = (U8)(( uv        & 0x3f) | 0x80);
      ######    	    continue;
			}
		    }
          14        *newlen = d - dstart;
          14        return d;
		}
		
		/* Note: this one is slightly destructive of the source. */
		
		U8*
		Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
           7    {
           7        U8* s = (U8*)p;
           7        U8* send = s + bytelen;
          50        while (s < send) {
          43    	U8 tmp = s[0];
          43    	s[0] = s[1];
          43    	s[1] = tmp;
          43    	s += 2;
		    }
           7        return utf16_to_utf8(p, d, bytelen, newlen);
		}
		
		/* for now these are all defined (inefficiently) in terms of the utf8 versions */
		
		bool
		Perl_is_uni_alnum(pTHX_ UV c)
          25    {
          25        U8 tmpbuf[UTF8_MAXBYTES+1];
          25        uvchr_to_utf8(tmpbuf, c);
          25        return is_utf8_alnum(tmpbuf);
		}
		
		bool
		Perl_is_uni_alnumc(pTHX_ UV c)
      ######    {
      ######        U8 tmpbuf[UTF8_MAXBYTES+1];
      ######        uvchr_to_utf8(tmpbuf, c);
      ######        return is_utf8_alnumc(tmpbuf);
		}
		
		bool
		Perl_is_uni_idfirst(pTHX_ UV c)
      ######    {
      ######        U8 tmpbuf[UTF8_MAXBYTES+1];
      ######        uvchr_to_utf8(tmpbuf, c);
      ######        return is_utf8_idfirst(tmpbuf);
		}
		
		bool
		Perl_is_uni_alpha(pTHX_ UV c)
      ######    {
      ######        U8 tmpbuf[UTF8_MAXBYTES+1];
      ######        uvchr_to_utf8(tmpbuf, c);
      ######        return is_utf8_alpha(tmpbuf);
		}
		
		bool
		Perl_is_uni_ascii(pTHX_ UV c)
      ######    {
      ######        U8 tmpbuf[UTF8_MAXBYTES+1];
      ######        uvchr_to_utf8(tmpbuf, c);
      ######        return is_utf8_ascii(tmpbuf);
		}
		
		bool
		Perl_is_uni_space(pTHX_ UV c)
      ######    {
      ######        U8 tmpbuf[UTF8_MAXBYTES+1];
      ######        uvchr_to_utf8(tmpbuf, c);
      ######        return is_utf8_space(tmpbuf);
		}
		
		bool
		Perl_is_uni_digit(pTHX_ UV c)
      ######    {
      ######        U8 tmpbuf[UTF8_MAXBYTES+1];
      ######        uvchr_to_utf8(tmpbuf, c);
      ######        return is_utf8_digit(tmpbuf);
		}
		
		bool
		Perl_is_uni_upper(pTHX_ UV c)
      ######    {
      ######        U8 tmpbuf[UTF8_MAXBYTES+1];
      ######        uvchr_to_utf8(tmpbuf, c);
      ######        return is_utf8_upper(tmpbuf);
		}
		
		bool
		Perl_is_uni_lower(pTHX_ UV c)
      ######    {
      ######        U8 tmpbuf[UTF8_MAXBYTES+1];
      ######        uvchr_to_utf8(tmpbuf, c);
      ######        return is_utf8_lower(tmpbuf);
		}
		
		bool
		Perl_is_uni_cntrl(pTHX_ UV c)
      ######    {
      ######        U8 tmpbuf[UTF8_MAXBYTES+1];
      ######        uvchr_to_utf8(tmpbuf, c);
      ######        return is_utf8_cntrl(tmpbuf);
		}
		
		bool
		Perl_is_uni_graph(pTHX_ UV c)
      ######    {
      ######        U8 tmpbuf[UTF8_MAXBYTES+1];
      ######        uvchr_to_utf8(tmpbuf, c);
      ######        return is_utf8_graph(tmpbuf);
		}
		
		bool
		Perl_is_uni_print(pTHX_ UV c)
      ######    {
      ######        U8 tmpbuf[UTF8_MAXBYTES+1];
      ######        uvchr_to_utf8(tmpbuf, c);
      ######        return is_utf8_print(tmpbuf);
		}
		
		bool
		Perl_is_uni_punct(pTHX_ UV c)
      ######    {
      ######        U8 tmpbuf[UTF8_MAXBYTES+1];
      ######        uvchr_to_utf8(tmpbuf, c);
      ######        return is_utf8_punct(tmpbuf);
		}
		
		bool
		Perl_is_uni_xdigit(pTHX_ UV c)
      ######    {
      ######        U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
      ######        uvchr_to_utf8(tmpbuf, c);
      ######        return is_utf8_xdigit(tmpbuf);
		}
		
		UV
		Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
      ######    {
      ######        uvchr_to_utf8(p, c);
      ######        return to_utf8_upper(p, p, lenp);
		}
		
		UV
		Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
      ######    {
      ######        uvchr_to_utf8(p, c);
      ######        return to_utf8_title(p, p, lenp);
		}
		
		UV
		Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
      ######    {
      ######        uvchr_to_utf8(p, c);
      ######        return to_utf8_lower(p, p, lenp);
		}
		
		UV
		Perl_to_uni_fold(pTHX_ UV c, U8* p, STRLEN *lenp)
       33212    {
       33212        uvchr_to_utf8(p, c);
       33212        return to_utf8_fold(p, p, lenp);
		}
		
		/* for now these all assume no locale info available for Unicode > 255 */
		
		bool
		Perl_is_uni_alnum_lc(pTHX_ UV c)
      ######    {
      ######        return is_uni_alnum(c);	/* XXX no locale support yet */
		}
		
		bool
		Perl_is_uni_alnumc_lc(pTHX_ UV c)
      ######    {
      ######        return is_uni_alnumc(c);	/* XXX no locale support yet */
		}
		
		bool
		Perl_is_uni_idfirst_lc(pTHX_ UV c)
      ######    {
      ######        return is_uni_idfirst(c);	/* XXX no locale support yet */
		}
		
		bool
		Perl_is_uni_alpha_lc(pTHX_ UV c)
      ######    {
      ######        return is_uni_alpha(c);	/* XXX no locale support yet */
		}
		
		bool
		Perl_is_uni_ascii_lc(pTHX_ UV c)
      ######    {
      ######        return is_uni_ascii(c);	/* XXX no locale support yet */
		}
		
		bool
		Perl_is_uni_space_lc(pTHX_ UV c)
      ######    {
      ######        return is_uni_space(c);	/* XXX no locale support yet */
		}
		
		bool
		Perl_is_uni_digit_lc(pTHX_ UV c)
      ######    {
      ######        return is_uni_digit(c);	/* XXX no locale support yet */
		}
		
		bool
		Perl_is_uni_upper_lc(pTHX_ UV c)
      ######    {
      ######        return is_uni_upper(c);	/* XXX no locale support yet */
		}
		
		bool
		Perl_is_uni_lower_lc(pTHX_ UV c)
      ######    {
      ######        return is_uni_lower(c);	/* XXX no locale support yet */
		}
		
		bool
		Perl_is_uni_cntrl_lc(pTHX_ UV c)
      ######    {
      ######        return is_uni_cntrl(c);	/* XXX no locale support yet */
		}
		
		bool
		Perl_is_uni_graph_lc(pTHX_ UV c)
      ######    {
      ######        return is_uni_graph(c);	/* XXX no locale support yet */
		}
		
		bool
		Perl_is_uni_print_lc(pTHX_ UV c)
      ######    {
      ######        return is_uni_print(c);	/* XXX no locale support yet */
		}
		
		bool
		Perl_is_uni_punct_lc(pTHX_ UV c)
      ######    {
      ######        return is_uni_punct(c);	/* XXX no locale support yet */
		}
		
		bool
		Perl_is_uni_xdigit_lc(pTHX_ UV c)
      ######    {
      ######        return is_uni_xdigit(c);	/* XXX no locale support yet */
		}
		
		U32
		Perl_to_uni_upper_lc(pTHX_ U32 c)
      ######    {
		    /* XXX returns only the first character -- do not use XXX */
		    /* XXX no locale support yet */
      ######        STRLEN len;
      ######        U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
      ######        return (U32)to_uni_upper(c, tmpbuf, &len);
		}
		
		U32
		Perl_to_uni_title_lc(pTHX_ U32 c)
      ######    {
		    /* XXX returns only the first character XXX -- do not use XXX */
		    /* XXX no locale support yet */
      ######        STRLEN len;
      ######        U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
      ######        return (U32)to_uni_title(c, tmpbuf, &len);
		}
		
		U32
		Perl_to_uni_lower_lc(pTHX_ U32 c)
      ######    {
		    /* XXX returns only the first character -- do not use XXX */
		    /* XXX no locale support yet */
      ######        STRLEN len;
      ######        U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
      ######        return (U32)to_uni_lower(c, tmpbuf, &len);
		}
		
		bool
		Perl_is_utf8_alnum(pTHX_ const U8 *p)
          96    {
          96        if (!is_utf8_char(p))
      ######    	return FALSE;
          96        if (!PL_utf8_alnum)
			/* NOTE: "IsWord", not "IsAlnum", since Alnum is a true
			 * descendant of isalnum(3), in other words, it doesn't
			 * contain the '_'. --jhi */
          21    	PL_utf8_alnum = swash_init("utf8", "IsWord", &PL_sv_undef, 0, 0);
          96        return swash_fetch(PL_utf8_alnum, p, TRUE) != 0;
		/*    return *p == '_' || is_utf8_alpha(p) || is_utf8_digit(p); */
		#ifdef SURPRISINGLY_SLOWER  /* probably because alpha is usually true */
		    if (!PL_utf8_alnum)
			PL_utf8_alnum = swash_init("utf8", "",
			    sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
		    return swash_fetch(PL_utf8_alnum, p, TRUE) != 0;
		#endif
		}
		
		bool
		Perl_is_utf8_alnumc(pTHX_ const U8 *p)
      ######    {
      ######        if (!is_utf8_char(p))
      ######    	return FALSE;
      ######        if (!PL_utf8_alnum)
      ######    	PL_utf8_alnum = swash_init("utf8", "IsAlnumC", &PL_sv_undef, 0, 0);
      ######        return swash_fetch(PL_utf8_alnum, p, TRUE) != 0;
		/*    return is_utf8_alpha(p) || is_utf8_digit(p); */
		#ifdef SURPRISINGLY_SLOWER  /* probably because alpha is usually true */
		    if (!PL_utf8_alnum)
			PL_utf8_alnum = swash_init("utf8", "",
			    sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
		    return swash_fetch(PL_utf8_alnum, p, TRUE) != 0;
		#endif
		}
		
		bool
		Perl_is_utf8_idfirst(pTHX_ const U8 *p) /* The naming is historical. */
      ######    {
      ######        if (*p == '_')
      ######    	return TRUE;
      ######        if (!is_utf8_char(p))
      ######    	return FALSE;
      ######        if (!PL_utf8_idstart) /* is_utf8_idstart would be more logical. */
      ######    	PL_utf8_idstart = swash_init("utf8", "IdStart", &PL_sv_undef, 0, 0);
      ######        return swash_fetch(PL_utf8_idstart, p, TRUE) != 0;
		}
		
		bool
		Perl_is_utf8_idcont(pTHX_ const U8 *p)
          32    {
          32        if (*p == '_')
      ######    	return TRUE;
          32        if (!is_utf8_char(p))
           1    	return FALSE;
          31        if (!PL_utf8_idcont)
           5    	PL_utf8_idcont = swash_init("utf8", "IdContinue", &PL_sv_undef, 0, 0);
          31        return swash_fetch(PL_utf8_idcont, p, TRUE) != 0;
		}
		
		bool
		Perl_is_utf8_alpha(pTHX_ const U8 *p)
      ######    {
      ######        if (!is_utf8_char(p))
      ######    	return FALSE;
      ######        if (!PL_utf8_alpha)
      ######    	PL_utf8_alpha = swash_init("utf8", "IsAlpha", &PL_sv_undef, 0, 0);
      ######        return swash_fetch(PL_utf8_alpha, p, TRUE) != 0;
		}
		
		bool
		Perl_is_utf8_ascii(pTHX_ const U8 *p)
      ######    {
      ######        if (!is_utf8_char(p))
      ######    	return FALSE;
      ######        if (!PL_utf8_ascii)
      ######    	PL_utf8_ascii = swash_init("utf8", "IsAscii", &PL_sv_undef, 0, 0);
      ######        return swash_fetch(PL_utf8_ascii, p, TRUE) != 0;
		}
		
		bool
		Perl_is_utf8_space(pTHX_ const U8 *p)
          23    {
          23        if (!is_utf8_char(p))
      ######    	return FALSE;
          23        if (!PL_utf8_space)
           7    	PL_utf8_space = swash_init("utf8", "IsSpacePerl", &PL_sv_undef, 0, 0);
          23        return swash_fetch(PL_utf8_space, p, TRUE) != 0;
		}
		
		bool
		Perl_is_utf8_digit(pTHX_ const U8 *p)
          26    {
          26        if (!is_utf8_char(p))
      ######    	return FALSE;
          26        if (!PL_utf8_digit)
           6    	PL_utf8_digit = swash_init("utf8", "IsDigit", &PL_sv_undef, 0, 0);
          26        return swash_fetch(PL_utf8_digit, p, TRUE) != 0;
		}
		
		bool
		Perl_is_utf8_upper(pTHX_ const U8 *p)
      ######    {
      ######        if (!is_utf8_char(p))
      ######    	return FALSE;
      ######        if (!PL_utf8_upper)
      ######    	PL_utf8_upper = swash_init("utf8", "IsUppercase", &PL_sv_undef, 0, 0);
      ######        return swash_fetch(PL_utf8_upper, p, TRUE) != 0;
		}
		
		bool
		Perl_is_utf8_lower(pTHX_ const U8 *p)
      ######    {
      ######        if (!is_utf8_char(p))
      ######    	return FALSE;
      ######        if (!PL_utf8_lower)
      ######    	PL_utf8_lower = swash_init("utf8", "IsLowercase", &PL_sv_undef, 0, 0);
      ######        return swash_fetch(PL_utf8_lower, p, TRUE) != 0;
		}
		
		bool
		Perl_is_utf8_cntrl(pTHX_ const U8 *p)
      ######    {
      ######        if (!is_utf8_char(p))
      ######    	return FALSE;
      ######        if (!PL_utf8_cntrl)
      ######    	PL_utf8_cntrl = swash_init("utf8", "IsCntrl", &PL_sv_undef, 0, 0);
      ######        return swash_fetch(PL_utf8_cntrl, p, TRUE) != 0;
		}
		
		bool
		Perl_is_utf8_graph(pTHX_ const U8 *p)
      ######    {
      ######        if (!is_utf8_char(p))
      ######    	return FALSE;
      ######        if (!PL_utf8_graph)
      ######    	PL_utf8_graph = swash_init("utf8", "IsGraph", &PL_sv_undef, 0, 0);
      ######        return swash_fetch(PL_utf8_graph, p, TRUE) != 0;
		}
		
		bool
		Perl_is_utf8_print(pTHX_ const U8 *p)
      ######    {
      ######        if (!is_utf8_char(p))
      ######    	return FALSE;
      ######        if (!PL_utf8_print)
      ######    	PL_utf8_print = swash_init("utf8", "IsPrint", &PL_sv_undef, 0, 0);
      ######        return swash_fetch(PL_utf8_print, p, TRUE) != 0;
		}
		
		bool
		Perl_is_utf8_punct(pTHX_ const U8 *p)
      ######    {
      ######        if (!is_utf8_char(p))
      ######    	return FALSE;
      ######        if (!PL_utf8_punct)
      ######    	PL_utf8_punct = swash_init("utf8", "IsPunct", &PL_sv_undef, 0, 0);
      ######        return swash_fetch(PL_utf8_punct, p, TRUE) != 0;
		}
		
		bool
		Perl_is_utf8_xdigit(pTHX_ const U8 *p)
      ######    {
      ######        if (!is_utf8_char(p))
      ######    	return FALSE;
      ######        if (!PL_utf8_xdigit)
      ######    	PL_utf8_xdigit = swash_init("utf8", "IsXDigit", &PL_sv_undef, 0, 0);
      ######        return swash_fetch(PL_utf8_xdigit, p, TRUE) != 0;
		}
		
		bool
		Perl_is_utf8_mark(pTHX_ const U8 *p)
          32    {
          32        if (!is_utf8_char(p))
      ######    	return FALSE;
          32        if (!PL_utf8_mark)
           3    	PL_utf8_mark = swash_init("utf8", "IsM", &PL_sv_undef, 0, 0);
          32        return swash_fetch(PL_utf8_mark, p, TRUE) != 0;
		}
		
		/*
		=for apidoc A|UV|to_utf8_case|U8 *p|U8* ustrp|STRLEN *lenp|SV **swash|char *normal|char *special
		
		The "p" contains the pointer to the UTF-8 string encoding
		the character that is being converted.
		
		The "ustrp" is a pointer to the character buffer to put the
		conversion result to.  The "lenp" is a pointer to the length
		of the result.
		
		The "swashp" is a pointer to the swash to use.
		
		Both the special and normal mappings are stored lib/unicore/To/Foo.pl,
		and loaded by SWASHGET, using lib/utf8_heavy.pl.  The special (usually,
		but not always, a multicharacter mapping), is tried first.
		
		The "special" is a string like "utf8::ToSpecLower", which means the
		hash %utf8::ToSpecLower.  The access to the hash is through
		Perl_to_utf8_case().
		
		The "normal" is a string like "ToLower" which means the swash
		%utf8::ToLower.
		
		=cut */
		
		UV
		Perl_to_utf8_case(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, SV **swashp, const char *normal, const char *special)
       79604    {
       79604        U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
       79604        STRLEN len = 0;
		
       79604        const UV uv0 = utf8_to_uvchr(p, 0);
		    /* The NATIVE_TO_UNI() and UNI_TO_NATIVE() mappings
		     * are necessary in EBCDIC, they are redundant no-ops
		     * in ASCII-ish platforms, and hopefully optimized away. */
       79604        const UV uv1 = NATIVE_TO_UNI(uv0);
       79604        uvuni_to_utf8(tmpbuf, uv1);
		
       79604        if (!*swashp) /* load on-demand */
          22             *swashp = swash_init("utf8", normal, &PL_sv_undef, 4, 0);
		
		    /* The 0xDF is the only special casing Unicode code point below 0x100. */
       79604        if (special && (uv1 == 0xDF || uv1 > 0xFF)) {
		         /* It might be "special" (sometimes, but not always,
			  * a multicharacter mapping) */
       18348    	 HV *hv;
       18348    	 SV **svp;
		
       18348    	 if ((hv  = get_hv(special, FALSE)) &&
			     (svp = hv_fetch(hv, (const char*)tmpbuf, UNISKIP(uv1), FALSE)) &&
			     (*svp)) {
        1164    	     const char *s;
		
        1164    	      s = SvPV_const(*svp, len);
        1164    	      if (len == 1)
      ######    		   len = uvuni_to_utf8(ustrp, NATIVE_TO_UNI(*(U8*)s)) - ustrp;
			      else {
		#ifdef EBCDIC
				   /* If we have EBCDIC we need to remap the characters
				    * since any characters in the low 256 are Unicode
				    * code points, not EBCDIC. */
				   U8 *t = (U8*)s, *tend = t + len, *d;
				
				   d = tmpbuf;
				   if (SvUTF8(*svp)) {
					STRLEN tlen = 0;
					
					while (t < tend) {
					     UV c = utf8_to_uvchr(t, &tlen);
					     if (tlen > 0) {
						  d = uvchr_to_utf8(d, UNI_TO_NATIVE(c));
						  t += tlen;
					     }
					     else
						  break;
					}
				   }
				   else {
					while (t < tend) {
					     d = uvchr_to_utf8(d, UNI_TO_NATIVE(*t));
					     t++;
					}
				   }
				   len = d - tmpbuf;
				   Copy(tmpbuf, ustrp, len, U8);
		#else
        1164    		   Copy(s, ustrp, len, U8);
		#endif
			      }
			 }
		    }
		
       79604        if (!len && *swashp) {
       78440    	 UV uv2 = swash_fetch(*swashp, tmpbuf, TRUE);
			 
       78440    	 if (uv2) {
			      /* It was "normal" (a single character mapping). */
       10872    	      UV uv3 = UNI_TO_NATIVE(uv2);
			      
       10872    	      len = uvchr_to_utf8(ustrp, uv3) - ustrp;
			 }
		    }
		
       79604        if (!len) /* Neither: just copy. */
       67568    	 len = uvchr_to_utf8(ustrp, uv0) - ustrp;
		
       79604        if (lenp)
       79604    	 *lenp = len;
		
       79604        return len ? utf8_to_uvchr(ustrp, 0) : 0;
		}
		
		/*
		=for apidoc A|UV|to_utf8_upper|const U8 *p|U8 *ustrp|STRLEN *lenp
		
		Convert the UTF-8 encoded character at p to its uppercase version and
		store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
		that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since
		the uppercase version may be longer than the original character.
		
		The first character of the uppercased version is returned
		(but note, as explained above, that there may be more.)
		
		=cut */
		
		UV
		Perl_to_utf8_upper(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
        7034    {
        7034        return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
		                             &PL_utf8_toupper, "ToUpper", "utf8::ToSpecUpper");
		}
		
		/*
		=for apidoc A|UV|to_utf8_title|const U8 *p|U8 *ustrp|STRLEN *lenp
		
		Convert the UTF-8 encoded character at p to its titlecase version and
		store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
		that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
		titlecase version may be longer than the original character.
		
		The first character of the titlecased version is returned
		(but note, as explained above, that there may be more.)
		
		=cut */
		
		UV
		Perl_to_utf8_title(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
         936    {
         936        return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
		                             &PL_utf8_totitle, "ToTitle", "utf8::ToSpecTitle");
		}
		
		/*
		=for apidoc A|UV|to_utf8_lower|const U8 *p|U8 *ustrp|STRLEN *lenp
		
		Convert the UTF-8 encoded character at p to its lowercase version and
		store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
		that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
		lowercase version may be longer than the original character.
		
		The first character of the lowercased version is returned
		(but note, as explained above, that there may be more.)
		
		=cut */
		
		UV
		Perl_to_utf8_lower(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
        7279    {
        7279        return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
		                             &PL_utf8_tolower, "ToLower", "utf8::ToSpecLower");
		}
		
		/*
		=for apidoc A|UV|to_utf8_fold|const U8 *p|U8 *ustrp|STRLEN *lenp
		
		Convert the UTF-8 encoded character at p to its foldcase version and
		store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
		that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
		foldcase version may be longer than the original character (up to
		three characters).
		
		The first character of the foldcased version is returned
		(but note, as explained above, that there may be more.)
		
		=cut */
		
		UV
		Perl_to_utf8_fold(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
       64355    {
       64355        return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
		                             &PL_utf8_tofold, "ToFold", "utf8::ToSpecFold");
		}
		
		/* a "swash" is a swatch hash */
		
		SV*
		Perl_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 minbits, I32 none)
        8359    {
		    dVAR;
        8359        SV* retval;
        8359        SV* tokenbufsv = sv_newmortal();
        8359        dSP;
        8359        const size_t pkg_len = strlen(pkg);
        8359        const size_t name_len = strlen(name);
        8359        HV *stash = gv_stashpvn(pkg, pkg_len, FALSE);
        8359        SV* errsv_save;
		
        8359        PUSHSTACKi(PERLSI_MAGIC);
        8359        ENTER;
        8359        SAVEI32(PL_hints);
        8359        PL_hints = 0;
        8359        save_re_context();
        8359        if (!gv_fetchmeth(stash, "SWASHNEW", 8, -1)) {	/* demand load utf8 */
          54    	ENTER;
          54    	errsv_save = newSVsv(ERRSV);
          54    	Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpvn(pkg,pkg_len),
					 Nullsv);
          54    	if (!SvTRUE(ERRSV))
          54    	    sv_setsv(ERRSV, errsv_save);
          54    	SvREFCNT_dec(errsv_save);
          54    	LEAVE;
		    }
        8359        SPAGAIN;
        8359        PUSHMARK(SP);
        8359        EXTEND(SP,5);
        8359        PUSHs(sv_2mortal(newSVpvn(pkg, pkg_len)));
        8359        PUSHs(sv_2mortal(newSVpvn(name, name_len)));
        8359        PUSHs(listsv);
        8359        PUSHs(sv_2mortal(newSViv(minbits)));
        8359        PUSHs(sv_2mortal(newSViv(none)));
        8359        PUTBACK;
        8359        if (IN_PERL_COMPILETIME) {
			/* XXX ought to be handled by lex_start */
          71    	SAVEI32(PL_in_my);
          71    	PL_in_my = 0;
          71    	sv_setpv(tokenbufsv, PL_tokenbuf);
		    }
        8359        errsv_save = newSVsv(ERRSV);
        8359        if (call_method("SWASHNEW", G_SCALAR))
        8359    	retval = newSVsv(*PL_stack_sp--);
		    else
      ######    	retval = &PL_sv_undef;
        8359        if (!SvTRUE(ERRSV))
        8359    	sv_setsv(ERRSV, errsv_save);
        8359        SvREFCNT_dec(errsv_save);
        8359        LEAVE;
        8359        POPSTACK;
        8359        if (IN_PERL_COMPILETIME) {
          71    	STRLEN len;
          71            const char* pv = SvPV_const(tokenbufsv, len);
		
          71    	Copy(pv, PL_tokenbuf, len+1, char);
          71    	PL_curcop->op_private = (U8)(PL_hints & HINT_PRIVATE_MASK);
		    }
        8359        if (!SvROK(retval) || SvTYPE(SvRV(retval)) != SVt_PVHV) {
      ######            if (SvPOK(retval))
      ######    	    Perl_croak(aTHX_ "Can't find Unicode property definition \"%"SVf"\"",
				       retval);
      ######    	Perl_croak(aTHX_ "SWASHNEW didn't return an HV ref");
		    }
        8359        return retval;
		}
		
		
		/* This API is wrong for special case conversions since we may need to
		 * return several Unicode characters for a single Unicode character
		 * (see lib/unicore/SpecCase.txt) The SWASHGET in lib/utf8_heavy.pl is
		 * the lower-level routine, and it is similarly broken for returning
		 * multiple values.  --jhi */
		UV
		Perl_swash_fetch(pTHX_ SV *sv, const U8 *ptr, bool do_utf8)
     2524542    {
		    dVAR;
     2524542        HV* hv = (HV*)SvRV(sv);
     2524542        U32 klen;
     2524542        U32 off;
     2524542        STRLEN slen;
     2524542        STRLEN needents;
     2524542        const U8 *tmps = NULL;
     2524542        U32 bit;
     2524542        SV *retval;
     2524542        U8 tmputf8[2];
     2524542        UV c = NATIVE_TO_ASCII(*ptr);
		
     2524542        if (!do_utf8 && !UNI_IS_INVARIANT(c)) {
        1324            tmputf8[0] = (U8)UTF8_EIGHT_BIT_HI(c);
        1324            tmputf8[1] = (U8)UTF8_EIGHT_BIT_LO(c);
        1324            ptr = tmputf8;
		    }
		    /* Given a UTF-X encoded char 0xAA..0xYY,0xZZ
		     * then the "swatch" is a vec() for al the chars which start
		     * with 0xAA..0xYY
		     * So the key in the hash (klen) is length of encoded char -1
		     */
     2524542        klen = UTF8SKIP(ptr) - 1;
     2524542        off  = ptr[klen];
		
     2524542        if (klen == 0)
		     {
		      /* If char in invariant then swatch is for all the invariant chars
		       * In both UTF-8 and UTF-8-MOD that happens to be UTF_CONTINUATION_MARK
		       */
       87529          needents = UTF_CONTINUATION_MARK;
       87529          off      = NATIVE_TO_UTF(ptr[klen]);
		     }
		    else
		     {
		      /* If char is encoded then swatch is for the prefix */
     2437013          needents = (1 << UTF_ACCUMULATION_SHIFT);
     2437013          off      = NATIVE_TO_UTF(ptr[klen]) & UTF_CONTINUATION_MASK;
		     }
		
		    /*
		     * This single-entry cache saves about 1/3 of the utf8 overhead in test
		     * suite.  (That is, only 7-8% overall over just a hash cache.  Still,
		     * it's nothing to sniff at.)  Pity we usually come through at least
		     * two function calls to get here...
		     *
		     * NB: this code assumes that swatches are never modified, once generated!
		     */
		
     2524542        if (hv   == PL_last_swash_hv &&
			klen == PL_last_swash_klen &&
			(!klen || memEQ((char *)ptr, (char *)PL_last_swash_key, klen)) )
		    {
     2402559    	tmps = PL_last_swash_tmps;
     2402559    	slen = PL_last_swash_slen;
		    }
		    else {
			/* Try our second-level swatch cache, kept in a hash. */
      121983    	SV** svp = hv_fetch(hv, (const char*)ptr, klen, FALSE);
		
			/* If not cached, generate it via utf8::SWASHGET */
      121983    	if (!svp || !SvPOK(*svp) || !(tmps = (const U8*)SvPV_const(*svp, slen))) {
       32163    	    dSP;
			    /* We use utf8n_to_uvuni() as we want an index into
			       Unicode tables, not a native character number.
			     */
       32163    	    UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXBYTES, 0,
							   ckWARN(WARN_UTF8) ?
       32163    					   0 : UTF8_ALLOW_ANY);
       32163    	    SV *errsv_save;
       32163    	    ENTER;
       32163    	    SAVETMPS;
       32163    	    save_re_context();
       32163    	    PUSHSTACKi(PERLSI_MAGIC);
       32163    	    PUSHMARK(SP);
       32163    	    EXTEND(SP,3);
       32163    	    PUSHs((SV*)sv);
			    /* On EBCDIC & ~(0xA0-1) isn't a useful thing to do */
			    PUSHs(sv_2mortal(newSViv((klen) ?
       32163    				     (code_point & ~(needents - 1)) : 0)));
       32163    	    PUSHs(sv_2mortal(newSViv(needents)));
       32163    	    PUTBACK;
       32163    	    errsv_save = newSVsv(ERRSV);
       32163    	    if (call_method("SWASHGET", G_SCALAR))
       32163    		retval = newSVsv(*PL_stack_sp--);
			    else
      ######    		retval = &PL_sv_undef;
       32163    	    if (!SvTRUE(ERRSV))
       32101    		sv_setsv(ERRSV, errsv_save);
       32163    	    SvREFCNT_dec(errsv_save);
       32163    	    POPSTACK;
       32163    	    FREETMPS;
       32163    	    LEAVE;
       32163    	    if (IN_PERL_COMPILETIME)
          36    		PL_curcop->op_private = (U8)(PL_hints & HINT_PRIVATE_MASK);
		
       32163    	    svp = hv_store(hv, (const char *)ptr, klen, retval, 0);
		
       32163    	    if (!svp || !(tmps = (U8*)SvPV(*svp, slen)) || (slen << 3) < needents)
      ######    		Perl_croak(aTHX_ "SWASHGET didn't return result of proper length");
			}
		
      121983    	PL_last_swash_hv = hv;
      121983    	PL_last_swash_klen = klen;
			/* FIXME change interpvar.h?  */
      121983    	PL_last_swash_tmps = (U8 *) tmps;
      121983    	PL_last_swash_slen = slen;
      121983    	if (klen)
       80335    	    Copy(ptr, PL_last_swash_key, klen, U8);
		    }
		
     2524542        switch ((int)((slen << 3) / needents)) {
		    case 1:
     1812687    	bit = 1 << (off & 7);
     1812687    	off >>= 3;
     1812687    	return (tmps[off] & bit) != 0;
		    case 8:
       29915    	return tmps[off];
		    case 16:
      601885    	off <<= 1;
      601885    	return (tmps[off] << 8) + tmps[off + 1] ;
		    case 32:
       80055    	off <<= 2;
       80055    	return (tmps[off] << 24) + (tmps[off+1] << 16) + (tmps[off+2] << 8) + tmps[off + 3] ;
		    }
      ######        Perl_croak(aTHX_ "panic: swash_fetch");
     2524542        return 0;
		}
		
		
		/*
		=for apidoc A|U8 *|uvchr_to_utf8|U8 *d|UV uv
		
		Adds the UTF-8 representation of the Native codepoint C<uv> to the end
		of the string C<d>; C<d> should be have at least C<UTF8_MAXBYTES+1> free
		bytes available. The return value is the pointer to the byte after the
		end of the new character. In other words,
		
		    d = uvchr_to_utf8(d, uv);
		
		is the recommended wide native character-aware way of saying
		
		    *(d++) = uv;
		
		=cut
		*/
		
		/* On ASCII machines this is normally a macro but we want a
		   real function in case XS code wants it
		*/
		#undef Perl_uvchr_to_utf8
		U8 *
		Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
      ######    {
      ######        return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), 0);
		}
		
		U8 *
		Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
      854261    {
      854261        return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), flags);
		}
		
		/*
		=for apidoc A|UV|utf8n_to_uvchr|U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
		
		Returns the native character value of the first character in the string C<s>
		which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
		length, in bytes, of that character.
		
		Allows length and flags to be passed to low level routine.
		
		=cut
		*/
		/* On ASCII machines this is normally a macro but we want
		   a real function in case XS code wants it
		*/
		#undef Perl_utf8n_to_uvchr
		UV
		Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
      ######    {
      ######        UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags);
      ######        return UNI_TO_NATIVE(uv);
		}
		
		/*
		=for apidoc A|char *|pv_uni_display|SV *dsv|U8 *spv|STRLEN len|STRLEN pvlim|UV flags
		
		Build to the scalar dsv a displayable version of the string spv,
		length len, the displayable version being at most pvlim bytes long
		(if longer, the rest is truncated and "..." will be appended).
		
		The flags argument can have UNI_DISPLAY_ISPRINT set to display
		isPRINT()able characters as themselves, UNI_DISPLAY_BACKSLASH
		to display the \\[nrfta\\] as the backslashed versions (like '\n')
		(UNI_DISPLAY_BACKSLASH is preferred over UNI_DISPLAY_ISPRINT for \\).
		UNI_DISPLAY_QQ (and its alias UNI_DISPLAY_REGEX) have both
		UNI_DISPLAY_BACKSLASH and UNI_DISPLAY_ISPRINT turned on.
		
		The pointer to the PV of the dsv is returned.
		
		=cut */
		char *
		Perl_pv_uni_display(pTHX_ SV *dsv, const U8 *spv, STRLEN len, STRLEN pvlim, UV flags)
           5    {
           5        int truncated = 0;
           5        const char *s, *e;
		
           5        sv_setpvn(dsv, "", 0);
          14        for (s = (const char *)spv, e = s + len; s < e; s += UTF8SKIP(s)) {
           9    	 UV u;
			  /* This serves double duty as a flag and a character to print after
			     a \ when flags & UNI_DISPLAY_BACKSLASH is true.
			  */
           9    	 char ok = 0;
		
           9    	 if (pvlim && SvCUR(dsv) >= pvlim) {
      ######    	      truncated++;
      ######    	      break;
			 }
           9    	 u = utf8_to_uvchr((U8*)s, 0);
           9    	 if (u < 256) {
           1    	     const unsigned char c = (unsigned char)u & 0xFF;
           1    	     if (!ok && (flags & UNI_DISPLAY_BACKSLASH)) {
           1    	         switch (c) {
				 case '\n':
      ######    		     ok = 'n'; break;
				 case '\r':
      ######    		     ok = 'r'; break;
				 case '\t':
      ######    		     ok = 't'; break;
				 case '\f':
      ######    		     ok = 'f'; break;
				 case '\a':
      ######    		     ok = 'a'; break;
				 case '\\':
      ######    		     ok = '\\'; break;
           1    		 default: break;
				 }
           1    		 if (ok) {
      ######    		     Perl_sv_catpvf(aTHX_ dsv, "\\%c", ok);
				 }
			     }
			     /* isPRINT() is the locale-blind version. */
           1    	     if (!ok && (flags & UNI_DISPLAY_ISPRINT) && isPRINT(c)) {
      ######    	         Perl_sv_catpvf(aTHX_ dsv, "%c", c);
      ######    		 ok = 1;
			     }
			 }
           9    	 if (!ok)
           9    	     Perl_sv_catpvf(aTHX_ dsv, "\\x{%"UVxf"}", u);
		    }
           5        if (truncated)
      ######    	 sv_catpvn(dsv, "...", 3);
		    
           5        return SvPVX(dsv);
		}
		
		/*
		=for apidoc A|char *|sv_uni_display|SV *dsv|SV *ssv|STRLEN pvlim|UV flags
		
		Build to the scalar dsv a displayable version of the scalar sv,
		the displayable version being at most pvlim bytes long
		(if longer, the rest is truncated and "..." will be appended).
		
		The flags argument is as in pv_uni_display().
		
		The pointer to the PV of the dsv is returned.
		
		=cut */
		char *
		Perl_sv_uni_display(pTHX_ SV *dsv, SV *ssv, STRLEN pvlim, UV flags)
           5    {
           5         return Perl_pv_uni_display(aTHX_ dsv, (const U8*)SvPVX_const(ssv),
						SvCUR(ssv), pvlim, flags);
		}
		
		/*
		=for apidoc A|I32|ibcmp_utf8|const char *s1|char **pe1|register UV l1|bool u1|const char *s2|char **pe2|register UV l2|bool u2
		
		Return true if the strings s1 and s2 differ case-insensitively, false
		if not (if they are equal case-insensitively).  If u1 is true, the
		string s1 is assumed to be in UTF-8-encoded Unicode.  If u2 is true,
		the string s2 is assumed to be in UTF-8-encoded Unicode.  If u1 or u2
		are false, the respective string is assumed to be in native 8-bit
		encoding.
		
		If the pe1 and pe2 are non-NULL, the scanning pointers will be copied
		in there (they will point at the beginning of the I<next> character).
		If the pointers behind pe1 or pe2 are non-NULL, they are the end
		pointers beyond which scanning will not continue under any
		circumstances.  If the byte lengths l1 and l2 are non-zero, s1+l1 and
		s2+l2 will be used as goal end pointers that will also stop the scan,
		and which qualify towards defining a successful match: all the scans
		that define an explicit length must reach their goal pointers for
		a match to succeed).
		
		For case-insensitiveness, the "casefolding" of Unicode is used
		instead of upper/lowercasing both the characters, see
		http://www.unicode.org/unicode/reports/tr21/ (Case Mappings).
		
		=cut */
		I32
		Perl_ibcmp_utf8(pTHX_ const char *s1, char **pe1, register UV l1, bool u1, const char *s2, char **pe2, register UV l2, bool u2)
       11858    {
       11858         register const U8 *p1  = (const U8*)s1;
       11858         register const U8 *p2  = (const U8*)s2;
       11858         register const U8 *f1 = 0, *f2 = 0;
       11858         register U8 *e1 = 0, *q1 = 0;
       11858         register U8 *e2 = 0, *q2 = 0;
       11858         STRLEN n1 = 0, n2 = 0;
       11858         U8 foldbuf1[UTF8_MAXBYTES_CASE+1];
       11858         U8 foldbuf2[UTF8_MAXBYTES_CASE+1];
       11858         U8 natbuf[1+1];
       11858         STRLEN foldlen1, foldlen2;
       11858         bool match;
		     
       11858         if (pe1)
      ######    	  e1 = *(U8**)pe1;
       11858         if (e1 == 0 || (l1 && l1 < (UV)(e1 - (const U8*)s1)))
       11858    	  f1 = (const U8*)s1 + l1;
       11858         if (pe2)
        8907    	  e2 = *(U8**)pe2;
       11858         if (e2 == 0 || (l2 && l2 < (UV)(e2 - (const U8*)s2)))
        2951    	  f2 = (const U8*)s2 + l2;
		
       11858         if ((e1 == 0 && f1 == 0) || (e2 == 0 && f2 == 0) || (f1 == 0 && f2 == 0))
      ######    	  return 1; /* mismatch; possible infinite loop or false positive */
		
       11858         if (!u1 || !u2)
          25    	  natbuf[1] = 0; /* Need to terminate the buffer. */
		
       27022         while ((e1 == 0 || p1 < e1) &&
			    (f1 == 0 || p1 < f1) &&
			    (e2 == 0 || p2 < e2) &&
			    (f2 == 0 || p2 < f2)) {
       15179    	  if (n1 == 0) {
       15179    	       if (u1)
       15147    		    to_utf8_fold(p1, foldbuf1, &foldlen1);
			       else {
          32    		    natbuf[0] = *p1;
          32    		    to_utf8_fold(natbuf, foldbuf1, &foldlen1);
			       }
       15179    	       q1 = foldbuf1;
       15179    	       n1 = foldlen1;
			  }
       15179    	  if (n2 == 0) {
       14938    	       if (u2)
       14929    		    to_utf8_fold(p2, foldbuf2, &foldlen2);
			       else {
           9    		    natbuf[0] = *p2;
           9    		    to_utf8_fold(natbuf, foldbuf2, &foldlen2);
			       }
       14938    	       q2 = foldbuf2;
       14938    	       n2 = foldlen2;
			  }
       30343    	  while (n1 && n2) {
       15179    	       if ( UTF8SKIP(q1) != UTF8SKIP(q2) ||
				   (UTF8SKIP(q1) == 1 && *q1 != *q2) ||
				    memNE((char*)q1, (char*)q2, UTF8SKIP(q1)) )
          15    		   return 1; /* mismatch */
       15164    	       n1 -= UTF8SKIP(q1);
       15164    	       q1 += UTF8SKIP(q1);
       15164    	       n2 -= UTF8SKIP(q2);
       15164    	       q2 += UTF8SKIP(q2);
			  }
       15164    	  if (n1 == 0)
       15164    	       p1 += u1 ? UTF8SKIP(p1) : 1;
       15164    	  if (n2 == 0)
       14923    	       p2 += u2 ? UTF8SKIP(p2) : 1;
		
		     }
		
		     /* A match is defined by all the scans that specified
		      * an explicit length reaching their final goals. */
       11843         match = (f1 == 0 || p1 == f1) && (f2 == 0 || p2 == f2);
		
       11843         if (match) {
        8892    	  if (pe1)
      ######    	       *pe1 = (char*)p1;
        8892    	  if (pe2)
        8892    	       *pe2 = (char*)p2;
		     }
		
       11843         return match ? 0 : 1; /* 0 match, 1 mismatch */
		}
		
		/*
		 * Local variables:
		 * c-indentation-style: bsd
		 * c-basic-offset: 4
		 * indent-tabs-mode: t
		 * End:
		 *
		 * ex: set ts=8 sts=4 sw=4 noet:
		 */
