     1			/*    regcomp.c
     2			 */
     3			
     4			/*
     5			 * "A fair jaw-cracker dwarf-language must be."  --Samwise Gamgee
     6			 */
     7			
     8			/* This file contains functions for compiling a regular expression.  See
     9			 * also regexec.c which funnily enough, contains functions for executing
    10			 * a regular expression.
    11			 *
    12			 * This file is also copied at build time to ext/re/re_comp.c, where
    13			 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
    14			 * This causes the main functions to be compiled under new names and with
    15			 * debugging support added, which makes "use re 'debug'" work.
    16			 */
    17			
    18			/* NOTE: this is derived from Henry Spencer's regexp code, and should not
    19			 * confused with the original package (see point 3 below).  Thanks, Henry!
    20			 */
    21			
    22			/* Additional note: this code is very heavily munged from Henry's version
    23			 * in places.  In some spots I've traded clarity for efficiency, so don't
    24			 * blame Henry for some of the lack of readability.
    25			 */
    26			
    27			/* The names of the functions have been changed from regcomp and
    28			 * regexec to  pregcomp and pregexec in order to avoid conflicts
    29			 * with the POSIX routines of the same names.
    30			*/
    31			
    32			#ifdef PERL_EXT_RE_BUILD
    33			/* need to replace pregcomp et al, so enable that */
    34			#  ifndef PERL_IN_XSUB_RE
    35			#    define PERL_IN_XSUB_RE
    36			#  endif
    37			/* need access to debugger hooks */
    38			#  if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
    39			#    define DEBUGGING
    40			#  endif
    41			#endif
    42			
    43			#ifdef PERL_IN_XSUB_RE
    44			/* We *really* need to overwrite these symbols: */
    45			#  define Perl_pregcomp my_regcomp
    46			#  define Perl_regdump my_regdump
    47			#  define Perl_regprop my_regprop
    48			#  define Perl_pregfree my_regfree
    49			#  define Perl_re_intuit_string my_re_intuit_string
    50			/* *These* symbols are masked to allow static link. */
    51			#  define Perl_regnext my_regnext
    52			#  define Perl_save_re_context my_save_re_context
    53			#  define Perl_reginitcolors my_reginitcolors
    54			
    55			#  define PERL_NO_GET_CONTEXT
    56			#endif
    57			
    58			/*
    59			 * pregcomp and pregexec -- regsub and regerror are not used in perl
    60			 *
    61			 *	Copyright (c) 1986 by University of Toronto.
    62			 *	Written by Henry Spencer.  Not derived from licensed software.
    63			 *
    64			 *	Permission is granted to anyone to use this software for any
    65			 *	purpose on any computer system, and to redistribute it freely,
    66			 *	subject to the following restrictions:
    67			 *
    68			 *	1. The author is not responsible for the consequences of use of
    69			 *		this software, no matter how awful, even if they arise
    70			 *		from defects in it.
    71			 *
    72			 *	2. The origin of this software must not be misrepresented, either
    73			 *		by explicit claim or by omission.
    74			 *
    75			 *	3. Altered versions must be plainly marked as such, and must not
    76			 *		be misrepresented as being the original software.
    77			 *
    78			 *
    79			 ****    Alterations to Henry's code are...
    80			 ****
    81			 ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
    82			 ****    2000, 2001, 2002, 2003, 2004, 2005, by Larry Wall and others
    83			 ****
    84			 ****    You may distribute under the terms of either the GNU General Public
    85			 ****    License or the Artistic License, as specified in the README file.
    86			
    87			 *
    88			 * Beware that some of this code is subtly aware of the way operator
    89			 * precedence is structured in regular expressions.  Serious changes in
    90			 * regular-expression syntax might require a total rethink.
    91			 */
    92			#include "EXTERN.h"
    93			#define PERL_IN_REGCOMP_C
    94			#include "perl.h"
    95			
    96			#ifndef PERL_IN_XSUB_RE
    97			#  include "INTERN.h"
    98			#endif
    99			
   100			#define REG_COMP_C
   101			#include "regcomp.h"
   102			
   103			#ifdef op
   104			#undef op
   105			#endif /* op */
   106			
   107			#ifdef MSDOS
   108			#  if defined(BUGGY_MSC6)
   109			 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
   110			#    pragma optimize("a",off)
   111			 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
   112			#    pragma optimize("w",on )
   113			#  endif /* BUGGY_MSC6 */
   114			#endif /* MSDOS */
   115			
   116			#ifndef STATIC
   117			#define	STATIC	static
   118			#endif
   119			
   120			typedef struct RExC_state_t {
   121			    U32		flags;			/* are we folding, multilining? */
   122			    char	*precomp;		/* uncompiled string. */
   123			    regexp	*rx;
   124			    char	*start;			/* Start of input for compile */
   125			    char	*end;			/* End of input for compile */
   126			    char	*parse;			/* Input-scan pointer. */
   127			    I32		whilem_seen;		/* number of WHILEM in this expr */
   128			    regnode	*emit_start;		/* Start of emitted-code area */
   129			    regnode	*emit;			/* Code-emit pointer; &regdummy = don't = compiling */
   130			    I32		naughty;		/* How bad is this pattern? */
   131			    I32		sawback;		/* Did we see \1, ...? */
   132			    U32		seen;
   133			    I32		size;			/* Code size. */
   134			    I32		npar;			/* () count. */
   135			    I32		extralen;
   136			    I32		seen_zerolen;
   137			    I32		seen_evals;
   138			    I32		utf8;
   139			#if ADD_TO_REGEXEC
   140			    char 	*starttry;		/* -Dr: where regtry was called. */
   141			#define RExC_starttry	(pRExC_state->starttry)
   142			#endif
   143			} RExC_state_t;
   144			
   145			#define RExC_flags	(pRExC_state->flags)
   146			#define RExC_precomp	(pRExC_state->precomp)
   147			#define RExC_rx		(pRExC_state->rx)
   148			#define RExC_start	(pRExC_state->start)
   149			#define RExC_end	(pRExC_state->end)
   150			#define RExC_parse	(pRExC_state->parse)
   151			#define RExC_whilem_seen	(pRExC_state->whilem_seen)
   152			#define RExC_offsets	(pRExC_state->rx->offsets) /* I am not like the others */
   153			#define RExC_emit	(pRExC_state->emit)
   154			#define RExC_emit_start	(pRExC_state->emit_start)
   155			#define RExC_naughty	(pRExC_state->naughty)
   156			#define RExC_sawback	(pRExC_state->sawback)
   157			#define RExC_seen	(pRExC_state->seen)
   158			#define RExC_size	(pRExC_state->size)
   159			#define RExC_npar	(pRExC_state->npar)
   160			#define RExC_extralen	(pRExC_state->extralen)
   161			#define RExC_seen_zerolen	(pRExC_state->seen_zerolen)
   162			#define RExC_seen_evals	(pRExC_state->seen_evals)
   163			#define RExC_utf8	(pRExC_state->utf8)
   164			
   165			#define	ISMULT1(c)	((c) == '*' || (c) == '+' || (c) == '?')
   166			#define	ISMULT2(s)	((*s) == '*' || (*s) == '+' || (*s) == '?' || \
   167				((*s) == '{' && regcurly(s)))
   168			
   169			#ifdef SPSTART
   170			#undef SPSTART		/* dratted cpp namespace... */
   171			#endif
   172			/*
   173			 * Flags to be passed up and down.
   174			 */
   175			#define	WORST		0	/* Worst case. */
   176			#define	HASWIDTH	0x1	/* Known to match non-null strings. */
   177			#define	SIMPLE		0x2	/* Simple enough to be STAR/PLUS operand. */
   178			#define	SPSTART		0x4	/* Starts with * or +. */
   179			#define TRYAGAIN	0x8	/* Weeded out a declaration. */
   180			
   181			/* Length of a variant. */
   182			
   183			typedef struct scan_data_t {
   184			    I32 len_min;
   185			    I32 len_delta;
   186			    I32 pos_min;
   187			    I32 pos_delta;
   188			    SV *last_found;
   189			    I32 last_end;			/* min value, <0 unless valid. */
   190			    I32 last_start_min;
   191			    I32 last_start_max;
   192			    SV **longest;			/* Either &l_fixed, or &l_float. */
   193			    SV *longest_fixed;
   194			    I32 offset_fixed;
   195			    SV *longest_float;
   196			    I32 offset_float_min;
   197			    I32 offset_float_max;
   198			    I32 flags;
   199			    I32 whilem_c;
   200			    I32 *last_closep;
   201			    struct regnode_charclass_class *start_class;
   202			} scan_data_t;
   203			
   204			/*
   205			 * Forward declarations for pregcomp()'s friends.
   206			 */
   207			
   208			static const scan_data_t zero_scan_data =
   209			  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
   210			
   211			#define SF_BEFORE_EOL		(SF_BEFORE_SEOL|SF_BEFORE_MEOL)
   212			#define SF_BEFORE_SEOL		0x1
   213			#define SF_BEFORE_MEOL		0x2
   214			#define SF_FIX_BEFORE_EOL	(SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
   215			#define SF_FL_BEFORE_EOL	(SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
   216			
   217			#ifdef NO_UNARY_PLUS
   218			#  define SF_FIX_SHIFT_EOL	(0+2)
   219			#  define SF_FL_SHIFT_EOL		(0+4)
   220			#else
   221			#  define SF_FIX_SHIFT_EOL	(+2)
   222			#  define SF_FL_SHIFT_EOL		(+4)
   223			#endif
   224			
   225			#define SF_FIX_BEFORE_SEOL	(SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
   226			#define SF_FIX_BEFORE_MEOL	(SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
   227			
   228			#define SF_FL_BEFORE_SEOL	(SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
   229			#define SF_FL_BEFORE_MEOL	(SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
   230			#define SF_IS_INF		0x40
   231			#define SF_HAS_PAR		0x80
   232			#define SF_IN_PAR		0x100
   233			#define SF_HAS_EVAL		0x200
   234			#define SCF_DO_SUBSTR		0x400
   235			#define SCF_DO_STCLASS_AND	0x0800
   236			#define SCF_DO_STCLASS_OR	0x1000
   237			#define SCF_DO_STCLASS		(SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
   238			#define SCF_WHILEM_VISITED_POS	0x2000
   239			
   240			#define UTF (RExC_utf8 != 0)
   241			#define LOC ((RExC_flags & PMf_LOCALE) != 0)
   242			#define FOLD ((RExC_flags & PMf_FOLD) != 0)
   243			
   244			#define OOB_UNICODE		12345678
   245			#define OOB_NAMEDCLASS		-1
   246			
   247			#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
   248			#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
   249			
   250			
   251			/* length of regex to show in messages that don't mark a position within */
   252			#define RegexLengthToShowInErrorMessages 127
   253			
   254			/*
   255			 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
   256			 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
   257			 * op/pragma/warn/regcomp.
   258			 */
   259			#define MARKER1 "<-- HERE"    /* marker as it appears in the description */
   260			#define MARKER2 " <-- HERE "  /* marker as it appears within the regex */
   261			
   262			#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
   263			
   264			/*
   265			 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
   266			 * arg. Show regex, up to a maximum length. If it's too long, chop and add
   267			 * "...".
   268			 */
   269			#define	FAIL(msg) STMT_START {						\
   270			    const char *ellipses = "";						\
   271			    IV len = RExC_end - RExC_precomp;					\
   272												\
   273			    if (!SIZE_ONLY)							\
   274				SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx);			\
   275			    if (len > RegexLengthToShowInErrorMessages) {			\
   276				/* chop 10 shorter than the max, to ensure meaning of "..." */	\
   277				len = RegexLengthToShowInErrorMessages - 10;			\
   278				ellipses = "...";						\
   279			    }									\
   280			    Perl_croak(aTHX_ "%s in regex m/%.*s%s/",				\
   281				    msg, (int)len, RExC_precomp, ellipses);			\
   282			} STMT_END
   283			
   284			/*
   285			 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
   286			 * args. Show regex, up to a maximum length. If it's too long, chop and add
   287			 * "...".
   288			 */
   289			#define	FAIL2(pat,msg) STMT_START {					\
   290			    const char *ellipses = "";						\
   291			    IV len = RExC_end - RExC_precomp;					\
   292												\
   293			    if (!SIZE_ONLY)							\
   294				SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx);			\
   295			    if (len > RegexLengthToShowInErrorMessages) {			\
   296				/* chop 10 shorter than the max, to ensure meaning of "..." */	\
   297				len = RegexLengthToShowInErrorMessages - 10;			\
   298				ellipses = "...";						\
   299			    }									\
   300			    S_re_croak2(aTHX_ pat, " in regex m/%.*s%s/",			\
   301				    msg, (int)len, RExC_precomp, ellipses);			\
   302			} STMT_END
   303			
   304			
   305			/*
   306			 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
   307			 */
   308			#define	Simple_vFAIL(m) STMT_START {					\
   309			    const IV offset = RExC_parse - RExC_precomp;			\
   310			    Perl_croak(aTHX_ "%s" REPORT_LOCATION,				\
   311				    m, (int)offset, RExC_precomp, RExC_precomp + offset);	\
   312			} STMT_END
   313			
   314			/*
   315			 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
   316			 */
   317			#define	vFAIL(m) STMT_START {				\
   318			    if (!SIZE_ONLY)					\
   319				SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx);	\
   320			    Simple_vFAIL(m);					\
   321			} STMT_END
   322			
   323			/*
   324			 * Like Simple_vFAIL(), but accepts two arguments.
   325			 */
   326			#define	Simple_vFAIL2(m,a1) STMT_START {			\
   327			    const IV offset = RExC_parse - RExC_precomp;			\
   328			    S_re_croak2(aTHX_ m, REPORT_LOCATION, a1,			\
   329				    (int)offset, RExC_precomp, RExC_precomp + offset);	\
   330			} STMT_END
   331			
   332			/*
   333			 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
   334			 */
   335			#define	vFAIL2(m,a1) STMT_START {			\
   336			    if (!SIZE_ONLY)					\
   337				SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx);	\
   338			    Simple_vFAIL2(m, a1);				\
   339			} STMT_END
   340			
   341			
   342			/*
   343			 * Like Simple_vFAIL(), but accepts three arguments.
   344			 */
   345			#define	Simple_vFAIL3(m, a1, a2) STMT_START {			\
   346			    const IV offset = RExC_parse - RExC_precomp;		\
   347			    S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2,		\
   348				    (int)offset, RExC_precomp, RExC_precomp + offset);	\
   349			} STMT_END
   350			
   351			/*
   352			 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
   353			 */
   354			#define	vFAIL3(m,a1,a2) STMT_START {			\
   355			    if (!SIZE_ONLY)					\
   356				SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx);	\
   357			    Simple_vFAIL3(m, a1, a2);				\
   358			} STMT_END
   359			
   360			/*
   361			 * Like Simple_vFAIL(), but accepts four arguments.
   362			 */
   363			#define	Simple_vFAIL4(m, a1, a2, a3) STMT_START {		\
   364			    const IV offset = RExC_parse - RExC_precomp;		\
   365			    S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3,		\
   366				    (int)offset, RExC_precomp, RExC_precomp + offset);	\
   367			} STMT_END
   368			
   369			#define	vWARN(loc,m) STMT_START {					\
   370			    const IV offset = loc - RExC_precomp;				\
   371			    Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION,	\
   372				    m, (int)offset, RExC_precomp, RExC_precomp + offset);	\
   373			} STMT_END
   374			
   375			#define	vWARNdep(loc,m) STMT_START {					\
   376			    const IV offset = loc - RExC_precomp;				\
   377			    Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP),		\
   378				    "%s" REPORT_LOCATION,					\
   379				    m, (int)offset, RExC_precomp, RExC_precomp + offset);	\
   380			} STMT_END
   381			
   382			
   383			#define	vWARN2(loc, m, a1) STMT_START {					\
   384			    const IV offset = loc - RExC_precomp;				\
   385			    Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,		\
   386				    a1, (int)offset, RExC_precomp, RExC_precomp + offset);	\
   387			} STMT_END
   388			
   389			#define	vWARN3(loc, m, a1, a2) STMT_START {				\
   390			    const IV offset = loc - RExC_precomp;				\
   391			    Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,		\
   392				    a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset);	\
   393			} STMT_END
   394			
   395			#define	vWARN4(loc, m, a1, a2, a3) STMT_START {				\
   396			    const IV offset = loc - RExC_precomp;				\
   397			    Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,		\
   398				    a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
   399			} STMT_END
   400			
   401			#define	vWARN5(loc, m, a1, a2, a3, a4) STMT_START {			\
   402			    const IV offset = loc - RExC_precomp;				\
   403			    Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,		\
   404				    a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
   405			} STMT_END
   406			
   407			
   408			/* Allow for side effects in s */
   409			#define REGC(c,s) STMT_START {			\
   410			    if (!SIZE_ONLY) *(s) = (c); else (void)(s);	\
   411			} STMT_END
   412			
   413			/* Macros for recording node offsets.   20001227 mjd@plover.com 
   414			 * Nodes are numbered 1, 2, 3, 4.  Node #n's position is recorded in
   415			 * element 2*n-1 of the array.  Element #2n holds the byte length node #n.
   416			 * Element 0 holds the number n.
   417			 */
   418			
   419			#define MJD_OFFSET_DEBUG(x)
   420			/* #define MJD_OFFSET_DEBUG(x) DEBUG_r(Perl_warn_nocontext x) */
   421			
   422			
   423			#define Set_Node_Offset_To_R(node,byte) STMT_START {			\
   424			    if (! SIZE_ONLY) {							\
   425				MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n",		\
   426					__LINE__, (node), (byte)));				\
   427				if((node) < 0) {						\
   428				    Perl_croak(aTHX_ "value of node is %d in Offset macro", node); \
   429				} else {							\
   430				    RExC_offsets[2*(node)-1] = (byte);				\
   431				}								\
   432			    }									\
   433			} STMT_END
   434			
   435			#define Set_Node_Offset(node,byte) \
   436			    Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
   437			#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
   438			
   439			#define Set_Node_Length_To_R(node,len) STMT_START {			\
   440			    if (! SIZE_ONLY) {							\
   441				MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n",		\
   442					__LINE__, (node), (len)));				\
   443				if((node) < 0) {						\
   444				    Perl_croak(aTHX_ "value of node is %d in Length macro", node); \
   445				} else {							\
   446				    RExC_offsets[2*(node)] = (len);				\
   447				}								\
   448			    }									\
   449			} STMT_END
   450			
   451			#define Set_Node_Length(node,len) \
   452			    Set_Node_Length_To_R((node)-RExC_emit_start, len)
   453			#define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
   454			#define Set_Node_Cur_Length(node) \
   455			    Set_Node_Length(node, RExC_parse - parse_start)
   456			
   457			/* Get offsets and lengths */
   458			#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
   459			#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
   460			
   461			static void clear_re(pTHX_ void *r);
   462			
   463			/* Mark that we cannot extend a found fixed substring at this point.
   464			   Updata the longest found anchored substring and the longest found
   465			   floating substrings if needed. */
   466			
   467			STATIC void
   468			S_scan_commit(pTHX_ RExC_state_t *pRExC_state, scan_data_t *data)
   469	     4127328    {
   470	     4127328        const STRLEN l = CHR_SVLEN(data->last_found);
   471	     4127328        const STRLEN old_l = CHR_SVLEN(*data->longest);
   472			
   473	     4127328        if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
   474	      236273    	SvSetMagicSV(*data->longest, data->last_found);
   475	      236273    	if (*data->longest == data->longest_fixed) {
   476	      117024    	    data->offset_fixed = l ? data->last_start_min : data->pos_min;
   477	      117024    	    if (data->flags & SF_BEFORE_EOL)
   478	       26077    		data->flags
   479					    |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
   480				    else
   481	       90947    		data->flags &= ~SF_FIX_BEFORE_EOL;
   482				}
   483				else {
   484	      119249    	    data->offset_float_min = l ? data->last_start_min : data->pos_min;
   485	      119249    	    data->offset_float_max = (l
   486							      ? data->last_start_max
   487							      : data->pos_min + data->pos_delta);
   488	      119249    	    if ((U32)data->offset_float_max > (U32)I32_MAX)
   489	         165    		data->offset_float_max = I32_MAX;
   490	      119249    	    if (data->flags & SF_BEFORE_EOL)
   491	       36670    		data->flags
   492					    |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
   493				    else
   494	       82579    		data->flags &= ~SF_FL_BEFORE_EOL;
   495				}
   496			    }
   497	     4127328        SvCUR_set(data->last_found, 0);
   498			    {
   499	     4127328    	SV * const sv = data->last_found;
   500	     4127328    	MAGIC * const mg =
   501	     4127328    	    SvUTF8(sv) && SvMAGICAL(sv) ? mg_find(sv, PERL_MAGIC_utf8) : NULL;
   502	     4127328    	if (mg && mg->mg_len > 0)
   503	        3116    	    mg->mg_len = 0;
   504			    }
   505	     4127328        data->last_end = -1;
   506	     4127328        data->flags &= ~SF_BEFORE_EOL;
   507			}
   508			
   509			/* Can match anything (initialization) */
   510			STATIC void
   511			S_cl_anything(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
   512	      355067    {
   513	      355067        ANYOF_CLASS_ZERO(cl);
   514	      355067        ANYOF_BITMAP_SETALL(cl);
   515	      355067        cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
   516	      355067        if (LOC)
   517	         587    	cl->flags |= ANYOF_LOCALE;
   518			}
   519			
   520			/* Can match anything (initialization) */
   521			STATIC int
   522			S_cl_is_anything(pTHX_ const struct regnode_charclass_class *cl)
   523	       30671    {
   524	       30671        int value;
   525			
   526	      552078        for (value = 0; value <= ANYOF_MAX; value += 2)
   527	      521407    	if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
   528	      ######    	    return 1;
   529	       30671        if (!(cl->flags & ANYOF_UNICODE_ALL))
   530	        8672    	return 0;
   531	       21999        if (!ANYOF_BITMAP_TESTALLSET(cl))
   532	       20539    	return 0;
   533	        1460        return 1;
   534			}
   535			
   536			/* Can match anything (initialization) */
   537			STATIC void
   538			S_cl_init(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
   539	      329336    {
   540	      329336        Zero(cl, 1, struct regnode_charclass_class);
   541	      329336        cl->type = ANYOF;
   542	      329336        cl_anything(pRExC_state, cl);
   543			}
   544			
   545			STATIC void
   546			S_cl_init_zero(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
   547	       15477    {
   548	       15477        Zero(cl, 1, struct regnode_charclass_class);
   549	       15477        cl->type = ANYOF;
   550	       15477        cl_anything(pRExC_state, cl);
   551	       15477        if (LOC)
   552	          19    	cl->flags |= ANYOF_LOCALE;
   553			}
   554			
   555			/* 'And' a given class with another one.  Can create false positives */
   556			/* We assume that cl is not inverted */
   557			STATIC void
   558			S_cl_and(pTHX_ struct regnode_charclass_class *cl,
   559				const struct regnode_charclass_class *and_with)
   560	       90580    {
   561	       90580        if (!(and_with->flags & ANYOF_CLASS)
   562				&& !(cl->flags & ANYOF_CLASS)
   563				&& (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
   564				&& !(and_with->flags & ANYOF_FOLD)
   565				&& !(cl->flags & ANYOF_FOLD)) {
   566	       90269    	int i;
   567			
   568	       90269    	if (and_with->flags & ANYOF_INVERT)
   569	        3465    	    for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
   570	        3360    		cl->bitmap[i] &= ~and_with->bitmap[i];
   571				else
   572	     2975412    	    for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
   573	     2885248    		cl->bitmap[i] &= and_with->bitmap[i];
   574			    } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
   575	       90580        if (!(and_with->flags & ANYOF_EOS))
   576	       31316    	cl->flags &= ~ANYOF_EOS;
   577			
   578	       90580        if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
   579				!(and_with->flags & ANYOF_INVERT)) {
   580	           8    	cl->flags &= ~ANYOF_UNICODE_ALL;
   581	           8    	cl->flags |= ANYOF_UNICODE;
   582	           8    	ARG_SET(cl, ARG(and_with));
   583			    }
   584	       90580        if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
   585				!(and_with->flags & ANYOF_INVERT))
   586	       19465    	cl->flags &= ~ANYOF_UNICODE_ALL;
   587	       90580        if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
   588				!(and_with->flags & ANYOF_INVERT))
   589	       19457    	cl->flags &= ~ANYOF_UNICODE;
   590			}
   591			
   592			/* 'OR' a given class with another one.  Can create false positives */
   593			/* We assume that cl is not inverted */
   594			STATIC void
   595			S_cl_or(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
   596	      108420    {
   597	      108420        if (or_with->flags & ANYOF_INVERT) {
   598				/* We do not use
   599				 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
   600				 *   <= (B1 | !B2) | (CL1 | !CL2)
   601				 * which is wasteful if CL2 is small, but we ignore CL2:
   602				 *   (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
   603				 * XXXX Can we handle case-fold?  Unclear:
   604				 *   (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
   605				 *   (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
   606				 */
   607	         400    	if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
   608				     && !(or_with->flags & ANYOF_FOLD)
   609				     && !(cl->flags & ANYOF_FOLD) ) {
   610	         400    	    int i;
   611			
   612	       13200    	    for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
   613	       12800    		cl->bitmap[i] |= ~or_with->bitmap[i];
   614				} /* XXXX: logic is complicated otherwise */
   615				else {
   616	      ######    	    cl_anything(pRExC_state, cl);
   617				}
   618			    } else {
   619				/* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
   620	      108020    	if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
   621				     && (!(or_with->flags & ANYOF_FOLD)
   622					 || (cl->flags & ANYOF_FOLD)) ) {
   623	       97826    	    int i;
   624			
   625				    /* OR char bitmap and class bitmap separately */
   626	     3228258    	    for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
   627	     3130432    		cl->bitmap[i] |= or_with->bitmap[i];
   628	       97826    	    if (or_with->flags & ANYOF_CLASS) {
   629	      ######    		for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
   630	      ######    		    cl->classflags[i] |= or_with->classflags[i];
   631	      ######    		cl->flags |= ANYOF_CLASS;
   632				    }
   633				}
   634				else { /* XXXX: logic is complicated, leave it along for a moment. */
   635	       10194    	    cl_anything(pRExC_state, cl);
   636				}
   637			    }
   638	      108420        if (or_with->flags & ANYOF_EOS)
   639	        6291    	cl->flags |= ANYOF_EOS;
   640			
   641	      108420        if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
   642				ARG(cl) != ARG(or_with)) {
   643	      ######    	cl->flags |= ANYOF_UNICODE_ALL;
   644	      ######    	cl->flags &= ~ANYOF_UNICODE;
   645			    }
   646	      108420        if (or_with->flags & ANYOF_UNICODE_ALL) {
   647	       46210    	cl->flags |= ANYOF_UNICODE_ALL;
   648	       46210    	cl->flags &= ~ANYOF_UNICODE;
   649			    }
   650			}
   651			
   652			/*
   653			
   654			 make_trie(startbranch,first,last,tail,flags)
   655			  startbranch: the first branch in the whole branch sequence
   656			  first      : start branch of sequence of branch-exact nodes.
   657				       May be the same as startbranch
   658			  last       : Thing following the last branch.
   659				       May be the same as tail.
   660			  tail       : item following the branch sequence
   661			  flags      : currently the OP() type we will be building one of /EXACT(|F|Fl)/
   662			
   663			Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
   664			
   665			A trie is an N'ary tree where the branches are determined by digital
   666			decomposition of the key. IE, at the root node you look up the 1st character and
   667			follow that branch repeat until you find the end of the branches. Nodes can be
   668			marked as "accepting" meaning they represent a complete word. Eg:
   669			
   670			  /he|she|his|hers/
   671			
   672			would convert into the following structure. Numbers represent states, letters
   673			following numbers represent valid transitions on the letter from that state, if
   674			the number is in square brackets it represents an accepting state, otherwise it
   675			will be in parenthesis.
   676			
   677			      +-h->+-e->[3]-+-r->(8)-+-s->[9]
   678			      |    |
   679			      |   (2)
   680			      |    |
   681			     (1)   +-i->(6)-+-s->[7]
   682			      |
   683			      +-s->(3)-+-h->(4)-+-e->[5]
   684			
   685			      Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
   686			
   687			This shows that when matching against the string 'hers' we will begin at state 1
   688			read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
   689			then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
   690			is also accepting. Thus we know that we can match both 'he' and 'hers' with a
   691			single traverse. We store a mapping from accepting to state to which word was
   692			matched, and then when we have multiple possibilities we try to complete the
   693			rest of the regex in the order in which they occured in the alternation.
   694			
   695			The only prior NFA like behaviour that would be changed by the TRIE support is
   696			the silent ignoring of duplicate alternations which are of the form:
   697			
   698			 / (DUPE|DUPE) X? (?{ ... }) Y /x
   699			
   700			Thus EVAL blocks follwing a trie may be called a different number of times with
   701			and without the optimisation. With the optimisations dupes will be silently
   702			ignored. This inconsistant behaviour of EVAL type nodes is well established as
   703			the following demonstrates:
   704			
   705			 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
   706			
   707			which prints out 'word' three times, but
   708			
   709			 'words'=~/(word|word|word)(?{ print $1 })S/
   710			
   711			which doesnt print it out at all. This is due to other optimisations kicking in.
   712			
   713			Example of what happens on a structural level:
   714			
   715			The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
   716			
   717			   1: CURLYM[1] {1,32767}(18)
   718			   5:   BRANCH(8)
   719			   6:     EXACT <ac>(16)
   720			   8:   BRANCH(11)
   721			   9:     EXACT <ad>(16)
   722			  11:   BRANCH(14)
   723			  12:     EXACT <ab>(16)
   724			  16:   SUCCEED(0)
   725			  17:   NOTHING(18)
   726			  18: END(0)
   727			
   728			This would be optimizable with startbranch=5, first=5, last=16, tail=16
   729			and should turn into:
   730			
   731			   1: CURLYM[1] {1,32767}(18)
   732			   5:   TRIE(16)
   733				[Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
   734				  <ac>
   735				  <ad>
   736				  <ab>
   737			  16:   SUCCEED(0)
   738			  17:   NOTHING(18)
   739			  18: END(0)
   740			
   741			Cases where tail != last would be like /(?foo|bar)baz/:
   742			
   743			   1: BRANCH(4)
   744			   2:   EXACT <foo>(8)
   745			   4: BRANCH(7)
   746			   5:   EXACT <bar>(8)
   747			   7: TAIL(8)
   748			   8: EXACT <baz>(10)
   749			  10: END(0)
   750			
   751			which would be optimizable with startbranch=1, first=1, last=7, tail=8
   752			and would end up looking like:
   753			
   754			    1: TRIE(8)
   755			      [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
   756				<foo>
   757				<bar>
   758			   7: TAIL(8)
   759			   8: EXACT <baz>(10)
   760			  10: END(0)
   761			
   762			*/
   763			
   764			#define TRIE_DEBUG_CHAR                                                    \
   765			    DEBUG_TRIE_COMPILE_r({                                                 \
   766				SV *tmp;                                                           \
   767				if ( UTF ) {                                                       \
   768				    tmp = newSVpvn( "", 0 );                                       \
   769				    pv_uni_display( tmp, uc, len, 60, UNI_DISPLAY_REGEX );         \
   770				} else {                                                           \
   771				    tmp = Perl_newSVpvf_nocontext( "%c", (int)uvc );               \
   772				}                                                                  \
   773				av_push( trie->revcharmap, tmp );                                  \
   774			    })
   775			
   776			#define TRIE_READ_CHAR STMT_START {                                           \
   777			    if ( UTF ) {                                                              \
   778				if ( folder ) {                                                       \
   779				    if ( foldlen > 0 ) {                                              \
   780				       uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags );     \
   781				       foldlen -= len;                                                \
   782				       scan += len;                                                   \
   783				       len = 0;                                                       \
   784				    } else {                                                          \
   785					uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
   786					uvc = to_uni_fold( uvc, foldbuf, &foldlen );                  \
   787					foldlen -= UNISKIP( uvc );                                    \
   788					scan = foldbuf + UNISKIP( uvc );                              \
   789				    }                                                                 \
   790				} else {                                                              \
   791				    uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
   792				}                                                                     \
   793			    } else {                                                                  \
   794				uvc = (U32)*uc;                                                       \
   795				len = 1;                                                              \
   796			    }                                                                         \
   797			} STMT_END
   798			
   799			
   800			#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
   801			#define TRIE_LIST_CUR(state)  ( TRIE_LIST_ITEM( state, 0 ).forid )
   802			#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
   803			#define TRIE_LIST_USED(idx)  ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
   804			
   805			#define TRIE_LIST_PUSH(state,fid,ns) STMT_START {               \
   806			    if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) {    \
   807				TRIE_LIST_LEN( state ) *= 2;                            \
   808				Renew( trie->states[ state ].trans.list,                \
   809				       TRIE_LIST_LEN( state ), reg_trie_trans_le );     \
   810			    }                                                           \
   811			    TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid;     \
   812			    TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns;   \
   813			    TRIE_LIST_CUR( state )++;                                   \
   814			} STMT_END
   815			
   816			#define TRIE_LIST_NEW(state) STMT_START {                       \
   817			    Newz( 1023, trie->states[ state ].trans.list,               \
   818				4, reg_trie_trans_le );                                 \
   819			     TRIE_LIST_CUR( state ) = 1;                                \
   820			     TRIE_LIST_LEN( state ) = 4;                                \
   821			} STMT_END
   822			
   823			STATIC I32
   824			S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 flags)
   825	       15392    {
   826			    dVAR;
   827			    /* first pass, loop through and scan words */
   828	       15392        reg_trie_data *trie;
   829	       15392        regnode *cur;
   830	       15392        const U32 uniflags = ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY;
   831	       15392        STRLEN len = 0;
   832	       15392        UV uvc = 0;
   833	       15392        U16 curword = 0;
   834	       15392        U32 next_alloc = 0;
   835			    /* we just use folder as a flag in utf8 */
   836	       15392        const U8 * const folder = ( flags == EXACTF
   837			                       ? PL_fold
   838			                       : ( flags == EXACTFL
   839			                           ? PL_fold_locale
   840			                           : NULL
   841			                         )
   842	       15392                         );
   843			
   844	       15392        const U32 data_slot = add_data( pRExC_state, 1, "t" );
   845	       15392        SV *re_trie_maxbuff;
   846			
   847	       15392        GET_RE_DEBUG_FLAGS_DECL;
   848			
   849	       15392        Newz( 848200, trie, 1, reg_trie_data );
   850	       15392        trie->refcount = 1;
   851	       15392        RExC_rx->data->data[ data_slot ] = (void*)trie;
   852	       15392        Newz( 848201, trie->charmap, 256, U16 );
   853			    DEBUG_r({
   854			        trie->words = newAV();
   855			        trie->revcharmap = newAV();
   856	       15392        });
   857			
   858			
   859	       15392        re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
   860	       15392        if (!SvIOK(re_trie_maxbuff)) {
   861	      ######            sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
   862			    }
   863			
   864			    /*  -- First loop and Setup --
   865			
   866			       We first traverse the branches and scan each word to determine if it
   867			       contains widechars, and how many unique chars there are, this is
   868			       important as we have to build a table with at least as many columns as we
   869			       have unique chars.
   870			
   871			       We use an array of integers to represent the character codes 0..255
   872			       (trie->charmap) and we use a an HV* to store unicode characters. We use the
   873			       native representation of the character value as the key and IV's for the
   874			       coded index.
   875			
   876			       *TODO* If we keep track of how many times each character is used we can
   877			       remap the columns so that the table compression later on is more
   878			       efficient in terms of memory by ensuring most common value is in the
   879			       middle and the least common are on the outside.  IMO this would be better
   880			       than a most to least common mapping as theres a decent chance the most
   881			       common letter will share a node with the least common, meaning the node
   882			       will not be compressable. With a middle is most common approach the worst
   883			       case is when we have the least common nodes twice.
   884			
   885			     */
   886			
   887			
   888	       99503        for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
   889	       84111            regnode *noper = NEXTOPER( cur );
   890	       84111            const U8 *uc = (U8*)STRING( noper );
   891	       84111            const U8 * const e  = uc + STR_LEN( noper );
   892	       84111            STRLEN foldlen = 0;
   893	       84111            U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
   894	       84111            const U8 *scan = (U8*)NULL;
   895			
   896	      856085            for ( ; uc < e ; uc += len ) {
   897	      385987                trie->charcount++;
   898	      385987                TRIE_READ_CHAR;
   899	      385987                if ( uvc < 256 ) {
   900	      366238                    if ( !trie->charmap[ uvc ] ) {
   901	      163021                        trie->charmap[ uvc ]=( ++trie->uniquecharcount );
   902	      163021                        if ( folder )
   903	       31759                            trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
   904	      163021                        TRIE_DEBUG_CHAR;
   905			                }
   906			            } else {
   907	       19749                    SV** svpp;
   908	       19749                    if ( !trie->widecharmap )
   909	           5                        trie->widecharmap = newHV();
   910			
   911	       19749                    svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 1 );
   912			
   913	       19749                    if ( !svpp )
   914	      ######                        Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
   915			
   916	       19749                    if ( !SvTRUE( *svpp ) ) {
   917	       19749                        sv_setiv( *svpp, ++trie->uniquecharcount );
   918	       19749                        TRIE_DEBUG_CHAR;
   919			                }
   920			            }
   921			        }
   922	       84111            trie->wordcount++;
   923			    } /* end first pass */
   924			    DEBUG_TRIE_COMPILE_r(
   925			        PerlIO_printf( Perl_debug_log, "TRIE(%s): W:%d C:%d Uq:%d \n",
   926			                ( trie->widecharmap ? "UTF8" : "NATIVE" ), trie->wordcount,
   927			                (int)trie->charcount, trie->uniquecharcount )
   928	       15392        );
   929			
   930			
   931			    /*
   932			        We now know what we are dealing with in terms of unique chars and
   933			        string sizes so we can calculate how much memory a naive
   934			        representation using a flat table  will take. If it's over a reasonable
   935			        limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
   936			        conservative but potentially much slower representation using an array
   937			        of lists.
   938			
   939			        At the end we convert both representations into the same compressed
   940			        form that will be used in regexec.c for matching with. The latter
   941			        is a form that cannot be used to construct with but has memory
   942			        properties similar to the list form and access properties similar
   943			        to the table form making it both suitable for fast searches and
   944			        small enough that its feasable to store for the duration of a program.
   945			
   946			        See the comment in the code where the compressed table is produced
   947			        inplace from the flat tabe representation for an explanation of how
   948			        the compression works.
   949			
   950			    */
   951			
   952			
   953	       15392        if ( (IV)( ( trie->charcount + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
   954			        /*
   955			            Second Pass -- Array Of Lists Representation
   956			
   957			            Each state will be represented by a list of charid:state records
   958			            (reg_trie_trans_le) the first such element holds the CUR and LEN
   959			            points of the allocated array. (See defines above).
   960			
   961			            We build the initial structure using the lists, and then convert
   962			            it into the compressed table form which allows faster lookups
   963			            (but cant be modified once converted).
   964			
   965			
   966			        */
   967			
   968			
   969	           1            STRLEN transcount = 1;
   970			
   971	           1            Newz( 848204, trie->states, trie->charcount + 2, reg_trie_state );
   972	           1            TRIE_LIST_NEW(1);
   973	           1            next_alloc = 2;
   974			
   975	       19752            for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
   976			
   977	       19751            regnode *noper   = NEXTOPER( cur );
   978	       19751            U8 *uc           = (U8*)STRING( noper );
   979	       19751            const U8 * const e = uc + STR_LEN( noper );
   980	       19751            U32 state        = 1;         /* required init */
   981	       19751            U16 charid       = 0;         /* sanity init */
   982	       19751            U8 *scan         = (U8*)NULL; /* sanity init */
   983	       19751            STRLEN foldlen   = 0;         /* required init */
   984	       39520            U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
   985			
   986			
   987	       59289            for ( ; uc < e ; uc += len ) {
   988			
   989	       19769                TRIE_READ_CHAR;
   990			
   991	       19769                if ( uvc < 256 ) {
   992	          24                    charid = trie->charmap[ uvc ];
   993			            } else {
   994	       19745                    SV** svpp=(SV**)NULL;
   995	       19745                    svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
   996	       19745                    if ( !svpp ) {
   997	      ######                        charid = 0;
   998			                } else {
   999	       19745                        charid=(U16)SvIV( *svpp );
  1000			                }
  1001			            }
  1002	       19769                if ( charid ) {
  1003			
  1004	       19769                    U16 check;
  1005	       19769                    U32 newstate = 0;
  1006			
  1007	       19769                    charid--;
  1008	       19769                    if ( !trie->states[ state ].trans.list ) {
  1009	          18                        TRIE_LIST_NEW( state );
  1010			                }
  1011	   195060894                    for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
  1012	   195041125                        if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
  1013	      ######                            newstate = TRIE_LIST_ITEM( state, check ).newstate;
  1014	      ######                            break;
  1015			                    }
  1016					}
  1017	       19769    		if ( ! newstate ) {
  1018	       19769    		    newstate = next_alloc++;
  1019	       19769    		    TRIE_LIST_PUSH( state, charid, newstate );
  1020	       19769    		    transcount++;
  1021					}
  1022	       19769    		state = newstate;
  1023			            } else {
  1024	      ######                    Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
  1025			            }
  1026			            /* charid is now 0 if we dont know the char read, or nonzero if we do */
  1027			        }
  1028			
  1029	       19751            if ( !trie->states[ state ].wordnum ) {
  1030			            /* we havent inserted this word into the structure yet. */
  1031	       19751                trie->states[ state ].wordnum = ++curword;
  1032			
  1033			            DEBUG_r({
  1034			                /* store the word for dumping */
  1035			                SV* tmp = newSVpvn( STRING( noper ), STR_LEN( noper ) );
  1036			                if ( UTF ) SvUTF8_on( tmp );
  1037			                av_push( trie->words, tmp );
  1038	       19751                });
  1039			
  1040			        } else {
  1041			            /* Its a dupe. So ignore it. */
  1042			        }
  1043			
  1044			        } /* end second pass */
  1045			
  1046	           1            trie->laststate = next_alloc;
  1047	           1            Renew( trie->states, next_alloc, reg_trie_state );
  1048			
  1049			        DEBUG_TRIE_COMPILE_MORE_r({
  1050			            U32 state;
  1051			
  1052			            /* print out the table precompression.  */
  1053			
  1054			            PerlIO_printf( Perl_debug_log, "\nState :Word | Transition Data\n" );
  1055			            PerlIO_printf( Perl_debug_log,   "------:-----+-----------------" );
  1056			
  1057			            for( state=1 ; state < next_alloc ; state ++ ) {
  1058					U16 charid;
  1059			
  1060			                PerlIO_printf( Perl_debug_log, "\n %04"UVXf" :", (UV)state  );
  1061			                if ( ! trie->states[ state ].wordnum ) {
  1062			                    PerlIO_printf( Perl_debug_log, "%5s| ","");
  1063			                } else {
  1064			                    PerlIO_printf( Perl_debug_log, "W%04x| ",
  1065			                        trie->states[ state ].wordnum
  1066			                    );
  1067			                }
  1068			                for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
  1069			                    SV **tmp = av_fetch( trie->revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
  1070			                    PerlIO_printf( Perl_debug_log, "%s:%3X=%04"UVXf" | ",
  1071			                        SvPV_nolen_const( *tmp ),
  1072			                        TRIE_LIST_ITEM(state,charid).forid,
  1073			                        (UV)TRIE_LIST_ITEM(state,charid).newstate
  1074			                    );
  1075			                }
  1076			
  1077			            }
  1078			            PerlIO_printf( Perl_debug_log, "\n\n" );
  1079	           1            });
  1080			
  1081	           1            Newz( 848203, trie->trans, transcount ,reg_trie_trans );
  1082			        {
  1083	           1                U32 state;
  1084	           1                U32 tp = 0;
  1085	           1                U32 zp = 0;
  1086			
  1087			
  1088	       19771                for( state=1 ; state < next_alloc ; state ++ ) {
  1089	       19770                    U32 base=0;
  1090			
  1091			                /*
  1092			                DEBUG_TRIE_COMPILE_MORE_r(
  1093			                    PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
  1094			                );
  1095			                */
  1096			
  1097	       19770                    if (trie->states[state].trans.list) {
  1098	          19                        U16 minid=TRIE_LIST_ITEM( state, 1).forid;
  1099	          19                        U16 maxid=minid;
  1100	          19    		    U16 idx;
  1101			
  1102	       19769                        for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
  1103	       19750                            if ( TRIE_LIST_ITEM( state, idx).forid < minid ) {
  1104	      ######                                minid=TRIE_LIST_ITEM( state, idx).forid;
  1105	       19750                            } else if ( TRIE_LIST_ITEM( state, idx).forid > maxid ) {
  1106	       19749                                maxid=TRIE_LIST_ITEM( state, idx).forid;
  1107			                        }
  1108			                    }
  1109	          19                        if ( transcount < tp + maxid - minid + 1) {
  1110	      ######                            transcount *= 2;
  1111	      ######                            Renew( trie->trans, transcount, reg_trie_trans );
  1112	      ######                            Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
  1113			                    }
  1114	          19                        base = trie->uniquecharcount + tp - minid;
  1115	          19                        if ( maxid == minid ) {
  1116	          18                            U32 set = 0;
  1117	       39534                            for ( ; zp < tp ; zp++ ) {
  1118	       19765                                if ( ! trie->trans[ zp ].next ) {
  1119	           7                                    base = trie->uniquecharcount + zp - minid;
  1120	           7                                    trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
  1121	           7                                    trie->trans[ zp ].check = state;
  1122	           7                                    set = 1;
  1123	           7                                    break;
  1124			                            }
  1125			                        }
  1126	          18                            if ( !set ) {
  1127	          11                                trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
  1128	          11                                trie->trans[ tp ].check = state;
  1129	          11                                tp++;
  1130	          11                                zp = tp;
  1131			                        }
  1132			                    } else {
  1133	       19752                            for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
  1134	       19751                                U32 tid = base -  trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
  1135	       19751                                trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
  1136	       19751                                trie->trans[ tid ].check = state;
  1137			                        }
  1138	           1                            tp += ( maxid - minid + 1 );
  1139			                    }
  1140	          19                        Safefree(trie->states[ state ].trans.list);
  1141			                }
  1142			                /*
  1143			                DEBUG_TRIE_COMPILE_MORE_r(
  1144			                    PerlIO_printf( Perl_debug_log, " base: %d\n",base);
  1145			                );
  1146			                */
  1147	       19770                    trie->states[ state ].trans.base=base;
  1148			            }
  1149	           1                trie->lasttrans = tp + 1;
  1150			        }
  1151			    } else {
  1152			        /*
  1153			           Second Pass -- Flat Table Representation.
  1154			
  1155			           we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
  1156			           We know that we will need Charcount+1 trans at most to store the data
  1157			           (one row per char at worst case) So we preallocate both structures
  1158			           assuming worst case.
  1159			
  1160			           We then construct the trie using only the .next slots of the entry
  1161			           structs.
  1162			
  1163			           We use the .check field of the first entry of the node  temporarily to
  1164			           make compression both faster and easier by keeping track of how many non
  1165			           zero fields are in the node.
  1166			
  1167			           Since trans are numbered from 1 any 0 pointer in the table is a FAIL
  1168			           transition.
  1169			
  1170			           There are two terms at use here: state as a TRIE_NODEIDX() which is a
  1171			           number representing the first entry of the node, and state as a
  1172			           TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
  1173			           TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
  1174			           are 2 entrys per node. eg:
  1175			
  1176			             A B       A B
  1177			          1. 2 4    1. 3 7
  1178			          2. 0 3    3. 0 5
  1179			          3. 0 0    5. 0 0
  1180			          4. 0 0    7. 0 0
  1181			
  1182			           The table is internally in the right hand, idx form. However as we also
  1183			           have to deal with the states array which is indexed by nodenum we have to
  1184			           use TRIE_NODENUM() to convert.
  1185			
  1186			        */
  1187			
  1188			        Newz( 848203, trie->trans, ( trie->charcount + 1 ) * trie->uniquecharcount + 1,
  1189	       15391                  reg_trie_trans );
  1190	       15391            Newz( 848204, trie->states, trie->charcount + 2, reg_trie_state );
  1191	       15391            next_alloc = trie->uniquecharcount + 1;
  1192			
  1193	       79751            for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
  1194			
  1195	       64360                regnode *noper   = NEXTOPER( cur );
  1196	       64360    	    const U8 *uc     = (U8*)STRING( noper );
  1197	       64360    	    const U8 * const e = uc + STR_LEN( noper );
  1198			
  1199	       64360                U32 state        = 1;         /* required init */
  1200			
  1201	       64360                U16 charid       = 0;         /* sanity init */
  1202	       64360                U32 accept_state = 0;         /* sanity init */
  1203	       64360                U8 *scan         = (U8*)NULL; /* sanity init */
  1204			
  1205	       64360                STRLEN foldlen   = 0;         /* required init */
  1206	      430578                U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
  1207			
  1208			
  1209	      796796                for ( ; uc < e ; uc += len ) {
  1210			
  1211	      366218                    TRIE_READ_CHAR;
  1212			
  1213	      366218                    if ( uvc < 256 ) {
  1214	      366214                        charid = trie->charmap[ uvc ];
  1215			                } else {
  1216	           4                        SV** svpp=(SV**)NULL;
  1217	           4                        svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
  1218	           4                        if ( !svpp ) {
  1219	      ######                            charid = 0;
  1220			                    } else {
  1221	           4                            charid=(U16)SvIV( *svpp );
  1222			                    }
  1223			                }
  1224	      366218                    if ( charid ) {
  1225	      366218                        charid--;
  1226	      366218                        if ( !trie->trans[ state + charid ].next ) {
  1227	      338225                            trie->trans[ state + charid ].next = next_alloc;
  1228	      338225                            trie->trans[ state ].check++;
  1229	      338225                            next_alloc += trie->uniquecharcount;
  1230			                    }
  1231	      366218                        state = trie->trans[ state + charid ].next;
  1232			                } else {
  1233	      ######                        Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
  1234			                }
  1235			                /* charid is now 0 if we dont know the char read, or nonzero if we do */
  1236			            }
  1237			
  1238	       64360                accept_state = TRIE_NODENUM( state );
  1239	       64360                if ( !trie->states[ accept_state ].wordnum ) {
  1240			                /* we havent inserted this word into the structure yet. */
  1241	       64305                    trie->states[ accept_state ].wordnum = ++curword;
  1242			
  1243			                DEBUG_r({
  1244			                    /* store the word for dumping */
  1245			                    SV* tmp = newSVpvn( STRING( noper ), STR_LEN( noper ) );
  1246			                    if ( UTF ) SvUTF8_on( tmp );
  1247			                    av_push( trie->words, tmp );
  1248	       64305                    });
  1249			
  1250			            } else {
  1251			                /* Its a dupe. So ignore it. */
  1252			            }
  1253			
  1254			        } /* end second pass */
  1255			
  1256			        DEBUG_TRIE_COMPILE_MORE_r({
  1257			            /*
  1258			               print out the table precompression so that we can do a visual check
  1259			               that they are identical.
  1260			             */
  1261			            U32 state;
  1262			            U16 charid;
  1263			            PerlIO_printf( Perl_debug_log, "\nChar : " );
  1264			
  1265			            for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
  1266			                SV **tmp = av_fetch( trie->revcharmap, charid, 0);
  1267			                if ( tmp ) {
  1268			                  PerlIO_printf( Perl_debug_log, "%4.4s ", SvPV_nolen_const( *tmp ) );
  1269			                }
  1270			            }
  1271			
  1272			            PerlIO_printf( Perl_debug_log, "\nState+-" );
  1273			
  1274			            for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
  1275			                PerlIO_printf( Perl_debug_log, "%4s-", "----" );
  1276			            }
  1277			
  1278			            PerlIO_printf( Perl_debug_log, "\n" );
  1279			
  1280			            for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
  1281			
  1282			                PerlIO_printf( Perl_debug_log, "%04"UVXf" : ", (UV)TRIE_NODENUM( state ) );
  1283			
  1284			                for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
  1285			                    PerlIO_printf( Perl_debug_log, "%04"UVXf" ",
  1286			                        (UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next ) );
  1287			                }
  1288			                if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
  1289			                    PerlIO_printf( Perl_debug_log, " (%04"UVXf")\n", (UV)trie->trans[ state ].check );
  1290			                } else {
  1291			                    PerlIO_printf( Perl_debug_log, " (%04"UVXf") W%04X\n", (UV)trie->trans[ state ].check,
  1292			                    trie->states[ TRIE_NODENUM( state ) ].wordnum );
  1293			                }
  1294			            }
  1295			            PerlIO_printf( Perl_debug_log, "\n\n" );
  1296	       15391            });
  1297			        {
  1298			        /*
  1299			           * Inplace compress the table.*
  1300			
  1301			           For sparse data sets the table constructed by the trie algorithm will
  1302			           be mostly 0/FAIL transitions or to put it another way mostly empty.
  1303			           (Note that leaf nodes will not contain any transitions.)
  1304			
  1305			           This algorithm compresses the tables by eliminating most such
  1306			           transitions, at the cost of a modest bit of extra work during lookup:
  1307			
  1308			           - Each states[] entry contains a .base field which indicates the
  1309			           index in the state[] array wheres its transition data is stored.
  1310			
  1311			           - If .base is 0 there are no  valid transitions from that node.
  1312			
  1313			           - If .base is nonzero then charid is added to it to find an entry in
  1314			           the trans array.
  1315			
  1316			           -If trans[states[state].base+charid].check!=state then the
  1317			           transition is taken to be a 0/Fail transition. Thus if there are fail
  1318			           transitions at the front of the node then the .base offset will point
  1319			           somewhere inside the previous nodes data (or maybe even into a node
  1320			           even earlier), but the .check field determines if the transition is
  1321			           valid.
  1322			
  1323			           The following process inplace converts the table to the compressed
  1324			           table: We first do not compress the root node 1,and mark its all its
  1325			           .check pointers as 1 and set its .base pointer as 1 as well. This
  1326			           allows to do a DFA construction from the compressed table later, and
  1327			           ensures that any .base pointers we calculate later are greater than
  1328			           0.
  1329			
  1330			           - We set 'pos' to indicate the first entry of the second node.
  1331			
  1332			           - We then iterate over the columns of the node, finding the first and
  1333			           last used entry at l and m. We then copy l..m into pos..(pos+m-l),
  1334			           and set the .check pointers accordingly, and advance pos
  1335			           appropriately and repreat for the next node. Note that when we copy
  1336			           the next pointers we have to convert them from the original
  1337			           NODEIDX form to NODENUM form as the former is not valid post
  1338			           compression.
  1339			
  1340			           - If a node has no transitions used we mark its base as 0 and do not
  1341			           advance the pos pointer.
  1342			
  1343			           - If a node only has one transition we use a second pointer into the
  1344			           structure to fill in allocated fail transitions from other states.
  1345			           This pointer is independent of the main pointer and scans forward
  1346			           looking for null transitions that are allocated to a state. When it
  1347			           finds one it writes the single transition into the "hole".  If the
  1348			           pointer doesnt find one the single transition is appeneded as normal.
  1349			
  1350			           - Once compressed we can Renew/realloc the structures to release the
  1351			           excess space.
  1352			
  1353			           See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
  1354			           specifically Fig 3.47 and the associated pseudocode.
  1355			
  1356			           demq
  1357			        */
  1358	       15391            const U32 laststate = TRIE_NODENUM( next_alloc );
  1359	       15391    	U32 state, charid;
  1360	       15391            U32 pos = 0, zp=0;
  1361	       15391            trie->laststate = laststate;
  1362			
  1363	      369007            for ( state = 1 ; state < laststate ; state++ ) {
  1364	      353616                U8 flag = 0;
  1365	      353616    	    const U32 stateidx = TRIE_NODEIDX( state );
  1366	      353616    	    const U32 o_used = trie->trans[ stateidx ].check;
  1367	      353616    	    U32 used = trie->trans[ stateidx ].check;
  1368	      353616                trie->trans[ stateidx ].check = 0;
  1369			
  1370	     2182545                for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
  1371	     2096004                    if ( flag || trie->trans[ stateidx + charid ].next ) {
  1372	      442259                        if ( trie->trans[ stateidx + charid ].next ) {
  1373	      338225                            if (o_used == 1) {
  1374	      403473                                for ( ; zp < pos ; zp++ ) {
  1375	      172037                                    if ( ! trie->trans[ zp ].next ) {
  1376	      103838                                        break;
  1377			                                }
  1378			                            }
  1379	      267075                                trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
  1380	      267075                                trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
  1381	      267075                                trie->trans[ zp ].check = state;
  1382	      267075                                if ( ++zp > pos ) pos = zp;
  1383	      163237                                break;
  1384			                        }
  1385	       71150                            used--;
  1386			                    }
  1387	      175184                        if ( !flag ) {
  1388	       24640                            flag = 1;
  1389	       24640                            trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
  1390			                    }
  1391	      175184                        trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
  1392	      175184                        trie->trans[ pos ].check = state;
  1393	      175184                        pos++;
  1394			                }
  1395			            }
  1396			        }
  1397	       15391            trie->lasttrans = pos + 1;
  1398	       15391            Renew( trie->states, laststate + 1, reg_trie_state);
  1399			        DEBUG_TRIE_COMPILE_MORE_r(
  1400			                PerlIO_printf( Perl_debug_log,
  1401					    " Alloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
  1402					    (int)( ( trie->charcount + 1 ) * trie->uniquecharcount + 1 ),
  1403					    (IV)next_alloc,
  1404					    (IV)pos,
  1405			                    ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
  1406	       15391                );
  1407			
  1408			        } /* end table compress */
  1409			    }
  1410			    /* resize the trans array to remove unused space */
  1411	       15392        Renew( trie->trans, trie->lasttrans, reg_trie_trans);
  1412			
  1413			    DEBUG_TRIE_COMPILE_r({
  1414			        U32 state;
  1415			        /*
  1416			           Now we print it out again, in a slightly different form as there is additional
  1417			           info we want to be able to see when its compressed. They are close enough for
  1418			           visual comparison though.
  1419			         */
  1420			        PerlIO_printf( Perl_debug_log, "\nChar : %-6s%-6s%-4s ","Match","Base","Ofs" );
  1421			
  1422			        for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
  1423			            SV **tmp = av_fetch( trie->revcharmap, state, 0);
  1424			            if ( tmp ) {
  1425			              PerlIO_printf( Perl_debug_log, "%4.4s ", SvPV_nolen_const( *tmp ) );
  1426			            }
  1427			        }
  1428			        PerlIO_printf( Perl_debug_log, "\n-----:-----------------------");
  1429			
  1430			        for( state = 0 ; state < trie->uniquecharcount ; state++ )
  1431			            PerlIO_printf( Perl_debug_log, "-----");
  1432			        PerlIO_printf( Perl_debug_log, "\n");
  1433			
  1434			        for( state = 1 ; state < trie->laststate ; state++ ) {
  1435				    const U32 base = trie->states[ state ].trans.base;
  1436			
  1437			            PerlIO_printf( Perl_debug_log, "#%04"UVXf" ", (UV)state);
  1438			
  1439			            if ( trie->states[ state ].wordnum ) {
  1440			                PerlIO_printf( Perl_debug_log, " W%04X", trie->states[ state ].wordnum );
  1441			            } else {
  1442			                PerlIO_printf( Perl_debug_log, "%6s", "" );
  1443			            }
  1444			
  1445			            PerlIO_printf( Perl_debug_log, " @%04"UVXf" ", (UV)base );
  1446			
  1447			            if ( base ) {
  1448			                U32 ofs = 0;
  1449			
  1450			                while( ( base + ofs  < trie->uniquecharcount ) ||
  1451			                       ( base + ofs - trie->uniquecharcount < trie->lasttrans
  1452			                         && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
  1453			                        ofs++;
  1454			
  1455			                PerlIO_printf( Perl_debug_log, "+%02"UVXf"[ ", (UV)ofs);
  1456			
  1457			                for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
  1458			                    if ( ( base + ofs >= trie->uniquecharcount ) &&
  1459			                         ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
  1460			                         trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
  1461			                    {
  1462			                       PerlIO_printf( Perl_debug_log, "%04"UVXf" ",
  1463			                        (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
  1464			                    } else {
  1465			                        PerlIO_printf( Perl_debug_log, "%4s ","   0" );
  1466			                    }
  1467			                }
  1468			
  1469			                PerlIO_printf( Perl_debug_log, "]");
  1470			
  1471			            }
  1472			            PerlIO_printf( Perl_debug_log, "\n" );
  1473			        }
  1474	       15392        });
  1475			
  1476			    {
  1477			        /* now finally we "stitch in" the new TRIE node
  1478			           This means we convert either the first branch or the first Exact,
  1479			           depending on whether the thing following (in 'last') is a branch
  1480			           or not and whther first is the startbranch (ie is it a sub part of
  1481			           the alternation or is it the whole thing.)
  1482			           Assuming its a sub part we conver the EXACT otherwise we convert
  1483			           the whole branch sequence, including the first.
  1484			        */
  1485	       15392            regnode *convert;
  1486			
  1487			
  1488			
  1489			
  1490	       15392            if ( first == startbranch && OP( last ) != BRANCH ) {
  1491	       14480                convert = first;
  1492			        } else {
  1493	         912                convert = NEXTOPER( first );
  1494	         912                NEXT_OFF( first ) = (U16)(last - first);
  1495			        }
  1496			
  1497	       15392            OP( convert ) = TRIE + (U8)( flags - EXACT );
  1498	       15392            NEXT_OFF( convert ) = (U16)(tail - convert);
  1499	       15392            ARG_SET( convert, data_slot );
  1500			
  1501			        /* tells us if we need to handle accept buffers specially */
  1502	       15392            convert->flags = ( RExC_seen_evals ? 1 : 0 );
  1503			
  1504			
  1505			        /* needed for dumping*/
  1506			        DEBUG_r({
  1507			            regnode *optimize = convert + NODE_STEP_REGNODE + regarglen[ TRIE ];
  1508			            /* We now need to mark all of the space originally used by the
  1509			               branches as optimized away. This keeps the dumpuntil from
  1510			               throwing a wobbly as it doesnt use regnext() to traverse the
  1511			               opcodes.
  1512			             */
  1513			            while( optimize < last ) {
  1514			                OP( optimize ) = OPTIMIZED;
  1515			                optimize++;
  1516			            }
  1517	       15392            });
  1518			    } /* end node insert */
  1519	       15392        return 1;
  1520			}
  1521			
  1522			
  1523			
  1524			/*
  1525			 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
  1526			 * These need to be revisited when a newer toolchain becomes available.
  1527			 */
  1528			#if defined(__sparc64__) && defined(__GNUC__)
  1529			#   if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
  1530			#       undef  SPARC64_GCC_WORKAROUND
  1531			#       define SPARC64_GCC_WORKAROUND 1
  1532			#   endif
  1533			#endif
  1534			
  1535			/* REx optimizer.  Converts nodes into quickier variants "in place".
  1536			   Finds fixed substrings.  */
  1537			
  1538			/* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
  1539			   to the position after last scanned or to NULL. */
  1540			
  1541			
  1542			STATIC I32
  1543			S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, regnode *last, scan_data_t *data, U32 flags, U32 depth)
  1544						/* scanp: Start here (read-write). */
  1545						/* deltap: Write maxlen-minlen here. */
  1546						/* last: Stop before this one. */
  1547	     2296461    {
  1548	     2296461        I32 min = 0, pars = 0, code;
  1549	     2296461        regnode *scan = *scanp, *next;
  1550	     2296461        I32 delta = 0;
  1551	     2296461        int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
  1552	     2296461        int is_inf_internal = 0;		/* The studied chunk is infinite */
  1553	     2296461        I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
  1554	     2296461        scan_data_t data_fake;
  1555	     2296461        struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
  1556	     2296461        SV *re_trie_maxbuff = NULL;
  1557			
  1558	     2296461        GET_RE_DEBUG_FLAGS_DECL;
  1559			
  1560	     5625604        while (scan && OP(scan) != END && scan < last) {
  1561				/* Peephole optimizer: */
  1562				DEBUG_OPTIMISE_r({
  1563				  SV *mysv=sv_newmortal();
  1564				  regprop( mysv, scan);
  1565				  PerlIO_printf(Perl_debug_log, "%*speep: %s (0x%08"UVXf")\n",
  1566				    (int)depth*2, "", SvPV_nolen_const(mysv), PTR2UV(scan));
  1567	     3408651    	});
  1568			
  1569	     3408651    	if (PL_regkind[(U8)OP(scan)] == EXACT) {
  1570				    /* Merge several consecutive EXACTish nodes into one. */
  1571	     2162619    	    regnode *n = regnext(scan);
  1572	     2162619    	    U32 stringok = 1;
  1573			#ifdef DEBUGGING
  1574	     2162619    	    regnode *stop = scan;
  1575			#endif
  1576			
  1577	     2162619    	    next = scan + NODE_SZ_STR(scan);
  1578				    /* Skip NOTHING, merge EXACT*. */
  1579	     2185523    	    while (n &&
  1580					   ( PL_regkind[(U8)OP(n)] == NOTHING ||
  1581					     (stringok && (OP(n) == OP(scan))))
  1582					   && NEXT_OFF(n)
  1583					   && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
  1584	       25416    		if (OP(n) == TAIL || n > next)
  1585	       14998    		    stringok = 0;
  1586	       25416    		if (PL_regkind[(U8)OP(n)] == NOTHING) {
  1587	       20224    		    NEXT_OFF(scan) += NEXT_OFF(n);
  1588	       20224    		    next = n + NODE_STEP_REGNODE;
  1589			#ifdef DEBUGGING
  1590	       20224    		    if (stringok)
  1591	        5186    			stop = n;
  1592			#endif
  1593	       20224    		    n = regnext(n);
  1594					}
  1595	        5192    		else if (stringok) {
  1596	        5192    		    const int oldl = STR_LEN(scan);
  1597	        5192    		    regnode *nnext = regnext(n);
  1598			
  1599	        5192    		    if (oldl + STR_LEN(n) > U8_MAX)
  1600	        2512    			break;
  1601	        2680    		    NEXT_OFF(scan) += NEXT_OFF(n);
  1602	        2680    		    STR_LEN(scan) += STR_LEN(n);
  1603	        2680    		    next = n + NODE_SZ_STR(n);
  1604					    /* Now we can overwrite *n : */
  1605	        2680    		    Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
  1606			#ifdef DEBUGGING
  1607	        2680    		    stop = next - 1;
  1608			#endif
  1609	        2680    		    n = nnext;
  1610					}
  1611				    }
  1612			
  1613	     2162619    	    if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
  1614			/*
  1615			  Two problematic code points in Unicode casefolding of EXACT nodes:
  1616			
  1617			   U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
  1618			   U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
  1619			
  1620			   which casefold to
  1621			
  1622			   Unicode			UTF-8
  1623			
  1624			   U+03B9 U+0308 U+0301		0xCE 0xB9 0xCC 0x88 0xCC 0x81
  1625			   U+03C5 U+0308 U+0301		0xCF 0x85 0xCC 0x88 0xCC 0x81
  1626			
  1627			   This means that in case-insensitive matching (or "loose matching",
  1628			   as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
  1629			   length of the above casefolded versions) can match a target string
  1630			   of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
  1631			   This would rather mess up the minimum length computation.
  1632			
  1633			   What we'll do is to look for the tail four bytes, and then peek
  1634			   at the preceding two bytes to see whether we need to decrease
  1635			   the minimum length by four (six minus two).
  1636			
  1637			   Thanks to the design of UTF-8, there cannot be false matches:
  1638			   A sequence of valid UTF-8 bytes cannot be a subsequence of
  1639			   another valid sequence of UTF-8 bytes.
  1640			
  1641			*/
  1642	         385    		 char *s0 = STRING(scan), *s, *t;
  1643	         385    		 char *s1 = s0 + STR_LEN(scan) - 1, *s2 = s1 - 4;
  1644	         385    		 const char * const t0 = "\xcc\x88\xcc\x81";
  1645	         385    		 const char * const t1 = t0 + 3;
  1646			
  1647	         403    		 for (s = s0 + 2;
  1648					      s < s2 && (t = ninstr(s, s1, t0, t1));
  1649					      s = t + 4) {
  1650	          18    		      if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
  1651						  ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
  1652	          18    			   min -= 4;
  1653					 }
  1654				    }
  1655			
  1656			#ifdef DEBUGGING
  1657				    /* Allow dumping */
  1658	     2162619    	    n = scan + NODE_SZ_STR(scan);
  1659	     2170556    	    while (n <= stop) {
  1660	        7937    		if (PL_regkind[(U8)OP(n)] != NOTHING || OP(n) == NOTHING) {
  1661	        2810    		    OP(n) = OPTIMIZED;
  1662	        2810    		    NEXT_OFF(n) = 0;
  1663					}
  1664	        7937    		n++;
  1665				    }
  1666			#endif
  1667				}
  1668			
  1669			
  1670			
  1671				/* Follow the next-chain of the current node and optimize
  1672				   away all the NOTHINGs from it.  */
  1673	     3408651    	if (OP(scan) != CURLYX) {
  1674	     3329151    	    const int max = (reg_off_by_arg[OP(scan)]
  1675					       ? I32_MAX
  1676					       /* I32 may be smaller than U16 on CRAYs! */
  1677	     3329151    		       : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
  1678	     3329151    	    int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
  1679	     3329151    	    int noff;
  1680	     3329151    	    regnode *n = scan;
  1681				
  1682				    /* Skip NOTHING and LONGJMP. */
  1683	     3362404    	    while ((n = regnext(n))
  1684					   && ((PL_regkind[(U8)OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
  1685					       || ((OP(n) == LONGJMP) && (noff = ARG(n))))
  1686					   && off + noff < max)
  1687	       33253    		off += noff;
  1688	     3329151    	    if (reg_off_by_arg[OP(scan)])
  1689	       30401    		ARG(scan) = off;
  1690				    else
  1691	     3298750    		NEXT_OFF(scan) = off;
  1692				}
  1693			
  1694				/* The principal pseudo-switch.  Cannot be a switch, since we
  1695				   look into several different things.  */
  1696	     3408651    	if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
  1697					   || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
  1698	       26824    	    next = regnext(scan);
  1699	       26824    	    code = OP(scan);
  1700				    /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
  1701				
  1702	       26824    	    if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
  1703	       26824    		I32 max1 = 0, min1 = I32_MAX, num = 0;
  1704	       26824    		struct regnode_charclass_class accum;
  1705	       26824    		regnode *startbranch=scan;
  1706					
  1707	       26824    		if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
  1708	       18518    		    scan_commit(pRExC_state, data); /* Cannot merge strings after this. */
  1709	       26824    		if (flags & SCF_DO_STCLASS)
  1710	       15477    		    cl_init_zero(pRExC_state, &accum);
  1711			
  1712	      177009    		while (OP(scan) == code) {
  1713	      151051    		    I32 deltanext, minnext, f = 0, fake;
  1714	      151051    		    struct regnode_charclass_class this_class;
  1715			
  1716	      151051    		    num++;
  1717	      151051    		    data_fake.flags = 0;
  1718	      151051    		    if (data) {		
  1719	      150941    			data_fake.whilem_c = data->whilem_c;
  1720	      150941    			data_fake.last_closep = data->last_closep;
  1721					    }
  1722					    else
  1723	         110    			data_fake.last_closep = &fake;
  1724	      151051    		    next = regnext(scan);
  1725	      151051    		    scan = NEXTOPER(scan);
  1726	      151051    		    if (code != BRANCH)
  1727	       42430    			scan = NEXTOPER(scan);
  1728	      151051    		    if (flags & SCF_DO_STCLASS) {
  1729	       85193    			cl_init(pRExC_state, &this_class);
  1730	       85193    			data_fake.start_class = &this_class;
  1731	       85193    			f = SCF_DO_STCLASS_AND;
  1732					    }		
  1733	      151051    		    if (flags & SCF_WHILEM_VISITED_POS)
  1734	      109745    			f |= SCF_WHILEM_VISITED_POS;
  1735			
  1736					    /* we suppose the run is continuous, last=next...*/
  1737	      151051    		    minnext = study_chunk(pRExC_state, &scan, &deltanext,
  1738								  next, &data_fake, f,depth+1);
  1739	      151051    		    if (min1 > minnext)
  1740	       39907    			min1 = minnext;
  1741	      151051    		    if (max1 < minnext + deltanext)
  1742	       41042    			max1 = minnext + deltanext;
  1743	      151051    		    if (deltanext == I32_MAX)
  1744	        6999    			is_inf = is_inf_internal = 1;
  1745	      151051    		    scan = next;
  1746	      151051    		    if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
  1747	        4061    			pars++;
  1748	      151051    		    if (data && (data_fake.flags & SF_HAS_EVAL))
  1749	         756    			data->flags |= SF_HAS_EVAL;
  1750	      151051    		    if (data)
  1751	      150941    			data->whilem_c = data_fake.whilem_c;
  1752	      151051    		    if (flags & SCF_DO_STCLASS)
  1753	       85193    			cl_or(pRExC_state, &accum, &this_class);
  1754	      151051    		    if (code == SUSPEND)
  1755	       26824    			break;
  1756					}
  1757	       26824    		if (code == IFTHEN && num < 2) /* Empty ELSE branch */
  1758	         112    		    min1 = 0;
  1759	       26824    		if (flags & SCF_DO_SUBSTR) {
  1760	       18518    		    data->pos_min += min1;
  1761	       18518    		    data->pos_delta += max1 - min1;
  1762	       18518    		    if (max1 != min1 || is_inf)
  1763	       16719    			data->longest = &(data->longest_float);
  1764					}
  1765	       26824    		min += min1;
  1766	       26824    		delta += max1 - min1;
  1767	       26824    		if (flags & SCF_DO_STCLASS_OR) {
  1768	        4866    		    cl_or(pRExC_state, data->start_class, &accum);
  1769	        4866    		    if (min1) {
  1770	        4652    			cl_and(data->start_class, &and_with);
  1771	        4652    			flags &= ~SCF_DO_STCLASS;
  1772					    }
  1773					}
  1774	       21958    		else if (flags & SCF_DO_STCLASS_AND) {
  1775	       10611    		    if (min1) {
  1776	       10098    			cl_and(data->start_class, &accum);
  1777	       10098    			flags &= ~SCF_DO_STCLASS;
  1778					    }
  1779					    else {
  1780						/* Switch to OR mode: cache the old value of
  1781						 * data->start_class */
  1782						StructCopy(data->start_class, &and_with,
  1783	         513    				   struct regnode_charclass_class);
  1784	         513    			flags &= ~SCF_DO_STCLASS_AND;
  1785						StructCopy(&accum, data->start_class,
  1786	         513    				   struct regnode_charclass_class);
  1787	         513    			flags |= SCF_DO_STCLASS_OR;
  1788	         513    			data->start_class->flags |= ANYOF_EOS;
  1789					    }
  1790					}
  1791			
  1792					/* demq.
  1793			
  1794					   Assuming this was/is a branch we are dealing with: 'scan' now
  1795					   points at the item that follows the branch sequence, whatever
  1796					   it is. We now start at the beginning of the sequence and look
  1797					   for subsequences of
  1798			
  1799					   BRANCH->EXACT=>X
  1800					   BRANCH->EXACT=>X
  1801			
  1802					   which would be constructed from a pattern like /A|LIST|OF|WORDS/
  1803			
  1804					   If we can find such a subseqence we need to turn the first
  1805					   element into a trie and then add the subsequent branch exact
  1806					   strings to the trie.
  1807			
  1808					   We have two cases
  1809			
  1810					     1. patterns where the whole set of branch can be converted to a trie,
  1811			
  1812					     2. patterns where only a subset of the alternations can be
  1813					     converted to a trie.
  1814			
  1815					   In case 1 we can replace the whole set with a single regop
  1816					   for the trie. In case 2 we need to keep the start and end
  1817					   branchs so
  1818			
  1819					     'BRANCH EXACT; BRANCH EXACT; BRANCH X'
  1820					     becomes BRANCH TRIE; BRANCH X;
  1821			
  1822					   Hypthetically when we know the regex isnt anchored we can
  1823					   turn a case 1 into a DFA and let it rip... Every time it finds a match
  1824					   it would just call its tail, no WHILEM/CURLY needed.
  1825			
  1826					*/
  1827	       26824    		if (DO_TRIE) {
  1828	       26824    		    if (!re_trie_maxbuff) {
  1829	       26416    			re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
  1830	       26416    			if (!SvIOK(re_trie_maxbuff))
  1831	        1826    			    sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
  1832					    }
  1833	       26824                        if ( SvIV(re_trie_maxbuff)>=0 && OP( startbranch )==BRANCH ) {
  1834	       25644                            regnode *cur;
  1835	       25644                            regnode *first = (regnode *)NULL;
  1836	       25644                            regnode *last = (regnode *)NULL;
  1837	       25644                            regnode *tail = scan;
  1838	       25644                            U8 optype = 0;
  1839	       25644                            U32 count=0;
  1840			
  1841			#ifdef DEBUGGING
  1842	       25644                            SV *mysv = sv_newmortal();       /* for dumping */
  1843			#endif
  1844			                        /* var tail is used because there may be a TAIL
  1845			                           regop in the way. Ie, the exacts will point to the
  1846			                           thing following the TAIL, but the last branch will
  1847			                           point at the TAIL. So we advance tail. If we
  1848			                           have nested (?:) we may have to move through several
  1849			                           tails.
  1850			                         */
  1851			
  1852	       32035                            while ( OP( tail ) == TAIL ) {
  1853			                            /* this is the TAIL generated by (?:) */
  1854	        6391                                tail = regnext( tail );
  1855			                        }
  1856			
  1857			                        DEBUG_OPTIMISE_r({
  1858			                            regprop( mysv, tail );
  1859			                            PerlIO_printf( Perl_debug_log, "%*s%s%s%s\n",
  1860			                                (int)depth * 2 + 2, "", "Tail node is:", SvPV_nolen_const( mysv ),
  1861			                                (RExC_seen_evals) ? "[EVAL]" : ""
  1862			                            );
  1863	       25644                            });
  1864			                        /*
  1865			
  1866			                           step through the branches, cur represents each
  1867			                           branch, noper is the first thing to be matched
  1868			                           as part of that branch and noper_next is the
  1869			                           regnext() of that node. if noper is an EXACT
  1870			                           and noper_next is the same as scan (our current
  1871			                           position in the regex) then the EXACT branch is
  1872			                           a possible optimization target. Once we have
  1873			                           two or more consequetive such branches we can
  1874			                           create a trie of the EXACT's contents and stich
  1875			                           it in place. If the sequence represents all of
  1876			                           the branches we eliminate the whole thing and
  1877			                           replace it with a single TRIE. If it is a
  1878			                           subsequence then we need to stitch it in. This
  1879			                           means the first branch has to remain, and needs
  1880			                           to be repointed at the item on the branch chain
  1881			                           following the last branch optimized. This could
  1882			                           be either a BRANCH, in which case the
  1883			                           subsequence is internal, or it could be the
  1884			                           item following the branch sequence in which
  1885			                           case the subsequence is at the end.
  1886			
  1887			                        */
  1888			
  1889			                        /* dont use tail as the end marker for this traverse */
  1890	      134265                            for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
  1891	      108621                                regnode *noper = NEXTOPER( cur );
  1892	      108621                                regnode *noper_next = regnext( noper );
  1893			
  1894			                            DEBUG_OPTIMISE_r({
  1895			                                regprop( mysv, cur);
  1896			                                PerlIO_printf( Perl_debug_log, "%*s%s",
  1897			                                   (int)depth * 2 + 2,"  ", SvPV_nolen_const( mysv ) );
  1898			
  1899			                                regprop( mysv, noper);
  1900			                                PerlIO_printf( Perl_debug_log, " -> %s",
  1901			                                    SvPV_nolen_const(mysv));
  1902			
  1903			                                if ( noper_next ) {
  1904			                                  regprop( mysv, noper_next );
  1905			                                  PerlIO_printf( Perl_debug_log,"\t=> %s\t",
  1906			                                    SvPV_nolen_const(mysv));
  1907			                                }
  1908			                                PerlIO_printf( Perl_debug_log, "0x%p,0x%p,0x%p)\n",
  1909			                                   first, last, cur );
  1910	      108621                                });
  1911	      108621                                if ( ( first ? OP( noper ) == optype
  1912			                                         : PL_regkind[ (U8)OP( noper ) ] == EXACT )
  1913			                                  && noper_next == tail && count<U16_MAX)
  1914			                            {
  1915	       90622                                    count++;
  1916	       90622                                    if ( !first ) {
  1917	       21903                                        first = cur;
  1918	       21903                                        optype = OP( noper );
  1919			                                } else {
  1920			                                    DEBUG_OPTIMISE_r(
  1921			                                        if (!last ) {
  1922			                                            regprop( mysv, first);
  1923			                                            PerlIO_printf( Perl_debug_log, "%*s%s",
  1924			                                              (int)depth * 2 + 2, "F:", SvPV_nolen_const( mysv ) );
  1925			                                            regprop( mysv, NEXTOPER(first) );
  1926			                                            PerlIO_printf( Perl_debug_log, " -> %s\n",
  1927			                                              SvPV_nolen_const( mysv ) );
  1928			                                        }
  1929	       68719                                        );
  1930	       68719                                        last = cur;
  1931			                                    DEBUG_OPTIMISE_r({
  1932			                                        regprop( mysv, cur);
  1933			                                        PerlIO_printf( Perl_debug_log, "%*s%s",
  1934			                                          (int)depth * 2 + 2, "N:", SvPV_nolen_const( mysv ) );
  1935			                                        regprop( mysv, noper );
  1936			                                        PerlIO_printf( Perl_debug_log, " -> %s\n",
  1937			                                          SvPV_nolen_const( mysv ) );
  1938	       68719                                        });
  1939			                                }
  1940			                            } else {
  1941	       17999                                    if ( last ) {
  1942			                                    DEBUG_OPTIMISE_r(
  1943			                                        PerlIO_printf( Perl_debug_log, "%*s%s\n",
  1944			                                            (int)depth * 2 + 2, "E:", "**END**" );
  1945	         537                                        );
  1946	         537                                        make_trie( pRExC_state, startbranch, first, cur, tail, optype );
  1947			                                }
  1948	       17999                                    if ( PL_regkind[ (U8)OP( noper ) ] == EXACT
  1949			                                     && noper_next == tail )
  1950			                                {
  1951	      ######                                        count = 1;
  1952	      ######                                        first = cur;
  1953	      ######                                        optype = OP( noper );
  1954			                                } else {
  1955	       17999                                        count = 0;
  1956	       17999                                        first = NULL;
  1957	       17999                                        optype = 0;
  1958			                                }
  1959	       17999                                    last = NULL;
  1960			                            }
  1961			                        }
  1962			                        DEBUG_OPTIMISE_r({
  1963			                            regprop( mysv, cur);
  1964			                            PerlIO_printf( Perl_debug_log,
  1965			                              "%*s%s\t(0x%p,0x%p,0x%p)\n", (int)depth * 2 + 2,
  1966			                              "  ", SvPV_nolen_const( mysv ), first, last, cur);
  1967			
  1968	       25644                            });
  1969	       25644                            if ( last ) {
  1970			                            DEBUG_OPTIMISE_r(
  1971			                                PerlIO_printf( Perl_debug_log, "%*s%s\n",
  1972			                                    (int)depth * 2 + 2, "E:", "==END==" );
  1973	       14855                                );
  1974	       14855                                make_trie( pRExC_state, startbranch, first, scan, tail, optype );
  1975			                        }
  1976			                    }
  1977			                }
  1978				    }
  1979	      ######    	    else if ( code == BRANCHJ ) {  /* single branch is optimized. */
  1980	      ######    		scan = NEXTOPER(NEXTOPER(scan));
  1981				    } else			/* single branch is optimized. */
  1982	      ######    		scan = NEXTOPER(scan);
  1983	      ######    	    continue;
  1984				}
  1985	     3381827    	else if (OP(scan) == EXACT) {
  1986	      419993    	    I32 l = STR_LEN(scan);
  1987	      419993    	    UV uc = *((U8*)STRING(scan));
  1988	      419993    	    if (UTF) {
  1989	       22938    		const U8 * const s = (U8*)STRING(scan);
  1990	       22938    		l = utf8_length(s, s + l);
  1991	       22938    		uc = utf8_to_uvchr(s, NULL);
  1992				    }
  1993	      419993    	    min += l;
  1994	      419993    	    if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
  1995					/* The code below prefers earlier match for fixed
  1996					   offset, later match for variable offset.  */
  1997	      209710    		if (data->last_end == -1) { /* Update the start info. */
  1998	      101070    		    data->last_start_min = data->pos_min;
  1999	      101070     		    data->last_start_max = is_inf
  2000			 			? I32_MAX : data->pos_min + data->pos_delta;
  2001					}
  2002	      209710    		sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
  2003					{
  2004	      209710    		    SV * sv = data->last_found;
  2005	      209710    		    MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
  2006	      209710    			mg_find(sv, PERL_MAGIC_utf8) : NULL;
  2007	      209710    		    if (mg && mg->mg_len >= 0)
  2008	        1015    			mg->mg_len += utf8_length((U8*)STRING(scan),
  2009									  (U8*)STRING(scan)+STR_LEN(scan));
  2010					}
  2011	      209710    		if (UTF)
  2012	        3117    		    SvUTF8_on(data->last_found);
  2013	      209710    		data->last_end = data->pos_min + l;
  2014	      209710    		data->pos_min += l; /* As in the first entry. */
  2015	      209710    		data->flags &= ~SF_BEFORE_EOL;
  2016				    }
  2017	      419993    	    if (flags & SCF_DO_STCLASS_AND) {
  2018					/* Check whether it is compatible with what we know already! */
  2019	      180522    		int compat = 1;
  2020			
  2021	      180522    		if (uc >= 0x100 ||
  2022					    (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
  2023					    && !ANYOF_BITMAP_TEST(data->start_class, uc)
  2024					    && (!(data->start_class->flags & ANYOF_FOLD)
  2025						|| !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
  2026			                    )
  2027	       19802    		    compat = 0;
  2028	      180522    		ANYOF_CLASS_ZERO(data->start_class);
  2029	      180522    		ANYOF_BITMAP_ZERO(data->start_class);
  2030	      180522    		if (compat)
  2031	      160720    		    ANYOF_BITMAP_SET(data->start_class, uc);
  2032	      180522    		data->start_class->flags &= ~ANYOF_EOS;
  2033	      180522    		if (uc < 0x100)
  2034	      160720    		  data->start_class->flags &= ~ANYOF_UNICODE_ALL;
  2035				    }
  2036	      239471    	    else if (flags & SCF_DO_STCLASS_OR) {
  2037					/* false positive possible if the class is case-folded */
  2038	       17954    		if (uc < 0x100)
  2039	       17954    		    ANYOF_BITMAP_SET(data->start_class, uc);
  2040					else
  2041	      ######    		    data->start_class->flags |= ANYOF_UNICODE_ALL;
  2042	       17954    		data->start_class->flags &= ~ANYOF_EOS;
  2043	       17954    		cl_and(data->start_class, &and_with);
  2044				    }
  2045	      419993    	    flags &= ~SCF_DO_STCLASS;
  2046				}
  2047	     2961834    	else if (PL_regkind[(U8)OP(scan)] == EXACT) { /* But OP != EXACT! */
  2048	     1742626    	    I32 l = STR_LEN(scan);
  2049	     1742626    	    UV uc = *((U8*)STRING(scan));
  2050			
  2051				    /* Search for fixed substrings supports EXACT only. */
  2052	     1742626    	    if (flags & SCF_DO_SUBSTR)
  2053	     1721678    		scan_commit(pRExC_state, data);
  2054	     1742626    	    if (UTF) {
  2055	        8886    		U8 *s = (U8 *)STRING(scan);
  2056	        8886    		l = utf8_length(s, s + l);
  2057	        8886    		uc = utf8_to_uvchr(s, NULL);
  2058				    }
  2059	     1742626    	    min += l;
  2060	     1742626    	    if (data && (flags & SCF_DO_SUBSTR))
  2061	     1721678    		data->pos_min += l;
  2062	     1742626    	    if (flags & SCF_DO_STCLASS_AND) {
  2063					/* Check whether it is compatible with what we know already! */
  2064	       10399    		int compat = 1;
  2065			
  2066	       10399    		if (uc >= 0x100 ||
  2067					    (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
  2068					    && !ANYOF_BITMAP_TEST(data->start_class, uc)
  2069					     && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
  2070	           4    		    compat = 0;
  2071	       10399    		ANYOF_CLASS_ZERO(data->start_class);
  2072	       10399    		ANYOF_BITMAP_ZERO(data->start_class);
  2073	       10399    		if (compat) {
  2074	       10395    		    ANYOF_BITMAP_SET(data->start_class, uc);
  2075	       10395    		    data->start_class->flags &= ~ANYOF_EOS;
  2076	       10395    		    data->start_class->flags |= ANYOF_FOLD;
  2077	       10395    		    if (OP(scan) == EXACTFL)
  2078	           9    			data->start_class->flags |= ANYOF_LOCALE;
  2079					}
  2080				    }
  2081	     1732227    	    else if (flags & SCF_DO_STCLASS_OR) {
  2082	         423    		if (data->start_class->flags & ANYOF_FOLD) {
  2083					    /* false positive possible if the class is case-folded.
  2084					       Assume that the locale settings are the same... */
  2085	         188    		    if (uc < 0x100)
  2086	         188    			ANYOF_BITMAP_SET(data->start_class, uc);
  2087	         188    		    data->start_class->flags &= ~ANYOF_EOS;
  2088					}
  2089	         423    		cl_and(data->start_class, &and_with);
  2090				    }
  2091	     1742626    	    flags &= ~SCF_DO_STCLASS;
  2092				}
  2093	     1219208    	else if (strchr((const char*)PL_varies,OP(scan))) {
  2094	      338524    	    I32 mincount, maxcount, minnext, deltanext, fl = 0;
  2095	      338524    	    I32 f = flags, pos_before = 0;
  2096	      338524    	    regnode *oscan = scan;
  2097	      338524    	    struct regnode_charclass_class this_class;
  2098	      338524    	    struct regnode_charclass_class *oclass = NULL;
  2099	      338524    	    I32 next_is_eval = 0;
  2100			
  2101	      338524    	    switch (PL_regkind[(U8)OP(scan)]) {
  2102				    case WHILEM:		/* End of (?:...)* . */
  2103	       79500    		scan = NEXTOPER(scan);
  2104	       79500    		goto finish;
  2105				    case PLUS:
  2106	       54918    		if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
  2107	       47249    		    next = NEXTOPER(scan);
  2108	       47249    		    if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
  2109	       14092    			mincount = 1;
  2110	       14092    			maxcount = REG_INFTY;
  2111	       14092    			next = regnext(scan);
  2112	       14092    			scan = NEXTOPER(scan);
  2113	       14092    			goto do_curly;
  2114					    }
  2115					}
  2116	       40826    		if (flags & SCF_DO_SUBSTR)
  2117	       33157    		    data->pos_min++;
  2118	       40826    		min++;
  2119					/* Fall through. */
  2120				    case STAR:
  2121	      147004    		if (flags & SCF_DO_STCLASS) {
  2122	       39235    		    mincount = 0;
  2123	       39235    		    maxcount = REG_INFTY;
  2124	       39235    		    next = regnext(scan);
  2125	       39235    		    scan = NEXTOPER(scan);
  2126	       39235    		    goto do_curly;
  2127					}
  2128	      107769    		is_inf = is_inf_internal = 1;
  2129	      107769    		scan = regnext(scan);
  2130	      107769    		if (flags & SCF_DO_SUBSTR) {
  2131	       85561    		    scan_commit(pRExC_state, data); /* Cannot extend fixed substrings */
  2132	       85561    		    data->longest = &(data->longest_float);
  2133					}
  2134	       85561    		goto optimize_curly_tail;
  2135				    case CURLY:
  2136	       96170    		mincount = ARG1(scan);
  2137	       96170    		maxcount = ARG2(scan);
  2138	       96170    		next = regnext(scan);
  2139	       96170    		if (OP(scan) == CURLYX) {
  2140	       79500    		    I32 lp = (data ? *(data->last_closep) : 0);
  2141	       79500    		    scan->flags = ((lp <= U8_MAX) ? (U8)lp : U8_MAX);
  2142					}
  2143	       96170    		scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
  2144	       96170    		next_is_eval = (OP(scan) == EVAL);
  2145				      do_curly:
  2146	      149497    		if (flags & SCF_DO_SUBSTR) {
  2147	      134469    		    if (mincount == 0) scan_commit(pRExC_state,data); /* Cannot extend fixed substrings */
  2148	      134469    		    pos_before = data->pos_min;
  2149					}
  2150	      149497    		if (data) {
  2151	      149444    		    fl = data->flags;
  2152	      149444    		    data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
  2153	      149444    		    if (is_inf)
  2154	       24847    			data->flags |= SF_IS_INF;
  2155					}
  2156	      149497    		if (flags & SCF_DO_STCLASS) {
  2157	       76633    		    cl_init(pRExC_state, &this_class);
  2158	       76633    		    oclass = data->start_class;
  2159	       76633    		    data->start_class = &this_class;
  2160	       76633    		    f |= SCF_DO_STCLASS_AND;
  2161	       76633    		    f &= ~SCF_DO_STCLASS_OR;
  2162					}
  2163					/* These are the cases when once a subexpression
  2164					   fails at a particular position, it cannot succeed
  2165					   even after backtracking at the enclosing scope.
  2166					
  2167					   XXXX what if minimal match and we are at the
  2168					        initial run of {n,m}? */
  2169	      149497    		if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
  2170	        2135    		    f &= ~SCF_WHILEM_VISITED_POS;
  2171			
  2172					/* This will finish on WHILEM, setting scan, or on NULL: */
  2173	      149497    		minnext = study_chunk(pRExC_state, &scan, &deltanext, last, data,
  2174							      (mincount == 0
  2175								? (f & ~SCF_DO_SUBSTR) : f),depth+1);
  2176			
  2177	      149497    		if (flags & SCF_DO_STCLASS)
  2178	       76633    		    data->start_class = oclass;
  2179	      149497    		if (mincount == 0 || minnext == 0) {
  2180	      127180    		    if (flags & SCF_DO_STCLASS_OR) {
  2181	        9875    			cl_or(pRExC_state, data->start_class, &this_class);
  2182					    }
  2183	      117305    		    else if (flags & SCF_DO_STCLASS_AND) {
  2184						/* Switch to OR mode: cache the old value of
  2185						 * data->start_class */
  2186						StructCopy(data->start_class, &and_with,
  2187	       48116    				   struct regnode_charclass_class);
  2188	       48116    			flags &= ~SCF_DO_STCLASS_AND;
  2189						StructCopy(&this_class, data->start_class,
  2190	       48116    				   struct regnode_charclass_class);
  2191	       48116    			flags |= SCF_DO_STCLASS_OR;
  2192	       48116    			data->start_class->flags |= ANYOF_EOS;
  2193					    }
  2194					} else {		/* Non-zero len */
  2195	       22317    		    if (flags & SCF_DO_STCLASS_OR) {
  2196	        2629    			cl_or(pRExC_state, data->start_class, &this_class);
  2197	        2629    			cl_and(data->start_class, &and_with);
  2198					    }
  2199	       19688    		    else if (flags & SCF_DO_STCLASS_AND)
  2200	       16013    			cl_and(data->start_class, &this_class);
  2201	       22317    		    flags &= ~SCF_DO_STCLASS;
  2202					}
  2203	      149497    		if (!scan) 		/* It was not CURLYX, but CURLY. */
  2204	       69997    		    scan = next;
  2205	      149497    		if (ckWARN(WARN_REGEXP)
  2206					       /* ? quantifier ok, except for (?{ ... }) */
  2207					    && (next_is_eval || !(mincount == 0 && maxcount == 1))
  2208					    && (minnext == 0) && (deltanext == 0)
  2209					    && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
  2210					    && maxcount <= REG_INFTY/3) /* Complement check for big count */
  2211					{
  2212					    vWARN(RExC_parse,
  2213	      ######    			  "Quantifier unexpected on zero-length expression");
  2214					}
  2215			
  2216	      149497    		min += minnext * mincount;
  2217	      149497    		is_inf_internal |= ((maxcount == REG_INFTY
  2218							     && (minnext + deltanext) > 0)
  2219							    || deltanext == I32_MAX);
  2220	      149497    		is_inf |= is_inf_internal;
  2221	      149497    		delta += (minnext + deltanext) * maxcount - minnext * mincount;
  2222			
  2223					/* Try powerful optimization CURLYX => CURLYN. */
  2224	      149497    		if (  OP(oscan) == CURLYX && data
  2225					      && data->flags & SF_IN_PAR
  2226					      && !(data->flags & SF_HAS_EVAL)
  2227					      && !deltanext && minnext == 1 ) {
  2228					    /* Try to optimize to CURLYN.  */
  2229	        1970    		    regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
  2230	        1970    		    regnode *nxt1 = nxt;
  2231			#ifdef DEBUGGING
  2232	        1970    		    regnode *nxt2;
  2233			#endif
  2234			
  2235					    /* Skip open. */
  2236	        1970    		    nxt = regnext(nxt);
  2237	        1970    		    if (!strchr((const char*)PL_simple,OP(nxt))
  2238						&& !(PL_regkind[(U8)OP(nxt)] == EXACT
  2239						     && STR_LEN(nxt) == 1))
  2240	        1823    			goto nogo;
  2241			#ifdef DEBUGGING
  2242	        1823    		    nxt2 = nxt;
  2243			#endif
  2244	        1823    		    nxt = regnext(nxt);
  2245	        1823    		    if (OP(nxt) != CLOSE)
  2246	      ######    			goto nogo;
  2247					    /* Now we know that nxt2 is the only contents: */
  2248	        1823    		    oscan->flags = (U8)ARG(nxt);
  2249	        1823    		    OP(oscan) = CURLYN;
  2250	        1823    		    OP(nxt1) = NOTHING;	/* was OPEN. */
  2251			#ifdef DEBUGGING
  2252	        1823    		    OP(nxt1 + 1) = OPTIMIZED; /* was count. */
  2253	        1823    		    NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
  2254	        1823    		    NEXT_OFF(nxt2) = 0;	/* just for consistancy with CURLY. */
  2255	        1823    		    OP(nxt) = OPTIMIZED;	/* was CLOSE. */
  2256	        1823    		    OP(nxt + 1) = OPTIMIZED; /* was count. */
  2257	        1823    		    NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
  2258			#endif
  2259					}
  2260				      nogo:
  2261			
  2262					/* Try optimization CURLYX => CURLYM. */
  2263	      149497    		if (  OP(oscan) == CURLYX && data
  2264					      && !(data->flags & SF_HAS_PAR)
  2265					      && !(data->flags & SF_HAS_EVAL)
  2266					      && !deltanext	/* atom is fixed width */
  2267					      && minnext != 0	/* CURLYM can't handle zero width */
  2268					) {
  2269					    /* XXXX How to optimize if data == 0? */
  2270					    /* Optimize to a simpler form.  */
  2271	       52131    		    regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
  2272	       52131    		    regnode *nxt2;
  2273			
  2274	       52131    		    OP(oscan) = CURLYM;
  2275	       64313    		    while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
  2276						    && (OP(nxt2) != WHILEM))
  2277	       12182    			nxt = nxt2;
  2278	       52131    		    OP(nxt2)  = SUCCEED; /* Whas WHILEM */
  2279					    /* Need to optimize away parenths. */
  2280	       52131    		    if (data->flags & SF_IN_PAR) {
  2281						/* Set the parenth number.  */
  2282	        5280    			regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
  2283			
  2284	        5280    			if (OP(nxt) != CLOSE)
  2285	      ######    			    FAIL("Panic opt close");
  2286	        5280    			oscan->flags = (U8)ARG(nxt);
  2287	        5280    			OP(nxt1) = OPTIMIZED;	/* was OPEN. */
  2288	        5280    			OP(nxt) = OPTIMIZED;	/* was CLOSE. */
  2289			#ifdef DEBUGGING
  2290	        5280    			OP(nxt1 + 1) = OPTIMIZED; /* was count. */
  2291	        5280    			OP(nxt + 1) = OPTIMIZED; /* was count. */
  2292	        5280    			NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
  2293	        5280    			NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
  2294			#endif
  2295			#if 0
  2296						while ( nxt1 && (OP(nxt1) != WHILEM)) {
  2297						    regnode *nnxt = regnext(nxt1);
  2298						
  2299						    if (nnxt == nxt) {
  2300							if (reg_off_by_arg[OP(nxt1)])
  2301							    ARG_SET(nxt1, nxt2 - nxt1);
  2302							else if (nxt2 - nxt1 < U16_MAX)
  2303							    NEXT_OFF(nxt1) = nxt2 - nxt1;
  2304							else
  2305							    OP(nxt) = NOTHING;	/* Cannot beautify */
  2306						    }
  2307						    nxt1 = nnxt;
  2308						}
  2309			#endif
  2310						/* Optimize again: */
  2311	        5280    			study_chunk(pRExC_state, &nxt1, &deltanext, nxt,
  2312							    NULL, 0,depth+1);
  2313					    }
  2314					    else
  2315	       46851    			oscan->flags = 0;
  2316					}
  2317	       97366    		else if ((OP(oscan) == CURLYX)
  2318						 && (flags & SCF_WHILEM_VISITED_POS)
  2319						 /* See the comment on a similar expression above.
  2320						    However, this time it not a subexpression
  2321						    we care about, but the expression itself. */
  2322						 && (maxcount == REG_INFTY)
  2323						 && data && ++data->whilem_c < 16) {
  2324					    /* This stays as CURLYX, we can put the count/of pair. */
  2325					    /* Find WHILEM (as in regexec.c) */
  2326	        2414    		    regnode *nxt = oscan + NEXT_OFF(oscan);
  2327			
  2328	        2414    		    if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
  2329	      ######    			nxt += ARG(nxt);
  2330	        2414    		    PREVOPER(nxt)->flags = (U8)(data->whilem_c
  2331						| (RExC_whilem_seen << 4)); /* On WHILEM */
  2332					}
  2333	      149497    		if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
  2334	        7005    		    pars++;
  2335	      149497    		if (flags & SCF_DO_SUBSTR) {
  2336	      134469    		    SV *last_str = Nullsv;
  2337	      134469    		    int counted = mincount != 0;
  2338			
  2339	      134469    		    if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
  2340			#if defined(SPARC64_GCC_WORKAROUND)
  2341						I32 b = 0;
  2342						STRLEN l = 0;
  2343						const char *s = NULL;
  2344						I32 old = 0;
  2345			
  2346						if (pos_before >= data->last_start_min)
  2347						    b = pos_before;
  2348						else
  2349						    b = data->last_start_min;
  2350			
  2351						l = 0;
  2352						s = SvPV_const(data->last_found, l);
  2353						old = b - data->last_start_min;
  2354			
  2355			#else
  2356	        7976    			I32 b = pos_before >= data->last_start_min
  2357	        7976    			    ? pos_before : data->last_start_min;
  2358	        7976    			STRLEN l;
  2359	        7976    			const char *s = SvPV_const(data->last_found, l);
  2360	        7976    			I32 old = b - data->last_start_min;
  2361			#endif
  2362			
  2363	        7976    			if (UTF)
  2364	           5    			    old = utf8_hop((U8*)s, old) - (U8*)s;
  2365						
  2366	        7976    			l -= old;
  2367						/* Get the added string: */
  2368	        7976    			last_str = newSVpvn(s  + old, l);
  2369	        7976    			if (UTF)
  2370	           5    			    SvUTF8_on(last_str);
  2371	        7976    			if (deltanext == 0 && pos_before == b) {
  2372						    /* What was added is a constant string */
  2373	        7768    			    if (mincount > 1) {
  2374	         800    				SvGROW(last_str, (mincount * l) + 1);
  2375	         800    				repeatcpy(SvPVX(last_str) + l,
  2376								  SvPVX_const(last_str), l, mincount - 1);
  2377	         800    				SvCUR_set(last_str, SvCUR(last_str) * mincount);
  2378							/* Add additional parts. */
  2379							SvCUR_set(data->last_found,
  2380	         800    					  SvCUR(data->last_found) - l);
  2381	         800    				sv_catsv(data->last_found, last_str);
  2382							{
  2383	         800    				    SV * sv = data->last_found;
  2384	         800    				    MAGIC *mg =
  2385								SvUTF8(sv) && SvMAGICAL(sv) ?
  2386	         800    					mg_find(sv, PERL_MAGIC_utf8) : NULL;
  2387	         800    				    if (mg && mg->mg_len >= 0)
  2388	      ######    					mg->mg_len += CHR_SVLEN(last_str);
  2389							}
  2390	         800    				data->last_end += l * (mincount - 1);
  2391						    }
  2392						} else {
  2393						    /* start offset must point into the last copy */
  2394	         208    			    data->last_start_min += minnext * (mincount - 1);
  2395	         208    			    data->last_start_max += is_inf ? I32_MAX
  2396							: (maxcount - 1) * (minnext + data->pos_delta);
  2397						}
  2398					    }
  2399					    /* It is counted once already... */
  2400	      134469    		    data->pos_min += minnext * (mincount - counted);
  2401	      134469    		    data->pos_delta += - counted * deltanext +
  2402						(minnext + deltanext) * maxcount - minnext * mincount;
  2403	      134469    		    if (mincount != maxcount) {
  2404						 /* Cannot extend fixed substrings found inside
  2405						    the group.  */
  2406	      133279    			scan_commit(pRExC_state,data);
  2407	      133279    			if (mincount && last_str) {
  2408	        7165    			    sv_setsv(data->last_found, last_str);
  2409	        7165    			    data->last_end = data->pos_min;
  2410	        7165    			    data->last_start_min =
  2411							data->pos_min - CHR_SVLEN(last_str);
  2412	        7165    			    data->last_start_max = is_inf
  2413							? I32_MAX
  2414							: data->pos_min + data->pos_delta
  2415							- CHR_SVLEN(last_str);
  2416						}
  2417	      133279    			data->longest = &(data->longest_float);
  2418					    }
  2419	      134469    		    SvREFCNT_dec(last_str);
  2420					}
  2421	      149497    		if (data && (fl & SF_HAS_EVAL))
  2422	          76    		    data->flags |= SF_HAS_EVAL;
  2423				      optimize_curly_tail:
  2424	      257266    		if (OP(oscan) != CURLYX) {
  2425	      285848    		    while (PL_regkind[(U8)OP(next = regnext(oscan))] == NOTHING
  2426						   && NEXT_OFF(next))
  2427	       54128    			NEXT_OFF(oscan) += NEXT_OFF(next);
  2428					}
  2429	        1758    		continue;
  2430				    default:			/* REF and CLUMP only? */
  2431	        1758    		if (flags & SCF_DO_SUBSTR) {
  2432	        1436    		    scan_commit(pRExC_state,data);	/* Cannot expect anything... */
  2433	        1436    		    data->longest = &(data->longest_float);
  2434					}
  2435	        1758    		is_inf = is_inf_internal = 1;
  2436	        1758    		if (flags & SCF_DO_STCLASS_OR)
  2437	          60    		    cl_anything(pRExC_state, data->start_class);
  2438	        1758    		flags &= ~SCF_DO_STCLASS;
  2439	        1758    		break;
  2440				    }
  2441				}
  2442	      880684    	else if (strchr((const char*)PL_simple,OP(scan))) {
  2443	      128277    	    int value = 0;
  2444			
  2445	      128277    	    if (flags & SCF_DO_SUBSTR) {
  2446	       71144    		scan_commit(pRExC_state,data);
  2447	       71144    		data->pos_min++;
  2448				    }
  2449	      128277    	    min++;
  2450	      128277    	    if (flags & SCF_DO_STCLASS) {
  2451	       67462    		data->start_class->flags &= ~ANYOF_EOS;	/* No match on empty */
  2452			
  2453					/* Some of the logic below assumes that switching
  2454					   locale on will only add false positives. */
  2455	       67462    		switch (PL_regkind[(U8)OP(scan)]) {
  2456					case SANY:
  2457					default:
  2458					  do_default:
  2459					    /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
  2460	       12717    		    if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
  2461	      ######    			cl_anything(pRExC_state, data->start_class);
  2462	      ######    		    break;
  2463					case REG_ANY:
  2464	       18713    		    if (OP(scan) == SANY)
  2465	       12717    			goto do_default;
  2466	        5996    		    if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
  2467	      ######    			value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
  2468							 || (data->start_class->flags & ANYOF_CLASS));
  2469	      ######    			cl_anything(pRExC_state, data->start_class);
  2470					    }
  2471	        5996    		    if (flags & SCF_DO_STCLASS_AND || !value)
  2472	        5996    			ANYOF_BITMAP_CLEAR(data->start_class,'\n');
  2473	        5996    		    break;
  2474					case ANYOF:
  2475	       21438    		    if (flags & SCF_DO_STCLASS_AND)
  2476	       15581    			cl_and(data->start_class,
  2477						       (struct regnode_charclass_class*)scan);
  2478					    else
  2479	        5857    			cl_or(pRExC_state, data->start_class,
  2480						      (struct regnode_charclass_class*)scan);
  2481	        5857    		    break;
  2482					case ALNUM:
  2483	        2760    		    if (flags & SCF_DO_STCLASS_AND) {
  2484	        2697    			if (!(data->start_class->flags & ANYOF_LOCALE)) {
  2485	        2681    			    ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
  2486	      689017    			    for (value = 0; value < 256; value++)
  2487	      686336    				if (!isALNUM(value))
  2488	      517433    				    ANYOF_BITMAP_CLEAR(data->start_class, value);
  2489						}
  2490					    }
  2491					    else {
  2492	          63    			if (data->start_class->flags & ANYOF_LOCALE)
  2493	      ######    			    ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
  2494						else {
  2495	       16191    			    for (value = 0; value < 256; value++)
  2496	       16128    				if (isALNUM(value))
  2497	        3969    				    ANYOF_BITMAP_SET(data->start_class, value);			
  2498						}
  2499					    }
  2500	      ######    		    break;
  2501					case ALNUML:
  2502	      ######    		    if (flags & SCF_DO_STCLASS_AND) {
  2503	      ######    			if (data->start_class->flags & ANYOF_LOCALE)
  2504	      ######    			    ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
  2505					    }
  2506					    else {
  2507	      ######    			ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
  2508	      ######    			data->start_class->flags |= ANYOF_LOCALE;
  2509					    }
  2510	      ######    		    break;
  2511					case NALNUM:
  2512	        1289    		    if (flags & SCF_DO_STCLASS_AND) {
  2513	        1269    			if (!(data->start_class->flags & ANYOF_LOCALE)) {
  2514	        1269    			    ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
  2515	      326133    			    for (value = 0; value < 256; value++)
  2516	      324864    				if (isALNUM(value))
  2517	       79947    				    ANYOF_BITMAP_CLEAR(data->start_class, value);
  2518						}
  2519					    }
  2520					    else {
  2521	          20    			if (data->start_class->flags & ANYOF_LOCALE)
  2522	      ######    			    ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
  2523						else {
  2524	        5140    			    for (value = 0; value < 256; value++)
  2525	        5120    				if (!isALNUM(value))
  2526	        3860    				    ANYOF_BITMAP_SET(data->start_class, value);			
  2527						}
  2528					    }
  2529	      ######    		    break;
  2530					case NALNUML:
  2531	      ######    		    if (flags & SCF_DO_STCLASS_AND) {
  2532	      ######    			if (data->start_class->flags & ANYOF_LOCALE)
  2533	      ######    			    ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
  2534					    }
  2535					    else {
  2536	      ######    			data->start_class->flags |= ANYOF_LOCALE;
  2537	      ######    			ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
  2538					    }
  2539	      ######    		    break;
  2540					case SPACE:
  2541	       20185    		    if (flags & SCF_DO_STCLASS_AND) {
  2542	       20184    			if (!(data->start_class->flags & ANYOF_LOCALE)) {
  2543	       20157    			    ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
  2544	     5180349    			    for (value = 0; value < 256; value++)
  2545	     5160192    				if (!isSPACE(value))
  2546	     5059407    				    ANYOF_BITMAP_CLEAR(data->start_class, value);
  2547						}
  2548					    }
  2549					    else {
  2550	           1    			if (data->start_class->flags & ANYOF_LOCALE)
  2551	      ######    			    ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
  2552						else {
  2553	         257    			    for (value = 0; value < 256; value++)
  2554	         256    				if (isSPACE(value))
  2555	           5    				    ANYOF_BITMAP_SET(data->start_class, value);			
  2556						}
  2557					    }
  2558	      ######    		    break;
  2559					case SPACEL:
  2560	      ######    		    if (flags & SCF_DO_STCLASS_AND) {
  2561	      ######    			if (data->start_class->flags & ANYOF_LOCALE)
  2562	      ######    			    ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
  2563					    }
  2564					    else {
  2565	      ######    			data->start_class->flags |= ANYOF_LOCALE;
  2566	      ######    			ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
  2567					    }
  2568	      ######    		    break;
  2569					case NSPACE:
  2570	         510    		    if (flags & SCF_DO_STCLASS_AND) {
  2571	         327    			if (!(data->start_class->flags & ANYOF_LOCALE)) {
  2572	         324    			    ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
  2573	       83268    			    for (value = 0; value < 256; value++)
  2574	       82944    				if (isSPACE(value))
  2575	        1620    				    ANYOF_BITMAP_CLEAR(data->start_class, value);
  2576						}
  2577					    }
  2578					    else {
  2579	         183    			if (data->start_class->flags & ANYOF_LOCALE)
  2580	      ######    			    ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
  2581						else {
  2582	       47031    			    for (value = 0; value < 256; value++)
  2583	       46848    				if (!isSPACE(value))
  2584	       45933    				    ANYOF_BITMAP_SET(data->start_class, value);			
  2585						}
  2586					    }
  2587	      ######    		    break;
  2588					case NSPACEL:
  2589	      ######    		    if (flags & SCF_DO_STCLASS_AND) {
  2590	      ######    			if (data->start_class->flags & ANYOF_LOCALE) {
  2591	      ######    			    ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
  2592	      ######    			    for (value = 0; value < 256; value++)
  2593	      ######    				if (!isSPACE(value))
  2594	      ######    				    ANYOF_BITMAP_CLEAR(data->start_class, value);
  2595						}
  2596					    }
  2597					    else {
  2598	      ######    			data->start_class->flags |= ANYOF_LOCALE;
  2599	      ######    			ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
  2600					    }
  2601	      ######    		    break;
  2602					case DIGIT:
  2603	        2567    		    if (flags & SCF_DO_STCLASS_AND) {
  2604	        2559    			ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
  2605	      657663    			for (value = 0; value < 256; value++)
  2606	      655104    			    if (!isDIGIT(value))
  2607	      629514    				ANYOF_BITMAP_CLEAR(data->start_class, value);
  2608					    }
  2609					    else {
  2610	           8    			if (data->start_class->flags & ANYOF_LOCALE)
  2611	      ######    			    ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
  2612						else {
  2613	        2056    			    for (value = 0; value < 256; value++)
  2614	        2048    				if (isDIGIT(value))
  2615	          80    				    ANYOF_BITMAP_SET(data->start_class, value);			
  2616						}
  2617					    }
  2618	      ######    		    break;
  2619					case NDIGIT:
  2620	      ######    		    if (flags & SCF_DO_STCLASS_AND) {
  2621	      ######    			ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
  2622	      ######    			for (value = 0; value < 256; value++)
  2623	      ######    			    if (isDIGIT(value))
  2624	      ######    				ANYOF_BITMAP_CLEAR(data->start_class, value);
  2625					    }
  2626					    else {
  2627	      ######    			if (data->start_class->flags & ANYOF_LOCALE)
  2628	      ######    			    ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
  2629						else {
  2630	      ######    			    for (value = 0; value < 256; value++)
  2631	      ######    				if (!isDIGIT(value))
  2632	      ######    				    ANYOF_BITMAP_SET(data->start_class, value);			
  2633						}
  2634					    }
  2635	       67462    		    break;
  2636					}
  2637	       67462    		if (flags & SCF_DO_STCLASS_OR)
  2638	        6132    		    cl_and(data->start_class, &and_with);
  2639	       67462    		flags &= ~SCF_DO_STCLASS;
  2640				    }
  2641				}
  2642	      752407    	else if (PL_regkind[(U8)OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
  2643	       66783    	    data->flags |= (OP(scan) == MEOL
  2644						    ? SF_BEFORE_MEOL
  2645						    : SF_BEFORE_SEOL);
  2646				}
  2647	      685624    	else if (  PL_regkind[(U8)OP(scan)] == BRANCHJ
  2648					 /* Lookbehind, or need to calculate parens/evals/stclass: */
  2649					   && (scan->flags || data || (flags &