     1			/*
     2			Data structures for encoding transformations.
     3			
     4			Perl works internally in either a native 'byte' encoding or
     5			in UTF-8 encoded Unicode.  We have no immediate need for a "wchar_t"
     6			representation. When we do we can use utf8_to_uv().
     7			
     8			Most character encodings are either simple byte mappings or
     9			variable length multi-byte encodings. UTF-8 can be viewed as a
    10			rather extreme case of the latter.
    11			
    12			So to solve an important part of perl's encode needs we need to solve the
    13			"multi-byte -> multi-byte" case. The simple byte forms are then just degenerate
    14			case. (Where one of multi-bytes will usually be UTF-8.)
    15			
    16			The other type of encoding is a shift encoding where a prefix sequence
    17			determines what subsequent bytes mean. Such encodings have state.
    18			
    19			We also need to handle case where a character in one encoding has to be
    20			represented as multiple characters in the other. e.g. letter+diacritic.
    21			
    22			The process can be considered as pseudo perl:
    23			
    24			my $dst = '';
    25			while (length($src))
    26			 {
    27			  my $size    = $count($src);
    28			  my $in_seq  = substr($src,0,$size,'');
    29			  my $out_seq = $s2d_hash{$in_seq};
    30			  if (defined $out_seq)
    31			   {
    32			    $dst .= $out_seq;
    33			   }
    34			  else
    35			   {
    36			    # an error condition
    37			   }
    38			 }
    39			return $dst;
    40			
    41			That has the following components:
    42			 &src_count - a "rule" for how many bytes make up the next character in the
    43			              source.
    44			 %s2d_hash  - a mapping from input sequences to output sequences
    45			
    46			The problem with that scheme is that it does not allow the output
    47			character repertoire to affect the characters considered from the
    48			input.
    49			
    50			So we use a "trie" representation which can also be considered
    51			a state machine:
    52			
    53			my $dst   = '';
    54			my $seq   = \@s2d_seq;
    55			my $next  = \@s2d_next;
    56			while (length($src))
    57			 {
    58			  my $byte    = $substr($src,0,1,'');
    59			  my $out_seq = $seq->[$byte];
    60			  if (defined $out_seq)
    61			   {
    62			    $dst .= $out_seq;
    63			   }
    64			  else
    65			   {
    66			    # an error condition
    67			   }
    68			  ($next,$seq) = @$next->[$byte] if $next;
    69			 }
    70			return $dst;
    71			
    72			There is now a pair of data structures to represent everything.
    73			It is valid for output sequence at a particular point to
    74			be defined but zero length, that just means "don't know yet".
    75			For the single byte case there is no 'next' so new tables will be the same as
    76			the original tables. For a multi-byte case a prefix byte will flip to the tables
    77			for  the next page (adding nothing to the output), then the tables for the page
    78			will provide the actual output and set tables back to original base page.
    79			
    80			This scheme can also handle shift encodings.
    81			
    82			A slight enhancement to the scheme also allows for look-ahead - if
    83			we add a flag to re-add the removed byte to the source we could handle
    84			  a" -> ä
    85			  ab -> a (and take b back please)
    86			
    87			*/
    88			
    89			#include <EXTERN.h>
    90			#include <perl.h>
    91			#define U8 U8
    92			#include "encode.h"
    93			
    94			int
    95			do_encode(encpage_t * enc, const U8 * src, STRLEN * slen, U8 * dst,
    96				  STRLEN dlen, STRLEN * dout, int approx, const U8 *term, STRLEN tlen)
    97	       40946    {
    98	       40946        const U8 *s = src;
    99	       40946        const U8 *send = s + *slen;
   100	       40946        const U8 *last = s;
   101	       40946        U8 *d = dst;
   102	       40946        U8 *dend = d + dlen, *dlast = d;
   103	       40946        int code = 0;
   104	     7214055        while (s < send) {
   105	     7175273    	encpage_t *e = enc;
   106	     7175273    	U8 byte = *s;
   107	    45155330    	while (byte > e->max)
   108	    37980057    	    e++;
   109	     7175273    	if (byte >= e->min && e->slen && (approx || !(e->slen & 0x80))) {
   110	     7174584    	    const U8 *cend = s + (e->slen & 0x7f);
   111	     7174584    	    if (cend <= send) {
   112	     7174471    		STRLEN n;
   113	     7174471    		if ((n = e->dlen)) {
   114	     5202718    		    const U8 *out = e->seq + n * (byte - e->min);
   115	     5202718    		    U8 *oend = d + n;
   116	     5202718    		    if (dst) {
   117	     5202718    			if (oend <= dend) {
   118	    12265646    			    while (d < oend)
   119	     7064067    				*d++ = *out++;
   120						}
   121						else {
   122						    /* Out of space */
   123	        1139    			    code = ENCODE_NOSPACE;
   124	        1139    			    break;
   125						}
   126					    }
   127					    else
   128	      ######    			d = oend;
   129					}
   130	     7173332    		enc = e->next;
   131	     7173332    		s++;
   132	     7173332    		if (s == cend) {
   133	     5201579    		    if (approx && (e->slen & 0x80))
   134	      ######    			code = ENCODE_FALLBACK;
   135	     5201579    		    last = s;
   136	     5201579    		    if (term && (STRLEN)(d-dlast) == tlen && memEQ(dlast, term, tlen)) {
   137	         223    		      code = ENCODE_FOUND_TERM;
   138	         223    		      break;
   139					    }
   140	     5201356    		    dlast = d;
   141					}
   142				    }
   143				    else {
   144					/* partial source character */
   145	         113    		code = ENCODE_PARTIAL;
   146	         113    		break;
   147				    }
   148				}
   149				else {
   150				    /* Cannot represent */
   151	         689    	    code = ENCODE_NOREP;
   152				    break;
   153				}
   154			    }
   155	       40946        *slen = last - src;
   156	       40946        *dout = d - dst;
   157	       40946        return code;
   158			}
