Staging
v0.5.1
https://github.com/python/cpython
Revision 63f2bee57f1bddd4e635d7207de1ca92c23162c5 authored by Steven Bethard on 02 March 2010, 09:27:05 UTC, committed by Steven Bethard on 02 March 2010, 09:27:05 UTC
........
  r78576 | steven.bethard | 2010-03-02 00:38:09 -0800 (Tue, 02 Mar 2010) | 3 lines

  Initial commit of the argparse library, based on argparse 1.1.
  Docs still need some updating to make getopt and optparse match the wording promised in the PEP.
  There are also probably a number of :class:ArgumentParser etc. links that could be added to the argparse documentation.
........
1 parent 5edb876
Raw File
Tip revision: 63f2bee57f1bddd4e635d7207de1ca92c23162c5 authored by Steven Bethard on 02 March 2010, 09:27:05 UTC
Blocked revisions 78576 via svnmerge
Tip revision: 63f2bee
_codecs_iso2022.c
/*
 * _codecs_iso2022.c: Codecs collection for ISO-2022 encodings.
 *
 * Written by Hye-Shik Chang <perky@FreeBSD.org>
 */

/* Build-time switches. NOTE(review): these are presumably consumed by the
 * headers included just below (cjkcodecs.h and friends); their exact effect
 * is not visible in this file -- confirm there. */
#define USING_IMPORTED_MAPS
#define USING_BINARY_PAIR_SEARCH
#define EXTERN_JISX0213_PAIR
/* Both emulation "invalid" markers resolve to MAP_UNMAPPABLE. That macro is
 * defined further down in this file, which is fine because macro bodies are
 * only expanded at their point of use. */
#define EMULATE_JISX0213_2000_ENCODE_INVALID MAP_UNMAPPABLE
#define EMULATE_JISX0213_2000_DECODE_INVALID MAP_UNMAPPABLE

#include "cjkcodecs.h"
#include "alg_jisx0201.h"
#include "emu_jisx0213_2000.h"
#include "mappings_jisx0213_pair.h"

/* STATE

   state->c[0-3]

	00000000
	||^^^^^|
	|+-----+----  G0-3 Character Set
	+-----------  Is G0-3 double byte?

   state->c[4]

	00000000
	      ||
	      |+----  Locked-Shift?
	      +-----  ESC Throughout
*/

/* Control bytes the codec reacts to: ESC starts an escape sequence, SO/SI
 * set/clear the F_SHIFTED flag, and LF clears F_SHIFTED in the decoder. */
#define ESC			0x1B
#define SO			0x0E
#define SI			0x0F
#define LF			0x0A

/* Upper bound on how many bytes iso2022processesc() scans while looking
 * for the byte that terminates an escape sequence. */
#define MAX_ESCSEQLEN		16

/* Character set marks as stored in state->c[0..3] and in
 * iso2022_designation.mark. Single-byte sets use the plain letter that
 * appears in the escape sequence. */
#define CHARSET_ISO8859_1	'A'
#define CHARSET_ASCII		'B'
#define CHARSET_ISO8859_7	'F'
#define CHARSET_JISX0201_K	'I'
#define CHARSET_JISX0201_R	'J'

/* Double-byte sets: the escape-sequence letter with CHARSET_DBCS or-ed in,
 * so that e.g. GB2312 ('A') stays distinct from ISO8859-1 ('A'). */
#define CHARSET_GB2312		('A'|CHARSET_DBCS)
#define CHARSET_JISX0208	('B'|CHARSET_DBCS)
#define CHARSET_KSX1001		('C'|CHARSET_DBCS)
#define CHARSET_JISX0212	('D'|CHARSET_DBCS)
#define CHARSET_GB2312_8565	('E'|CHARSET_DBCS)
#define CHARSET_CNS11643_1	('G'|CHARSET_DBCS)
#define CHARSET_CNS11643_2	('H'|CHARSET_DBCS)
#define CHARSET_JISX0213_2000_1	('O'|CHARSET_DBCS)
#define CHARSET_JISX0213_2	('P'|CHARSET_DBCS)
#define CHARSET_JISX0213_2004_1	('Q'|CHARSET_DBCS)
#define CHARSET_JISX0208_O	('@'|CHARSET_DBCS)

/* High bit flags a double-byte set; ESCMARK() strips it again to recover
 * the letter that is written into an escape sequence. */
#define CHARSET_DBCS		0x80
#define ESCMARK(mark)		((mark) & 0x7f)

/* True for a byte that ends an escape sequence: '@' or 'A'..'Z'. */
#define IS_ESCEND(c)	((c) == '@' || ((c) >= 'A' && (c) <= 'Z'))
/* True when c2, the byte right after ESC, introduces one of the escape
 * sequences this codec understands.  This is not a complete list of
 * ISO-2022 escape sequence headers, but it is enough to implement the
 * CJK instances of iso-2022. */
#define IS_ISO2022ESC(c2) \
		((c2) == '$' || (c2) == '(' || (c2) == ')' || \
		 (c2) == '&' || (c2) == '.')

/* Sentinel results of the per-charset encoder/decoder callbacks:
 * MAP_UNMAPPABLE means "no mapping in this charset"; MAP_MULTIPLE_AVAIL
 * tells the encoder loop to retry with a two-character input. */
#define MAP_UNMAPPABLE		0xFFFF
#define MAP_MULTIPLE_AVAIL	0xFFFE /* for JIS X 0213 */

/* Bits kept in state->c[4] (see the STATE comment above). */
#define F_SHIFTED		0x01
#define F_ESCTHROUGHOUT		0x02

/* Accessors for the designation slots state->c[0..3]. They rely on a
 * variable named `state` being in scope. The setter carries its own
 * trailing ';', so call sites are written without one. */
#define STATE_SETG(dn, v)	((state)->c[dn]) = (v);
#define STATE_GETG(dn)		((state)->c[dn])

#define STATE_G0		STATE_GETG(0)
#define STATE_G1		STATE_GETG(1)
#define STATE_G2		STATE_GETG(2)
#define STATE_G3		STATE_GETG(3)
#define STATE_SETG0(v)		STATE_SETG(0, v)
#define STATE_SETG1(v)		STATE_SETG(1, v)
#define STATE_SETG2(v)		STATE_SETG(2, v)
#define STATE_SETG3(v)		STATE_SETG(3, v)

/* Flag helpers for state->c[4]; the mutating ones also end in ';'. */
#define STATE_SETFLAG(f)	((state)->c[4]) |= (f);
#define STATE_GETFLAG(f)	((state)->c[4] & (f))
#define STATE_CLEARFLAG(f)	((state)->c[4]) &= ~(f);
#define STATE_CLEARFLAGS()	((state)->c[4]) = 0;

/* View the opaque `config` pointer that is in scope as this codec's
 * iso2022_config, and read its two fields. */
#define ISO2022_CONFIG		((const struct iso2022_config *)config)
#define CONFIG_ISSET(flag)	(ISO2022_CONFIG->flags & (flag))
#define CONFIG_DESIGNATIONS	(ISO2022_CONFIG->designations)

/* iso2022_config.flags */
/* NO_SHIFT: the decoder passes SI/SO through instead of shifting.
 * USE_G2: the decoder accepts "ESC ." designations and "ESC N" (SS2).
 * USE_JISX0208_EXT: the "ESC & @ ESC $ B" prefixed designation is accepted. */
#define NO_SHIFT		0x01
#define USE_G2			0x02
#define USE_JISX0208_EXT	0x04

/*-*- internal data structures -*-*/

/* Callback signatures stored in each designation entry.
 * - init:   returns 0 on success, non-zero on failure (see CODEC_INIT).
 * - decode: turns the bytes at `data` into a code point, or MAP_UNMAPPABLE.
 * - encode: turns *data into a charset code, MAP_UNMAPPABLE, or
 *           MAP_MULTIPLE_AVAIL; may update *length. */
typedef int (*iso2022_init_func)(void);
typedef ucs4_t (*iso2022_decode_func)(const unsigned char *data);
typedef DBCHAR (*iso2022_encode_func)(const ucs4_t *data, Py_ssize_t *length);

/* One character set that an ISO-2022 variant may designate. */
struct iso2022_designation {
	/* CHARSET_* value; 0 marks the end of a designation table */
	unsigned char mark;
	/* which Gn slot the encoder designates it to (0 = G0, 1 = G1, ...) */
	unsigned char plane;
	/* bytes per encoded character; the encoder asserts 1 or 2 */
	unsigned char width;
	/* optional one-time setup, may be NULL */
	iso2022_init_func initializer;
	iso2022_decode_func decoder;
	iso2022_encode_func encoder;
};

/* Per-variant configuration handed to the codec as `config`. */
struct iso2022_config {
	/* combination of NO_SHIFT / USE_G2 / USE_JISX0208_EXT */
	int flags;
	const struct iso2022_designation *designations; /* non-ascii desigs */
};

/*-*- iso-2022 codec implementation -*-*/

/* Run the initializer of every designation in this variant's table.
 * Returns 0 when all of them succeed, -1 as soon as one reports failure. */
CODEC_INIT(iso2022)
{
	const struct iso2022_designation *entry = CONFIG_DESIGNATIONS;

	while (entry->mark) {
		/* entries without an initializer need no setup */
		if (entry->initializer != NULL && entry->initializer() != 0)
			return -1;
		entry++;
	}
	return 0;
}

ENCODER_INIT(iso2022)
{
	STATE_CLEARFLAGS()
	STATE_SETG0(CHARSET_ASCII)
	STATE_SETG1(CHARSET_ASCII)
	return 0;
}

ENCODER_RESET(iso2022)
{
	if (STATE_GETFLAG(F_SHIFTED)) {
		WRITE1(SI)
		NEXT_OUT(1)
		STATE_CLEARFLAG(F_SHIFTED)
	}
	if (STATE_G0 != CHARSET_ASCII) {
		WRITE3(ESC, '(', 'B')
		NEXT_OUT(3)
		STATE_SETG0(CHARSET_ASCII)
	}
	return 0;
}

/*
 * Stateful encoder shared by every ISO-2022 variant in this file.
 *
 * For each input character it either writes it as ASCII (switching G0 back
 * to ASCII and leaving shifted mode first), or asks the variant's
 * designations, in table order, for an encoding and emits the designation
 * escape sequence / SO / SI needed before the encoded bytes.
 *
 * Returns 0 when all input was consumed, 1 when no designation can encode
 * the current character, MBERR_TOOFEW when a possible character pair needs
 * more input and MBENC_FLUSH is not set, and MBERR_INTERNAL for a
 * designation whose plane is neither 0 nor 1.
 * NOTE(review): the WRITEn/NEXT macros come from cjkcodecs.h and may return
 * on their own (e.g. on a full output buffer); that is not visible here.
 */
ENCODER(iso2022)
{
	while (inleft > 0) {
		const struct iso2022_designation *dsg;
		DBCHAR encoded;
		ucs4_t c = **inbuf;
		Py_ssize_t insize;

		/* ASCII fast path: make sure G0 is ASCII and we are not
		 * shifted, then copy the byte through. */
		if (c < 0x80) {
			if (STATE_G0 != CHARSET_ASCII) {
				WRITE3(ESC, '(', 'B')
				STATE_SETG0(CHARSET_ASCII)
				NEXT_OUT(3)
			}
			if (STATE_GETFLAG(F_SHIFTED)) {
				WRITE1(SI)
				STATE_CLEARFLAG(F_SHIFTED)
				NEXT_OUT(1)
			}
			WRITE1((unsigned char)c)
			NEXT(1, 1)
			continue;
		}

		DECODE_SURROGATE(c)
		insize = GET_INSIZE(c);

		/* Try each designation in table order; the first one whose
		 * encoder produces a code wins. */
		encoded = MAP_UNMAPPABLE;
		for (dsg = CONFIG_DESIGNATIONS; dsg->mark; dsg++) {
			Py_ssize_t length = 1;
			encoded = dsg->encoder(&c, &length);
			if (encoded == MAP_MULTIPLE_AVAIL) {
				/* this implementation won't work for pair
				 * of non-bmp characters. */
				if (inleft < 2) {
					if (!(flags & MBENC_FLUSH))
						return MBERR_TOOFEW;
					/* flushing: encode the lone first
					 * character of the would-be pair */
					length = -1;
				}
				else
					length = 2;
#if Py_UNICODE_SIZE == 2
				if (length == 2) {
					ucs4_t u4in[2];
					u4in[0] = (ucs4_t)IN1;
					u4in[1] = (ucs4_t)IN2;
					encoded = dsg->encoder(u4in, &length);
				} else
					encoded = dsg->encoder(&c, &length);
#else
				encoded = dsg->encoder(&c, &length);
#endif
				if (encoded != MAP_UNMAPPABLE) {
					/* the encoder reports back how many
					 * input characters it consumed */
					insize = length;
					break;
				}
			}
			else if (encoded != MAP_UNMAPPABLE)
				break;
		}

		/* reached the sentinel: nothing could encode c */
		if (!dsg->mark)
			return 1;
		assert(dsg->width == 1 || dsg->width == 2);

		/* Emit the designation / shift needed for the chosen set. */
		switch (dsg->plane) {
		case 0: /* G0 */
			if (STATE_GETFLAG(F_SHIFTED)) {
				WRITE1(SI)
				STATE_CLEARFLAG(F_SHIFTED)
				NEXT_OUT(1)
			}
			if (STATE_G0 != dsg->mark) {
				if (dsg->width == 1) {
					WRITE3(ESC, '(', ESCMARK(dsg->mark))
					STATE_SETG0(dsg->mark)
					NEXT_OUT(3)
				}
				else if (dsg->mark == CHARSET_JISX0208) {
					/* JIS X 0208 uses the short
					 * three-byte form "ESC $ B" */
					WRITE3(ESC, '$', ESCMARK(dsg->mark))
					STATE_SETG0(dsg->mark)
					NEXT_OUT(3)
				}
				else {
					WRITE4(ESC, '$', '(',
						ESCMARK(dsg->mark))
					STATE_SETG0(dsg->mark)
					NEXT_OUT(4)
				}
			}
			break;
		case 1: /* G1 */
			if (STATE_G1 != dsg->mark) {
				if (dsg->width == 1) {
					WRITE3(ESC, ')', ESCMARK(dsg->mark))
					STATE_SETG1(dsg->mark)
					NEXT_OUT(3)
				}
				else {
					WRITE4(ESC, '$', ')',
						ESCMARK(dsg->mark))
					STATE_SETG1(dsg->mark)
					NEXT_OUT(4)
				}
			}
			/* G1 characters are only valid while shifted out */
			if (!STATE_GETFLAG(F_SHIFTED)) {
				WRITE1(SO)
				STATE_SETFLAG(F_SHIFTED)
				NEXT_OUT(1)
			}
			break;
		default: /* G2 and G3 is not supported: no encoding in
			  * CJKCodecs are using them yet */
			return MBERR_INTERNAL;
		}

		/* Finally write the one- or two-byte code itself. */
		if (dsg->width == 1) {
			WRITE1((unsigned char)encoded)
			NEXT_OUT(1)
		}
		else {
			WRITE2(encoded >> 8, encoded & 0xff)
			NEXT_OUT(2)
		}
		NEXT_IN(insize)
	}

	return 0;
}

DECODER_INIT(iso2022)
{
	STATE_CLEARFLAGS()
	STATE_SETG0(CHARSET_ASCII)
	STATE_SETG1(CHARSET_ASCII)
	STATE_SETG2(CHARSET_ASCII)
	return 0;
}

DECODER_RESET(iso2022)
{
	STATE_SETG0(CHARSET_ASCII)
	STATE_CLEARFLAG(F_SHIFTED)
	return 0;
}

/*
 * Parse the designation escape sequence at *inbuf and record the charset
 * in the matching Gn slot of `state`.
 *
 * Returns 0 after consuming the sequence, MBERR_TOOFEW when the input ends
 * before the sequence does, and otherwise a positive count of offending
 * bytes (1 for an unterminated sequence, the sequence length when it is
 * malformed or names a charset this variant does not designate).
 */
static Py_ssize_t
iso2022processesc(const void *config, MultibyteCodec_State *state,
		  const unsigned char **inbuf, Py_ssize_t *inleft)
{
	unsigned char charset, designation;
	Py_ssize_t pos, esclen;

	/* Look for the byte that ends the sequence. */
	for (pos = 1; pos < MAX_ESCSEQLEN; pos++) {
		if (pos >= *inleft)
			return MBERR_TOOFEW;
		if (IS_ESCEND((*inbuf)[pos])) {
			esclen = pos + 1;
			break;
		}
		/* "& @" is a prefix, not an end: step over it (the loop
		 * increment then also skips the byte that follows) */
		if (CONFIG_ISSET(USE_JISX0208_EXT) && pos + 1 < *inleft &&
		    (*inbuf)[pos] == '&' && (*inbuf)[pos + 1] == '@')
			pos += 2;
	}

	if (pos >= MAX_ESCSEQLEN)
		return 1; /* unterminated escape sequence */

	switch (esclen) {
	case 3:
		if (IN2 == '$') {
			/* short form for a double-byte set in G0 */
			charset = IN3 | CHARSET_DBCS;
			designation = 0;
			break;
		}
		charset = IN3;
		switch (IN2) {
		case '(':
			designation = 0;
			break;
		case ')':
			designation = 1;
			break;
		case '.':
			if (!CONFIG_ISSET(USE_G2))
				return 3;
			designation = 2;
			break;
		default:
			return 3;
		}
		break;
	case 4:
		if (IN2 != '$')
			return 4;
		if (IN3 == '(')
			designation = 0;
		else if (IN3 == ')')
			designation = 1;
		else
			return 4;
		charset = IN4 | CHARSET_DBCS;
		break;
	case 6: /* designation with prefix */
		if (!CONFIG_ISSET(USE_JISX0208_EXT) ||
		    (*inbuf)[3] != ESC || (*inbuf)[4] != '$' ||
		    (*inbuf)[5] != 'B')
			return 6;
		charset = 'B' | CHARSET_DBCS;
		designation = 0;
		break;
	default:
		return esclen;
	}

	/* raise error when the charset is not designated for this encoding */
	if (charset != CHARSET_ASCII) {
		const struct iso2022_designation *entry = CONFIG_DESIGNATIONS;

		while (entry->mark && entry->mark != charset)
			entry++;
		if (!entry->mark)
			return esclen;
	}

	STATE_SETG(designation, charset)
	(*inbuf) += esclen;
	*inleft -= esclen;
	return 0;
}

/*
 * Decode one ISO 8859-7 byte `c` into `assi`.
 *
 * Expands to an if / else-if chain WITHOUT a final else, so the caller must
 * append its own `else ...` to handle bytes that have no mapping.
 *   - c < 0xa0               -> same value
 *   - 0xa0..0xbf, bit set    -> same value (first bitmap, bit = c - 0xa0)
 *   - 0xb4..0xfe             -> 0x02d0 + c (second bitmap covers 0xb4..0xd3,
 *                               everything from 0xd4 up is accepted)
 *   - 0xa1, 0xa2, 0xaf       -> U+2018, U+2019, U+2015
 *
 * The bitmaps and the shifted one are unsigned: the shift count reaches 31
 * (e.g. c == 0xd3), and `1L << 31` is undefined where long has 32 bits.
 */
#define ISO8859_7_DECODE(c, assi)					\
	if ((c) < 0xa0) (assi) = (c);					\
	else if ((c) < 0xc0 && (0x288f3bc9UL & (1UL << ((c)-0xa0))))	\
		(assi) = (c);						\
	else if ((c) >= 0xb4 && (c) <= 0xfe && ((c) >= 0xd4 ||		\
		 (0xbffffd77UL & (1UL << ((c)-0xb4)))))			\
		(assi) = 0x02d0 + (c);					\
	else if ((c) == 0xa1) (assi) = 0x2018;				\
	else if ((c) == 0xa2) (assi) = 0x2019;				\
	else if ((c) == 0xaf) (assi) = 0x2015;

/*
 * Decode a three-byte single-shift sequence (ESC N <byte>) using whatever
 * character set is currently designated to G2.
 *
 * Returns 0 after consuming three input bytes and producing one output
 * character, 3 when the byte has no mapping, and MBERR_INTERNAL when G2
 * holds a set this function does not know.
 * NOTE(review): nothing here checks *outleft before storing to the output
 * buffer -- confirm the caller guarantees room for one character.
 */
static Py_ssize_t
iso2022processg2(const void *config, MultibyteCodec_State *state,
		 const unsigned char **inbuf, Py_ssize_t *inleft,
		 Py_UNICODE **outbuf, Py_ssize_t *outleft)
{
	/* not written to use encoder, decoder functions because only few
	 * encodings use G2 designations in CJKCodecs */
	switch (STATE_G2) {
	case CHARSET_ISO8859_1:
		if (IN3 >= 0x80)
			return 3;
		OUT1(IN3 + 0x80)
		break;
	case CHARSET_ISO8859_7:
		ISO8859_7_DECODE(IN3 ^ 0x80, **outbuf)
		else return 3;
		break;
	case CHARSET_ASCII:
		if (IN3 & 0x80)
			return 3;
		**outbuf = IN3;
		break;
	default:
		return MBERR_INTERNAL;
	}

	(*outbuf) += 1;
	*outleft -= 1;
	(*inbuf) += 3;
	*inleft -= 3;
	return 0;
}

/*
 * Stateful decoder shared by every ISO-2022 variant in this file.
 *
 * Handles escape sequences (designations, SS2, and pass-through of
 * non-ISO-2022 escapes), SO/SI shifting, and decodes every other byte with
 * the character set currently selected by G0 (or G1 while shifted).
 *
 * Returns 0 when all input was consumed; a positive value is the number of
 * bytes that could not be decoded; errors from the escape helpers are
 * returned unchanged.
 * NOTE(review): REQUIRE_INBUF / WRITEn / NEXT come from cjkcodecs.h and may
 * return on their own; that is not visible here.
 */
DECODER(iso2022)
{
	/* last designation looked up, to avoid rescanning the table for
	 * every character of a run in the same charset */
	const struct iso2022_designation *dsgcache = NULL;

	while (inleft > 0) {
		unsigned char c = IN1;
		Py_ssize_t err;

		if (STATE_GETFLAG(F_ESCTHROUGHOUT)) {
			/* ESC throughout mode:
			 * for non-iso2022 escape sequences */
			WRITE1(c) /* assume as ISO-8859-1 */
			NEXT(1, 1)
			/* the mode ends with the sequence's last byte */
			if (IS_ESCEND(c)) {
				STATE_CLEARFLAG(F_ESCTHROUGHOUT)
			}
			continue;
		}

		switch (c) {
		case ESC:
			REQUIRE_INBUF(2)
			if (IS_ISO2022ESC(IN2)) {
				err = iso2022processesc(config, state,
							inbuf, &inleft);
				if (err != 0)
					return err;
			}
			else if (CONFIG_ISSET(USE_G2) && IN2 == 'N') {/* SS2 */
				REQUIRE_INBUF(3)
				err = iso2022processg2(config, state,
					inbuf, &inleft, outbuf, &outleft);
				if (err != 0)
					return err;
			}
			else {
				/* unknown escape: copy it through verbatim */
				WRITE1(ESC)
				STATE_SETFLAG(F_ESCTHROUGHOUT)
				NEXT(1, 1)
			}
			break;
		case SI:
			if (CONFIG_ISSET(NO_SHIFT))
				goto bypass;
			STATE_CLEARFLAG(F_SHIFTED)
			NEXT_IN(1)
			break;
		case SO:
			if (CONFIG_ISSET(NO_SHIFT))
				goto bypass;
			STATE_SETFLAG(F_SHIFTED)
			NEXT_IN(1)
			break;
		case LF:
			/* a line feed also ends shifted mode */
			STATE_CLEARFLAG(F_SHIFTED)
			WRITE1(LF)
			NEXT(1, 1)
			break;
		default:
			if (c < 0x20) /* C0 */
				goto bypass;
			else if (c >= 0x80)
				return 1;
			else {
				const struct iso2022_designation *dsg;
				unsigned char charset;
				ucs4_t decoded;

				if (STATE_GETFLAG(F_SHIFTED))
					charset = STATE_G1;
				else
					charset = STATE_G0;

				if (charset == CHARSET_ASCII) {
					/* also the target of the gotos
					 * above: copy one byte through */
bypass:					WRITE1(c)
					NEXT(1, 1)
					break;
				}

				if (dsgcache != NULL &&
				    dsgcache->mark == charset)
					dsg = dsgcache;
				else {
					/* The sentinel test is compiled in
					 * only for debug builds; otherwise
					 * the loop relies on charset being
					 * present (iso2022processesc only
					 * stores designated charsets). */
					for (dsg = CONFIG_DESIGNATIONS;
					     dsg->mark != charset
#ifdef Py_DEBUG
						&& dsg->mark != '\0'
#endif
					     ;dsg++)
						/* noop */;
					assert(dsg->mark != '\0');
					dsgcache = dsg;
				}

				REQUIRE_INBUF(dsg->width)
				decoded = dsg->decoder(*inbuf);
				if (decoded == MAP_UNMAPPABLE)
					return dsg->width;

				/* three output shapes, chosen by range */
				if (decoded < 0x10000) {
					WRITE1(decoded)
					NEXT_OUT(1)
				}
				else if (decoded < 0x30000) {
					WRITEUCS4(decoded)
				}
				else { /* JIS X 0213 pairs */
					WRITE2(decoded >> 16, decoded & 0xffff)
					NEXT_OUT(2)
				}
				NEXT_IN(dsg->width)
			}
			break;
		}
	}
	return 0;
}

/*-*- mapping table holders -*-*/

/* Declare a file-local pointer named <enc>_encmap / <enc>_decmap, initially
 * NULL. The *_init() functions below pass their addresses to IMPORT_MAP. */
#define ENCMAP(enc) static const encode_map *enc##_encmap = NULL;
#define DECMAP(enc) static const decode_map *enc##_decmap = NULL;

/* kr */
ENCMAP(cp949)
DECMAP(ksx1001)

/* jp */
ENCMAP(jisxcommon)
DECMAP(jisx0208)
DECMAP(jisx0212)
ENCMAP(jisx0213_bmp)
DECMAP(jisx0213_1_bmp)
DECMAP(jisx0213_2_bmp)
ENCMAP(jisx0213_emp)
DECMAP(jisx0213_1_emp)
DECMAP(jisx0213_2_emp)

/* cn */
ENCMAP(gbcommon)
DECMAP(gb2312)

/* tw */

/*-*- mapping access functions -*-*/

/* Import the cp949 encode map and the ksx1001 decode map on first use.
 * Returns 0 on success (or if already done), -1 if an import fails. */
static int
ksx1001_init(void)
{
	static int loaded = 0;

	if (loaded)
		return 0;

	if (IMPORT_MAP(kr, cp949, &cp949_encmap, NULL) ||
	    IMPORT_MAP(kr, ksx1001, NULL, &ksx1001_decmap))
		return -1;

	loaded = 1;
	return 0;
}

/* Decode the two bytes at `data` through the ksx1001 table; returns the
 * code point, or MAP_UNMAPPABLE when the table has no entry. */
static ucs4_t
ksx1001_decoder(const unsigned char *data)
{
	ucs4_t ch;

	TRYMAP_DEC(ksx1001, ch, data[0], data[1])
		return ch;
	return MAP_UNMAPPABLE;
}

/* Encode *data through the cp949 table. Only BMP characters are looked up,
 * and codes with the 0x8000 bit set are rejected. Returns the code or
 * MAP_UNMAPPABLE. */
static DBCHAR
ksx1001_encoder(const ucs4_t *data, Py_ssize_t *length)
{
	DBCHAR code;

	assert(*length == 1);
	if (*data >= 0x10000)
		return MAP_UNMAPPABLE;

	TRYMAP_ENC(cp949, code, *data) {
		if ((code & 0x8000) == 0)
			return code;
	}
	return MAP_UNMAPPABLE;
}

/* Import the jisxcommon encode map and the jisx0208 decode map on first
 * use. Returns 0 on success (or if already done), -1 if an import fails. */
static int
jisx0208_init(void)
{
	static int loaded = 0;

	if (loaded)
		return 0;

	if (IMPORT_MAP(jp, jisxcommon, &jisxcommon_encmap, NULL) ||
	    IMPORT_MAP(jp, jisx0208, NULL, &jisx0208_decmap))
		return -1;

	loaded = 1;
	return 0;
}

/* Decode the two bytes at `data` as JIS X 0208. 0x2140 is special-cased to
 * U+FF3C before the table is consulted. Returns the code point or
 * MAP_UNMAPPABLE. */
static ucs4_t
jisx0208_decoder(const unsigned char *data)
{
	ucs4_t ch;

	/* F/W REVERSE SOLIDUS */
	if (data[0] == 0x21 && data[1] == 0x40)
		return 0xff3c;

	TRYMAP_DEC(jisx0208, ch, data[0], data[1])
		return ch;
	return MAP_UNMAPPABLE;
}

/* Encode *data as JIS X 0208 through the jisxcommon table. U+FF3C is
 * special-cased to 0x2140; table codes with the 0x8000 bit set are
 * rejected here (jisx0212_encoder is the one that accepts them).
 * Returns the code or MAP_UNMAPPABLE. */
static DBCHAR
jisx0208_encoder(const ucs4_t *data, Py_ssize_t *length)
{
	DBCHAR code;

	assert(*length == 1);
	if (*data >= 0x10000)
		return MAP_UNMAPPABLE;

	/* F/W REVERSE SOLIDUS */
	if (*data == 0xff3c)
		return 0x2140;

	TRYMAP_ENC(jisxcommon, code, *data) {
		if ((code & 0x8000) == 0)
			return code;
	}
	return MAP_UNMAPPABLE;
}

/* Import the jisxcommon encode map and the jisx0212 decode map on first
 * use. Returns 0 on success (or if already done), -1 if an import fails. */
static int
jisx0212_init(void)
{
	static int loaded = 0;

	if (loaded)
		return 0;

	if (IMPORT_MAP(jp, jisxcommon, &jisxcommon_encmap, NULL) ||
	    IMPORT_MAP(jp, jisx0212, NULL, &jisx0212_decmap))
		return -1;

	loaded = 1;
	return 0;
}

/* Decode the two bytes at `data` through the jisx0212 table; returns the
 * code point, or MAP_UNMAPPABLE when the table has no entry. */
static ucs4_t
jisx0212_decoder(const unsigned char *data)
{
	ucs4_t ch;

	TRYMAP_DEC(jisx0212, ch, data[0], data[1])
		return ch;
	return MAP_UNMAPPABLE;
}

/* Encode *data as JIS X 0212 through the jisxcommon table. Only table
 * codes carrying the 0x8000 bit are accepted, and that bit is stripped
 * from the result. Returns the code or MAP_UNMAPPABLE. */
static DBCHAR
jisx0212_encoder(const ucs4_t *data, Py_ssize_t *length)
{
	DBCHAR code;

	assert(*length == 1);
	if (*data >= 0x10000)
		return MAP_UNMAPPABLE;

	TRYMAP_ENC(jisxcommon, code, *data) {
		if ((code & 0x8000) != 0)
			return code & 0x7fff;
	}
	return MAP_UNMAPPABLE;
}

/* Prepare everything the JIS X 0213 codecs need on first use: the
 * JIS X 0208 tables plus the bmp / emp / pair maps. Returns 0 on success
 * (or if already done), -1 as soon as one step fails. */
static int
jisx0213_init(void)
{
	static int loaded = 0;

	if (loaded)
		return 0;

	if (jisx0208_init() ||
	    IMPORT_MAP(jp, jisx0213_bmp, &jisx0213_bmp_encmap, NULL) ||
	    IMPORT_MAP(jp, jisx0213_1_bmp, NULL, &jisx0213_1_bmp_decmap) ||
	    IMPORT_MAP(jp, jisx0213_2_bmp, NULL, &jisx0213_2_bmp_decmap) ||
	    IMPORT_MAP(jp, jisx0213_emp, &jisx0213_emp_encmap, NULL) ||
	    IMPORT_MAP(jp, jisx0213_1_emp, NULL, &jisx0213_1_emp_decmap) ||
	    IMPORT_MAP(jp, jisx0213_2_emp, NULL, &jisx0213_2_emp_decmap) ||
	    IMPORT_MAP(jp, jisx0213_pair, &jisx0213_pair_encmap,
		       &jisx0213_pair_decmap))
		return -1;

	loaded = 1;
	return 0;
}

/* The two decoders up to the matching #undef take no config argument, so
 * `config` is defined as a constant for them. NOTE(review): presumably the
 * EMULATE_JISX0213_2000_* macros read `config` to decide whether to
 * emulate the 2000 edition -- their bodies are not visible here. */
#define config ((void *)2000)
/* Decode the two bytes at `data` as JIS X 0213:2000 plane 1.
 * After the emulation macro, lookup order is: the 0x2140 special case,
 * then the jisx0208, jisx0213_1_bmp, jisx0213_1_emp (result or-ed with
 * 0x20000) and jisx0213_pair tables. Returns MAP_UNMAPPABLE when nothing
 * matches. */
static ucs4_t
jisx0213_2000_1_decoder(const unsigned char *data)
{
	ucs4_t u;
	EMULATE_JISX0213_2000_DECODE_PLANE1(u, data[0], data[1])
	else if (data[0] == 0x21 && data[1] == 0x40) /* F/W REVERSE SOLIDUS */
		return 0xff3c;
	else TRYMAP_DEC(jisx0208, u, data[0], data[1]);
	else TRYMAP_DEC(jisx0213_1_bmp, u, data[0], data[1]);
	else TRYMAP_DEC(jisx0213_1_emp, u, data[0], data[1])
		u |= 0x20000;
	else TRYMAP_DEC(jisx0213_pair, u, data[0], data[1]);
	else
		return MAP_UNMAPPABLE;
	return u;
}

static ucs4_t
jisx0213_2000_2_decoder(const unsigned char *data)
{
	ucs4_t u;
	EMULATE_JISX0213_2000_DECODE_PLANE2(u, data[0], data[1])
	TRYMAP_DEC(jisx0213_2_bmp, u, data[0], data[1]);
	else TRYMAP_DEC(jisx0213_2_emp, u, data[0], data[1])
		u |= 0x20000;
	else
		return MAP_UNMAPPABLE;
	return u;
}
#undef config

static ucs4_t
jisx0213_2004_1_decoder(const unsigned char *data)
{
	ucs4_t u;
	if (data[0] == 0x21 && data[1] == 0x40) /* F/W REVERSE SOLIDUS */
		return 0xff3c;
	else TRYMAP_DEC(jisx0208, u, data[0], data[1]);
	else TRYMAP_DEC(jisx0213_1_bmp, u, data[0], data[1]);
	else TRYMAP_DEC(jisx0213_1_emp, u, data[0], data[1])
		u |= 0x20000;
	else TRYMAP_DEC(jisx0213_pair, u, data[0], data[1]);
	else
		return MAP_UNMAPPABLE;
	return u;
}

static ucs4_t
jisx0213_2004_2_decoder(const unsigned char *data)
{
	ucs4_t u;
	TRYMAP_DEC(jisx0213_2_bmp, u, data[0], data[1]);
	else TRYMAP_DEC(jisx0213_2_emp, u, data[0], data[1])
		u |= 0x20000;
	else
		return MAP_UNMAPPABLE;
	return u;
}

/*
 * Shared JIS X 0213 encoder behind the plane- and edition-specific
 * wrappers below.
 *
 * data:   one character, or two when *length == 2.
 * length: selects the mode on entry and may be rewritten on return:
 *           1  encode data[0] alone; returns MAP_MULTIPLE_AVAIL when the
 *              table marks it as the possible start of a pair (MULTIC);
 *           2  encode the pair data[0], data[1]; if the pair is unknown,
 *              *length is set to 1 and data[0] is encoded on its own;
 *          -1  flush: encode data[0] on its own, *length is set to 1.
 * config: the wrappers pass (void *)2000 or NULL. NOTE(review): presumably
 *         read by the EMULATE_JISX0213_2000_* macros to enable emulation of
 *         the 2000 edition; their bodies are not visible here.
 *
 * Returns the table code (callers inspect the 0x8000 bit to tell the planes
 * apart), MAP_MULTIPLE_AVAIL, or MAP_UNMAPPABLE.
 */
static DBCHAR
jisx0213_encoder(const ucs4_t *data, Py_ssize_t *length, void *config)
{
	DBCHAR coded;

	switch (*length) {
	case 1: /* first character */
		if (*data >= 0x10000) {
			/* only plane 2 (U+2xxxx) has a table */
			if ((*data) >> 16 == 0x20000 >> 16) {
				EMULATE_JISX0213_2000_ENCODE_EMP(coded, *data)
				else TRYMAP_ENC(jisx0213_emp, coded,
						(*data) & 0xffff)
					return coded;
			}
			return MAP_UNMAPPABLE;
		}

		EMULATE_JISX0213_2000_ENCODE_BMP(coded, *data)
		else TRYMAP_ENC(jisx0213_bmp, coded, *data) {
			if (coded == MULTIC)
				return MAP_MULTIPLE_AVAIL;
		}
		else TRYMAP_ENC(jisxcommon, coded, *data) {
			if (coded & 0x8000)
				return MAP_UNMAPPABLE;
		}
		else
			return MAP_UNMAPPABLE;
		return coded;
	case 2: /* second character of unicode pair */
		coded = find_pairencmap((ucs2_t)data[0], (ucs2_t)data[1],
				jisx0213_pair_encmap, JISX0213_ENCPAIRS);
		if (coded != DBCINV)
			return coded;
		/* pair unknown: encode the first character on its own,
		 * which is exactly what the flush case does */
		/* fall through */
	case -1: /* flush unterminated */
		*length = 1;
		coded = find_pairencmap((ucs2_t)data[0], 0,
				jisx0213_pair_encmap, JISX0213_ENCPAIRS);
		if (coded == DBCINV)
			return MAP_UNMAPPABLE;
		else
			return coded;
	default:
		return MAP_UNMAPPABLE;
	}
}

/* JIS X 0213:2000 plane 1 encoder: calls jisx0213_encoder() with
 * (void *)2000 and keeps only codes whose 0x8000 bit is clear. The two
 * sentinel results are passed through unchanged. */
static DBCHAR
jisx0213_2000_1_encoder(const ucs4_t *data, Py_ssize_t *length)
{
	const DBCHAR code = jisx0213_encoder(data, length, (void *)2000);

	if (code == MAP_UNMAPPABLE || code == MAP_MULTIPLE_AVAIL)
		return code;
	return (code & 0x8000) ? MAP_UNMAPPABLE : code;
}

/* JIS X 0213:2000 encoder that only ever yields character pairs.
 * Called with *length == 1 it reports MAP_MULTIPLE_AVAIL or nothing;
 * called with *length == 2 it returns the code only if jisx0213_encoder()
 * really consumed both characters (left *length at 2). Everything else is
 * MAP_UNMAPPABLE. */
static DBCHAR
jisx0213_2000_1_encoder_paironly(const ucs4_t *data, Py_ssize_t *length)
{
	const Py_ssize_t requested = *length;
	const DBCHAR code = jisx0213_encoder(data, length, (void *)2000);

	if (requested == 1 && code == MAP_MULTIPLE_AVAIL)
		return MAP_MULTIPLE_AVAIL;
	if (requested == 2 && *length == 2)
		return code;
	return MAP_UNMAPPABLE;
}

/* JIS X 0213:2000 plane 2 encoder: calls jisx0213_encoder() with
 * (void *)2000 and keeps only codes whose 0x8000 bit is set, returning
 * them with that bit stripped. The two sentinel results are passed through
 * unchanged. */
static DBCHAR
jisx0213_2000_2_encoder(const ucs4_t *data, Py_ssize_t *length)
{
	const DBCHAR code = jisx0213_encoder(data, length, (void *)2000);

	if (code == MAP_UNMAPPABLE || code == MAP_MULTIPLE_AVAIL)
		return code;
	return (code & 0x8000) ? (code & 0x7fff) : MAP_UNMAPPABLE;
}

/* JIS X 0213:2004 plane 1 encoder: calls jisx0213_encoder() with a NULL
 * config and keeps only codes whose 0x8000 bit is clear. The two sentinel
 * results are passed through unchanged. */
static DBCHAR
jisx0213_2004_1_encoder(const ucs4_t *data, Py_ssize_t *length)
{
	const DBCHAR code = jisx0213_encoder(data, length, NULL);

	if (code == MAP_UNMAPPABLE || code == MAP_MULTIPLE_AVAIL)
		return code;
	return (code & 0x8000) ? MAP_UNMAPPABLE : code;
}

/* JIS X 0213:2004 encoder that only ever yields character pairs.
 * Called with *length == 1 it reports MAP_MULTIPLE_AVAIL or nothing;
 * called with *length == 2 it returns the code only if jisx0213_encoder()
 * really consumed both characters (left *length at 2). Everything else is
 * MAP_UNMAPPABLE. */
static DBCHAR
jisx0213_2004_1_encoder_paironly(const ucs4_t *data, Py_ssize_t *length)
{
	const Py_ssize_t requested = *length;
	const DBCHAR code = jisx0213_encoder(data, length, NULL);

	if (requested == 1 && code == MAP_MULTIPLE_AVAIL)
		return MAP_MULTIPLE_AVAIL;
	if (requested == 2 && *length == 2)
		return code;
	return MAP_UNMAPPABLE;
}

/* JIS X 0213:2004 plane 2 encoder: calls jisx0213_encoder() with a NULL
 * config and keeps only codes whose 0x8000 bit is set, returning them with
 * that bit stripped. The two sentinel results are passed through
 * unchanged. */
static DBCHAR
jisx0213_2004_2_encoder(const ucs4_t *data, Py_ssize_t *length)
{
	const DBCHAR code = jisx0213_encoder(data, length, NULL);

	if (code == MAP_UNMAPPABLE || code == MAP_MULTIPLE_AVAIL)
		return code;
	return (code & 0x8000) ? (code & 0x7fff) : MAP_UNMAPPABLE;
}

/* Decode the single byte at `data` with JISX0201_R_DECODE; returns the
 * result, or MAP_UNMAPPABLE when the macro does not assign one. */
static ucs4_t
jisx0201_r_decoder(const unsigned char *data)
{
	ucs4_t ch;

	JISX0201_R_DECODE(*data, ch)
	else
		return MAP_UNMAPPABLE;
	return ch;
}

/* Encode *data with JISX0201_R_ENCODE; returns the code, or
 * MAP_UNMAPPABLE when the macro does not assign one. `length` is unused. */
static DBCHAR
jisx0201_r_encoder(const ucs4_t *data, Py_ssize_t *length)
{
	DBCHAR code;

	JISX0201_R_ENCODE(*data, code)
	else
		return MAP_UNMAPPABLE;
	return code;
}

/* Decode the single byte at `data` with JISX0201_K_DECODE. The byte's high
 * bit is flipped before it is handed to the macro. Returns the result, or
 * MAP_UNMAPPABLE when the macro does not assign one. */
static ucs4_t
jisx0201_k_decoder(const unsigned char *data)
{
	ucs4_t ch;

	JISX0201_K_DECODE(*data ^ 0x80, ch)
	else
		return MAP_UNMAPPABLE;
	return ch;
}

/* Encode *data with JISX0201_K_ENCODE and return the code lowered by 0x80,
 * or MAP_UNMAPPABLE when the macro does not assign one. `length` is
 * unused. */
static DBCHAR
jisx0201_k_encoder(const ucs4_t *data, Py_ssize_t *length)
{
	DBCHAR code;

	JISX0201_K_ENCODE(*data, code)
	else
		return MAP_UNMAPPABLE;
	return code - 0x80;
}

/* Import the gbcommon encode map and the gb2312 decode map on first use.
 * Returns 0 on success (or if already done), -1 if an import fails. */
static int
gb2312_init(void)
{
	static int loaded = 0;

	if (loaded)
		return 0;

	if (IMPORT_MAP(cn, gbcommon, &gbcommon_encmap, NULL) ||
	    IMPORT_MAP(cn, gb2312, NULL, &gb2312_decmap))
		return -1;

	loaded = 1;
	return 0;
}

/* Decode the two bytes at `data` through the gb2312 table; returns the
 * code point, or MAP_UNMAPPABLE when the table has no entry. */
static ucs4_t
gb2312_decoder(const unsigned char *data)
{
	ucs4_t ch;

	TRYMAP_DEC(gb2312, ch, data[0], data[1])
		return ch;
	return MAP_UNMAPPABLE;
}

/* Encode *data through the gbcommon table. Only BMP characters are looked
 * up, and codes with the 0x8000 bit set are rejected. Returns the code or
 * MAP_UNMAPPABLE. */
static DBCHAR
gb2312_encoder(const ucs4_t *data, Py_ssize_t *length)
{
	DBCHAR code;

	assert(*length == 1);
	if (*data >= 0x10000)
		return MAP_UNMAPPABLE;

	TRYMAP_ENC(gbcommon, code, *data) {
		if ((code & 0x8000) == 0)
			return code;
	}
	return MAP_UNMAPPABLE;
}


/* Placeholder decoder for registry entries that are never decoded through
 * the designation table: it maps nothing. */
static ucs4_t
dummy_decoder(const unsigned char *data)
{
	(void)data;	/* intentionally unused */
	return MAP_UNMAPPABLE;
}

/* Placeholder encoder for registry entries that are never encoded through
 * the designation table: it maps nothing. */
static DBCHAR
dummy_encoder(const ucs4_t *data, Py_ssize_t *length)
{
	(void)data;	/* intentionally unused */
	(void)length;	/* intentionally unused */
	return MAP_UNMAPPABLE;
}

/*-*- registry tables -*-*/

/* Each REGISTRY_* macro is an initializer for one struct
 * iso2022_designation, in field order:
 *   { mark, plane, width, initializer, decoder, encoder }
 * The _G0/_G1 suffix on KSX1001 names the plane the set is designated to. */
#define REGISTRY_KSX1001_G0	{ CHARSET_KSX1001, 0, 2,		\
				  ksx1001_init,				\
				  ksx1001_decoder, ksx1001_encoder }
#define REGISTRY_KSX1001_G1	{ CHARSET_KSX1001, 1, 2,		\
				  ksx1001_init,				\
				  ksx1001_decoder, ksx1001_encoder }
#define REGISTRY_JISX0201_R	{ CHARSET_JISX0201_R, 0, 1,		\
				  NULL,					\
				  jisx0201_r_decoder, jisx0201_r_encoder }
#define REGISTRY_JISX0201_K	{ CHARSET_JISX0201_K, 0, 1,		\
				  NULL,					\
				  jisx0201_k_decoder, jisx0201_k_encoder }
#define REGISTRY_JISX0208	{ CHARSET_JISX0208, 0, 2,		\
				  jisx0208_init,			\
				  jisx0208_decoder, jisx0208_encoder }
/* Same callbacks as REGISTRY_JISX0208, registered under the '@' mark. */
#define REGISTRY_JISX0208_O	{ CHARSET_JISX0208_O, 0, 2,		\
				  jisx0208_init,			\
				  jisx0208_decoder, jisx0208_encoder }
#define REGISTRY_JISX0212	{ CHARSET_JISX0212, 0, 2,		\
				  jisx0212_init,			\
				  jisx0212_decoder, jisx0212_encoder }
/* The _PAIRONLY variants share mark and decoder with the plain entry but
 * use an encoder that only accepts two-character sequences. */
#define REGISTRY_JISX0213_2000_1 { CHARSET_JISX0213_2000_1, 0, 2,	\
				  jisx0213_init,			\
				  jisx0213_2000_1_decoder,		\
				  jisx0213_2000_1_encoder }
#define REGISTRY_JISX0213_2000_1_PAIRONLY { CHARSET_JISX0213_2000_1, 0, 2, \
				  jisx0213_init,			\
				  jisx0213_2000_1_decoder,		\
				  jisx0213_2000_1_encoder_paironly }
#define REGISTRY_JISX0213_2000_2 { CHARSET_JISX0213_2, 0, 2,		\
				  jisx0213_init,			\
				  jisx0213_2000_2_decoder,		\
				  jisx0213_2000_2_encoder }
#define REGISTRY_JISX0213_2004_1 { CHARSET_JISX0213_2004_1, 0, 2,	\
				  jisx0213_init,			\
				  jisx0213_2004_1_decoder,		\
				  jisx0213_2004_1_encoder }
#define REGISTRY_JISX0213_2004_1_PAIRONLY { CHARSET_JISX0213_2004_1, 0, 2, \
				  jisx0213_init,			\
				  jisx0213_2004_1_decoder,		\
				  jisx0213_2004_1_encoder_paironly }
/* Plane 2 uses the same mark (CHARSET_JISX0213_2) in both editions. */
#define REGISTRY_JISX0213_2004_2 { CHARSET_JISX0213_2, 0, 2,		\
				  jisx0213_init,			\
				  jisx0213_2004_2_decoder,		\
				  jisx0213_2004_2_encoder }
#define REGISTRY_GB2312		{ CHARSET_GB2312, 0, 2,			\
				  gb2312_init,				\
				  gb2312_decoder, gb2312_encoder }
/* NOTE(review): the cns11643_* functions named by the next two macros are
 * not defined in the part of the file visible here, and no designation
 * table below expands these macros. */
#define REGISTRY_CNS11643_1	{ CHARSET_CNS11643_1, 1, 2,		\
				  cns11643_init,			\
				  cns11643_1_decoder, cns11643_1_encoder }
#define REGISTRY_CNS11643_2	{ CHARSET_CNS11643_2, 2, 2,		\
				  cns11643_init,			\
				  cns11643_2_decoder, cns11643_2_encoder }
/* G2-only sets: registered so that their designation is accepted, but the
 * callbacks map nothing (SS2 input is decoded by iso2022processg2). */
#define REGISTRY_ISO8859_1	{ CHARSET_ISO8859_1, 2, 1,		\
				  NULL, dummy_decoder, dummy_encoder }
#define REGISTRY_ISO8859_7	{ CHARSET_ISO8859_7, 2, 1,		\
				  NULL, dummy_decoder, dummy_encoder }
/* mark == 0 terminates a designation table */
#define REGISTRY_SENTINEL	{ 0, }
/* Define iso2022_<var>_config from the flag set `attrs` and the already
 * declared iso2022_<var>_designations array. */
#define CONFIGDEF(var, attrs)						\
	static const struct iso2022_config iso2022_##var##_config = {	\
		attrs, iso2022_##var##_designations			\
	};

/* Designation table and configuration for each ISO-2022 variant.
 * The order of entries matters for encoding: the encoder tries them from
 * first to last and uses the first one that can encode a character. */

/* kr: KS X 1001 in G1, shifting enabled (no flags). */
static const struct iso2022_designation iso2022_kr_designations[] = {
	REGISTRY_KSX1001_G1, REGISTRY_SENTINEL
};
CONFIGDEF(kr, 0)

static const struct iso2022_designation iso2022_jp_designations[] = {
	REGISTRY_JISX0208, REGISTRY_JISX0201_R, REGISTRY_JISX0208_O,
	REGISTRY_SENTINEL
};
CONFIGDEF(jp, NO_SHIFT | USE_JISX0208_EXT)

static const struct iso2022_designation iso2022_jp_1_designations[] = {
	REGISTRY_JISX0208, REGISTRY_JISX0212, REGISTRY_JISX0201_R,
	REGISTRY_JISX0208_O, REGISTRY_SENTINEL
};
CONFIGDEF(jp_1, NO_SHIFT | USE_JISX0208_EXT)

/* jp_2 is the only variant with USE_G2 and the ISO 8859 G2 sets. */
static const struct iso2022_designation iso2022_jp_2_designations[] = {
	REGISTRY_JISX0208, REGISTRY_JISX0212, REGISTRY_KSX1001_G0,
	REGISTRY_GB2312, REGISTRY_JISX0201_R, REGISTRY_JISX0208_O,
	REGISTRY_ISO8859_1, REGISTRY_ISO8859_7, REGISTRY_SENTINEL
};
CONFIGDEF(jp_2, NO_SHIFT | USE_G2 | USE_JISX0208_EXT)

/* The pair-only entry comes first so that two-character sequences are
 * tried before JIS X 0208 gets a chance at the first character alone. */
static const struct iso2022_designation iso2022_jp_2004_designations[] = {
	REGISTRY_JISX0213_2004_1_PAIRONLY, REGISTRY_JISX0208,
	REGISTRY_JISX0213_2004_1, REGISTRY_JISX0213_2004_2, REGISTRY_SENTINEL
};
CONFIGDEF(jp_2004, NO_SHIFT | USE_JISX0208_EXT)

/* jp_3 mirrors jp_2004 with the JIS X 0213:2000 entries. */
static const struct iso2022_designation iso2022_jp_3_designations[] = {
	REGISTRY_JISX0213_2000_1_PAIRONLY, REGISTRY_JISX0208,
	REGISTRY_JISX0213_2000_1, REGISTRY_JISX0213_2000_2, REGISTRY_SENTINEL
};
CONFIGDEF(jp_3, NO_SHIFT | USE_JISX0208_EXT)

/* jp_ext is jp_1 plus the JIS X 0201 'I' (K) set. */
static const struct iso2022_designation iso2022_jp_ext_designations[] = {
	REGISTRY_JISX0208, REGISTRY_JISX0212, REGISTRY_JISX0201_R,
	REGISTRY_JISX0201_K, REGISTRY_JISX0208_O, REGISTRY_SENTINEL
};
CONFIGDEF(jp_ext, NO_SHIFT | USE_JISX0208_EXT)


/* This module exports no mapping tables of its own; it imports them from
 * the kr/jp/cn modules through IMPORT_MAP in the *_init() functions. */
BEGIN_MAPPINGS_LIST
  /* no mapping table here */
END_MAPPINGS_LIST

/* One codec list entry: the name "iso2022_<variation>", that variant's
 * config, the shared init function and the shared stateful methods.
 * NOTE(review): the entry layout is dictated by BEGIN_CODECS_LIST /
 * _STATEFUL_METHODS in cjkcodecs.h, which is not visible here. */
#define ISO2022_CODEC(variation) {		\
	"iso2022_" #variation,			\
	&iso2022_##variation##_config,		\
	iso2022_codec_init,			\
	_STATEFUL_METHODS(iso2022)		\
},

BEGIN_CODECS_LIST
  ISO2022_CODEC(kr)
  ISO2022_CODEC(jp)
  ISO2022_CODEC(jp_1)
  ISO2022_CODEC(jp_2)
  ISO2022_CODEC(jp_2004)
  ISO2022_CODEC(jp_3)
  ISO2022_CODEC(jp_ext)
END_CODECS_LIST

I_AM_A_MODULE_FOR(iso2022)
back to top