Staging
v0.5.1
https://github.com/python/cpython
Revision b43dbcb4e873dfebbf07654d732d2bc0033f4605 authored by cvs2svn on 08 July 2004, 10:24:04 UTC, committed by cvs2svn on 08 July 2004, 10:24:04 UTC
1 parent 3970c11
Raw File
Tip revision: b43dbcb4e873dfebbf07654d732d2bc0033f4605 authored by cvs2svn on 08 July 2004, 10:24:04 UTC
This commit was manufactured by cvs2svn to create tag 'r24a1'.
Tip revision: b43dbcb
iso2022common.h
/*
 * iso2022common.h: Common Codec Routines for ISO-2022 codecs.
 *
 * Written by Hye-Shik Chang <perky@FreeBSD.org>
 * $CJKCodecs: iso2022common.h,v 1.8 2003/12/31 05:46:55 perky Exp $
 */

/* This ISO-2022 implementation is intended to comply ECMA-43 Level 1
 * rather than RFCs itself */

#define ESC     0x1b
#define SO      0x0e
#define SI      0x0f

#define MAX_ESCSEQLEN       16

#define IS_ESCEND(c)        (((c) >= 'A' && (c) <= 'Z') || (c) == '@')
#define IS_ISO2022ESC(c2)   ((c2) == '(' || (c2) == ')' || (c2) == '$' || \
                             (c2) == '.' || (c2) == '&')
        /* this is not a full list of ISO-2022 escape sequence headers.
         * but, it's enough to implement CJK instances of iso-2022. */

/* STATE

  state->c[0-3]

        00000000
        ||^^^^^|
        |+-----+----  G0-3 Character Set
        +-----------  Is G0-3 double byte?

  state->c[4]

        00000000
              ||
              |+----  Locked-Shift?
              +-----  ESC Throughout
*/

#define CHARSET_DOUBLEBYTE  0x80

#define CHARSET_ASCII       'B'

#define CHARSET_ISO8859_1   'A'
#define CHARSET_ISO8859_7   'F'

#define CHARSET_KSX1001     ('C'|CHARSET_DOUBLEBYTE)

#define CHARSET_JISX0201_R  'J'
#define CHARSET_JISX0201_K  'I'
#define CHARSET_JISX0208    ('B'|CHARSET_DOUBLEBYTE)
#define CHARSET_JISX0208_O  ('@'|CHARSET_DOUBLEBYTE)
#define CHARSET_JISX0212    ('D'|CHARSET_DOUBLEBYTE)
#define CHARSET_JISX0213_1  ('O'|CHARSET_DOUBLEBYTE)
#define CHARSET_JISX0213_2  ('P'|CHARSET_DOUBLEBYTE)

#define CHARSET_GB2312      ('A'|CHARSET_DOUBLEBYTE)
#define CHARSET_GB2312_8565 ('E'|CHARSET_DOUBLEBYTE)

#define CHARSET_DESIGN(c)   ((c) & 0x7f)
#define CHARSET_ISDBCS(c)   ((c) & 0x80)

#define F_SHIFTED           0x01
#define F_ESCTHROUGHOUT     0x02

#define STATE_SETG(dn, s, v)    ((s)->c[dn]) = (v);
#define STATE_GETG(dn, s)       ((s)->c[dn])

#define STATE_SETG0(s, v)   STATE_SETG(0, s, v)
#define STATE_GETG0(s)      STATE_GETG(0, s)
#define STATE_SETG1(s, v)   STATE_SETG(1, s, v)
#define STATE_GETG1(s)      STATE_GETG(1, s)
#define STATE_SETG2(s, v)   STATE_SETG(2, s, v)
#define STATE_GETG2(s)      STATE_GETG(2, s)
#define STATE_SETG3(s, v)   STATE_SETG(3, s, v)
#define STATE_GETG3(s)      STATE_GETG(3, s)

#define STATE_SETFLAG(s, f)     ((s)->c[4]) |= (f);
#define STATE_GETFLAG(s, f)     ((s)->c[4] & (f))
#define STATE_CLEARFLAG(s, f)   ((s)->c[4]) &= ~(f);
#define STATE_CLEARFLAGS(s)     ((s)->c[4]) = 0;

#define ISO2022_GETCHARSET(charset, c1)                             \
    if ((c) >= 0x80)                                                \
        return 1;                                                   \
    if (STATE_GETFLAG(state, F_SHIFTED)) /* G1 */                   \
        (charset) = STATE_GETG1(state);                             \
    else /* G1 */                                                   \
        (charset) = STATE_GETG0(state);                             \

#ifdef ISO2022_USE_G2_DESIGNATION
/* hardcoded for iso-2022-jp-2 for now. we'll need to generalize it
   when we have more G2 designating encodings */
#define SS2_ROUTINE                                                 \
    if (IN2 == 'N') { /* SS2 */                                     \
        RESERVE_INBUF(3)                                            \
        if (STATE_GETG2(state) == CHARSET_ISO8859_1) {              \
            ISO8859_1_DECODE(IN3 ^ 0x80, **outbuf)                  \
            else return 3;                                          \
        } else if (STATE_GETG2(state) == CHARSET_ISO8859_7) {       \
            ISO8859_7_DECODE(IN3 ^ 0x80, **outbuf)                  \
            else return 3;                                          \
        } else if (STATE_GETG2(state) == CHARSET_ASCII) {           \
            if (IN3 & 0x80) return 3;                               \
            else **outbuf = IN3;                                    \
        } else                                                      \
            return MBERR_INTERNAL;                                  \
        NEXT(3, 1)                                                  \
    } else
#else
#define SS2_ROUTINE
#endif

#ifndef ISO2022_NO_SHIFT
#define SHIFT_CASES                                                 \
    case SI:                                                        \
        STATE_CLEARFLAG(state, F_SHIFTED)                           \
        NEXT_IN(1)                                                  \
        break;                                                      \
    case SO:                                                        \
        STATE_SETFLAG(state, F_SHIFTED)                             \
        NEXT_IN(1)                                                  \
        break;
#else
/* for compatibility with JapaneseCodecs */
#define SHIFT_CASES
#endif

#define ISO2022_BASECASES(c1)                                       \
    case ESC:                                                       \
        RESERVE_INBUF(2)                                            \
        if (IS_ISO2022ESC(IN2)) {                                   \
            int err;                                                \
            err = iso2022processesc(state, inbuf, &inleft);         \
            if (err != 0)                                           \
                return err;                                         \
        } else SS2_ROUTINE {                                        \
            STATE_SETFLAG(state, F_ESCTHROUGHOUT)                   \
            OUT1(ESC)                                               \
            NEXT(1, 1)                                              \
        }                                                           \
        break;                                                      \
    SHIFT_CASES                                                     \
    case '\n':                                                      \
        STATE_CLEARFLAG(state, F_SHIFTED)                           \
        WRITE1('\n')                                                \
        NEXT(1, 1)                                                  \
        break;

#define ISO2022_ESCTHROUGHOUT(c)                                    \
    if (STATE_GETFLAG(state, F_ESCTHROUGHOUT)) {                    \
        /* ESC throughout mode: for non-iso2022 escape sequences */ \
        RESERVE_OUTBUF(1)                                           \
        OUT1(c) /* assume as ISO-8859-1 */                          \
        NEXT(1, 1)                                                  \
        if (IS_ESCEND(c)) {                                         \
            STATE_CLEARFLAG(state, F_ESCTHROUGHOUT)                 \
        }                                                           \
        continue;                                                   \
    }

#define ISO2022_LOOP_BEGIN                                          \
    while (inleft > 0) {                                            \
        unsigned char c = IN1;                                      \
        ISO2022_ESCTHROUGHOUT(c)                                    \
        switch(c) {                                                 \
        ISO2022_BASECASES(c)                                        \
        default:                                                    \
            if (c < 0x20) { /* C0 */                                \
                RESERVE_OUTBUF(1)                                   \
                OUT1(c)                                             \
                NEXT(1, 1)                                          \
            } else if (c >= 0x80)                                   \
                return 1;                                           \
            else {
#define ISO2022_LOOP_END                                            \
            }                                                       \
        }                                                           \
    }

static int
iso2022processesc(MultibyteCodec_State *state,
                  const unsigned char **inbuf, size_t *inleft)
{
    unsigned char charset, designation;
    size_t  i, esclen;

    for (i = 1;i < MAX_ESCSEQLEN;i++) {
        if (i >= *inleft)
            return MBERR_TOOFEW;
        if (IS_ESCEND((*inbuf)[i])) {
            esclen = i + 1;
            break;
        }
#ifdef ISO2022_USE_JISX0208EXT
        else if (i+1 < *inleft && (*inbuf)[i] == '&' && (*inbuf)[i+1] == '@')
            i += 2;
#endif
    }

    if (i >= MAX_ESCSEQLEN)
        return 1; /* unterminated escape sequence */

    switch (esclen) {
    case 3:
        if (IN2 == '$') {
            charset = IN3 | CHARSET_DOUBLEBYTE;
            designation = 0;
        } else {
            charset = IN3;
            if (IN2 == '(') designation = 0;
            else if (IN2 == ')') designation = 1;
#ifdef ISO2022_USE_G2_DESIGNATION
            else if (IN2 == '.') designation = 2;
#endif
            else return 3;
        }
        break;
    case 4:
        if (IN2 != '$')
            return 4;

        charset = IN4 | CHARSET_DOUBLEBYTE;
        if (IN3 == '(') designation = 0;
        else if (IN3 == ')') designation = 1;
        else return 4;
        break;
#ifdef ISO2022_USE_JISX0208EXT
    case 6: /* designation with prefix */
        if ((*inbuf)[3] == ESC && (*inbuf)[4] == '$' && (*inbuf)[5] == 'B') {
            charset = 'B' | CHARSET_DOUBLEBYTE;
            designation = 0;
        } else
            return 6;
        break;
#endif
    default:
        return esclen;
    }

    { /* raise error when the charset is not designated for this encoding */
        const unsigned char dsgs[] = {ISO2022_DESIGNATIONS, '\x00'};

        for (i = 0; dsgs[i] != '\x00'; i++)
            if (dsgs[i] == charset)
                break;

        if (dsgs[i] == '\x00')
            return esclen;
    }

    STATE_SETG(designation, state, charset)
    *inleft -= esclen;
    (*inbuf) += esclen;
    return 0;
}
back to top