/* ------------------------------------------------------------------------

   unicodedata -- Provides access to the Unicode database.

   Data was extracted from the UnicodeData.txt file.
   The current version number is reported in the unidata_version constant.

   Written by Marc-Andre Lemburg (mal@lemburg.com).
   Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
   Modified by Martin v. Löwis (martin@v.loewis.de)

   Copyright (c) Corporation for National Research Initiatives.

   ------------------------------------------------------------------------ */

#include "Python.h"
#include "ucnhash.h"
#include "structmember.h"

/* character properties */

typedef struct {
    const unsigned char category;       /* index into
                                           _PyUnicode_CategoryNames */
    const unsigned char combining;      /* combining class value 0 - 255 */
    const unsigned char bidirectional;  /* index into
                                           _PyUnicode_BidirectionalNames */
    const unsigned char mirrored;       /* true if mirrored in bidir mode */
    const unsigned char east_asian_width;       /* index into
                                                   _PyUnicode_EastAsianWidth */
    const unsigned char normalization_quick_check; /* see is_normalized() */
} _PyUnicode_DatabaseRecord;

/* Per-character delta describing how an older database version differs from
   the current one.  The property functions below treat 0xFF in a *_changed
   field as "unchanged" and category_changed == 0 as "unassigned in the old
   version". */
typedef struct change_record {
    /* sequence of fields should be the same as in merge_old_version */
    const unsigned char bidir_changed;
    const unsigned char category_changed;
    const unsigned char decimal_changed;
    const unsigned char mirrored_changed;
    const double numeric_changed;
} change_record;

/* data file generated by Tools/unicode/makeunicodedata.py */
#include "unicodedata_db.h"

/* Return the current-version property record for a code point.

   Code points >= 0x110000 map to record 0.  Everything else goes through the
   two-level index1/index2 table from unicodedata_db.h.

   NOTE(review): the index2 expression was lost in the extracted source and
   has been restored to match the two-level lookups used elsewhere in this
   file; verify against upstream. */
static const _PyUnicode_DatabaseRecord*
_getrecord_ex(Py_UCS4 code)
{
    int index;
    if (code >= 0x110000)
        index = 0;
    else {
        index = index1[(code>>SHIFT)];
        index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))];
    }

    return &_PyUnicode_Database_Records[index];
}

/* ------------- Previous-version API ------------------------------------- */

/* Object exposing an older version of the database.  It carries the version
   string plus two callbacks: one returning the change_record for a code
   point, one returning a normalization override (0 when there is none).

   NOTE(review): this typedef and the get_old_record macro were partly lost
   in the extracted source; the fields are restored from how
   new_previous_version(), DB_members and nfd_nfkd() use them. */
typedef struct previous_version {
    PyObject_HEAD
    const char *name;
    const change_record* (*getrecord)(Py_UCS4);
    Py_UCS4 (*normalization)(Py_UCS4);
} PreviousDBVersion;

#define get_old_record(self, v)    ((((PreviousDBVersion*)self)->getrecord)(v))

static PyMemberDef DB_members[] = {
        {"unidata_version", T_STRING, offsetof(PreviousDBVersion, name), READONLY},
        {NULL}
};

/* forward declaration */
static PyTypeObject UCD_Type;
#define UCD_Check(o) (Py_TYPE(o)==&UCD_Type)

/* Create a UCD object for an older database version.

   name          -- version string; the pointer is stored, not copied
   getrecord     -- callback returning the change_record for a code point
   normalization -- callback returning a replacement code point, or 0

   Returns a new reference, or NULL if allocation fails. */
static PyObject*
new_previous_version(const char*name, const change_record* (*getrecord)(Py_UCS4),
                     Py_UCS4 (*normalization)(Py_UCS4))
{
    PreviousDBVersion *self;
    self = PyObject_New(PreviousDBVersion, &UCD_Type);
    if (self == NULL)
        return NULL;
    self->name = name;
    self->getrecord = getrecord;
    self->normalization = normalization;
    return (PyObject*)self;
}


/* Return the single code point held by a one-character string.

   Returns (Py_UCS4)-1 on failure: either PyUnicode_READY() failed, or the
   string is not exactly one character long, in which case TypeError is set.
   The string is readied once; the original readied it a second time inside
   the length check, which was redundant. */
static Py_UCS4 getuchar(PyUnicodeObject *obj)
{
    if (PyUnicode_READY(obj))
        return (Py_UCS4)-1;
    if (PyUnicode_GET_LENGTH(obj) == 1)
        return PyUnicode_READ_CHAR(obj, 0);
    PyErr_SetString(PyExc_TypeError,
                    "need a single Unicode character as parameter");
    return (Py_UCS4)-1;
}

/* --- Module API --------------------------------------------------------- */

PyDoc_STRVAR(unicodedata_decimal__doc__,
"decimal(unichr[, default])\n\
\n\
Returns the decimal value assigned to the Unicode character unichr\n\
as integer. If no such value is defined, default is returned, or, if\n\
not given, ValueError is raised.");

/* decimal(unichr[, default]): decimal value of a character.

   When self is a UCD (old-version) object its change record is consulted
   first: unassigned characters have no value, and a decimal_changed other
   than 0xFF overrides the current database. */
static PyObject *
unicodedata_decimal(PyObject *self, PyObject *args)
{
    PyUnicodeObject *v;
    PyObject *defobj = NULL;
    int have_old = 0;
    long rc;
    Py_UCS4 c;

    if (!PyArg_ParseTuple(args, "O!|O:decimal", &PyUnicode_Type, &v, &defobj))
        return NULL;
    c = getuchar(v);
    if (c == (Py_UCS4)-1)
        return NULL;

    if (self && UCD_Check(self)) {
        const change_record *old = get_old_record(self, c);
        if (old->category_changed == 0) {
            /* unassigned */
            have_old = 1;
            rc = -1;
        }
        else if (old->decimal_changed != 0xFF) {
            have_old = 1;
            rc = old->decimal_changed;
        }
    }

    if (!have_old)
        rc = Py_UNICODE_TODECIMAL(c);
    if (rc < 0) {
        if (defobj == NULL) {
            PyErr_SetString(PyExc_ValueError,
                            "not a decimal");
            return NULL;
        }
        else {
            Py_INCREF(defobj);
            return defobj;
        }
    }
    return PyLong_FromLong(rc);
}

PyDoc_STRVAR(unicodedata_digit__doc__,
"digit(unichr[, default])\n\
\n\
Returns the digit value assigned to the Unicode character unichr as\n\
integer. If no such value is defined, default is returned, or, if\n\
not given, ValueError is raised.");

/* digit(unichr[, default]): digit value of a character.  Always uses the
   current database; the old-version change record is not consulted. */
static PyObject *
unicodedata_digit(PyObject *self, PyObject *args)
{
    PyUnicodeObject *v;
    PyObject *defobj = NULL;
    long rc;
    Py_UCS4 c;

    if (!PyArg_ParseTuple(args, "O!|O:digit", &PyUnicode_Type, &v, &defobj))
        return NULL;
    c = getuchar(v);
    if (c == (Py_UCS4)-1)
        return NULL;
    rc = Py_UNICODE_TODIGIT(c);
    if (rc < 0) {
        if (defobj == NULL) {
            PyErr_SetString(PyExc_ValueError, "not a digit");
            return NULL;
        }
        else {
            Py_INCREF(defobj);
            return defobj;
        }
    }
    return PyLong_FromLong(rc);
}

PyDoc_STRVAR(unicodedata_numeric__doc__,
"numeric(unichr[, default])\n\
\n\
Returns the numeric value assigned to the Unicode character unichr\n\
as float. If no such value is defined, default is returned, or, if\n\
not given, ValueError is raised.");

/* numeric(unichr[, default]): numeric value of a character as a float.

   NOTE(review): the old-version branch reads decimal_changed, not
   numeric_changed, although change_record has both -- confirm this is
   intended.  Behaviour is left unchanged here. */
static PyObject *
unicodedata_numeric(PyObject *self, PyObject *args)
{
    PyUnicodeObject *v;
    PyObject *defobj = NULL;
    int have_old = 0;
    double rc;
    Py_UCS4 c;

    if (!PyArg_ParseTuple(args, "O!|O:numeric", &PyUnicode_Type, &v, &defobj))
        return NULL;
    c = getuchar(v);
    if (c == (Py_UCS4)-1)
        return NULL;

    if (self && UCD_Check(self)) {
        const change_record *old = get_old_record(self, c);
        if (old->category_changed == 0) {
            /* unassigned */
            have_old = 1;
            rc = -1.0;
        }
        else if (old->decimal_changed != 0xFF) {
            have_old = 1;
            rc = old->decimal_changed;
        }
    }

    if (!have_old)
        rc = Py_UNICODE_TONUMERIC(c);
    if (rc == -1.0) {
        if (defobj == NULL) {
            PyErr_SetString(PyExc_ValueError, "not a numeric character");
            return NULL;
        }
        else {
            Py_INCREF(defobj);
            return defobj;
        }
    }
    return PyFloat_FromDouble(rc);
}

PyDoc_STRVAR(unicodedata_category__doc__,
"category(unichr)\n\
\n\
Returns the general category assigned to the Unicode character\n\
unichr as string.");

/* category(unichr): general category name.  For an old-version object a
   category_changed other than 0xFF replaces the current category index. */
static PyObject *
unicodedata_category(PyObject *self, PyObject *args)
{
    PyUnicodeObject *v;
    int index;
    Py_UCS4 c;

    if (!PyArg_ParseTuple(args, "O!:category",
                          &PyUnicode_Type, &v))
        return NULL;
    c = getuchar(v);
    if (c == (Py_UCS4)-1)
        return NULL;
    index = (int) _getrecord_ex(c)->category;
    if (self && UCD_Check(self)) {
        const change_record *old = get_old_record(self, c);
        if (old->category_changed != 0xFF)
            index = old->category_changed;
    }
    return PyUnicode_FromString(_PyUnicode_CategoryNames[index]);
}

PyDoc_STRVAR(unicodedata_bidirectional__doc__,
"bidirectional(unichr)\n\
\n\
Returns the bidirectional class assigned to the Unicode character\n\
unichr as string. If no such value is defined, an empty string is\n\
returned.");

/* bidirectional(unichr): bidirectional class name.  For an old-version
   object, unassigned characters use index 0 and a bidir_changed other than
   0xFF replaces the current index. */
static PyObject *
unicodedata_bidirectional(PyObject *self, PyObject *args)
{
    PyUnicodeObject *v;
    int index;
    Py_UCS4 c;

    if (!PyArg_ParseTuple(args, "O!:bidirectional",
                          &PyUnicode_Type, &v))
        return NULL;
    c = getuchar(v);
    if (c == (Py_UCS4)-1)
        return NULL;
    index = (int) _getrecord_ex(c)->bidirectional;
    if (self && UCD_Check(self)) {
        const change_record *old = get_old_record(self, c);
        if (old->category_changed == 0)
            index = 0; /* unassigned */
        else if (old->bidir_changed != 0xFF)
            index = old->bidir_changed;
    }
    return PyUnicode_FromString(_PyUnicode_BidirectionalNames[index]);
}

PyDoc_STRVAR(unicodedata_combining__doc__,
"combining(unichr)\n\
\n\
Returns the canonical combining class assigned to the Unicode\n\
character unichr as integer. Returns 0 if no combining class is\n\
defined.");

/* combining(unichr): canonical combining class.  Returns 0 for characters
   that are unassigned in the requested old version. */
static PyObject *
unicodedata_combining(PyObject *self, PyObject *args)
{
    PyUnicodeObject *v;
    int index;
    Py_UCS4 c;

    if (!PyArg_ParseTuple(args, "O!:combining",
                          &PyUnicode_Type, &v))
        return NULL;
    c = getuchar(v);
    if (c == (Py_UCS4)-1)
        return NULL;
    index = (int) _getrecord_ex(c)->combining;
    if (self && UCD_Check(self)) {
        const change_record *old = get_old_record(self, c);
        if (old->category_changed == 0)
            index = 0; /* unassigned */
    }
    return PyLong_FromLong(index);
}

PyDoc_STRVAR(unicodedata_mirrored__doc__,
"mirrored(unichr)\n\
\n\
Returns the mirrored property assigned to the Unicode character\n\
unichr as integer. Returns 1 if the character has been identified as\n\
a \"mirrored\" character in bidirectional text, 0 otherwise.");

/* mirrored(unichr): mirrored flag.  For an old-version object, unassigned
   characters give 0 and a mirrored_changed other than 0xFF overrides the
   current value. */
static PyObject *
unicodedata_mirrored(PyObject *self, PyObject *args)
{
    PyUnicodeObject *v;
    int index;
    Py_UCS4 c;

    if (!PyArg_ParseTuple(args, "O!:mirrored",
                          &PyUnicode_Type, &v))
        return NULL;
    c = getuchar(v);
    if (c == (Py_UCS4)-1)
        return NULL;
    index = (int) _getrecord_ex(c)->mirrored;
    if (self && UCD_Check(self)) {
        const change_record *old = get_old_record(self, c);
        if (old->category_changed == 0)
            index = 0; /* unassigned */
        else if (old->mirrored_changed != 0xFF)
            index = old->mirrored_changed;
    }
    return PyLong_FromLong(index);
}

PyDoc_STRVAR(unicodedata_east_asian_width__doc__,
"east_asian_width(unichr)\n\
\n\
Returns the east asian width assigned to the Unicode character\n\
unichr as string.");

/* east_asian_width(unichr): East Asian width name.  Uses index 0 for
   characters that are unassigned in the requested old version. */
static PyObject *
unicodedata_east_asian_width(PyObject *self, PyObject *args)
{
    PyUnicodeObject *v;
    int index;
    Py_UCS4 c;

    if (!PyArg_ParseTuple(args, "O!:east_asian_width",
                          &PyUnicode_Type, &v))
        return NULL;
    c = getuchar(v);
    if (c == (Py_UCS4)-1)
        return NULL;
    index = (int) _getrecord_ex(c)->east_asian_width;
    if (self && UCD_Check(self)) {
        const change_record *old = get_old_record(self, c);
        if (old->category_changed == 0)
            index = 0; /* unassigned */
    }
    return PyUnicode_FromString(_PyUnicode_EastAsianWidthNames[index]);
}

PyDoc_STRVAR(unicodedata_decomposition__doc__,
"decomposition(unichr)\n\
\n\
Returns the character decomposition mapping assigned to the Unicode\n\
character unichr as string. An empty string is returned in case no\n\
such mapping is defined.");

/* decomposition(unichr): decomposition mapping formatted as text.

   The result is the prefix string selected by the low byte of the
   decomp_data header word, followed by space-separated code points printed
   with "%04X".  Characters unassigned in the requested old version yield
   an empty string.

   NOTE(review): the decomp_index2 expression and the count assignment were
   lost in the extracted source and are restored here; verify against
   upstream. */
static PyObject *
unicodedata_decomposition(PyObject *self, PyObject *args)
{
    PyUnicodeObject *v;
    char decomp[256];
    int code, index, count;
    size_t i;
    unsigned int prefix_index;
    Py_UCS4 c;

    if (!PyArg_ParseTuple(args, "O!:decomposition",
                          &PyUnicode_Type, &v))
        return NULL;
    c = getuchar(v);
    if (c == (Py_UCS4)-1)
        return NULL;

    code = (int)c;

    if (self && UCD_Check(self)) {
        const change_record *old = get_old_record(self, c);
        if (old->category_changed == 0)
            return PyUnicode_FromString(""); /* unassigned */
    }

    if (code < 0 || code >= 0x110000)
        index = 0;
    else {
        index = decomp_index1[(code>>DECOMP_SHIFT)];
        index = decomp_index2[(index<<DECOMP_SHIFT)+
                             (code&((1<<DECOMP_SHIFT)-1))];
    }

    /* high byte is number of hex bytes (usually one or two), low byte
       is prefix code (from decomp_prefix) */
    count = decomp_data[index] >> 8;

    /* XXX: could allocate the PyString up front instead
       (strlen(prefix) + 5 * count + 1 bytes) */

    /* Based on how index is calculated above and decomp_data is generated
       from Tools/unicode/makeunicodedata.py, it should not be possible
       to overflow decomp_prefix. */
    prefix_index = decomp_data[index] & 255;
    assert(prefix_index < Py_ARRAY_LENGTH(decomp_prefix));

    /* copy prefix */
    i = strlen(decomp_prefix[prefix_index]);
    memcpy(decomp, decomp_prefix[prefix_index], i);

    while (count-- > 0) {
        if (i)
            decomp[i++] = ' ';
        assert(i < sizeof(decomp));
        PyOS_snprintf(decomp + i, sizeof(decomp) - i, "%04X",
                      decomp_data[++index]);
        i += strlen(decomp + i);
    }
    return PyUnicode_FromStringAndSize(decomp, i);
}

/* Look up the decomposition record for a code point.

   On return *index is the position in decomp_data of the first decomposed
   code point, *count is how many there are, and *prefix is the prefix code
   (callers treat non-zero as a compatibility decomposition).  Out-of-range
   code points, and characters unassigned in the requested old version, use
   record 0.

   NOTE(review): the decomp_index2 expression and the *count assignment were
   lost in the extracted source and are restored here; verify against
   upstream. */
static void
get_decomp_record(PyObject *self, Py_UCS4 code, int *index, int *prefix, int *count)
{
    if (code >= 0x110000) {
        *index = 0;
    } else if (self && UCD_Check(self) &&
               get_old_record(self, code)->category_changed==0) {
        /* unassigned in old version */
        *index = 0;
    }
    else {
        *index = decomp_index1[(code>>DECOMP_SHIFT)];
        *index = decomp_index2[(*index<<DECOMP_SHIFT)+
                               (code&((1<<DECOMP_SHIFT)-1))];
    }

    /* high byte is number of hex bytes (usually one or two), low byte
       is prefix code (from decomp_prefix) */
    *count = decomp_data[*index] >> 8;
    *prefix = decomp_data[*index] & 255;

    (*index)++;
}

/* Constants for algorithmic Hangul syllable (de)composition. */
#define SBase   0xAC00
#define LBase   0x1100
#define VBase   0x1161
#define TBase   0x11A7
#define LCount  19
#define VCount  21
#define TCount  28
#define NCount  (VCount*TCount)
#define SCount  (LCount*NCount)

/* Decompose `input` (NFD when k == 0, NFKD when k != 0) and put combining
   marks into canonical order.

   `input` must be a ready, non-empty string; unicodedata_normalize()
   guarantees both.  Returns a new string, or NULL with an exception set.

   The work buffer is now allocated with PyMem_New, which rejects sizes whose
   byte count would overflow, and the growth step is guarded the same way. */
static PyObject*
nfd_nfkd(PyObject *self, PyObject *input, int k)
{
    PyObject *result;
    Py_UCS4 *output;
    Py_ssize_t i, o, osize;
    int kind;
    void *data;
    /* Longest decomposition in Unicode 3.2: U+FDFA */
    Py_UCS4 stack[20];
    Py_ssize_t space, isize;
    int index, prefix, count, stackptr;
    unsigned char prev, cur;

    stackptr = 0;
    isize = PyUnicode_GET_LENGTH(input);
    /* Overallocate at most 10 characters. */
    space = (isize > 10 ? 10 : isize) + isize;
    osize = space;
    output = PyMem_New(Py_UCS4, space);
    if (!output) {
        PyErr_NoMemory();
        return NULL;
    }
    i = o = 0;
    kind = PyUnicode_KIND(input);
    data = PyUnicode_DATA(input);

    while (i < isize) {
        stack[stackptr++] = PyUnicode_READ(kind, data, i++);
        while(stackptr) {
            Py_UCS4 code = stack[--stackptr];
            /* Hangul Decomposition adds three characters in
               a single step, so we need at least that much room. */
            if (space < 3) {
                Py_UCS4 *new_output;
                /* Refuse to grow if the new byte count would overflow. */
                if (osize > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UCS4) - 10) {
                    PyMem_Free(output);
                    PyErr_NoMemory();
                    return NULL;
                }
                osize += 10;
                space += 10;
                new_output = PyMem_Realloc(output, osize*sizeof(Py_UCS4));
                if (new_output == NULL) {
                    PyMem_Free(output);
                    PyErr_NoMemory();
                    return NULL;
                }
                output = new_output;
            }
            /* Hangul Decomposition. */
            if (SBase <= code && code < (SBase+SCount)) {
                int SIndex = code - SBase;
                int L = LBase + SIndex / NCount;
                int V = VBase + (SIndex % NCount) / TCount;
                int T = TBase + SIndex % TCount;
                output[o++] = L;
                output[o++] = V;
                space -= 2;
                if (T != TBase) {
                    output[o++] = T;
                    space --;
                }
                continue;
            }
            /* normalization changes */
            if (self && UCD_Check(self)) {
                Py_UCS4 value = ((PreviousDBVersion*)self)->normalization(code);
                if (value != 0) {
                    stack[stackptr++] = value;
                    continue;
                }
            }

            /* Other decompositions. */
            get_decomp_record(self, code, &index, &prefix, &count);

            /* Copy character if it is not decomposable, or has a
               compatibility decomposition, but we do NFD. */
            if (!count || (prefix && !k)) {
                output[o++] = code;
                space--;
                continue;
            }
            /* Copy decomposition onto the stack, in reverse
               order.  */
            while(count) {
                code = decomp_data[index + (--count)];
                stack[stackptr++] = code;
            }
        }
    }

    result = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND,
                                       output, o);
    PyMem_Free(output);
    if (!result)
        return NULL;
    /* result is guaranteed to be ready, as it is compact. */
    kind = PyUnicode_KIND(result);
    data = PyUnicode_DATA(result);

    /* Sort canonically. */
    i = 0;
    prev = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
    for (i++; i < PyUnicode_GET_LENGTH(result); i++) {
        cur = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
        if (prev == 0 || cur == 0 || prev <= cur) {
            prev = cur;
            continue;
        }
        /* Non-canonical order. Need to switch *i with previous. */
        o = i - 1;
        while (1) {
            Py_UCS4 tmp = PyUnicode_READ(kind, data, o+1);
            PyUnicode_WRITE(kind, data, o+1,
                            PyUnicode_READ(kind, data, o));
            PyUnicode_WRITE(kind, data, o, tmp);
            o--;
            if (o < 0)
                break;
            prev = _getrecord_ex(PyUnicode_READ(kind, data, o))->combining;
            if (prev == 0 || prev <= cur)
                break;
        }
        prev = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
    }
    return result;
}

/* Map a code point to its row or column in the composition table.

   `nfc` is a list of ranges terminated by an entry whose start is 0; the
   early `code < start` exit relies on the ranges being in ascending order.
   Returns the table index, or -1 if the code point is in no range. */
static int
find_nfc_index(PyObject *self, struct reindex* nfc, Py_UCS4 code)
{
    unsigned int index;
    for (index = 0; nfc[index].start; index++) {
        unsigned int start = nfc[index].start;
        if (code < start)
            return -1;
        if (code <= start + nfc[index].count) {
            unsigned int delta = code - start;
            return nfc[index].index + delta;
        }
    }
    return -1;
}

/* Compose `input` into NFC (k == 0) or NFKC (k != 0).

   The input is decomposed with nfd_nfkd() and then recomposed.  If nothing
   was composed, the decomposed string itself is returned.  Returns a new
   reference, or NULL with an exception set.

   Fixes relative to the original:
     - Hangul composition accepted a vowel equal to VBase+VCount and a
       trailing consonant equal to TBase or TBase+TCount.  All three lie
       outside the composable jamo ranges and produced a wrong syllable; the
       bounds are now V < VBase+VCount and TBase < T < TBase+TCount.
     - The allocation-failure path returns NULL rather than 0.
     - The buffer is allocated with overflow-checked PyMem_New.

   NOTE(review): everything from the comp_data lookup to the end of this
   function was lost in the extracted source and is restored here; verify
   against upstream. */
static PyObject*
nfc_nfkc(PyObject *self, PyObject *input, int k)
{
    PyObject *result;
    int kind;
    void *data;
    Py_UCS4 *output;
    Py_ssize_t i, i1, o, len;
    int f,l,index,index1,comb;
    Py_UCS4 code;
    Py_ssize_t skipped[20];
    int cskipped = 0;

    result = nfd_nfkd(self, input, k);
    if (!result)
        return NULL;
    /* result will be "ready". */
    kind = PyUnicode_KIND(result);
    data = PyUnicode_DATA(result);
    len = PyUnicode_GET_LENGTH(result);

    /* We allocate a buffer for the output.
       If we find that we made no changes, we still return
       the NFD result. */
    output = PyMem_New(Py_UCS4, len);
    if (!output) {
        PyErr_NoMemory();
        Py_DECREF(result);
        return NULL;
    }
    i = o = 0;

  again:
    while (i < len) {
        for (index = 0; index < cskipped; index++) {
            if (skipped[index] == i) {
                /* *i character is skipped.
                   Remove from list. */
                skipped[index] = skipped[cskipped-1];
                cskipped--;
                i++;
                goto again; /* continue while */
            }
        }
        /* Hangul Composition. We don't need to check for <LV,T>
           pairs, since we always have decomposed data. */
        code = PyUnicode_READ(kind, data, i);
        if (LBase <= code && code < (LBase+LCount) &&
            i + 1 < len &&
            VBase <= PyUnicode_READ(kind, data, i+1) &&
            PyUnicode_READ(kind, data, i+1) < (VBase+VCount)) {
            int LIndex, VIndex;
            LIndex = code - LBase;
            VIndex = PyUnicode_READ(kind, data, i+1) - VBase;
            code = SBase + (LIndex*VCount+VIndex)*TCount;
            i+=2;
            /* TBase itself means "no trailing consonant", so the valid
               trailing range is exclusive at both ends. */
            if (i < len &&
                TBase < PyUnicode_READ(kind, data, i) &&
                PyUnicode_READ(kind, data, i) < (TBase+TCount)) {
                code += PyUnicode_READ(kind, data, i)-TBase;
                i++;
            }
            output[o++] = code;
            continue;
        }

        /* code is still input[i] here */
        f = find_nfc_index(self, nfc_first, code);
        if (f == -1) {
            output[o++] = code;
            i++;
            continue;
        }
        /* Find next unblocked character. */
        i1 = i+1;
        comb = 0;
        /* output base character for now; might be updated later. */
        output[o] = PyUnicode_READ(kind, data, i);
        while (i1 < len) {
            Py_UCS4 code1 = PyUnicode_READ(kind, data, i1);
            int comb1 = _getrecord_ex(code1)->combining;
            if (comb) {
                if (comb1 == 0)
                    break;
                if (comb >= comb1) {
                    /* Character is blocked. */
                    i1++;
                    continue;
                }
            }
            l = find_nfc_index(self, nfc_last, code1);
            /* i1 cannot be combined with i. If i1
               is a starter, we don't need to look further.
               Otherwise, record the combining class. */
            if (l == -1) {
              not_combinable:
                if (comb1 == 0)
                    break;
                comb = comb1;
                i1++;
                continue;
            }
            index = f*TOTAL_LAST + l;
            index1 = comp_index[index >> COMP_SHIFT];
            code = comp_data[(index1<<COMP_SHIFT)+
                             (index&((1<<COMP_SHIFT)-1))];
            if (code == 0)
                goto not_combinable;

            /* Replace the original character. */
            output[o] = code;
            /* Mark the second character unused. */
            assert(cskipped < 20);
            skipped[cskipped++] = i1;
            i1++;
            f = find_nfc_index(self, nfc_first, output[o]);
            if (f == -1)
                break;
        }
        /* Output character was already written.
           Just advance the indices. */
        o++; i++;
    }
    if (o == len) {
        /* No changes. Return original string. */
        PyMem_Free(output);
        return result;
    }
    Py_DECREF(result);
    result = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND,
                                       output, o);
    PyMem_Free(output);
    return result;
}

/* Return 1 if the input is certainly normalized, 0 if it might not be.

   `nfc` selects the composed forms and `k` the compatibility forms.  For
   old-version objects the quick check is disabled and 0 is returned.

   NOTE(review): the start of this function (through the `combining`
   declaration) was lost in the extracted source and is restored here; in
   particular the quickcheck_mask bit layout should be verified against
   upstream and makeunicodedata.py. */
static int
is_normalized(PyObject *self, PyObject *input, int nfc, int k)
{
    Py_ssize_t i, len;
    int kind;
    void *data;
    unsigned char prev_combining = 0, quickcheck_mask;

    /* An older version of the database is requested, quickchecks must be
       disabled. */
    if (self && UCD_Check(self))
        return 0;

    /* The two quickcheck bits at this shift mean 0=Yes, 1=Maybe, 2=No,
       as described in http://unicode.org/reports/tr15/#Annex8. */
    quickcheck_mask = 3 << ((nfc ? 4 : 0) + (k ? 2 : 0));

    i = 0;
    kind = PyUnicode_KIND(input);
    data = PyUnicode_DATA(input);
    len = PyUnicode_GET_LENGTH(input);
    while (i < len) {
        Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
        const _PyUnicode_DatabaseRecord *record = _getrecord_ex(ch);
        unsigned char combining = record->combining;
        unsigned char quickcheck = record->normalization_quick_check;

        if (quickcheck & quickcheck_mask)
            return 0; /* this string might need normalization */
        if (combining && prev_combining > combining)
            return 0; /* non-canonical sort order, not normalized */
        prev_combining = combining;
    }
    return 1; /* certainly normalized */
}

PyDoc_STRVAR(unicodedata_normalize__doc__,
"normalize(form, unistr)\n\
\n\
Return the normal form 'form' for the Unicode string unistr.  Valid\n\
values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'.");

/* normalize(form, unistr): dispatch on the form name.

   Empty strings, and strings the quick check proves already normalized, are
   returned as-is with a new reference.  An unknown form raises ValueError. */
static PyObject*
unicodedata_normalize(PyObject *self, PyObject *args)
{
    char *form;
    PyObject *input;

    if(!PyArg_ParseTuple(args, "sO!:normalize",
                         &form, &PyUnicode_Type, &input))
        return NULL;

    if (PyUnicode_READY(input) == -1)
        return NULL;

    if (PyUnicode_GET_LENGTH(input) == 0) {
        /* Special case empty input strings, since resizing
           them  later would cause internal errors. */
        Py_INCREF(input);
        return input;
    }

    if (strcmp(form, "NFC") == 0) {
        if (is_normalized(self, input, 1, 0)) {
            Py_INCREF(input);
            return input;
        }
        return nfc_nfkc(self, input, 0);
    }
    if (strcmp(form, "NFKC") == 0) {
        if (is_normalized(self, input, 1, 1)) {
            Py_INCREF(input);
            return input;
        }
        return nfc_nfkc(self, input, 1);
    }
    if (strcmp(form, "NFD") == 0) {
        if (is_normalized(self, input, 0, 0)) {
            Py_INCREF(input);
            return input;
        }
        return nfd_nfkd(self, input, 0);
    }
    if (strcmp(form, "NFKD") == 0) {
        if (is_normalized(self, input, 0, 1)) {
            Py_INCREF(input);
            return input;
        }
        return nfd_nfkd(self, input, 1);
    }
    PyErr_SetString(PyExc_ValueError, "invalid normalization form");
    return NULL;
}

/* -------------------------------------------------------------------- */
/* unicode character name tables */

/* data file generated by Tools/unicode/makeunicodedata.py */
#include "unicodename_db.h"

/* -------------------------------------------------------------------- */
/* database code (cut and pasted from the unidb package) */

/* Case-insensitive hash of the first `len` bytes of `s`, using multiplier
   `scale`.  After each step, bits above the low 24 are folded back in so the
   value stays within 24 bits. */
static unsigned long
_gethash(const char *s, int len, int scale)
{
    int i;
    unsigned long h = 0;
    unsigned long ix;
    for (i = 0; i < len; i++) {
        h = (h * scale) + (unsigned char) Py_TOUPPER(Py_CHARMASK(s[i]));
        ix = h & 0xff000000;
        if (ix)
            h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff;
    }
    return h;
}

/* Jamo short names, indexed as [jamo index][column], where column 0 is the
   leading consonant, 1 the vowel and 2 the trailing consonant.  Only the
   first LCount, VCount and TCount rows of the respective columns are valid;
   the remaining slots are NULL. */
static const char * const hangul_syllables[][3] = {
    { "G",  "A",   ""   },
    { "GG", "AE",  "G"  },
    { "N",  "YA",  "GG" },
    { "D",  "YAE", "GS" },
    { "DD", "EO",  "N", },
    { "R",  "E",   "NJ" },
    { "M",  "YEO", "NH" },
    { "B",  "YE",  "D"  },
    { "BB", "O",   "L"  },
    { "S",  "WA",  "LG" },
    { "SS", "WAE", "LM" },
    { "",   "OE",  "LB" },
    { "J",  "YO",  "LS" },
    { "JJ", "U",   "LT" },
    { "C",  "WEO", "LP" },
    { "K",  "WE",  "LH" },
    { "T",  "WI",  "M"  },
    { "P",  "YU",  "B"  },
    { "H",  "EU",  "BS" },
    { 0,    "YI",  "S"  },
    { 0,    "I",   "SS" },
    { 0,    0,     "NG" },
    { 0,    0,     "J"  },
    { 0,    0,     "C"  },
    { 0,    0,     "K"  },
    { 0,    0,     "T"  },
    { 0,    0,     "P"  },
    { 0,    0,     "H"  }
};

/* These ranges need to match makeunicodedata.py:cjk_ranges. */
static int
is_unified_ideograph(Py_UCS4 code)
{
    return
        (0x3400 <= code && code <= 0x4DB5)   || /* CJK Ideograph Extension A */
        (0x4E00 <= code && code <= 0x9FCC)   || /* CJK Ideograph */
        (0x20000 <= code && code <= 0x2A6D6) || /* CJK Ideograph Extension B */
        (0x2A700 <= code && code <= 0x2B734) || /* CJK Ideograph Extension C */
        (0x2B740 <= code && code <= 0x2B81D);   /* CJK Ideograph Extension D */
}

/* macros used to determine if the given codepoint is in the PUA range that
 * we are using to store aliases and named sequences */
#define IS_ALIAS(cp) ((cp >= aliases_start) && (cp < aliases_end))
#define IS_NAMED_SEQ(cp) ((cp >= named_sequences_start) && \
                          (cp < named_sequences_end))

/* Write the name of `code` into `buffer` (capacity `buflen` bytes, including
   the terminating NUL).  Returns 1 on success; returns 0 if the code point
   has no name or the buffer is too small.

   Fixes relative to the original:
     - The word-separator check used `i > buflen`, which allowed a write at
       buffer[buflen]; it is now `i >= buflen`.
     - The CJK name is written with bounded PyOS_snprintf instead of sprintf.

   NOTE(review): the phrasebook_offset2 lookup and the start of the word loop
   were lost in the extracted source and are restored here; verify against
   upstream. */
static int
_getucname(PyObject *self, Py_UCS4 code, char* buffer, int buflen,
           int with_alias_and_seq)
{
    /* Find the name associated with the given codepoint.
     * If with_alias_and_seq is 1, check for names in the Private Use Area 15
     * that we are using for aliases and named sequences. */
    int offset;
    int i;
    int word;
    unsigned char* w;

    if (code >= 0x110000)
        return 0;

    /* XXX should we just skip all the codepoints in the PUAs here? */
    if (!with_alias_and_seq && (IS_ALIAS(code) || IS_NAMED_SEQ(code)))
        return 0;

    if (self && UCD_Check(self)) {
        /* in 3.2.0 there are no aliases and named sequences */
        const change_record *old;
        if (IS_ALIAS(code) || IS_NAMED_SEQ(code))
            return 0;
        old = get_old_record(self, code);
        if (old->category_changed == 0) {
            /* unassigned */
            return 0;
        }
    }

    if (SBase <= code && code < SBase+SCount) {
        /* Hangul syllable. */
        int SIndex = code - SBase;
        int L = SIndex / NCount;
        int V = (SIndex % NCount) / TCount;
        int T = SIndex % TCount;

        if (buflen < 27)
            /* Worst case: HANGUL SYLLABLE <10chars>. */
            return 0;
        strcpy(buffer, "HANGUL SYLLABLE ");
        buffer += 16;
        strcpy(buffer, hangul_syllables[L][0]);
        buffer += strlen(hangul_syllables[L][0]);
        strcpy(buffer, hangul_syllables[V][1]);
        buffer += strlen(hangul_syllables[V][1]);
        strcpy(buffer, hangul_syllables[T][2]);
        buffer += strlen(hangul_syllables[T][2]);
        *buffer = '\0';
        return 1;
    }

    if (is_unified_ideograph(code)) {
        if (buflen < 28)
            /* Worst case: CJK UNIFIED IDEOGRAPH-20000 */
            return 0;
        PyOS_snprintf(buffer, buflen, "CJK UNIFIED IDEOGRAPH-%X", code);
        return 1;
    }

    /* get offset into phrasebook */
    offset = phrasebook_offset1[(code>>phrasebook_shift)];
    offset = phrasebook_offset2[(offset<<phrasebook_shift) +
                               (code&((1<<phrasebook_shift)-1))];
    if (!offset)
        return 0;

    i = 0;

    for (;;) {
        /* get word index */
        word = phrasebook[offset] - phrasebook_short;
        if (word >= 0) {
            word = (word << 8) + phrasebook[offset+1];
            offset += 2;
        } else
            word = phrasebook[offset++];
        if (i) {
            if (i >= buflen)
                return 0; /* buffer overflow */
            buffer[i++] = ' ';
        }
        /* copy word string from lexicon.  the last character in the
           word has bit 7 set.  the last word in a string ends with
           0x80 */
        w = lexicon + lexicon_offset[word];
        while (*w < 128) {
            if (i >= buflen)
                return 0; /* buffer overflow */
            buffer[i++] = *w++;
        }
        if (i >= buflen)
            return 0; /* buffer overflow */
        buffer[i++] = *w & 127;
        if (*w == 128)
            break; /* end of word */
    }

    return 1;
}

/* Return 1 if the name of `code` equals name[0..namelen), comparing
   case-insensitively; otherwise return 0.

   Two guards are added.  A name of NAME_MAXLEN bytes or more cannot match
   and would have indexed past `buffer`, so it is rejected up front.  The
   comparison also stops at the buffer's terminator, so a name containing an
   embedded NUL can no longer walk into uninitialized bytes. */
static int
_cmpname(PyObject *self, int code, const char* name, int namelen)
{
    /* check if code corresponds to the given name */
    int i;
    char buffer[NAME_MAXLEN];
    if (namelen < 0 || namelen >= (int)sizeof(buffer))
        return 0;
    if (!_getucname(self, code, buffer, sizeof(buffer), 1))
        return 0;
    for (i = 0; i < namelen; i++) {
        if (buffer[i] == '\0' ||
            Py_TOUPPER(Py_CHARMASK(name[i])) != buffer[i])
            return 0;
    }
    return buffer[namelen] == '\0';
}

/* Find the longest jamo short name in `column` of hangul_syllables that is
   a prefix of `str`, considering the first `count` rows.

   *len receives the matched length (0 if only an empty name, or nothing,
   matched).  *pos receives the matching row and is left untouched when no
   row matches. */
static void
find_syllable(const char *str, int *len, int *pos, int count, int column)
{
    int i, len1;
    *len = -1;
    for (i = 0; i < count; i++) {
        const char *s = hangul_syllables[i][column];
        len1 = Py_SAFE_DOWNCAST(strlen(s), size_t, int);
        if (len1 <= *len)
            continue;
        if (strncmp(str, s, len1) == 0) {
            *len = len1;
            *pos = i;
        }
    }
    if (*len == -1) {
        *len = 0;
    }
}

/* Post-process a code point found in the name hash.

   Returns 0 if `cp` denotes a named sequence and those are not allowed.
   Otherwise stores the real code point in *code, resolving alias
   placeholders through name_aliases, and returns 1. */
static int
_check_alias_and_seq(unsigned int cp, Py_UCS4* code, int with_named_seq)
{
    /* check if named sequences are allowed */
    if (!with_named_seq && IS_NAMED_SEQ(cp))
        return 0;
    /* if the codepoint is in the PUA range that we use for aliases,
     * convert it to obtain the right codepoint */
    if (IS_ALIAS(cp))
        *code = name_aliases[cp-aliases_start];
    else
        *code = cp;
    return 1;
}

/* Look up a character by name.  Returns 1 and stores the code point in
   *code on success, or returns 0 if the name is unknown.

   Hangul syllable names and "CJK UNIFIED IDEOGRAPH-XXXX" names are resolved
   algorithmically; all other names go through the open-addressing hash
   table code_hash. */
static int
_getcode(PyObject* self, const char* name, int namelen, Py_UCS4* code,
         int with_named_seq)
{
    /* Return the codepoint associated with the given name.
     * Named aliases are resolved too (unless self != NULL (i.e. we are using
     * 3.2.0)).  If with_named_seq is 1, returns the PUA codepoint that we are
     * using for the named sequence, and the caller must then convert it. */
    unsigned int h, v;
    unsigned int mask = code_size-1;
    unsigned int i, incr;

    /* Check for hangul syllables. */
    if (strncmp(name, "HANGUL SYLLABLE ", 16) == 0) {
        int len, L = -1, V = -1, T = -1;
        const char *pos = name + 16;
        find_syllable(pos, &len, &L, LCount, 0);
        pos += len;
        find_syllable(pos, &len, &V, VCount, 1);
        pos += len;
        find_syllable(pos, &len, &T, TCount, 2);
        pos += len;
        if (L != -1 && V != -1 && T != -1 && pos-name == namelen) {
            *code = SBase + (L*VCount+V)*TCount + T;
            return 1;
        }
        /* Otherwise, it's an illegal syllable name. */
        return 0;
    }

    /* Check for unified ideographs. */
    if (strncmp(name, "CJK UNIFIED IDEOGRAPH-", 22) == 0) {
        /* Four or five hexdigits must follow. */
        v = 0;
        name += 22;
        namelen -= 22;
        if (namelen != 4 && namelen != 5)
            return 0;
        while (namelen--) {
            v *= 16;
            if (*name >= '0' && *name <= '9')
                v += *name - '0';
            else if (*name >= 'A' && *name <= 'F')
                v += *name - 'A' + 10;
            else
                return 0;
            name++;
        }
        if (!is_unified_ideograph(v))
            return 0;
        *code = v;
        return 1;
    }

    /* the following is the same as python's dictionary lookup, with
       only minor changes.  see the makeunicodedata script for more
       details */

    h = (unsigned int) _gethash(name, namelen, code_magic);
    i = (~h) & mask;
    v = code_hash[i];
    if (!v)
        return 0;
    if (_cmpname(self, v, name, namelen))
        return _check_alias_and_seq(v, code, with_named_seq);
    incr = (h ^ (h >> 3)) & mask;
    if (!incr)
        incr = mask;
    for (;;) {
        i = (i + incr) & mask;
        v = code_hash[i];
        if (!v)
            return 0;
        if (_cmpname(self, v, name, namelen))
            return _check_alias_and_seq(v, code, with_named_seq);
        incr = incr << 1;
        if (incr > mask)
            incr = incr ^ code_poly;
    }
}

/* C API table exported through the ucnhash_CAPI capsule. */
static const _PyUnicode_Name_CAPI hashAPI =
{
    sizeof(_PyUnicode_Name_CAPI),
    _getucname,
    _getcode
};

/* -------------------------------------------------------------------- */
/* Python bindings */

PyDoc_STRVAR(unicodedata_name__doc__,
"name(unichr[, default])\n\
Returns the name assigned to the Unicode character unichr as a\n\
string. If no name is defined, default is returned, or, if not\n\
given, ValueError is raised.");

/* name(unichr[, default]): character name.  Alias and named-sequence
   placeholder code points are treated as unnamed. */
static PyObject *
unicodedata_name(PyObject* self, PyObject* args)
{
    char name[NAME_MAXLEN];
    Py_UCS4 c;

    PyUnicodeObject* v;
    PyObject* defobj = NULL;
    if (!PyArg_ParseTuple(args, "O!|O:name", &PyUnicode_Type, &v, &defobj))
        return NULL;

    c = getuchar(v);
    if (c == (Py_UCS4)-1)
        return NULL;

    if (!_getucname(self, c, name, sizeof(name), 0)) {
        if (defobj == NULL) {
            PyErr_SetString(PyExc_ValueError, "no such name");
            return NULL;
        }
        else {
            Py_INCREF(defobj);
            return defobj;
        }
    }

    return PyUnicode_FromString(name);
}

PyDoc_STRVAR(unicodedata_lookup__doc__,
"lookup(name)\n\
\n\
Look up character by name.  If a character with the\n\
given name is found, return the corresponding Unicode\n\
character.  If not found, KeyError is raised.");

/* lookup(name): return the character, or the named sequence, with the given
   name; raise KeyError if it is unknown.

   NOTE(review): `namelen` is an int, which matches "s#" only when
   PY_SSIZE_T_CLEAN is not defined; no such define is visible in this file --
   confirm. */
static PyObject *
unicodedata_lookup(PyObject* self, PyObject* args)
{
    Py_UCS4 code;

    char* name;
    int namelen;
    unsigned int index;
    if (!PyArg_ParseTuple(args, "s#:lookup", &name, &namelen))
        return NULL;

    if (!_getcode(self, name, namelen, &code, 1)) {
        PyErr_Format(PyExc_KeyError, "undefined character name '%s'", name);
        return NULL;
    }
    /* check if code is in the PUA range that we use for named sequences
       and convert it */
    if (IS_NAMED_SEQ(code)) {
        index = code-named_sequences_start;
        return PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND,
                                         named_sequences[index].seq,
                                         named_sequences[index].seqlen);
    }
    return PyUnicode_FromOrdinal(code);
}

/* XXX Add doc strings. */

/* Method table, used both as the module's functions and as the methods of
   UCD objects. */
static PyMethodDef unicodedata_functions[] = {
    {"decimal", unicodedata_decimal, METH_VARARGS, unicodedata_decimal__doc__},
    {"digit", unicodedata_digit, METH_VARARGS, unicodedata_digit__doc__},
    {"numeric", unicodedata_numeric, METH_VARARGS, unicodedata_numeric__doc__},
    {"category", unicodedata_category, METH_VARARGS,
                 unicodedata_category__doc__},
    {"bidirectional", unicodedata_bidirectional, METH_VARARGS,
                      unicodedata_bidirectional__doc__},
    {"combining", unicodedata_combining, METH_VARARGS,
                  unicodedata_combining__doc__},
    {"mirrored", unicodedata_mirrored, METH_VARARGS,
                 unicodedata_mirrored__doc__},
    {"east_asian_width", unicodedata_east_asian_width, METH_VARARGS,
                         unicodedata_east_asian_width__doc__},
    {"decomposition", unicodedata_decomposition, METH_VARARGS,
                      unicodedata_decomposition__doc__},
    {"name", unicodedata_name, METH_VARARGS, unicodedata_name__doc__},
    {"lookup", unicodedata_lookup, METH_VARARGS, unicodedata_lookup__doc__},
    {"normalize", unicodedata_normalize, METH_VARARGS,
                  unicodedata_normalize__doc__},
    {NULL, NULL}                /* sentinel */
};

static PyTypeObject UCD_Type = {
        /* The ob_type field must be initialized in the module init function
         * to be portable to Windows without using C++. */
        PyVarObject_HEAD_INIT(NULL, 0)
        "unicodedata.UCD",              /*tp_name*/
        sizeof(PreviousDBVersion),      /*tp_basicsize*/
        0,                      /*tp_itemsize*/
        /* methods */
        (destructor)PyObject_Del, /*tp_dealloc*/
        0,                      /*tp_print*/
        0,                      /*tp_getattr*/
        0,                      /*tp_setattr*/
        0,                      /*tp_reserved*/
        0,                      /*tp_repr*/
        0,                      /*tp_as_number*/
        0,                      /*tp_as_sequence*/
        0,                      /*tp_as_mapping*/
        0,                      /*tp_hash*/
        0,                      /*tp_call*/
        0,                      /*tp_str*/
        PyObject_GenericGetAttr,/*tp_getattro*/
        0,                      /*tp_setattro*/
        0,                      /*tp_as_buffer*/
        Py_TPFLAGS_DEFAULT,     /*tp_flags*/
        0,                      /*tp_doc*/
        0,                      /*tp_traverse*/
        0,                      /*tp_clear*/
        0,                      /*tp_richcompare*/
        0,                      /*tp_weaklistoffset*/
        0,                      /*tp_iter*/
        0,                      /*tp_iternext*/
        unicodedata_functions,  /*tp_methods*/
        DB_members,             /*tp_members*/
        0,                      /*tp_getset*/
        0,                      /*tp_base*/
        0,                      /*tp_dict*/
        0,                      /*tp_descr_get*/
        0,                      /*tp_descr_set*/
        0,                      /*tp_dictoffset*/
        0,                      /*tp_init*/
        0,                      /*tp_alloc*/
        0,                      /*tp_new*/
        0,                      /*tp_free*/
        0,                      /*tp_is_gc*/
};

PyDoc_STRVAR(unicodedata_docstring,
"This module provides access to the Unicode Character Database which\n\
defines character properties for all Unicode characters. The data in\n\
this database is based on the UnicodeData.txt file version\n\
" UNIDATA_VERSION " which is publically available from ftp://ftp.unicode.org/.\n\
\n\
The module uses the same names and symbols as defined by the\n\
UnicodeData File Format " UNIDATA_VERSION ".");

static struct PyModuleDef unicodedatamodule = {
        PyModuleDef_HEAD_INIT,
        "unicodedata",
        unicodedata_docstring,
        -1,
        unicodedata_functions,
        NULL,
        NULL,
        NULL,
        NULL
};

/* Module initialisation: creates the module and adds unidata_version, the
   UCD type, the ucd_3_2_0 previous-version object and the ucnhash_CAPI
   capsule.  Returns the module, or NULL if module creation fails. */
PyMODINIT_FUNC
PyInit_unicodedata(void)
{
    PyObject *m, *v;

    Py_TYPE(&UCD_Type) = &PyType_Type;

    m = PyModule_Create(&unicodedatamodule);
    if (!m)
        return NULL;

    PyModule_AddStringConstant(m, "unidata_version", UNIDATA_VERSION);
    Py_INCREF(&UCD_Type);
    PyModule_AddObject(m, "UCD", (PyObject*)&UCD_Type);

    /* Previous versions */
    v = new_previous_version("3.2.0", get_change_3_2_0, normalization_3_2_0);
    if (v != NULL)
        PyModule_AddObject(m, "ucd_3_2_0", v);

    /* Export C API */
    v = PyCapsule_New((void *)&hashAPI, PyUnicodeData_CAPSULE_NAME, NULL);
    if (v != NULL)
        PyModule_AddObject(m, "ucnhash_CAPI", v);
    return m;
}

/*
Local variables:
c-basic-offset: 4
indent-tabs-mode: nil
End:
*/