123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403 |
- #ifndef Py_UNICODEOBJECT_H
- #define Py_UNICODEOBJECT_H
- #ifndef SIZEOF_WCHAR_T
- #error Must define SIZEOF_WCHAR_T
- #endif
- #define Py_UNICODE_SIZE SIZEOF_WCHAR_T
- /* If wchar_t can be used for UCS-4 storage, set Py_UNICODE_WIDE.
- Otherwise, Unicode strings are stored as UCS-2 (with limited support
- for UTF-16) */
- #if Py_UNICODE_SIZE >= 4
- #define Py_UNICODE_WIDE
- #endif
- /* Set these flags if the platform has "wchar.h" and the
- wchar_t type is a 16-bit unsigned type */
- /* #define HAVE_WCHAR_H */
- /* #define HAVE_USABLE_WCHAR_T */
- #ifdef HAVE_WCHAR_H
- /* Work around a cosmetic bug in BSDI 4.x wchar.h; thanks to Thomas Wouters */
- # ifdef _HAVE_BSDI
- # include <time.h>
- # endif
- # include <wchar.h>
- #endif
- #ifdef __cplusplus
- extern "C" {
- #endif
- #include "cpyext_unicodeobject.h"
- /*#define PyUnicode_Check(op) \
- ** PyType_FastSubclass((op)->ob_type, Py_TPFLAGS_UNICODE_SUBCLASS)
- **#define PyUnicode_CheckExact(op) ((op)->ob_type == &PyUnicode_Type)
- */
- /* Fast access macros */
- #ifndef Py_LIMITED_API
/* Length of the wstr representation of op.
   For a compact ASCII string the value is read from PyASCIIObject.length;
   for every other string it is read from
   PyCompactUnicodeObject.wstr_length.
   No type checks are performed and op is evaluated more than once.
   op is parenthesized inside each cast so that an expression argument
   (e.g. `base + 1` or `cond ? a : b`) is cast as a whole. */
#define PyUnicode_WSTR_LENGTH(op) \
    (PyUnicode_IS_COMPACT_ASCII(op) ? \
     ((PyASCIIObject*)(op))->length : \
     ((PyCompactUnicodeObject*)(op))->wstr_length)
/* Size of the deprecated Py_UNICODE representation of op, in code units
   (a surrogate pair counts as 2 units).
   When op has no wstr buffer yet, PyUnicode_AsUnicode() is called to
   compute it before the length is read.
   For the length in code points use PyUnicode_GET_LENGTH() instead.
   op is evaluated more than once. */
#define PyUnicode_GET_SIZE(op) \
    (assert(PyUnicode_Check(op)), \
     (((PyASCIIObject *)(op))->wstr) ? \
         PyUnicode_WSTR_LENGTH(op) : \
         ((void)PyUnicode_AsUnicode((PyObject *)(op)), \
          assert(((PyASCIIObject *)(op))->wstr), \
          PyUnicode_WSTR_LENGTH(op)))
/* PyUnicode_GET_SIZE(op) scaled by Py_UNICODE_SIZE, i.e. the size of the
   Py_UNICODE representation expressed in Py_UNICODE_SIZE-sized units
   rather than in code units. */
#define PyUnicode_GET_DATA_SIZE(op) \
    (PyUnicode_GET_SIZE(op) * Py_UNICODE_SIZE)
/* Macro form of PyUnicode_AsUnicode(): yields the existing wstr buffer of
   op when there is one, and otherwise calls PyUnicode_AsUnicode() to
   create the wchar_t/Py_UNICODE representation on demand.
   This is very inefficient now; port code to the PyUnicode_*BYTE_DATA()
   macros, or to PyUnicode_WRITE() and PyUnicode_READ(), where possible.
   op is evaluated more than once. */
#define PyUnicode_AS_UNICODE(op) \
    ((((PyASCIIObject *)(op))->wstr) ? \
         (((PyASCIIObject *)(op))->wstr) : \
         PyUnicode_AsUnicode((PyObject *)(op)))
/* The pointer produced by PyUnicode_AS_UNICODE(op), viewed as
   const char *. */
#define PyUnicode_AS_DATA(op) \
    ((const char *)(PyUnicode_AS_UNICODE(op)))
- /* --- Flexible String Representation Helper Macros (PEP 393) -------------- */
- /* Values for PyASCIIObject.state: */
- /* Interning state. */
/* Interning states (see the "Interning state" values of
   PyASCIIObject.state). */
#define SSTATE_NOT_INTERNED      0
#define SSTATE_INTERNED_MORTAL   1
#define SSTATE_INTERNED_IMMORTAL 2
/* Return true if the string contains only ASCII characters, or 0 if not.
   The string may be compact (PyUnicode_IS_COMPACT_ASCII) or not, but must
   be ready; both the type and the ready state are checked with assert().
   op is parenthesized inside the cast so that an expression argument
   (e.g. `base + 1`) is cast as a whole instead of only its first operand.
   op is evaluated more than once. */
#define PyUnicode_IS_ASCII(op) \
    (assert(PyUnicode_Check(op)), \
     assert(PyUnicode_IS_READY(op)), \
     ((PyASCIIObject*)(op))->state.ascii)
/* Value of the `compact` bit in PyASCIIObject.state: true for a compact
   string, 0 otherwise.
   Neither a type check nor a Ready call is made. */
#define PyUnicode_IS_COMPACT(op) \
    (((PyASCIIObject*)(op))->state.compact)
/* Return true if the string is a compact ASCII string (uses the
   PyASCIIObject structure), or 0 if not: both the `ascii` and the
   `compact` state bits must be set.
   No type checks or Ready calls are performed.
   op is parenthesized inside the cast so that an expression argument
   (e.g. `base + 1`) is cast as a whole instead of only its first operand.
   op is evaluated more than once. */
#define PyUnicode_IS_COMPACT_ASCII(op) \
    (((PyASCIIObject*)(op))->state.ascii && PyUnicode_IS_COMPACT(op))
/* Kinds of character storage a unicode object can have.  The numeric
   value of each *BYTE_KIND constant matches the byte count in its name. */
enum PyUnicode_Kind {
    /* Only wstr characters are present.  This can happen only for a
       string created through a legacy API on which _PyUnicode_Ready()
       has not been called yet. */
    PyUnicode_WCHAR_KIND = 0,

    /* The values below are the ones PyUnicode_KIND() returns. */
    PyUnicode_1BYTE_KIND = 1,
    PyUnicode_2BYTE_KIND = 2,
    PyUnicode_4BYTE_KIND = 4
};
/* Typed views of the canonical representation for direct character
   access: the pointer from PyUnicode_DATA(op) cast to Py_UCS1 *,
   Py_UCS2 * or Py_UCS4 *.
   Nothing verifies that the cast matches the string's actual kind; check
   PyUnicode_KIND() first. */
#define PyUnicode_1BYTE_DATA(op) ((Py_UCS1*)PyUnicode_DATA(op))
#define PyUnicode_2BYTE_DATA(op) ((Py_UCS2*)PyUnicode_DATA(op))
#define PyUnicode_4BYTE_DATA(op) ((Py_UCS4*)PyUnicode_DATA(op))
/* The `kind` field of PyASCIIObject.state: one of the PyUnicode_*_KIND
   values.  assert() checks that op is a unicode object and is ready.
   op is evaluated more than once. */
#define PyUnicode_KIND(op) \
    (assert(PyUnicode_Check(op)), \
     assert(PyUnicode_IS_READY(op)), \
     ((PyASCIIObject *)(op))->state.kind)
/* Raw buffer of a compact string: the memory directly following the
   object header, which is a PyASCIIObject for ASCII strings and a
   PyCompactUnicodeObject otherwise. */
#define _PyUnicode_COMPACT_DATA(op) \
    (PyUnicode_IS_ASCII(op) ? \
     ((void*)((PyASCIIObject*)(op) + 1)) : \
     ((void*)((PyCompactUnicodeObject*)(op) + 1)))

/* Raw buffer of a non-compact string: the `data` member of
   PyUnicodeObject, which assert() requires to be set. */
#define _PyUnicode_NONCOMPACT_DATA(op) \
    (assert(((PyUnicodeObject*)(op))->data), \
     (((PyUnicodeObject *)(op))->data))

/* Return a void pointer to the raw unicode buffer of op, choosing between
   the compact and the non-compact layout.  op is evaluated more than
   once. */
#define PyUnicode_DATA(op) \
    (assert(PyUnicode_Check(op)), \
     PyUnicode_IS_COMPACT(op) ? _PyUnicode_COMPACT_DATA(op) : \
                                _PyUnicode_NONCOMPACT_DATA(op))
- /* In the access macros below, "kind" may be evaluated more than once.
- All other macro parameters are evaluated exactly once, so it is safe
- to put side effects into them (such as increasing the index). */
/* Store `value` (the new code point) at position `index` (0-based) of the
   canonical representation `data`, whose element width is chosen by
   `kind`.  Any kind other than 1BYTE or 2BYTE is treated as 4BYTE, which
   assert() verifies.
   No sanity checks are made: the macro is meant for loops in which the
   caller has already cached the kind and data pointer obtained from the
   other macros. */
#define PyUnicode_WRITE(kind, data, index, value) \
    do { \
        switch (kind) { \
        case PyUnicode_1BYTE_KIND: \
            ((Py_UCS1 *)(data))[(index)] = (Py_UCS1)(value); \
            break; \
        case PyUnicode_2BYTE_KIND: \
            ((Py_UCS2 *)(data))[(index)] = (Py_UCS2)(value); \
            break; \
        default: \
            assert((kind) == PyUnicode_4BYTE_KIND); \
            ((Py_UCS4 *)(data))[(index)] = (Py_UCS4)(value); \
            break; \
        } \
    } while (0)
/* Fetch the code point at `index` from the canonical representation
   `data`, reading it with the element width selected by `kind`; the
   result is a Py_UCS4.  Any kind other than 1BYTE or 2BYTE is read as
   4BYTE.  No checks or ready calls are performed. */
#define PyUnicode_READ(kind, data, index) \
    ((Py_UCS4) \
     ((kind) == PyUnicode_1BYTE_KIND ? \
          ((const Py_UCS1 *)(data))[(index)] : \
      (kind) == PyUnicode_2BYTE_KIND ? \
          ((const Py_UCS2 *)(data))[(index)] : \
          ((const Py_UCS4 *)(data))[(index)]))
/* Read the code point at `index` of `unicode` as a Py_UCS4.
   Less efficient than PyUnicode_READ(), because PyUnicode_KIND() is
   called here, possibly twice.  Use this macro for a single read; for
   several consecutive reads cache the kind and use PyUnicode_READ().
   assert() checks that `unicode` is a ready unicode object.
   `unicode` is evaluated more than once. */
#define PyUnicode_READ_CHAR(unicode, index) \
    (assert(PyUnicode_Check(unicode)), \
     assert(PyUnicode_IS_READY(unicode)), \
     (Py_UCS4) \
     (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND ? \
          ((const Py_UCS1 *)(PyUnicode_DATA(unicode)))[(index)] : \
      PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND ? \
          ((const Py_UCS2 *)(PyUnicode_DATA(unicode)))[(index)] : \
          ((const Py_UCS4 *)(PyUnicode_DATA(unicode)))[(index)]))
/* Length of the unicode string, read from PyASCIIObject.length.
   The caller must make sure the string has its canonical representation
   set before using this macro; call PyUnicode_(FAST_)Ready to ensure
   that.  assert() checks the type and the ready state.
   op is evaluated more than once. */
#define PyUnicode_GET_LENGTH(op) \
    (assert(PyUnicode_Check(op)), \
     assert(PyUnicode_IS_READY(op)), \
     ((PyASCIIObject *)(op))->length)
/* Fast check to determine whether an object is ready: reads the `ready`
   bit of PyASCIIObject.state.  Described as equivalent to
   PyUnicode_IS_COMPACT(op) || ((PyUnicodeObject*)(op))->data.any
   No type checks are performed.
   op is parenthesized inside the cast so that an expression argument
   (e.g. `base + 1`) is cast as a whole instead of only its first
   operand. */
#define PyUnicode_IS_READY(op) (((PyASCIIObject*)(op))->state.ready)
/* Make sure op is ready.  In the best case (op is already ready) this does
   less work than _PyUnicode_Ready() and simply yields 0; when the
   canonical representation is not yet set, _PyUnicode_Ready() is still
   called and its result is the result of the macro.
   Returns 0 on success and -1 on errors.
   op is evaluated more than once. */
#define PyUnicode_READY(op) \
    (assert(PyUnicode_Check(op)), \
     (PyUnicode_IS_READY(op) ? \
          0 : \
          _PyUnicode_Ready((PyObject *)(op))))
/* Return a maximum character value suitable for creating another string
   based on op: 0x7f for ASCII strings, otherwise 0xff, 0xffff or 0x10ffff
   depending on whether the kind is 1BYTE, 2BYTE or anything else.
   This is always an approximation, but it is more efficient than
   iterating over the string.  op must be ready (checked with assert())
   and is evaluated more than once. */
#define PyUnicode_MAX_CHAR_VALUE(op) \
    (assert(PyUnicode_IS_READY(op)), \
     (PyUnicode_IS_ASCII(op) ? 0x7f : \
      PyUnicode_KIND(op) == PyUnicode_1BYTE_KIND ? 0xffU : \
      PyUnicode_KIND(op) == PyUnicode_2BYTE_KIND ? 0xffffU : \
                                                   0x10ffffU))
- #endif
- /* --- Constants ---------------------------------------------------------- */
/* Character substituted during decoding when the errors argument is set
   to "replace".  U+FFFD is the official REPLACEMENT CHARACTER in
   Unicode 3.0. */
#define Py_UNICODE_REPLACEMENT_CHARACTER ((Py_UCS4)0xFFFD)
- /* === Public API ========================================================= */
- /* Get the length of the Unicode object. */
- PyAPI_FUNC(Py_ssize_t) PyUnicode_GetLength(
- PyObject *unicode
- );
- /* Get the number of Py_UNICODE units in the
- string representation. */
- PyAPI_FUNC(Py_ssize_t) PyUnicode_GetSize(
- PyObject *unicode /* Unicode object */
- );
- PyAPI_FUNC(PyObject *) PyUnicode_FromFormatV(
- const char *format, /* ASCII-encoded string */
- va_list vargs
- );
- PyAPI_FUNC(PyObject *) PyUnicode_FromFormat(
- const char *format, /* ASCII-encoded string */
- ...
- );
/* The `interned` field of PyASCIIObject.state for op.
   No type check is made: use only if you know op is a string. */
#define PyUnicode_CHECK_INTERNED(op) \
    (((PyASCIIObject *)(op))->state.interned)
- /* --- wchar_t support for platforms which support it --------------------- */
- #ifdef HAVE_WCHAR_H
- /* Create a Unicode Object from the wchar_t buffer w of the given
- size.
- The buffer is copied into the new object. */
- PyAPI_FUNC(PyObject*) PyUnicode_FromWideChar(
- const wchar_t *w, /* wchar_t buffer */
- Py_ssize_t size /* size of buffer */
- );
- /* Convert the Unicode object to a wide character string. The output string
- always ends with a nul character. If size is not NULL, write the number of
- wide characters (excluding the null character) into *size.
- Returns a buffer allocated by PyMem_Malloc() (use PyMem_Free() to free it)
- on success. On error, returns NULL, *size is undefined and raises a
- MemoryError. */
- PyAPI_FUNC(wchar_t*) PyUnicode_AsWideCharString(
- PyObject *unicode, /* Unicode object */
- Py_ssize_t *size /* number of characters of the result */
- );
- #endif
- /* === Builtin Codecs =====================================================
- Many of these APIs take two arguments encoding and errors. These
- parameters encoding and errors have the same semantics as the ones
- of the builtin str() API.
- Setting encoding to NULL causes the default encoding (UTF-8) to be used.
- Error handling is set by errors which may also be set to NULL
- meaning to use the default handling defined for the codec. Default
- error handling for all builtin codecs is "strict" (ValueErrors are
- raised).
- The codecs all use a similar interface. Only deviation from the
- generic ones are documented.
- */
- /* --- Manage the default encoding ---------------------------------------- */
- /* Returns a pointer to the default encoding (UTF-8) of the
- Unicode object unicode and the size of the encoded representation
- in bytes stored in *size.
- In case of an error, no *size is set.
- This function caches the UTF-8 encoded string in the unicodeobject
- and subsequent calls will return the same string. The memory is released
- when the unicodeobject is deallocated.
- _PyUnicode_AsStringAndSize is a #define for PyUnicode_AsUTF8AndSize to
- support the previous internal function with the same behaviour.
- *** This API is for interpreter INTERNAL USE ONLY and will likely
- *** be removed or changed in the future.
- *** If you need to access the Unicode object as UTF-8 bytes string,
- *** please use PyUnicode_AsUTF8String() instead.
- */
- #ifndef Py_LIMITED_API
- PyAPI_FUNC(char *) PyUnicode_AsUTF8AndSize(
- PyObject *unicode,
- Py_ssize_t *size);
- #define _PyUnicode_AsStringAndSize PyUnicode_AsUTF8AndSize
- #endif
- /* Returns a pointer to the default encoding (UTF-8) of the
- Unicode object unicode.
- Like PyUnicode_AsUTF8AndSize(), this also caches the UTF-8 representation
- in the unicodeobject.
- _PyUnicode_AsString is a #define for PyUnicode_AsUTF8 to
- support the previous internal function with the same behaviour.
- Use of this API is DEPRECATED since no size information can be
- extracted from the returned data.
- *** This API is for interpreter INTERNAL USE ONLY and will likely
- *** be removed or changed for Python 3.1.
- *** If you need to access the Unicode object as UTF-8 bytes string,
- *** please use PyUnicode_AsUTF8String() instead.
- */
#ifndef Py_LIMITED_API
/* The previous internal name, kept as an alias for PyUnicode_AsUTF8. */
#define _PyUnicode_AsString PyUnicode_AsUTF8
#endif
- Py_LOCAL_INLINE(size_t) Py_UNICODE_strlen(const Py_UNICODE *u)
- {
- size_t res = 0;
- while(*u++)
- res++;
- return res;
- }
- Py_LOCAL_INLINE(int)
- Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
- {
- while (*s1 && *s2 && *s1 == *s2)
- s1++, s2++;
- if (*s1 && *s2)
- return (*s1 < *s2) ? -1 : +1;
- if (*s1)
- return 1;
- if (*s2)
- return -1;
- return 0;
- }
- #ifdef __cplusplus
- }
- #endif
- #endif /* !Py_UNICODEOBJECT_H */
|