summaryrefslogtreecommitdiff
path: root/c/wchar_helper.h
diff options
context:
space:
mode:
Diffstat (limited to 'c/wchar_helper.h')
-rw-r--r--c/wchar_helper.h246
1 files changed, 246 insertions, 0 deletions
diff --git a/c/wchar_helper.h b/c/wchar_helper.h
new file mode 100644
index 0000000..8e6ea58
--- /dev/null
+++ b/c/wchar_helper.h
@@ -0,0 +1,246 @@
+/*
+ * wchar_t helpers
+ */
+
+typedef uint16_t cffi_char16_t;
+typedef uint32_t cffi_char32_t;
+
+
+#if Py_UNICODE_SIZE == 2
+
+/* Before Python 2.7, PyUnicode_FromWideChar is not able to convert
+ wchar_t values greater than 65535 into two-unicode-characters surrogates.
+ But even the Python 2.7 version doesn't detect wchar_t values that are
+ out of range(1114112), and just returns nonsense.
+
+ From cffi 1.11 we can't use it anyway, because we need a version
+ with char32_t input types.
+*/
+static PyObject *
+_my_PyUnicode_FromChar32(const cffi_char32_t *w, Py_ssize_t size)
+{
+ PyObject *unicode;
+ register Py_ssize_t i;
+ Py_ssize_t alloc;
+ const cffi_char32_t *orig_w;
+
+ alloc = size;
+ orig_w = w;
+ for (i = size; i > 0; i--) {
+ if (*w > 0xFFFF)
+ alloc++;
+ w++;
+ }
+ w = orig_w;
+ unicode = PyUnicode_FromUnicode(NULL, alloc);
+ if (!unicode)
+ return NULL;
+
+ /* Copy the wchar_t data into the new object */
+ {
+ register Py_UNICODE *u;
+ u = PyUnicode_AS_UNICODE(unicode);
+ for (i = size; i > 0; i--) {
+ if (*w > 0xFFFF) {
+ cffi_char32_t ordinal;
+ if (*w > 0x10FFFF) {
+ PyErr_Format(PyExc_ValueError,
+ "char32_t out of range for "
+ "conversion to unicode: 0x%x", (int)*w);
+ Py_DECREF(unicode);
+ return NULL;
+ }
+ ordinal = *w++;
+ ordinal -= 0x10000;
+ *u++ = 0xD800 | (ordinal >> 10);
+ *u++ = 0xDC00 | (ordinal & 0x3FF);
+ }
+ else
+ *u++ = *w++;
+ }
+ }
+ return unicode;
+}
+
+static PyObject *
+_my_PyUnicode_FromChar16(const cffi_char16_t *w, Py_ssize_t size)
+{
+ return PyUnicode_FromUnicode((const Py_UNICODE *)w, size);
+}
+
+#else /* Py_UNICODE_SIZE == 4 */
+
+static PyObject *
+_my_PyUnicode_FromChar32(const cffi_char32_t *w, Py_ssize_t size)
+{
+ return PyUnicode_FromUnicode((const Py_UNICODE *)w, size);
+}
+
+static PyObject *
+_my_PyUnicode_FromChar16(const cffi_char16_t *w, Py_ssize_t size)
+{
+ /* 'size' is the length of the 'w' array */
+ PyObject *result = PyUnicode_FromUnicode(NULL, size);
+
+ if (result != NULL) {
+ Py_UNICODE *u_base = PyUnicode_AS_UNICODE(result);
+ Py_UNICODE *u = u_base;
+
+ if (size == 1) { /* performance only */
+ *u = (cffi_char32_t)*w;
+ }
+ else {
+ while (size > 0) {
+ cffi_char32_t ch = *w++;
+ size--;
+ if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
+ cffi_char32_t ch2 = *w;
+ if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
+ ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
+ w++;
+ size--;
+ }
+ }
+ *u++ = ch;
+ }
+ if (PyUnicode_Resize(&result, u - u_base) < 0) {
+ Py_DECREF(result);
+ return NULL;
+ }
+ }
+ }
+ return result;
+}
+
+#endif
+
+
+#define IS_SURROGATE(u) (0xD800 <= (u)[0] && (u)[0] <= 0xDBFF && \
+ 0xDC00 <= (u)[1] && (u)[1] <= 0xDFFF)
+#define AS_SURROGATE(u) (0x10000 + (((u)[0] - 0xD800) << 10) + \
+ ((u)[1] - 0xDC00))
+
+static int
+_my_PyUnicode_AsSingleChar16(PyObject *unicode, cffi_char16_t *result,
+ char *err_got)
+{
+ Py_UNICODE *u = PyUnicode_AS_UNICODE(unicode);
+ if (PyUnicode_GET_SIZE(unicode) != 1) {
+ sprintf(err_got, "unicode string of length %zd",
+ PyUnicode_GET_SIZE(unicode));
+ return -1;
+ }
+#if Py_UNICODE_SIZE == 4
+ if (((unsigned int)u[0]) > 0xFFFF)
+ {
+ sprintf(err_got, "larger-than-0xFFFF character");
+ return -1;
+ }
+#endif
+ *result = (cffi_char16_t)u[0];
+ return 0;
+}
+
+static int
+_my_PyUnicode_AsSingleChar32(PyObject *unicode, cffi_char32_t *result,
+ char *err_got)
+{
+ Py_UNICODE *u = PyUnicode_AS_UNICODE(unicode);
+ if (PyUnicode_GET_SIZE(unicode) == 1) {
+ *result = (cffi_char32_t)u[0];
+ return 0;
+ }
+#if Py_UNICODE_SIZE == 2
+ if (PyUnicode_GET_SIZE(unicode) == 2 && IS_SURROGATE(u)) {
+ *result = AS_SURROGATE(u);
+ return 0;
+ }
+#endif
+ sprintf(err_got, "unicode string of length %zd",
+ PyUnicode_GET_SIZE(unicode));
+ return -1;
+}
+
+static Py_ssize_t _my_PyUnicode_SizeAsChar16(PyObject *unicode)
+{
+ Py_ssize_t length = PyUnicode_GET_SIZE(unicode);
+ Py_ssize_t result = length;
+
+#if Py_UNICODE_SIZE == 4
+ Py_UNICODE *u = PyUnicode_AS_UNICODE(unicode);
+ Py_ssize_t i;
+
+ for (i=0; i<length; i++) {
+ if (u[i] > 0xFFFF)
+ result++;
+ }
+#endif
+ return result;
+}
+
+static Py_ssize_t _my_PyUnicode_SizeAsChar32(PyObject *unicode)
+{
+ Py_ssize_t length = PyUnicode_GET_SIZE(unicode);
+ Py_ssize_t result = length;
+
+#if Py_UNICODE_SIZE == 2
+ Py_UNICODE *u = PyUnicode_AS_UNICODE(unicode);
+ Py_ssize_t i;
+
+ for (i=0; i<length-1; i++) {
+ if (IS_SURROGATE(u+i))
+ result--;
+ }
+#endif
+ return result;
+}
+
+static int _my_PyUnicode_AsChar16(PyObject *unicode,
+ cffi_char16_t *result,
+ Py_ssize_t resultlen)
+{
+ Py_ssize_t len = PyUnicode_GET_SIZE(unicode);
+ Py_UNICODE *u = PyUnicode_AS_UNICODE(unicode);
+ Py_ssize_t i;
+ for (i=0; i<len; i++) {
+#if Py_UNICODE_SIZE == 2
+ cffi_char16_t ordinal = u[i];
+#else
+ cffi_char32_t ordinal = u[i];
+ if (ordinal > 0xFFFF) {
+ if (ordinal > 0x10FFFF) {
+ PyErr_Format(PyExc_ValueError,
+ "unicode character out of range for "
+ "conversion to char16_t: 0x%x", (int)ordinal);
+ return -1;
+ }
+ ordinal -= 0x10000;
+ *result++ = 0xD800 | (ordinal >> 10);
+ *result++ = 0xDC00 | (ordinal & 0x3FF);
+ continue;
+ }
+#endif
+ *result++ = ordinal;
+ }
+ return 0;
+}
+
+static int _my_PyUnicode_AsChar32(PyObject *unicode,
+ cffi_char32_t *result,
+ Py_ssize_t resultlen)
+{
+ Py_UNICODE *u = PyUnicode_AS_UNICODE(unicode);
+ Py_ssize_t i;
+ for (i=0; i<resultlen; i++) {
+ cffi_char32_t ordinal = *u;
+#if Py_UNICODE_SIZE == 2
+ if (IS_SURROGATE(u)) {
+ ordinal = AS_SURROGATE(u);
+ u++;
+ }
+#endif
+ result[i] = ordinal;
+ u++;
+ }
+ return 0;
+}