summary refs log tree commit diff
path: root/c/wchar_helper_3.h
diff options
context:
space:
mode:
Diffstat (limited to 'c/wchar_helper_3.h')
-rw-r--r-- c/wchar_helper_3.h 149
1 files changed, 149 insertions, 0 deletions
diff --git a/c/wchar_helper_3.h b/c/wchar_helper_3.h
new file mode 100644
index 0000000..f15464e
--- /dev/null
+++ b/c/wchar_helper_3.h
@@ -0,0 +1,149 @@
+/*
+ * wchar_t helpers, version CPython >= 3.3.
+ *
+ * CPython 3.3 added support for sys.maxunicode == 0x10FFFF on all
+ * platforms, even ones with wchar_t limited to 2 bytes. As such,
+ * this code here works from the outside like wchar_helper.h in the
+ * case Py_UNICODE_SIZE == 4, but the implementation is very different.
+ */
+
+typedef uint16_t cffi_char16_t;   /* one 16-bit unit; code points above 0xFFFF take two (a surrogate pair) */
+typedef uint32_t cffi_char32_t;   /* one 32-bit unit, wide enough for any code point up to 0x10FFFF */
+
+
+/*
+ * Build a Python unicode object from 'size' 32-bit units stored at 'w'.
+ * The data is handed to PyUnicode_FromKindAndData() as 4-byte-kind input
+ * and that call's result is returned unchanged.
+ */
+static PyObject *
+_my_PyUnicode_FromChar32(const cffi_char32_t *w, Py_ssize_t size)
+{
+    PyObject *text = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, w, size);
+    return text;
+}
+
+/*
+ * Build a Python unicode object from 'size' 16-bit units stored at 'w'.
+ *
+ * A high surrogate (0xD800-0xDBFF) immediately followed by a low surrogate
+ * (0xDC00-0xDFFF) is merged into a single code point above 0xFFFF.  Every
+ * other unit, including an unpaired surrogate, is copied through as is.
+ *
+ * Returns the new object, or NULL if the object could not be created.
+ */
+static PyObject *
+_my_PyUnicode_FromChar16(const cffi_char16_t *w, Py_ssize_t size)
+{
+    /* are there any surrogate pairs, and if so, how many? */
+    Py_ssize_t i, count_surrogates = 0;
+    for (i = 0; i < size - 1; i++) {
+        if (0xD800 <= w[i] && w[i] <= 0xDBFF &&
+            0xDC00 <= w[i+1] && w[i+1] <= 0xDFFF)
+            count_surrogates++;
+    }
+    if (count_surrogates == 0) {
+        /* no, fast path */
+        return PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND, w, size);
+    }
+    else
+    {
+        /* each pair collapses into one character, hence the shorter length */
+        PyObject *result = PyUnicode_New(size - count_surrogates, 0x10FFFF);
+        Py_UCS4 *data;
+        if (result == NULL)
+            return NULL;    /* creation failed: nothing to fill in */
+        assert(PyUnicode_KIND(result) == PyUnicode_4BYTE_KIND);
+        data = PyUnicode_4BYTE_DATA(result);
+
+        for (i = 0; i < size; i++)
+        {
+            cffi_char32_t ch = w[i];
+            if (0xD800 <= ch && ch <= 0xDBFF && i < size - 1) {
+                cffi_char32_t ch2 = w[i + 1];
+                if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
+                    /* combine the two 10-bit halves and skip the low unit */
+                    ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
+                    i++;
+                }
+            }
+            *data++ = ch;
+        }
+        return result;
+    }
+}
+
+/*
+ * Store the only character of 'unicode' into '*result' as a 16-bit value.
+ *
+ * Returns 0 on success.  Returns -1 after writing a short description into
+ * 'err_got' when the string's length is not 1 or when its character is
+ * above 0xFFFF.  No buffer size is passed in, so 'err_got' must be large
+ * enough for the message.
+ */
+static int
+_my_PyUnicode_AsSingleChar16(PyObject *unicode, cffi_char16_t *result,
+                             char *err_got)
+{
+    Py_ssize_t length = PyUnicode_GET_LENGTH(unicode);
+    cffi_char32_t codepoint;
+
+    if (length != 1) {
+        sprintf(err_got, "unicode string of length %zd", length);
+        return -1;
+    }
+
+    codepoint = PyUnicode_READ_CHAR(unicode, 0);
+    if (codepoint <= 0xFFFF) {
+        *result = (cffi_char16_t)codepoint;
+        return 0;
+    }
+
+    /* does not fit in a single 16-bit unit */
+    sprintf(err_got, "larger-than-0xFFFF character");
+    return -1;
+}
+
+/*
+ * Store the only character of 'unicode' into '*result' as a 32-bit value.
+ *
+ * Returns 0 on success.  If the string's length is not 1, writes a short
+ * description into 'err_got' and returns -1.  No buffer size is passed in,
+ * so 'err_got' must be large enough for the message.
+ */
+static int
+_my_PyUnicode_AsSingleChar32(PyObject *unicode, cffi_char32_t *result,
+                             char *err_got)
+{
+    Py_ssize_t length = PyUnicode_GET_LENGTH(unicode);
+
+    if (length == 1) {
+        *result = PyUnicode_READ_CHAR(unicode, 0);
+        return 0;
+    }
+
+    sprintf(err_got, "unicode string of length %zd", length);
+    return -1;
+}
+
+/*
+ * Number of 16-bit units needed to hold 'unicode': its length, plus one
+ * extra unit for each character above 0xFFFF.  Only strings of 4-byte kind
+ * are scanned; for any other kind the length is returned as is.
+ */
+static Py_ssize_t _my_PyUnicode_SizeAsChar16(PyObject *unicode)
+{
+    Py_ssize_t nchars = PyUnicode_GET_LENGTH(unicode);
+    Py_ssize_t units = nchars;
+
+    if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
+        Py_UCS4 *chars = PyUnicode_4BYTE_DATA(unicode);
+        Py_ssize_t pos = 0;
+
+        while (pos < nchars) {
+            if (chars[pos] > 0xFFFF)
+                units += 1;     /* this character needs a second unit */
+            pos++;
+        }
+    }
+    return units;
+}
+
+/*
+ * Number of 32-bit units needed to hold 'unicode': one per character,
+ * i.e. the string's length.
+ */
+static Py_ssize_t _my_PyUnicode_SizeAsChar32(PyObject *unicode)
+{
+    Py_ssize_t nchars = PyUnicode_GET_LENGTH(unicode);
+    return nchars;
+}
+
+/*
+ * Copy the characters of 'unicode' into 'result' as 16-bit units.
+ * Characters above 0xFFFF are written as two units, a high surrogate
+ * followed by a low surrogate.  No terminating zero is appended.
+ *
+ * 'resultlen' is the capacity of 'result' in 16-bit units.  Writing never
+ * goes past it: if the string needs more room, a SystemError is set and -1
+ * is returned.  SystemError is the exception type CPython documents for
+ * PyUnicode_AsUCS4() when its buffer is too small.
+ *
+ * Returns 0 on success.  Returns -1 with a Python exception set on failure;
+ * in that case the units converted before the failure have already been
+ * stored in 'result'.
+ */
+static int _my_PyUnicode_AsChar16(PyObject *unicode,
+                                  cffi_char16_t *result,
+                                  Py_ssize_t resultlen)
+{
+    Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
+    unsigned int kind = PyUnicode_KIND(unicode);
+    void *data = PyUnicode_DATA(unicode);
+    Py_ssize_t room = resultlen;    /* units still free in 'result' */
+    Py_ssize_t i;
+
+    for (i = 0; i < len; i++) {
+        cffi_char32_t ordinal = PyUnicode_READ(kind, data, i);
+        Py_ssize_t needed = (ordinal > 0xFFFF) ? 2 : 1;
+
+        if (ordinal > 0x10FFFF) {
+            PyErr_Format(PyExc_ValueError,
+                         "unicode character out of range for "
+                         "conversion to char16_t: 0x%x", (int)ordinal);
+            return -1;
+        }
+        if (needed > room) {
+            /* refuse to write past the end of the caller's buffer */
+            PyErr_SetString(PyExc_SystemError,
+                            "string is longer than the buffer");
+            return -1;
+        }
+        room -= needed;
+
+        if (needed == 2) {
+            /* split into a surrogate pair: 10 high bits, then 10 low bits */
+            ordinal -= 0x10000;
+            *result++ = (cffi_char16_t)(0xD800 | (ordinal >> 10));
+            *result++ = (cffi_char16_t)(0xDC00 | (ordinal & 0x3FF));
+        }
+        else
+            *result++ = (cffi_char16_t)ordinal;
+    }
+    return 0;
+}
+
+/*
+ * Copy the characters of 'unicode' into 'result' as 32-bit units by calling
+ * PyUnicode_AsUCS4() with 'resultlen' as the buffer length and without
+ * asking for a terminating zero (last argument 0).
+ *
+ * Returns 0 if that call returns non-NULL, -1 otherwise.
+ */
+static int _my_PyUnicode_AsChar32(PyObject *unicode,
+                                  cffi_char32_t *result,
+                                  Py_ssize_t resultlen)
+{
+    Py_UCS4 *filled = PyUnicode_AsUCS4(unicode, (Py_UCS4 *)result,
+                                       resultlen, 0);
+    return (filled != NULL) ? 0 : -1;
+}