Merge pull request #102 from static-frame/100/dta-proc-rec

flexatone · web-flow · commit f875ec63cb5f · 2023-04-23T08:08:07.000-06:00
diff --git a/src/_arraykit.c b/src/_arraykit.c
@@ -2612,8 +2612,7 @@ AK_DR_ProcessRecord(AK_DelimitedReader *dr,
         PyObject *line_select
         )
 {
-    Py_UCS4 c;
-    Py_ssize_t pos, linelen;
+    Py_ssize_t linelen;
     unsigned int kind;
     const void *data;
     PyObject *record;
@@ -2667,20 +2666,43 @@ AK_DR_ProcessRecord(AK_DelimitedReader *dr,
 
         kind = PyUnicode_KIND(record);
         data = PyUnicode_DATA(record);
-        pos = 0;
         linelen = PyUnicode_GET_LENGTH(record);
-        while (linelen--) {
-            c = PyUnicode_READ(kind, data, pos);
-            if (c == '\0') {
-                Py_DECREF(record);
-                PyErr_Format(PyExc_RuntimeError, "line contains NUL");
-                return -1;
+
+        // NOTE: we used to check that the read character was not \0; this seems rare enough that it does not need to be handled explicitly, as AK_DR_process_char will treat it as an end of record
+        switch (kind) {
+            case PyUnicode_1BYTE_KIND: {
+                Py_UCS1* uc = (Py_UCS1*)data;
+                Py_UCS1* uc_end = uc + linelen;
+                while (uc < uc_end) {
+                    if (AK_DR_process_char(dr, cpg, *uc++)) {
+                        Py_DECREF(record);
+                        return -1;
+                    }
+                }
+                break;
             }
-            if (AK_DR_process_char(dr, cpg, c)) {
-                Py_DECREF(record);
-                return -1;
+            case PyUnicode_2BYTE_KIND: {
+                Py_UCS2* uc = (Py_UCS2*)data;
+                Py_UCS2* uc_end = uc + linelen;
+                while (uc < uc_end) {
+                    if (AK_DR_process_char(dr, cpg, *uc++)) {
+                        Py_DECREF(record);
+                        return -1;
+                    }
+                }
+                break;
+            }
+            case PyUnicode_4BYTE_KIND: {
+                Py_UCS4* uc = (Py_UCS4*)data;
+                Py_UCS4* uc_end = uc + linelen;
+                while (uc < uc_end) {
+                    if (AK_DR_process_char(dr, cpg, *uc++)) {
+                        Py_DECREF(record);
+                        return -1;
+                    }
+                }
+                break;
             }
-            pos++;
         }
         Py_DECREF(record);
         // force signaling we are at the end of a line
diff --git a/test/test_delimited_to_arrays.py b/test/test_delimited_to_arrays.py
@@ -688,13 +688,14 @@ def test_delimited_to_arrays_parse_f(self) -> None:
 
     def test_delimited_to_arrays_parse_g(self) -> None:
         msg = [
-            'a, 10, foo',
-            'b,  20, \0',
+            'a,10,foo',
+            'b,20,\0',
             ]
-        # if a null character is encountered
-        with self.assertRaises(RuntimeError):
-            _ = delimited_to_arrays(msg, axis=1)
-
+        # a null character used to raise a RuntimeError; that check seemed unnecessary, and the field now parses as an empty string
+        post = delimited_to_arrays(msg, axis=1)
+        self.assertEqual( [a.tolist() for a in post],
+                [['a', 'b'], [10, 20], ['foo', '']]
+                )
 
     def test_delimited_to_arrays_parse_h(self) -> None:
         msg = [',0', 'False,1']