Skip to content

Commit f875ec6

Browse files
authored
Merge pull request #102 from static-frame/100/dta-proc-rec
2 parents 9b94d77 + 232106c commit f875ec6

File tree

2 files changed

+42
-19
lines changed

2 files changed

+42
-19
lines changed

src/_arraykit.c

Lines changed: 35 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -2612,8 +2612,7 @@ AK_DR_ProcessRecord(AK_DelimitedReader *dr,
26122612
PyObject *line_select
26132613
)
26142614
{
2615-
Py_UCS4 c;
2616-
Py_ssize_t pos, linelen;
2615+
Py_ssize_t linelen;
26172616
unsigned int kind;
26182617
const void *data;
26192618
PyObject *record;
@@ -2667,20 +2666,43 @@ AK_DR_ProcessRecord(AK_DelimitedReader *dr,
26672666

26682667
kind = PyUnicode_KIND(record);
26692668
data = PyUnicode_DATA(record);
2670-
pos = 0;
26712669
linelen = PyUnicode_GET_LENGTH(record);
2672-
while (linelen--) {
2673-
c = PyUnicode_READ(kind, data, pos);
2674-
if (c == '\0') {
2675-
Py_DECREF(record);
2676-
PyErr_Format(PyExc_RuntimeError, "line contains NUL");
2677-
return -1;
2670+
2671+
// NOTE: we used to check that the read character was not \0; this seems rare enough that it does not need to be handled explicitly, as AK_DR_process_char will treat it as an end of record
2672+
switch (kind) {
2673+
case PyUnicode_1BYTE_KIND: {
2674+
Py_UCS1* uc = (Py_UCS1*)data;
2675+
Py_UCS1* uc_end = uc + linelen;
2676+
while (uc < uc_end) {
2677+
if (AK_DR_process_char(dr, cpg, *uc++)) {
2678+
Py_DECREF(record);
2679+
return -1;
2680+
}
2681+
}
2682+
break;
26782683
}
2679-
if (AK_DR_process_char(dr, cpg, c)) {
2680-
Py_DECREF(record);
2681-
return -1;
2684+
case PyUnicode_2BYTE_KIND: {
2685+
Py_UCS2* uc = (Py_UCS2*)data;
2686+
Py_UCS2* uc_end = uc + linelen;
2687+
while (uc < uc_end) {
2688+
if (AK_DR_process_char(dr, cpg, *uc++)) {
2689+
Py_DECREF(record);
2690+
return -1;
2691+
}
2692+
}
2693+
break;
2694+
}
2695+
case PyUnicode_4BYTE_KIND: {
2696+
Py_UCS4* uc = (Py_UCS4*)data;
2697+
Py_UCS4* uc_end = uc + linelen;
2698+
while (uc < uc_end) {
2699+
if (AK_DR_process_char(dr, cpg, *uc++)) {
2700+
Py_DECREF(record);
2701+
return -1;
2702+
}
2703+
}
2704+
break;
26822705
}
2683-
pos++;
26842706
}
26852707
Py_DECREF(record);
26862708
// force signaling we are at the end of a line

test/test_delimited_to_arrays.py

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -688,13 +688,14 @@ def test_delimited_to_arrays_parse_f(self) -> None:
688688

689689
def test_delimited_to_arrays_parse_g(self) -> None:
690690
msg = [
691-
'a, 10, foo',
692-
'b, 20, \0',
691+
'a,10,foo',
692+
'b,20,\0',
693693
]
694-
# if a null character is encountered
695-
with self.assertRaises(RuntimeError):
696-
_ = delimited_to_arrays(msg, axis=1)
697-
694+
# encountering a null character used to raise a RuntimeError; that check seemed unnecessary, so parsing is now expected to succeed
695+
post = delimited_to_arrays(msg, axis=1)
696+
self.assertEqual( [a.tolist() for a in post],
697+
[['a', 'b'], [10, 20], ['foo', '']]
698+
)
698699

699700
def test_delimited_to_arrays_parse_h(self) -> None:
700701
msg = [',0', 'False,1']

0 commit comments

Comments
 (0)