@@ -3535,7 +3535,7 @@ resolve_dtype_iter(PyObject *Py_UNUSED(m), PyObject *arg) {
35353535//------------------------------------------------------------------------------
35363536// general utility
35373537
3538- #define NONZERO_APPEND_INDEX { \
3538+ #define NONZERO_APPEND_INDEX_RELATIVE { \
35393539 if (AK_UNLIKELY(count == capacity)) { \
35403540 capacity <<= 1; \
35413541 indices = (npy_int64*)realloc(indices, sizeof(npy_int64) * capacity);\
@@ -3546,6 +3546,17 @@ resolve_dtype_iter(PyObject *Py_UNUSED(m), PyObject *arg) {
35463546 indices[count++] = p - p_start; \
35473547} \
35483548
3549+ #define NONZERO_APPEND_INDEX_ABSOLUTE { /* Append the absolute position `i` to the growable `indices` buffer; expands in place and uses `count`, `capacity`, `indices`, and `i` from the expansion site. */\
3550+ if (AK_UNLIKELY(count == capacity)) { /* buffer is full: grow before writing */\
3551+ capacity <<= 1; /* double the element capacity */\
3552+ indices = (npy_int64*)realloc(indices, sizeof(npy_int64) * capacity);/* NOTE(review): assigning realloc's result directly to `indices` drops the only pointer to the old buffer if realloc fails */\
3553+ if (indices == NULL) { \
3554+ return NULL; /* NOTE(review): returns from the enclosing function; the visible call sites sit between NPY_BEGIN_THREADS and NPY_END_THREADS, and no Python exception is set here -- confirm this is intended */\
3555+ } \
3556+ } \
3557+ indices[count++] = i; /* store the position, then advance the fill count */\
3558+ } /* NOTE(review): no #undef for this macro is visible in this excerpt -- confirm */\
3559+
35493560// Given a Boolean, contiguous 1D array, return the index positions in an int64 array.
35503561// Through experimentation it has been verified that doing full-size allocation of memory does not permit outperforming NumPy at 10_000_000 scale; but doing fewer optimizations does help. Using bit masks does not improve performance over pointer arithmetic. Prescanning for all empty is very effective. Note that NumPy benefits from first counting the nonzeros, then allocating only enough data for the expected number.
35513562static inline PyObject *
@@ -3567,44 +3578,71 @@ AK_nonzero_1d(PyArrayObject* array) {
35673578 Py_ssize_t capacity = count_max < 1024 ? count_max : count_max / 8 ;
35683579 npy_int64 * indices = (npy_int64 * )malloc (sizeof (npy_int64 ) * capacity );
35693580
3570- // array is contiguous, 1d, boolean
35713581 NPY_BEGIN_THREADS_DEF ;
35723582 NPY_BEGIN_THREADS ;
35733583
3574- npy_bool * p_start = (npy_bool * )PyArray_DATA (array );
3575- npy_bool * p = p_start ;
3576- npy_bool * p_end = p + count_max ;
3577- npy_bool * p_end_roll = p_end - size_div .rem ;
3584+ if (PyArray_IS_C_CONTIGUOUS (array )) {
3585+ npy_bool * p_start = (npy_bool * )PyArray_DATA (array );
3586+ npy_bool * p = p_start ;
3587+ npy_bool * p_end = p + count_max ;
3588+ npy_bool * p_end_roll = p_end - size_div .rem ;
35783589
3579- while (p < p_end_roll ) {
3580- if (* (npy_uint64 * )p == 0 ) {
3581- p += 8 ; // no true within this 8 byte roll region
3582- continue ;
3590+ while (p < p_end_roll ) {
3591+ if (* (npy_uint64 * )p == 0 ) {
3592+ p += 8 ; // no true within this 8 byte roll region
3593+ continue ;
3594+ }
3595+ if (* p ) {NONZERO_APPEND_INDEX_RELATIVE ;}
3596+ p ++ ;
3597+ if (* p ) {NONZERO_APPEND_INDEX_RELATIVE ;}
3598+ p ++ ;
3599+ if (* p ) {NONZERO_APPEND_INDEX_RELATIVE ;}
3600+ p ++ ;
3601+ if (* p ) {NONZERO_APPEND_INDEX_RELATIVE ;}
3602+ p ++ ;
3603+ if (* p ) {NONZERO_APPEND_INDEX_RELATIVE ;}
3604+ p ++ ;
3605+ if (* p ) {NONZERO_APPEND_INDEX_RELATIVE ;}
3606+ p ++ ;
3607+ if (* p ) {NONZERO_APPEND_INDEX_RELATIVE ;}
3608+ p ++ ;
3609+ if (* p ) {NONZERO_APPEND_INDEX_RELATIVE ;}
3610+ p ++ ;
3611+ }
3612+ while (p < p_end ) {
3613+ if (* p ) {NONZERO_APPEND_INDEX_RELATIVE ;}
3614+ p ++ ;
35833615 }
3584- if (* p ) {NONZERO_APPEND_INDEX ;}
3585- p ++ ;
3586- if (* p ) {NONZERO_APPEND_INDEX ;}
3587- p ++ ;
3588- if (* p ) {NONZERO_APPEND_INDEX ;}
3589- p ++ ;
3590- if (* p ) {NONZERO_APPEND_INDEX ;}
3591- p ++ ;
3592- if (* p ) {NONZERO_APPEND_INDEX ;}
3593- p ++ ;
3594- if (* p ) {NONZERO_APPEND_INDEX ;}
3595- p ++ ;
3596- if (* p ) {NONZERO_APPEND_INDEX ;}
3597- p ++ ;
3598- if (* p ) {NONZERO_APPEND_INDEX ;}
3599- p ++ ;
36003616 }
3601- while (p < p_end ) {
3602- if (* p ) {NONZERO_APPEND_INDEX ;}
3603- p ++ ;
3617+ else {
3618+ npy_intp i = 0 ; // position within Boolean array
3619+ npy_intp i_end = count_max ;
3620+ npy_intp i_end_roll = count_max - size_div .rem ;
3621+ while (i < i_end_roll ) {
3622+ if (* (npy_bool * )PyArray_GETPTR1 (array , i )) {NONZERO_APPEND_INDEX_ABSOLUTE ;}
3623+ i ++ ;
3624+ if (* (npy_bool * )PyArray_GETPTR1 (array , i )) {NONZERO_APPEND_INDEX_ABSOLUTE ;}
3625+ i ++ ;
3626+ if (* (npy_bool * )PyArray_GETPTR1 (array , i )) {NONZERO_APPEND_INDEX_ABSOLUTE ;}
3627+ i ++ ;
3628+ if (* (npy_bool * )PyArray_GETPTR1 (array , i )) {NONZERO_APPEND_INDEX_ABSOLUTE ;}
3629+ i ++ ;
3630+ if (* (npy_bool * )PyArray_GETPTR1 (array , i )) {NONZERO_APPEND_INDEX_ABSOLUTE ;}
3631+ i ++ ;
3632+ if (* (npy_bool * )PyArray_GETPTR1 (array , i )) {NONZERO_APPEND_INDEX_ABSOLUTE ;}
3633+ i ++ ;
3634+ if (* (npy_bool * )PyArray_GETPTR1 (array , i )) {NONZERO_APPEND_INDEX_ABSOLUTE ;}
3635+ i ++ ;
3636+ if (* (npy_bool * )PyArray_GETPTR1 (array , i )) {NONZERO_APPEND_INDEX_ABSOLUTE ;}
3637+ i ++ ;
3638+ }
3639+ while (i < i_end ) {
3640+ if (* (npy_bool * )PyArray_GETPTR1 (array , i )) {NONZERO_APPEND_INDEX_ABSOLUTE ;}
3641+ i ++ ;
3642+ }
36043643 }
36053644 NPY_END_THREADS ;
36063645
3607-
36083646 npy_intp dims = {count };
36093647 final = PyArray_SimpleNewFromData (1 , & dims , NPY_INT64 , (void * )indices );
36103648 if (!final ) {
@@ -3616,7 +3654,7 @@ AK_nonzero_1d(PyArrayObject* array) {
36163654 PyArray_CLEARFLAGS ((PyArrayObject * )final , NPY_ARRAY_WRITEABLE );
36173655 return final ;
36183656}
3619- #undef NONZERO_APPEND_INDEX
3657+ #undef NONZERO_APPEND_INDEX_RELATIVE
36203658
36213659static PyObject *
36223660nonzero_1d (PyObject * Py_UNUSED (m ), PyObject * a ) {
@@ -3630,10 +3668,6 @@ nonzero_1d(PyObject *Py_UNUSED(m), PyObject *a) {
36303668 PyErr_SetString (PyExc_ValueError , "Array must be of type bool" );
36313669 return NULL ;
36323670 }
3633- if (!PyArray_IS_C_CONTIGUOUS (array )) {
3634- PyErr_SetString (PyExc_ValueError , "Array must be contiguous" );
3635- return NULL ;
3636- }
36373671 return AK_nonzero_1d (array );
36383672}
36393673
0 commit comments