diff --git a/.github/instructions/python.instructions.md b/.github/instructions/python.instructions.md index 878db1779..558232add 100644 --- a/.github/instructions/python.instructions.md +++ b/.github/instructions/python.instructions.md @@ -14,7 +14,7 @@ pixi run format # Ruff (target: Python 3.8) ## Repository-Specific Conventions - **Path setup**: Root `conftest.py` adds all plugin paths to `sys.path` - **never modify `sys.path` in tests** -- **Test style**: Function-based tests only (`pytest.ini` disables class discovery) +- **Test style**: Primarily function-based tests. Class-based tests (`Test*` and `*Test` patterns) are enabled in `pytest.ini` to support parameterized tests. - **Multi-version**: Changes must work across Python 3.8-3.14 - test with `pixi run test-all` - **PyArrow versions**: Tightly coupled to Python version in `pixi.toml` - don't update independently diff --git a/org.knime.python3.arrow.tests/src/test/python/unittest/emptyGeneratedTestData.zip b/org.knime.python3.arrow.tests/src/test/python/unittest/emptyGeneratedTestData.zip new file mode 100644 index 000000000..d07bdb2f7 --- /dev/null +++ b/org.knime.python3.arrow.tests/src/test/python/unittest/emptyGeneratedTestData.zip @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2ab7dc4c6a5dd3fd1a6c8ea5c6a9ac535d2d24405f66a0a7bcacf8ba66ea9331 +size 35026 diff --git a/org.knime.python3.arrow.tests/src/test/python/unittest/structDictEncodedDataCellsWithBatches.zip b/org.knime.python3.arrow.tests/src/test/python/unittest/structDictEncodedDataCellsWithBatches.zip new file mode 100644 index 000000000..554b8ae91 --- /dev/null +++ b/org.knime.python3.arrow.tests/src/test/python/unittest/structDictEncodedDataCellsWithBatches.zip @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cd23d6342465242ca35320f569168e89687665ee2d4bafded40de92b59fcd7eb +size 30330 diff --git a/org.knime.python3.arrow.tests/src/test/python/unittest/test_pandas_extension_type.py 
b/org.knime.python3.arrow.tests/src/test/python/unittest/test_pandas_extension_type.py index a68662ee7..8e82781bf 100644 --- a/org.knime.python3.arrow.tests/src/test/python/unittest/test_pandas_extension_type.py +++ b/org.knime.python3.arrow.tests/src/test/python/unittest/test_pandas_extension_type.py @@ -74,6 +74,7 @@ _generate_test_data_frame, _apply_to_array, _register_extension_types, + _generate_arrow_table, ) @@ -942,6 +943,49 @@ def test_struct_dict_encoded_logical_type_extension_type(self): self.assertEqual(df["Name"].iloc[5], "LINESTRING (40 20, 10 30, 35 40)") self.assertEqual(df["Name"].iloc[6], "LINESTRING (30 10, 10 30, 40 40)") + def test_struct_dict_encoding_with_chunks_regression(self): + """ + Regression test for struct dict encoded array corruption during round-trip conversion. + + Bug Context: + KNIME uses struct dict encoding to compress data: values are stored only on first + occurrence in a struct {key: uint, value: T}, with subsequent occurrences referencing + them by key. This creates an encoded array where keys point to indices in the data + array, and the data array contains actual values at those indices. + + The Problem: + In pandas 2.1.0+, pd.concat changed and calls KnimePandasExtensionArray.take with indices that + re-batch ChunkedArrays at different boundaries. When an already-encoded struct dict + array gets split incorrectly: + - Chunk 1 might contain data array with actual values at indices [0,1,2...] + - Chunk 2 gets keys [0,0,0...] referencing index 0, but chunk 2's data array has + null at index 0 (the actual value is in chunk 1) + + This causes reads to fail with "Cannot read DataCell with empty type information" + because the dictionary structure is broken across chunks. + + What This Test Does: + 1. Loads structDictEncodedDataCellsWithBatches.zip which + - has 3 batches + - contains a struct dict encoded column for generic data cells + 2. Performs arrow → pandas → arrow round-trip conversion + 3. 
Asserts all columns remain equal after round-trip + + This was fixed by shortcutting taking indices from the storage array in KnimePandasExtensionArray.take if + the indices cover the full array with the lines: + ``` + if len(indices) == len(self) and np.all(indices == np.arange(len(self))): + return self.copy() + ``` + """ + arrow_table = _generate_arrow_table("structDictEncodedDataCellsWithBatches.zip") + df = kap.arrow_data_to_pandas_df(arrow_table) + arrow_table_2 = kap.pandas_df_to_arrow(df) + + self.assertEqual(len(arrow_table_2.columns), 2) + self.assertEqual(arrow_table.column(0), arrow_table_2.column(0)) + self.assertEqual(arrow_table.column(1), arrow_table_2.column(1)) + def test_chunk_calculation(self): def _get_chunked_array_for_start_indices(chunk_start_indices): chunk_list = [] diff --git a/org.knime.python3.arrow.tests/src/test/python/unittest/test_table.py b/org.knime.python3.arrow.tests/src/test/python/unittest/test_table.py index 689159859..2178911d2 100644 --- a/org.knime.python3.arrow.tests/src/test/python/unittest/test_table.py +++ b/org.knime.python3.arrow.tests/src/test/python/unittest/test_table.py @@ -1,6 +1,7 @@ import unittest import pyarrow as pa import pandas as pd +import pytest import knime_schema as ks from knime.api.schema import _ColumnSlicingOperation @@ -109,18 +110,28 @@ def close(self): pass -class ArrowTableTest(unittest.TestCase): - @staticmethod - def _generate_test_table(): +@pytest.mark.parametrize( + ("file_name", "is_empty"), + [ + ("generatedTestData.zip", False), # Always test with data + *( + [("emptyGeneratedTestData.zip", True)] + if int(pa.__version__.split(".")[0]) > 6 + else [] + ), # Test with empty table only for pyarrow>6 + ], +) +class ArrowTableTest: + def _generate_test_table(self, file_name): """ Creates a Dataframe from a KNIME table on disk - @param path: path for the KNIME Table + @param file_name: file name for the KNIME Table @return: Arrow Table containing data from KNIME's TestDataGenerator """ 
import os knime_generated_table_path = os.path.normpath( - os.path.join(__file__, "..", "generatedTestData.zip") + os.path.join(__file__, "..", file_name) ) column_names = [ "StringCol", @@ -173,170 +184,166 @@ def generate_test_table_with_rowid_column(self): ) return kt.Table.from_pyarrow(rowid_table, row_ids="generate") - @classmethod - def setUpClass(cls): + @pytest.fixture(scope="class", autouse=True) + def setup_class(self): import knime.api.table as ktn ktn._backend = kat._ArrowBackend(lambda: DummyDataSink()) - cls._test_table = cls._generate_test_table() - - def test_table_setup(self): - table = self._test_table - self.assertTrue(table.num_columns > 0) - self.assertTrue(table.num_rows > 0) - self.assertIsInstance(table, kat.ArrowSourceTable) - self.assertIsInstance(table, kt.Table) - self.assertTrue(hasattr(table, "to_batches")) - - def test_table_view(self): - table = self._test_table + yield + + def test_table_setup(self, file_name, is_empty): + table = self._generate_test_table(file_name) + assert table.num_columns > 0 + assert table.num_rows == 0 if is_empty else table.num_rows > 0 + assert isinstance(table, kat.ArrowSourceTable) + assert isinstance(table, kt.Table) + assert hasattr(table, "to_batches") + + def test_table_view(self, file_name, is_empty): + table = self._generate_test_table(file_name) tv = table[:, :] - self.assertTrue(tv.num_columns > 0) - self.assertTrue(tv.num_rows > 0) - self.assertEqual(table.num_columns, tv.num_columns) - self.assertEqual(table.num_rows, tv.num_rows) - self.assertNotIsInstance(tv, kat.ArrowSourceTable) - self.assertIsInstance(tv, kt._Tabular) - self.assertIsInstance(tv, kt._TabularView) - with self.assertRaises(RuntimeError): + assert tv.num_columns > 0 + assert tv.num_rows == 0 if is_empty else tv.num_rows > 0 + assert table.num_columns == tv.num_columns + assert table.num_rows == tv.num_rows + assert not isinstance(tv, kat.ArrowSourceTable) + assert isinstance(tv, kt._Tabular) + assert isinstance(tv, 
kt._TabularView) + with pytest.raises(RuntimeError): for b in tv.to_batches(): print(b) - def test_row_slicing(self): - table = self._test_table + def test_row_slicing(self, file_name, is_empty): + table = self._generate_test_table(file_name) sliced = table[:, 3:13] - self.assertEqual(table.num_columns, sliced.num_columns) - self.assertEqual(10, sliced.num_rows) + assert table.num_columns == sliced.num_columns + assert sliced.num_rows == 0 if is_empty else 10 == sliced.num_rows sliced = table[:, :13] - self.assertEqual(table.num_columns, sliced.num_columns) - self.assertEqual(13, sliced.num_rows) + assert table.num_columns == sliced.num_columns + assert sliced.num_rows == 0 if is_empty else 13 == sliced.num_rows sliced = table[:, -5:] - self.assertEqual(table.num_columns, sliced.num_columns) - self.assertEqual(5, sliced.num_rows) + assert table.num_columns == sliced.num_columns + assert sliced.num_rows == 0 if is_empty else 5 == sliced.num_rows - def test_column_slicing_slice(self): - table = self._test_table + def test_column_slicing_slice(self, file_name, is_empty): + table = self._generate_test_table(file_name) # use a slice sliced = table[3:13] - self.assertEqual(table.num_rows, sliced.num_rows) - self.assertEqual(10, sliced.num_columns) + assert table.num_rows == sliced.num_rows + assert 10 == sliced.num_columns sliced = table[:13] - self.assertEqual(table.num_rows, sliced.num_rows) - self.assertEqual(13, sliced.num_columns) + assert table.num_rows == sliced.num_rows + assert 13 == sliced.num_columns sliced = table[-5:] - self.assertEqual(table.num_rows, sliced.num_rows) - self.assertEqual(5, sliced.num_columns) + assert table.num_rows == sliced.num_rows + assert 5 == sliced.num_columns - def test_column_slicing_int(self): - table = self._test_table + def test_column_slicing_int(self, file_name, is_empty): + table = self._generate_test_table(file_name) # use a single index sliced = table[2] - self.assertEqual(1, sliced.num_columns) - 
self.assertEqual(sliced.column_names[0], table.column_names[2]) + assert 1 == sliced.num_columns + assert sliced.column_names[0] == table.column_names[2] - def test_column_slicing_str(self): - table = self._test_table + def test_column_slicing_str(self, file_name, is_empty): + table = self._generate_test_table(file_name) # use a single column name sliced = table[table.column_names[2]] - self.assertEqual(1, sliced.num_columns) - self.assertEqual(sliced.column_names[0], table.column_names[2]) + assert 1 == sliced.num_columns + assert sliced.column_names[0] == table.column_names[2] - def test_column_slicing_int_list(self): - table = self._test_table + def test_column_slicing_int_list(self, file_name, is_empty): + table = self._generate_test_table(file_name) # use a list of indices in wild order indices = [3, 7, 2] sliced = table[indices] - self.assertEqual(len(indices), sliced.num_columns) - self.assertTrue( - all( - table.column_names[i] == sliced.column_names[e] - for e, i in enumerate(indices) - ) + assert len(indices) == sliced.num_columns + assert all( + table.column_names[i] == sliced.column_names[e] + for e, i in enumerate(indices) ) - def test_column_slicing_str_list(self): - table = self._test_table + def test_column_slicing_str_list(self, file_name, is_empty): + table = self._generate_test_table(file_name) # use a list of indices in wild order indices = [3, 7, 2] names = [table.column_names[i] for i in indices] sliced = table[names] - self.assertEqual(len(names), sliced.num_columns) - self.assertListEqual( - [table.column_names[i] for i in indices], - [sliced.column_names[i] for i in range(len(indices))], - ) + assert len(names) == sliced.num_columns + assert [table.column_names[i] for i in indices] == [ + sliced.column_names[i] for i in range(len(indices)) + ] data = sliced.to_pyarrow() - self.assertEqual(data.schema.names[0], "") - self.assertListEqual( - [table.column_names[i] for i in indices], - [data.schema.names[i + 1] for i in range(len(indices))], - 
) + assert data.schema.names[0] == "" + assert [table.column_names[i] for i in indices] == [ + data.schema.names[i + 1] for i in range(len(indices)) + ] - def test_both_slicings(self): - table = self._test_table + def test_both_slicings(self, file_name, is_empty): + table = self._generate_test_table(file_name) sliced = table[5:10, 5:10] - self.assertEqual(5, sliced.num_columns) - self.assertEqual(5, sliced.num_rows) + assert 5 == sliced.num_columns + assert 0 == sliced.num_rows if is_empty else 5 == sliced.num_rows data = sliced.to_pyarrow() - self.assertEqual(5, len(data)) + assert 0 == len(data) if is_empty else 5 == len(data) - def test_to_from_pyarrow(self): - table = self._test_table + def test_to_from_pyarrow(self, file_name, is_empty): + table = self._generate_test_table(file_name) data = table.to_pyarrow() other = kt.Table.from_pyarrow(data) - self.assertEqual(table.num_rows, other.num_rows) - self.assertEqual(table.num_columns, other.num_columns) - self.assertEqual(table.column_names, other.column_names) + assert table.num_rows == other.num_rows + assert table.num_columns == other.num_columns + assert table.column_names == other.column_names - with self.assertRaises(RuntimeError): + with pytest.raises(RuntimeError): self.generate_test_table_with_rowid_column().to_pyarrow() - def test_to_from_pandas(self): - table = self._test_table + def test_to_from_pandas(self, file_name, is_empty): + table = self._generate_test_table(file_name) data = table.to_pandas() other = kt.Table.from_pandas(data) - self.assertEqual(table.num_rows, other.num_rows) - self.assertEqual(table.num_columns, other.num_columns) - self.assertEqual(table.column_names, other.column_names) + assert table.num_rows == other.num_rows + assert table.num_columns == other.num_columns + assert table.column_names == other.column_names - with self.assertRaises(RuntimeError): + with pytest.raises(RuntimeError): self.generate_test_table_with_rowid_column().to_pandas() - def 
test_pandas_roundtrip_with_data(self): - table = self._test_table + def test_pandas_roundtrip_with_data(self, file_name, is_empty): + table = self._generate_test_table(file_name) data = table.to_pandas() other = kt.Table.from_pandas(data) data2 = other.to_pandas() pd.testing.assert_frame_equal(data, data2) - def test_to_batches(self): - table = self._test_table + def test_to_batches(self, file_name, is_empty): + table = self._generate_test_table(file_name) batches = list(table.to_batches()) - self.assertTrue(len(batches) > 0) - self.assertIsInstance(batches[0], kt.Table) - self.assertEqual(table.num_columns, batches[0].num_columns) + assert len(batches) > 0 + assert isinstance(batches[0], kt.Table) + assert table.num_columns == batches[0].num_columns - def test_batches(self): - table = self._test_table + def test_batches(self, file_name, is_empty): + table = self._generate_test_table(file_name) batches = list(table.batches()) - self.assertTrue(len(batches) > 0) - self.assertIsInstance(batches[0], kt.Table) - self.assertEqual(table.num_columns, batches[0].num_columns) + assert len(batches) > 0 + assert isinstance(batches[0], kt.Table) + assert table.num_columns == batches[0].num_columns - def test_schema(self): - table = self._test_table + def test_schema(self, file_name, is_empty): + table = self._generate_test_table(file_name) schema = table.schema - self.assertEqual(table.num_columns, schema.num_columns) - self.assertEqual(schema.column_names[0], "StringCol") - self.assertEqual(schema[3].ktype, ks.int32()) + assert table.num_columns == schema.num_columns + assert schema.column_names[0] == "StringCol" + assert schema[3].ktype == ks.int32() - def test_row_ids_from_pandas(self): + def test_row_ids_from_pandas(self, file_name, is_empty): df_len = 5 def create_df(index): @@ -355,17 +362,17 @@ def get_row_ids(table): # Simple range index table = kt.Table.from_pandas(create_df(pd.RangeIndex(df_len)), row_ids="keep") - self.assertListEqual(get_row_ids(table), [f"{i}" for 
i in range(df_len)]) + assert get_row_ids(table) == [f"{i}" for i in range(df_len)] # Strings index = ["a", "b", "foo", "g", "e"] table = kt.Table.from_pandas(create_df(pd.Index(index)), row_ids="keep") - self.assertListEqual(get_row_ids(table), index) + assert get_row_ids(table) == index # Floats index = [2.0, 7.1, 1000.1, 0.0001, 3.14] table = kt.Table.from_pandas(create_df(pd.Index(index)), row_ids="keep") - self.assertListEqual(get_row_ids(table), [str(i) for i in index]) + assert get_row_ids(table) == [str(i) for i in index] # ========== row_ids = "generate" @@ -375,17 +382,17 @@ def get_row_ids(table): table = kt.Table.from_pandas( create_df(pd.RangeIndex(df_len)), row_ids="generate" ) - self.assertListEqual(get_row_ids(table), generated_keys) + assert get_row_ids(table) == generated_keys # Strings index = ["a", "b", "foo", "g", "e"] table = kt.Table.from_pandas(create_df(pd.Index(index)), row_ids="generate") - self.assertListEqual(get_row_ids(table), generated_keys) + assert get_row_ids(table) == generated_keys # Floats index = [2.0, 7.1, 1000.1, 0.0001, 3.14] table = kt.Table.from_pandas(create_df(pd.Index(index)), row_ids="generate") - self.assertListEqual(get_row_ids(table), generated_keys) + assert get_row_ids(table) == generated_keys # ========== row_ids = "auto" @@ -393,35 +400,33 @@ def get_row_ids(table): # Simple range index table = kt.Table.from_pandas(create_df(pd.RangeIndex(df_len)), row_ids="auto") - self.assertListEqual(get_row_ids(table), generated_keys) + assert get_row_ids(table) == generated_keys # Strings index = ["a", "b", "foo", "g", "e"] table = kt.Table.from_pandas(create_df(pd.Index(index)), row_ids="auto") - self.assertListEqual(get_row_ids(table), index) + assert get_row_ids(table) == index # Floats index = [2.0, 7.1, 1000.1, 0.0001, 3.14] table = kt.Table.from_pandas(create_df(pd.Index(index)), row_ids="auto") - self.assertListEqual(get_row_ids(table), [str(i) for i in index]) + assert get_row_ids(table) == [str(i) for i in index] 
# Integers index = [5, 2, 1, 4, 7] table = kt.Table.from_pandas(create_df(pd.Index(index)), row_ids="auto") - self.assertListEqual(get_row_ids(table), [f"Row{i}" for i in index]) + assert get_row_ids(table) == [f"Row{i}" for i in index] # Range starting at 10 index = pd.RangeIndex(10, 10 + df_len) table = kt.Table.from_pandas(create_df(index), row_ids="auto") - self.assertListEqual( - get_row_ids(table), [f"Row{i}" for i in range(10, 10 + df_len)] - ) + assert get_row_ids(table) == [f"Row{i}" for i in range(10, 10 + df_len)] # ========== row_ids = "unsupported" - with self.assertRaises(ValueError): + with pytest.raises(ValueError): kt.Table.from_pandas(create_df(None), row_ids="unsupported") - def test_row_ids_from_pyarrow(self): + def test_row_ids_from_pyarrow(self, file_name, is_empty): table_len = 5 def create_table(row_key_col=None): @@ -448,64 +453,64 @@ def get_row_ids(table): # Correct string col for RowID row_ids = ["a", "b", "foo", "g", "e"] table = kt.Table.from_pyarrow(create_table(pa.array(row_ids)), row_ids="keep") - self.assertListEqual(get_row_ids(table), row_ids) - self.assertEqual(table.num_columns, 2) + assert get_row_ids(table) == row_ids + assert table.num_columns == 2 # Use first string column as RowID table = kt.Table.from_pyarrow(create_table(), row_ids="keep") - self.assertListEqual(get_row_ids(table), create_table()[0].to_pylist()) - self.assertEqual(table.num_columns, 1) + assert get_row_ids(table) == create_table()[0].to_pylist() + assert table.num_columns == 1 # Column named "" but with a wrong type row_ids = [4, 1, 3, 2, 8] - with self.assertRaises(TypeError): + with pytest.raises(TypeError): kt.Table.from_pyarrow(create_table(pa.array(row_ids)), row_ids="keep") # ========== row_ids = "generate" # Simple: Create new RowIDs table = kt.Table.from_pyarrow(create_table(), row_ids="generate") - self.assertListEqual(get_row_ids(table), [f"Row{i}" for i in range(table_len)]) - self.assertEqual(table.num_columns, 2) + assert get_row_ids(table) 
== [f"Row{i}" for i in range(table_len)] + assert table.num_columns == 2 # RowID column present but generate another one row_ids = ["a", "b", "foo", "g", "e"] table = kt.Table.from_pyarrow( create_table(pa.array(row_ids)), row_ids="generate" ) - self.assertListEqual(get_row_ids(table), [f"Row{i}" for i in range(table_len)]) - self.assertEqual(table.num_columns, 3) + assert get_row_ids(table) == [f"Row{i}" for i in range(table_len)] + assert table.num_columns == 3 # ========== row_ids = "auto" # First column is string but not "" table = kt.Table.from_pyarrow(create_table(), row_ids="auto") - self.assertListEqual(get_row_ids(table), [f"Row{i}" for i in range(table_len)]) - self.assertEqual(table.num_columns, 2) + assert get_row_ids(table) == [f"Row{i}" for i in range(table_len)] + assert table.num_columns == 2 # First column is string and "" row_ids = ["a", "b", "foo", "g", "e"] table = kt.Table.from_pyarrow(create_table(pa.array(row_ids)), row_ids="auto") - self.assertListEqual(get_row_ids(table), row_ids) - self.assertEqual(table.num_columns, 2) + assert get_row_ids(table) == row_ids + assert table.num_columns == 2 # First column is integer and "" row_ids = [4, 1, 3, 2, 8] table = kt.Table.from_pyarrow(create_table(pa.array(row_ids)), row_ids="auto") - self.assertListEqual(get_row_ids(table), [f"Row{i}" for i in row_ids]) - self.assertEqual(table.num_columns, 2) + assert get_row_ids(table) == [f"Row{i}" for i in row_ids] + assert table.num_columns == 2 # First column cannot converted to RowID but is named "" row_ids = [2.0, 7.1, 1000.1, 0.0001, 3.14] table = kt.Table.from_pyarrow(create_table(pa.array(row_ids)), row_ids="auto") - self.assertListEqual(get_row_ids(table), [f"Row{i}" for i in range(table_len)]) - self.assertEqual(table.num_columns, 3) + assert get_row_ids(table) == [f"Row{i}" for i in range(table_len)] + assert table.num_columns == 3 # ========== row_ids = "unsupported" - with self.assertRaises(ValueError): + with pytest.raises(ValueError): 
kt.Table.from_pyarrow(create_table(), row_ids="unsupported") - def test_constant_batch_size_check(self): + def test_constant_batch_size_check(self, file_name, is_empty): def _create_pyarrow_table(batch_sizes): return pa.Table.from_batches( [ @@ -516,20 +521,16 @@ def _create_pyarrow_table(batch_sizes): # Constant batch sizes + last batch smaller - should work knime_table = kt.Table.from_pyarrow(_create_pyarrow_table([10, 10, 7])) - self.assertEqual(knime_table.num_rows, 27) - self.assertEqual(len(knime_table._table.to_batches()), 3) + assert knime_table.num_rows == 27 + assert len(knime_table._table.to_batches()) == 3 # Non-constant batch sizes - should fail - self.assertRaises( - ValueError, - lambda: kt.Table.from_pyarrow(_create_pyarrow_table([10, 10, 12, 10])), - ) + with pytest.raises(ValueError): + kt.Table.from_pyarrow(_create_pyarrow_table([10, 10, 12, 10])) # Last batch bigger than the other batches - should fail - self.assertRaises( - ValueError, - lambda: kt.Table.from_pyarrow(_create_pyarrow_table([10, 10, 12])), - ) + with pytest.raises(ValueError): + kt.Table.from_pyarrow(_create_pyarrow_table([10, 10, 12])) class BatchOutputTableTest(unittest.TestCase): diff --git a/org.knime.python3.arrow/src/main/python/knime/_arrow/_dictencoding.py b/org.knime.python3.arrow/src/main/python/knime/_arrow/_dictencoding.py index 433ba548c..499616f31 100644 --- a/org.knime.python3.arrow/src/main/python/knime/_arrow/_dictencoding.py +++ b/org.knime.python3.arrow/src/main/python/knime/_arrow/_dictencoding.py @@ -422,7 +422,7 @@ def create_storage_for_struct_dict_encoded_array( else: entries_array = pa.array(entries, type=value_type) - mask_array = pa.array(mask) + mask_array = pa.array(mask, type=pa.bool_()) if key_type is None: key_type = pa.uint64() keys_array = pa.array(keys, type=key_type) diff --git a/org.knime.python3.arrow/src/main/python/knime/_arrow/_pandas.py b/org.knime.python3.arrow/src/main/python/knime/_arrow/_pandas.py index 354a1e0d0..e107a922b 100644 --- 
a/org.knime.python3.arrow/src/main/python/knime/_arrow/_pandas.py +++ b/org.knime.python3.arrow/src/main/python/knime/_arrow/_pandas.py @@ -104,6 +104,7 @@ def _integrate_row_ids(data_frame: pd.DataFrame, row_ids: str) -> pd.DataFrame: row_ids_series = _create_row_ids_for_auto_keep(data_frame, row_ids) # Prepend the index to the data_frame: row_ids_series.name = "" + # NOTE(AP-25573) since pandas 2.1.0 pd.concat will call __getitem__ on our extension array return pd.concat( [row_ids_series.reset_index(drop=True), data_frame.reset_index(drop=True)], axis=1, @@ -787,6 +788,9 @@ def nbytes(self): return self._data.nbytes def isna(self): + if len(self) == 0: + return np.array([], dtype=bool) + # needed for pandas ExtensionArray API if isinstance(self._data, pa.ChunkedArray): return np.concatenate( @@ -836,18 +840,12 @@ def take( numpy.take api.extensions.take """ - if (isinstance(indices, list) and indices == [None] * len(indices)) or ( - isinstance(indices, np.ndarray) and (indices == None).all() - ): - return self._from_sequence( - [None] * len(indices), - storage_type=self._storage_type, - logical_type=self._logical_type, - converter=self._converter, - ) if isinstance(indices, list): - indices = np.array(indices) + indices = np.array(indices, dtype=np.int64) + + if len(indices) == len(self) and np.all(indices == np.arange(len(self))): + return self.copy() storage = katy._to_storage_array( self._data @@ -872,6 +870,8 @@ def take( taken = pa.array(taken, type=storage_type, mask=null_mask) else: + # TODO(AP-25592) storage.take on a ChunkedArray with an inner StructDictEncodedArray will merge + # the chunks without re-encoding the struct dict encoding properly taken = storage.take(indices) wrapped = katy._to_extension_array(taken, self._data.type) diff --git a/org.knime.python3.arrow/src/main/python/knime/_arrow/_table.py b/org.knime.python3.arrow/src/main/python/knime/_arrow/_table.py index 7c0407bd1..38111ef26 100644 --- 
a/org.knime.python3.arrow/src/main/python/knime/_arrow/_table.py +++ b/org.knime.python3.arrow/src/main/python/knime/_arrow/_table.py @@ -346,6 +346,8 @@ def _split_table(self, data: pa.Table) -> List[pa.RecordBatch]: if desired_num_batches < 1: return data.to_batches() num_rows_per_batch = int(len(data) // desired_num_batches) + # TODO(AP-25594) this leads to re-batching of struct-dict-encoded arrays and they + # won't be re-encoded properly return data.to_batches(max_chunksize=num_rows_per_batch) diff --git a/org.knime.python3.arrow/src/main/python/knime/_arrow/_types.py b/org.knime.python3.arrow/src/main/python/knime/_arrow/_types.py index 100faa9ba..6a8e06500 100644 --- a/org.knime.python3.arrow/src/main/python/knime/_arrow/_types.py +++ b/org.knime.python3.arrow/src/main/python/knime/_arrow/_types.py @@ -198,7 +198,8 @@ def get_struct_arrays(arrays, inner_fns, dtype): [ not any([value.is_valid for value in values]) for values in zip(*struct_arrays) - ] + ], + type=pa.bool_(), ), ) diff --git a/pytest.ini b/pytest.ini index 51fda90cc..fdedfe9ab 100644 --- a/pytest.ini +++ b/pytest.ini @@ -1,3 +1,2 @@ [pytest] -# Ignore all classes that are not a unittest.TestCase -python_classes = +python_classes = Test* *Test