diff --git a/.github/workflows/packaging_wheels.yml b/.github/workflows/packaging_wheels.yml index a2e4f857..23a16af7 100644 --- a/.github/workflows/packaging_wheels.yml +++ b/.github/workflows/packaging_wheels.yml @@ -33,11 +33,12 @@ jobs: python: [ cp39, cp310, cp311, cp312, cp313, cp314 ] platform: - { os: windows-2025, arch: amd64, cibw_system: win } + - { os: windows-11-arm, arch: ARM64, cibw_system: win } # cibw requires ARM64 to be uppercase - { os: ubuntu-24.04, arch: x86_64, cibw_system: manylinux } - { os: ubuntu-24.04-arm, arch: aarch64, cibw_system: manylinux } - { os: macos-15, arch: arm64, cibw_system: macosx } - { os: macos-15, arch: universal2, cibw_system: macosx } - - { os: macos-13, arch: x86_64, cibw_system: macosx } + - { os: macos-15-intel, arch: x86_64, cibw_system: macosx } minimal: - ${{ inputs.minimal }} exclude: @@ -46,6 +47,8 @@ jobs: - { minimal: true, python: cp312 } - { minimal: true, python: cp313 } - { minimal: true, platform: { arch: universal2 } } + - { python: cp39, platform: { os: windows-11-arm, arch: ARM64 } } # too many dependency problems for win arm64 + - { python: cp310, platform: { os: windows-11-arm, arch: ARM64 } } # too many dependency problems for win arm64 runs-on: ${{ matrix.platform.os }} env: ### cibuildwheel configuration diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index f3550fb0..727f8027 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -22,6 +22,11 @@ on: options: - test - prod + nightly-stale-after-days: + type: string + description: After how many days should nightlies be considered stale + required: true + default: 3 store-s3: type: boolean description: Also store test packages in S3 (always true for prod) @@ -41,6 +46,17 @@ jobs: duckdb-sha: ${{ inputs.duckdb-sha }} set-version: ${{ inputs.stable-version }} + submodule_pr: + name: Create or update PR to bump submodule to given SHA + needs: build_sdist + uses: ./.github/workflows/submodule_auto_pr.yml + with: + duckdb-python-sha: ${{ inputs.duckdb-python-sha }} + duckdb-sha: ${{ inputs.duckdb-sha }} + secrets: + # reusable workflows and secrets are not great: https://github.com/actions/runner/issues/3206 + DUCKDBLABS_BOT_TOKEN: ${{ secrets.DUCKDBLABS_BOT_TOKEN }} + workflow_state: name: Set state for the release workflow needs: build_sdist @@ -51,23 +67,36 @@ jobs: runs-on: ubuntu-latest steps: - id: index_check - name: Check ${{ needs.build_sdist.outputs.package-version }} on PyPI + name: Check version on PyPI run: | - set -eu - # Check PyPI whether the release we're building is already present + set -ex pypi_hostname=${{ inputs.pypi-index == 'test' && 'test.' || '' }}pypi.org - pkg_version=${{ needs.build_sdist.outputs.package-version }} - url=https://${pypi_hostname}/pypi/duckdb/${pkg_version}/json - http_status=$( curl -s -o /dev/null -w "%{http_code}" $url || echo $? 
) - if [[ $http_status == "200" ]]; then - echo "::warning::Package version ${pkg_version} is already present on ${pypi_hostname}" - pypi_state=VERSION_FOUND - elif [[ $http_status == 000* ]]; then - echo "::error::Error checking PyPI at ${url}: curl exit code ${http_status#'000'}" - pypi_state=UNKNOWN - else - echo "::notice::Package version ${pkg_version} not found on ${pypi_hostname} (http status: ${http_status})" + # install duckdb + curl https://install.duckdb.org | sh + # query pypi + result=$(cat <>'upload_time_iso_8601')::DATE AS age, + FROM read_json('https://${pypi_hostname}/pypi/duckdb/json') AS jd + CROSS JOIN json_each(jd.releases) AS rel(key, value) + CROSS JOIN unnest(FROM_JSON(rel.value, '["JSON"]')) AS file(value) + WHERE rel.key='${{ needs.build_sdist.outputs.package-version }}' + LIMIT 1; + EOF + ) + if [ -z "$result" ]; then pypi_state=VERSION_NOT_FOUND + else + pypi_state=VERSION_FOUND + fi + if [[ -z "${{ inputs.stable-version }}" ]]; then + age=${result#age = } + if [ "${age}" -ge "${{ inputs.nightly-stale-after-days }}" ]; then + echo "::warning title=Stale nightly for ${{ github.ref_name }}::Nightly is ${age} days old (max=${{ inputs.nightly-stale-after-days }})" + fi fi echo "pypi_state=${pypi_state}" >> $GITHUB_OUTPUT @@ -96,7 +125,7 @@ jobs: echo "::notice::S3 upload disabled in inputs, not generating S3 URL" exit 0 fi - if [[ VERSION_FOUND == "${{ steps.index_check.outputs.pypi_state }}" ]]; then + if [[ VERSION_NOT_FOUND != "${{ steps.index_check.outputs.pypi_state }}" ]]; then echo "::warning::S3 upload disabled because package version already uploaded to PyPI" exit 0 fi @@ -110,7 +139,7 @@ jobs: build_wheels: name: Build and test releases needs: workflow_state - if: ${{ needs.workflow_state.outputs.pypi_state != 'VERSION_FOUND' }} + if: ${{ needs.workflow_state.outputs.pypi_state == 'VERSION_NOT_FOUND' }} uses: ./.github/workflows/packaging_wheels.yml with: minimal: false diff --git a/.github/workflows/submodule_auto_pr.yml b/.github/workflows/submodule_auto_pr.yml new file mode 100644 index 00000000..43a9860a --- /dev/null +++ b/.github/workflows/submodule_auto_pr.yml @@ -0,0 +1,130 @@ +name: Submodule Auto PR +on: + workflow_call: + inputs: + duckdb-python-sha: + type: string + description: The commit to build against (defaults to latest commit of current ref) + required: false + duckdb-sha: + type: string + description: The DuckDB submodule commit or ref to build against + required: true + auto-land: + type: boolean + description: Immediately merge the PR (placeholder - doesn't work) + default: false + secrets: + DUCKDBLABS_BOT_TOKEN: + description: Github token of the DuckDBLabs bot + required: true + +defaults: + run: + shell: bash + +jobs: + create_pr: + name: Create PR to bump duckdb submodule to given SHA + runs-on: ubuntu-latest + steps: + - name: Checkout DuckDB Python + uses: actions/checkout@v4 + with: + ref: ${{ inputs.duckdb-python-sha }} + fetch-depth: 0 + submodules: true + + - name: Checkout or Create Needed Branch + run: | + git fetch --all + head_sha=${{ inputs.duckdb-python-sha }} + branch_name="vendoring-${{ github.ref_name }}" + if [[ `git rev-parse --verify ${branch_name} 2>/dev/null` ]]; then + # branch exists + git checkout ${branch_name} + else + # new branch + git checkout -b ${branch_name} + fi + [[ ${head_sha} ]] && git reset --hard ${head_sha} || true + + - name: Checkout DuckDB at Given SHA + run: | + cd external/duckdb + git fetch origin + git checkout ${{ inputs.duckdb-sha }} + + - name: Determine GH PR Command + id: 
gh_pr_command + env: + GH_TOKEN: ${{ secrets.DUCKDBLABS_BOT_TOKEN }} + run: | + pr_url=$( gh pr list --head vendoring-${{ github.ref_name }} --state open --json url --jq '.[].url' ) + if [[ $pr_url ]]; then + echo "::notice::Found existing PR, will edit (${pr_url})" + gh_command="edit ${pr_url}" + else + echo "::notice::No existing PR, will create new" + gh_command="create --head vendoring-${{ github.ref_name }} --base ${{ github.ref_name }}" + fi + echo "subcommand=${gh_command}" >> $GITHUB_OUTPUT + + - name: Set Git User + run: | + git config --global user.email "github_bot@duckdblabs.com" + git config --global user.name "DuckDB Labs GitHub Bot" + + - name: Create PR to Bump DuckDB Submodule + env: + GH_TOKEN: ${{ secrets.DUCKDBLABS_BOT_TOKEN }} + run: | + # No need to do anything if the submodule is already at the given sha + [[ `git status --porcelain -- external/duckdb` == "" ]] && exit 0 + # We have changes. Commit and push + git add external/duckdb + git commit -m "Bump submodule" + git push --force origin vendoring-${{ github.ref_name }} + # create PR msg + echo "Bump duckdb submodule:" > body.txt + echo "- Target branch: ${{ github.ref_name }}" >> body.txt + echo "- Date: $( date +"%Y-%m-%d %H:%M:%S" )" >> body.txt + echo "- DuckDB SHA: ${{ inputs.duckdb-sha }}" >> body.txt + echo "- Trigger: https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}" >> body.txt + subcommand="${{ steps.gh_pr_command.outputs.subcommand }}" + gh pr ${subcommand} \ + --title "[duckdb-labs bot] Bump DuckDB submodule" \ + --body-file body.txt > output.txt 2>&1 + success=$? + # Show summary + url=$( [[ $success ]] && gh pr view vendoring-${{ github.ref_name }} --json url --jq .url || true ) + echo "## Submodule PR Summary" >> $GITHUB_STEP_SUMMARY + if [[ $success ]]; then + prefix=$( [[ $subcommand == edit* ]] && echo "Updated" || echo "Created" ) + echo "### ${prefix} PR: [${url}](${url})" >> $GITHUB_STEP_SUMMARY + else + echo "### Failed to create PR" >> $GITHUB_STEP_SUMMARY + fi + echo '```' >> $GITHUB_STEP_SUMMARY + cat output.txt >> $GITHUB_STEP_SUMMARY + echo '```' >> $GITHUB_STEP_SUMMARY + [[ $success ]] || exit 1 + + - name: Automerge PR + if: ${{ inputs.auto-land }} + env: + GH_TOKEN: ${{ secrets.DUCKDBLABS_BOT_TOKEN }} + run: | + # PLACEHOLDER: DUCKDBLABS_BOT_TOKEN DOES NOT HAVE PERMISSIONS TO MERGE PRS + set -ex + gh pr merge vendoring-${{ github.ref_name }} --rebase > output.txt + success=$?
+ # Show summary + if [[ $success ]]; then + echo "### PR merged" >> $GITHUB_STEP_SUMMARY + else + echo "### Failed to auto-merge PR" >> $GITHUB_STEP_SUMMARY + fi + echo '```' >> $GITHUB_STEP_SUMMARY + cat output.txt >> $GITHUB_STEP_SUMMARY + echo '```' >> $GITHUB_STEP_SUMMARY diff --git a/.github/workflows/targeted_test.yml b/.github/workflows/targeted_test.yml index 812bb9c5..13ae9566 100644 --- a/.github/workflows/targeted_test.yml +++ b/.github/workflows/targeted_test.yml @@ -9,6 +9,7 @@ on: type: choice options: - 'windows-2025' + - 'windows-11-arm' - 'ubuntu-24.04' - 'ubuntu-24.04-arm' - 'macos-15' @@ -36,6 +37,11 @@ on: description: 'Custom test path (must be in tests/ directory, overrides testsuite)' required: false type: string + verbose-uv: + description: 'Let uv generate verbose output (pytest verbosity is always on)' + required: false + type: boolean + default: true jobs: test: @@ -83,4 +89,4 @@ jobs: - name: Run tests shell: bash run: | - uv run pytest -vv ${{ steps.test_path.outputs.test_path }} + uv ${{ inputs.verbose-uv && 'run -v' || 'run' }} pytest -vv ${{ steps.test_path.outputs.test_path }} diff --git a/CHANGELOG.md b/CHANGELOG.md deleted file mode 100644 index d4f4b61b..00000000 --- a/CHANGELOG.md +++ /dev/null @@ -1,19 +0,0 @@ -# Changelog - -## v1.4.1 -**DuckDB Core**: v1.4.1 - -### Bug Fixes -- **ADBC Driver**: Fixed ADBC driver implementation (#81) -- **SQLAlchemy compatibility**: Added `__hash__` method overload (#61) -- **Error Handling**: Reset PyErr before throwing Python exceptions (#69) -- **Polars Lazyframes**: Fixed Polars expression pushdown (#102) - -### Code Quality Improvements & Developer Experience -- **MyPy Support**: MyPy is functional again and better integrated with the dev workflow -- **Stubs**: Re-created and manually curated stubs for the binary extension -- **Type Shadowing**: Deprecated `typing` and `functional` modules -- **Linting & Formatting**: Comprehensive code quality improvements with Ruff -- **Type Annotations**: Added missing overloads and improved type coverage -- **Pre-commit Integration**: Added ruff, clang-format, cmake-format and mypy configs -- **CI/CD**: Added code quality workflow diff --git a/_duckdb-stubs/__init__.pyi b/_duckdb-stubs/__init__.pyi index 6c36d7be..6b323184 100644 --- a/_duckdb-stubs/__init__.pyi +++ b/_duckdb-stubs/__init__.pyi @@ -318,7 +318,18 @@ class DuckDBPyConnection: def list_type(self, type: sqltypes.DuckDBPyType) -> sqltypes.DuckDBPyType: ... def load_extension(self, extension: str) -> None: ... def map_type(self, key: sqltypes.DuckDBPyType, value: sqltypes.DuckDBPyType) -> sqltypes.DuckDBPyType: ... - def pl(self, rows_per_batch: pytyping.SupportsInt = 1000000, *, lazy: bool = False) -> polars.DataFrame: ... + @pytyping.overload + def pl( + self, rows_per_batch: pytyping.SupportsInt = 1000000, *, lazy: pytyping.Literal[False] = ... + ) -> polars.DataFrame: ... + @pytyping.overload + def pl( + self, rows_per_batch: pytyping.SupportsInt = 1000000, *, lazy: pytyping.Literal[True] + ) -> polars.LazyFrame: ... + @pytyping.overload + def pl( + self, rows_per_batch: pytyping.SupportsInt = 1000000, *, lazy: bool = False + ) -> pytyping.Union[polars.DataFrame, polars.LazyFrame]: ... def query(self, query: str, *, alias: str = "", params: object = None) -> DuckDBPyRelation: ... def query_progress(self) -> float: ... def read_csv( @@ -596,7 +607,16 @@ class DuckDBPyRelation: ) -> DuckDBPyRelation: ... def order(self, order_expr: str) -> DuckDBPyRelation: ... 
def percent_rank(self, window_spec: str, projected_columns: str = "") -> DuckDBPyRelation: ... - def pl(self, batch_size: pytyping.SupportsInt = 1000000, *, lazy: bool = False) -> polars.DataFrame: ... + @pytyping.overload + def pl( + self, batch_size: pytyping.SupportsInt = 1000000, *, lazy: pytyping.Literal[False] = ... + ) -> polars.DataFrame: ... + @pytyping.overload + def pl(self, batch_size: pytyping.SupportsInt = 1000000, *, lazy: pytyping.Literal[True]) -> polars.LazyFrame: ... + @pytyping.overload + def pl( + self, batch_size: pytyping.SupportsInt = 1000000, *, lazy: bool = False + ) -> pytyping.Union[polars.DataFrame, polars.LazyFrame]: ... def product( self, column: str, groups: str = "", window_spec: str = "", projected_columns: str = "" ) -> DuckDBPyRelation: ... @@ -700,6 +720,7 @@ class DuckDBPyRelation: partition_by: pytyping.List[str] | None = None, write_partition_columns: bool | None = None, append: bool | None = None, + filename_pattern: str | None = None, ) -> None: ... def to_table(self, table_name: str) -> None: ... def to_view(self, view_name: str, replace: bool = True) -> DuckDBPyRelation: ... @@ -752,6 +773,7 @@ class DuckDBPyRelation: partition_by: pytyping.List[str] | None = None, write_partition_columns: bool | None = None, append: bool | None = None, + filename_pattern: str | None = None, ) -> None: ... @property def alias(self) -> str: ... @@ -1048,7 +1070,7 @@ def commit(*, connection: DuckDBPyConnection | None = None) -> DuckDBPyConnectio def connect( database: str | pathlib.Path = ":memory:", read_only: bool = False, - config: dict[str, str] | None = None, + config: dict[str, str | bool | int | float | list[str]] | None = None, ) -> DuckDBPyConnection: ... def create_function( name: str, @@ -1241,12 +1263,27 @@ def map_type( def order( df: pandas.DataFrame, order_expr: str, *, connection: DuckDBPyConnection | None = None ) -> DuckDBPyRelation: ... +@pytyping.overload def pl( rows_per_batch: pytyping.SupportsInt = 1000000, *, - lazy: bool = False, + lazy: pytyping.Literal[False] = ..., connection: DuckDBPyConnection | None = None, ) -> polars.DataFrame: ... +@pytyping.overload +def pl( + rows_per_batch: pytyping.SupportsInt = 1000000, + *, + lazy: pytyping.Literal[True], + connection: DuckDBPyConnection | None = None, +) -> polars.LazyFrame: ... +@pytyping.overload +def pl( + rows_per_batch: pytyping.SupportsInt = 1000000, + *, + lazy: bool = False, + connection: DuckDBPyConnection | None = None, +) -> pytyping.Union[polars.DataFrame, polars.LazyFrame]: ... def project( df: pandas.DataFrame, *args: str | Expression, groups: str = "", connection: DuckDBPyConnection | None = None ) -> DuckDBPyRelation: ... diff --git a/duckdb/experimental/__init__.py b/duckdb/experimental/__init__.py index 1b5ee51b..51d08709 100644 --- a/duckdb/experimental/__init__.py +++ b/duckdb/experimental/__init__.py @@ -1,3 +1,5 @@ from . 
import spark # noqa: D104 -__all__ = spark.__all__ +__all__ = [ + "spark", +] diff --git a/external/duckdb b/external/duckdb index 23d01e69..880ae8d1 160000 --- a/external/duckdb +++ b/external/duckdb @@ -1 +1 @@ -Subproject commit 23d01e6975f847a9f143e2f153304bdf06cf3803 +Subproject commit 880ae8d1f5e6daeb9731d3da51211098ac54ea86 diff --git a/pyproject.toml b/pyproject.toml index 7df13b61..3bd54543 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -12,7 +12,7 @@ readme = "README.md" keywords = ["DuckDB", "Database", "SQL", "OLAP"] requires-python = ">=3.9.0" classifiers = [ - "Development Status :: 3 - Alpha", + "Development Status :: 5 - Production/Stable", "License :: OSI Approved :: MIT License", "Operating System :: OS Independent", "Topic :: Database", @@ -40,7 +40,7 @@ maintainers = [{name = "DuckDB Foundation"}] Documentation = "https://duckdb.org/docs/stable/clients/python/overview" Source = "https://github.com/duckdb/duckdb-python" Issues = "https://github.com/duckdb/duckdb-python/issues" -Changelog = "https://github.com/duckdb/duckdb/releases" +Changelog = "https://github.com/duckdb/duckdb-python/releases" [project.optional-dependencies] all = [ # users can install duckdb with 'duckdb[all]', which will install this list @@ -48,7 +48,7 @@ all = [ # users can install duckdb with 'duckdb[all]', which will install this l "fsspec", # used in duckdb.filesystem "numpy", # used in duckdb.experimental.spark and in duckdb.fetchnumpy() "pandas", # used for pandas dataframes all over the place - "pyarrow; python_version < '3.14'", # used for pyarrow support + "pyarrow", # used for pyarrow support "adbc-driver-manager", # for the adbc driver ] @@ -184,7 +184,8 @@ exclude = [ # - numpy: tensorflow doesn't play nice with numpy>2 so for every platform that can run tensorflow (cp39-cp311) we use # numpy<2. numpy<2 has no wheels for cp31[2|3], meaning an sdist will be used. However, on Windows amd64 + # cp313 this results in a segfault / access violation. To get around this, we install numpy>=2 on all >=cp312 -# platforms. +# platforms. Then for windows arm64, for which there is no tensorflow, we only allow numpy>=2.3 because that +# ships arm64 win32 wheels. ###################################################################################################### [tool.uv] @@ -197,6 +198,7 @@ environments = [ # no need to resolve packages beyond these platforms with uv... "python_version >= '3.9' and sys_platform == 'darwin' and platform_machine == 'arm64'", "python_version >= '3.9' and sys_platform == 'darwin' and platform_machine == 'x86_64'", "python_version >= '3.9' and sys_platform == 'win32' and platform_machine == 'AMD64'", + "python_version >= '3.11' and sys_platform == 'win32' and platform_machine == 'ARM64'", "python_version >= '3.9' and sys_platform == 'linux' and platform_machine == 'x86_64'", "python_version >= '3.9' and sys_platform == 'linux' and platform_machine == 'aarch64'", ] @@ -204,6 +206,7 @@ required-environments = [ # ... 
but do always resolve for all of them "python_version >= '3.9' and sys_platform == 'darwin' and platform_machine == 'arm64'", "python_version >= '3.9' and sys_platform == 'darwin' and platform_machine == 'x86_64'", "python_version >= '3.9' and sys_platform == 'win32' and platform_machine == 'AMD64'", + "python_version >= '3.11' and sys_platform == 'win32' and platform_machine == 'ARM64'", "python_version >= '3.9' and sys_platform == 'linux' and platform_machine == 'x86_64'", "python_version >= '3.9' and sys_platform == 'linux' and platform_machine == 'aarch64'", ] @@ -219,6 +222,7 @@ explicit = true torch = [ { index = "pytorch-cpu" } ] torchvision = [ { index = "pytorch-cpu" } ] +# todo: adjust for windows arm64 while test dependencies become available [dependency-groups] # used for development only, requires pip >=25.1.0 stubdeps = [ # dependencies used for typehints in the stubs "pybind11-stubgen", @@ -226,17 +230,18 @@ stubdeps = [ # dependencies used for typehints in the stubs "fsspec", "pandas", "polars", - "pyarrow; python_version < '3.14'", + "pyarrow; sys_platform != 'win32' or platform_machine != 'ARM64'", + "typing-extensions", ] test = [ # dependencies used for running tests - "adbc-driver-manager", + "adbc-driver-manager; sys_platform != 'win32' or platform_machine != 'ARM64'", "pytest", "pytest-reraise", "pytest-timeout", "pytest-timestamper", "coverage", "gcovr", - "gcsfs", + "gcsfs; sys_platform != 'win32' or platform_machine != 'ARM64'", "packaging", "polars", "psutil", @@ -246,16 +251,18 @@ test = [ # dependencies used for running tests "pytz", "requests", "urllib3", - "fsspec>=2022.11.0", + "fsspec>=2022.11.0; sys_platform != 'win32' or platform_machine != 'ARM64'", "pandas>=2.0.0", - "pyarrow>=18.0.0; python_version < '3.14'", - "torch>=2.2.2; python_version < '3.14' and ( sys_platform != 'darwin' or platform_machine != 'x86_64' or python_version < '3.13' )", + "pyarrow>=18.0.0; sys_platform != 'win32' or platform_machine != 'ARM64'", + "torch>=2.2.2; python_version < '3.14' and ( sys_platform != 'darwin' or platform_machine != 'x86_64' or python_version < '3.13' ) and ( sys_platform != 'win32' or platform_machine != 'ARM64' or python_version > '3.11' )", "tensorflow==2.14.0; sys_platform == 'darwin' and python_version < '3.12'", "tensorflow-cpu>=2.14.0; sys_platform == 'linux' and platform_machine != 'aarch64' and python_version < '3.12'", - "tensorflow-cpu>=2.14.0; sys_platform == 'win32' and python_version < '3.12'", + "tensorflow-cpu>=2.14.0; sys_platform == 'win32' and platform_machine != 'ARM64' and python_version < '3.12'", "tensorflow-cpu-aws==2.15.1; sys_platform == 'linux' and platform_machine == 'aarch64' and python_version < '3.12'", - "numpy<2; python_version < '3.12'", - "numpy>=2; python_version >= '3.12'", + "typing-extensions", + "numpy<2; ( sys_platform != 'win32' or platform_machine != 'ARM64' ) and python_version < '3.12'", + "numpy>=2; ( sys_platform != 'win32' or platform_machine != 'ARM64' ) and python_version >= '3.12'", + "numpy>=2.3; sys_platform == 'win32' and platform_machine == 'ARM64' and python_version >= '3.11'", ] scripts = [ # dependencies used for running scripts "cxxheaderparser", @@ -265,7 +272,7 @@ scripts = [ # dependencies used for running scripts "pandas", "pcpp", "polars", - "pyarrow; python_version < '3.14'", + "pyarrow; sys_platform != 'win32' or platform_machine != 'ARM64'", "pytz" ] pypi = [ # dependencies used by the pypi cleanup script @@ -327,6 +334,10 @@ exclude = [ "tests", "scripts", ] +[[tool.mypy.overrides]] +module 
= "duckdb.experimental.*" +ignore_errors = true + [[tool.mypy.overrides]] module = [ "fsspec.*", diff --git a/src/duckdb_py/include/duckdb_python/pandas/pandas_scan.hpp b/src/duckdb_py/include/duckdb_python/pandas/pandas_scan.hpp index 50565f05..0ef9a24c 100644 --- a/src/duckdb_py/include/duckdb_python/pandas/pandas_scan.hpp +++ b/src/duckdb_py/include/duckdb_python/pandas/pandas_scan.hpp @@ -55,6 +55,8 @@ struct PandasScanFunction : public TableFunction { static void PandasSerialize(Serializer &serializer, const optional_ptr bind_data, const TableFunction &function); + + static unique_ptr PandasDeserialize(Deserializer &deserializer, TableFunction &function); }; } // namespace duckdb diff --git a/src/duckdb_py/include/duckdb_python/pyrelation.hpp b/src/duckdb_py/include/duckdb_python/pyrelation.hpp index e1f78b5a..06cf9e94 100644 --- a/src/duckdb_py/include/duckdb_python/pyrelation.hpp +++ b/src/duckdb_py/include/duckdb_python/pyrelation.hpp @@ -214,7 +214,7 @@ struct DuckDBPyRelation { const py::object &row_group_size = py::none(), const py::object &overwrite = py::none(), const py::object &per_thread_output = py::none(), const py::object &use_tmp_file = py::none(), const py::object &partition_by = py::none(), const py::object &write_partition_columns = py::none(), - const py::object &append = py::none()); + const py::object &append = py::none(), const py::object &filename_pattern = py::none()); void ToCSV(const string &filename, const py::object &sep = py::none(), const py::object &na_rep = py::none(), const py::object &header = py::none(), const py::object "echar = py::none(), @@ -235,7 +235,7 @@ struct DuckDBPyRelation { void InsertInto(const string &table); - void Insert(const py::object ¶ms = py::list()); + void Insert(const py::object ¶ms = py::list()) const; void Update(const py::object &set, const py::object &where = py::none()); void Create(const string &table); diff --git a/src/duckdb_py/include/duckdb_python/pyresult.hpp b/src/duckdb_py/include/duckdb_python/pyresult.hpp index fc3641c4..941a203b 100644 --- a/src/duckdb_py/include/duckdb_python/pyresult.hpp +++ b/src/duckdb_py/include/duckdb_python/pyresult.hpp @@ -66,7 +66,7 @@ struct DuckDBPyResult { PandasDataFrame FrameFromNumpy(bool date_as_object, const py::handle &o); - void ChangeToTZType(PandasDataFrame &df); + void ConvertDateTimeTypes(PandasDataFrame &df, bool date_as_object) const; unique_ptr FetchNext(QueryResult &result); unique_ptr FetchNextRaw(QueryResult &result); unique_ptr InitializeNumpyConversion(bool pandas = false); diff --git a/src/duckdb_py/pandas/scan.cpp b/src/duckdb_py/pandas/scan.cpp index ebce31bb..47c7ba6c 100644 --- a/src/duckdb_py/pandas/scan.cpp +++ b/src/duckdb_py/pandas/scan.cpp @@ -66,6 +66,7 @@ PandasScanFunction::PandasScanFunction() cardinality = PandasScanCardinality; table_scan_progress = PandasProgress; serialize = PandasSerialize; + deserialize = PandasDeserialize; projection_pushdown = true; } @@ -235,4 +236,8 @@ void PandasScanFunction::PandasSerialize(Serializer &serializer, const optional_ throw NotImplementedException("PandasScan function cannot be serialized"); } +unique_ptr PandasScanFunction::PandasDeserialize(Deserializer &deserializer, TableFunction &function) { + throw NotImplementedException("PandasScan function cannot be deserialized"); +} + } // namespace duckdb diff --git a/src/duckdb_py/pyrelation.cpp b/src/duckdb_py/pyrelation.cpp index 3553bff0..bbc7a2ec 100644 --- a/src/duckdb_py/pyrelation.cpp +++ b/src/duckdb_py/pyrelation.cpp @@ -1213,7 +1213,8 @@ void 
DuckDBPyRelation::ToParquet(const string &filename, const py::object &compr const py::object &row_group_size_bytes, const py::object &row_group_size, const py::object &overwrite, const py::object &per_thread_output, const py::object &use_tmp_file, const py::object &partition_by, - const py::object &write_partition_columns, const py::object &append) { + const py::object &write_partition_columns, const py::object &append, + const py::object &filename_pattern) { case_insensitive_map_t<vector<Value>> options; if (!py::none().is(compression)) { @@ -1304,6 +1305,13 @@ void DuckDBPyRelation::ToParquet(const string &filename, const py::object &compr options["use_tmp_file"] = {Value::BOOLEAN(py::bool_(use_tmp_file))}; } + if (!py::none().is(filename_pattern)) { + if (!py::isinstance<py::str>(filename_pattern)) { + throw InvalidInputException("to_parquet only accepts 'filename_pattern' as a string"); + } + options["filename_pattern"] = {Value(py::str(filename_pattern))}; + } + auto write_parquet = rel->WriteParquetRel(filename, std::move(options)); PyExecuteRelation(write_parquet); } @@ -1511,14 +1519,10 @@ DuckDBPyRelation &DuckDBPyRelation::Execute() { void DuckDBPyRelation::InsertInto(const string &table) { AssertRelation(); auto parsed_info = QualifiedName::Parse(table); - auto insert = rel->InsertRel(parsed_info.schema, parsed_info.name); + auto insert = rel->InsertRel(parsed_info.catalog, parsed_info.schema, parsed_info.name); PyExecuteRelation(insert); } -static bool IsAcceptedInsertRelationType(const Relation &relation) { - return relation.type == RelationType::TABLE_RELATION; -} - void DuckDBPyRelation::Update(const py::object &set_p, const py::object &where) { AssertRelation(); unique_ptr<ParsedExpression> condition; @@ -1563,9 +1567,9 @@ void DuckDBPyRelation::Update(const py::object &set_p, const py::object &where) return rel->Update(std::move(names), std::move(expressions), std::move(condition)); } -void DuckDBPyRelation::Insert(const py::object &params) { +void DuckDBPyRelation::Insert(const py::object &params) const { AssertRelation(); - if (!IsAcceptedInsertRelationType(*this->rel)) { + if (this->rel->type != RelationType::TABLE_RELATION) { throw InvalidInputException("'DuckDBPyRelation.insert' can only be used on a table relation"); } vector<vector<Value>> values {DuckDBPyConnection::TransformPythonParamList(params)}; diff --git a/src/duckdb_py/pyrelation/initialize.cpp b/src/duckdb_py/pyrelation/initialize.cpp index cd1f042c..7bfea441 100644 --- a/src/duckdb_py/pyrelation/initialize.cpp +++ b/src/duckdb_py/pyrelation/initialize.cpp @@ -36,7 +36,8 @@ static void InitializeConsumers(py::class_<DuckDBPyRelation> &m) { py::arg("row_group_size_bytes") = py::none(), py::arg("row_group_size") = py::none(), py::arg("overwrite") = py::none(), py::arg("per_thread_output") = py::none(), py::arg("use_tmp_file") = py::none(), py::arg("partition_by") = py::none(), - py::arg("write_partition_columns") = py::none(), py::arg("append") = py::none()); + py::arg("write_partition_columns") = py::none(), py::arg("append") = py::none(), + py::arg("filename_pattern") = py::none()); DefineMethod( {"to_csv", "write_csv"}, m, &DuckDBPyRelation::ToCSV, "Write the relation object to a CSV file in 'file_name'", diff --git a/src/duckdb_py/pyresult.cpp b/src/duckdb_py/pyresult.cpp index 43edf0e1..e92f6abe 100644 --- a/src/duckdb_py/pyresult.cpp +++ b/src/duckdb_py/pyresult.cpp @@ -287,8 +287,13 @@ py::dict DuckDBPyResult::FetchNumpyInternal(bool stream, idx_t vectors_per_chunk return res; } +static void ReplaceDFColumn(PandasDataFrame &df, const char *col_name, idx_t idx, const py::handle
&new_value) { + df.attr("drop")("columns"_a = col_name, "inplace"_a = true); + df.attr("insert")(idx, col_name, new_value, "allow_duplicates"_a = false); +} + // TODO: unify these with an enum/flag to indicate which conversions to do -void DuckDBPyResult::ChangeToTZType(PandasDataFrame &df) { +void DuckDBPyResult::ConvertDateTimeTypes(PandasDataFrame &df, bool date_as_object) const { auto names = df.attr("columns").cast<vector<string>>(); for (idx_t i = 0; i < result->ColumnCount(); i++) { @@ -297,8 +302,10 @@ void DuckDBPyResult::ChangeToTZType(PandasDataFrame &df) { auto utc_local = df[names[i].c_str()].attr("dt").attr("tz_localize")("UTC"); auto new_value = utc_local.attr("dt").attr("tz_convert")(result->client_properties.time_zone); // We need to create the column anew because the exact dt changed to a new timezone - df.attr("drop")("columns"_a = names[i].c_str(), "inplace"_a = true); - df.attr("__setitem__")(names[i].c_str(), new_value); + ReplaceDFColumn(df, names[i].c_str(), i, new_value); + } else if (date_as_object && result->types[i] == LogicalType::DATE) { + auto new_value = df[names[i].c_str()].attr("dt").attr("date"); + ReplaceDFColumn(df, names[i].c_str(), i, new_value); } } } @@ -374,20 +381,11 @@ PandasDataFrame DuckDBPyResult::FrameFromNumpy(bool date_as_object, const py::ha } PandasDataFrame df = py::cast(pandas.attr("DataFrame").attr("from_dict")(o)); - // Unfortunately we have to do a type change here for timezones since these types are not supported by numpy - ChangeToTZType(df); + // Convert TZ and (optionally) Date types + ConvertDateTimeTypes(df, date_as_object); auto names = df.attr("columns").cast<vector<string>>(); D_ASSERT(result->ColumnCount() == names.size()); - if (date_as_object) { - for (idx_t i = 0; i < result->ColumnCount(); i++) { - if (result->types[i] == LogicalType::DATE) { - auto new_value = df[names[i].c_str()].attr("dt").attr("date"); - df.attr("drop")("columns"_a = names[i].c_str(), "inplace"_a = true); - df.attr("__setitem__")(names[i].c_str(), new_value); - } - } - } return df; } diff --git a/tests/conftest.py b/tests/conftest.py index 3baefdec..8a16652d 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -48,47 +48,6 @@ def import_pandas(): pytest.skip("Couldn't import pandas", allow_module_level=True) -@pytest.hookimpl(hookwrapper=True) -def pytest_runtest_call(item): - """Convert missing pyarrow imports to skips. - - TODO(evertlammerts): Remove skip when pyarrow releases for 3.14. - https://github.com/duckdblabs/duckdb-internal/issues/6182 - """ - outcome = yield - if sys.version_info[:2] == (3, 14): - try: - outcome.get_result() - except ImportError as e: - if e.name == "pyarrow": - pytest.skip(f"pyarrow not available - {item.name} requires pyarrow") - else: - raise - - -@pytest.hookimpl(hookwrapper=True) -def pytest_make_collect_report(collector): - """Wrap module collection to catch pyarrow import errors on Python 3.14. - - If we're on Python 3.14 and a test module raises ModuleNotFoundError - for 'pyarrow', mark the entire module as xfailed rather than failing collection. - - TODO(evertlammerts): Remove skip when pyarrow releases for 3.14.
- https://github.com/duckdblabs/duckdb-internal/issues/6182 - """ - outcome = yield - report: pytest.CollectReport = outcome.get_result() - - if sys.version_info[:2] == (3, 14): - # Only handle failures from module collectors - if report.failed and collector.__class__.__name__ == "Module": - longreprtext = report.longreprtext - if "ModuleNotFoundError: No module named 'pyarrow'" in longreprtext: - report.outcome = "skipped" - reason = f"XFAIL: [pyarrow not available] {longreprtext}" - report.longrepr = (report.fspath, None, reason) - - # https://docs.pytest.org/en/latest/example/simple.html#control-skipping-of-tests-according-to-command-line-option # https://stackoverflow.com/a/47700320 def pytest_addoption(parser): diff --git a/tests/fast/adbc/test_adbc.py b/tests/fast/adbc/test_adbc.py index 80920a99..f82d0982 100644 --- a/tests/fast/adbc/test_adbc.py +++ b/tests/fast/adbc/test_adbc.py @@ -1,13 +1,13 @@ import datetime -import sys from pathlib import Path -import adbc_driver_manager.dbapi import numpy as np -import pyarrow import pytest -import adbc_driver_duckdb.dbapi +adbc_driver_manager = pytest.importorskip("adbc_driver_manager") +adbc_driver_manager_dbapi = pytest.importorskip("adbc_driver_manager.dbapi") +adbc_driver_duckdb = pytest.importorskip("adbc_driver_duckdb") +pyarrow = pytest.importorskip("pyarrow") xfail = pytest.mark.xfail driver_path = adbc_driver_duckdb.driver_path() @@ -15,7 +15,7 @@ @pytest.fixture def duck_conn(): - with adbc_driver_manager.dbapi.connect(driver=driver_path, entrypoint="duckdb_adbc_init") as conn: + with adbc_driver_manager_dbapi.connect(driver=driver_path, entrypoint="duckdb_adbc_init") as conn: yield conn @@ -29,7 +29,6 @@ def example_table(): ) -@xfail(sys.platform == "win32", reason="adbc-driver-manager.adbc_get_info() returns an empty dict on windows") def test_connection_get_info(duck_conn): assert duck_conn.adbc_get_info() != {} @@ -42,9 +41,6 @@ def test_connection_get_table_types(duck_conn): assert duck_conn.adbc_get_table_types() == ["BASE TABLE"] -@xfail( - sys.platform == "win32", reason="adbc-driver-manager.adbc_get_objects() returns an invalid schema dict on windows" -) def test_connection_get_objects(duck_conn): with duck_conn.cursor() as cursor: cursor.execute("CREATE TABLE getobjects (ints BIGINT PRIMARY KEY)") @@ -66,9 +62,6 @@ def test_connection_get_objects(duck_conn): assert depth_all.schema == depth_catalogs.schema -@xfail( - sys.platform == "win32", reason="adbc-driver-manager.adbc_get_objects() returns an invalid schema dict on windows" -) def test_connection_get_objects_filters(duck_conn): with duck_conn.cursor() as cursor: cursor.execute("CREATE TABLE getobjects (ints BIGINT PRIMARY KEY)") @@ -101,7 +94,7 @@ def test_commit(tmp_path): table = example_table() db_kwargs = {"path": f"{db}"} # Start connection with auto-commit off - with adbc_driver_manager.dbapi.connect( + with adbc_driver_manager_dbapi.connect( driver=driver_path, entrypoint="duckdb_adbc_init", db_kwargs=db_kwargs, @@ -111,7 +104,7 @@ def test_commit(tmp_path): cur.adbc_ingest("ingest", table, "create") # Check Data is not there - with adbc_driver_manager.dbapi.connect( + with adbc_driver_manager_dbapi.connect( driver=driver_path, entrypoint="duckdb_adbc_init", db_kwargs=db_kwargs, @@ -130,7 +123,7 @@ def test_commit(tmp_path): # This now works because we enabled autocommit with ( - adbc_driver_manager.dbapi.connect( + adbc_driver_manager_dbapi.connect( driver=driver_path, entrypoint="duckdb_adbc_init", db_kwargs=db_kwargs, @@ -207,7 +200,6 @@ def 
test_statement_query(duck_conn): assert cursor.fetch_arrow_table().to_pylist() == [{"foo": 1}] -@xfail(sys.platform == "win32", reason="adbc-driver-manager returns an invalid table schema on windows") def test_insertion(duck_conn): table = example_table() reader = table.to_reader() @@ -225,8 +217,8 @@ def test_insertion(duck_conn): # Test Append with duck_conn.cursor() as cursor: with pytest.raises( - adbc_driver_manager.InternalError, - match=r'Table with name "ingest_table" already exists!', + adbc_driver_manager.ProgrammingError, + match=r"ALREADY_EXISTS", ): cursor.adbc_ingest("ingest_table", table, "create") cursor.adbc_ingest("ingest_table", table, "append") @@ -234,7 +226,6 @@ def test_insertion(duck_conn): assert cursor.fetch_arrow_table().to_pydict() == {"count_star()": [8]} -@xfail(sys.platform == "win32", reason="adbc-driver-manager returns an invalid table schema on windows") def test_read(duck_conn): with duck_conn.cursor() as cursor: filename = Path(__file__).parent / ".." / "data" / "category.csv" @@ -304,7 +295,7 @@ def test_large_chunk(tmp_path): db.unlink() db_kwargs = {"path": f"{db}"} with ( - adbc_driver_manager.dbapi.connect( + adbc_driver_manager_dbapi.connect( driver=driver_path, entrypoint="duckdb_adbc_init", db_kwargs=db_kwargs, @@ -330,7 +321,7 @@ def test_dictionary_data(tmp_path): db.unlink() db_kwargs = {"path": f"{db}"} with ( - adbc_driver_manager.dbapi.connect( + adbc_driver_manager_dbapi.connect( driver=driver_path, entrypoint="duckdb_adbc_init", db_kwargs=db_kwargs, @@ -358,7 +349,7 @@ def test_ree_data(tmp_path): db.unlink() db_kwargs = {"path": f"{db}"} with ( - adbc_driver_manager.dbapi.connect( + adbc_driver_manager_dbapi.connect( driver=driver_path, entrypoint="duckdb_adbc_init", db_kwargs=db_kwargs, diff --git a/tests/fast/adbc/test_connection_get_info.py b/tests/fast/adbc/test_connection_get_info.py index aa2b3d32..cd6298ed 100644 --- a/tests/fast/adbc/test_connection_get_info.py +++ b/tests/fast/adbc/test_connection_get_info.py @@ -1,19 +1,22 @@ -import pyarrow as pa +import pytest -import adbc_driver_duckdb.dbapi import duckdb +pa = pytest.importorskip("pyarrow") +pytest.importorskip("adbc_driver_manager") +adbc_driver_duckdb_dbapi = pytest.importorskip("adbc_driver_duckdb.dbapi") + class TestADBCConnectionGetInfo: def test_connection_basic(self): - con = adbc_driver_duckdb.dbapi.connect() + con = adbc_driver_duckdb_dbapi.connect() with con.cursor() as cursor: cursor.execute("select 42") res = cursor.fetchall() assert res == [(42,)] def test_connection_get_info_all(self): - con = adbc_driver_duckdb.dbapi.connect() + con = adbc_driver_duckdb_dbapi.connect() adbc_con = con.adbc_connection res = adbc_con.get_info() reader = pa.RecordBatchReader._import_from_c(res.address) @@ -37,7 +40,7 @@ def test_connection_get_info_all(self): assert string_values == expected_result def test_empty_result(self): - con = adbc_driver_duckdb.dbapi.connect() + con = adbc_driver_duckdb_dbapi.connect() adbc_con = con.adbc_connection res = adbc_con.get_info([1337]) reader = pa.RecordBatchReader._import_from_c(res.address) @@ -48,7 +51,7 @@ def test_empty_result(self): assert values.num_chunks == 0 def test_unrecognized_codes(self): - con = adbc_driver_duckdb.dbapi.connect() + con = adbc_driver_duckdb_dbapi.connect() adbc_con = con.adbc_connection res = adbc_con.get_info([0, 1000, 4, 2000]) reader = pa.RecordBatchReader._import_from_c(res.address) diff --git a/tests/fast/adbc/test_statement_bind.py b/tests/fast/adbc/test_statement_bind.py index d35693ff..e8df14c7 100644 
--- a/tests/fast/adbc/test_statement_bind.py +++ b/tests/fast/adbc/test_statement_bind.py @@ -1,10 +1,10 @@ import sys -import adbc_driver_manager -import pyarrow as pa import pytest -import adbc_driver_duckdb.dbapi +pa = pytest.importorskip("pyarrow") +adbc_driver_manager = pytest.importorskip("adbc_driver_manager") +adbc_driver_duckdb_dbapi = pytest.importorskip("adbc_driver_duckdb.dbapi") xfail = pytest.mark.xfail @@ -35,7 +35,7 @@ def test_bind_multiple_rows(self): names=["ints"], ) - con = adbc_driver_duckdb.dbapi.connect() + con = adbc_driver_duckdb_dbapi.connect() with con.cursor() as cursor: statement = cursor.adbc_statement statement.set_sql_query("select ? * 2 as i") @@ -57,7 +57,7 @@ def test_bind_single_row(self): names=["ints"], ) - con = adbc_driver_duckdb.dbapi.connect() + con = adbc_driver_duckdb_dbapi.connect() with con.cursor() as cursor: statement = cursor.adbc_statement statement.set_sql_query("select ? * 2 as i") @@ -93,7 +93,7 @@ def test_multiple_parameters(self): names=["ints", "strings", "bools"], ) - con = adbc_driver_duckdb.dbapi.connect() + con = adbc_driver_duckdb_dbapi.connect() with con.cursor() as cursor: statement = cursor.adbc_statement statement.set_sql_query("select ? as a, ? as b, ? as c") @@ -123,7 +123,7 @@ def test_bind_composite_type(self): # Create the RecordBatch record_batch = pa.RecordBatch.from_arrays([struct_array], schema=schema) - con = adbc_driver_duckdb.dbapi.connect() + con = adbc_driver_duckdb_dbapi.connect() with con.cursor() as cursor: statement = cursor.adbc_statement statement.set_sql_query("select ? as a") @@ -146,7 +146,7 @@ def test_too_many_parameters(self): names=["ints", "strings"], ) - con = adbc_driver_duckdb.dbapi.connect() + con = adbc_driver_duckdb_dbapi.connect() with con.cursor() as cursor: statement = cursor.adbc_statement statement.set_sql_query("select ? as a") @@ -175,7 +175,7 @@ def test_not_enough_parameters(self): names=["strings"], ) - con = adbc_driver_duckdb.dbapi.connect() + con = adbc_driver_duckdb_dbapi.connect() with con.cursor() as cursor: statement = cursor.adbc_statement statement.set_sql_query("select ? as a, ? 
as b") diff --git a/tests/fast/api/test_to_parquet.py b/tests/fast/api/test_to_parquet.py index 8d8162b0..f0952e68 100644 --- a/tests/fast/api/test_to_parquet.py +++ b/tests/fast/api/test_to_parquet.py @@ -1,4 +1,6 @@ import os +import pathlib +import re import tempfile import pytest @@ -170,3 +172,56 @@ def test_append(self, pd): ("shinji", 123.0, "a"), ] assert result.execute().fetchall() == expected + + @pytest.mark.parametrize("pd", [NumpyPandas(), ArrowPandas()]) + def test_filename_pattern_with_index(self, pd): + temp_file_name = os.path.join(tempfile.mkdtemp(), next(tempfile._get_candidate_names())) # noqa: PTH118 + df = pd.DataFrame( + { + "name": ["rei", "shinji", "asuka", "kaworu"], + "float": [321.0, 123.0, 23.0, 340.0], + "category": ["a", "a", "b", "c"], + } + ) + rel = duckdb.from_df(df) + rel.to_parquet(temp_file_name, partition_by=["category"], filename_pattern="orders_{i}") + # Check that files follow the pattern with {i} + files_a = list(pathlib.Path(f"{temp_file_name}/category=a").iterdir()) + files_b = list(pathlib.Path(f"{temp_file_name}/category=b").iterdir()) + files_c = list(pathlib.Path(f"{temp_file_name}/category=c").iterdir()) + filename_pattern = re.compile(r"^orders_[09]+\.parquet$") + assert all(filename_pattern.search(str(f.name)) for f in files_a) + assert all(filename_pattern.search(str(f.name)) for f in files_b) + assert all(filename_pattern.search(str(f.name)) for f in files_c) + + # Verify data integrity + result = duckdb.sql(f"FROM read_parquet('{temp_file_name}/*/*.parquet', hive_partitioning=TRUE)") + expected = [("rei", 321.0, "a"), ("shinji", 123.0, "a"), ("asuka", 23.0, "b"), ("kaworu", 340.0, "c")] + assert result.execute().fetchall() == expected + + @pytest.mark.parametrize("pd", [NumpyPandas(), ArrowPandas()]) + def test_filename_pattern_with_uuid(self, pd): + temp_file_name = os.path.join(tempfile.mkdtemp(), next(tempfile._get_candidate_names())) # noqa: PTH118 + df = pd.DataFrame( + { + "name": ["rei", "shinji", "asuka", "kaworu"], + "float": [321.0, 123.0, 23.0, 340.0], + "category": ["a", "a", "b", "c"], + } + ) + rel = duckdb.from_df(df) + rel.to_parquet(temp_file_name, partition_by=["category"], filename_pattern="file_{uuid}") + # Check that files follow the pattern with {uuid} + files_a = list(pathlib.Path(f"{temp_file_name}/category=a").iterdir()) + files_b = list(pathlib.Path(f"{temp_file_name}/category=b").iterdir()) + files_c = list(pathlib.Path(f"{temp_file_name}/category=c").iterdir()) + filename_pattern = re.compile(r"^file_[a-z0-9]{8}-[a-z0-9]{4}-[a-z0-9]{4}-[a-z0-9]{4}-[a-z0-9]{12}\.parquet$") + print(files_a) + assert all(filename_pattern.search(str(f.name)) for f in files_a) + assert all(filename_pattern.search(str(f.name)) for f in files_b) + assert all(filename_pattern.search(str(f.name)) for f in files_c) + + # Verify data integrity + result = duckdb.sql(f"FROM read_parquet('{temp_file_name}/*/*.parquet', hive_partitioning=TRUE)") + expected = [("rei", 321.0, "a"), ("shinji", 123.0, "a"), ("asuka", 23.0, "b"), ("kaworu", 340.0, "c")] + assert result.execute().fetchall() == expected diff --git a/tests/fast/arrow/test_2426.py b/tests/fast/arrow/test_2426.py index f43284d3..6f76613f 100644 --- a/tests/fast/arrow/test_2426.py +++ b/tests/fast/arrow/test_2426.py @@ -1,5 +1,9 @@ +import pytest + import duckdb +pytest.importorskip("pyarrow") + try: can_run = True except Exception: diff --git a/tests/fast/arrow/test_arrow_fetch.py b/tests/fast/arrow/test_arrow_fetch.py index 0547020f..ba5d13a4 100644 --- 
a/tests/fast/arrow/test_arrow_fetch.py +++ b/tests/fast/arrow/test_arrow_fetch.py @@ -1,5 +1,10 @@ +import pytest + import duckdb +pytest.importorskip("pyarrow") + + try: can_run = True except Exception: diff --git a/tests/fast/pandas/test_column_order.py b/tests/fast/pandas/test_column_order.py new file mode 100644 index 00000000..0600bc4c --- /dev/null +++ b/tests/fast/pandas/test_column_order.py @@ -0,0 +1,16 @@ +import duckdb + + +class TestColumnOrder: + def test_column_order(self, duckdb_cursor): + to_execute = """ + CREATE OR REPLACE TABLE t1 AS ( + SELECT NULL AS col1, + NULL::TIMESTAMPTZ AS timepoint, + NULL::DATE AS date, + ); + SELECT timepoint, date, col1 FROM t1; + """ + df = duckdb.execute(to_execute).fetchdf() + cols = list(df.columns) + assert cols == ["timepoint", "date", "col1"] diff --git a/tests/fast/test_all_types.py b/tests/fast/test_all_types.py index 77074fdc..c4ba0e55 100644 --- a/tests/fast/test_all_types.py +++ b/tests/fast/test_all_types.py @@ -534,6 +534,7 @@ def test_fetchnumpy(self, cur_type): @pytest.mark.parametrize("cur_type", all_types) def test_arrow(self, cur_type): + pytest.importorskip("pyarrow") try: pass except Exception: diff --git a/tests/fast/test_insert.py b/tests/fast/test_insert.py index c5de1589..ea43894c 100644 --- a/tests/fast/test_insert.py +++ b/tests/fast/test_insert.py @@ -27,7 +27,6 @@ def test_insert_with_schema(self, duckdb_cursor): res = duckdb_cursor.table("not_main.tbl").fetchall() assert len(res) == 10 - # Insert into a schema-qualified table should work; table has a single column from range(10) duckdb_cursor.table("not_main.tbl").insert([42]) res2 = duckdb_cursor.table("not_main.tbl").fetchall() assert len(res2) == 11
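
Reviewer note (illustrative, not part of the patch): the reworked "Check version on PyPI" step queries PyPI's JSON API via the DuckDB CLI and, for nightlies, derives the release age from upload_time_iso_8601 so it can warn when a build is older than nightly-stale-after-days. Below is a rough Python sketch of the same check, assuming only the standard PyPI JSON layout; the helper name and the example version are placeholders, and this is not the script the workflow actually runs.

import datetime
import json
import urllib.request
from typing import Optional


def pypi_release_age_days(package: str, version: str, host: str = "pypi.org") -> Optional[int]:
    """Return the age in days of `version` on PyPI, or None if the version is absent."""
    with urllib.request.urlopen(f"https://{host}/pypi/{package}/json") as resp:
        data = json.load(resp)
    files = data.get("releases", {}).get(version)
    if not files:
        return None  # corresponds to pypi_state=VERSION_NOT_FOUND in the workflow
    uploaded = datetime.datetime.fromisoformat(files[0]["upload_time_iso_8601"].replace("Z", "+00:00"))
    return (datetime.datetime.now(datetime.timezone.utc) - uploaded).days


# A nightly older than the configured threshold would trigger the new stale warning.
print(pypi_release_age_days("duckdb", "1.4.1"))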
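
Reviewer note (illustrative, not part of the patch): the new filename_pattern keyword on DuckDBPyRelation.to_parquet()/write_parquet() is forwarded to DuckDB's partitioned Parquet writer, where '{i}' expands to a per-file index and '{uuid}' to a random UUID, as the new tests assert. A minimal usage sketch, assuming a build that includes this change; the exact file names produced depend on the writer.

import tempfile
from pathlib import Path

import duckdb

# Small relation with a partition column.
rel = duckdb.sql("SELECT i % 3 AS category, i AS value FROM range(9) t(i)")

out_dir = Path(tempfile.mkdtemp()) / "orders"
# One Hive-partitioned directory per category; file names follow the pattern.
rel.to_parquet(str(out_dir), partition_by=["category"], filename_pattern="orders_{i}")

print(sorted(p.relative_to(out_dir).as_posix() for p in out_dir.rglob("*.parquet")))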
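
Reviewer note (illustrative, not part of the patch): the stub changes add overloads so type checkers infer polars.DataFrame for .pl() and polars.LazyFrame for .pl(lazy=True), and they widen duckdb.connect(config=...) to accept non-string values. A small sketch assuming polars is installed; the config keys are just examples of the widened value types.

import duckdb

con = duckdb.connect(":memory:", config={"threads": 4, "preserve_insertion_order": False})
rel = con.sql("SELECT 42 AS answer")

df = rel.pl()           # typed (and returned) as polars.DataFrame
lf = rel.pl(lazy=True)  # typed (and returned) as polars.LazyFrame
print(type(df).__name__, type(lf).__name__, lf.collect()["answer"][0])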