64 changes: 50 additions & 14 deletions narwhals/functions.py
@@ -564,8 +564,36 @@ def show_versions() -> None:
         print(f"{k:>13}: {stat}")  # noqa: T201
 
 
+def _validate_separator(separator: str, native_separator: str, **kwargs: Any) -> None:
+    if native_separator in kwargs and kwargs[native_separator] != separator:
+        msg = (
+            f"`separator` and `{native_separator}` do not match: "
+            f"`separator`={separator} and `{native_separator}`={kwargs[native_separator]}."
+        )
+        raise TypeError(msg)
+
+
+def _validate_separator_pyarrow(separator: str, **kwargs: Any) -> Any:
+    if "parse_options" in kwargs:
+        parse_options = kwargs.pop("parse_options")
+        if parse_options.delimiter != separator:
+            msg = (
+                "`separator` and `parse_options.delimiter` do not match: "
+                f"`separator`={separator} and `delimiter`={parse_options.delimiter}."
+            )
+            raise TypeError(msg)
+        return kwargs
+    from pyarrow import csv  # ignore-banned-import
+
+    return {"parse_options": csv.ParseOptions(delimiter=separator)}
@FBruzzesi (Member) commented on Aug 18, 2025:

Nevermind, I completely misread this.

Fake panic review:

The issue I have with this is that if any other argument was provided in `parse_options`, then it will be silently ignored.

Say someone is calling the following:

    nw.read_csv(file, separator=",", parse_options=csv.ParseOptions(ignore_empty_lines=False))

Then at the end of `_validate_separator_pyarrow`, we will end up with `csv.ParseOptions(delimiter=separator, ignore_empty_lines=True)` (i.e. the default value), silently.

I agree that hardcoding fields as suggested in #2989 (review) is not ideal, yet pyarrow does not provide much else we can use. We could dynamically look up its `__dir__` or use `inspect.getmembers`, exclude dunder methods, but for example we would end up with `validate` and `equals`, which are not attributes to set at instantiation.

Unsuccessful attempts I tried:

`inspect.signature`:

    from inspect import signature
    from pyarrow import csv

    print(signature(csv.ParseOptions.__init__))
    # (self, /, *args, **kwargs)

`dataclasses.fields` (from the stubs I got tricked into thinking that it is a dataclass):

    from dataclasses import fields
    from pyarrow import csv

    print(fields(csv.ParseOptions))
    # TypeError: must be called with a dataclass type or instance
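
For illustration, a `dir()`-based sweep (just a sketch of the dynamic lookup mentioned above) shows the noise that makes it unattractive:

```python
from pyarrow import csv

# Filtering dunders from dir() still leaves methods such as
# `validate` and `equals`, which are not constructor fields.
candidates = [name for name in dir(csv.ParseOptions) if not name.startswith("_")]
print(candidates)  # includes 'delimiter', 'double_quote', ... but also 'equals' and 'validate'
```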

I have mixed feelings, as now a pyarrow user should pass both `separator=xyz` and `parse_options=csv.ParseOptions(delimiter=xyz)`.

The PR author (Contributor) replied:

Thanks @FBruzzesi!

> I have mixed feelings as now a pyarrow user should pass both separator=xyz, parse_options=csv.ParseOptions(delimiter=xyz)

A user won't need to pass both `separator` and `delimiter`: if `"parse_options" not in kwargs`, then we

    return {"parse_options": csv.ParseOptions(delimiter=separator)}

Member replied:

right, so someone would only need to pass both `separator` and `delimiter` if they were specifying another parse option (like `double_quote`)

tbh I think this is fine

@dangotbanned (Member) commented on Aug 25, 2025:

Since our default is:

    separator: str = ","

and matches pyarrow's default:

    delimiter: str = ","

Alternative: `ParseOptions.delimiter` has higher precedence unless `separator` overrides the default.

In either case - every other argument is respected.

Definitions:

from __future__ import annotations

from typing import Any

from pyarrow import csv


def merge_options(separator: str = ",", **kwargs: Any) -> dict[str, Any]:
    # `separator` only takes effect when it differs from the default;
    # a user-provided `parse_options` is otherwise passed through untouched.
    DEFAULT = ","  # noqa: N806
    if separator != DEFAULT:
        if opts := kwargs.pop("parse_options", None):
            opts.delimiter = separator
        else:
            opts = csv.ParseOptions(delimiter=separator)
        kwargs["parse_options"] = opts
    return kwargs


def display_merge(result: dict[str, Any]) -> None:
    if result and (options := result.pop("parse_options", None)):
        print(f"{options.delimiter=}\n{options.double_quote=}")
        if result:
            print(f"Remaining: {result!r}")
    elif result:
        print(f"Unrelated: {result!r}")
    else:
        print(f"Empty: {result!r}")

Would this behavior not be more ideal?

>>> # NOTE: `double_quote` default is `True`
>>> user_options = csv.ParseOptions(delimiter="\t", double_quote=False)
>>> display_merge(merge_options(parse_options=user_options))
options.delimiter='\t'
options.double_quote=False
>>> display_merge(merge_options(",", parse_options=user_options))
options.delimiter='\t'
options.double_quote=False
>>> display_merge(merge_options("?", parse_options=user_options))
options.delimiter='?'
options.double_quote=False
>>> display_merge(merge_options())
Empty: {}
>>> display_merge(merge_options("\t"))
options.delimiter='\t'
options.double_quote=True
>>> display_merge(
...     merge_options(
...         "?",
...         parse_options=csv.ParseOptions(double_quote=False),
...         read_options=csv.ReadOptions(),
...     )
... )
options.delimiter='?'
options.double_quote=False
Remaining: {'read_options': <pyarrow._csv.ReadOptions object at 0x000001F29413AD40>}

Although it is Cython, the important part is that they're all properties with setters:
https://github.com/apache/arrow/blob/f8b20f131a072ef423e81b8a676f42a82255f4ec/python/pyarrow/_csv.pyx#L435-L543
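
For example (a minimal check, assuming nothing beyond those setters existing):

```python
from pyarrow import csv

opts = csv.ParseOptions()  # delimiter defaults to ","
opts.delimiter = "\t"      # plain attribute assignment goes through the property setter
print(opts.delimiter)      # "\t"
```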

Member replied:

> ParseOptions.delimiter has higher precedence unless separator overrides the default.

hmmm yes, that does sound better actually, thanks!

Member commented:

Thanks Marco

with (#2989 (comment)) in mind ...

Not sure if this is on duckdb or sqlframe, but `sep` has higher precedence than `delim`:

from collections.abc import Mapping
from pathlib import Path
from typing import Any

import polars as pl
from sqlframe.duckdb import DuckDBSession

data: Mapping[str, Any] = {"a": [1, 2, 3], "b": [4.5, 6.7, 8.9], "z": ["x", "y", "w"]}
fp = Path.cwd() / "data" / "file.csv"

pl.DataFrame(data).write_csv(fp, separator="\t")

session = DuckDBSession.builder.getOrCreate()
>>> session.read.format("csv").load(str(fp), sep="\t", delim="?").collect()
[Row(a=1, b=4.5, z='x'), Row(a=2, b=6.7, z='y'), Row(a=3, b=8.9, z='w')]

Personally, I think we're best off just defining rule(s) and documenting what we do for each backend if needed.

So instead of:

>>> nw.scan_csv("...", backend="sqlframe", separator=",", sep="?", delim="\t", delimiter="!")
TypeError: `separator` and `sep` do not match: `separator`=, and `sep`=?.

We either:

  • pick one and replace it - leaving everything else unchanged
  • say we'll pick ... then ... and then ...

If any backend raises on non-matching arguments - I say let them - as it saves us the hassle πŸ˜…
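
As a sketch of the "pick one and replace it" option (the helper `apply_separator` is hypothetical, not part of this PR):

```python
from typing import Any


def apply_separator(
    kwargs: dict[str, Any], native_alias: str, separator: str
) -> dict[str, Any]:
    # Force the chosen native alias to `separator` and leave every other
    # kwarg -- including competing aliases -- untouched, never raising.
    kwargs[native_alias] = separator
    return kwargs


print(apply_separator({"delim": "\t", "inferSchema": True}, "sep", ","))
# {'delim': '\t', 'inferSchema': True, 'sep': ','}
```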



 def read_csv(
-    source: str, *, backend: IntoBackend[EagerAllowed], **kwargs: Any
+    source: str,
+    *,
+    backend: IntoBackend[EagerAllowed],
+    separator: str = ",",
+    **kwargs: Any,
 ) -> DataFrame[Any]:
     """Read a CSV file into a DataFrame.
 
@@ -578,6 +606,7 @@ def read_csv(
             `POLARS`, `MODIN` or `CUDF`.
             - As a string: `"pandas"`, `"pyarrow"`, `"polars"`, `"modin"` or `"cudf"`.
             - Directly as a module `pandas`, `pyarrow`, `polars`, `modin` or `cudf`.
+        separator: Single byte character to use as separator in the file.
         kwargs: Extra keyword arguments which are passed to the native CSV reader.
             For example, you could use
             `nw.read_csv('file.csv', backend='pandas', engine='pyarrow')`.
@@ -599,14 +628,13 @@ def read_csv(
     impl = Implementation.from_backend(backend)
     native_namespace = impl.to_native_namespace()
     native_frame: NativeDataFrame
-    if impl in {
-        Implementation.POLARS,
-        Implementation.PANDAS,
-        Implementation.MODIN,
-        Implementation.CUDF,
-    }:
-        native_frame = native_namespace.read_csv(source, **kwargs)
+    if impl in {Implementation.PANDAS, Implementation.MODIN, Implementation.CUDF}:
+        _validate_separator(separator, "sep", **kwargs)
+        native_frame = native_namespace.read_csv(source, sep=separator, **kwargs)
+    elif impl is Implementation.POLARS:
+        native_frame = native_namespace.read_csv(source, separator=separator, **kwargs)
     elif impl is Implementation.PYARROW:
+        kwargs = _validate_separator_pyarrow(separator, **kwargs)
         from pyarrow import csv  # ignore-banned-import
 
         native_frame = csv.read_csv(source, **kwargs)
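
For context, the resulting API in use (a sketch; `file.csv` is a placeholder path):

```python
import narwhals as nw

# `separator` is forwarded to the native reader: `sep` for pandas-like
# backends, `separator` for polars, `ParseOptions(delimiter=...)` for pyarrow.
df = nw.read_csv("file.csv", backend="pandas", separator=";")
```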
@@ -635,7 +663,7 @@ def read_csv(
 
 
 def scan_csv(
-    source: str, *, backend: IntoBackend[Backend], **kwargs: Any
+    source: str, *, backend: IntoBackend[Backend], separator: str = ",", **kwargs: Any
 ) -> LazyFrame[Any]:
     """Lazily read from a CSV file.
 
@@ -651,6 +679,7 @@ def scan_csv(
             `POLARS`, `MODIN` or `CUDF`.
             - As a string: `"pandas"`, `"pyarrow"`, `"polars"`, `"modin"` or `"cudf"`.
             - Directly as a module `pandas`, `pyarrow`, `polars`, `modin` or `cudf`.
+        separator: Single byte character to use as separator in the file.
         kwargs: Extra keyword arguments which are passed to the native CSV reader.
             For example, you could use
             `nw.scan_csv('file.csv', backend=pd, engine='pyarrow')`.
@@ -676,33 +705,40 @@ def scan_csv(
     native_namespace = implementation.to_native_namespace()
     native_frame: NativeDataFrame | NativeLazyFrame
     if implementation is Implementation.POLARS:
-        native_frame = native_namespace.scan_csv(source, **kwargs)
+        native_frame = native_namespace.scan_csv(source, separator=separator, **kwargs)
     elif implementation in {
         Implementation.PANDAS,
         Implementation.MODIN,
         Implementation.CUDF,
         Implementation.DASK,
-        Implementation.DUCKDB,
         Implementation.IBIS,
     }:
-        native_frame = native_namespace.read_csv(source, **kwargs)
+        _validate_separator(separator, "sep", **kwargs)
+        native_frame = native_namespace.read_csv(source, sep=separator, **kwargs)
+    elif implementation is Implementation.DUCKDB:
+        _validate_separator(separator, "delimiter", **kwargs)
+        _validate_separator(separator, "delim", **kwargs)
+        native_frame = native_namespace.read_csv(source, delimiter=separator, **kwargs)
     elif implementation is Implementation.PYARROW:
+        kwargs = _validate_separator_pyarrow(separator, **kwargs)
         from pyarrow import csv  # ignore-banned-import
 
         native_frame = csv.read_csv(source, **kwargs)
     elif implementation.is_spark_like():
+        _validate_separator(separator, "sep", **kwargs)
+        _validate_separator(separator, "delimiter", **kwargs)
         if (session := kwargs.pop("session", None)) is None:
             msg = "Spark like backends require a session object to be passed in `kwargs`."
             raise ValueError(msg)
 
         csv_reader = session.read.format("csv")
         native_frame = (
-            csv_reader.load(source)
+            csv_reader.load(source, sep=separator)
             if (
                 implementation is Implementation.SQLFRAME
                 and implementation._backend_version() < (3, 27, 0)
             )
-            else csv_reader.options(**kwargs).load(source)
+            else csv_reader.options(sep=separator, **kwargs).load(source)
         )
     else:  # pragma: no cover
         try:
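
And the lazy counterpart (again a sketch with a placeholder path; duckdb receives its native `delimiter` argument):

```python
import duckdb

import narwhals as nw

lf = nw.scan_csv("file.csv", backend=duckdb, separator="|")
```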
18 changes: 14 additions & 4 deletions narwhals/stable/v2/__init__.py
@@ -1138,7 +1138,11 @@ def from_numpy(
 
 
 def read_csv(
-    source: str, *, backend: IntoBackend[EagerAllowed], **kwargs: Any
+    source: str,
+    *,
+    backend: IntoBackend[EagerAllowed],
+    separator: str = ",",
+    **kwargs: Any,
 ) -> DataFrame[Any]:
     """Read a CSV file into a DataFrame.
 
@@ -1151,18 +1155,21 @@ def read_csv(
             `POLARS`, `MODIN` or `CUDF`.
             - As a string: `"pandas"`, `"pyarrow"`, `"polars"`, `"modin"` or `"cudf"`.
             - Directly as a module `pandas`, `pyarrow`, `polars`, `modin` or `cudf`.
+        separator: Single byte character to use as separator in the file.
         kwargs: Extra keyword arguments which are passed to the native CSV reader.
             For example, you could use
             `nw.read_csv('file.csv', backend='pandas', engine='pyarrow')`.
 
     Returns:
         DataFrame.
     """
-    return _stableify(nw_f.read_csv(source, backend=backend, **kwargs))
+    return _stableify(
+        nw_f.read_csv(source, backend=backend, separator=separator, **kwargs)
+    )
 
 
 def scan_csv(
-    source: str, *, backend: IntoBackend[Backend], **kwargs: Any
+    source: str, *, backend: IntoBackend[Backend], separator: str = ",", **kwargs: Any
 ) -> LazyFrame[Any]:
     """Lazily read from a CSV file.
 
@@ -1178,14 +1185,17 @@ def scan_csv(
             `POLARS`, `MODIN` or `CUDF`.
             - As a string: `"pandas"`, `"pyarrow"`, `"polars"`, `"modin"` or `"cudf"`.
             - Directly as a module `pandas`, `pyarrow`, `polars`, `modin` or `cudf`.
+        separator: Single byte character to use as separator in the file.
         kwargs: Extra keyword arguments which are passed to the native CSV reader.
             For example, you could use
             `nw.scan_csv('file.csv', backend=pd, engine='pyarrow')`.
 
     Returns:
         LazyFrame.
     """
-    return _stableify(nw_f.scan_csv(source, backend=backend, **kwargs))
+    return _stableify(
+        nw_f.scan_csv(source, backend=backend, separator=separator, **kwargs)
+    )
 
 
 def read_parquet(
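
The stable API simply forwards the new keyword, e.g. (sketch):

```python
import narwhals.stable.v2 as nw

df = nw.read_csv("file.csv", backend="polars", separator="\t")
```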
76 changes: 76 additions & 0 deletions tests/read_scan_test.py
@@ -27,6 +27,10 @@ def test_read_csv(tmpdir: pytest.TempdirFactory, eager_backend: EagerAllowed) -> None:
     result = nw.read_csv(filepath, backend=eager_backend)
     assert_equal_data(result, data)
     assert isinstance(result, nw.DataFrame)
+    df_pl.write_csv(filepath, separator=";")
+    result = nw.read_csv(filepath, backend=eager_backend, separator=";")
+    assert_equal_data(result, data)
+    assert isinstance(result, nw.DataFrame)


@pytest.mark.skipif(PANDAS_VERSION < (1, 5), reason="too old for pyarrow")
@@ -90,15 +94,87 @@ def test_scan_csv(tmpdir: pytest.TempdirFactory, constructor: Constructor) -> None:
     result = nw.scan_csv(filepath, backend=backend, **kwargs)
     assert_equal_data(result, data)
     assert isinstance(result, nw.LazyFrame)
+    df_pl.write_csv(filepath, separator="|")
+    df = nw.from_native(constructor(data))
+    backend = nw.get_native_namespace(df)
+    result = nw.scan_csv(filepath, backend=backend, separator="|", **kwargs)
+    assert_equal_data(result, data)
+    assert isinstance(result, nw.LazyFrame)


 @pytest.mark.skipif(PANDAS_VERSION < (1, 5), reason="too old for pyarrow")
 def test_scan_csv_kwargs(tmpdir: pytest.TempdirFactory) -> None:
+    pytest.importorskip("pyarrow")
+    from pyarrow import csv
+
     df_pl = pl.DataFrame(data)
     filepath = str(tmpdir / "file.csv")  # type: ignore[operator]
     df_pl.write_csv(filepath)
     result = nw.scan_csv(filepath, backend=pd, engine="pyarrow")
     assert_equal_data(result, data)
+    result = nw.scan_csv(
+        filepath, backend="pyarrow", parse_options=csv.ParseOptions(delimiter=",")
+    )
+    assert_equal_data(result, data)


+def test_read_csv_raise_sep_multiple(tmpdir: pytest.TempdirFactory) -> None:
+    pytest.importorskip("duckdb")
+    pytest.importorskip("pandas")
+    pytest.importorskip("pyarrow")
+    pytest.importorskip("sqlframe")
+    import duckdb
+    import pandas as pd
+    import pyarrow as pa
+    import sqlframe
+    from pyarrow import csv
+    from sqlframe.duckdb import DuckDBSession
+
+    df_pl = pl.DataFrame(data)
+    filepath = str(tmpdir / "file.csv")  # type: ignore[operator]
+    df_pl.write_csv(filepath)
+
+    msg = "do not match:"
+    with pytest.raises(TypeError, match=msg):
+        nw.read_csv(
+            filepath,
+            backend=pa,
+            separator="|",
+            parse_options=csv.ParseOptions(delimiter=";"),
+        )
+    with pytest.raises(TypeError, match=msg):
+        nw.scan_csv(
+            filepath,
+            backend=pa,
+            separator="|",
+            parse_options=csv.ParseOptions(delimiter=";"),
+        )
+    with pytest.raises(TypeError, match=msg):
+        nw.read_csv(filepath, backend=pd, separator="|", sep=";")
+    with pytest.raises(TypeError, match=msg):
+        nw.scan_csv(filepath, backend=pd, separator="|", sep=";")
+    with pytest.raises(TypeError, match=msg):
+        nw.scan_csv(filepath, backend=duckdb, separator="|", delimiter=";")
+    with pytest.raises(TypeError, match=msg):
+        nw.scan_csv(filepath, backend=duckdb, separator="|", delim=";")
+    with pytest.raises(TypeError, match=msg):
+        nw.scan_csv(
+            filepath,
+            backend=sqlframe,
+            separator="|",
+            sep=";",
+            session=DuckDBSession(),
+            inferSchema=True,
+        )
+    with pytest.raises(TypeError, match=msg):
+        nw.scan_csv(
+            filepath,
+            backend=sqlframe,
+            separator="|",
+            delimiter=";",
+            session=DuckDBSession(),
+            inferSchema=True,
+        )
 
 
 @pytest.mark.skipif(PANDAS_VERSION < (1, 5), reason="too old for pyarrow")