12 changes: 10 additions & 2 deletions xarray/core/dataarray.py
@@ -3954,7 +3954,10 @@ def to_pandas(self) -> Self | pd.Series | pd.DataFrame:
return pandas_object

def to_dataframe(
self, name: Hashable | None = None, dim_order: Sequence[Hashable] | None = None
self,
name: Hashable | None = None,
dim_order: Sequence[Hashable] | None = None,
create_index: bool = True,
) -> pd.DataFrame:
"""Convert this array and its coordinates into a tidy pandas.DataFrame.

@@ -3979,6 +3982,11 @@ def to_dataframe(

If provided, must include all dimensions of this DataArray. By default,
dimensions are sorted according to the DataArray dimensions order.
create_index : bool, default: True
If True (default), create a MultiIndex from the Cartesian product
of this DataArray's indices. If False, use a RangeIndex instead.
This can be useful to avoid the potentially expensive MultiIndex
creation.

Returns
-------
@@ -4013,7 +4021,7 @@ def to_dataframe(
else:
ordered_dims = ds._normalize_dim_order(dim_order=dim_order)

df = ds._to_dataframe(ordered_dims)
df = ds._to_dataframe(ordered_dims, create_index=create_index)
df.columns = [name if c == unique_name else c for c in df.columns]
return df

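For reference, a minimal usage sketch of the new `DataArray.to_dataframe(create_index=...)` keyword added above; the array, coordinate values, and variable name are illustrative, not taken from the PR:

```python
import numpy as np
import xarray as xr

arr = xr.DataArray(
    np.arange(12).reshape(3, 4),
    coords={"x": [1, 2, 3], "y": list("abcd")},
    dims=("x", "y"),
    name="foo",
)

# Default behaviour: rows are labelled by a MultiIndex built from the
# Cartesian product of the x and y coordinates.
df_multi = arr.to_dataframe()

# With create_index=False (assuming a build of xarray that includes this
# change), the rows get a plain integer RangeIndex instead, skipping the
# potentially expensive MultiIndex construction; the "foo" values are the same.
df_range = arr.to_dataframe(create_index=False)
```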
25 changes: 21 additions & 4 deletions xarray/core/dataset.py
@@ -7200,7 +7200,7 @@ def to_pandas(self) -> pd.Series | pd.DataFrame:
"Please use Dataset.to_dataframe() instead."
)

def _to_dataframe(self, ordered_dims: Mapping[Any, int]):
def _to_dataframe(self, ordered_dims: Mapping[Any, int], create_index: bool = True):
from xarray.core.extension_array import PandasExtensionArray

# All and only non-index arrays (whether data or coordinates) should
@@ -7231,7 +7231,15 @@ def _to_dataframe(self, ordered_dims: Mapping[Any, int]):
self._variables[k].set_dims(ordered_dims).values.reshape(-1)
for k in non_extension_array_columns
]
index = self.coords.to_index([*ordered_dims])
if create_index:
index = self.coords.to_index([*ordered_dims])
else:
# Use a simple RangeIndex when create_index=False
# Calculate the total size from ordered_dims
total_size = (
int(np.prod(list(ordered_dims.values()))) if ordered_dims else 0
)
index = pd.RangeIndex(total_size)
broadcasted_df = pd.DataFrame(
{
**dict(zip(non_extension_array_columns, data, strict=True)),
@@ -7259,7 +7267,11 @@ def _to_dataframe(self, ordered_dims: Mapping[Any, int]):
broadcasted_df = broadcasted_df.join(extension_array_df)
return broadcasted_df[columns_in_order]

def to_dataframe(self, dim_order: Sequence[Hashable] | None = None) -> pd.DataFrame:
def to_dataframe(
self,
dim_order: Sequence[Hashable] | None = None,
create_index: bool = True,
) -> pd.DataFrame:
"""Convert this dataset into a pandas.DataFrame.

Non-index variables in this dataset form the columns of the
@@ -7278,6 +7290,11 @@ def to_dataframe(self, dim_order: Sequence[Hashable] | None = None) -> pd.DataFrame:

If provided, must include all dimensions of this dataset. By
default, dimensions are in the same order as in `Dataset.sizes`.
create_index : bool, default: True
If True (default), create a MultiIndex from the Cartesian product
of this dataset's indices. If False, use a RangeIndex instead.
This can be useful to avoid the potentially expensive MultiIndex
creation.
Review comment from a project member on lines +7293 to +7297:
Suggested change, replacing the proposed docstring lines:

    create_index : bool, default: True
        If True (default), create a MultiIndex from the Cartesian product
        of this dataset's indices. If False, use a RangeIndex instead.
        This can be useful to avoid the potentially expensive MultiIndex
        creation.

with:

    create_index : bool, default: True
        If True (default), create a :py:class:`pandas.MultiIndex` from the Cartesian product
        of this dataset's indices. If False, use a :py:class:`pandas.RangeIndex` instead.
        This can be useful to avoid the potentially expensive MultiIndex
        creation.

To avoid any confusion with xarray.indexes.RangeIndex (float range) and xarray.indexes.PandasMultiIndex.


Returns
-------
@@ -7288,7 +7305,7 @@ def to_dataframe(self, dim_order: Sequence[Hashable] | None = None) -> pd.DataFrame:

ordered_dims = self._normalize_dim_order(dim_order=dim_order)

return self._to_dataframe(ordered_dims=ordered_dims)
return self._to_dataframe(ordered_dims=ordered_dims, create_index=create_index)

def _set_sparse_data_from_dataframe(
self, idx: pd.Index, arrays: list[tuple[Hashable, np.ndarray]], dims: tuple
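A similarly hedged sketch for the Dataset method, mirroring the new test further below; the data values and dimension sizes are illustrative:

```python
import numpy as np
import pandas as pd
import xarray as xr

ds = xr.Dataset(
    {
        "a": (("x", "y"), np.random.randn(3, 4)),
        "b": (("x", "y"), np.random.randn(3, 4)),
    },
    coords={"x": [1, 2, 3], "y": list("abcd")},
)

# With create_index=False (assuming a build of xarray with this change), the
# result keeps a RangeIndex whose length is the product of the dimension
# sizes, i.e. the same number of rows the MultiIndex version would have.
df = ds.to_dataframe(create_index=False)
assert isinstance(df.index, pd.RangeIndex)
assert len(df) == 3 * 4
```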
26 changes: 26 additions & 0 deletions xarray/tests/test_dataarray.py
@@ -3655,6 +3655,32 @@ def test_to_dataframe_0length(self) -> None:
assert len(actual) == 0
assert_array_equal(actual.index.names, list("ABC"))

def test_to_dataframe_create_index(self) -> None:
# Test create_index parameter
arr_np = np.arange(12).reshape(3, 4)
arr = DataArray(arr_np, [("x", [1, 2, 3]), ("y", list("abcd"))], name="foo")

# Default behavior: create MultiIndex
df_with_index = arr.to_dataframe()
assert isinstance(df_with_index.index, pd.MultiIndex)
assert df_with_index.index.names == ["x", "y"]
assert len(df_with_index) == 12

# With create_index=False: use RangeIndex
df_without_index = arr.to_dataframe(create_index=False)
assert isinstance(df_without_index.index, pd.RangeIndex)
assert len(df_without_index) == 12

# Data should be the same regardless
assert_array_equal(df_with_index["foo"].values, df_without_index["foo"].values)

# Test with coords that have different dimensions
arr.coords["z"] = ("x", [-1, -2, -3])
df_with_coords = arr.to_dataframe(create_index=False)
assert isinstance(df_with_coords.index, pd.RangeIndex)
assert "z" in df_with_coords.columns
assert len(df_with_coords) == 12

@pytest.mark.parametrize(
"x_dtype,y_dtype,v_dtype",
[
32 changes: 32 additions & 0 deletions xarray/tests/test_dataset.py
@@ -2282,6 +2282,38 @@ def test_to_pandas(self) -> None:
with pytest.raises(ValueError, match=r"cannot convert Datasets"):
Dataset({"a": (["t", "r"], x2d), "b": (["t", "r"], y2d)}).to_pandas()

def test_to_dataframe_create_index(self) -> None:
# Test create_index parameter for Dataset
x = np.random.randn(3, 4)
y = np.random.randn(3, 4)
ds = Dataset(
{"a": (("x", "y"), x), "b": (("x", "y"), y)},
coords={"x": [1, 2, 3], "y": list("abcd")},
)

# Default behavior: create MultiIndex
df_with_index = ds.to_dataframe()
assert isinstance(df_with_index.index, pd.MultiIndex)
assert df_with_index.index.names == ["x", "y"]
assert len(df_with_index) == 12

# With create_index=False: use RangeIndex
df_without_index = ds.to_dataframe(create_index=False)
assert isinstance(df_without_index.index, pd.RangeIndex)
assert len(df_without_index) == 12

# Data should be the same regardless
assert_array_equal(df_with_index["a"].values, df_without_index["a"].values)
assert_array_equal(df_with_index["b"].values, df_without_index["b"].values)

# Test with dim_order and create_index=False
df_reordered = ds.to_dataframe(dim_order=["y", "x"], create_index=False)
assert isinstance(df_reordered.index, pd.RangeIndex)
assert len(df_reordered) == 12
# Check that dim_order affects the data ordering
df_reordered_with_idx = ds.to_dataframe(dim_order=["y", "x"])
assert_array_equal(df_reordered["a"].values, df_reordered_with_idx["a"].values)

def test_reindex_like(self) -> None:
data = create_test_data()
data["letters"] = ("dim3", 10 * ["a"])
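Finally, a rough, hypothetical timing sketch of why one might want to skip the MultiIndex; it is not part of the PR, and absolute numbers will vary with machine, array size, and xarray/pandas versions:

```python
import time

import numpy as np
import xarray as xr

ds = xr.Dataset(
    {"v": (("x", "y"), np.random.randn(2000, 2000))},
    coords={"x": np.arange(2000), "y": np.arange(2000)},
)

# Compare conversion time with and without building the MultiIndex
# (create_index=False requires a build of xarray that includes this change).
for create_index in (True, False):
    t0 = time.perf_counter()
    ds.to_dataframe(create_index=create_index)
    print(f"create_index={create_index}: {time.perf_counter() - t0:.3f} s")
```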