Skip to content

Commit 8226043

Browse files
authored
BUG: DataFrame.combine_first, combine with non-unique columns (#62941)
1 parent bdc9a7b commit 8226043

File tree

4 files changed

+47
-9
lines changed

4 files changed

+47
-9
lines changed

doc/source/whatsnew/v3.0.0.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1184,6 +1184,8 @@ Reshaping
11841184
- Bug in :func:`concat` with mixed integer and bool dtypes incorrectly casting the bools to integers (:issue:`45101`)
11851185
- Bug in :func:`qcut` where values at the quantile boundaries could be incorrectly assigned (:issue:`59355`)
11861186
- Bug in :meth:`DataFrame.combine_first` not preserving the column order (:issue:`60427`)
1187+
- Bug in :meth:`DataFrame.combine_first` with non-unique columns incorrectly raising (:issue:`29135`)
1188+
- Bug in :meth:`DataFrame.combine` with non-unique columns incorrectly raising (:issue:`51340`)
11871189
- Bug in :meth:`DataFrame.explode` producing incorrect result for :class:`pyarrow.large_list` type (:issue:`61091`)
11881190
- Bug in :meth:`DataFrame.join` inconsistently setting result index name (:issue:`55815`)
11891191
- Bug in :meth:`DataFrame.join` when a :class:`DataFrame` with a :class:`MultiIndex` would raise an ``AssertionError`` when :attr:`MultiIndex.names` contained ``None``. (:issue:`58721`)

pandas/core/frame.py

Lines changed: 17 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -9096,11 +9096,14 @@ def combine(
90969096

90979097
# preserve column order
90989098
new_columns = self.columns.union(other_columns, sort=False)
9099+
this = this.reindex(new_columns, axis=1)
9100+
other = other.reindex(new_columns, axis=1)
9101+
90999102
do_fill = fill_value is not None
91009103
result = {}
9101-
for col in new_columns:
9102-
series = this[col]
9103-
other_series = other[col]
9104+
for i in range(this.shape[1]):
9105+
series = this.iloc[:, i]
9106+
other_series = other.iloc[:, i]
91049107

91059108
this_dtype = series.dtype
91069109
other_dtype = other_series.dtype
@@ -9111,7 +9114,7 @@ def combine(
91119114
# don't overwrite columns unnecessarily
91129115
# DO propagate if this column is not in the intersection
91139116
if not overwrite and other_mask.all():
9114-
result[col] = this[col].copy()
9117+
result[i] = series.copy()
91159118
continue
91169119

91179120
if do_fill:
@@ -9120,7 +9123,7 @@ def combine(
91209123
series[this_mask] = fill_value
91219124
other_series[other_mask] = fill_value
91229125

9123-
if col not in self.columns:
9126+
if new_columns[i] not in self.columns:
91249127
# If self does not have this column (it is present only in other),
91259128
# try to promote series, which is all NaN, to other_dtype.
91269129
new_dtype = other_dtype
@@ -9145,10 +9148,10 @@ def combine(
91459148
arr, new_dtype
91469149
)
91479150

9148-
result[col] = arr
9151+
result[i] = arr
91499152

9150-
# convert_objects just in case
9151-
frame_result = self._constructor(result, index=new_index, columns=new_columns)
9153+
frame_result = self._constructor(result, index=new_index)
9154+
frame_result.columns = new_columns
91529155
return frame_result.__finalize__(self, method="combine")
91539156

91549157
def combine_first(self, other: DataFrame) -> DataFrame:
@@ -9212,9 +9215,14 @@ def combiner(x: Series, y: Series):
92129215
combined = self.combine(other, combiner, overwrite=False)
92139216

92149217
dtypes = {
9218+
# Check for isinstance(..., (np.dtype, ExtensionDtype))
9219+
# to prevent raising on non-unique columns; see GH#29135.
9220+
# Note: in these cases we simply skip the cast.
92159221
col: find_common_type([self.dtypes[col], other.dtypes[col]])
92169222
for col in self.columns.intersection(other.columns)
9217-
if combined.dtypes[col] != self.dtypes[col]
9223+
if isinstance(combined.dtypes[col], (np.dtype, ExtensionDtype))
9224+
and isinstance(self.dtypes[col], (np.dtype, ExtensionDtype))
9225+
and combined.dtypes[col] != self.dtypes[col]
92189226
}
92199227

92209228
if dtypes:

pandas/tests/frame/methods/test_combine.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,3 +45,19 @@ def test_combine_generic(self, float_frame):
4545
)
4646
tm.assert_frame_equal(chunk, exp)
4747
tm.assert_frame_equal(chunk2, exp)
48+
49+
def test_combine_nonunique_columns(self):
50+
# GH#51340
51+
52+
df = pd.DataFrame({"A": range(5), "B": range(5)})
53+
df.columns = ["A", "A"]
54+
55+
other = df.copy()
56+
df.iloc[1, :] = None
57+
58+
def combiner(a, b):
59+
return b
60+
61+
result = df.combine(other, combiner)
62+
expected = other.astype("float64")
63+
tm.assert_frame_equal(result, expected)

pandas/tests/frame/methods/test_combine_first.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -413,6 +413,18 @@ def test_combine_first_preserve_EA_precision(self, wide_val, dtype):
413413
expected = DataFrame({"A": [wide_val, 5, wide_val]}, dtype=dtype)
414414
tm.assert_frame_equal(result, expected)
415415

416+
def test_combine_first_non_unique_columns(self):
417+
# GH#29135
418+
df1 = DataFrame([[1, np.nan], [3, 4]], columns=["P", "Q"], index=["A", "B"])
419+
df2 = DataFrame(
420+
[[5, 6, 7], [8, 9, np.nan]], columns=["P", "Q", "Q"], index=["A", "B"]
421+
)
422+
result = df1.combine_first(df2)
423+
expected = DataFrame(
424+
[[1, 6.0, 7.0], [3, 4.0, 4.0]], index=["A", "B"], columns=["P", "Q", "Q"]
425+
)
426+
tm.assert_frame_equal(result, expected)
427+
416428

417429
@pytest.mark.parametrize(
418430
"scalar1, scalar2",

0 commit comments

Comments
 (0)