BUG: DataFrame.combine_first, combine with non-unique columns (#62941)

jbrockmendel · web-flow · commit 8226043690dd · 2025-10-31T13:26:49.000-07:00
diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
@@ -1184,6 +1184,8 @@ Reshaping
 - Bug in :func:`concat` with mixed integer and bool dtypes incorrectly casting the bools to integers (:issue:`45101`)
 - Bug in :func:`qcut` where values at the quantile boundaries could be incorrectly assigned (:issue:`59355`)
 - Bug in :meth:`DataFrame.combine_first` not preserving the column order (:issue:`60427`)
+- Bug in :meth:`DataFrame.combine_first` with non-unique columns incorrectly raising (:issue:`29135`)
+- Bug in :meth:`DataFrame.combine` with non-unique columns incorrectly raising (:issue:`51340`)
 - Bug in :meth:`DataFrame.explode` producing incorrect result for :class:`pyarrow.large_list` type (:issue:`61091`)
 - Bug in :meth:`DataFrame.join` inconsistently setting result index name (:issue:`55815`)
 - Bug in :meth:`DataFrame.join` when a :class:`DataFrame` with a :class:`MultiIndex` would raise an ``AssertionError`` when :attr:`MultiIndex.names` contained ``None``. (:issue:`58721`)
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -9096,11 +9096,14 @@ def combine(
 
         # preserve column order
         new_columns = self.columns.union(other_columns, sort=False)
+        this = this.reindex(new_columns, axis=1)
+        other = other.reindex(new_columns, axis=1)
+
         do_fill = fill_value is not None
         result = {}
-        for col in new_columns:
-            series = this[col]
-            other_series = other[col]
+        for i in range(this.shape[1]):
+            series = this.iloc[:, i]
+            other_series = other.iloc[:, i]
 
             this_dtype = series.dtype
             other_dtype = other_series.dtype
@@ -9111,7 +9114,7 @@ def combine(
             # don't overwrite columns unnecessarily
             # DO propagate if this column is not in the intersection
             if not overwrite and other_mask.all():
-                result[col] = this[col].copy()
+                result[i] = series.copy()
                 continue
 
             if do_fill:
@@ -9120,7 +9123,7 @@ def combine(
                 series[this_mask] = fill_value
                 other_series[other_mask] = fill_value
 
-            if col not in self.columns:
+            if new_columns[i] not in self.columns:
                 # If self DataFrame does not have col in other DataFrame,
                 # try to promote series, which is all NaN, as other_dtype.
                 new_dtype = other_dtype
@@ -9145,10 +9148,10 @@ def combine(
                     arr, new_dtype
                 )
 
-            result[col] = arr
+            result[i] = arr
 
-        # convert_objects just in case
-        frame_result = self._constructor(result, index=new_index, columns=new_columns)
+        frame_result = self._constructor(result, index=new_index)
+        frame_result.columns = new_columns
         return frame_result.__finalize__(self, method="combine")
 
     def combine_first(self, other: DataFrame) -> DataFrame:
@@ -9212,9 +9215,14 @@ def combiner(x: Series, y: Series):
             combined = self.combine(other, combiner, overwrite=False)
 
         dtypes = {
+            # Check for isinstance(..., (np.dtype, ExtensionDtype))
+            #  to prevent raising on non-unique columns see GH#29135.
+            #  Note we will just not-cast in these cases.
             col: find_common_type([self.dtypes[col], other.dtypes[col]])
             for col in self.columns.intersection(other.columns)
-            if combined.dtypes[col] != self.dtypes[col]
+            if isinstance(combined.dtypes[col], (np.dtype, ExtensionDtype))
+            and isinstance(self.dtypes[col], (np.dtype, ExtensionDtype))
+            and combined.dtypes[col] != self.dtypes[col]
         }
 
         if dtypes:
diff --git a/pandas/tests/frame/methods/test_combine.py b/pandas/tests/frame/methods/test_combine.py
@@ -45,3 +45,19 @@ def test_combine_generic(self, float_frame):
         )
         tm.assert_frame_equal(chunk, exp)
         tm.assert_frame_equal(chunk2, exp)
+
+    def test_combine_nonunique_columns(self):
+        # GH#51340
+
+        df = pd.DataFrame({"A": range(5), "B": range(5)})
+        df.columns = ["A", "A"]
+
+        other = df.copy()
+        df.iloc[1, :] = None
+
+        def combiner(a, b):
+            return b
+
+        result = df.combine(other, combiner)
+        expected = other.astype("float64")
+        tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/frame/methods/test_combine_first.py b/pandas/tests/frame/methods/test_combine_first.py
@@ -413,6 +413,18 @@ def test_combine_first_preserve_EA_precision(self, wide_val, dtype):
         expected = DataFrame({"A": [wide_val, 5, wide_val]}, dtype=dtype)
         tm.assert_frame_equal(result, expected)
 
+    def test_combine_first_non_unique_columns(self):
+        # GH#29135
+        df1 = DataFrame([[1, np.nan], [3, 4]], columns=["P", "Q"], index=["A", "B"])
+        df2 = DataFrame(
+            [[5, 6, 7], [8, 9, np.nan]], columns=["P", "Q", "Q"], index=["A", "B"]
+        )
+        result = df1.combine_first(df2)
+        expected = DataFrame(
+            [[1, 6.0, 7.0], [3, 4.0, 4.0]], index=["A", "B"], columns=["P", "Q", "Q"]
+        )
+        tm.assert_frame_equal(result, expected)
+
 
 @pytest.mark.parametrize(
     "scalar1, scalar2",