Skip to content

Commit 24d90e0

Browse files
authored
Improve error reporting when merge validation fails (#62837)
1 parent 00feea4 commit 24d90e0

File tree

3 files changed

+46
-5
lines changed

3 files changed

+46
-5
lines changed

doc/source/whatsnew/v3.0.0.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -219,6 +219,7 @@ Other enhancements
219219
- Added support to read and write from and to Apache Iceberg tables with the new :func:`read_iceberg` and :meth:`DataFrame.to_iceberg` functions (:issue:`61383`)
220220
- Errors occurring during SQL I/O will now throw a generic :class:`.DatabaseError` instead of the raw Exception type from the underlying driver manager library (:issue:`60748`)
221221
- Implemented :meth:`Series.str.isascii` and :meth:`Index.str.isascii` (:issue:`59091`)
222+
- Improved error reporting when :func:`merge` validation fails: the error message now lists the first few duplicated merge keys (:issue:`62742`)
222223
- Improve the resulting dtypes in :meth:`DataFrame.where` and :meth:`DataFrame.mask` with :class:`ExtensionDtype` ``other`` (:issue:`62038`)
223224
- Improved deprecation message for offset aliases (:issue:`60820`)
224225
- Many type aliases are now exposed in the new submodule :py:mod:`pandas.api.typing.aliases` (:issue:`55231`)

pandas/core/reshape/merge.py

Lines changed: 25 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1954,42 +1954,62 @@ def _validate_left_right_on(self, left_on, right_on):
19541954
def _validate_validate_kwd(self, validate: str) -> None:
19551955
# Check uniqueness of each
19561956
if self.left_index:
1957-
left_unique = self.orig_left.index.is_unique
1957+
left_join_index = self.orig_left.index
1958+
left_unique = left_join_index.is_unique
19581959
else:
1959-
left_unique = MultiIndex.from_arrays(self.left_join_keys).is_unique
1960+
left_join_index = MultiIndex.from_arrays(self.left_join_keys)
1961+
left_unique = left_join_index.is_unique
19601962

19611963
if self.right_index:
1964+
right_join_index = self.orig_right.index
19621965
right_unique = self.orig_right.index.is_unique
19631966
else:
1964-
right_unique = MultiIndex.from_arrays(self.right_join_keys).is_unique
1967+
right_join_index = MultiIndex.from_arrays(self.right_join_keys)
1968+
right_unique = right_join_index.is_unique
1969+
1970+
def left_error_msg(x: Index) -> str:
1971+
name = self.left_on if not self.left_index else lib.no_default
1972+
msg = x[x.duplicated()][:5].to_frame(name=name).to_string(index=False)
1973+
return f"\nDuplicates in left:\n {msg} ..."
1974+
1975+
def right_error_msg(x: Index) -> str:
1976+
name = self.right_on if not self.right_index else lib.no_default
1977+
msg = x[x.duplicated()][:5].to_frame(name=name).to_string(index=False)
1978+
return f"\nDuplicates in right:\n {msg} ..."
19651979

19661980
# Check data integrity
19671981
if validate in ["one_to_one", "1:1"]:
19681982
if not left_unique and not right_unique:
19691983
raise MergeError(
19701984
"Merge keys are not unique in either left "
1971-
"or right dataset; not a one-to-one merge"
1985+
"or right dataset; not a one-to-one merge."
1986+
f"{left_error_msg(left_join_index)}"
1987+
f"{right_error_msg(right_join_index)}"
19721988
)
19731989
if not left_unique:
19741990
raise MergeError(
19751991
"Merge keys are not unique in left dataset; not a one-to-one merge"
1992+
f"{left_error_msg(left_join_index)}"
19761993
)
19771994
if not right_unique:
19781995
raise MergeError(
19791996
"Merge keys are not unique in right dataset; not a one-to-one merge"
1997+
f"{right_error_msg(right_join_index)}"
19801998
)
19811999

19822000
elif validate in ["one_to_many", "1:m"]:
19832001
if not left_unique:
19842002
raise MergeError(
19852003
"Merge keys are not unique in left dataset; not a one-to-many merge"
2004+
f"{left_error_msg(left_join_index)}"
19862005
)
19872006

19882007
elif validate in ["many_to_one", "m:1"]:
19892008
if not right_unique:
19902009
raise MergeError(
19912010
"Merge keys are not unique in right dataset; "
1992-
"not a many-to-one merge"
2011+
"not a many-to-one merge\n"
2012+
f"{right_error_msg(right_join_index)}"
19932013
)
19942014

19952015
elif validate in ["many_to_many", "m:m"]:

pandas/tests/reshape/merge/test_merge.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1325,6 +1325,26 @@ def test_validation(self):
13251325
result = merge(left, right, on=["a", "b"], validate="1:1")
13261326
tm.assert_frame_equal(result, expected_multi)
13271327

1328+
def test_merge_validate_error_message(self):
1329+
# GH#62742
1330+
left = DataFrame({"key": [1, 1, 2]})
1331+
right = DataFrame({"key": [1, 2, 2]})
1332+
1333+
with pytest.raises(MergeError, match="Duplicates in left:\n key\n 1 ...\n"):
1334+
merge(left, right, validate="1:1")
1335+
with pytest.raises(MergeError, match="Duplicates in left:\n key\n 1 ..."):
1336+
merge(left, right, validate="1:m")
1337+
with pytest.raises(MergeError, match="Duplicates in right:\n key\n 2 ..."):
1338+
merge(left, right, validate="1:1")
1339+
with pytest.raises(MergeError, match="Duplicates in right:\n key\n 2 ..."):
1340+
merge(left, right, validate="m:1")
1341+
1342+
right = DataFrame({"key": [1, 2, 3]})
1343+
with pytest.raises(MergeError, match="Duplicates in left:\n key\n 1 ..."):
1344+
merge(left, right, validate="1:1")
1345+
with pytest.raises(MergeError, match="Duplicates in right:\n key\n 1 ..."):
1346+
merge(right, left, validate="1:1")
1347+
13281348
def test_merge_two_empty_df_no_division_error(self):
13291349
# GH17776, PR #17846
13301350
a = DataFrame({"a": [], "b": [], "c": []})

0 commit comments

Comments
 (0)