From 25b35efed20aaba28df4ea435173ba84a4145e1d Mon Sep 17 00:00:00 2001 From: Sindre Breda Date: Wed, 8 Jan 2025 14:21:44 +0100 Subject: [PATCH 1/6] feat(graph validation): add graph validation and tests --- .../tests/validate/test_validate_graph.py | 46 ++++++++++++++ graphistry/validate/validate_graph.py | 61 +++++++++++++++++++ 2 files changed, 107 insertions(+) create mode 100644 graphistry/tests/validate/test_validate_graph.py create mode 100644 graphistry/validate/validate_graph.py diff --git a/graphistry/tests/validate/test_validate_graph.py b/graphistry/tests/validate/test_validate_graph.py new file mode 100644 index 0000000000..9c06e7ca5e --- /dev/null +++ b/graphistry/tests/validate/test_validate_graph.py @@ -0,0 +1,46 @@ +from graphistry.validate.validate_graph import validate_graph +import graphistry +import pandas as pd + + +def test_validate_graph_good(): + g = graphistry.edges(pd.DataFrame({'s': ['a', 'b'], 'd': ['b', 'c']}), 's', 'd').nodes( + pd.DataFrame({'id': ['a', 'b', 'c'], 'name': ['A', 'B', 'C']}), node='id') + assert (validate_graph(g) is True) + + +def test_validate_graph_undefined_nodeid(): + g = graphistry.edges(pd.DataFrame({'s': ['a', 'b'], 'd': ['b', 'c']}), 's', 'd').nodes( + pd.DataFrame({'id': ['a', 'b', 'c'], 'name': ['A', 'B', 'C']})) + assert (validate_graph(g) is False) + + +def test_validate_graph_duplicate_nodeid(): + g = graphistry.edges(pd.DataFrame({'s': ['a', 'b'], 'd': ['b', 'c']}), 's', 'd').nodes( + pd.DataFrame({'id': ['a','a', 'b', 'c'], 'name': ['A','A2', 'B', 'C']}), node='id') + assert (validate_graph(g) is False) + + +def test_validate_graph_missing_nodes(): + g = graphistry.edges(pd.DataFrame({'s': ['a', 'b'], 'd': ['b', 'c']})) + assert (validate_graph(g) is False) + + +def test_validate_graph_nan_nodes(): + g = graphistry.edges(pd.DataFrame({'s': ['a', 'b'], 'd': ['b', 'c']}), 's', 'd').nodes( + pd.DataFrame({'id': [None, 'b', 'c'], 'name': ['A', 'B', 'C']}), node='id') + assert (validate_graph(g) is False) + + +def test_validate_graph_missing_src_node(): + # Only returns warning + g = graphistry.edges(pd.DataFrame({'s': ['a', 'b'], 'd': ['b', 'c']}), 's', 'd').nodes( + pd.DataFrame({'id': ['b', 'c'], 'name': ['B', 'C']}), node='id') + assert (validate_graph(g) is True) + + +def test_validate_graph_missing_dst_node(): + # Only returns warning + g = graphistry.edges(pd.DataFrame({'s': ['a', 'b'], 'd': ['b', 'c']}), 's', 'd').nodes( + pd.DataFrame({'id': ['a','b', ], 'name': ['A', 'B']}), node='id') + assert (validate_graph(g) is True) \ No newline at end of file diff --git a/graphistry/validate/validate_graph.py b/graphistry/validate/validate_graph.py new file mode 100644 index 0000000000..60d333c07a --- /dev/null +++ b/graphistry/validate/validate_graph.py @@ -0,0 +1,61 @@ +def check_node_dataframe_exists(g, verbose=True): + if g._nodes is None: + if verbose: + print("Warning: graph was created with only edges. Skipping Node ID check if Node IDs match edge IDs. Use g2 = g.materialize_nodes() to force node df creation. Exiting.") + return False + return True + + +def check_node_id_defined(g, verbose=True): + if g._node is None: + if verbose: + print("Invalid graph: Missing Node ID. Did you forget to specify the node ID in the .nodes() function? Exiting.") + return False + return True + + +def check_nan_node_ids(g, verbose=True): + if g._nodes[g._node].isnull().any(): + if verbose: + print("Invalid graph: Contains NaN Node IDs.") + return False + return True + + +def check_duplicate_node_ids(g, verbose=True): + if g._nodes[g._node].duplicated().any(): + if verbose: + print("Invalid graph: Contains duplicate Node IDs.") + return False + return True + + +def check_edge_sources_exist_in_nodes(g, verbose=True): + if not g._edges[g._source].isin(g._nodes[g._node]).all(): + if verbose: + print("Warning: Contains source edge IDs that do not exist in the node DataFrame. This can cause unexpected results.") + return True + + +def check_edge_destinations_exist_in_nodes(g, verbose=True): + if not g._edges[g._destination].isin(g._nodes[g._node]).all(): + if verbose: + print("Warning: Contains destination edge IDs that do not exist in the node DataFrame. This can cause unexpected results.") + return True + + +def validate_graph(g, verbose=True): + if not check_node_dataframe_exists(g, verbose): + return False + if not check_node_id_defined(g, verbose): + return False + if not check_nan_node_ids(g, verbose): + return False + if not check_duplicate_node_ids(g, verbose): + return False + check_edge_sources_exist_in_nodes(g, verbose) # Warnings only + check_edge_destinations_exist_in_nodes(g, verbose) # Warnings only + + if verbose: + print("Graph is valid.") + return True \ No newline at end of file From e08bc3746188b1a0f4822129773ebe7b210af0a4 Mon Sep 17 00:00:00 2001 From: Sindre Breda Date: Wed, 8 Jan 2025 14:53:27 +0100 Subject: [PATCH 2/6] fix(lint): add newline to end of graph validation and tests --- graphistry/tests/validate/test_validate_graph.py | 2 +- graphistry/validate/validate_graph.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/graphistry/tests/validate/test_validate_graph.py b/graphistry/tests/validate/test_validate_graph.py index 9c06e7ca5e..a8ff813095 100644 --- a/graphistry/tests/validate/test_validate_graph.py +++ b/graphistry/tests/validate/test_validate_graph.py @@ -43,4 +43,4 @@ def test_validate_graph_missing_dst_node(): # Only returns warning g = graphistry.edges(pd.DataFrame({'s': ['a', 'b'], 'd': ['b', 'c']}), 's', 'd').nodes( pd.DataFrame({'id': ['a','b', ], 'name': ['A', 'B']}), node='id') - assert (validate_graph(g) is True) \ No newline at end of file + assert (validate_graph(g) is True) diff --git a/graphistry/validate/validate_graph.py b/graphistry/validate/validate_graph.py index 60d333c07a..a21dd2607b 100644 --- a/graphistry/validate/validate_graph.py +++ b/graphistry/validate/validate_graph.py @@ -58,4 +58,4 @@ def validate_graph(g, verbose=True): if verbose: print("Graph is valid.") - return True \ No newline at end of file + return True From 411e9007c2742f4d7bf854618e0027c219e0ea3c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Percy=20Camilo=20Trive=C3=B1o=20Aucahuasi?= Date: Thu, 9 Jan 2025 13:49:04 -0500 Subject: [PATCH 3/6] fix type checks for enum fields --- graphistry/Engine.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/graphistry/Engine.py b/graphistry/Engine.py index 57b0a3cbce..0869c4c93b 100644 --- a/graphistry/Engine.py +++ b/graphistry/Engine.py @@ -6,13 +6,13 @@ from graphistry.utils.lazy_import import lazy_cudf_import -class Engine(Enum): - PANDAS : str = 'pandas' - CUDF : str = 'cudf' - DASK : str = 'dask' - DASK_CUDF : str = 'dask_cudf' +class Engine(str, Enum): + PANDAS = 'pandas' + CUDF = 'cudf' + DASK = 'dask' + DASK_CUDF = 'dask_cudf' -class EngineAbstract(Enum): +class EngineAbstract(str, Enum): PANDAS = Engine.PANDAS.value CUDF = Engine.CUDF.value DASK = Engine.DASK.value From 644e641092ce6a449876e9f3f79d055e004587b1 Mon Sep 17 00:00:00 2001 From: Sindre Breda Date: Wed, 8 Jan 2025 14:21:44 +0100 Subject: [PATCH 4/6] feat(graph validation): add graph validation and tests --- .../tests/validate/test_validate_graph.py | 46 ++++++++++++++ graphistry/validate/validate_graph.py | 61 +++++++++++++++++++ 2 files changed, 107 insertions(+) create mode 100644 graphistry/tests/validate/test_validate_graph.py create mode 100644 graphistry/validate/validate_graph.py diff --git a/graphistry/tests/validate/test_validate_graph.py b/graphistry/tests/validate/test_validate_graph.py new file mode 100644 index 0000000000..9c06e7ca5e --- /dev/null +++ b/graphistry/tests/validate/test_validate_graph.py @@ -0,0 +1,46 @@ +from graphistry.validate.validate_graph import validate_graph +import graphistry +import pandas as pd + + +def test_validate_graph_good(): + g = graphistry.edges(pd.DataFrame({'s': ['a', 'b'], 'd': ['b', 'c']}), 's', 'd').nodes( + pd.DataFrame({'id': ['a', 'b', 'c'], 'name': ['A', 'B', 'C']}), node='id') + assert (validate_graph(g) is True) + + +def test_validate_graph_undefined_nodeid(): + g = graphistry.edges(pd.DataFrame({'s': ['a', 'b'], 'd': ['b', 'c']}), 's', 'd').nodes( + pd.DataFrame({'id': ['a', 'b', 'c'], 'name': ['A', 'B', 'C']})) + assert (validate_graph(g) is False) + + +def test_validate_graph_duplicate_nodeid(): + g = graphistry.edges(pd.DataFrame({'s': ['a', 'b'], 'd': ['b', 'c']}), 's', 'd').nodes( + pd.DataFrame({'id': ['a','a', 'b', 'c'], 'name': ['A','A2', 'B', 'C']}), node='id') + assert (validate_graph(g) is False) + + +def test_validate_graph_missing_nodes(): + g = graphistry.edges(pd.DataFrame({'s': ['a', 'b'], 'd': ['b', 'c']})) + assert (validate_graph(g) is False) + + +def test_validate_graph_nan_nodes(): + g = graphistry.edges(pd.DataFrame({'s': ['a', 'b'], 'd': ['b', 'c']}), 's', 'd').nodes( + pd.DataFrame({'id': [None, 'b', 'c'], 'name': ['A', 'B', 'C']}), node='id') + assert (validate_graph(g) is False) + + +def test_validate_graph_missing_src_node(): + # Only returns warning + g = graphistry.edges(pd.DataFrame({'s': ['a', 'b'], 'd': ['b', 'c']}), 's', 'd').nodes( + pd.DataFrame({'id': ['b', 'c'], 'name': ['B', 'C']}), node='id') + assert (validate_graph(g) is True) + + +def test_validate_graph_missing_dst_node(): + # Only returns warning + g = graphistry.edges(pd.DataFrame({'s': ['a', 'b'], 'd': ['b', 'c']}), 's', 'd').nodes( + pd.DataFrame({'id': ['a','b', ], 'name': ['A', 'B']}), node='id') + assert (validate_graph(g) is True) \ No newline at end of file diff --git a/graphistry/validate/validate_graph.py b/graphistry/validate/validate_graph.py new file mode 100644 index 0000000000..60d333c07a --- /dev/null +++ b/graphistry/validate/validate_graph.py @@ -0,0 +1,61 @@ +def check_node_dataframe_exists(g, verbose=True): + if g._nodes is None: + if verbose: + print("Warning: graph was created with only edges. Skipping Node ID check if Node IDs match edge IDs. Use g2 = g.materialize_nodes() to force node df creation. Exiting.") + return False + return True + + +def check_node_id_defined(g, verbose=True): + if g._node is None: + if verbose: + print("Invalid graph: Missing Node ID. Did you forget to specify the node ID in the .nodes() function? Exiting.") + return False + return True + + +def check_nan_node_ids(g, verbose=True): + if g._nodes[g._node].isnull().any(): + if verbose: + print("Invalid graph: Contains NaN Node IDs.") + return False + return True + + +def check_duplicate_node_ids(g, verbose=True): + if g._nodes[g._node].duplicated().any(): + if verbose: + print("Invalid graph: Contains duplicate Node IDs.") + return False + return True + + +def check_edge_sources_exist_in_nodes(g, verbose=True): + if not g._edges[g._source].isin(g._nodes[g._node]).all(): + if verbose: + print("Warning: Contains source edge IDs that do not exist in the node DataFrame. This can cause unexpected results.") + return True + + +def check_edge_destinations_exist_in_nodes(g, verbose=True): + if not g._edges[g._destination].isin(g._nodes[g._node]).all(): + if verbose: + print("Warning: Contains destination edge IDs that do not exist in the node DataFrame. This can cause unexpected results.") + return True + + +def validate_graph(g, verbose=True): + if not check_node_dataframe_exists(g, verbose): + return False + if not check_node_id_defined(g, verbose): + return False + if not check_nan_node_ids(g, verbose): + return False + if not check_duplicate_node_ids(g, verbose): + return False + check_edge_sources_exist_in_nodes(g, verbose) # Warnings only + check_edge_destinations_exist_in_nodes(g, verbose) # Warnings only + + if verbose: + print("Graph is valid.") + return True \ No newline at end of file From 9b5c88796dd695155d57f1f5517580fb4b136872 Mon Sep 17 00:00:00 2001 From: Sindre Breda Date: Wed, 8 Jan 2025 14:53:27 +0100 Subject: [PATCH 5/6] fix(lint): add newline to end of graph validation and tests --- graphistry/tests/validate/test_validate_graph.py | 2 +- graphistry/validate/validate_graph.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/graphistry/tests/validate/test_validate_graph.py b/graphistry/tests/validate/test_validate_graph.py index 9c06e7ca5e..a8ff813095 100644 --- a/graphistry/tests/validate/test_validate_graph.py +++ b/graphistry/tests/validate/test_validate_graph.py @@ -43,4 +43,4 @@ def test_validate_graph_missing_dst_node(): # Only returns warning g = graphistry.edges(pd.DataFrame({'s': ['a', 'b'], 'd': ['b', 'c']}), 's', 'd').nodes( pd.DataFrame({'id': ['a','b', ], 'name': ['A', 'B']}), node='id') - assert (validate_graph(g) is True) \ No newline at end of file + assert (validate_graph(g) is True) diff --git a/graphistry/validate/validate_graph.py b/graphistry/validate/validate_graph.py index 60d333c07a..a21dd2607b 100644 --- a/graphistry/validate/validate_graph.py +++ b/graphistry/validate/validate_graph.py @@ -58,4 +58,4 @@ def validate_graph(g, verbose=True): if verbose: print("Graph is valid.") - return True \ No newline at end of file + return True From 1ce534590af436f5b688b7f18e696761a2e96688 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Percy=20Camilo=20Trive=C3=B1o=20Aucahuasi?= Date: Tue, 18 Mar 2025 08:16:17 -0500 Subject: [PATCH 6/6] Trigger CI