Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions graphistry/Engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,13 @@
from enum import Enum


class Engine(Enum):
class Engine(str, Enum):
PANDAS = 'pandas'
CUDF = 'cudf'
DASK = 'dask'
DASK_CUDF = 'dask_cudf'

class EngineAbstract(Enum):
class EngineAbstract(str, Enum):
PANDAS = Engine.PANDAS.value
CUDF = Engine.CUDF.value
DASK = Engine.DASK.value
Expand Down
46 changes: 46 additions & 0 deletions graphistry/tests/validate/test_validate_graph.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
from graphistry.validate.validate_graph import validate_graph
import graphistry
import pandas as pd


def test_validate_graph_good():
    """Edges plus a node table bound with ``node='id'`` should validate.

    The node table lists every endpoint exactly once with no missing IDs,
    so ``validate_graph`` is expected to return ``True``.
    """
    edges_df = pd.DataFrame({'s': ['a', 'b'], 'd': ['b', 'c']})
    nodes_df = pd.DataFrame({'id': ['a', 'b', 'c'], 'name': ['A', 'B', 'C']})
    plotter = graphistry.edges(edges_df, 's', 'd').nodes(nodes_df, node='id')
    outcome = validate_graph(plotter)
    assert outcome is True


def test_validate_graph_undefined_nodeid():
    """A node table attached without naming its ID column should fail.

    ``.nodes()` is called without the ``node=`` argument here, and
    ``validate_graph`` is expected to return ``False``.
    """
    edges_df = pd.DataFrame({'s': ['a', 'b'], 'd': ['b', 'c']})
    nodes_df = pd.DataFrame({'id': ['a', 'b', 'c'], 'name': ['A', 'B', 'C']})
    # Deliberately omit node=... so no node ID column is specified.
    plotter = graphistry.edges(edges_df, 's', 'd').nodes(nodes_df)
    outcome = validate_graph(plotter)
    assert outcome is False


def test_validate_graph_duplicate_nodeid():
    """A node table whose ID column repeats a value should fail.

    The ID ``'a'`` appears twice, and ``validate_graph`` is expected to
    return ``False``.
    """
    edges_df = pd.DataFrame({'s': ['a', 'b'], 'd': ['b', 'c']})
    nodes_df = pd.DataFrame({
        'id': ['a', 'a', 'b', 'c'],
        'name': ['A', 'A2', 'B', 'C'],
    })
    plotter = graphistry.edges(edges_df, 's', 'd').nodes(nodes_df, node='id')
    outcome = validate_graph(plotter)
    assert outcome is False


def test_validate_graph_missing_nodes():
    """A graph built from edges only, with no node table, should fail.

    No ``.nodes()`` call is made, and ``validate_graph`` is expected to
    return ``False``.
    """
    edges_df = pd.DataFrame({'s': ['a', 'b'], 'd': ['b', 'c']})
    # Only the edge frame is supplied; no source/destination names, no nodes.
    plotter = graphistry.edges(edges_df)
    outcome = validate_graph(plotter)
    assert outcome is False


def test_validate_graph_nan_nodes():
    """A node table with a missing (``None``) ID should fail.

    The first node ID is ``None``, and ``validate_graph`` is expected to
    return ``False``.
    """
    edges_df = pd.DataFrame({'s': ['a', 'b'], 'd': ['b', 'c']})
    nodes_df = pd.DataFrame({'id': [None, 'b', 'c'], 'name': ['A', 'B', 'C']})
    plotter = graphistry.edges(edges_df, 's', 'd').nodes(nodes_df, node='id')
    outcome = validate_graph(plotter)
    assert outcome is False


def test_validate_graph_missing_src_node():
    """An edge source absent from the node table should still validate.

    Source ``'a'`` has no row in the node table. This case only produces a
    warning, so ``validate_graph`` is expected to return ``True``.
    """
    edges_df = pd.DataFrame({'s': ['a', 'b'], 'd': ['b', 'c']})
    nodes_df = pd.DataFrame({'id': ['b', 'c'], 'name': ['B', 'C']})
    plotter = graphistry.edges(edges_df, 's', 'd').nodes(nodes_df, node='id')
    # Only returns warning
    outcome = validate_graph(plotter)
    assert outcome is True


def test_validate_graph_missing_dst_node():
    """An edge destination absent from the node table should still validate.

    Destination ``'c'`` has no row in the node table. This case only
    produces a warning, so ``validate_graph`` is expected to return ``True``.
    """
    edges_df = pd.DataFrame({'s': ['a', 'b'], 'd': ['b', 'c']})
    nodes_df = pd.DataFrame({'id': ['a', 'b'], 'name': ['A', 'B']})
    plotter = graphistry.edges(edges_df, 's', 'd').nodes(nodes_df, node='id')
    # Only returns warning
    outcome = validate_graph(plotter)
    assert outcome is True
61 changes: 61 additions & 0 deletions graphistry/validate/validate_graph.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
def check_node_dataframe_exists(g, verbose=True):
if g._nodes is None:
if verbose:
print("Warning: graph was created with only edges. Skipping Node ID check if Node IDs match edge IDs. Use g2 = g.materialize_nodes() to force node df creation. Exiting.")
Copy link
Contributor

@lmeyerov lmeyerov Mar 20, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

switch from print statements to warn + logger.info depending on use

removes need for verbose param via Pythonic conventions

return False
return True


def check_node_id_defined(g, verbose=True):
    """Report whether the graph has a node ID binding.

    :param g: Graph object exposing a ``_node`` attribute.
    :param verbose: When true, print a diagnostic if ``g._node`` is ``None``.
    :returns: ``True`` if ``g._node`` is not ``None``, otherwise ``False``.
    """
    node_id_is_set = g._node is not None
    if verbose and not node_id_is_set:
        print("Invalid graph: Missing Node ID. Did you forget to specify the node ID in the .nodes() function? Exiting.")
    return node_id_is_set


def check_nan_node_ids(g, verbose=True):
if g._nodes[g._node].isnull().any():
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

compute & report .sum() vs just .any()

a more verbose mode might return the df to help save time

if verbose:
print("Invalid graph: Contains NaN Node IDs.")
return False
return True


def check_duplicate_node_ids(g, verbose=True):
if g._nodes[g._node].duplicated().any():
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

if verbose:
print("Invalid graph: Contains duplicate Node IDs.")
return False
return True


def check_edge_sources_exist_in_nodes(g, verbose=True):
if not g._edges[g._source].isin(g._nodes[g._node]).all():
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

if verbose:
print("Warning: Contains source edge IDs that do not exist in the node DataFrame. This can cause unexpected results.")
return True


def check_edge_destinations_exist_in_nodes(g, verbose=True):
if not g._edges[g._destination].isin(g._nodes[g._node]).all():
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

if verbose:
print("Warning: Contains destination edge IDs that do not exist in the node DataFrame. This can cause unexpected results.")
return True


def validate_graph(g, verbose=True):
Copy link
Contributor

@lmeyerov lmeyerov Mar 20, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Copy link
Contributor

@lmeyerov lmeyerov Mar 20, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

for validators used for debugging, instead of doing an immediate error, good to collect errors and report them all, with failure only at the end

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As a public function, this should have a docstring that matches the format of the other docstrings.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

recommend also checking that dtypes of src/dst match, as does node/src if exist

type mismatch is more of a warning vs a fail, as we can support bipartite graphs, like imagine user:str <> day:int

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

likewise, as a warn vs fail, if there is no overlap of src/dst edges, they may have a mismatch needing cleanup, imagine 123 <> id:123

if not check_node_dataframe_exists(g, verbose):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

node existence should be more like:

  • either _node and _nodes are both defined or neither defined

return False
if not check_node_id_defined(g, verbose):
return False
if not check_nan_node_ids(g, verbose):
return False
if not check_duplicate_node_ids(g, verbose):
return False
check_edge_sources_exist_in_nodes(g, verbose) # Warnings only
check_edge_destinations_exist_in_nodes(g, verbose) # Warnings only
Copy link
Contributor

@lmeyerov lmeyerov Mar 20, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why not returning False?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good point, I guess I was considering only graph with edge IDs that could be valid.

However, to your point, if g._node and g._nodes both exist and node IDs referenced by the g._edges src/dst columns are missing from the node table, we should throw an error.


if verbose:
print("Graph is valid.")
return True
Loading