-
Notifications
You must be signed in to change notification settings - Fork 217
269 fea validator - add validator code. #652
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
25b35ef
e08bc37
411e900
644e641
9b5c887
8702367
1ce5345
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,46 @@ | ||
from graphistry.validate.validate_graph import validate_graph | ||
import graphistry | ||
import pandas as pd | ||
|
||
|
||
def test_validate_graph_good(): | ||
g = graphistry.edges(pd.DataFrame({'s': ['a', 'b'], 'd': ['b', 'c']}), 's', 'd').nodes( | ||
pd.DataFrame({'id': ['a', 'b', 'c'], 'name': ['A', 'B', 'C']}), node='id') | ||
assert (validate_graph(g) is True) | ||
|
||
|
||
def test_validate_graph_undefined_nodeid(): | ||
g = graphistry.edges(pd.DataFrame({'s': ['a', 'b'], 'd': ['b', 'c']}), 's', 'd').nodes( | ||
pd.DataFrame({'id': ['a', 'b', 'c'], 'name': ['A', 'B', 'C']})) | ||
assert (validate_graph(g) is False) | ||
|
||
|
||
def test_validate_graph_duplicate_nodeid(): | ||
g = graphistry.edges(pd.DataFrame({'s': ['a', 'b'], 'd': ['b', 'c']}), 's', 'd').nodes( | ||
pd.DataFrame({'id': ['a','a', 'b', 'c'], 'name': ['A','A2', 'B', 'C']}), node='id') | ||
assert (validate_graph(g) is False) | ||
|
||
|
||
def test_validate_graph_missing_nodes(): | ||
g = graphistry.edges(pd.DataFrame({'s': ['a', 'b'], 'd': ['b', 'c']})) | ||
assert (validate_graph(g) is False) | ||
|
||
|
||
def test_validate_graph_nan_nodes(): | ||
g = graphistry.edges(pd.DataFrame({'s': ['a', 'b'], 'd': ['b', 'c']}), 's', 'd').nodes( | ||
pd.DataFrame({'id': [None, 'b', 'c'], 'name': ['A', 'B', 'C']}), node='id') | ||
assert (validate_graph(g) is False) | ||
|
||
|
||
def test_validate_graph_missing_src_node(): | ||
# Only returns warning | ||
g = graphistry.edges(pd.DataFrame({'s': ['a', 'b'], 'd': ['b', 'c']}), 's', 'd').nodes( | ||
pd.DataFrame({'id': ['b', 'c'], 'name': ['B', 'C']}), node='id') | ||
assert (validate_graph(g) is True) | ||
|
||
|
||
def test_validate_graph_missing_dst_node(): | ||
# Only returns warning | ||
g = graphistry.edges(pd.DataFrame({'s': ['a', 'b'], 'd': ['b', 'c']}), 's', 'd').nodes( | ||
pd.DataFrame({'id': ['a','b', ], 'name': ['A', 'B']}), node='id') | ||
assert (validate_graph(g) is True) |
Original file line number | Diff line number | Diff line change | ||
---|---|---|---|---|
@@ -0,0 +1,61 @@ | ||||
def check_node_dataframe_exists(g, verbose=True): | ||||
if g._nodes is None: | ||||
if verbose: | ||||
print("Warning: graph was created with only edges. Skipping Node ID check if Node IDs match edge IDs. Use g2 = g.materialize_nodes() to force node df creation. Exiting.") | ||||
return False | ||||
return True | ||||
|
||||
|
||||
def check_node_id_defined(g, verbose=True): | ||||
if g._node is None: | ||||
if verbose: | ||||
print("Invalid graph: Missing Node ID. Did you forget to specify the node ID in the .nodes() function? Exiting.") | ||||
return False | ||||
return True | ||||
|
||||
|
||||
def check_nan_node_ids(g, verbose=True): | ||||
if g._nodes[g._node].isnull().any(): | ||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. compute & report a more verbose mode might return the df to help save time |
||||
if verbose: | ||||
print("Invalid graph: Contains NaN Node IDs.") | ||||
return False | ||||
return True | ||||
|
||||
|
||||
def check_duplicate_node_ids(g, verbose=True): | ||||
if g._nodes[g._node].duplicated().any(): | ||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. |
||||
if verbose: | ||||
print("Invalid graph: Contains duplicate Node IDs.") | ||||
return False | ||||
return True | ||||
|
||||
|
||||
def check_edge_sources_exist_in_nodes(g, verbose=True): | ||||
if not g._edges[g._source].isin(g._nodes[g._node]).all(): | ||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. |
||||
if verbose: | ||||
print("Warning: Contains source edge IDs that do not exist in the node DataFrame. This can cause unexpected results.") | ||||
return True | ||||
|
||||
|
||||
def check_edge_destinations_exist_in_nodes(g, verbose=True): | ||||
if not g._edges[g._destination].isin(g._nodes[g._node]).all(): | ||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. |
||||
if verbose: | ||||
print("Warning: Contains destination edge IDs that do not exist in the node DataFrame. This can cause unexpected results.") | ||||
return True | ||||
|
||||
|
||||
def validate_graph(g, verbose=True): | ||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. for validators used for debugging, instead of doing an immediate error, good to collect errors and report them all, with failure only at the end There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. as a public function, should have a docstr matching other docstrs format There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. recommend also checking that dtypes of src/dst match, as does node/src if exist type mismatch is more of a warning vs a fail, as we can support bipartite graphs, like imagine There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. likewise, as a warn vs fail, if there is no overlap of src/dst edges, they may have a mismatch needing cleanup, imagine |
||||
if not check_node_dataframe_exists(g, verbose): | ||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. node existence should be more like:
|
||||
return False | ||||
if not check_node_id_defined(g, verbose): | ||||
return False | ||||
if not check_nan_node_ids(g, verbose): | ||||
return False | ||||
if not check_duplicate_node_ids(g, verbose): | ||||
return False | ||||
check_edge_sources_exist_in_nodes(g, verbose) # Warnings only | ||||
check_edge_destinations_exist_in_nodes(g, verbose) # Warnings only | ||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. why not returning There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Good point, I guess I was considering only graph with edge IDs that could be valid. However, to your point, we should error if g._node and g._nodes exist and there are missing node IDs from g._edges src/dst, that should throw an error. |
||||
|
||||
if verbose: | ||||
print("Graph is valid.") | ||||
return True |
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
switch from
print
statements towarn
+logger.info
depending on useremoves need for
verbose
param via Pythonic conventions