Skip to content

Commit

Permalink
[BUG] Fix Calls to cudf.DataFrame/Series.unique that relied on old behavior (#3616)
Browse files Browse the repository at this point in the history

Updates code that relied on `unique()` returning values in sorted order so that it now explicitly sorts the unique values itself.

Closes #3615

Authors:
  - Alex Barghi (https://github.com/alexbarghi-nv)
  - Brad Rees (https://github.com/BradReesWork)

Approvers:
  - Rick Ratzel (https://github.com/rlratzel)
  - Vibhu Jawa (https://github.com/VibhuJawa)

URL: #3616
  • Loading branch information
alexbarghi-nv authored May 27, 2023
1 parent aa00704 commit 1e6da2b
Show file tree
Hide file tree
Showing 7 changed files with 21 additions and 15 deletions.
8 changes: 5 additions & 3 deletions python/cugraph-service/tests/test_remote_graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -392,9 +392,11 @@ def test_extract_subgraph(

assert remote_sg.get_num_vertices() == sg.number_of_vertices()

expected_vertex_ids = cudf.concat(
[sg.edgelist.edgelist_df["src"], sg.edgelist.edgelist_df["dst"]]
).unique()
expected_vertex_ids = (
cudf.concat([sg.edgelist.edgelist_df["src"], sg.edgelist.edgelist_df["dst"]])
.unique()
.sort_values()
)
if renumber:
expected_vertex_ids = sg.unrenumber(
cudf.DataFrame({"v": expected_vertex_ids}), "v"
Expand Down
2 changes: 1 addition & 1 deletion python/cugraph/cugraph/components/connectivity.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ def _convert_df_to_output_type(df, input_type, return_labels):
# The number of connected components (number of unique labels).
# labels: ndarray
# The length-N array of labels of the connected components.
n_components = len(df["labels"].unique())
n_components = df["labels"].nunique()
sorted_df = df.sort_values("vertex")
if return_labels:
if is_cp_matrix_type(input_type):
Expand Down
6 changes: 5 additions & 1 deletion python/cugraph/cugraph/dask/structure/mg_property_graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -379,7 +379,11 @@ def get_vertices(self, selection=None):
vert_sers = self.__get_all_vertices_series()
if vert_sers:
if self.__series_type is dask_cudf.Series:
return dask_cudf.concat(vert_sers, ignore_index=True).unique()
return (
dask_cudf.concat(vert_sers, ignore_index=True)
.unique()
.sort_values()
)
else:
raise TypeError("dataframe must be a CUDF Dask dataframe.")
return self.__series_type()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -397,7 +397,7 @@ def view_edge_list(self):
df[dst] : cudf.Series
contains the destination index for each edge
df[weight] : cusd.Series
df[weight] : cudf.Series
Column is only present for weighted Graph,
then containing the weight value for each edge
"""
Expand Down Expand Up @@ -1179,7 +1179,8 @@ def edges(self):

def nodes(self):
"""
Returns all the nodes in the graph as a cudf.Series.
Returns all the nodes in the graph as a cudf.Series, in order of appearance
in the edgelist (source column first, then destination column).
If multi columns vertices, return a cudf.DataFrame.
"""
if self.edgelist is not None:
Expand Down
4 changes: 2 additions & 2 deletions python/cugraph/cugraph/structure/hypergraph.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2020-2022, NVIDIA CORPORATION.
# Copyright (c) 2020-2023, NVIDIA CORPORATION.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
Expand Down Expand Up @@ -325,7 +325,7 @@ def _create_entity_nodes(

for key, col in events[columns].items():
cat = categories.get(key, key)
col = col.unique()
col = col.unique().sort_values()
col = col.nans_to_nulls().dropna() if dropna else col
if len(col) == 0:
continue
Expand Down
9 changes: 4 additions & 5 deletions python/cugraph/cugraph/structure/property_graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -488,7 +488,7 @@ def get_num_edges(self, type=None):
def get_vertices(self, selection=None):
"""
Return a Series containing the unique vertex IDs contained in both
the vertex and edge property data.
the vertex and edge property data in ascending order.
Selection is not yet supported.
Parameters
Expand Down Expand Up @@ -530,12 +530,11 @@ def get_vertices(self, selection=None):
if vert_sers:
if self.__series_type is cudf.Series:
return self.__series_type(
cudf.concat(vert_sers, ignore_index=True).unique()
cudf.concat(vert_sers, ignore_index=True).unique().sort_values()
)
else:
return self.__series_type(
pd.concat(vert_sers, ignore_index=True).unique()
)
x = pd.Series(pd.concat(vert_sers, ignore_index=True).unique())
return self.__series_type(x.sort_values())
return self.__series_type()

def vertices_ids(self):
Expand Down
2 changes: 1 addition & 1 deletion python/cugraph/cugraph/tests/structure/test_graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -646,7 +646,7 @@ def test_bipartite_api(graph_file):
# This test only tests the functionality of adding set of nodes and
# retrieving them. The datasets currently used are not truly bipartite.
cu_M = utils.read_csv_file(graph_file)
nodes = cudf.concat([cu_M["0"], cu_M["1"]]).unique()
nodes = cudf.concat([cu_M["0"], cu_M["1"]]).unique().sort_values()

# Create set of nodes for partition
set1_exp = cudf.Series(nodes[0 : int(len(nodes) / 2)])
Expand Down

0 comments on commit 1e6da2b

Please sign in to comment.