rapidsai · rapids-bot · Nov 19, 2023 · Aug 30, 2023 · Sep 5, 2023 · Sep 7, 2023
@@ -120,9 +120,13 @@ The benchmark datasets are described below:
 | soc-twitter-2010  | 21,297,772 |   265,025,809 | No       | No       |
 
 **cit-Patents** : A citation graph that includes all citations made by patents granted between 1975 and 1999, totaling 16,522,438 citations.
+
 **soc-LiveJournal** : A graph of the LiveJournal social network.
+
 **europe_osm** : A graph of OpenStreetMap data for Europe.
+
 **hollywood** : A graph of movie actors where vertices are actors, and two actors are joined by an edge whenever they appeared in a movie together.
+
 **soc-twitter-2010** : A network of follower relationships from a snapshot of Twitter in 2010, where an edge from i to j indicates that j is a follower of i.
 
 _NOTE: the benchmark datasets were converted to a CSV format from their original format described in the reference URL below, and in doing so had edge weights and isolated vertices discarded._

@@ -39,3 +39,13 @@
 small_tree = Dataset(meta_path / "small_tree.yaml")
 toy_graph = Dataset(meta_path / "toy_graph.yaml")
 toy_graph_undirected = Dataset(meta_path / "toy_graph_undirected.yaml")
+
+# Benchmarking datasets: be mindful of memory usage
+# 250 MB
+soc_livejournal = Dataset(meta_path / "soc-livejournal1.yaml")
+# 965 MB
+cit_patents = Dataset(meta_path / "cit-patents.yaml")
+# 1.8 GB
+europe_osm = Dataset(meta_path / "europe_osm.yaml")
+# 1.5 GB
+hollywood = Dataset(meta_path / "hollywood.yaml")
@@ -20,38 +20,36 @@
 
 class DefaultDownloadDir:
     """
-    Maintains the path to the download directory used by Dataset instances.
+    Maintains a path to be used as a default download directory.
+
+    All DefaultDownloadDir instances are based on RAPIDS_DATASET_ROOT_DIR if
+    set, or _default_base_dir if not set.
+
     Instances of this class are typically shared by several Dataset instances
     in order to allow for the download directory to be defined and updated by
     a single object.
     """
+    _default_base_dir = Path.home() / ".cugraph/datasets"
 
-    def __init__(self):
-        self._path = Path(
-            os.environ.get("RAPIDS_DATASET_ROOT_DIR", Path.home() / ".cugraph/datasets")
-        )
+    def __init__(self, *, subdir=""):
+        """
+        subdir can be specified to provide a specialized dir under the base dir.
+        """
+        self._subdir = Path(subdir)
+        self.reset()
 
     @property
     def path(self):
-        """
-        If `path` is not set, set it to the environment variable
-        RAPIDS_DATASET_ROOT_DIR. If the variable is not set, default to the
-        user's home directory.
-        """
-        if self._path is None:
-            self._path = Path(
-                os.environ.get(
-                    "RAPIDS_DATASET_ROOT_DIR", Path.home() / ".cugraph/datasets"
-                )
-            )
-        return self._path
+        return self._path.absolute()
 
     @path.setter
     def path(self, new):
         self._path = Path(new)
 
-    def clear(self):
-        self._path = None
+    def reset(self):
+        self._basedir = Path(os.environ.get("RAPIDS_DATASET_ROOT_DIR",
+                                            self._default_base_dir))
+        self._path = self._basedir / self._subdir
 
 
 default_download_dir = DefaultDownloadDir()
@@ -69,7 +67,6 @@ class Dataset:
         information on the name, type, url link, data loading format, graph
         properties
     """
-
     def __init__(
         self,
         metadata_yaml_file=None,
@@ -159,15 +156,20 @@ def unload(self):
         """
         self._edgelist = None
 
-    def get_edgelist(self, download=False):
+    def get_edgelist(self, download=False, reader=cudf.read_csv):
         """
-        Return an Edgelist
+        Return a DataFrame that represents a graph edgelist.
 
         Parameters
         ----------
         download : Boolean (default=False)
             Automatically download the dataset from the 'url' location within
             the YAML file.
+
+        reader : callable (default=cudf.read_csv)
+            A callable to use to read the dataset. The callable must be
+            compatible with the pandas/cudf read_csv() function and return a
+            compatible DataFrame.
         """
         if self._edgelist is None:
             full_path = self.get_path()
@@ -183,7 +185,7 @@ def get_edgelist(self, download=False):
             header = None
             if isinstance(self.metadata["header"], int):
                 header = self.metadata["header"]
-            self._edgelist = cudf.read_csv(
+            self._edgelist = reader(
                 full_path,
                 delimiter=self.metadata["delim"],
                 names=self.metadata["col_names"],
@@ -219,6 +221,10 @@ def get_graph(
             dataset -if present- will be applied to the Graph. If the
             dataset does not contain weights, the Graph returned will
             be unweighted regardless of ignore_weights.
+
+        store_transposed : Boolean (default=False)
+            If True, stores the transpose of the adjacency matrix.  Required
+            for certain algorithms, such as pagerank.
         """
         if self._edgelist is None:
             self.get_edgelist(download)
@@ -237,20 +243,19 @@ def get_graph(
                 "(or subclass) type or instance, got: "
                 f"{type(create_using)}"
             )
-
         if len(self.metadata["col_names"]) > 2 and not (ignore_weights):
             G.from_cudf_edgelist(
                 self._edgelist,
-                source="src",
-                destination="dst",
-                edge_attr="wgt",
+                source=self.metadata["col_names"][0],
+                destination=self.metadata["col_names"][1],
+                edge_attr=self.metadata["col_names"][2],
                 store_transposed=store_transposed,
             )
         else:
             G.from_cudf_edgelist(
                 self._edgelist,
-                source="src",
-                destination="dst",
+                source=self.metadata["col_names"][0],
+                destination=self.metadata["col_names"][1],
                 store_transposed=store_transposed,
             )
         return G
@@ -331,18 +336,18 @@ def download_all(force=False):
 
 def set_download_dir(path):
     """
-    Set the download location fors datasets
+    Set the download location for datasets
 
     Parameters
     ----------
     path : String
         Location used to store datafiles
     """
     if path is None:
-        default_download_dir.clear()
+        default_download_dir.reset()
     else:
         default_download_dir.path = path
 
 
 def get_download_dir():
-    return default_download_dir.path.absolute()
+    return default_download_dir.path
@@ -0,0 +1,22 @@
+name: cit-Patents
+file_type: .csv
+description: A citation graph that includes all citations made by patents granted between 1975 and 1999, totaling 16,522,438 citations.
+author: NBER
+refs:
+  J. Leskovec, J. Kleinberg and C. Faloutsos. Graphs over Time Densification Laws, Shrinking Diameters and Possible Explanations. 
+  ACM SIGKDD International Conference on Knowledge Discovery and Data Mining (KDD), 2005.
+delim: " "
+header: None
+col_names:
+  - src
+  - dst
+col_types:
+  - int32
+  - int32
+has_loop: true
+is_directed: true
+is_multigraph: false
+is_symmetric: false
+number_of_edges: 16518948
+number_of_nodes: 3774768
+url: https://data.rapids.ai/cugraph/datasets/cit-Patents.csv
@@ -0,0 +1,21 @@
+name: europe_osm
+file_type: .csv
+description: A graph of OpenStreetMap data for Europe.
+author: M. Kobitzsch / Geofabrik GmbH
+refs:
+  Rossi, Ryan. Ahmed, Nesreen. The Network Data Repository with Interactive Graph Analytics and Visualization.
+delim: " "
+header: None
+col_names:
+  - src
+  - dst
+col_types:
+  - int32
+  - int32
+has_loop: false
+is_directed: false
+is_multigraph: false
+is_symmetric: true
+number_of_edges: 54054660
+number_of_nodes: 50912018
+url: https://data.rapids.ai/cugraph/datasets/europe_osm.csv
@@ -0,0 +1,26 @@
+name: hollywood
+file_type: .csv
+description:
+  A graph of movie actors where vertices are actors, and two actors are
+  joined by an edge whenever they appeared in a movie together.
+author: Laboratory for Web Algorithmics (LAW)
+refs:
+  The WebGraph Framework I Compression Techniques, Paolo Boldi
+  and Sebastiano Vigna, Proc. of the Thirteenth International
+  World Wide Web Conference (WWW 2004), 2004, Manhattan, USA,
+  pp. 595--601, ACM Press.
+delim: " "
+header: None
+col_names:
+  - src
+  - dst
+col_types:
+  - int32
+  - int32
+has_loop: false
+is_directed: false
+is_multigraph: false
+is_symmetric: true
+number_of_edges: 57515616
+number_of_nodes: 1139905
+url: https://data.rapids.ai/cugraph/datasets/hollywood.csv
@@ -0,0 +1,22 @@
+name: soc-LiveJournal1
+file_type: .csv
+description:  A graph of the LiveJournal social network.
+author: L. Backstrom, D. Huttenlocher, J. Kleinberg, X. Lan
+refs:
+  L. Backstrom, D. Huttenlocher, J. Kleinberg, X. Lan. Group Formation in        
+  Large Social Networks Membership, Growth, and Evolution. KDD, 2006.
+delim: " "
+header: None
+col_names:
+  - src
+  - dst
+col_types:
+  - int32
+  - int32
+has_loop: true
+is_directed: true
+is_multigraph: false
+is_symmetric: false
+number_of_edges: 68993773
+number_of_nodes: 4847571
+url: https://data.rapids.ai/cugraph/datasets/soc-LiveJournal1.csv
@@ -0,0 +1,22 @@
+name: soc-twitter-2010
+file_type: .csv
+description: A network of follower relationships from a snapshot of Twitter in 2010, where an edge from i to j indicates that j is a follower of i.
+author: H. Kwak, C. Lee, H. Park, S. Moon
+refs:
+  J. Yang, J. Leskovec. Temporal Variation in Online Media. ACM Intl.        
+  Conf. on Web Search and Data Mining (WSDM '11), 2011. 
+delim: " "
+header: None
+col_names:
+  - src
+  - dst
+col_types:
+  - int32
+  - int32
+has_loop: false
+is_directed: false
+is_multigraph: false
+is_symmetric: false
+number_of_edges: 530051354
+number_of_nodes: 21297772
+url: https://data.rapids.ai/cugraph/datasets/soc-twitter-2010.csv