support importing parquet files

bigbio · May 31, 2024 · 33634c4 · 33634c4
1 parent 16002e7
commit 33634c4
Showing 1 changed file with 26 additions and 1 deletion.
diff --git a/fsspark/utils/io.py b/fsspark/utils/io.py
@@ -16,7 +16,7 @@ def import_table(path: str,
     Import tsv file as Spark DataFrame.
 
     :param path: File path
-    :param header:
+    :param header: True if the first row is header.
     :param sep: Column separator
     :param n_partitions: Minimal number of partitions
 
@@ -39,6 +39,31 @@ def import_table(path: str,
     return sdf
 
 
+def import_parquet(path: str,
+                   header: bool = True) -> pyspark.sql.DataFrame:
+    """
+    Import parquet file as Spark DataFrame.
+
+    :param path: File path
+    :param header: True if the first row is header.
+
+    :return: Spark DataFrame
+    """
+
+    _sc = pyspark.sql.SparkSession.getActiveSession()
+
+    if _sc is None:
+        raise ValueError("Active Spark Session not found...")
+
+    sdf = (_sc
+           .read
+           .option("header", header)
+           .option("inferSchema", "true")
+           .parquet(path)
+           )
+    return sdf
+
+
 def import_table_as_psdf(path: str,
                          sep: str = "\t",
                          n_partitions: int = 5) -> pyspark.pandas.DataFrame: