diff --git a/fsspark/utils/io.py b/fsspark/utils/io.py index 59321a5..951c5cb 100644 --- a/fsspark/utils/io.py +++ b/fsspark/utils/io.py @@ -16,7 +16,7 @@ def import_table(path: str, Import tsv file as Spark DataFrame. :param path: File path - :param header: + :param header: True if the first row is header. :param sep: Column separator :param n_partitions: Minimal number of partitions @@ -39,6 +39,31 @@ def import_table(path: str, return sdf +def import_parquet(path: str, + header: bool = True) -> pyspark.sql.DataFrame: + """ + Import parquet file as Spark DataFrame. + + :param path: File path + :param header: True if the first row is header. + + :return: Spark DataFrame + """ + + _sc = pyspark.sql.SparkSession.getActiveSession() + + if _sc is None: + raise ValueError("Active Spark Session not found...") + + sdf = (_sc + .read + .option("header", header) + .option("inferSchema", "true") + .parquet(path) + ) + return sdf + + def import_table_as_psdf(path: str, sep: str = "\t", n_partitions: int = 5) -> pyspark.pandas.DataFrame: