Skip to content

Commit

Permalink
support importing parquet files
Browse files Browse the repository at this point in the history
  • Loading branch information
enriquea committed May 31, 2024
1 parent 16002e7 commit 33634c4
Showing 1 changed file with 26 additions and 1 deletion.
27 changes: 26 additions & 1 deletion fsspark/utils/io.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ def import_table(path: str,
Import tsv file as Spark DataFrame.
:param path: File path
:param header:
:param header: True if the first row is header.
:param sep: Column separator
:param n_partitions: Minimal number of partitions
Expand All @@ -39,6 +39,31 @@ def import_table(path: str,
return sdf


def import_parquet(path: str,
header: bool = True) -> pyspark.sql.DataFrame:
"""
Import parquet file as Spark DataFrame.
:param path: File path
:param header: True if the first row is header.
:return: Spark DataFrame
"""

_sc = pyspark.sql.SparkSession.getActiveSession()

if _sc is None:
raise ValueError("Active Spark Session not found...")

sdf = (_sc
.read
.option("header", header)
.option("inferSchema", "true")
.parquet(path)
)
return sdf


def import_table_as_psdf(path: str,
sep: str = "\t",
n_partitions: int = 5) -> pyspark.pandas.DataFrame:
Expand Down

0 comments on commit 33634c4

Please sign in to comment.