From 4facb2c6b7b807d5b87ecce77f5be0323781410b Mon Sep 17 00:00:00 2001 From: Steven Date: Fri, 24 Mar 2023 12:10:40 -0700 Subject: [PATCH 1/4] add how to process parquet files --- docs/source/_toctree.yml | 4 +- docs/source/parquet_process.mdx | 226 ++++++++++++++++++++++++++++++++ 2 files changed, 229 insertions(+), 1 deletion(-) create mode 100644 docs/source/parquet_process.mdx diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml index b35ddb653d..c9cd7bf511 100644 --- a/docs/source/_toctree.yml +++ b/docs/source/_toctree.yml @@ -13,7 +13,9 @@ - local: first_rows title: Preview a dataset - local: parquet - title: List parquet files + title: List Parquet files + - local: parquet_process + title: Process Parquet files - title: Conceptual Guides sections: - local: configs_and_splits diff --git a/docs/source/parquet_process.mdx b/docs/source/parquet_process.mdx new file mode 100644 index 0000000000..ac968e3250 --- /dev/null +++ b/docs/source/parquet_process.mdx @@ -0,0 +1,226 @@ +# Process Parquet files + +[Parquet](https://parquet.apache.org/docs/) files are column-based and they really shine when you're working with big data. Datasets Server automatically converts and stores datasets on the Hub as Parquet files, which you can learn more about in the [List Parquet files](parquet) guide. There are several ways you can work with Parquet files, and this guide will show you how to: + +- read Parquet files with Pandas and Polars +- load, read, and query Parquet files with DuckDB and DuckDB-Wasm + +## Polars + +[Polars](https://pola-rs.github.io/polars-book/user-guide/introduction.html) is a fast DataFrame library written in Rust with Arrow as its foundation. + +Let's start by grabbing the URLs to the `train` split of the [`blog_authorship_corpus`](https://huggingface.co/datasets/blog_authorship_corpus) dataset: + +```py +r = requests.get("https://datasets-server.huggingface.co/parquet?dataset=blog_authorship_corpus") +j = r.json() +urls = [f['url'] for f in j['parquet_files'] if f['split'] == 'train'] +urls +['https://huggingface.co/datasets/blog_authorship_corpus/resolve/refs%2Fconvert%2Fparquet/blog_authorship_corpus/blog_authorship_corpus-train-00000-of-00002.parquet', + 'https://huggingface.co/datasets/blog_authorship_corpus/resolve/refs%2Fconvert%2Fparquet/blog_authorship_corpus/blog_authorship_corpus-train-00001-of-00002.parquet'] +``` + +To read from a single Parquet file, use the [`read_parquet`](https://pola-rs.github.io/polars/py-polars/html/reference/api/polars.read_parquet.html) function to read them into a DataFrame: + +```py +import polars as pl + +df = ( + pl.read_parquet("https://huggingface.co/datasets/blog_authorship_corpus/resolve/refs%2Fconvert%2Fparquet/blog_authorship_corpus/blog_authorship_corpus-train-00000-of-00002.parquet") + .groupby("horoscope") + .agg( + [ + pl.count(), + pl.col("text").str.n_chars().mean().alias("avg_blog_length") + ] + ) + .sort("avg_blog_length", descending=True) + .limit(5) +) +print(df) +shape: (5, 3) +┌───────────┬───────┬─────────────────┐ +│ horoscope ┆ count ┆ avg_blog_length │ +│ --- ┆ --- ┆ --- │ +│ str ┆ u32 ┆ f64 │ +╞═══════════╪═══════╪═════════════════╡ +│ Aquarius ┆ 34062 ┆ 1129.218836 │ +│ Cancer ┆ 41509 ┆ 1098.366812 │ +│ Capricorn ┆ 33961 ┆ 1073.2002 │ +│ Libra ┆ 40302 ┆ 1072.071833 │ +│ Leo ┆ 40587 ┆ 1064.053687 │ +└───────────┴───────┴─────────────────┘ +``` + +To read multiple Parquet files, you'll also need to use the [`concat`](https://pola-rs.github.io/polars/py-polars/html/reference/api/polars.concat.html) 
function to concatenate them into a single DataFrame: + +```py +import polars as pl +df = ( + pl.concat([pl.read_parquet(url) for url in urls]) + .groupby("horoscope") + .agg( + [ + pl.count(), + pl.col("text").str.n_chars().mean().alias("avg_blog_length") + ] + ) + .sort("avg_blog_length", descending=True) + .limit(5) +) +print(df) +shape: (5, 3) +┌─────────────┬───────┬─────────────────┐ +│ horoscope ┆ count ┆ avg_blog_length │ +│ --- ┆ --- ┆ --- │ +│ str ┆ u32 ┆ f64 │ +╞═════════════╪═══════╪═════════════════╡ +│ Aquarius ┆ 49568 ┆ 1125.830677 │ +│ Cancer ┆ 63512 ┆ 1097.956087 │ +│ Libra ┆ 60304 ┆ 1060.611054 │ +│ Capricorn ┆ 49402 ┆ 1059.555261 │ +│ Sagittarius ┆ 50431 ┆ 1057.458984 │ +└─────────────┴───────┴─────────────────┘ +``` + +### Lazy API + +Polars offers a [lazy API](https://pola-rs.github.io/polars-book/user-guide/lazy-api/intro.html) that is more performant and memory-efficient for large Parquet files. It keeps track of what you want to do, and only when you're ready, Polars executes the query. This way, the lazy API doesn't load everything into RAM before you even execute the query. It allows you to work with datasets larger than your available RAM. + +To lazily read a Parquet file, use the [`scan_parquet`](https://pola-rs.github.io/polars/py-polars/html/reference/api/polars.scan_parquet.html) function. Then, execute the entire query with the [`collect`](https://pola-rs.github.io/polars/py-polars/html/reference/lazyframe/api/polars.LazyFrame.collect.html) function: + +```py +import polars as pl + +q = ( + pl.scan_parquet("https://huggingface.co/datasets/blog_authorship_corpus/resolve/refs%2Fconvert%2Fparquet/blog_authorship_corpus/blog_authorship_corpus-train-00000-of-00002.parquet") + .groupby("horoscope") + .agg( + [ + pl.count(), + pl.col("text").str.n_chars().mean().alias("avg_blog_length") + ] + ) + .sort("avg_blog_length", descending=True) + .limit(5) +) +df = q.collect() +``` + +## Pandas + +You can also use the popular Pandas DataFrame library to read Parquet files. + +To read from a single Parquet file, use the [`read_parquet`](https://pandas.pydata.org/docs/reference/api/pandas.read_parquet.html) function to read them into a DataFrame: + +```py +import pandas as pd + +df = ( + pd.read_parquet("https://huggingface.co/datasets/blog_authorship_corpus/resolve/refs%2Fconvert%2Fparquet/blog_authorship_corpus/blog_authorship_corpus-train-00000-of-00002.parquet") + .groupby('horoscope')['text'] + .apply(lambda x: x.str.len().mean()) + .sort_values(ascending=False) + .head(5) +) +``` + +To read multiple Parquet files, you'll also need to use the [`concat`](https://pandas.pydata.org/docs/reference/api/pandas.concat.html) function to concatenate them into a single DataFrame: + +```py +df = ( + pd.concat([pd.read_parquet(url) for url in urls]) + .groupby('horoscope')['text'] + .apply(lambda x: x.str.len().mean()) + .sort_values(ascending=False) + .head(5) +) +``` + +## DuckDB + +[DuckDB](https://duckdb.org/docs/) is a database that supports reading and querying Parquet files really fast. 
Begin by creating a connection to DuckDB, and then install and load the [`httpfs`](https://duckdb.org/docs/extensions/httpfs.html) extension to read and write remote files: + + + +```py +import duckdb + +url = "https://huggingface.co/datasets/blog_authorship_corpus/resolve/refs%2Fconvert%2Fparquet/blog_authorship_corpus/blog_authorship_corpus-train-00000-of-00002.parquet" + +con = duckdb.connect() +con.execute("INSTALL httpfs;") +con.execute("LOAD httpfs;") +``` + + +```js +var duckdb = require('duckdb'); +var db = new duckdb.Database(':memory:'); +var con = db.connect(); +con.exec('INSTALL httpfs'); +con.exec('LOAD httpfs'); + +const url = "https://huggingface.co/datasets/blog_authorship_corpus/resolve/refs%2Fconvert%2Fparquet/blog_authorship_corpus/blog_authorship_corpus-train-00000-of-00002.parquet" +``` + + + +Now you can write and execute your SQL query on the Parquet file: + + + +```py +con.sql(f"SELECT horoscope, count(*), AVG(LENGTH(text)) AS avg_blog_length FROM '{url}' GROUP BY horoscope ORDER BY avg_blog_length DESC LIMIT(5)")┌───────────┬──────────────┬────────────────────┐ +│ horoscope │ count_star() │ avg_blog_length │ +│ varchar │ int64 │ double │ +├───────────┼──────────────┼────────────────────┤ +│ Aquarius │ 34062 │ 1129.218836239798 │ +│ Cancer │ 41509 │ 1098.366812016671 │ +│ Capricorn │ 33961 │ 1073.2002002296751 │ +│ Libra │ 40302 │ 1072.0718326633914 │ +│ Leo │ 40587 │ 1064.0536871412028 │ +└───────────┴──────────────┴────────────────────┘ +``` + + +```js +con.all(`SELECT horoscope, count(*), AVG(LENGTH(text)) AS avg_blog_length FROM '${url}' GROUP BY horoscope ORDER BY avg_blog_length DESC LIMIT(5)`, function(err, res) { + if (err) { + throw err; + } + console.log(res) +}); +``` + + + +To query multiple files: + + + +```py +con.sql(f"SELECT horoscope, count(*), AVG(LENGTH(text)) AS avg_blog_length FROM read_parquet({urls[:2]}) GROUP BY horoscope ORDER BY avg_blog_length DESC LIMIT(5)") +┌─────────────┬──────────────┬────────────────────┐ +│ horoscope │ count_star() │ avg_blog_length │ +│ varchar │ int64 │ double │ +├─────────────┼──────────────┼────────────────────┤ +│ Aquarius │ 49568 │ 1125.8306770497095 │ +│ Cancer │ 63512 │ 1097.95608703867 │ +│ Libra │ 60304 │ 1060.6110539931017 │ +│ Capricorn │ 49402 │ 1059.5552609206104 │ +│ Sagittarius │ 50431 │ 1057.4589835616982 │ +└─────────────┴──────────────┴────────────────────┘ +``` + + +```js +con.all(`SELECT horoscope, count(*), AVG(LENGTH(text)) AS avg_blog_length FROM read_parquet(${JSON.stringify(urls)}) GROUP BY horoscope ORDER BY avg_blog_length DESC LIMIT(5)`, function(err, res) { + if (err) { + throw err; + } + console.log(res) +}); +``` + + \ No newline at end of file From 4415345d5326633b4b5c69038a3eeab4b3e7efbb Mon Sep 17 00:00:00 2001 From: Steven Date: Mon, 27 Mar 2023 11:38:12 -0700 Subject: [PATCH 2/4] apply feedback --- docs/source/parquet_process.mdx | 39 ++++++++++++++++++++------------- 1 file changed, 24 insertions(+), 15 deletions(-) diff --git a/docs/source/parquet_process.mdx b/docs/source/parquet_process.mdx index ac968e3250..c1624f69a9 100644 --- a/docs/source/parquet_process.mdx +++ b/docs/source/parquet_process.mdx @@ -1,15 +1,21 @@ -# Process Parquet files +# Query datasets from Datasets Server -[Parquet](https://parquet.apache.org/docs/) files are column-based and they really shine when you're working with big data. Datasets Server automatically converts and stores datasets on the Hub as Parquet files, which you can learn more about in the [List Parquet files](parquet) guide. 
There are several ways you can work with Parquet files, and this guide will show you how to: +Datasets Server automatically converts and publishes datasets on the Hub as Parquet files.[Parquet](https://parquet.apache.org/docs/) files are column-based and they really shine when you're working with big data. There are several ways you can work with Parquet files, and this guide will show you how to: -- read Parquet files with Pandas and Polars -- load, read, and query Parquet files with DuckDB and DuckDB-Wasm +- read and query Parquet files with Pandas and Polars +- access, read and query Parquet files with DuckDB and DuckDB-Wasm -## Polars +## Polars -[Polars](https://pola-rs.github.io/polars-book/user-guide/introduction.html) is a fast DataFrame library written in Rust with Arrow as its foundation. +[Polars](https://pola-rs.github.io/polars-book/user-guide/introduction.html) is a fast DataFrame library written in Rust with Arrow as its foundation. -Let's start by grabbing the URLs to the `train` split of the [`blog_authorship_corpus`](https://huggingface.co/datasets/blog_authorship_corpus) dataset: + + +💡 Learn more about how to get the dataset URLs in the [List Parquet files](parquet) guide. + + + +Let's start by grabbing the URLs to the `train` split of the [`blog_authorship_corpus`](https://huggingface.co/datasets/blog_authorship_corpus) dataset from Datasets Server: ```py r = requests.get("https://datasets-server.huggingface.co/parquet?dataset=blog_authorship_corpus") @@ -20,7 +26,7 @@ urls 'https://huggingface.co/datasets/blog_authorship_corpus/resolve/refs%2Fconvert%2Fparquet/blog_authorship_corpus/blog_authorship_corpus-train-00001-of-00002.parquet'] ``` -To read from a single Parquet file, use the [`read_parquet`](https://pola-rs.github.io/polars/py-polars/html/reference/api/polars.read_parquet.html) function to read them into a DataFrame: +To read from a single Parquet file, use the [`read_parquet`](https://pola-rs.github.io/polars/py-polars/html/reference/api/polars.read_parquet.html) function to read it into a DataFrame and then execute your query: ```py import polars as pl @@ -52,7 +58,7 @@ shape: (5, 3) └───────────┴───────┴─────────────────┘ ``` -To read multiple Parquet files, you'll also need to use the [`concat`](https://pola-rs.github.io/polars/py-polars/html/reference/api/polars.concat.html) function to concatenate them into a single DataFrame: +To read multiple Parquet files, you'll need to use the [`concat`](https://pola-rs.github.io/polars/py-polars/html/reference/api/polars.concat.html) function to concatenate the files into a single DataFrame: ```py import polars as pl @@ -85,9 +91,9 @@ shape: (5, 3) ### Lazy API -Polars offers a [lazy API](https://pola-rs.github.io/polars-book/user-guide/lazy-api/intro.html) that is more performant and memory-efficient for large Parquet files. It keeps track of what you want to do, and only when you're ready, Polars executes the query. This way, the lazy API doesn't load everything into RAM before you even execute the query. It allows you to work with datasets larger than your available RAM. +Polars offers a [lazy API](https://pola-rs.github.io/polars-book/user-guide/lazy-api/intro.html) that is more performant and memory-efficient for large Parquet files. The LazyFrame API keeps track of what you want to do, and it'll only execute the entire query when you're ready. This way, the lazy API doesn't load everything into RAM beforehand, and it allows you to work with datasets larger than your available RAM. 
-To lazily read a Parquet file, use the [`scan_parquet`](https://pola-rs.github.io/polars/py-polars/html/reference/api/polars.scan_parquet.html) function. Then, execute the entire query with the [`collect`](https://pola-rs.github.io/polars/py-polars/html/reference/lazyframe/api/polars.LazyFrame.collect.html) function: +To lazily read a Parquet file, use the [`scan_parquet`](https://pola-rs.github.io/polars/py-polars/html/reference/api/polars.scan_parquet.html) function instead. Then, execute the entire query with the [`collect`](https://pola-rs.github.io/polars/py-polars/html/reference/lazyframe/api/polars.LazyFrame.collect.html) function: ```py import polars as pl @@ -111,7 +117,7 @@ df = q.collect() You can also use the popular Pandas DataFrame library to read Parquet files. -To read from a single Parquet file, use the [`read_parquet`](https://pandas.pydata.org/docs/reference/api/pandas.read_parquet.html) function to read them into a DataFrame: +To read from a single Parquet file, use the [`read_parquet`](https://pandas.pydata.org/docs/reference/api/pandas.read_parquet.html) function to read it into a DataFrame: ```py import pandas as pd @@ -125,7 +131,7 @@ df = ( ) ``` -To read multiple Parquet files, you'll also need to use the [`concat`](https://pandas.pydata.org/docs/reference/api/pandas.concat.html) function to concatenate them into a single DataFrame: +To read multiple Parquet files, you'll need to use the [`concat`](https://pandas.pydata.org/docs/reference/api/pandas.concat.html) function to concatenate the files into a single DataFrame: ```py df = ( @@ -171,7 +177,8 @@ Now you can write and execute your SQL query on the Parquet file: ```py -con.sql(f"SELECT horoscope, count(*), AVG(LENGTH(text)) AS avg_blog_length FROM '{url}' GROUP BY horoscope ORDER BY avg_blog_length DESC LIMIT(5)")┌───────────┬──────────────┬────────────────────┐ +con.sql(f"SELECT horoscope, count(*), AVG(LENGTH(text)) AS avg_blog_length FROM '{url}' GROUP BY horoscope ORDER BY avg_blog_length DESC LIMIT(5)") +┌───────────┬──────────────┬────────────────────┐ │ horoscope │ count_star() │ avg_blog_length │ │ varchar │ int64 │ double │ ├───────────┼──────────────┼────────────────────┤ @@ -223,4 +230,6 @@ con.all(`SELECT horoscope, count(*), AVG(LENGTH(text)) AS avg_blog_length FROM r }); ``` - \ No newline at end of file + + +There is also [DuckDB-Wasm](https://duckdb.org/docs/api/wasm), a package powered by WebAssembly for running DuckDB in a browser. This could be useful, for instance, if you want to create a web app to query Parquet files from the browser! 
\ No newline at end of file From c51188349b80f82adfb87e934fc1b4e6ccc64438 Mon Sep 17 00:00:00 2001 From: Steven Date: Mon, 27 Mar 2023 11:44:22 -0700 Subject: [PATCH 3/4] fix toctree title --- docs/source/_toctree.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml index c9cd7bf511..0da4770d21 100644 --- a/docs/source/_toctree.yml +++ b/docs/source/_toctree.yml @@ -15,7 +15,7 @@ - local: parquet title: List Parquet files - local: parquet_process - title: Process Parquet files + title: Query datasets from Datasets Server - title: Conceptual Guides sections: - local: configs_and_splits From 1216d8b12876aeb1f6c530ce7a375e9347d74a96 Mon Sep 17 00:00:00 2001 From: Steven Date: Tue, 28 Mar 2023 10:23:55 -0700 Subject: [PATCH 4/4] apply feedback/light edits --- docs/source/parquet_process.mdx | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/docs/source/parquet_process.mdx b/docs/source/parquet_process.mdx index c1624f69a9..829465e42a 100644 --- a/docs/source/parquet_process.mdx +++ b/docs/source/parquet_process.mdx @@ -1,9 +1,9 @@ # Query datasets from Datasets Server -Datasets Server automatically converts and publishes datasets on the Hub as Parquet files.[Parquet](https://parquet.apache.org/docs/) files are column-based and they really shine when you're working with big data. There are several ways you can work with Parquet files, and this guide will show you how to: +Datasets Server automatically converts and publishes datasets on the Hub as Parquet files. [Parquet](https://parquet.apache.org/docs/) files are column-based and they shine when you're working with big data. There are several ways you can work with Parquet files, and this guide will show you how to: - read and query Parquet files with Pandas and Polars -- access, read and query Parquet files with DuckDB and DuckDB-Wasm +- connect, read and query Parquet files with DuckDB and DuckDB-Wasm ## Polars @@ -58,7 +58,7 @@ shape: (5, 3) └───────────┴───────┴─────────────────┘ ``` -To read multiple Parquet files, you'll need to use the [`concat`](https://pola-rs.github.io/polars/py-polars/html/reference/api/polars.concat.html) function to concatenate the files into a single DataFrame: +To read multiple Parquet files - for example, if the dataset is sharded - you'll need to use the [`concat`](https://pola-rs.github.io/polars/py-polars/html/reference/api/polars.concat.html) function to concatenate the files into a single DataFrame: ```py import polars as pl @@ -131,7 +131,7 @@ df = ( ) ``` -To read multiple Parquet files, you'll need to use the [`concat`](https://pandas.pydata.org/docs/reference/api/pandas.concat.html) function to concatenate the files into a single DataFrame: +To read multiple Parquet files - for example, if the dataset is sharded - you'll need to use the [`concat`](https://pandas.pydata.org/docs/reference/api/pandas.concat.html) function to concatenate the files into a single DataFrame: ```py df = ( @@ -202,7 +202,7 @@ con.all(`SELECT horoscope, count(*), AVG(LENGTH(text)) AS avg_blog_length FROM ' -To query multiple files: +To query multiple files - for example, if the dataset is sharded: @@ -232,4 +232,4 @@ con.all(`SELECT horoscope, count(*), AVG(LENGTH(text)) AS avg_blog_length FROM r -There is also [DuckDB-Wasm](https://duckdb.org/docs/api/wasm), a package powered by WebAssembly for running DuckDB in a browser. This could be useful, for instance, if you want to create a web app to query Parquet files from the browser! 
\ No newline at end of file
+[DuckDB-Wasm](https://duckdb.org/docs/api/wasm), a package powered by WebAssembly, is also available for running DuckDB in a browser. This could be useful, for instance, if you want to create a web app to query Parquet files from the browser!
\ No newline at end of file
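
For the browser scenario mentioned in that last line, here is a minimal sketch of what the DuckDB-Wasm setup could look like, assuming the `@duckdb/duckdb-wasm` npm package and a build setup that allows top-level `await`; the worker and bundle boilerplate follows the upstream DuckDB-Wasm examples and may need adjusting for your bundler:

```js
import * as duckdb from '@duckdb/duckdb-wasm';

// Pick a WebAssembly bundle matching the current browser's capabilities
const bundles = duckdb.getJsDelivrBundles();
const bundle = await duckdb.selectBundle(bundles);

// Spin up the DuckDB worker and instantiate the asynchronous database
const workerUrl = URL.createObjectURL(
  new Blob([`importScripts("${bundle.mainWorker}");`], { type: 'text/javascript' })
);
const worker = new Worker(workerUrl);
const logger = new duckdb.ConsoleLogger();
const db = new duckdb.AsyncDuckDB(logger, worker);
await db.instantiate(bundle.mainModule, bundle.pthreadWorker);
URL.revokeObjectURL(workerUrl);

// Query one of the Parquet URLs from Datasets Server directly over HTTP
const url = "https://huggingface.co/datasets/blog_authorship_corpus/resolve/refs%2Fconvert%2Fparquet/blog_authorship_corpus/blog_authorship_corpus-train-00000-of-00002.parquet";
const conn = await db.connect();
const result = await conn.query(`
  SELECT horoscope, count(*), AVG(LENGTH(text)) AS avg_blog_length
  FROM '${url}'
  GROUP BY horoscope
  ORDER BY avg_blog_length DESC
  LIMIT 5
`);
console.log(result.toArray()); // result is an Arrow table
await conn.close();
```

The worker-based `AsyncDuckDB` keeps query execution off the main thread, which helps keep the page responsive while scanning large Parquet shards in the browser.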