From 6b20201d12e67fb575e79fdb3a8853859adf95f5 Mon Sep 17 00:00:00 2001 From: Yash Vekaria Date: Tue, 13 Aug 2024 00:26:38 -0700 Subject: [PATCH 01/23] A11Y technology usage queries --- .../a11y_overall_tech_usage_by_rank.sql | 61 +++++++++++++++++ .../third-parties/a11y_technology_usage.sql | 35 ++++++++++ .../a11y_technology_usage_by_rank.sql | 65 +++++++++++++++++++ 3 files changed, 161 insertions(+) create mode 100644 sql/2024/third-parties/a11y_overall_tech_usage_by_rank.sql create mode 100644 sql/2024/third-parties/a11y_technology_usage.sql create mode 100644 sql/2024/third-parties/a11y_technology_usage_by_rank.sql diff --git a/sql/2024/third-parties/a11y_overall_tech_usage_by_rank.sql b/sql/2024/third-parties/a11y_overall_tech_usage_by_rank.sql new file mode 100644 index 00000000000..20ffcd3ff1d --- /dev/null +++ b/sql/2024/third-parties/a11y_overall_tech_usage_by_rank.sql @@ -0,0 +1,61 @@ +#standardSQL +# Overall A11Y technology usage by domain rank + +WITH a11y_technologies AS ( + SELECT + _TABLE_SUFFIX AS client, + url + FROM + `httparchive.technologies.2024_06_01_*` + WHERE + category = 'Accessibility' +), + +pages AS ( + SELECT + _TABLE_SUFFIX AS client, + url, + rank_grouping + FROM + `httparchive.summary_pages.2024_06_01_*`, + UNNEST([1000, 10000, 100000, 1000000, 10000000]) AS rank_grouping + WHERE + rank <= rank_grouping +), + +rank_totals AS ( + SELECT + _TABLE_SUFFIX AS client, + rank_grouping, + COUNT(0) AS total + FROM + `httparchive.summary_pages.2024_06_01_*`, + UNNEST([1000, 10000, 100000, 1000000, 10000000]) AS rank_grouping + WHERE + rank <= rank_grouping + GROUP BY + client, + rank_grouping +) + +SELECT + client, + rank_grouping AS rank, + COUNT(DISTINCT url) AS freq, + total, + COUNT(DISTINCT url) / total AS pct +FROM + a11y_technologies +LEFT OUTER JOIN + pages +USING (client, url) +JOIN + rank_totals +USING (client, rank_grouping) +GROUP BY + rank_grouping, + total, + client +ORDER BY + client, + rank diff --git 
a/sql/2024/third-parties/a11y_technology_usage.sql b/sql/2024/third-parties/a11y_technology_usage.sql new file mode 100644 index 00000000000..926cd566fc7 --- /dev/null +++ b/sql/2024/third-parties/a11y_technology_usage.sql @@ -0,0 +1,35 @@ +#standardSQL +# A11Y technology usage + +WITH a11y_technologies AS ( + SELECT + _TABLE_SUFFIX AS client, + COUNT(DISTINCT url) AS freq + FROM + `httparchive.technologies.2024_06_01_*` + WHERE + category = 'Accessibility' + GROUP BY + client +), + +pages AS ( + SELECT + _TABLE_SUFFIX AS client, + COUNT(0) AS total + FROM + `httparchive.summary_pages.2024_06_01_*` + GROUP BY + client +) + +SELECT + client, + freq, + total, + freq / total AS pct +FROM + a11y_technologies +JOIN + pages +USING (client) diff --git a/sql/2024/third-parties/a11y_technology_usage_by_rank.sql b/sql/2024/third-parties/a11y_technology_usage_by_rank.sql new file mode 100644 index 00000000000..237b8d167e6 --- /dev/null +++ b/sql/2024/third-parties/a11y_technology_usage_by_rank.sql @@ -0,0 +1,65 @@ +#standardSQL +# A11Y technology usage by domain rank + +WITH a11y_technologies AS ( + SELECT + _TABLE_SUFFIX AS client, + app, + url + FROM + `httparchive.technologies.2024_06_01_*` + WHERE + category = 'Accessibility' +), + +pages AS ( + SELECT + _TABLE_SUFFIX AS client, + url, + rank_grouping + FROM + `httparchive.summary_pages.2024_06_01_*`, + UNNEST([1000, 10000, 100000, 1000000, 10000000]) AS rank_grouping + WHERE + rank <= rank_grouping +), + +rank_totals AS ( + SELECT + _TABLE_SUFFIX AS client, + rank_grouping, + COUNT(0) AS total + FROM + `httparchive.summary_pages.2024_06_01_*`, + UNNEST([1000, 10000, 100000, 1000000, 10000000]) AS rank_grouping + WHERE + rank <= rank_grouping + GROUP BY + client, + rank_grouping +) + +SELECT + client, + rank_grouping AS rank, + app, + COUNT(0) AS freq, + total, + COUNT(0) / total AS pct +FROM + a11y_technologies +LEFT OUTER JOIN + pages +USING (client, url) +JOIN + rank_totals +USING (client, rank_grouping) +GROUP BY + 
rank_grouping, + total, + client, + app +ORDER BY + client, + rank, + pct DESC From 4184b12b6be292ada71f4d36f1d1160b2d25bc7e Mon Sep 17 00:00:00 2001 From: Yash Vekaria Date: Tue, 13 Aug 2024 22:38:50 -0700 Subject: [PATCH 02/23] lighthouse and distribution related queries --- .../third-parties/compressed_images_by_3p.sql | 81 ++++++++++++++ sql/2024/third-parties/content_encoding.sql | 51 +++++++++ .../content_encoding_by_content_type.sql | 55 +++++++++ ...distribution_of_3XX_response_body_size.sql | 64 +++++++++++ ...ion_of_lighthouse_unminified_css_by_3p.sql | 61 ++++++++++ ...tion_of_lighthouse_unminified_js_by_3p.sql | 62 +++++++++++ ...ibution_of_lighthouse_unused_css_by_3p.sql | 62 +++++++++++ ...ribution_of_lighthouse_unused_js_by_3p.sql | 62 +++++++++++ ...lighthouse_uses_optimized_images_by_3p.sql | 61 ++++++++++ ...tion_of_size_and_time_by_third_parties.sql | 65 +++++++++++ ...of_third_parties_by_number_of_websites.sql | 65 +++++++++++ ...of_websites_by_number_of_third_parties.sql | 63 +++++++++++ ...tes_by_number_of_third_party_providers.sql | 63 +++++++++++ .../third-parties/iframe_allow_attribute.sql | 44 ++++++++ .../iframe_attribute_popular_hosts.sql | 53 +++++++++ ...ighthouse_average_unminified_css_by_3p.sql | 42 +++++++ ...lighthouse_average_unminified_js_by_3p.sql | 42 +++++++ .../lighthouse_third_party_facades.sql | 20 ++++ .../lighthouse_unminified_css_by_3p.sql | 70 ++++++++++++ .../lighthouse_unminified_js_by_3p.sql | 70 ++++++++++++ .../lighthouse_unminified_js_by_3p_by_url.sql | 75 +++++++++++++ ...unminified_uses_optimized_images_by_3p.sql | 70 ++++++++++++ .../lighthouse_unused_css_bytes_by_3p.sql | 70 ++++++++++++ .../lighthouse_unused_js_bytes_by_3p.sql | 70 ++++++++++++ .../number_of_third_parties_by_rank.sql | 77 +++++++++++++ ..._of_third_parties_by_rank_and_category.sql | 85 ++++++++++++++ ...umber_of_third_party_providers_by_rank.sql | 79 +++++++++++++ ...d_party_providers_by_rank_and_category.sql | 85 ++++++++++++++ 
sql/2024/third-parties/tao_by_third_party.sql | 104 ++++++++++++++++++ .../usage_of_lite_youtube_embed.sql | 37 +++++++ sql/2024/third-parties/usage_of_partytown.sql | 37 +++++++ 31 files changed, 1945 insertions(+) create mode 100644 sql/2024/third-parties/compressed_images_by_3p.sql create mode 100644 sql/2024/third-parties/content_encoding.sql create mode 100644 sql/2024/third-parties/content_encoding_by_content_type.sql create mode 100644 sql/2024/third-parties/distribution_of_3XX_response_body_size.sql create mode 100644 sql/2024/third-parties/distribution_of_lighthouse_unminified_css_by_3p.sql create mode 100644 sql/2024/third-parties/distribution_of_lighthouse_unminified_js_by_3p.sql create mode 100644 sql/2024/third-parties/distribution_of_lighthouse_unused_css_by_3p.sql create mode 100644 sql/2024/third-parties/distribution_of_lighthouse_unused_js_by_3p.sql create mode 100644 sql/2024/third-parties/distribution_of_lighthouse_uses_optimized_images_by_3p.sql create mode 100644 sql/2024/third-parties/distribution_of_size_and_time_by_third_parties.sql create mode 100644 sql/2024/third-parties/distribution_of_third_parties_by_number_of_websites.sql create mode 100644 sql/2024/third-parties/distribution_of_websites_by_number_of_third_parties.sql create mode 100644 sql/2024/third-parties/distribution_of_websites_by_number_of_third_party_providers.sql create mode 100644 sql/2024/third-parties/iframe_allow_attribute.sql create mode 100644 sql/2024/third-parties/iframe_attribute_popular_hosts.sql create mode 100644 sql/2024/third-parties/lighthouse_average_unminified_css_by_3p.sql create mode 100644 sql/2024/third-parties/lighthouse_average_unminified_js_by_3p.sql create mode 100644 sql/2024/third-parties/lighthouse_third_party_facades.sql create mode 100644 sql/2024/third-parties/lighthouse_unminified_css_by_3p.sql create mode 100644 sql/2024/third-parties/lighthouse_unminified_js_by_3p.sql create mode 100644 
sql/2024/third-parties/lighthouse_unminified_js_by_3p_by_url.sql create mode 100644 sql/2024/third-parties/lighthouse_unminified_uses_optimized_images_by_3p.sql create mode 100644 sql/2024/third-parties/lighthouse_unused_css_bytes_by_3p.sql create mode 100644 sql/2024/third-parties/lighthouse_unused_js_bytes_by_3p.sql create mode 100644 sql/2024/third-parties/number_of_third_parties_by_rank.sql create mode 100644 sql/2024/third-parties/number_of_third_parties_by_rank_and_category.sql create mode 100644 sql/2024/third-parties/number_of_third_party_providers_by_rank.sql create mode 100644 sql/2024/third-parties/number_of_third_party_providers_by_rank_and_category.sql create mode 100644 sql/2024/third-parties/tao_by_third_party.sql create mode 100644 sql/2024/third-parties/usage_of_lite_youtube_embed.sql create mode 100644 sql/2024/third-parties/usage_of_partytown.sql diff --git a/sql/2024/third-parties/compressed_images_by_3p.sql b/sql/2024/third-parties/compressed_images_by_3p.sql new file mode 100644 index 00000000000..a9b4db22682 --- /dev/null +++ b/sql/2024/third-parties/compressed_images_by_3p.sql @@ -0,0 +1,81 @@ +#standardSQL +# Compressed images (excluding SVG) by third parties + +WITH requests AS ( + SELECT + _TABLE_SUFFIX AS client, + pageid AS page, + url, + resp_content_encoding AS content_encoding, + type, + respBodySize AS size + FROM + `httparchive.summary_requests.2024_06_01_*` + WHERE + type = 'image' AND ( + resp_content_encoding = 'gzip' OR + resp_content_encoding = 'br' + ) AND NOT ( + resp_content_type LIKE 'image/svg%' OR + ENDS_WITH(url, '.svg') + ) +), + +third_party AS ( + SELECT + NET.HOST(domain) AS domain, + COUNT(DISTINCT page) AS page_usage + FROM + `httparchive.almanac.third_parties` tp + JOIN + requests r + ON NET.HOST(r.url) = NET.HOST(tp.domain) + WHERE + date = '2024-06-01' AND + category != 'hosting' + GROUP BY + domain + HAVING + page_usage >= 50 +) + +SELECT + client, + content_encoding, + domain, + size, + SUM(size) OVER 
(PARTITION BY client) AS total_size, + size / SUM(size) OVER (PARTITION BY client) AS pct_size, + num_requests, + total_requests, + pct_requests +FROM ( + SELECT + client, + content_encoding, + domain, + COUNT(0) AS num_requests, + SUM(size) AS size, + SUM(COUNT(0)) OVER (PARTITION BY client) AS total_requests, + COUNT(0) / SUM(COUNT(0)) OVER (PARTITION BY client) AS pct_requests, + RANK() OVER (PARTITION BY client, type, content_encoding ORDER BY COUNT(0) DESC) AS domain_rank + FROM + requests + LEFT JOIN + third_party + ON + NET.HOST(requests.url) = NET.HOST(third_party.domain) + WHERE + domain IS NOT NULL + GROUP BY + client, + type, + content_encoding, + domain +) +WHERE + domain_rank <= 100 +ORDER BY + client, + content_encoding, + size DESC diff --git a/sql/2024/third-parties/content_encoding.sql b/sql/2024/third-parties/content_encoding.sql new file mode 100644 index 00000000000..f79dad7fdbb --- /dev/null +++ b/sql/2024/third-parties/content_encoding.sql @@ -0,0 +1,51 @@ +#standardSQL +#content-encoding by third parties + +WITH requests AS ( + SELECT + _TABLE_SUFFIX AS client, + pageid AS page, + url, + resp_content_encoding AS content_encoding + FROM + `httparchive.summary_requests.2024_06_01_*` +), + +third_party AS ( + SELECT + NET.HOST(domain) AS domain, + COUNT(DISTINCT page) AS page_usage + FROM + `httparchive.almanac.third_parties` tp + JOIN + requests r + ON NET.HOST(r.url) = NET.HOST(tp.domain) + WHERE + date = '2024-06-01' AND + category != 'hosting' + GROUP BY + domain + HAVING + page_usage >= 50 +) + +SELECT + client, + content_encoding, + COUNT(0) AS num_requests, + SUM(COUNT(0)) OVER (PARTITION BY client) AS total, + COUNT(0) / SUM(COUNT(0)) OVER (PARTITION BY client) AS pct +FROM + requests +LEFT JOIN + third_party +ON + NET.HOST(requests.url) = NET.HOST(third_party.domain) +WHERE + domain IS NOT NULL +GROUP BY + client, + content_encoding +ORDER BY + client, + num_requests DESC diff --git 
a/sql/2024/third-parties/content_encoding_by_content_type.sql b/sql/2024/third-parties/content_encoding_by_content_type.sql new file mode 100644 index 00000000000..efd2a7cdae1 --- /dev/null +++ b/sql/2024/third-parties/content_encoding_by_content_type.sql @@ -0,0 +1,55 @@ +#standardSQL +#content-encoding by third parties by content-type + +WITH requests AS ( + SELECT + _TABLE_SUFFIX AS client, + pageid AS page, + url, + resp_content_encoding AS content_encoding, + type + FROM + `httparchive.summary_requests.2024_06_01_*` +), + +third_party AS ( + SELECT + NET.HOST(domain) AS domain, + COUNT(DISTINCT page) AS page_usage + FROM + `httparchive.almanac.third_parties` tp + JOIN + requests r + ON NET.HOST(r.url) = NET.HOST(tp.domain) + WHERE + date = '2024-06-01' AND + category != 'hosting' + GROUP BY + domain + HAVING + page_usage >= 50 +) + +SELECT + client, + type, + content_encoding, + COUNT(0) AS num_requests, + SUM(COUNT(0)) OVER (PARTITION BY client, type) AS total, + COUNT(0) / SUM(COUNT(0)) OVER (PARTITION BY client, type) AS pct +FROM + requests +LEFT JOIN + third_party +ON + NET.HOST(requests.url) = NET.HOST(third_party.domain) +WHERE + domain IS NOT NULL +GROUP BY + client, + type, + content_encoding +ORDER BY + client, + type, + num_requests DESC diff --git a/sql/2024/third-parties/distribution_of_3XX_response_body_size.sql b/sql/2024/third-parties/distribution_of_3XX_response_body_size.sql new file mode 100644 index 00000000000..9e4ca7954b3 --- /dev/null +++ b/sql/2024/third-parties/distribution_of_3XX_response_body_size.sql @@ -0,0 +1,64 @@ +#standardSQL +# Distribution of response body size by redirected third parties +# HTTP status codes documentation: https://developer.mozilla.org/docs/Web/HTTP/Status + +WITH requests AS ( + SELECT + _TABLE_SUFFIX AS client, + pageid AS page, + url, + status, + respBodySize AS body_size + FROM + `httparchive.summary_requests.2024_06_01_*` +), + +third_party AS ( + SELECT + domain, + category, + COUNT(DISTINCT page) AS 
page_usage + FROM + `httparchive.almanac.third_parties` tp + JOIN + requests r + ON NET.HOST(r.url) = NET.HOST(tp.domain) + WHERE + date = '2024-06-01' AND + category != 'hosting' + GROUP BY + domain, + category + HAVING + page_usage >= 50 +), + +base AS ( + SELECT + client, + domain, + IF(status BETWEEN 300 AND 399, 1, 0) AS redirected, + body_size + FROM + requests + LEFT JOIN + third_party + ON + NET.HOST(requests.url) = NET.HOST(third_party.domain) +) + +SELECT + client, + percentile, + APPROX_QUANTILES(body_size, 1000)[OFFSET(percentile * 10)] AS approx_redirect_body_size +FROM + base, + UNNEST(GENERATE_ARRAY(1, 100)) AS percentile +WHERE + redirected = 1 +GROUP BY + client, + percentile +ORDER BY + client, + percentile diff --git a/sql/2024/third-parties/distribution_of_lighthouse_unminified_css_by_3p.sql b/sql/2024/third-parties/distribution_of_lighthouse_unminified_css_by_3p.sql new file mode 100644 index 00000000000..798d1bd4558 --- /dev/null +++ b/sql/2024/third-parties/distribution_of_lighthouse_unminified_css_by_3p.sql @@ -0,0 +1,61 @@ +#standardSQL +# Pages with unminified third-party CSS + +CREATE TEMPORARY FUNCTION getUnminifiedCssUrls(audit STRING) +RETURNS ARRAY<STRUCT<url STRING, wastedBytes INT64>> LANGUAGE js AS ''' +try { + var $ = JSON.parse(audit); + return $.details.items.map(({url, wastedBytes}) => { + return {url, wastedBytes}; + }); +} catch (e) { + return []; +} +'''; + +WITH third_party_domains AS ( + SELECT DISTINCT + NET.HOST(domain) AS domain + FROM + `httparchive.almanac.third_parties` +), + +base AS ( + SELECT + client, + page, + SUM(IF(third_party_domains.domain IS NOT NULL, potential_savings, 0)) AS potential_third_party_savings, + SUM(potential_savings) AS potential_total_savings + FROM ( + SELECT + _TABLE_SUFFIX AS client, + lighthouse.url AS page, + NET.HOST(data.url) AS domain, + data.wastedBytes AS potential_savings + FROM + `httparchive.lighthouse.2024_06_01_*` AS lighthouse, + UNNEST(getUnminifiedCssUrls(JSON_EXTRACT(report, "$.audits['unminified-css']"))) AS 
data + ) AS potential_third_parties + LEFT OUTER JOIN + third_party_domains + ON + potential_third_parties.domain = third_party_domains.domain + GROUP BY + client, + page +) + +SELECT + client, + percentile, + APPROX_QUANTILES(potential_third_party_savings, 1000)[OFFSET(percentile * 10)] AS potential_third_party_savings_bytes, + APPROX_QUANTILES(potential_total_savings, 1000)[OFFSET(percentile * 10)] AS potential_total_savings_bytes +FROM + base, + UNNEST([10, 25, 50, 75, 90, 100]) AS percentile +GROUP BY + client, + percentile +ORDER BY + client, + percentile diff --git a/sql/2024/third-parties/distribution_of_lighthouse_unminified_js_by_3p.sql b/sql/2024/third-parties/distribution_of_lighthouse_unminified_js_by_3p.sql new file mode 100644 index 00000000000..99de8659ec9 --- /dev/null +++ b/sql/2024/third-parties/distribution_of_lighthouse_unminified_js_by_3p.sql @@ -0,0 +1,62 @@ +#standardSQL +# Pages with unminified third-party JavaScript + +CREATE TEMPORARY FUNCTION getUnminifiedJavascriptUrls(audit STRING) +RETURNS ARRAY<STRUCT<url STRING, wastedBytes INT64>> LANGUAGE js AS ''' +try { + var $ = JSON.parse(audit); + return $.details.items.map(({url, wastedBytes}) => { + return {url, wastedBytes}; + }); +} catch (e) { + return []; +} +'''; + + +WITH third_party_domains AS ( + SELECT DISTINCT + NET.HOST(domain) AS domain + FROM + `httparchive.almanac.third_parties` +), + +base AS ( + SELECT + client, + page, + SUM(IF(third_party_domains.domain IS NOT NULL, potential_savings, 0)) AS potential_third_party_savings, + SUM(potential_savings) AS potential_total_savings + FROM ( + SELECT + _TABLE_SUFFIX AS client, + lighthouse.url AS page, + NET.HOST(data.url) AS domain, + data.wastedBytes AS potential_savings + FROM + `httparchive.lighthouse.2024_06_01_*` AS lighthouse, + UNNEST(getUnminifiedJavascriptUrls(JSON_EXTRACT(report, "$.audits['unminified-javascript']"))) AS data + ) AS potential_third_parties + LEFT OUTER JOIN + third_party_domains + ON + potential_third_parties.domain = 
third_party_domains.domain + GROUP BY + client, + page +) + +SELECT + client, + percentile, + APPROX_QUANTILES(potential_third_party_savings, 1000)[OFFSET(percentile * 10)] AS potential_third_party_savings_bytes, + APPROX_QUANTILES(potential_total_savings, 1000)[OFFSET(percentile * 10)] AS potential_total_savings_bytes +FROM + base, + UNNEST([10, 25, 50, 75, 90, 100]) AS percentile +GROUP BY + client, + percentile +ORDER BY + client, + percentile diff --git a/sql/2024/third-parties/distribution_of_lighthouse_unused_css_by_3p.sql b/sql/2024/third-parties/distribution_of_lighthouse_unused_css_by_3p.sql new file mode 100644 index 00000000000..d760a82e9c2 --- /dev/null +++ b/sql/2024/third-parties/distribution_of_lighthouse_unused_css_by_3p.sql @@ -0,0 +1,62 @@ +#standardSQL +# Pages with unused third-party CSS + +CREATE TEMPORARY FUNCTION getUnusedCSSUrls(audit STRING) +RETURNS ARRAY<STRUCT<url STRING, wastedBytes INT64>> LANGUAGE js AS ''' +try { + var $ = JSON.parse(audit); + return $.details.items.map(({url, wastedBytes}) => { + return {url, wastedBytes}; + }); +} catch (e) { + return []; +} +'''; + + +WITH third_party_domains AS ( + SELECT DISTINCT + NET.HOST(domain) AS domain + FROM + `httparchive.almanac.third_parties` +), + +base AS ( + SELECT + client, + page, + SUM(IF(third_party_domains.domain IS NOT NULL, potential_savings, 0)) AS potential_third_party_savings, + SUM(potential_savings) AS potential_total_savings + FROM ( + SELECT + _TABLE_SUFFIX AS client, + lighthouse.url AS page, + NET.HOST(data.url) AS domain, + data.wastedBytes AS potential_savings + FROM + `httparchive.lighthouse.2024_06_01_*` AS lighthouse, + UNNEST(getUnusedCSSUrls(JSON_EXTRACT(report, "$.audits['unused-css-rules']"))) AS data + ) AS potential_third_parties + LEFT OUTER JOIN + third_party_domains + ON + potential_third_parties.domain = third_party_domains.domain + GROUP BY + client, + page +) + +SELECT + client, + percentile, + APPROX_QUANTILES(potential_third_party_savings, 1000)[OFFSET(percentile * 10)] AS 
potential_third_party_savings_bytes, + APPROX_QUANTILES(potential_total_savings, 1000)[OFFSET(percentile * 10)] AS potential_total_savings_bytes +FROM + base, + UNNEST([10, 25, 50, 75, 90, 100]) AS percentile +GROUP BY + client, + percentile +ORDER BY + client, + percentile diff --git a/sql/2024/third-parties/distribution_of_lighthouse_unused_js_by_3p.sql b/sql/2024/third-parties/distribution_of_lighthouse_unused_js_by_3p.sql new file mode 100644 index 00000000000..60688c36785 --- /dev/null +++ b/sql/2024/third-parties/distribution_of_lighthouse_unused_js_by_3p.sql @@ -0,0 +1,62 @@ +#standardSQL +# Pages with unused third-party JavaScript + +CREATE TEMPORARY FUNCTION getUnusedJavascriptUrls(audit STRING) +RETURNS ARRAY<STRUCT<url STRING, wastedBytes INT64>> LANGUAGE js AS ''' +try { + var $ = JSON.parse(audit); + return $.details.items.map(({url, wastedBytes}) => { + return {url, wastedBytes}; + }); +} catch (e) { + return []; +} +'''; + + +WITH third_party_domains AS ( + SELECT DISTINCT + NET.HOST(domain) AS domain + FROM + `httparchive.almanac.third_parties` +), + +base AS ( + SELECT + client, + page, + SUM(IF(third_party_domains.domain IS NOT NULL, potential_savings, 0)) AS potential_third_party_savings, + SUM(potential_savings) AS potential_total_savings + FROM ( + SELECT + _TABLE_SUFFIX AS client, + lighthouse.url AS page, + NET.HOST(data.url) AS domain, + data.wastedBytes AS potential_savings + FROM + `httparchive.lighthouse.2024_06_01_*` AS lighthouse, + UNNEST(getUnusedJavascriptUrls(JSON_EXTRACT(report, "$.audits['unused-javascript']"))) AS data + ) AS potential_third_parties + LEFT OUTER JOIN + third_party_domains + ON + potential_third_parties.domain = third_party_domains.domain + GROUP BY + client, + page +) + +SELECT + client, + percentile, + APPROX_QUANTILES(potential_third_party_savings, 1000)[OFFSET(percentile * 10)] AS potential_third_party_savings_bytes, + APPROX_QUANTILES(potential_total_savings, 1000)[OFFSET(percentile * 10)] AS potential_total_savings_bytes +FROM + base, + 
UNNEST([10, 25, 50, 75, 90, 100]) AS percentile +GROUP BY + client, + percentile +ORDER BY + client, + percentile diff --git a/sql/2024/third-parties/distribution_of_lighthouse_uses_optimized_images_by_3p.sql b/sql/2024/third-parties/distribution_of_lighthouse_uses_optimized_images_by_3p.sql new file mode 100644 index 00000000000..f4aebe8b831 --- /dev/null +++ b/sql/2024/third-parties/distribution_of_lighthouse_uses_optimized_images_by_3p.sql @@ -0,0 +1,61 @@ +#standardSQL +# Third-party pages with unoptimized images + +CREATE TEMPORARY FUNCTION getUnminifiedImageUrls(audit STRING) +RETURNS ARRAY<STRUCT<url STRING, wastedBytes INT64>> LANGUAGE js AS ''' +try { + var $ = JSON.parse(audit); + return $.details.items.map(({url, wastedBytes}) => { + return {url, wastedBytes}; + }); +} catch (e) { + return []; +} +'''; + +WITH third_party_domains AS ( + SELECT DISTINCT + NET.HOST(domain) AS domain + FROM + `httparchive.almanac.third_parties` +), + +base AS ( + SELECT + client, + page, + SUM(IF(third_party_domains.domain IS NOT NULL, potential_savings, 0)) AS potential_third_party_savings, + SUM(potential_savings) AS potential_total_savings + FROM ( + SELECT + _TABLE_SUFFIX AS client, + lighthouse.url AS page, + NET.HOST(data.url) AS domain, + data.wastedBytes AS potential_savings + FROM + `httparchive.lighthouse.2024_06_01_*` AS lighthouse, + UNNEST(getUnminifiedImageUrls(JSON_EXTRACT(report, "$.audits['uses-optimized-images']"))) AS data + ) AS potential_third_parties + LEFT OUTER JOIN + third_party_domains + ON + potential_third_parties.domain = third_party_domains.domain + GROUP BY + client, + page +) + +SELECT + client, + percentile, + APPROX_QUANTILES(potential_third_party_savings, 1000)[OFFSET(percentile * 10)] AS potential_third_party_savings_bytes, + APPROX_QUANTILES(potential_total_savings, 1000)[OFFSET(percentile * 10)] AS potential_total_savings_bytes +FROM + base, + UNNEST([10, 25, 50, 75, 90, 100]) AS percentile +GROUP BY + client, + percentile +ORDER BY + client, + percentile diff --git 
a/sql/2024/third-parties/distribution_of_size_and_time_by_third_parties.sql b/sql/2024/third-parties/distribution_of_size_and_time_by_third_parties.sql new file mode 100644 index 00000000000..88c7138ecbf --- /dev/null +++ b/sql/2024/third-parties/distribution_of_size_and_time_by_third_parties.sql @@ -0,0 +1,65 @@ +#standardSQL +# Distribution of third party requests size and time by category + +WITH requests AS ( + SELECT + _TABLE_SUFFIX AS client, + pageid AS page, + url, + respBodySize AS body_size, + time + FROM + `httparchive.summary_requests.2024_06_01_*` +), + +third_party AS ( + SELECT + domain, + category, + COUNT(DISTINCT page) AS page_usage + FROM + `httparchive.almanac.third_parties` tp + JOIN + requests r + ON NET.HOST(r.url) = NET.HOST(tp.domain) + WHERE + date = '2024-06-01' AND + category != 'hosting' + GROUP BY + domain, + category + HAVING + page_usage >= 50 +), + +base AS ( + SELECT + client, + category, + body_size, + time + FROM + requests + INNER JOIN + third_party + ON + NET.HOST(requests.url) = NET.HOST(third_party.domain) +) + +SELECT + client, + category, + percentile, + APPROX_QUANTILES(body_size, 1000)[OFFSET(percentile * 10)] AS body_size, + APPROX_QUANTILES(time, 1000)[OFFSET(percentile * 10)] AS time -- noqa: L010 +FROM + base, + UNNEST(GENERATE_ARRAY(1, 100)) AS percentile +GROUP BY + client, + category, + percentile +ORDER BY + client, + category, + percentile diff --git a/sql/2024/third-parties/distribution_of_third_parties_by_number_of_websites.sql b/sql/2024/third-parties/distribution_of_third_parties_by_number_of_websites.sql new file mode 100644 index 00000000000..5e648f75ec6 --- /dev/null +++ b/sql/2024/third-parties/distribution_of_third_parties_by_number_of_websites.sql @@ -0,0 +1,65 @@ +#standardSQL +# Distribution of third parties by number of websites + +WITH requests AS ( + SELECT + _TABLE_SUFFIX AS client, + pageid AS page, + url + FROM + `httparchive.summary_requests.2024_06_01_*` +), + +third_party AS ( + SELECT + 
domain, + canonicalDomain, + category, + COUNT(DISTINCT page) AS page_usage + FROM + `httparchive.almanac.third_parties` tp + JOIN + requests r + ON NET.HOST(r.url) = NET.HOST(tp.domain) + WHERE + date = '2024-06-01' AND + category != 'hosting' + GROUP BY + domain, + canonicalDomain, + category + HAVING + page_usage >= 50 +), + +base AS ( + SELECT + client, + canonicalDomain, + COUNT(DISTINCT page) AS pages_per_third_party + FROM + requests + LEFT JOIN + third_party + ON + NET.HOST(requests.url) = NET.HOST(third_party.domain) + WHERE + canonicalDomain IS NOT NULL + GROUP BY + client, + canonicalDomain +) + +SELECT + client, + percentile, + APPROX_QUANTILES(pages_per_third_party, 1000)[OFFSET(percentile * 10)] AS approx_pages_per_third_party +FROM + base, + UNNEST([10, 25, 50, 75, 90]) AS percentile +GROUP BY + client, + percentile +ORDER BY + client, + percentile diff --git a/sql/2024/third-parties/distribution_of_websites_by_number_of_third_parties.sql b/sql/2024/third-parties/distribution_of_websites_by_number_of_third_parties.sql new file mode 100644 index 00000000000..94cfc0d7464 --- /dev/null +++ b/sql/2024/third-parties/distribution_of_websites_by_number_of_third_parties.sql @@ -0,0 +1,63 @@ +#standardSQL +# Distribution of websites by number of third party + +WITH requests AS ( + SELECT + _TABLE_SUFFIX AS client, + pageid AS page, + url + FROM + `httparchive.summary_requests.2024_06_01_*` +), + +third_party AS ( + SELECT + domain, + canonicalDomain, + category, + COUNT(DISTINCT page) AS page_usage + FROM + `httparchive.almanac.third_parties` tp + JOIN + requests r + ON NET.HOST(r.url) = NET.HOST(tp.domain) + WHERE + date = '2024-06-01' AND + category != 'hosting' + GROUP BY + domain, + canonicalDomain, + category + HAVING + page_usage >= 50 +), + +base AS ( + SELECT + client, + page, + COUNT(domain) AS third_parties_per_page + FROM + requests + LEFT JOIN + third_party + ON + NET.HOST(requests.url) = NET.HOST(third_party.domain) + GROUP BY + client, + page +) 
+ +SELECT + client, + percentile, + APPROX_QUANTILES(third_parties_per_page, 1000)[OFFSET(percentile * 10)] AS approx_third_parties_per_page +FROM + base, + UNNEST([10, 25, 50, 75, 90]) AS percentile +GROUP BY + client, + percentile +ORDER BY + client, + percentile diff --git a/sql/2024/third-parties/distribution_of_websites_by_number_of_third_party_providers.sql b/sql/2024/third-parties/distribution_of_websites_by_number_of_third_party_providers.sql new file mode 100644 index 00000000000..162b1cc9f71 --- /dev/null +++ b/sql/2024/third-parties/distribution_of_websites_by_number_of_third_party_providers.sql @@ -0,0 +1,63 @@ +#standardSQL +# Distribution of websites by number of third party + +WITH requests AS ( + SELECT + _TABLE_SUFFIX AS client, + pageid AS page, + url + FROM + `httparchive.summary_requests.2024_06_01_*` +), + +third_party AS ( + SELECT + domain, + canonicalDomain, + category, + COUNT(DISTINCT page) AS page_usage + FROM + `httparchive.almanac.third_parties` tp + JOIN + requests r + ON NET.HOST(r.url) = NET.HOST(tp.domain) + WHERE + date = '2024-06-01' AND + category != 'hosting' + GROUP BY + domain, + canonicalDomain, + category + HAVING + page_usage >= 50 +), + +base AS ( + SELECT + client, + page, + COUNT(DISTINCT canonicalDomain) AS third_parties_per_page + FROM + requests + LEFT JOIN + third_party + ON + NET.HOST(requests.url) = NET.HOST(third_party.domain) + GROUP BY + client, + page +) + +SELECT + client, + percentile, + APPROX_QUANTILES(third_parties_per_page, 1000)[OFFSET(percentile * 10)] AS approx_third_parties_per_page +FROM + base, + UNNEST([10, 25, 50, 75, 90]) AS percentile +GROUP BY + client, + percentile +ORDER BY + client, + percentile diff --git a/sql/2024/third-parties/iframe_allow_attribute.sql b/sql/2024/third-parties/iframe_allow_attribute.sql new file mode 100644 index 00000000000..d887ab955c6 --- /dev/null +++ b/sql/2024/third-parties/iframe_allow_attribute.sql @@ -0,0 +1,44 @@ +#standardSQL +# usage of different directives 
for allow attribute on iframes + +CREATE TEMP FUNCTION getNumWithAllowAttribute(payload STRING) AS (( + SELECT + COUNT(0) + FROM + UNNEST(JSON_EXTRACT_ARRAY(JSON_EXTRACT_SCALAR(payload, '$._security'), '$.iframe-allow-sandbox')) AS iframeAttr + WHERE + JSON_EXTRACT_SCALAR(iframeAttr, '$.allow') IS NOT NULL +)); + +SELECT + client, + SPLIT(TRIM(allow_attr), ' ')[OFFSET(0)] AS directive, + total_iframes_with_allow, + COUNT(0) AS freq, + COUNT(0) / total_iframes_with_allow AS pct +FROM ( + SELECT + _TABLE_SUFFIX AS client, + JSON_EXTRACT_ARRAY(JSON_EXTRACT_SCALAR(payload, '$._security'), '$.iframe-allow-sandbox') AS iframeAttrs + FROM + `httparchive.pages.2024_06_01_*`), + UNNEST(iframeAttrs) AS iframeAttr, + UNNEST(REGEXP_EXTRACT_ALL(JSON_EXTRACT_SCALAR(iframeAttr, '$.allow'), r'(?i)([^,;]+)')) AS allow_attr +JOIN ( + SELECT + _TABLE_SUFFIX AS client, + SUM(getNumWithAllowAttribute(payload)) AS total_iframes_with_allow + FROM + `httparchive.pages.2024_06_01_*` + GROUP BY + client +) USING (client) +GROUP BY + client, + directive, + total_iframes_with_allow +HAVING + pct > 0.001 +ORDER BY + client, + pct DESC diff --git a/sql/2024/third-parties/iframe_attribute_popular_hosts.sql b/sql/2024/third-parties/iframe_attribute_popular_hosts.sql new file mode 100644 index 00000000000..7c62c92f3ff --- /dev/null +++ b/sql/2024/third-parties/iframe_attribute_popular_hosts.sql @@ -0,0 +1,53 @@ +#standardSQL +# most common hostnames of iframes that have the allow or sandbox attribute + +CREATE TEMP FUNCTION hasPolicy(attr STRING, policy_type STRING) +RETURNS BOOL DETERMINISTIC +LANGUAGE js AS ''' + const $ = JSON.parse(attr); + return $[policy_type] !== null; +'''; + +SELECT + client, + policy_type, + hostname, + COUNTIF(has_policy) AS freq, + total_iframes, + COUNTIF(has_policy) / total_iframes AS pct +FROM ( + SELECT + client, + policy_type, + JSON_EXTRACT_SCALAR(iframeAttr, '$.hostname') AS hostname, + hasPolicy(iframeAttr, policy_type) AS has_policy + FROM ( + SELECT + 
_TABLE_SUFFIX AS client, + JSON_EXTRACT_ARRAY(JSON_EXTRACT_SCALAR(payload, '$._security'), '$.iframe-allow-sandbox') AS iframeAttrs + FROM + `httparchive.pages.2024_06_01_*`), + UNNEST(iframeAttrs) AS iframeAttr, + UNNEST(['allow', 'sandbox']) AS policy_type +) +JOIN ( + SELECT + _TABLE_SUFFIX AS client, + SUM(ARRAY_LENGTH(JSON_EXTRACT_ARRAY(JSON_EXTRACT_SCALAR(payload, '$._security'), '$.iframe-allow-sandbox'))) AS total_iframes + FROM + `httparchive.pages.2024_06_01_*` + GROUP BY + client) +USING + (client) +GROUP BY + client, + total_iframes, + policy_type, + hostname +HAVING + pct > 0.001 +ORDER BY + client, + policy_type, + pct DESC diff --git a/sql/2024/third-parties/lighthouse_average_unminified_css_by_3p.sql b/sql/2024/third-parties/lighthouse_average_unminified_css_by_3p.sql new file mode 100644 index 00000000000..aa6c3998b28 --- /dev/null +++ b/sql/2024/third-parties/lighthouse_average_unminified_css_by_3p.sql @@ -0,0 +1,42 @@ +#standardSQL +# Pages with unminified CSS by 1P/3P +CREATE TEMPORARY FUNCTION getUnminifiedJsUrls(audit STRING) +RETURNS ARRAY<STRUCT<url STRING, wastedBytes INT64>> LANGUAGE js AS ''' +try { + var $ = JSON.parse(audit); + return $.details.items.map(({url, wastedBytes}) => { + return {url, wastedBytes}; + }); +} catch (e) { + return []; +} +'''; + +SELECT + client, + AVG(pct_1p_wasted_bytes) AS avg_pct_1p_wasted_bytes, + AVG(pct_3p_wasted_bytes) AS avg_pct_3p_wasted_bytes +FROM ( + SELECT + client, + page, + SUM(IF(is_3p, 0, wasted_bytes)) / SUM(wasted_bytes) AS pct_1p_wasted_bytes, + SUM(IF(is_3p, wasted_bytes, 0)) / SUM(wasted_bytes) AS pct_3p_wasted_bytes + FROM ( + SELECT + _TABLE_SUFFIX AS client, + lighthouse.url AS page, + NET.HOST(unminified.url) IS NOT NULL AND NET.HOST(unminified.url) IN ( + SELECT domain FROM `httparchive.almanac.third_parties` WHERE date = '2024-06-01' AND category != 'hosting' + ) AS is_3p, + unminified.wastedBytes AS wasted_bytes + FROM + `httparchive.lighthouse.2024_06_01_*` AS lighthouse, + UNNEST(getUnminifiedJsUrls(JSON_EXTRACT(report, 
"$.audits['unminified-css']"))) AS unminified + ) + GROUP BY + client, + page + ) +GROUP BY + client diff --git a/sql/2024/third-parties/lighthouse_average_unminified_js_by_3p.sql b/sql/2024/third-parties/lighthouse_average_unminified_js_by_3p.sql new file mode 100644 index 00000000000..0ec88a14d39 --- /dev/null +++ b/sql/2024/third-parties/lighthouse_average_unminified_js_by_3p.sql @@ -0,0 +1,42 @@ +#standardSQL +# Pages with unminified JS by 1P/3P +CREATE TEMPORARY FUNCTION getUnminifiedJsUrls(audit STRING) +RETURNS ARRAY> LANGUAGE js AS ''' +try { + var $ = JSON.parse(audit); + return $.details.items.map(({url, wastedBytes}) => { + return {url, wastedBytes}; + }); +} catch (e) { + return []; +} +'''; + +SELECT + client, + AVG(pct_1p_wasted_bytes) AS avg_pct_1p_wasted_bytes, + AVG(pct_3p_wasted_bytes) AS avg_pct_3p_wasted_bytes +FROM ( + SELECT + client, + page, + SUM(IF(is_3p, 0, wasted_bytes)) / SUM(wasted_bytes) AS pct_1p_wasted_bytes, + SUM(IF(is_3p, wasted_bytes, 0)) / SUM(wasted_bytes) AS pct_3p_wasted_bytes + FROM ( + SELECT + _TABLE_SUFFIX AS client, + lighthouse.url AS page, + NET.HOST(unminified.url) IS NOT NULL AND NET.HOST(unminified.url) IN ( + SELECT domain FROM `httparchive.almanac.third_parties` WHERE date = '2024-06-01' AND category != 'hosting' + ) AS is_3p, + unminified.wastedBytes AS wasted_bytes + FROM + `httparchive.lighthouse.2024_06_01_*` AS lighthouse, + UNNEST(getUnminifiedJsUrls(JSON_EXTRACT(report, "$.audits['unminified-javascript']"))) AS unminified + ) + GROUP BY + client, + page + ) +GROUP BY + client diff --git a/sql/2024/third-parties/lighthouse_third_party_facades.sql b/sql/2024/third-parties/lighthouse_third_party_facades.sql new file mode 100644 index 00000000000..725460ce1a5 --- /dev/null +++ b/sql/2024/third-parties/lighthouse_third_party_facades.sql @@ -0,0 +1,20 @@ +SELECT + client, + fail, + total, + pct +FROM ( + SELECT + _TABLE_SUFFIX AS client, + COUNTIF(SAFE_CAST(JSON_VALUE(report, '$.audits.third-party-facades.score') 
AS FLOAT64) < 0.9) AS fail, + SUM(COUNT(0)) OVER (PARTITION BY _TABLE_SUFFIX) AS total, + COUNTIF(SAFE_CAST(JSON_VALUE(report, '$.audits.third-party-facades.score') AS FLOAT64) < 0.9) / SUM(COUNT(0)) OVER (PARTITION BY _TABLE_SUFFIX) AS pct + FROM + `httparchive.lighthouse.2024_06_01_*` + GROUP BY + client +) +WHERE + total > 100 +ORDER BY + client diff --git a/sql/2024/third-parties/lighthouse_unminified_css_by_3p.sql b/sql/2024/third-parties/lighthouse_unminified_css_by_3p.sql new file mode 100644 index 00000000000..a36e300d222 --- /dev/null +++ b/sql/2024/third-parties/lighthouse_unminified_css_by_3p.sql @@ -0,0 +1,70 @@ +#standardSQL +# Third-party pages with unminified CSS + +CREATE TEMPORARY FUNCTION getUnminifiedCssUrls(audit STRING) +RETURNS ARRAY> LANGUAGE js AS ''' +try { + var $ = JSON.parse(audit); + return $.details.items.map(({url, wastedBytes, totalBytes}) => { + return {url, wastedBytes, totalBytes}; + }); +} catch (e) { + return []; +} +'''; + +WITH third_party_domains AS ( + SELECT DISTINCT + NET.HOST(domain) AS domain + FROM + `httparchive.almanac.third_parties` +), + +base AS ( + SELECT + client, + page, + potential_third_parties.domain AS domain, + SUM(IF(third_party_domains.domain IS NOT NULL, potential_savings, 0)) AS potential_third_party_savings, + SUM(IF(third_party_domains.domain IS NOT NULL, transfer_size, 0)) AS third_party_transfer_size + FROM ( + SELECT + _TABLE_SUFFIX AS client, + NET.HOST(data.url) AS domain, + lighthouse.url AS page, + data.wastedBytes AS potential_savings, + data.totalBytes AS transfer_size + FROM + `httparchive.lighthouse.2024_06_01_*` AS lighthouse, + UNNEST(getUnminifiedCssUrls(JSON_EXTRACT(report, "$.audits['unminified-css']"))) AS data + ) AS potential_third_parties + LEFT OUTER JOIN + third_party_domains + ON + potential_third_parties.domain = third_party_domains.domain + GROUP BY + client, + page, + domain +) + +SELECT + client, + domain, + COUNT(DISTINCT page) AS total_pages, + 
SUM(third_party_transfer_size) AS third_party_transfer_size_bytes, + SUM(potential_third_party_savings) AS potential_third_party_savings_bytes, + SUM(potential_third_party_savings) / SUM(third_party_transfer_size) AS pct_potential_third_party_savings, + SUM(potential_third_party_savings) / COUNT(DISTINCT page) AS potential_third_party_savings_bytes_per_page +FROM + base +WHERE + potential_third_party_savings > 0 +GROUP BY + client, + domain +ORDER BY + client, + total_pages DESC, + potential_third_party_savings_bytes_per_page DESC, + domain diff --git a/sql/2024/third-parties/lighthouse_unminified_js_by_3p.sql b/sql/2024/third-parties/lighthouse_unminified_js_by_3p.sql new file mode 100644 index 00000000000..4cd75c0b3c8 --- /dev/null +++ b/sql/2024/third-parties/lighthouse_unminified_js_by_3p.sql @@ -0,0 +1,70 @@ +#standardSQL +# Third-party pages with unminified JavaScript + +CREATE TEMPORARY FUNCTION getUnminifiedJavascriptUrls(audit STRING) +RETURNS ARRAY> LANGUAGE js AS ''' +try { + var $ = JSON.parse(audit); + return $.details.items.map(({url, wastedBytes, totalBytes}) => { + return {url, wastedBytes, totalBytes}; + }); +} catch (e) { + return []; +} +'''; + +WITH third_party_domains AS ( + SELECT DISTINCT + NET.HOST(domain) AS domain + FROM + `httparchive.almanac.third_parties` +), + +base AS ( + SELECT + client, + page, + potential_third_parties.domain AS domain, + SUM(IF(third_party_domains.domain IS NOT NULL, potential_savings, 0)) AS potential_third_party_savings, + SUM(IF(third_party_domains.domain IS NOT NULL, transfer_size, 0)) AS third_party_transfer_size + FROM ( + SELECT + _TABLE_SUFFIX AS client, + NET.HOST(data.url) AS domain, + lighthouse.url AS page, + data.wastedBytes AS potential_savings, + data.totalBytes AS transfer_size + FROM + `httparchive.lighthouse.2024_06_01_*` AS lighthouse, + UNNEST(getUnminifiedJavascriptUrls(JSON_EXTRACT(report, "$.audits['unminified-javascript']"))) AS data + ) AS potential_third_parties + LEFT OUTER JOIN + 
third_party_domains + ON + potential_third_parties.domain = third_party_domains.domain + GROUP BY + client, + page, + domain +) + +SELECT + client, + domain, + COUNT(DISTINCT page) AS total_pages, + SUM(third_party_transfer_size) AS third_party_transfer_size_bytes, + SUM(potential_third_party_savings) AS potential_third_party_savings_bytes, + SUM(potential_third_party_savings) / SUM(third_party_transfer_size) AS pct_potential_third_party_savings, + SUM(potential_third_party_savings) / COUNT(DISTINCT page) AS potential_third_party_savings_bytes_per_page +FROM + base +WHERE + potential_third_party_savings > 0 +GROUP BY + client, + domain +ORDER BY + client, + total_pages DESC, + potential_third_party_savings_bytes_per_page DESC, + domain diff --git a/sql/2024/third-parties/lighthouse_unminified_js_by_3p_by_url.sql b/sql/2024/third-parties/lighthouse_unminified_js_by_3p_by_url.sql new file mode 100644 index 00000000000..41b0b19d910 --- /dev/null +++ b/sql/2024/third-parties/lighthouse_unminified_js_by_3p_by_url.sql @@ -0,0 +1,75 @@ +#standardSQL +# Third-party pages with unminified JavaScript + +CREATE TEMPORARY FUNCTION getUnminifiedJavascriptUrls(audit STRING) +RETURNS ARRAY> LANGUAGE js AS ''' +try { + var $ = JSON.parse(audit); + return $.details.items.map(({url, wastedBytes, totalBytes}) => { + return {url, wastedBytes, totalBytes}; + }); +} catch (e) { + return []; +} +'''; + +WITH third_party_domains AS ( + SELECT DISTINCT + NET.HOST(domain) AS domain + FROM + `httparchive.almanac.third_parties` +), + +base AS ( + SELECT + client, + page, + potential_third_parties.domain AS domain, + potential_third_parties.url AS url, + SUM(IF(third_party_domains.domain IS NOT NULL, potential_savings, 0)) AS potential_third_party_savings, + SUM(IF(third_party_domains.domain IS NOT NULL, transfer_size, 0)) AS third_party_transfer_size + FROM ( + SELECT + _TABLE_SUFFIX AS client, + NET.HOST(data.url) AS domain, + data.url AS url, + lighthouse.url AS page, + data.wastedBytes AS 
potential_savings, + data.totalBytes AS transfer_size + FROM + `httparchive.lighthouse.2024_06_01_*` AS lighthouse, + UNNEST(getUnminifiedJavascriptUrls(JSON_EXTRACT(report, "$.audits['unminified-javascript']"))) AS data + ) AS potential_third_parties + LEFT OUTER JOIN + third_party_domains + ON + potential_third_parties.domain = third_party_domains.domain + GROUP BY + client, + page, + domain, + url +) + +SELECT + client, + domain, + url, + COUNT(DISTINCT page) AS total_pages, + SUM(third_party_transfer_size) AS third_party_transfer_size_bytes, + SUM(potential_third_party_savings) AS potential_third_party_savings_bytes, + SUM(potential_third_party_savings) / SUM(third_party_transfer_size) AS pct_potential_third_party_savings, + SUM(potential_third_party_savings) / COUNT(DISTINCT page) AS potential_third_party_savings_bytes_per_page +FROM + base +WHERE + potential_third_party_savings > 0 +GROUP BY + client, + domain, + url +ORDER BY + client, + total_pages DESC, + potential_third_party_savings_bytes_per_page DESC, + domain diff --git a/sql/2024/third-parties/lighthouse_unminified_uses_optimized_images_by_3p.sql b/sql/2024/third-parties/lighthouse_unminified_uses_optimized_images_by_3p.sql new file mode 100644 index 00000000000..8a4ff88ef38 --- /dev/null +++ b/sql/2024/third-parties/lighthouse_unminified_uses_optimized_images_by_3p.sql @@ -0,0 +1,70 @@ +#standardSQL +# Third-party pages with unoptimized images + +CREATE TEMPORARY FUNCTION getUnminifiedImageUrls(audit STRING) +RETURNS ARRAY> LANGUAGE js AS ''' +try { + var $ = JSON.parse(audit); + return $.details.items.map(({url, wastedBytes, totalBytes}) => { + return {url, wastedBytes, totalBytes}; + }); +} catch (e) { + return []; +} +'''; + +WITH third_party_domains AS ( + SELECT DISTINCT + NET.HOST(domain) AS domain + FROM + `httparchive.almanac.third_parties` +), + +base AS ( + SELECT + client, + page, + potential_third_parties.domain AS domain, + SUM(IF(third_party_domains.domain IS NOT NULL, 
potential_savings, 0)) AS potential_third_party_savings, + SUM(IF(third_party_domains.domain IS NOT NULL, transfer_size, 0)) AS third_party_transfer_size + FROM ( + SELECT + _TABLE_SUFFIX AS client, + NET.HOST(data.url) AS domain, + lighthouse.url AS page, + data.wastedBytes AS potential_savings, + data.totalBytes AS transfer_size + FROM + `httparchive.lighthouse.2024_06_01_*` AS lighthouse, + UNNEST(getUnminifiedImageUrls(JSON_EXTRACT(report, "$.audits['uses-optimized-images']"))) AS data + ) AS potential_third_parties + LEFT OUTER JOIN + third_party_domains + ON + potential_third_parties.domain = third_party_domains.domain + GROUP BY + client, + page, + domain +) + +SELECT + client, + domain, + COUNT(DISTINCT page) AS total_pages, + SUM(third_party_transfer_size) AS third_party_transfer_size_bytes, + SUM(potential_third_party_savings) AS potential_third_party_savings_bytes, + SUM(potential_third_party_savings) / SUM(third_party_transfer_size) AS pct_potential_third_party_savings, + SUM(potential_third_party_savings) / COUNT(DISTINCT page) AS potential_third_party_savings_bytes_per_page +FROM + base +WHERE + potential_third_party_savings > 0 +GROUP BY + client, + domain +ORDER BY + client, + total_pages DESC, + potential_third_party_savings_bytes_per_page DESC, + domain diff --git a/sql/2024/third-parties/lighthouse_unused_css_bytes_by_3p.sql b/sql/2024/third-parties/lighthouse_unused_css_bytes_by_3p.sql new file mode 100644 index 00000000000..5ff6f31495f --- /dev/null +++ b/sql/2024/third-parties/lighthouse_unused_css_bytes_by_3p.sql @@ -0,0 +1,70 @@ +#standardSQL +# Third-party pages with unused CSS (Lighthouse 'unused-css-rules' audit): wasted vs. total bytes per third-party domain + +CREATE TEMPORARY FUNCTION getUnusedCssUrls(audit STRING) +RETURNS ARRAY<STRUCT<url STRING, wastedBytes INT64, totalBytes INT64>> LANGUAGE js AS ''' +try { + var $ = JSON.parse(audit); + return $.details.items.map(({url, wastedBytes, totalBytes}) => { + return {url, wastedBytes, totalBytes}; + }); +} catch (e) { + return []; +} +'''; + +WITH third_party_domains AS ( + SELECT DISTINCT + NET.HOST(domain) AS domain + 
FROM + `httparchive.almanac.third_parties` +), + +base AS ( + SELECT + client, + page, + potential_third_parties.domain AS domain, + SUM(IF(third_party_domains.domain IS NOT NULL, potential_savings, 0)) AS potential_third_party_savings, + SUM(IF(third_party_domains.domain IS NOT NULL, transfer_size, 0)) AS third_party_transfer_size + FROM ( + SELECT + _TABLE_SUFFIX AS client, + NET.HOST(data.url) AS domain, + lighthouse.url AS page, + data.wastedBytes AS potential_savings, + data.totalBytes AS transfer_size + FROM + `httparchive.lighthouse.2024_06_01_*` AS lighthouse, + UNNEST(getUnusedCssUrls(JSON_EXTRACT(report, "$.audits['unused-css-rules']"))) AS data + ) AS potential_third_parties + LEFT OUTER JOIN + third_party_domains + ON + potential_third_parties.domain = third_party_domains.domain + GROUP BY + client, + page, + domain +) + +SELECT + client, + domain, + COUNT(DISTINCT page) AS total_pages, + SUM(third_party_transfer_size) AS third_party_transfer_size_bytes, + SUM(potential_third_party_savings) AS potential_third_party_savings_bytes, + SUM(potential_third_party_savings) / SUM(third_party_transfer_size) AS pct_potential_third_party_savings, + SUM(potential_third_party_savings) / COUNT(DISTINCT page) AS potential_third_party_savings_bytes_per_page +FROM + base +WHERE + potential_third_party_savings > 0 +GROUP BY + client, + domain +ORDER BY + client, + total_pages DESC, + potential_third_party_savings_bytes_per_page DESC, + domain diff --git a/sql/2024/third-parties/lighthouse_unused_js_bytes_by_3p.sql b/sql/2024/third-parties/lighthouse_unused_js_bytes_by_3p.sql new file mode 100644 index 00000000000..cce307298b2 --- /dev/null +++ b/sql/2024/third-parties/lighthouse_unused_js_bytes_by_3p.sql @@ -0,0 +1,70 @@ +#standardSQL +# Third-party pages with unused JavaScript + +CREATE TEMPORARY FUNCTION getUnusedJavascriptUrls(audit STRING) +RETURNS ARRAY> LANGUAGE js AS ''' +try { + var $ = JSON.parse(audit); + return $.details.items.map(({url, wastedBytes, 
totalBytes}) => { + return {url, wastedBytes, totalBytes}; + }); +} catch (e) { + return []; +} +'''; + +WITH third_party_domains AS ( + SELECT DISTINCT + NET.HOST(domain) AS domain + FROM + `httparchive.almanac.third_parties` +), + +base AS ( + SELECT + client, + page, + potential_third_parties.domain AS domain, + SUM(IF(third_party_domains.domain IS NOT NULL, potential_savings, 0)) AS potential_third_party_savings, + SUM(IF(third_party_domains.domain IS NOT NULL, transfer_size, 0)) AS third_party_transfer_size + FROM ( + SELECT + _TABLE_SUFFIX AS client, + NET.HOST(data.url) AS domain, + lighthouse.url AS page, + data.wastedBytes AS potential_savings, + data.totalBytes AS transfer_size + FROM + `httparchive.lighthouse.2024_06_01_*` AS lighthouse, + UNNEST(getUnusedJavascriptUrls(JSON_EXTRACT(report, "$.audits['unused-javascript']"))) AS data + ) AS potential_third_parties + LEFT OUTER JOIN + third_party_domains + ON + potential_third_parties.domain = third_party_domains.domain + GROUP BY + client, + page, + domain +) + +SELECT + client, + domain, + COUNT(DISTINCT page) AS total_pages, + SUM(third_party_transfer_size) AS third_party_transfer_size_bytes, + SUM(potential_third_party_savings) AS potential_third_party_savings_bytes, + SUM(potential_third_party_savings) / SUM(third_party_transfer_size) AS pct_potential_third_party_savings, + SUM(potential_third_party_savings) / COUNT(DISTINCT page) AS potential_third_party_savings_bytes_per_page +FROM + base +WHERE + potential_third_party_savings > 0 +GROUP BY + client, + domain +ORDER BY + client, + total_pages DESC, + potential_third_party_savings_bytes_per_page DESC, + domain diff --git a/sql/2024/third-parties/number_of_third_parties_by_rank.sql b/sql/2024/third-parties/number_of_third_parties_by_rank.sql new file mode 100644 index 00000000000..ce978a3916d --- /dev/null +++ b/sql/2024/third-parties/number_of_third_parties_by_rank.sql @@ -0,0 +1,77 @@ +#standardSQL +# Number of third-parties per websites by rank 
+WITH requests AS ( + SELECT + _TABLE_SUFFIX AS client, + pageid AS page, + url + FROM + `httparchive.summary_requests.2024_06_01_*` +), + +pages AS ( + SELECT + _TABLE_SUFFIX AS client, + pageid AS page, + rank + FROM + `httparchive.summary_pages.2024_06_01_*` +), + +third_party AS ( + SELECT + domain, + category, + COUNT(DISTINCT page) AS page_usage + FROM + `httparchive.almanac.third_parties` tp + JOIN + requests r + ON NET.HOST(r.url) = NET.HOST(tp.domain) + WHERE + date = '2024-06-01' AND + category != 'hosting' + GROUP BY + domain, + category + HAVING + page_usage >= 50 +), + +base AS ( + SELECT + client, + page, + rank, + COUNT(domain) AS third_parties_per_page + FROM + requests + LEFT JOIN + third_party + ON + NET.HOST(requests.url) = NET.HOST(third_party.domain) + INNER JOIN + pages + USING + (client, page) + GROUP BY + client, + page, + rank +) + +SELECT + client, + rank_grouping, + APPROX_QUANTILES(third_parties_per_page, 1000)[OFFSET(500)] AS p50_third_parties_per_page +FROM + base, + UNNEST([1000, 10000, 100000, 1000000, 10000000]) AS rank_grouping +WHERE + rank <= rank_grouping +GROUP BY + client, + rank_grouping +ORDER BY + client, + rank_grouping diff --git a/sql/2024/third-parties/number_of_third_parties_by_rank_and_category.sql b/sql/2024/third-parties/number_of_third_parties_by_rank_and_category.sql new file mode 100644 index 00000000000..811f50a626c --- /dev/null +++ b/sql/2024/third-parties/number_of_third_parties_by_rank_and_category.sql @@ -0,0 +1,85 @@ +#standardSQL +# Number of third-parties per websites by rank and category + +WITH requests AS ( + SELECT + _TABLE_SUFFIX AS client, + pageid AS page, + url + FROM + `httparchive.summary_requests.2024_06_01_*` +), + +pages AS ( + SELECT + _TABLE_SUFFIX AS client, + pageid AS page, + rank + FROM + `httparchive.summary_pages.2024_06_01_*` +), + +third_party AS ( + SELECT + domain, + canonicalDomain, + category, + COUNT(DISTINCT page) AS page_usage + FROM + `httparchive.almanac.third_parties` tp 
+ JOIN + requests r + ON NET.HOST(r.url) = NET.HOST(tp.domain) + WHERE + date = '2024-06-01' AND + category NOT IN ('hosting') + GROUP BY + domain, + canonicalDomain, + category + HAVING + page_usage >= 50 +), + +base AS ( + SELECT + client, + category, + page, + rank, + COUNT(domain) AS third_parties_per_page + FROM + requests + LEFT JOIN + third_party + ON + NET.HOST(requests.url) = NET.HOST(third_party.domain) + INNER JOIN + pages + USING + (client, page) + GROUP BY + client, + category, + page, + rank +) + +SELECT + client, + category, + rank_grouping, + APPROX_QUANTILES(third_parties_per_page, 1000)[OFFSET(500)] AS p50_third_parties_per_page +FROM + base, + UNNEST([1000, 10000, 100000, 1000000, 10000000]) AS rank_grouping +WHERE + rank <= rank_grouping +GROUP BY + client, + category, + rank_grouping +ORDER BY + client, + category, + rank_grouping diff --git a/sql/2024/third-parties/number_of_third_party_providers_by_rank.sql b/sql/2024/third-parties/number_of_third_party_providers_by_rank.sql new file mode 100644 index 00000000000..3d9b24838a6 --- /dev/null +++ b/sql/2024/third-parties/number_of_third_party_providers_by_rank.sql @@ -0,0 +1,79 @@ +#standardSQL +# Number of distinct third-party providers per websites by rank +WITH requests AS ( + SELECT + _TABLE_SUFFIX AS client, + pageid AS page, + url + FROM + `httparchive.summary_requests.2024_06_01_*` +), + +pages AS ( + SELECT + _TABLE_SUFFIX AS client, + pageid AS page, + rank + FROM + `httparchive.summary_pages.2024_06_01_*` +), + +third_party AS ( + SELECT + domain, + canonicalDomain, + category, + COUNT(DISTINCT page) AS page_usage + FROM + `httparchive.almanac.third_parties` tp + JOIN + requests r + ON NET.HOST(r.url) = NET.HOST(tp.domain) + WHERE + date = '2024-06-01' AND + category != 'hosting' + GROUP BY + domain, + canonicalDomain, + category + HAVING + page_usage >= 50 +), + +base AS ( + SELECT + client, + page, + rank, + COUNT(DISTINCT canonicalDomain) AS third_parties_per_page + FROM + requests 
+ LEFT JOIN + third_party + ON + NET.HOST(requests.url) = NET.HOST(third_party.domain) + INNER JOIN + pages + USING + (client, page) + GROUP BY + client, + page, + rank +) + +SELECT + client, + rank_grouping, + APPROX_QUANTILES(third_parties_per_page, 1000)[OFFSET(500)] AS p50_third_parties_per_page +FROM + base, + UNNEST([1000, 10000, 100000, 1000000, 10000000]) AS rank_grouping +WHERE + rank <= rank_grouping +GROUP BY + client, + rank_grouping +ORDER BY + client, + rank_grouping diff --git a/sql/2024/third-parties/number_of_third_party_providers_by_rank_and_category.sql b/sql/2024/third-parties/number_of_third_party_providers_by_rank_and_category.sql new file mode 100644 index 00000000000..4445601a13d --- /dev/null +++ b/sql/2024/third-parties/number_of_third_party_providers_by_rank_and_category.sql @@ -0,0 +1,85 @@ +#standardSQL +# Number of third-parties per websites by rank and category + +WITH requests AS ( + SELECT + _TABLE_SUFFIX AS client, + pageid AS page, + url + FROM + `httparchive.summary_requests.2024_06_01_*` +), + +pages AS ( + SELECT + _TABLE_SUFFIX AS client, + pageid AS page, + rank + FROM + `httparchive.summary_pages.2024_06_01_*` +), + +third_party AS ( + SELECT + domain, + canonicalDomain, + category, + COUNT(DISTINCT page) AS page_usage + FROM + `httparchive.almanac.third_parties` tp + JOIN + requests r + ON NET.HOST(r.url) = NET.HOST(tp.domain) + WHERE + date = '2024-06-01' AND + category NOT IN ('hosting') + GROUP BY + domain, + canonicalDomain, + category + HAVING + page_usage >= 50 +), + +base AS ( + SELECT + client, + category, + page, + rank, + COUNT(DISTINCT canonicalDomain) AS third_parties_per_page + FROM + requests + LEFT JOIN + third_party + ON + NET.HOST(requests.url) = NET.HOST(third_party.domain) + INNER JOIN + pages + USING + (client, page) + GROUP BY + client, + category, + page, + rank +) + +SELECT + client, + category, + rank_grouping, + APPROX_QUANTILES(third_parties_per_page, 1000)[OFFSET(500)] AS 
p50_third_parties_per_page +FROM + base, + UNNEST([1000, 10000, 100000, 1000000, 10000000]) AS rank_grouping +WHERE + rank <= rank_grouping +GROUP BY + client, + category, + rank_grouping +ORDER BY + client, + category, + rank_grouping diff --git a/sql/2024/third-parties/tao_by_third_party.sql b/sql/2024/third-parties/tao_by_third_party.sql new file mode 100644 index 00000000000..c4e5aedef51 --- /dev/null +++ b/sql/2024/third-parties/tao_by_third_party.sql @@ -0,0 +1,104 @@ +#standardSQL +# Percent of third-party requests with "Timing-Allow-Origin" headers +# Header reference: https://developer.mozilla.org/docs/Web/HTTP/Headers/Timing-Allow-Origin + +CREATE TEMP FUNCTION get_tao(headers STRING) +RETURNS STRING LANGUAGE js AS ''' + try { + const regex = /timing-allow-origin = (\\*|(http.*?,? )+)/gm; + let output = regex.exec(headers)[1]+", "; + output = output.replace(/, , $/, ", "); + return output; + } catch (e) { + return null; // no header / no match: SQL NULL, so every comparison downstream evaluates to "not allowed" + } +'''; + +WITH requests AS ( + SELECT + _TABLE_SUFFIX AS client, + pageid AS page, + url, + RTRIM(urlShort, '/') AS origin, + respOtherHeaders + FROM + `httparchive.summary_requests.2024_06_01_*` +), + +pages AS ( + SELECT + _TABLE_SUFFIX AS client, + url, + pageid AS page, + RTRIM(urlShort, '/') AS origin + FROM + `httparchive.summary_pages.2024_06_01_*` +), + +third_party AS ( + SELECT + domain, + category, + COUNT(DISTINCT page) AS page_usage + FROM + `httparchive.almanac.third_parties` tp + JOIN + requests r + ON NET.HOST(r.url) = NET.HOST(tp.domain) + WHERE + date = '2024-06-01' AND + category != 'hosting' + GROUP BY + domain, + category + HAVING + page_usage >= 50 +), + +headers AS ( + SELECT + requests.client AS client, + requests.origin AS req_origin, + pages.origin AS page_origin, + get_tao(LOWER(respOtherHeaders)) AS timing_allow_origin, + respOtherHeaders, + third_party.category AS req_category + FROM requests + LEFT JOIN pages + USING (client, page) + INNER JOIN third_party + ON NET.HOST(requests.origin) = 
NET.HOST(third_party.domain) +), + +base AS ( + SELECT + client, + IF(LOWER(respOtherHeaders) LIKE '%timing-allow-origin = %', 1, 0) AS tao_header_present, # lower-cased so the presence check agrees with get_tao(LOWER(...)) + IF( + page_origin = req_origin OR + timing_allow_origin = '*' OR + timing_allow_origin LIKE '*,%' OR + timing_allow_origin LIKE '%,*' OR + timing_allow_origin LIKE '%,*,%' OR + timing_allow_origin LIKE '%, *,%' OR + timing_allow_origin = page_origin OR + timing_allow_origin LIKE page_origin || ',' OR + timing_allow_origin LIKE '%,' || page_origin OR + timing_allow_origin LIKE '%, ' || page_origin OR + timing_allow_origin LIKE '%,' || page_origin || ',%' OR + timing_allow_origin LIKE '%, ' || page_origin || ',%', + 1, 0) AS timing_allowed + FROM headers +) + +SELECT + client, + SUM(tao_header_present) AS tao_requests, + SUM(timing_allowed) AS timing_allowed_requests, + COUNT(0) AS total_requests, + SUM(tao_header_present) / COUNT(0) AS pct_tao_requests, + SUM(timing_allowed) / COUNT(0) AS pct_timing_allowed_requests +FROM + base +GROUP BY + client diff --git a/sql/2024/third-parties/usage_of_lite_youtube_embed.sql b/sql/2024/third-parties/usage_of_lite_youtube_embed.sql new file mode 100644 index 00000000000..39620794722 --- /dev/null +++ b/sql/2024/third-parties/usage_of_lite_youtube_embed.sql @@ -0,0 +1,37 @@ +#standardSQL +# Percent of pages using lite-youtube-embed + +WITH totals AS ( + SELECT + _TABLE_SUFFIX AS client, + COUNT(DISTINCT url) AS total_pages + FROM + `httparchive.summary_pages.2024_06_01_*` + GROUP BY + client +), + +youtube_embed AS ( + SELECT + _TABLE_SUFFIX AS client, + COUNT(DISTINCT url) AS youtube_embed_pages + FROM + `httparchive.technologies.2024_06_01_*` + WHERE + app = 'lite-youtube-embed' + GROUP BY + client +) + +SELECT + client, + youtube_embed_pages, + total_pages, + youtube_embed_pages / total_pages AS pct_youtube_embed_pages +FROM + totals +JOIN + youtube_embed +USING (client) +ORDER BY + client diff --git a/sql/2024/third-parties/usage_of_partytown.sql 
b/sql/2024/third-parties/usage_of_partytown.sql new file mode 100644 index 00000000000..80a86ecc1aa --- /dev/null +++ b/sql/2024/third-parties/usage_of_partytown.sql @@ -0,0 +1,37 @@ +#standardSQL +# Percent of pages using Partytown + +WITH totals AS ( + SELECT + _TABLE_SUFFIX AS client, + COUNT(DISTINCT url) AS total_pages + FROM + `httparchive.summary_pages.2024_06_01_*` + GROUP BY + client +), + +partytown AS ( + SELECT + _TABLE_SUFFIX AS client, + COUNT(DISTINCT url) AS partytown_pages + FROM + `httparchive.technologies.2024_06_01_*` + WHERE + app = 'Partytown' + GROUP BY + client +) + +SELECT + client, + partytown_pages, + total_pages, + partytown_pages / total_pages AS pct_partytown_pages +FROM + totals +JOIN + partytown +USING (client) +ORDER BY + client From 80ab481b2f78ddd0e971abf58d7e2b8edec6cb49 Mon Sep 17 00:00:00 2001 From: Yash Vekaria Date: Tue, 13 Aug 2024 23:32:33 -0700 Subject: [PATCH 03/23] Percentage-based analysis queries --- ...rcent_of_third_parties_by_content_type.sql | 53 +++++++++++ ..._of_third_parties_using_document_write.sql | 72 +++++++++++++++ ..._third_parties_using_legacy_javascript.sql | 72 +++++++++++++++ ...parties_using_legacy_javascript_by_url.sql | 89 +++++++++++++++++++ .../percent_of_third_party_cache.sql | 74 +++++++++++++++ ...and_bytes_by_category_and_content_type.sql | 86 ++++++++++++++++++ ...t_of_third_party_with_security_headers.sql | 73 +++++++++++++++ .../percent_of_websites_with_third_party.sql | 51 +++++++++++ ...f_websites_with_third_party_by_ranking.sql | 64 +++++++++++++ .../scripts_using_async_defer.sql | 76 ++++++++++++++++ .../scripts_using_async_defer_by_3p.sql | 81 +++++++++++++++++ ...d_parties_by_median_body_size_and_time.sql | 87 ++++++++++++++++++ ...00_third_parties_by_number_of_websites.sql | 76 ++++++++++++++++ 13 files changed, 954 insertions(+) create mode 100644 sql/2024/third-parties/percent_of_third_parties_by_content_type.sql create mode 100644 
sql/2024/third-parties/percent_of_third_parties_using_document_write.sql create mode 100644 sql/2024/third-parties/percent_of_third_parties_using_legacy_javascript.sql create mode 100644 sql/2024/third-parties/percent_of_third_parties_using_legacy_javascript_by_url.sql create mode 100644 sql/2024/third-parties/percent_of_third_party_cache.sql create mode 100644 sql/2024/third-parties/percent_of_third_party_requests_and_bytes_by_category_and_content_type.sql create mode 100644 sql/2024/third-parties/percent_of_third_party_with_security_headers.sql create mode 100644 sql/2024/third-parties/percent_of_websites_with_third_party.sql create mode 100644 sql/2024/third-parties/percent_of_websites_with_third_party_by_ranking.sql create mode 100644 sql/2024/third-parties/scripts_using_async_defer.sql create mode 100644 sql/2024/third-parties/scripts_using_async_defer_by_3p.sql create mode 100644 sql/2024/third-parties/top100_third_parties_by_median_body_size_and_time.sql create mode 100644 sql/2024/third-parties/top100_third_parties_by_number_of_websites.sql diff --git a/sql/2024/third-parties/percent_of_third_parties_by_content_type.sql b/sql/2024/third-parties/percent_of_third_parties_by_content_type.sql new file mode 100644 index 00000000000..32279f1349f --- /dev/null +++ b/sql/2024/third-parties/percent_of_third_parties_by_content_type.sql @@ -0,0 +1,53 @@ +#standardSQL +# Percent of third party requests by content type. 
+ +WITH requests AS ( + SELECT + _TABLE_SUFFIX AS client, + pageid AS page, + url, + type AS contentType + FROM + `httparchive.summary_requests.2024_06_01_*` +), + +third_party AS ( + SELECT + domain, + category, + COUNT(DISTINCT page) AS page_usage + FROM + `httparchive.almanac.third_parties` tp + JOIN + requests r + ON NET.HOST(r.url) = NET.HOST(tp.domain) + WHERE + date = '2024-06-01' AND + category != 'hosting' + GROUP BY + domain, + category + HAVING + page_usage >= 50 +) + +SELECT + client, + contentType, + COUNT(0) AS requests, + SUM(COUNT(0)) OVER (PARTITION BY client) AS total_requests, + COUNT(0) / SUM(COUNT(0)) OVER (PARTITION BY client) AS pct_requests +FROM + requests +LEFT JOIN + third_party +ON + NET.HOST(requests.url) = NET.HOST(third_party.domain) +WHERE + domain IS NOT NULL +GROUP BY + client, + contentType +ORDER BY + client, + contentType diff --git a/sql/2024/third-parties/percent_of_third_parties_using_document_write.sql b/sql/2024/third-parties/percent_of_third_parties_using_document_write.sql new file mode 100644 index 00000000000..ae1eec654ab --- /dev/null +++ b/sql/2024/third-parties/percent_of_third_parties_using_document_write.sql @@ -0,0 +1,72 @@ +#standardSQL +# Third-parties that use document.write + +CREATE TEMPORARY FUNCTION +getUrls(audit STRING) +RETURNS ARRAY> LANGUAGE js AS ''' +try { + var $ = JSON.parse(audit); + return $.details.items.map(i => ({url: i.source.url})); +} catch(e) { + return []; +} +'''; + +WITH third_party_domains AS ( + SELECT DISTINCT + NET.HOST(domain) AS domain + FROM + `httparchive.almanac.third_parties` +), + +base AS ( + SELECT + client, + page, + third_party_domains.domain AS domain + FROM + ( + SELECT + _TABLE_SUFFIX AS client, + NET.HOST(data.url) AS domain, + lighthouse.url AS page + FROM + `httparchive.lighthouse.2024_06_01_*` AS lighthouse, + UNNEST(getUrls(JSON_EXTRACT(report, "$.audits['no-document-write']"))) AS data + ) AS potential_third_parties + INNER JOIN + third_party_domains + ON + 
potential_third_parties.domain = third_party_domains.domain + GROUP BY + client, + page, + domain +) + +SELECT + base.client AS client, + domain, + COUNT(0) AS freq, + total, + COUNT(0) / total AS pct +FROM + base +JOIN ( + SELECT + _TABLE_SUFFIX AS client, + COUNT(DISTINCT url) AS total + FROM + `httparchive.lighthouse.2024_06_01_*` + GROUP BY + _TABLE_SUFFIX +) +USING + (client) +GROUP BY + client, + domain, + total +ORDER BY + client, + freq DESC diff --git a/sql/2024/third-parties/percent_of_third_parties_using_legacy_javascript.sql b/sql/2024/third-parties/percent_of_third_parties_using_legacy_javascript.sql new file mode 100644 index 00000000000..84cc4d0255c --- /dev/null +++ b/sql/2024/third-parties/percent_of_third_parties_using_legacy_javascript.sql @@ -0,0 +1,72 @@ +#standardSQL +# Third-parties that use legacy JavaScript + +CREATE TEMPORARY FUNCTION +getUrls(audit STRING) +RETURNS ARRAY> LANGUAGE js AS ''' +try { + var $ = JSON.parse(audit); + return $.details.items.map(i => ({url: i.url})); +} catch(e) { + return []; +} +'''; + +WITH third_party_domains AS ( + SELECT DISTINCT + NET.HOST(domain) AS domain + FROM + `httparchive.almanac.third_parties` +), + +base AS ( + SELECT + client, + page, + third_party_domains.domain AS domain + FROM + ( + SELECT + _TABLE_SUFFIX AS client, + NET.HOST(data.url) AS domain, + lighthouse.url AS page + FROM + `httparchive.lighthouse.2024_06_01_*` AS lighthouse, + UNNEST(getUrls(JSON_EXTRACT(report, "$.audits['legacy-javascript']"))) AS data + ) AS potential_third_parties + INNER JOIN + third_party_domains + ON + potential_third_parties.domain = third_party_domains.domain + GROUP BY + client, + page, + domain +) + +SELECT + base.client AS client, + domain, + COUNT(0) AS freq, + total, + COUNT(0) / total AS pct +FROM + base +JOIN ( + SELECT + _TABLE_SUFFIX AS client, + COUNT(DISTINCT url) AS total + FROM + `httparchive.lighthouse.2024_06_01_*` + GROUP BY + _TABLE_SUFFIX +) +USING + (client) +GROUP BY + client, + domain, + 
total +ORDER BY + client, + freq DESC diff --git a/sql/2024/third-parties/percent_of_third_parties_using_legacy_javascript_by_url.sql b/sql/2024/third-parties/percent_of_third_parties_using_legacy_javascript_by_url.sql new file mode 100644 index 00000000000..3f78f47c7be --- /dev/null +++ b/sql/2024/third-parties/percent_of_third_parties_using_legacy_javascript_by_url.sql @@ -0,0 +1,89 @@ +#standardSQL +# Third-party scripts that use legacy JavaScript + +CREATE TEMPORARY FUNCTION +getUrls(audit STRING) +RETURNS ARRAY<STRUCT<url STRING>> LANGUAGE js AS ''' +try { + var $ = JSON.parse(audit); + return $.details.items.map(i => ({url: i.url})); +} catch(e) { + return []; +} +'''; + +WITH third_party_domains AS ( + SELECT DISTINCT + NET.HOST(domain) AS domain + FROM + `httparchive.almanac.third_parties` +), + +base AS ( + SELECT + client, + page, + third_party_domains.domain AS domain, + url + FROM + ( + SELECT + _TABLE_SUFFIX AS client, + data.url AS url, + NET.HOST(data.url) AS domain, + lighthouse.url AS page + FROM + `httparchive.lighthouse.2024_06_01_*` AS lighthouse, + UNNEST(getUrls(JSON_EXTRACT(report, "$.audits['legacy-javascript']"))) AS data + ) AS potential_third_parties + INNER JOIN + third_party_domains + ON + potential_third_parties.domain = third_party_domains.domain + GROUP BY + client, + page, + domain, + url +) + +SELECT + client, + domain, + url, + freq, + total, + pct +FROM ( + SELECT + base.client AS client, + domain, + url, + COUNT(0) AS freq, + total, + COUNT(0) / total AS pct, + RANK() OVER (PARTITION BY base.client ORDER BY COUNT(0) DESC) AS url_rank + FROM + base + JOIN ( + SELECT + _TABLE_SUFFIX AS client, + COUNT(DISTINCT url) AS total + FROM + `httparchive.lighthouse.2024_06_01_*` + GROUP BY + _TABLE_SUFFIX + ) + USING + (client) + GROUP BY + client, + domain, + url, + total +) +WHERE + url_rank <= 100 +ORDER BY + client, + freq DESC diff --git a/sql/2024/third-parties/percent_of_third_party_cache.sql b/sql/2024/third-parties/percent_of_third_party_cache.sql
new file mode 100644 index 00000000000..a4b4f2c43d2 --- /dev/null +++ b/sql/2024/third-parties/percent_of_third_party_cache.sql @@ -0,0 +1,74 @@ +#standardSQL +# Percent of third party requests cached +# Cache-Control documentation: https://developer.mozilla.org/docs/Web/HTTP/Headers/Cache-Control#Directives + +WITH requests AS ( + SELECT + _TABLE_SUFFIX AS client, + resp_cache_control, + status, + respOtherHeaders, + reqOtherHeaders, + type, + url, + pageid AS page + FROM + `httparchive.summary_requests.2024_06_01_*` +), + +third_party AS ( + SELECT + domain, + category, + COUNT(DISTINCT page) AS page_usage + FROM + `httparchive.almanac.third_parties` tp + JOIN + requests r + ON NET.HOST(r.url) = NET.HOST(tp.domain) + WHERE + date = '2024-06-01' AND + category != 'hosting' + GROUP BY + domain, + category + HAVING + page_usage >= 50 +), + +base AS ( + SELECT + client, + type, + IF( + ( + status IN (301, 302, 307, 308, 410) AND + NOT REGEXP_CONTAINS(resp_cache_control, r'(?i)private|no-store') AND + NOT REGEXP_CONTAINS(reqOtherHeaders, r'Authorization') + ) OR + ( + status IN (301, 302, 307, 308, 410) OR + REGEXP_CONTAINS(resp_cache_control, r'public|max-age|s-maxage') OR + REGEXP_CONTAINS(respOtherHeaders, r'Expires') + ), 1, 0) AS cached + FROM + requests + LEFT JOIN + third_party + ON + NET.HOST(requests.url) = NET.HOST(third_party.domain) + WHERE + domain IS NOT NULL +) + +SELECT + client, + type, + SUM(cached) AS cached_requests, + COUNT(0) AS total_requests, + SUM(cached) / COUNT(0) AS pct_cached_requests +FROM + base +GROUP BY + client, + type diff --git a/sql/2024/third-parties/percent_of_third_party_requests_and_bytes_by_category_and_content_type.sql b/sql/2024/third-parties/percent_of_third_party_requests_and_bytes_by_category_and_content_type.sql new file mode 100644 index 00000000000..dd45c790156 --- /dev/null +++ b/sql/2024/third-parties/percent_of_third_party_requests_and_bytes_by_category_and_content_type.sql @@ -0,0 +1,86 @@ +#standardSQL +# Percent 
of third party requests and bytes by category and content type. + +WITH requests AS ( + SELECT + _TABLE_SUFFIX AS client, + pageid AS page, + url, + type AS contentType, + respBodySize AS body_size + FROM + `httparchive.summary_requests.2024_06_01_*` +), + +third_party AS ( + SELECT + domain, + category, + COUNT(DISTINCT page) AS page_usage + FROM + `httparchive.almanac.third_parties` tp + JOIN + requests r + ON NET.HOST(r.url) = NET.HOST(tp.domain) + WHERE + date = '2024-06-01' AND + category != 'hosting' + GROUP BY + domain, + category + HAVING + page_usage >= 50 +), + +base AS ( + SELECT + client, + page, + category, + contentType, + body_size + FROM + requests + INNER JOIN + third_party + ON + NET.HOST(requests.url) = NET.HOST(third_party.domain) +), + +requests_per_page_and_category AS ( + SELECT + client, + page, + category, + contentType, + SUM(SUM(body_size)) OVER (PARTITION BY page) AS total_page_size, + SUM(body_size) AS body_size, + SUM(COUNT(0)) OVER (PARTITION BY page) AS total_page_requests, + COUNT(0) AS requests + FROM + base + GROUP BY + client, + page, + category, + contentType +) + +SELECT + client, + category, + contentType, + SUM(requests) AS requests, + SAFE_DIVIDE(SUM(requests), SUM(SUM(requests)) OVER (PARTITION BY client, category)) AS pct_requests, + SUM(body_size) AS body_size, + SAFE_DIVIDE(SUM(body_size), SUM(SUM(body_size)) OVER (PARTITION BY client, category)) AS pct_body_size +FROM + requests_per_page_and_category +GROUP BY + client, + category, + contentType +ORDER BY + client, + category, + contentType diff --git a/sql/2024/third-parties/percent_of_third_party_with_security_headers.sql b/sql/2024/third-parties/percent_of_third_party_with_security_headers.sql new file mode 100644 index 00000000000..7701723753a --- /dev/null +++ b/sql/2024/third-parties/percent_of_third_party_with_security_headers.sql @@ -0,0 +1,73 @@ +#standardSQL +# Percent of third-party requests with security headers + +WITH requests AS ( + SELECT + _TABLE_SUFFIX 
AS client, + pageid AS page, + url, + RTRIM(urlShort, '/') AS origin, + respOtherHeaders + FROM + `httparchive.summary_requests.2024_06_01_*` +), + +third_party AS ( + SELECT + domain, + category, + COUNT(DISTINCT page) AS page_usage + FROM + `httparchive.almanac.third_parties` tp + JOIN + requests r + ON NET.HOST(r.url) = NET.HOST(tp.domain) + WHERE + date = '2024-06-01' AND + category != 'hosting' + GROUP BY + domain, + category + HAVING + page_usage >= 50 +), + +headers AS ( + SELECT + client, + requests.origin AS req_origin, + LOWER(respOtherHeaders) AS respOtherHeaders, + third_party.category AS req_category + FROM requests + INNER JOIN third_party + ON NET.HOST(requests.origin) = NET.HOST(third_party.domain) +), + +base AS ( + SELECT + client, + req_origin, + req_category, + IF(STRPOS(respOtherHeaders, 'strict-transport-security') > 0, 1, 0) AS hsts_header, + IF(STRPOS(respOtherHeaders, 'x-content-type-options') > 0, 1, 0) AS x_content_type_options_header, + IF(STRPOS(respOtherHeaders, 'x-frame-options') > 0, 1, 0) AS x_frame_options_header, + IF(STRPOS(respOtherHeaders, 'x-xss-protection') > 0, 1, 0) AS x_xss_protection_header + FROM headers +) + +SELECT + client, + req_category, + COUNT(0) AS total_requests, + SUM(hsts_header) / COUNT(0) AS pct_hsts_header_requests, + SUM(x_content_type_options_header) / COUNT(0) AS pct_x_content_type_options_header_requests, + SUM(x_frame_options_header) / COUNT(0) AS pct_x_frame_options_header_requests, + SUM(x_xss_protection_header) / COUNT(0) AS pct_x_xss_protection_header_requests +FROM + base +GROUP BY + client, + req_category +ORDER BY + client, + req_category diff --git a/sql/2024/third-parties/percent_of_websites_with_third_party.sql b/sql/2024/third-parties/percent_of_websites_with_third_party.sql new file mode 100644 index 00000000000..732377fff57 --- /dev/null +++ b/sql/2024/third-parties/percent_of_websites_with_third_party.sql @@ -0,0 +1,51 @@ +#standardSQL +# Percent of websites with third parties + +WITH 
requests AS ( + SELECT + _TABLE_SUFFIX AS client, + pageid AS page, + url, + respBodySize + FROM + `httparchive.summary_requests.2024_06_01_*` +), + +third_party AS ( + SELECT + domain, + category, + COUNT(DISTINCT page) AS page_usage, + COUNT(0) AS request_usage + FROM + `httparchive.almanac.third_parties` tp + JOIN + requests r + ON NET.HOST(r.url) = NET.HOST(tp.domain) + WHERE + date = '2024-06-01' AND + category != 'hosting' + GROUP BY + domain, + category + HAVING + page_usage >= 50 -- same inclusive threshold as the other third_party CTEs +) + +SELECT + client, + COUNT(DISTINCT IF(domain IS NOT NULL, page, NULL)) AS pages_with_third_party, + COUNT(DISTINCT page) AS total_pages, + COUNT(DISTINCT IF(domain IS NOT NULL, page, NULL)) / COUNT(DISTINCT page) AS pct_pages_with_third_party, + COUNTIF(domain IS NOT NULL) AS third_party_requests, + COUNT(0) AS total_requests, + COUNTIF(domain IS NOT NULL) / COUNT(0) AS pct_third_party_requests, + SUM(IF(domain IS NOT NULL, respBodySize, 0)) AS third_party_body_size, + SUM(respBodySize) AS total_body_size, + SUM(IF(domain IS NOT NULL, respBodySize, 0)) / SUM(respBodySize) AS pct_body_size +FROM + requests +LEFT JOIN third_party +ON NET.HOST(requests.url) = NET.HOST(third_party.domain) +GROUP BY + client diff --git a/sql/2024/third-parties/percent_of_websites_with_third_party_by_ranking.sql b/sql/2024/third-parties/percent_of_websites_with_third_party_by_ranking.sql new file mode 100644 index 00000000000..b6ceadbf8eb --- /dev/null +++ b/sql/2024/third-parties/percent_of_websites_with_third_party_by_ranking.sql @@ -0,0 +1,64 @@ +#standardSQL +# Percent of websites with third parties by ranking + +WITH requests AS ( + SELECT + _TABLE_SUFFIX AS client, + pageid AS page, + url + FROM + `httparchive.summary_requests.2024_06_01_*` +), + +third_party AS ( + SELECT + domain, + category, + COUNT(DISTINCT page) AS page_usage + FROM + `httparchive.almanac.third_parties` tp + JOIN + requests r + ON NET.HOST(r.url) = NET.HOST(tp.domain) + WHERE + date = '2024-06-01' AND + category !=
'hosting' + GROUP BY + domain, + category + HAVING + page_usage >= 50 +), + +pages AS ( + SELECT + _TABLE_SUFFIX AS client, + pageid AS page, + rank + FROM + `httparchive.summary_pages.2024_06_01_*` +) + +SELECT + client, + rank_grouping, + COUNT(DISTINCT IF(domain IS NOT NULL, page, NULL)) AS pages_with_third_party, + COUNT(DISTINCT page) AS total_pages, + COUNT(DISTINCT IF(domain IS NOT NULL, page, NULL)) / COUNT(DISTINCT page) AS pct_pages_with_third_party +FROM + pages +JOIN + requests +USING (client, page) +LEFT JOIN + third_party +ON NET.HOST(requests.url) = NET.HOST(third_party.domain), + UNNEST([1000, 10000, 100000, 1000000, 10000000]) AS rank_grouping +WHERE + rank <= rank_grouping +GROUP BY + client, + rank_grouping +ORDER BY + client, + rank_grouping diff --git a/sql/2024/third-parties/scripts_using_async_defer.sql b/sql/2024/third-parties/scripts_using_async_defer.sql new file mode 100644 index 00000000000..cd324dfd581 --- /dev/null +++ b/sql/2024/third-parties/scripts_using_async_defer.sql @@ -0,0 +1,76 @@ +#standardSQL +# 3P scripts using async or defer +# (capped to 1 hit per domain per page) +CREATE TEMPORARY FUNCTION getScripts(str STRING) +RETURNS ARRAY<STRUCT<src STRING, isAsync BOOL, isDefer BOOL>> +LANGUAGE js AS ''' + try { + var almanac = JSON.parse(str); + + if (Array.isArray(almanac) || typeof almanac != "object") { + return []; // not a JSON object, so there are no script nodes + } + + if (almanac.scripts && almanac.scripts.nodes) { + return almanac.scripts.nodes.map((n) => ({ + src: n.src, + isAsync: n.hasOwnProperty("async"), + isDefer: n.hasOwnProperty("defer"), + })); + } + + return []; + } catch (e) { + return []; + } +'''; + +WITH third_party_domains AS ( + SELECT DISTINCT + NET.HOST(domain) AS domain + FROM + `httparchive.almanac.third_parties` +), + +base AS ( + SELECT + client, + page, + third_party_domains.domain AS domain, + COUNTIF(isAsync) AS async_count, + COUNTIF(isDefer) AS defer_count + FROM + ( + SELECT + _TABLE_SUFFIX AS client, + NET.HOST(data.src) AS domain, + data.isAsync AS isAsync, + data.isDefer AS isDefer,
+ pages.url AS page + FROM + `httparchive.pages.2024_06_01_*` AS pages, + UNNEST(getScripts(JSON_EXTRACT_SCALAR(payload, '$._almanac'))) AS data + ) AS potential_third_parties + INNER JOIN + third_party_domains + ON + potential_third_parties.domain = third_party_domains.domain + GROUP BY + client, + page, + domain +) + +SELECT + base.client AS client, + COUNTIF(async_count > 0) AS freq_async, + COUNTIF(defer_count > 0) AS freq_defer, + COUNT(0) AS total, + COUNTIF(async_count > 0) / COUNT(0) AS pct_async, + COUNTIF(defer_count > 0) / COUNT(0) AS pct_defer +FROM + base +GROUP BY + client +ORDER BY + client diff --git a/sql/2024/third-parties/scripts_using_async_defer_by_3p.sql b/sql/2024/third-parties/scripts_using_async_defer_by_3p.sql new file mode 100644 index 00000000000..0bed778f659 --- /dev/null +++ b/sql/2024/third-parties/scripts_using_async_defer_by_3p.sql @@ -0,0 +1,81 @@ +#standardSQL +# 3P scripts using async or defer +# (capped to 1 hit per domain per page) +CREATE TEMPORARY FUNCTION getScripts(str STRING) +RETURNS ARRAY<STRUCT<src STRING, isAsync BOOL, isDefer BOOL>> +LANGUAGE js AS ''' + try { + var almanac = JSON.parse(str); + + if (Array.isArray(almanac) || typeof almanac != "object") { + return []; // not a JSON object, so there are no script nodes + } + + if (almanac.scripts && almanac.scripts.nodes) { + return almanac.scripts.nodes.map((n) => ({ + src: n.src, + isAsync: n.hasOwnProperty("async"), + isDefer: n.hasOwnProperty("defer"), + })); + } + + return []; + } catch (e) { + return []; + } +'''; + +WITH third_party_domains AS ( + SELECT DISTINCT + NET.HOST(domain) AS domain + FROM + `httparchive.almanac.third_parties` +), + +base AS ( + SELECT + client, + page, + third_party_domains.domain AS domain, + COUNTIF(isAsync) AS async_count, + COUNTIF(isDefer) AS defer_count + FROM + ( + SELECT + _TABLE_SUFFIX AS client, + NET.HOST(data.src) AS domain, + data.isAsync AS isAsync, + data.isDefer AS isDefer, + pages.url AS page + FROM + `httparchive.pages.2024_06_01_*` AS pages, + UNNEST(getScripts(JSON_EXTRACT_SCALAR(payload, '$._almanac'))) AS
data + ) AS potential_third_parties + INNER JOIN + third_party_domains + ON + potential_third_parties.domain = third_party_domains.domain + GROUP BY + client, + page, + domain +) + +SELECT + base.client AS client, + domain, + COUNTIF(async_count > 0) AS freq_async, + COUNTIF(defer_count > 0) AS freq_defer, + COUNT(DISTINCT page) AS page_usage, + COUNTIF(async_count > 0) / COUNT(DISTINCT page) AS pct_async, + COUNTIF(defer_count > 0) / COUNT(DISTINCT page) AS pct_defer +FROM + base +GROUP BY + client, + domain +HAVING + page_usage > 50 +ORDER BY + client, + page_usage DESC diff --git a/sql/2024/third-parties/top100_third_parties_by_median_body_size_and_time.sql b/sql/2024/third-parties/top100_third_parties_by_median_body_size_and_time.sql new file mode 100644 index 00000000000..a8b84e1ecd2 --- /dev/null +++ b/sql/2024/third-parties/top100_third_parties_by_median_body_size_and_time.sql @@ -0,0 +1,87 @@ +#standardSQL +# Top 100 third parties by median response body size, time + +WITH requests AS ( + SELECT + _TABLE_SUFFIX AS client, + url, + pageid AS page, + respBodySize AS body_size, + time + FROM + `httparchive.summary_requests.2024_06_01_*` +), + +third_party AS ( + SELECT + domain, + category, + canonicalDomain, + COUNT(DISTINCT page) AS page_usage + FROM + `httparchive.almanac.third_parties` tp + JOIN + requests r + ON NET.HOST(r.url) = NET.HOST(tp.domain) + WHERE + date = '2024-06-01' AND + category != 'hosting' + GROUP BY + domain, + canonicalDomain, + category + HAVING + page_usage >= 50 +), + +base AS ( + SELECT + client, + category, + canonicalDomain, + APPROX_QUANTILES(body_size, 1000)[OFFSET(500)] / 1024 AS median_body_size_kb, + APPROX_QUANTILES(time, 1000)[OFFSET(500)] / 1000 AS median_time_s -- noqa: L010 + FROM + requests + INNER JOIN + third_party + ON + NET.HOST(requests.url) = NET.HOST(third_party.domain) + GROUP BY + client, + category, + canonicalDomain +) + +SELECT + ranking, + client, + category, + canonicalDomain, + metric, + sorted_order 
+FROM ( + SELECT + 'median_body_size_kb' AS ranking, + client, + category, + canonicalDomain, + median_body_size_kb AS metric, + DENSE_RANK() OVER (PARTITION BY client ORDER BY median_body_size_kb DESC) AS sorted_order + FROM base + UNION ALL + SELECT + 'median_time_s' AS ranking, + client, + category, + canonicalDomain, + median_time_s AS metric, + DENSE_RANK() OVER (PARTITION BY client ORDER BY median_time_s DESC) AS sorted_order + FROM base +) +WHERE + sorted_order <= 100 +ORDER BY + ranking, + client, + metric DESC diff --git a/sql/2024/third-parties/top100_third_parties_by_number_of_websites.sql b/sql/2024/third-parties/top100_third_parties_by_number_of_websites.sql new file mode 100644 index 00000000000..2f51ed65949 --- /dev/null +++ b/sql/2024/third-parties/top100_third_parties_by_number_of_websites.sql @@ -0,0 +1,76 @@ +#standardSQL +# Top 100 third parties by number of websites + +WITH requests AS ( + SELECT + _TABLE_SUFFIX AS client, + pageid AS page, + url + FROM + `httparchive.summary_requests.2024_06_01_*` +), + +totals AS ( + SELECT + _TABLE_SUFFIX AS client, + COUNT(DISTINCT pageid) AS total_pages, + COUNT(0) AS total_requests + FROM + `httparchive.summary_requests.2024_06_01_*` + GROUP BY + _TABLE_SUFFIX +), + +third_party AS ( + SELECT + domain, + canonicalDomain, + category, + COUNT(DISTINCT page) AS page_usage + FROM + `httparchive.almanac.third_parties` tp + JOIN + requests r + ON NET.HOST(r.url) = NET.HOST(tp.domain) + WHERE + date = '2024-06-01' AND + category != 'hosting' + GROUP BY + domain, + canonicalDomain, + category + HAVING + page_usage >= 50 +) + +SELECT + client, + canonicalDomain, + COUNT(DISTINCT page) AS pages, + total_pages, + COUNT(DISTINCT page) / total_pages AS pct_pages, + COUNT(0) AS requests, + total_requests, + COUNT(0) / total_requests AS pct_requests, + DENSE_RANK() OVER (PARTITION BY client ORDER BY COUNT(DISTINCT page) DESC) AS sorted_order +FROM + requests +LEFT JOIN + third_party +ON + NET.HOST(requests.url) = 
NET.HOST(third_party.domain) +JOIN + totals +USING (client) +WHERE + canonicalDomain IS NOT NULL +GROUP BY + client, + total_pages, + total_requests, + canonicalDomain +QUALIFY + sorted_order <= 100 +ORDER BY + pct_pages DESC, + client From 5328a7e59edb5c76ae767dc3ddabb0a8117b6267 Mon Sep 17 00:00:00 2001 From: Yash Vekaria Date: Wed, 14 Aug 2024 00:50:25 -0700 Subject: [PATCH 04/23] third-party blocking queries --- .../third_parties_blocking_main_thread.sql | 71 +++++++++++ ...rties_blocking_main_thread_percentiles.sql | 59 +++++++++ ...ocking_main_thread_percentiles_by_host.sql | 70 +++++++++++ .../third_parties_blocking_rendering.sql | 117 ++++++++++++++++++ ...parties_blocking_rendering_percentiles.sql | 114 +++++++++++++++++ .../third_parties_using_legacy_javascript.sql | 58 +++++++++ 6 files changed, 489 insertions(+) create mode 100644 sql/2024/third-parties/third_parties_blocking_main_thread.sql create mode 100644 sql/2024/third-parties/third_parties_blocking_main_thread_percentiles.sql create mode 100644 sql/2024/third-parties/third_parties_blocking_main_thread_percentiles_by_host.sql create mode 100644 sql/2024/third-parties/third_parties_blocking_rendering.sql create mode 100644 sql/2024/third-parties/third_parties_blocking_rendering_percentiles.sql create mode 100644 sql/2024/third-parties/third_parties_using_legacy_javascript.sql diff --git a/sql/2024/third-parties/third_parties_blocking_main_thread.sql b/sql/2024/third-parties/third_parties_blocking_main_thread.sql new file mode 100644 index 00000000000..77368d280f1 --- /dev/null +++ b/sql/2024/third-parties/third_parties_blocking_main_thread.sql @@ -0,0 +1,71 @@ +#standardSQL +# Third-Party domains which block the main thread +# +# As Lighthouse measures all impact there is no need to do a separate total +# Lighthouse also gives a useable category. 
So no need to use almanac.third-parties table +# +# Based heavily on research by Houssein Djirdeh: +# https://docs.google.com/spreadsheets/d/1Td-4qFjuBzxp8af_if5iBC0Lkqm_OROb7_2OcbxrU_g/edit?resourcekey=0-ZCfve5cngWxF0-sv5pLRzg#gid=1628564987 + +SELECT + client, + domain, + category, + total_pages, + blocking_pages, + non_blocking_pages, + pct_blocking_pages, + pct_non_blocking_pages, + p50_transfer_size_kib, + p50_blocking_time, + total_pages_rank +FROM ( + SELECT + client, + domain, + category, + COUNT(DISTINCT page) AS total_pages, + COUNTIF(blocking > 0) AS blocking_pages, + COUNT(DISTINCT page) - COUNTIF(blocking > 0) AS non_blocking_pages, + COUNTIF(blocking > 0) / COUNT(0) AS pct_blocking_pages, + (COUNT(DISTINCT page) - COUNTIF(blocking > 0)) / COUNT(0) AS pct_non_blocking_pages, + APPROX_QUANTILES(transfer_size_kib, 1000)[OFFSET(500)] AS p50_transfer_size_kib, + APPROX_QUANTILES(blocking_time, 1000)[OFFSET(500)] AS p50_blocking_time, + RANK() OVER (PARTITION BY client ORDER BY COUNT(DISTINCT page) DESC) AS total_pages_rank + FROM ( + SELECT + client, + JSON_VALUE(third_party_items, '$.entity.url') AS domain, + page, + JSON_VALUE(third_party_items, '$.entity.text') AS category, + COUNTIF(SAFE_CAST(JSON_VALUE(report, '$.audits.third-party-summary.details.summary.wastedMs') AS FLOAT64) > 250) AS blocking, + SUM(SAFE_CAST(JSON_VALUE(third_party_items, '$.blockingTime') AS FLOAT64)) AS blocking_time, + SUM(SAFE_CAST(JSON_VALUE(third_party_items, '$.transferSize') AS FLOAT64) / 1024) AS transfer_size_kib + FROM + ( + SELECT + _TABLE_SUFFIX AS client, + url AS page, + report + FROM + `httparchive.lighthouse.2024_06_01_*` + ), + UNNEST(JSON_QUERY_ARRAY(report, '$.audits.third-party-summary.details.items')) AS third_party_items + GROUP BY + client, + domain, + page, + category + ) + GROUP BY + client, + domain, + category + HAVING + total_pages >= 50 +) +WHERE + total_pages_rank <= 200 +ORDER BY + client, + total_pages DESC diff --git 
a/sql/2024/third-parties/third_parties_blocking_main_thread_percentiles.sql b/sql/2024/third-parties/third_parties_blocking_main_thread_percentiles.sql new file mode 100644 index 00000000000..9b88db4b1da --- /dev/null +++ b/sql/2024/third-parties/third_parties_blocking_main_thread_percentiles.sql @@ -0,0 +1,59 @@ +#standardSQL +# Total of Third-Party domains which block the main thread by percentile +# +# As Lighthouse measures all impact there is no need to do a separate total +# Lighthouse also gives a useable category. So no need to use almanac.third-parties table +# +# Based heavily on research by Houssein Djirdeh: +# https://docs.google.com/spreadsheets/d/1Td-4qFjuBzxp8af_if5iBC0Lkqm_OROb7_2OcbxrU_g/edit?resourcekey=0-ZCfve5cngWxF0-sv5pLRzg#gid=1628564987 + +SELECT + client, + total_pages, + blocking_pages, + percentile, + p50_transfer_size_kib, + p50_blocking_time +FROM ( + SELECT + client, + COUNT(DISTINCT page) AS total_pages, + COUNTIF(blocking > 0) AS blocking_pages, + percentile, + APPROX_QUANTILES(transfer_size_kib, 1000)[OFFSET(percentile * 10)] AS p50_transfer_size_kib, + APPROX_QUANTILES(blocking_time, 1000)[OFFSET(percentile * 10)] AS p50_blocking_time, + RANK() OVER (PARTITION BY client ORDER BY COUNT(DISTINCT page) DESC) AS total_pages_rank + FROM ( + SELECT + client, + page, + COUNTIF(SAFE_CAST(JSON_VALUE(report, '$.audits.third-party-summary.details.summary.wastedMs') AS FLOAT64) > 250) AS blocking, + SUM(SAFE_CAST(JSON_VALUE(third_party_items, '$.blockingTime') AS FLOAT64)) AS blocking_time, + SUM(SAFE_CAST(JSON_VALUE(third_party_items, '$.transferSize') AS FLOAT64) / 1024) AS transfer_size_kib + FROM + ( + SELECT + _TABLE_SUFFIX AS client, + url AS page, + report + FROM + `httparchive.lighthouse.2024_06_01_*` + ), + UNNEST(JSON_QUERY_ARRAY(report, '$.audits.third-party-summary.details.items')) AS third_party_items + GROUP BY + client, + page + ), + UNNEST([10, 25, 50, 75, 90, 100]) AS percentile + GROUP BY + client, + percentile + HAVING + 
total_pages >= 50 +) +WHERE + total_pages_rank <= 200 +ORDER BY + client, + total_pages DESC, + percentile diff --git a/sql/2024/third-parties/third_parties_blocking_main_thread_percentiles_by_host.sql b/sql/2024/third-parties/third_parties_blocking_main_thread_percentiles_by_host.sql new file mode 100644 index 00000000000..77bf9c48b3e --- /dev/null +++ b/sql/2024/third-parties/third_parties_blocking_main_thread_percentiles_by_host.sql @@ -0,0 +1,70 @@ +#standardSQL +# Third-Party domains which block the main thread by percentile +# +# As Lighthouse measures all impact there is no need to do a separate total +# Lighthouse also gives a useable category. So no need to use almanac.third-parties table +# +# Based heavily on research by Houssein Djirdeh: +# https://docs.google.com/spreadsheets/d/1Td-4qFjuBzxp8af_if5iBC0Lkqm_OROb7_2OcbxrU_g/edit?resourcekey=0-ZCfve5cngWxF0-sv5pLRzg#gid=1628564987 + +SELECT + client, + domain, + category, + total_pages, + blocking_pages, + percentile, + p50_transfer_size_kib, + p50_blocking_time +FROM ( + SELECT + client, + domain, + category, + COUNT(DISTINCT page) AS total_pages, + COUNTIF(blocking > 0) AS blocking_pages, + percentile, + APPROX_QUANTILES(transfer_size_kib, 1000)[OFFSET(percentile * 10)] AS p50_transfer_size_kib, + APPROX_QUANTILES(blocking_time, 1000)[OFFSET(percentile * 10)] AS p50_blocking_time, + RANK() OVER (PARTITION BY client ORDER BY COUNT(DISTINCT page) DESC) AS total_pages_rank + FROM ( + SELECT + client, + JSON_VALUE(third_party_items, '$.entity.url') AS domain, + page, + JSON_VALUE(third_party_items, '$.entity.text') AS category, + COUNTIF(SAFE_CAST(JSON_VALUE(report, '$.audits.third-party-summary.details.summary.wastedMs') AS FLOAT64) > 250) AS blocking, + SUM(SAFE_CAST(JSON_VALUE(third_party_items, '$.blockingTime') AS FLOAT64)) AS blocking_time, + SUM(SAFE_CAST(JSON_VALUE(third_party_items, '$.transferSize') AS FLOAT64) / 1024) AS transfer_size_kib + FROM + ( + SELECT + _TABLE_SUFFIX AS client, + url AS 
page, + report + FROM + `httparchive.lighthouse.2024_06_01_*` + ), + UNNEST(JSON_QUERY_ARRAY(report, '$.audits.third-party-summary.details.items')) AS third_party_items + GROUP BY + client, + domain, + page, + category + ), + UNNEST([10, 25, 50, 75, 90, 100]) AS percentile + GROUP BY + client, + domain, + category, + percentile + HAVING + total_pages >= 50 +) +WHERE + total_pages_rank <= 200 +ORDER BY + client, + total_pages DESC, + category, + percentile diff --git a/sql/2024/third-parties/third_parties_blocking_rendering.sql b/sql/2024/third-parties/third_parties_blocking_rendering.sql new file mode 100644 index 00000000000..b963f5b9de5 --- /dev/null +++ b/sql/2024/third-parties/third_parties_blocking_rendering.sql @@ -0,0 +1,117 @@ +#standardSQL +# Third-Party domains which render block paint +# +# Unlike the blocking main thread queries, lighthouse only contains details if the +# third-party is render blocking (i.e. wastedMs/total_bytes are never 0) +# And also there are no categories given to each third-party +# So we join to the usual almanac.third_parties table to get those totals and categories +# +# Based heavily on research by Houssein Djirdeh: +# https://docs.google.com/spreadsheets/d/1Td-4qFjuBzxp8af_if5iBC0Lkqm_OROb7_2OcbxrU_g/edit?resourcekey=0-ZCfve5cngWxF0-sv5pLRzg#gid=1628564987 + +WITH total_third_party_usage AS ( + SELECT + _TABLE_SUFFIX AS client, + canonicalDomain, + category, + COUNT(DISTINCT pages.url) AS total_pages + FROM + `httparchive.summary_pages.2024_06_01_*` AS pages + INNER JOIN ( + SELECT + _TABLE_SUFFIX AS client, + pageid, + url + FROM + `httparchive.summary_requests.2024_06_01_*` + ) AS requests + ON ( + pages._TABLE_SUFFIX = requests.client AND + pages.pageid = requests.pageid + ) + INNER JOIN + `httparchive.almanac.third_parties` + ON + NET.HOST(requests.url) = NET.HOST(domain) AND + date = '2024-06-01' AND + category != 'hosting' + GROUP BY + client, + canonicalDomain, + category + HAVING + total_pages >= 50 +) + +SELECT + 
client, + canonicalDomain, + category, + total_pages, + blocking_pages, + non_blocking_pages, + blocking_pages_pct, + non_blocking_pages_pct, + p50_wastedMs, + p50_total_bytes_kib +FROM ( + SELECT + client, + canonicalDomain, + category, + total_pages, + COUNT(DISTINCT page) AS blocking_pages, + total_pages - COUNT(DISTINCT page) AS non_blocking_pages, + COUNT(DISTINCT page) / total_pages AS blocking_pages_pct, + (total_pages - COUNT(DISTINCT page)) / total_pages AS non_blocking_pages_pct, + APPROX_QUANTILES(wasted_ms, 1000)[OFFSET(500)] AS p50_wastedMs, + APPROX_QUANTILES(total_bytes_kib, 1000)[OFFSET(500)] AS p50_total_bytes_kib, + RANK() OVER (PARTITION BY client ORDER BY COUNT(DISTINCT page) DESC) AS total_pages_rank + FROM ( + SELECT + client, + canonicalDomain, + domain, + page, + category, + SUM(SAFE_CAST(JSON_VALUE(renderBlockingItems, '$.wastedMs') AS FLOAT64)) AS wasted_ms, + SUM(SAFE_CAST(JSON_VALUE(renderBlockingItems, '$.totalBytes') AS FLOAT64) / 1024) AS total_bytes_kib + FROM + ( + SELECT + _TABLE_SUFFIX AS client, + url AS page, + report + FROM + `httparchive.lighthouse.2024_06_01_*` + ), + UNNEST(JSON_QUERY_ARRAY(report, '$.audits.render-blocking-resources.details.items')) AS renderBlockingItems + INNER JOIN + `httparchive.almanac.third_parties` + ON + NET.HOST(JSON_VALUE(renderBlockingItems, '$.url')) = domain AND date = '2024-06-01' -- one snapshot only; other dates would duplicate rows and inflate the sums + GROUP BY + client, + canonicalDomain, + domain, + page, + category + ) + INNER JOIN + total_third_party_usage + USING + (client, canonicalDomain, category) + GROUP BY + client, + canonicalDomain, + category, + total_pages + HAVING + total_pages >= 50 +) +WHERE + total_pages_rank <= 200 +ORDER BY + client, + total_pages DESC, + category diff --git a/sql/2024/third-parties/third_parties_blocking_rendering_percentiles.sql b/sql/2024/third-parties/third_parties_blocking_rendering_percentiles.sql new file mode 100644 index 00000000000..195967c778f --- /dev/null +++ b/sql/2024/third-parties/third_parties_blocking_rendering_percentiles.sql @@
-0,0 +1,114 @@ +#standardSQL +# Third-Party domains which render block paint by percentile +# +# Unlike the blocking main thread queries, lighthouse only contains details if the +# third-party is render blocking (i.e. wastedMs/total_bytes are never 0) +# And also there are no categories given to each third-party +# So we join to the usual almanac.third_parties table to get those totals and categories +# +# Based heavily on research by Houssein Djirdeh: +# https://docs.google.com/spreadsheets/d/1Td-4qFjuBzxp8af_if5iBC0Lkqm_OROb7_2OcbxrU_g/edit?resourcekey=0-ZCfve5cngWxF0-sv5pLRzg#gid=1628564987 + +WITH total_third_party_usage AS ( + SELECT + _TABLE_SUFFIX AS client, + canonicalDomain, + category, + COUNT(DISTINCT pages.url) AS total_pages + FROM + `httparchive.summary_pages.2024_06_01_*` AS pages + INNER JOIN ( + SELECT + _TABLE_SUFFIX AS client, + pageid, + url + FROM + `httparchive.summary_requests.2024_06_01_*` + ) AS requests + ON ( + pages._TABLE_SUFFIX = requests.client AND + pages.pageid = requests.pageid + ) + INNER JOIN + `httparchive.almanac.third_parties` + ON + NET.HOST(requests.url) = NET.HOST(domain) AND + date = '2024-06-01' AND + category != 'hosting' + GROUP BY + client, + canonicalDomain, + category + HAVING + total_pages >= 50 +) + +SELECT + client, + canonicalDomain, + category, + total_pages, + blocking_pages, + percentile, + wasted_ms, + total_bytes_kib +FROM ( + SELECT + client, + canonicalDomain, + category, + total_pages, + COUNT(DISTINCT page) AS blocking_pages, + percentile, + APPROX_QUANTILES(wasted_ms, 1000)[OFFSET(percentile * 10)] AS wasted_ms, + APPROX_QUANTILES(total_bytes_kib, 1000)[OFFSET(percentile * 10)] AS total_bytes_kib, + RANK() OVER (PARTITION BY client ORDER BY COUNT(DISTINCT page) DESC) AS total_pages_rank + FROM ( + SELECT + client, + canonicalDomain, + page, + category, + SUM(SAFE_CAST(JSON_VALUE(render_blocking_items, '$.wastedMs') AS FLOAT64)) AS wasted_ms, + SUM(SAFE_CAST(JSON_VALUE(render_blocking_items, 
'$.totalBytes') AS FLOAT64) / 1024) AS total_bytes_kib + FROM + ( + SELECT + _TABLE_SUFFIX AS client, + url AS page, + report + FROM + `httparchive.lighthouse.2024_06_01_*` + ), + UNNEST(JSON_QUERY_ARRAY(report, '$.audits.render-blocking-resources.details.items')) AS render_blocking_items + INNER JOIN + `httparchive.almanac.third_parties` + ON + NET.HOST(JSON_VALUE(render_blocking_items, '$.url')) = domain AND + date = '2024-06-01' + GROUP BY + client, + canonicalDomain, + page, + category + ) + INNER JOIN + total_third_party_usage + USING (client, canonicalDomain, category), + UNNEST([10, 25, 50, 75, 90, 100]) AS percentile + GROUP BY + client, + canonicalDomain, + category, + total_pages, + percentile + HAVING + total_pages >= 50 +) +WHERE + total_pages_rank <= 200 +ORDER BY + client, + total_pages DESC, + category, + percentile diff --git a/sql/2024/third-parties/third_parties_using_legacy_javascript.sql b/sql/2024/third-parties/third_parties_using_legacy_javascript.sql new file mode 100644 index 00000000000..5feda2ce1af --- /dev/null +++ b/sql/2024/third-parties/third_parties_using_legacy_javascript.sql @@ -0,0 +1,58 @@ +#standardSQL +# Third-parties that use legacy JavaScript + +CREATE TEMPORARY FUNCTION +getUrls(audit STRING) +RETURNS ARRAY<STRUCT<url STRING>> LANGUAGE js AS ''' +try { + var $ = JSON.parse(audit); + return $.details.items.map(i => ({url: i.url})); +} catch(e) { + return []; +} +'''; + + +WITH third_party_domains AS ( + SELECT DISTINCT + NET.HOST(domain) AS domain + FROM + `httparchive.almanac.third_parties` +), + +base AS ( + SELECT + client, + page, + COUNTIF(third_party_domains.domain IS NULL) / COUNT(0) AS pct_1p_legacy, + COUNTIF(third_party_domains.domain IS NOT NULL) / COUNT(0) AS pct_3p_legacy + FROM + ( + SELECT + _TABLE_SUFFIX AS client, + NET.HOST(data.url) AS domain, + lighthouse.url AS page + FROM + `httparchive.lighthouse.2024_06_01_*` AS lighthouse, + UNNEST(getUrls(JSON_EXTRACT(report, "$.audits['legacy-javascript']"))) AS data + ) AS
potential_third_parties + LEFT OUTER JOIN + third_party_domains + ON + potential_third_parties.domain = third_party_domains.domain + GROUP BY + client, + page +) + + +SELECT + client, + AVG(pct_1p_legacy) AS avg_pct_1p_legacy, + AVG(pct_3p_legacy) AS avg_pct_3p_legacy +FROM + base +GROUP BY + client +ORDER BY + client From a5735e29b6b0f2d7141a58405acacfeae1090684 Mon Sep 17 00:00:00 2001 From: Yash Vekaria Date: Wed, 14 Aug 2024 18:41:58 -0700 Subject: [PATCH 05/23] CSP frequency --- .../csp_allowed_host_frequency.sql | 85 +++++++++++++++++++ 1 file changed, 85 insertions(+) create mode 100644 sql/2024/third-parties/csp_allowed_host_frequency.sql diff --git a/sql/2024/third-parties/csp_allowed_host_frequency.sql b/sql/2024/third-parties/csp_allowed_host_frequency.sql new file mode 100644 index 00000000000..5917eafbc49 --- /dev/null +++ b/sql/2024/third-parties/csp_allowed_host_frequency.sql @@ -0,0 +1,85 @@ +#standardSQL +# CSP on home pages: most prevalent allowed hosts + +CREATE TEMPORARY FUNCTION getHeader(headers STRING, headername STRING) +RETURNS STRING DETERMINISTIC +LANGUAGE js AS ''' + const parsed_headers = JSON.parse(headers); + const matching_headers = parsed_headers.filter(h => h.name.toLowerCase() == headername.toLowerCase()); + if (matching_headers.length > 0) { + return matching_headers[0].value; + } + return null; +'''; + +WITH totals AS ( + SELECT + client, + COUNT(0) AS total + FROM + `httparchive.all.requests` + WHERE + date = '2024-06-01' AND + is_main_document + GROUP BY + client +), + +csp_data AS ( + SELECT + client, + page, + getHeader(TO_JSON_STRING(response_headers), 'Content-Security-Policy') AS csp_header + FROM + `httparchive.all.requests` + WHERE + date = '2024-06-01' AND + is_main_document + AND + response_headers IS NOT NULL +), + +csp_expanded AS ( + SELECT + client, + page, + csp_allowed_host + FROM + csp_data, + UNNEST(REGEXP_EXTRACT_ALL(csp_header, r'(?i)(https*://[^\s;]+)[\s;]')) AS csp_allowed_host + WHERE + csp_header IS NOT 
NULL +), + +ranked_csp AS ( + SELECT + client, + csp_allowed_host, + COUNT(DISTINCT page) AS freq, + total AS total_pages, + COUNT(DISTINCT page) / total AS pct, + RANK() OVER (PARTITION BY client ORDER BY COUNT(DISTINCT page) DESC) AS csp_allowed_host_rank + FROM + csp_expanded + JOIN + totals + USING (client) + GROUP BY + client, + total, + csp_allowed_host +) + +SELECT + client, + csp_allowed_host, + freq, + total_pages, + pct +FROM + ranked_csp +WHERE + csp_allowed_host_rank <= 100 +ORDER BY + client, + pct DESC; From 783a67e0d27ce6b7784b1a996fd5aa1e675f4e57 Mon Sep 17 00:00:00 2001 From: Yash Vekaria Date: Fri, 16 Aug 2024 06:33:34 -0700 Subject: [PATCH 06/23] Added mainframe vs iframe analysis --- ...distribution_of_third_parties_by_frame.sql | 51 ++++++++++++ ...d_parties_by_client_and_frame_location.sql | 82 +++++++++++++++++++ 2 files changed, 133 insertions(+) create mode 100644 sql/2024/third-parties/distribution_of_third_parties_by_frame.sql create mode 100644 sql/2024/third-parties/top20_third_parties_by_client_and_frame_location.sql diff --git a/sql/2024/third-parties/distribution_of_third_parties_by_frame.sql b/sql/2024/third-parties/distribution_of_third_parties_by_frame.sql new file mode 100644 index 00000000000..2de6ff4ebfa --- /dev/null +++ b/sql/2024/third-parties/distribution_of_third_parties_by_frame.sql @@ -0,0 +1,51 @@ +#standardSQL +# Distribution of third-parties embedded in main vs. 
in iframes + +WITH document_frameid AS ( + SELECT + client, + NET.HOST(page) AS page_host, + CASE + WHEN is_main_document = true AND NET.HOST(page) = NET.HOST(url) + THEN "mainframe" + ELSE "iframe" + END AS frame_type, + NET.HOST(url) AS frame_host, + JSON_EXTRACT_SCALAR(payload, "$._frame_id") AS frame_id + FROM `httparchive.all.requests` AS requests + WHERE + requests.date = "2024-06-01" + AND requests.is_root_page = true +), +combined_frame_counts AS ( + SELECT client, + page_host, + frame_host, + COUNT(DISTINCT frame_id) AS num_distinct_frameids, + COUNT(frame_id) AS num_total_frameids, + CASE + WHEN COUNT(DISTINCT frame_type) = 1 AND MAX(CASE WHEN frame_type = 'mainframe' THEN 1 ELSE 0 END) = 1 + THEN "mainframe-only" + WHEN COUNT(DISTINCT frame_type) = 1 AND MAX(CASE WHEN frame_type = 'iframe' THEN 1 ELSE 0 END) = 1 + THEN "iframe-only" + WHEN COUNT(DISTINCT frame_id) >= 2 and COUNT(DISTINCT frame_type) = 2 + THEN "both" + END AS frame_presence, + FROM document_frameid + GROUP BY client, page_host, frame_host +) +SELECT + client, + COUNT(DISTINCT frame_host) - 1 AS distinct_third_party_count, + COUNT(frame_host) - 1 AS total_third_party_count, + COUNT(DISTINCT CASE WHEN frame_presence = "mainframe-only" THEN page_host ELSE NULL END) AS num_publishers_mainframe_only, + COUNT(DISTINCT CASE WHEN frame_presence = "iframe-only" THEN page_host ELSE NULL END) AS num_publishers_iframe_only, + COUNT(DISTINCT CASE WHEN frame_presence = "both" THEN page_host ELSE NULL END) AS num_publishers_both, + COUNT(DISTINCT CASE WHEN frame_presence = "mainframe-only" THEN frame_host ELSE NULL END) AS distinct_mainframe_third_party_count, + COUNT(DISTINCT CASE WHEN frame_presence = "iframe-only" THEN frame_host ELSE NULL END) AS distinct_iframe_third_party_count, + COUNT(DISTINCT CASE WHEN frame_presence = "both" THEN frame_host ELSE NULL END) AS distinct_both_third_party_count, + COUNT(CASE WHEN frame_presence = "mainframe-only" THEN frame_host ELSE NULL END) AS 
distinct_mainframe_third_party_count, + COUNT(CASE WHEN frame_presence = "iframe-only" THEN frame_host ELSE NULL END) AS distinct_iframe_third_party_count, + COUNT(CASE WHEN frame_presence = "both" THEN frame_host ELSE NULL END) AS distinct_both_third_party_count +FROM combined_frame_counts +GROUP BY client; diff --git a/sql/2024/third-parties/top20_third_parties_by_client_and_frame_location.sql b/sql/2024/third-parties/top20_third_parties_by_client_and_frame_location.sql new file mode 100644 index 00000000000..5c7fa4faf06 --- /dev/null +++ b/sql/2024/third-parties/top20_third_parties_by_client_and_frame_location.sql @@ -0,0 +1,82 @@ +#standardSQL +# Top 20 third-parties embedded in mainframe vs. in iframes + +WITH document_frameid AS ( + SELECT + client, + NET.HOST(page) AS page_host, + CASE + WHEN is_main_document = true AND NET.HOST(page) = NET.HOST(url) + THEN "mainframe" + ELSE "iframe" + END AS frame_type, + NET.HOST(url) AS frame_host, + JSON_EXTRACT_SCALAR(payload, "$._frame_id") AS frame_id + FROM `httparchive.all.requests` AS requests + WHERE + requests.date = "2024-06-01" + AND requests.is_root_page = true +), +combined_frame_counts AS ( + SELECT client, + page_host, + frame_host, + COUNT(DISTINCT frame_id) AS num_distinct_frameids, + COUNT(frame_id) AS num_total_frameids, + CASE + WHEN COUNT(DISTINCT frame_type) = 1 AND MAX(CASE WHEN frame_type = 'mainframe' THEN 1 ELSE 0 END) = 1 + THEN "mainframe-only" + WHEN COUNT(DISTINCT frame_type) = 1 AND MAX(CASE WHEN frame_type = 'iframe' THEN 1 ELSE 0 END) = 1 + THEN "iframe-only" + WHEN COUNT(DISTINCT frame_id) >= 2 and COUNT(DISTINCT frame_type) = 2 + THEN "both" + END AS frame_presence, + FROM document_frameid + GROUP BY client, page_host, frame_host +), +grouped_data AS ( + SELECT + client, + frame_host, + COUNT(DISTINCT page_host) AS total_distinct_publisher_count, + COUNT(DISTINCT CASE WHEN frame_presence = "mainframe-only" THEN page_host ELSE NULL END) AS num_distinct_publishers_mainframe_only, + 
COUNT(DISTINCT CASE WHEN frame_presence = "iframe-only" THEN page_host ELSE NULL END) AS num_distinct_publishers_iframe_only, + COUNT(DISTINCT CASE WHEN frame_presence = "both" THEN page_host ELSE NULL END) AS num_distinct_publishers_both + FROM combined_frame_counts + GROUP BY client, frame_host +), +ranked_publishers AS ( + SELECT + client, + frame_host, + num_distinct_publishers_mainframe_only, + num_distinct_publishers_iframe_only, + num_distinct_publishers_both, + ROW_NUMBER() OVER (PARTITION BY client ORDER BY num_distinct_publishers_mainframe_only DESC) AS rank_mainframe, + ROW_NUMBER() OVER (PARTITION BY client ORDER BY num_distinct_publishers_iframe_only DESC) AS rank_iframe, + ROW_NUMBER() OVER (PARTITION BY client ORDER BY num_distinct_publishers_both DESC) AS rank_both + FROM grouped_data +) +SELECT + client, + frame_host, + rank_mainframe, + num_distinct_publishers_mainframe_only, + rank_iframe, + num_distinct_publishers_iframe_only, + rank_both, + num_distinct_publishers_both, + CASE + WHEN rank_mainframe <= 20 THEN 'mainframe' + WHEN rank_iframe <= 20 THEN 'iframe' + WHEN rank_both <= 20 THEN 'both' + END AS category +FROM ranked_publishers +WHERE rank_mainframe <= 20 OR rank_iframe <= 20 OR rank_both <= 20 +ORDER BY client, +category, +CASE category + WHEN 'mainframe' THEN num_distinct_publishers_mainframe_only + WHEN 'iframe' THEN num_distinct_publishers_iframe_only + WHEN 'both' THEN num_distinct_publishers_both +END DESC; From 70c33e552d86799ba3e9bb8b102947f72eb38322 Mon Sep 17 00:00:00 2001 From: Yash Vekaria Date: Sat, 24 Aug 2024 10:04:02 -0700 Subject: [PATCH 07/23] Minor comments --- .../third-parties/lighthouse_average_unminified_css_by_3p.sql | 2 +- .../number_of_third_party_providers_by_rank_and_category.sql | 2 +- .../percent_of_third_parties_using_document_write.sql | 2 +- .../percent_of_third_parties_using_legacy_javascript.sql | 2 +- .../percent_of_third_parties_using_legacy_javascript_by_url.sql | 2 +- 5 files changed, 5 
insertions(+), 5 deletions(-) diff --git a/sql/2024/third-parties/lighthouse_average_unminified_css_by_3p.sql b/sql/2024/third-parties/lighthouse_average_unminified_css_by_3p.sql index aa6c3998b28..f888b2646da 100644 --- a/sql/2024/third-parties/lighthouse_average_unminified_css_by_3p.sql +++ b/sql/2024/third-parties/lighthouse_average_unminified_css_by_3p.sql @@ -1,5 +1,5 @@ #standardSQL -# Pages with unminified JS by 1P/3P +# Pages with unminified CSS by 1P/3P CREATE TEMPORARY FUNCTION getUnminifiedJsUrls(audit STRING) RETURNS ARRAY> LANGUAGE js AS ''' try { diff --git a/sql/2024/third-parties/number_of_third_party_providers_by_rank_and_category.sql b/sql/2024/third-parties/number_of_third_party_providers_by_rank_and_category.sql index 4445601a13d..ad8283a67fa 100644 --- a/sql/2024/third-parties/number_of_third_party_providers_by_rank_and_category.sql +++ b/sql/2024/third-parties/number_of_third_party_providers_by_rank_and_category.sql @@ -1,5 +1,5 @@ #standardSQL -# Number of third-parties per websites by rank and category +# Number of third-party providers per websites by rank and category WITH requests AS ( SELECT diff --git a/sql/2024/third-parties/percent_of_third_parties_using_document_write.sql b/sql/2024/third-parties/percent_of_third_parties_using_document_write.sql index ae1eec654ab..ae7926ff7c7 100644 --- a/sql/2024/third-parties/percent_of_third_parties_using_document_write.sql +++ b/sql/2024/third-parties/percent_of_third_parties_using_document_write.sql @@ -1,5 +1,5 @@ #standardSQL -# Third-parties that use document.write +# Percent of third-parties that use document.write CREATE TEMPORARY FUNCTION getUrls(audit STRING) diff --git a/sql/2024/third-parties/percent_of_third_parties_using_legacy_javascript.sql b/sql/2024/third-parties/percent_of_third_parties_using_legacy_javascript.sql index 84cc4d0255c..03f43060001 100644 --- a/sql/2024/third-parties/percent_of_third_parties_using_legacy_javascript.sql +++ 
b/sql/2024/third-parties/percent_of_third_parties_using_legacy_javascript.sql @@ -1,5 +1,5 @@ #standardSQL -# Third-parties that use legacy JavaScript +# Percent third-party scripts that use legacy JavaScript CREATE TEMPORARY FUNCTION getUrls(audit STRING) diff --git a/sql/2024/third-parties/percent_of_third_parties_using_legacy_javascript_by_url.sql b/sql/2024/third-parties/percent_of_third_parties_using_legacy_javascript_by_url.sql index 3f78f47c7be..4d8a990dfbb 100644 --- a/sql/2024/third-parties/percent_of_third_parties_using_legacy_javascript_by_url.sql +++ b/sql/2024/third-parties/percent_of_third_parties_using_legacy_javascript_by_url.sql @@ -1,5 +1,5 @@ #standardSQL -# Third-party scripts that use legacy JavaScript +# Percent third-party scripts that use legacy JavaScript by URLs CREATE TEMPORARY FUNCTION getUrls(audit STRING) From 245e00596bba41deb1bcae01e2110d9b76ca101d Mon Sep 17 00:00:00 2001 From: Mike Gifford Date: Mon, 26 Aug 2024 16:57:47 +0200 Subject: [PATCH 08/23] Update distribution_of_third_parties_by_frame.sql removing trailing space --- .../third-parties/distribution_of_third_parties_by_frame.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/2024/third-parties/distribution_of_third_parties_by_frame.sql b/sql/2024/third-parties/distribution_of_third_parties_by_frame.sql index 2de6ff4ebfa..63a5df6e473 100644 --- a/sql/2024/third-parties/distribution_of_third_parties_by_frame.sql +++ b/sql/2024/third-parties/distribution_of_third_parties_by_frame.sql @@ -5,7 +5,7 @@ WITH document_frameid AS ( SELECT client, NET.HOST(page) AS page_host, - CASE + CASE WHEN is_main_document = true AND NET.HOST(page) = NET.HOST(url) THEN "mainframe" ELSE "iframe" From 34d5c78e9a2cf89a1a3d7d9ef1e5fca9bffacf48 Mon Sep 17 00:00:00 2001 From: Mike Gifford Date: Mon, 26 Aug 2024 16:59:11 +0200 Subject: [PATCH 09/23] Update top20_third_parties_by_client_and_frame_location.sql Trailing slashes --- 
.../top20_third_parties_by_client_and_frame_location.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/2024/third-parties/top20_third_parties_by_client_and_frame_location.sql b/sql/2024/third-parties/top20_third_parties_by_client_and_frame_location.sql index 5c7fa4faf06..5ef27aad2c4 100644 --- a/sql/2024/third-parties/top20_third_parties_by_client_and_frame_location.sql +++ b/sql/2024/third-parties/top20_third_parties_by_client_and_frame_location.sql @@ -28,7 +28,7 @@ combined_frame_counts AS ( THEN "mainframe-only" WHEN COUNT(DISTINCT frame_type) = 1 AND MAX(CASE WHEN frame_type = 'iframe' THEN 1 ELSE 0 END) = 1 THEN "iframe-only" - WHEN COUNT(DISTINCT frame_id) >= 2 and COUNT(DISTINCT frame_type) = 2 + WHEN COUNT(DISTINCT frame_id) >= 2 and COUNT(DISTINCT frame_type) = 2 THEN "both" END AS frame_presence, FROM document_frameid From 3c19e8fbe51d061c0c729699839271fc2297d5ad Mon Sep 17 00:00:00 2001 From: Mike Gifford Date: Mon, 26 Aug 2024 17:02:32 +0200 Subject: [PATCH 10/23] Update csp_allowed_host_frequency.sql fixing linting error --- sql/2024/third-parties/csp_allowed_host_frequency.sql | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sql/2024/third-parties/csp_allowed_host_frequency.sql b/sql/2024/third-parties/csp_allowed_host_frequency.sql index 5917eafbc49..95d169ce290 100644 --- a/sql/2024/third-parties/csp_allowed_host_frequency.sql +++ b/sql/2024/third-parties/csp_allowed_host_frequency.sql @@ -34,8 +34,7 @@ csp_data AS ( `httparchive.all.requests` WHERE date = '2024-06-01' AND - is_main_document - AND + is_main_document AND response_headers IS NOT NULL ), From e0faed50beac479098de3e52354b0d7daeab4034 Mon Sep 17 00:00:00 2001 From: Mike Gifford Date: Mon, 26 Aug 2024 20:09:16 +0200 Subject: [PATCH 11/23] Update top20_third_parties_by_client_and_frame_location.sql Fixing issues with linter. 
--- ...d_parties_by_client_and_frame_location.sql | 67 ++++++++++++------- 1 file changed, 43 insertions(+), 24 deletions(-) diff --git a/sql/2024/third-parties/top20_third_parties_by_client_and_frame_location.sql b/sql/2024/third-parties/top20_third_parties_by_client_and_frame_location.sql index 5ef27aad2c4..d9f80765471 100644 --- a/sql/2024/third-parties/top20_third_parties_by_client_and_frame_location.sql +++ b/sql/2024/third-parties/top20_third_parties_by_client_and_frame_location.sql @@ -7,15 +7,17 @@ WITH document_frameid AS ( NET.HOST(page) AS page_host, CASE WHEN is_main_document = true AND NET.HOST(page) = NET.HOST(url) - THEN "mainframe" - ELSE "iframe" + THEN 'mainframe' + ELSE 'iframe' END AS frame_type, NET.HOST(url) AS frame_host, - JSON_EXTRACT_SCALAR(payload, "$._frame_id") AS frame_id - FROM `httparchive.all.requests` AS requests + JSON_EXTRACT_SCALAR(payload, '$._frame_id') AS frame_id + FROM + `httparchive.all.requests` AS requests WHERE - requests.date = "2024-06-01" - AND requests.is_root_page = true + requests.date = '2024-06-01' + AND + requests.is_root_page = true ), combined_frame_counts AS ( SELECT client, @@ -25,23 +27,27 @@ combined_frame_counts AS ( COUNT(frame_id) AS num_total_frameids, CASE WHEN COUNT(DISTINCT frame_type) = 1 AND MAX(CASE WHEN frame_type = 'mainframe' THEN 1 ELSE 0 END) = 1 - THEN "mainframe-only" + THEN 'mainframe-only' WHEN COUNT(DISTINCT frame_type) = 1 AND MAX(CASE WHEN frame_type = 'iframe' THEN 1 ELSE 0 END) = 1 - THEN "iframe-only" + THEN 'iframe-only' WHEN COUNT(DISTINCT frame_id) >= 2 and COUNT(DISTINCT frame_type) = 2 - THEN "both" + THEN 'both' END AS frame_presence, - FROM document_frameid - GROUP BY client, page_host, frame_host + FROM + document_frameid + GROUP BY + client, + page_host, + frame_host ), grouped_data AS ( SELECT client, frame_host, COUNT(DISTINCT page_host) AS total_distinct_publisher_count, - COUNT(DISTINCT CASE WHEN frame_presence = "mainframe-only" THEN page_host ELSE NULL END) AS 
num_distinct_publishers_mainframe_only, - COUNT(DISTINCT CASE WHEN frame_presence = "iframe-only" THEN page_host ELSE NULL END) AS num_distinct_publishers_iframe_only, - COUNT(DISTINCT CASE WHEN frame_presence = "both" THEN page_host ELSE NULL END) AS num_distinct_publishers_both + COUNT(DISTINCT CASE WHEN frame_presence = 'mainframe-only' THEN page_host ELSE NULL END) AS num_distinct_publishers_mainframe_only, + COUNT(DISTINCT CASE WHEN frame_presence = 'iframe-only' THEN page_host ELSE NULL END) AS num_distinct_publishers_iframe_only, + COUNT(DISTINCT CASE WHEN frame_presence = 'both' THEN page_host ELSE NULL END) AS num_distinct_publishers_both FROM combined_frame_counts GROUP BY client, frame_host ), @@ -67,16 +73,29 @@ SELECT rank_both, num_distinct_publishers_both, CASE - WHEN rank_mainframe <= 20 THEN 'mainframe' - WHEN rank_iframe <= 20 THEN 'iframe' - WHEN rank_both <= 20 THEN 'both' + WHEN rank_mainframe <= 20 + THEN 'mainframe' + WHEN rank_iframe <= 20 + THEN 'iframe' + WHEN rank_both <= 20 + THEN 'both' END AS category -FROM ranked_publishers -WHERE rank_mainframe <= 20 OR rank_iframe <= 20 OR rank_both <= 20 -ORDER BY client, -category, +FROM + ranked_publishers +WHERE + rank_mainframe <= 20 +OR + rank_iframe <= 20 +OR + rank_both <= 20 +ORDER BY + client, + category, CASE category - WHEN 'mainframe' THEN num_distinct_publishers_mainframe_only - WHEN 'iframe' THEN num_distinct_publishers_iframe_only - WHEN 'both' THEN num_distinct_publishers_both + WHEN 'mainframe' + THEN num_distinct_publishers_mainframe_only + WHEN 'iframe' + THEN num_distinct_publishers_iframe_only + WHEN 'both' + THEN num_distinct_publishers_both END DESC; From d2c6e24c7acb46a2f81cbefc06e3b48320fee642 Mon Sep 17 00:00:00 2001 From: Mike Gifford Date: Mon, 26 Aug 2024 20:15:04 +0200 Subject: [PATCH 12/23] Update top20_third_parties_by_client_and_frame_location.sql trailing white spaces. 
--- ...hird_parties_by_client_and_frame_location.sql | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/sql/2024/third-parties/top20_third_parties_by_client_and_frame_location.sql b/sql/2024/third-parties/top20_third_parties_by_client_and_frame_location.sql index d9f80765471..d7e79704c92 100644 --- a/sql/2024/third-parties/top20_third_parties_by_client_and_frame_location.sql +++ b/sql/2024/third-parties/top20_third_parties_by_client_and_frame_location.sql @@ -12,7 +12,7 @@ WITH document_frameid AS ( END AS frame_type, NET.HOST(url) AS frame_host, JSON_EXTRACT_SCALAR(payload, '$._frame_id') AS frame_id - FROM + FROM `httparchive.all.requests` AS requests WHERE requests.date = '2024-06-01' @@ -33,11 +33,11 @@ combined_frame_counts AS ( WHEN COUNT(DISTINCT frame_id) >= 2 and COUNT(DISTINCT frame_type) = 2 THEN 'both' END AS frame_presence, - FROM + FROM document_frameid - GROUP BY - client, - page_host, + GROUP BY + client, + page_host, frame_host ), grouped_data AS ( @@ -83,12 +83,12 @@ SELECT FROM ranked_publishers WHERE - rank_mainframe <= 20 + rank_mainframe <= 20 OR - rank_iframe <= 20 + rank_iframe <= 20 OR rank_both <= 20 -ORDER BY +ORDER BY client, category, CASE category From ab156017b719f9fec041086fc2400f9e9a8f71d0 Mon Sep 17 00:00:00 2001 From: Mike Gifford Date: Mon, 26 Aug 2024 21:49:34 +0200 Subject: [PATCH 13/23] Update distribution_of_third_parties_by_frame.sql Linting --- ...distribution_of_third_parties_by_frame.sql | 32 +++++++++---------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/sql/2024/third-parties/distribution_of_third_parties_by_frame.sql b/sql/2024/third-parties/distribution_of_third_parties_by_frame.sql index 63a5df6e473..6e02fc46557 100644 --- a/sql/2024/third-parties/distribution_of_third_parties_by_frame.sql +++ b/sql/2024/third-parties/distribution_of_third_parties_by_frame.sql @@ -7,14 +7,14 @@ WITH document_frameid AS ( NET.HOST(page) AS page_host, CASE WHEN is_main_document = true 
AND NET.HOST(page) = NET.HOST(url) - THEN "mainframe" - ELSE "iframe" + THEN 'mainframe' + ELSE 'iframe' END AS frame_type, NET.HOST(url) AS frame_host, - JSON_EXTRACT_SCALAR(payload, "$._frame_id") AS frame_id + JSON_EXTRACT_SCALAR(payload, '$._frame_id') AS frame_id FROM `httparchive.all.requests` AS requests WHERE - requests.date = "2024-06-01" + requests.date = '2024-06-01' AND requests.is_root_page = true ), combined_frame_counts AS ( @@ -25,11 +25,11 @@ combined_frame_counts AS ( COUNT(frame_id) AS num_total_frameids, CASE WHEN COUNT(DISTINCT frame_type) = 1 AND MAX(CASE WHEN frame_type = 'mainframe' THEN 1 ELSE 0 END) = 1 - THEN "mainframe-only" + THEN 'mainframe-only' WHEN COUNT(DISTINCT frame_type) = 1 AND MAX(CASE WHEN frame_type = 'iframe' THEN 1 ELSE 0 END) = 1 - THEN "iframe-only" + THEN 'iframe-only' WHEN COUNT(DISTINCT frame_id) >= 2 and COUNT(DISTINCT frame_type) = 2 - THEN "both" + THEN 'both' END AS frame_presence, FROM document_frameid GROUP BY client, page_host, frame_host @@ -38,14 +38,14 @@ SELECT client, COUNT(DISTINCT frame_host) - 1 AS distinct_third_party_count, COUNT(frame_host) - 1 AS total_third_party_count, - COUNT(DISTINCT CASE WHEN frame_presence = "mainframe-only" THEN page_host ELSE NULL END) AS num_publishers_mainframe_only, - COUNT(DISTINCT CASE WHEN frame_presence = "iframe-only" THEN page_host ELSE NULL END) AS num_publishers_iframe_only, - COUNT(DISTINCT CASE WHEN frame_presence = "both" THEN page_host ELSE NULL END) AS num_publishers_both, - COUNT(DISTINCT CASE WHEN frame_presence = "mainframe-only" THEN frame_host ELSE NULL END) AS distinct_mainframe_third_party_count, - COUNT(DISTINCT CASE WHEN frame_presence = "iframe-only" THEN frame_host ELSE NULL END) AS distinct_iframe_third_party_count, - COUNT(DISTINCT CASE WHEN frame_presence = "both" THEN frame_host ELSE NULL END) AS distinct_both_third_party_count, - COUNT(CASE WHEN frame_presence = "mainframe-only" THEN frame_host ELSE NULL END) AS 
distinct_mainframe_third_party_count, - COUNT(CASE WHEN frame_presence = "iframe-only" THEN frame_host ELSE NULL END) AS distinct_iframe_third_party_count, - COUNT(CASE WHEN frame_presence = "both" THEN frame_host ELSE NULL END) AS distinct_both_third_party_count + COUNT(DISTINCT CASE WHEN frame_presence = 'mainframe-only' THEN page_host ELSE null END) AS num_publishers_mainframe_only, + COUNT(DISTINCT CASE WHEN frame_presence = 'iframe-only' THEN page_host ELSE null END) AS num_publishers_iframe_only, + COUNT(DISTINCT CASE WHEN frame_presence = 'both' THEN page_host ELSE null END) AS num_publishers_both, + COUNT(DISTINCT CASE WHEN frame_presence = 'mainframe-only' THEN frame_host ELSE null END) AS distinct_mainframe_third_party_count, + COUNT(DISTINCT CASE WHEN frame_presence = 'iframe-only' THEN frame_host ELSE null END) AS distinct_iframe_third_party_count, + COUNT(DISTINCT CASE WHEN frame_presence = 'both' THEN frame_host ELSE null END) AS distinct_both_third_party_count, + COUNT(CASE WHEN frame_presence = 'mainframe-only' THEN frame_host ELSE null END) AS distinct_mainframe_third_party_count, + COUNT(CASE WHEN frame_presence = 'iframe-only' THEN frame_host ELSE null END) AS distinct_iframe_third_party_count, + COUNT(CASE WHEN frame_presence = 'both' THEN frame_host ELSE null END) AS distinct_both_third_party_count FROM combined_frame_counts GROUP BY client; From 22a0ea28f0fbd09fd09213eef69398f8351e419e Mon Sep 17 00:00:00 2001 From: Mike Gifford Date: Mon, 26 Aug 2024 21:53:09 +0200 Subject: [PATCH 14/23] Update top20_third_parties_by_client_and_frame_location.sql Linting --- ...d_parties_by_client_and_frame_location.sql | 40 +++++++++---------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/sql/2024/third-parties/top20_third_parties_by_client_and_frame_location.sql b/sql/2024/third-parties/top20_third_parties_by_client_and_frame_location.sql index d7e79704c92..7f8a435615d 100644 --- 
a/sql/2024/third-parties/top20_third_parties_by_client_and_frame_location.sql +++ b/sql/2024/third-parties/top20_third_parties_by_client_and_frame_location.sql @@ -8,15 +8,14 @@ WITH document_frameid AS ( CASE WHEN is_main_document = true AND NET.HOST(page) = NET.HOST(url) THEN 'mainframe' - ELSE 'iframe' + ELSE 'iframe' END AS frame_type, NET.HOST(url) AS frame_host, JSON_EXTRACT_SCALAR(payload, '$._frame_id') AS frame_id FROM `httparchive.all.requests` AS requests WHERE - requests.date = '2024-06-01' - AND + requests.date = '2024-06-01' AND requests.is_root_page = true ), combined_frame_counts AS ( @@ -30,9 +29,10 @@ combined_frame_counts AS ( THEN 'mainframe-only' WHEN COUNT(DISTINCT frame_type) = 1 AND MAX(CASE WHEN frame_type = 'iframe' THEN 1 ELSE 0 END) = 1 THEN 'iframe-only' - WHEN COUNT(DISTINCT frame_id) >= 2 and COUNT(DISTINCT frame_type) = 2 + WHEN COUNT(DISTINCT frame_id) >= 2 AND COUNT(DISTINCT frame_type) = 2 THEN 'both' - END AS frame_presence, + END AS + frame_presence FROM document_frameid GROUP BY @@ -45,9 +45,9 @@ grouped_data AS ( client, frame_host, COUNT(DISTINCT page_host) AS total_distinct_publisher_count, - COUNT(DISTINCT CASE WHEN frame_presence = 'mainframe-only' THEN page_host ELSE NULL END) AS num_distinct_publishers_mainframe_only, - COUNT(DISTINCT CASE WHEN frame_presence = 'iframe-only' THEN page_host ELSE NULL END) AS num_distinct_publishers_iframe_only, - COUNT(DISTINCT CASE WHEN frame_presence = 'both' THEN page_host ELSE NULL END) AS num_distinct_publishers_both + COUNT(DISTINCT CASE WHEN frame_presence = 'mainframe-only' THEN page_host ELSE null END) AS num_distinct_publishers_mainframe_only, + COUNT(DISTINCT CASE WHEN frame_presence = 'iframe-only' THEN page_host ELSE null END) AS num_distinct_publishers_iframe_only, + COUNT(DISTINCT CASE WHEN frame_presence = 'both' THEN page_host ELSE null END) AS num_distinct_publishers_both FROM combined_frame_counts GROUP BY client, frame_host ), @@ -80,18 +80,18 @@ SELECT WHEN rank_both 
<= 20 THEN 'both' END AS category -FROM - ranked_publishers -WHERE - rank_mainframe <= 20 -OR - rank_iframe <= 20 -OR - rank_both <= 20 -ORDER BY - client, - category, -CASE category + FROM + ranked_publishers + WHERE + rank_mainframe <= 20 + OR + rank_iframe <= 20 + OR + rank_both <= 20 + ORDER BY + client, + category, + CASE category WHEN 'mainframe' THEN num_distinct_publishers_mainframe_only WHEN 'iframe' From ed117b7b55abb52ec92fc5bfe1859036d8a4d9f0 Mon Sep 17 00:00:00 2001 From: Yash Vekaria Date: Wed, 11 Sep 2024 13:22:26 -0700 Subject: [PATCH 15/23] lint --- ...distribution_of_third_parties_by_frame.sql | 39 +++++++++---------- ...d_parties_by_client_and_frame_location.sql | 23 ++++------- 2 files changed, 26 insertions(+), 36 deletions(-) diff --git a/sql/2024/third-parties/distribution_of_third_parties_by_frame.sql b/sql/2024/third-parties/distribution_of_third_parties_by_frame.sql index 6e02fc46557..082093754fd 100644 --- a/sql/2024/third-parties/distribution_of_third_parties_by_frame.sql +++ b/sql/2024/third-parties/distribution_of_third_parties_by_frame.sql @@ -7,32 +7,31 @@ WITH document_frameid AS ( NET.HOST(page) AS page_host, CASE WHEN is_main_document = true AND NET.HOST(page) = NET.HOST(url) - THEN 'mainframe' - ELSE 'iframe' + THEN 'mainframe' + ELSE 'iframe' END AS frame_type, NET.HOST(url) AS frame_host, JSON_EXTRACT_SCALAR(payload, '$._frame_id') AS frame_id FROM `httparchive.all.requests` AS requests - WHERE - requests.date = '2024-06-01' - AND requests.is_root_page = true + WHERE requests.date = '2024-06-01' AND requests.is_root_page = true + ), combined_frame_counts AS ( - SELECT client, - page_host, - frame_host, - COUNT(DISTINCT frame_id) AS num_distinct_frameids, - COUNT(frame_id) AS num_total_frameids, - CASE - WHEN COUNT(DISTINCT frame_type) = 1 AND MAX(CASE WHEN frame_type = 'mainframe' THEN 1 ELSE 0 END) = 1 - THEN 'mainframe-only' - WHEN COUNT(DISTINCT frame_type) = 1 AND MAX(CASE WHEN frame_type = 'iframe' THEN 1 ELSE 0 END) = 1 - 
THEN 'iframe-only' - WHEN COUNT(DISTINCT frame_id) >= 2 and COUNT(DISTINCT frame_type) = 2 - THEN 'both' - END AS frame_presence, - FROM document_frameid - GROUP BY client, page_host, frame_host + SELECT client, + page_host, + frame_host, + COUNT(DISTINCT frame_id) AS num_distinct_frameids, + COUNT(frame_id) AS num_total_frameids, + CASE + WHEN COUNT(DISTINCT frame_type) = 1 AND MAX(CASE WHEN frame_type = 'mainframe' THEN 1 ELSE 0 END) = 1 + THEN 'mainframe-only' + WHEN COUNT(DISTINCT frame_type) = 1 AND MAX(CASE WHEN frame_type = 'iframe' THEN 1 ELSE 0 END) = 1 + THEN 'iframe-only' + WHEN COUNT(DISTINCT frame_id) >= 2 and COUNT(DISTINCT frame_type) = 2 + THEN 'both' + END AS frame_presence, + FROM document_frameid + GROUP BY client, page_host, frame_host ) SELECT client, diff --git a/sql/2024/third-parties/top20_third_parties_by_client_and_frame_location.sql b/sql/2024/third-parties/top20_third_parties_by_client_and_frame_location.sql index 7f8a435615d..42b00f49e80 100644 --- a/sql/2024/third-parties/top20_third_parties_by_client_and_frame_location.sql +++ b/sql/2024/third-parties/top20_third_parties_by_client_and_frame_location.sql @@ -31,8 +31,7 @@ combined_frame_counts AS ( THEN 'iframe-only' WHEN COUNT(DISTINCT frame_id) >= 2 AND COUNT(DISTINCT frame_type) = 2 THEN 'both' - END AS - frame_presence + END AS frame_presence FROM document_frameid GROUP BY @@ -80,22 +79,14 @@ SELECT WHEN rank_both <= 20 THEN 'both' END AS category - FROM - ranked_publishers - WHERE - rank_mainframe <= 20 - OR - rank_iframe <= 20 - OR - rank_both <= 20 - ORDER BY - client, - category, + FROM ranked_publishers + WHERE rank_mainframe <= 20 OR rank_iframe <= 20 OR rank_both <= 20 + ORDER BY client, category, CASE category WHEN 'mainframe' - THEN num_distinct_publishers_mainframe_only + THEN num_distinct_publishers_mainframe_only WHEN 'iframe' - THEN num_distinct_publishers_iframe_only + THEN num_distinct_publishers_iframe_only WHEN 'both' - THEN num_distinct_publishers_both + THEN 
num_distinct_publishers_both END DESC; From a72504ced5288ee9786aa9179590ed9abdefa856 Mon Sep 17 00:00:00 2001 From: Yash Vekaria Date: Wed, 11 Sep 2024 13:32:14 -0700 Subject: [PATCH 16/23] lint --- ...distribution_of_third_parties_by_frame.sql | 22 +++++++++---------- ...d_parties_by_client_and_frame_location.sql | 21 +++++++++--------- 2 files changed, 22 insertions(+), 21 deletions(-) diff --git a/sql/2024/third-parties/distribution_of_third_parties_by_frame.sql b/sql/2024/third-parties/distribution_of_third_parties_by_frame.sql index 082093754fd..845bf34e155 100644 --- a/sql/2024/third-parties/distribution_of_third_parties_by_frame.sql +++ b/sql/2024/third-parties/distribution_of_third_parties_by_frame.sql @@ -7,7 +7,7 @@ WITH document_frameid AS ( NET.HOST(page) AS page_host, CASE WHEN is_main_document = true AND NET.HOST(page) = NET.HOST(url) - THEN 'mainframe' + THEN 'mainframe' ELSE 'iframe' END AS frame_type, NET.HOST(url) AS frame_host, @@ -18,18 +18,18 @@ WITH document_frameid AS ( ), combined_frame_counts AS ( SELECT client, - page_host, - frame_host, - COUNT(DISTINCT frame_id) AS num_distinct_frameids, - COUNT(frame_id) AS num_total_frameids, - CASE + page_host, + frame_host, + COUNT(DISTINCT frame_id) AS num_distinct_frameids, + COUNT(frame_id) AS num_total_frameids, + CASE WHEN COUNT(DISTINCT frame_type) = 1 AND MAX(CASE WHEN frame_type = 'mainframe' THEN 1 ELSE 0 END) = 1 - THEN 'mainframe-only' + THEN 'mainframe-only' WHEN COUNT(DISTINCT frame_type) = 1 AND MAX(CASE WHEN frame_type = 'iframe' THEN 1 ELSE 0 END) = 1 - THEN 'iframe-only' - WHEN COUNT(DISTINCT frame_id) >= 2 and COUNT(DISTINCT frame_type) = 2 - THEN 'both' - END AS frame_presence, + THEN 'iframe-only' + WHEN COUNT(DISTINCT frame_id) >= 2 AND COUNT(DISTINCT frame_type) = 2 + THEN 'both' + END AS frame_presence FROM document_frameid GROUP BY client, page_host, frame_host ) diff --git a/sql/2024/third-parties/top20_third_parties_by_client_and_frame_location.sql 
b/sql/2024/third-parties/top20_third_parties_by_client_and_frame_location.sql index 42b00f49e80..c686369a309 100644 --- a/sql/2024/third-parties/top20_third_parties_by_client_and_frame_location.sql +++ b/sql/2024/third-parties/top20_third_parties_by_client_and_frame_location.sql @@ -79,14 +79,15 @@ SELECT WHEN rank_both <= 20 THEN 'both' END AS category - FROM ranked_publishers - WHERE rank_mainframe <= 20 OR rank_iframe <= 20 OR rank_both <= 20 - ORDER BY client, category, +FROM ranked_publishers +WHERE rank_mainframe <= 20 OR rank_iframe <= 20 OR rank_both <= 20 +ORDER BY client, category, CASE category - WHEN 'mainframe' - THEN num_distinct_publishers_mainframe_only - WHEN 'iframe' - THEN num_distinct_publishers_iframe_only - WHEN 'both' - THEN num_distinct_publishers_both -END DESC; + WHEN 'mainframe' + THEN num_distinct_publishers_mainframe_only + WHEN 'iframe' + THEN num_distinct_publishers_iframe_only + WHEN 'both' + THEN num_distinct_publishers_both + END +DESC; From 549e1ef4fa641d6223a8daac44aa973af6b96c66 Mon Sep 17 00:00:00 2001 From: Yash Vekaria Date: Wed, 11 Sep 2024 13:41:07 -0700 Subject: [PATCH 17/23] lint --- .../distribution_of_third_parties_by_frame.sql | 12 ++++++------ ...20_third_parties_by_client_and_frame_location.sql | 6 +++--- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/sql/2024/third-parties/distribution_of_third_parties_by_frame.sql b/sql/2024/third-parties/distribution_of_third_parties_by_frame.sql index 845bf34e155..fd31e5d4388 100644 --- a/sql/2024/third-parties/distribution_of_third_parties_by_frame.sql +++ b/sql/2024/third-parties/distribution_of_third_parties_by_frame.sql @@ -23,12 +23,12 @@ combined_frame_counts AS ( COUNT(DISTINCT frame_id) AS num_distinct_frameids, COUNT(frame_id) AS num_total_frameids, CASE - WHEN COUNT(DISTINCT frame_type) = 1 AND MAX(CASE WHEN frame_type = 'mainframe' THEN 1 ELSE 0 END) = 1 - THEN 'mainframe-only' - WHEN COUNT(DISTINCT frame_type) = 1 AND MAX(CASE WHEN frame_type = 'iframe' 
THEN 1 ELSE 0 END) = 1 - THEN 'iframe-only' - WHEN COUNT(DISTINCT frame_id) >= 2 AND COUNT(DISTINCT frame_type) = 2 - THEN 'both' + WHEN COUNT(DISTINCT frame_type) = 1 AND MAX(CASE WHEN frame_type = 'mainframe' THEN 1 ELSE 0 END) = 1 + THEN 'mainframe-only' + WHEN COUNT(DISTINCT frame_type) = 1 AND MAX(CASE WHEN frame_type = 'iframe' THEN 1 ELSE 0 END) = 1 + THEN 'iframe-only' + WHEN COUNT(DISTINCT frame_id) >= 2 AND COUNT(DISTINCT frame_type) = 2 + THEN 'both' END AS frame_presence FROM document_frameid GROUP BY client, page_host, frame_host diff --git a/sql/2024/third-parties/top20_third_parties_by_client_and_frame_location.sql b/sql/2024/third-parties/top20_third_parties_by_client_and_frame_location.sql index c686369a309..8f3e9d38388 100644 --- a/sql/2024/third-parties/top20_third_parties_by_client_and_frame_location.sql +++ b/sql/2024/third-parties/top20_third_parties_by_client_and_frame_location.sql @@ -88,6 +88,6 @@ ORDER BY client, category, WHEN 'iframe' THEN num_distinct_publishers_iframe_only WHEN 'both' - THEN num_distinct_publishers_both - END -DESC; + THEN num_distinct_publishers_both + END + DESC; From 8ec5b7055bf66c82657c7bfe24f0e18b7044ff15 Mon Sep 17 00:00:00 2001 From: Yash Vekaria Date: Wed, 11 Sep 2024 14:38:00 -0700 Subject: [PATCH 18/23] Ported lighthouse_average_unminified_css_by_3p.sql --- .../lighthouse_average_unminified_css_by_3p.sql | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/sql/2024/third-parties/lighthouse_average_unminified_css_by_3p.sql b/sql/2024/third-parties/lighthouse_average_unminified_css_by_3p.sql index f888b2646da..325a762a4d8 100644 --- a/sql/2024/third-parties/lighthouse_average_unminified_css_by_3p.sql +++ b/sql/2024/third-parties/lighthouse_average_unminified_css_by_3p.sql @@ -24,16 +24,18 @@ FROM ( SUM(IF(is_3p, wasted_bytes, 0)) / SUM(wasted_bytes) AS pct_3p_wasted_bytes FROM ( SELECT - _TABLE_SUFFIX AS client, - lighthouse.url AS page, + client, + page, NET.HOST(unminified.url) IS 
NOT NULL AND NET.HOST(unminified.url) IN ( SELECT domain FROM `httparchive.almanac.third_parties` WHERE date = '2024-06-01' AND category != 'hosting' ) AS is_3p, unminified.wastedBytes AS wasted_bytes FROM - `httparchive.lighthouse.2024_06_01_*` AS lighthouse, - UNNEST(getUnminifiedJsUrls(JSON_EXTRACT(report, "$.audits['unminified-css']"))) AS unminified - ) + `httparchive.all.pages` AS allpages + CROSS JOIN + UNNEST(getUnminifiedJsUrls(JSON_EXTRACT(allpages.lighthouse, "$.audits['unminified-css']"))) AS unminified + WHERE allpages.date = '2024-06-01' AND allpages.is_root_page = true + ) GROUP BY client, page From 62ef475cba2506cac1a5707e4bf2594d20316310 Mon Sep 17 00:00:00 2001 From: Yash Vekaria Date: Fri, 13 Sep 2024 16:29:47 -0700 Subject: [PATCH 19/23] Bug fixes --- .../a11y_overall_tech_usage_by_rank.sql | 2 +- .../third-parties/a11y_technology_usage.sql | 2 +- .../a11y_technology_usage_by_rank.sql | 2 +- ...distribution_of_third_parties_by_frame.sql | 79 +++++++++++++------ .../number_of_third_parties_by_rank.sql | 18 +++-- ...d_parties_by_client_and_frame_location.sql | 52 +++++++----- 6 files changed, 102 insertions(+), 53 deletions(-) diff --git a/sql/2024/third-parties/a11y_overall_tech_usage_by_rank.sql b/sql/2024/third-parties/a11y_overall_tech_usage_by_rank.sql index 20ffcd3ff1d..df65fa3359c 100644 --- a/sql/2024/third-parties/a11y_overall_tech_usage_by_rank.sql +++ b/sql/2024/third-parties/a11y_overall_tech_usage_by_rank.sql @@ -43,7 +43,7 @@ SELECT rank_grouping AS rank, COUNT(DISTINCT url) AS freq, total, - COUNT(DISTINCT url) / total AS pct + (COUNT(DISTINCT url) / total) * 100 AS pct FROM a11y_technologies LEFT OUTER JOIN diff --git a/sql/2024/third-parties/a11y_technology_usage.sql b/sql/2024/third-parties/a11y_technology_usage.sql index 926cd566fc7..ca76cdc1053 100644 --- a/sql/2024/third-parties/a11y_technology_usage.sql +++ b/sql/2024/third-parties/a11y_technology_usage.sql @@ -27,7 +27,7 @@ SELECT client, freq, total, - freq / total AS pct + 
(freq / total) * 100 AS pct FROM a11y_technologies JOIN diff --git a/sql/2024/third-parties/a11y_technology_usage_by_rank.sql b/sql/2024/third-parties/a11y_technology_usage_by_rank.sql index 237b8d167e6..d2cc1da3c0a 100644 --- a/sql/2024/third-parties/a11y_technology_usage_by_rank.sql +++ b/sql/2024/third-parties/a11y_technology_usage_by_rank.sql @@ -45,7 +45,7 @@ SELECT app, COUNT(0) AS freq, total, - COUNT(0) / total AS pct + (COUNT(0) / total) * 100 AS pct FROM a11y_technologies LEFT OUTER JOIN diff --git a/sql/2024/third-parties/distribution_of_third_parties_by_frame.sql b/sql/2024/third-parties/distribution_of_third_parties_by_frame.sql index fd31e5d4388..8c6b90643dd 100644 --- a/sql/2024/third-parties/distribution_of_third_parties_by_frame.sql +++ b/sql/2024/third-parties/distribution_of_third_parties_by_frame.sql @@ -5,23 +5,41 @@ WITH document_frameid AS ( SELECT client, NET.HOST(page) AS page_host, - CASE - WHEN is_main_document = true AND NET.HOST(page) = NET.HOST(url) - THEN 'mainframe' - ELSE 'iframe' - END AS frame_type, NET.HOST(url) AS frame_host, - JSON_EXTRACT_SCALAR(payload, '$._frame_id') AS frame_id + CASE + WHEN is_main_document = true + THEN JSON_EXTRACT_SCALAR(payload, '$._frame_id') + END AS mainframe_id, + JSON_EXTRACT_SCALAR(payload, '$._frame_id') AS frame_id, + is_main_document FROM `httparchive.all.requests` AS requests WHERE requests.date = '2024-06-01' AND requests.is_root_page = true - +), +page_frames AS ( + SELECT + client, + page_host, + frame_host, + CASE + WHEN frame_host != page_host + THEN true + ELSE false + END AS tp_flag, + is_main_document, + frame_id, + COALESCE(mainframe_id, FIRST_VALUE(mainframe_id) OVER (PARTITION BY page_host ORDER BY is_main_document DESC)) AS mainframe_id, + CASE + WHEN frame_id = COALESCE(mainframe_id, FIRST_VALUE(mainframe_id) OVER (PARTITION BY page_host ORDER BY is_main_document DESC)) + THEN 'mainframe' + ELSE 'iframe' + END AS frame_type + FROM document_frameid ), combined_frame_counts AS ( 
SELECT client, page_host, frame_host, - COUNT(DISTINCT frame_id) AS num_distinct_frameids, - COUNT(frame_id) AS num_total_frameids, + tp_flag, CASE WHEN COUNT(DISTINCT frame_type) = 1 AND MAX(CASE WHEN frame_type = 'mainframe' THEN 1 ELSE 0 END) = 1 THEN 'mainframe-only' @@ -30,21 +48,34 @@ combined_frame_counts AS ( WHEN COUNT(DISTINCT frame_id) >= 2 AND COUNT(DISTINCT frame_type) = 2 THEN 'both' END AS frame_presence - FROM document_frameid - GROUP BY client, page_host, frame_host + FROM page_frames + GROUP BY client, page_host, frame_host, tp_flag +), +aggregated_counts AS ( + SELECT + client, + COUNT(DISTINCT page_host) AS distinct_publisher_count, + COUNT(DISTINCT CASE WHEN tp_flag THEN frame_host ELSE null END) AS distinct_third_party_count, + COUNT(DISTINCT CASE WHEN frame_presence = 'mainframe-only' AND tp_flag THEN page_host ELSE null END) AS distinct_publishers_mainframe_only, + COUNT(DISTINCT CASE WHEN frame_presence = 'iframe-only' AND tp_flag THEN page_host ELSE null END) AS distinct_publishers_iframe_only, + COUNT(DISTINCT CASE WHEN frame_presence = 'both' AND tp_flag THEN page_host ELSE null END) AS distinct_publishers_both, + COUNT(DISTINCT CASE WHEN frame_presence = 'mainframe-only' AND tp_flag THEN frame_host ELSE null END) AS distinct_mainframe_third_party_count, + COUNT(DISTINCT CASE WHEN frame_presence = 'iframe-only' AND tp_flag THEN frame_host ELSE null END) AS distinct_iframe_third_party_count, + COUNT(DISTINCT CASE WHEN frame_presence = 'both' AND tp_flag THEN frame_host ELSE null END) AS distinct_both_third_party_count, + FROM combined_frame_counts + GROUP BY client ) SELECT client, - COUNT(DISTINCT frame_host) - 1 AS distinct_third_party_count, - COUNT(frame_host) - 1 AS total_third_party_count, - COUNT(DISTINCT CASE WHEN frame_presence = 'mainframe-only' THEN page_host ELSE null END) AS num_publishers_mainframe_only, - COUNT(DISTINCT CASE WHEN frame_presence = 'iframe-only' THEN page_host ELSE null END) AS num_publishers_iframe_only, - 
COUNT(DISTINCT CASE WHEN frame_presence = 'both' THEN page_host ELSE null END) AS num_publishers_both, - COUNT(DISTINCT CASE WHEN frame_presence = 'mainframe-only' THEN frame_host ELSE null END) AS distinct_mainframe_third_party_count, - COUNT(DISTINCT CASE WHEN frame_presence = 'iframe-only' THEN frame_host ELSE null END) AS distinct_iframe_third_party_count, - COUNT(DISTINCT CASE WHEN frame_presence = 'both' THEN frame_host ELSE null END) AS distinct_both_third_party_count, - COUNT(CASE WHEN frame_presence = 'mainframe-only' THEN frame_host ELSE null END) AS distinct_mainframe_third_party_count, - COUNT(CASE WHEN frame_presence = 'iframe-only' THEN frame_host ELSE null END) AS distinct_iframe_third_party_count, - COUNT(CASE WHEN frame_presence = 'both' THEN frame_host ELSE null END) AS distinct_both_third_party_count -FROM combined_frame_counts -GROUP BY client; + distinct_publisher_count, + distinct_third_party_count, + distinct_publishers_mainframe_only, + distinct_publishers_iframe_only, + distinct_publishers_both, + distinct_mainframe_third_party_count, + distinct_mainframe_third_party_count/distinct_third_party_count AS pct_tps_in_mainframe_only, + distinct_iframe_third_party_count, + distinct_iframe_third_party_count/distinct_third_party_count AS pct_tps_in_iframe_only, + distinct_both_third_party_count, + distinct_both_third_party_count/distinct_third_party_count AS pct_tps_in_both +FROM aggregated_counts; diff --git a/sql/2024/third-parties/number_of_third_parties_by_rank.sql b/sql/2024/third-parties/number_of_third_parties_by_rank.sql index ce978a3916d..3d5ccee75e7 100644 --- a/sql/2024/third-parties/number_of_third_parties_by_rank.sql +++ b/sql/2024/third-parties/number_of_third_parties_by_rank.sql @@ -2,20 +2,26 @@ # Number of third-parties per websites by rank WITH requests AS ( SELECT - _TABLE_SUFFIX AS client, - pageid AS page, + client, + page, url FROM - `httparchive.summary_requests.2024_06_01_*` + `httparchive.all.requests` AS req + WHERE + 
req.date = '2024-06-01' AND + req.is_root_page = true ), pages AS ( SELECT - _TABLE_SUFFIX AS client, - pageid AS page, + client, + page, rank FROM - `httparchive.summary_pages.2024_06_01_*` + `httparchive.all.pages` AS pg + WHERE + pg.date = '2024-06-01' AND + pg.is_root_page = TRUE ), third_party AS ( diff --git a/sql/2024/third-parties/top20_third_parties_by_client_and_frame_location.sql b/sql/2024/third-parties/top20_third_parties_by_client_and_frame_location.sql index 8f3e9d38388..dc116024c19 100644 --- a/sql/2024/third-parties/top20_third_parties_by_client_and_frame_location.sql +++ b/sql/2024/third-parties/top20_third_parties_by_client_and_frame_location.sql @@ -5,25 +5,41 @@ WITH document_frameid AS ( SELECT client, NET.HOST(page) AS page_host, + NET.HOST(url) AS frame_host, CASE - WHEN is_main_document = true AND NET.HOST(page) = NET.HOST(url) + WHEN is_main_document = true + THEN JSON_EXTRACT_SCALAR(payload, '$._frame_id') + END AS mainframe_id, + JSON_EXTRACT_SCALAR(payload, '$._frame_id') AS frame_id, + is_main_document + FROM `httparchive.all.requests` AS requests + WHERE requests.date = '2024-06-01' AND requests.is_root_page = true +), +page_frames AS ( + SELECT + client, + page_host, + frame_host, + CASE + WHEN frame_host != page_host + THEN true + ELSE false + END AS tp_flag, + is_main_document, + frame_id, + COALESCE(mainframe_id, FIRST_VALUE(mainframe_id) OVER (PARTITION BY page_host ORDER BY is_main_document DESC)) AS mainframe_id, + CASE + WHEN frame_id = COALESCE(mainframe_id, FIRST_VALUE(mainframe_id) OVER (PARTITION BY page_host ORDER BY is_main_document DESC)) THEN 'mainframe' ELSE 'iframe' - END AS frame_type, - NET.HOST(url) AS frame_host, - JSON_EXTRACT_SCALAR(payload, '$._frame_id') AS frame_id - FROM - `httparchive.all.requests` AS requests - WHERE - requests.date = '2024-06-01' AND - requests.is_root_page = true + END AS frame_type + FROM document_frameid ), combined_frame_counts AS ( SELECT client, page_host, frame_host, - 
COUNT(DISTINCT frame_id) AS num_distinct_frameids, - COUNT(frame_id) AS num_total_frameids, + tp_flag, CASE WHEN COUNT(DISTINCT frame_type) = 1 AND MAX(CASE WHEN frame_type = 'mainframe' THEN 1 ELSE 0 END) = 1 THEN 'mainframe-only' @@ -32,21 +48,17 @@ combined_frame_counts AS ( WHEN COUNT(DISTINCT frame_id) >= 2 AND COUNT(DISTINCT frame_type) = 2 THEN 'both' END AS frame_presence - FROM - document_frameid - GROUP BY - client, - page_host, - frame_host + FROM page_frames + GROUP BY client, page_host, frame_host, tp_flag ), grouped_data AS ( SELECT client, frame_host, COUNT(DISTINCT page_host) AS total_distinct_publisher_count, - COUNT(DISTINCT CASE WHEN frame_presence = 'mainframe-only' THEN page_host ELSE null END) AS num_distinct_publishers_mainframe_only, - COUNT(DISTINCT CASE WHEN frame_presence = 'iframe-only' THEN page_host ELSE null END) AS num_distinct_publishers_iframe_only, - COUNT(DISTINCT CASE WHEN frame_presence = 'both' THEN page_host ELSE null END) AS num_distinct_publishers_both + COUNT(DISTINCT CASE WHEN frame_presence = 'mainframe-only' AND tp_flag THEN page_host ELSE null END) AS num_distinct_publishers_mainframe_only, + COUNT(DISTINCT CASE WHEN frame_presence = 'iframe-only' AND tp_flag THEN page_host ELSE null END) AS num_distinct_publishers_iframe_only, + COUNT(DISTINCT CASE WHEN frame_presence = 'both' AND tp_flag THEN page_host ELSE null END) AS num_distinct_publishers_both FROM combined_frame_counts GROUP BY client, frame_host ), From 280b93efcb03368dc8648f7d203edfae7576e143 Mon Sep 17 00:00:00 2001 From: Yash Vekaria Date: Fri, 13 Sep 2024 22:21:53 -0700 Subject: [PATCH 20/23] lint --- ...distribution_of_third_parties_by_frame.sql | 10 ++-- .../number_of_third_parties_by_rank.sql | 2 +- ...d_parties_by_client_and_frame_location.sql | 47 +++++++++---------- 3 files changed, 27 insertions(+), 32 deletions(-) diff --git a/sql/2024/third-parties/distribution_of_third_parties_by_frame.sql 
b/sql/2024/third-parties/distribution_of_third_parties_by_frame.sql index 8c6b90643dd..332f4b297d2 100644 --- a/sql/2024/third-parties/distribution_of_third_parties_by_frame.sql +++ b/sql/2024/third-parties/distribution_of_third_parties_by_frame.sql @@ -21,7 +21,7 @@ page_frames AS ( page_host, frame_host, CASE - WHEN frame_host != page_host + WHEN frame_host != page_host THEN true ELSE false END AS tp_flag, @@ -61,7 +61,7 @@ aggregated_counts AS ( COUNT(DISTINCT CASE WHEN frame_presence = 'both' AND tp_flag THEN page_host ELSE null END) AS distinct_publishers_both, COUNT(DISTINCT CASE WHEN frame_presence = 'mainframe-only' AND tp_flag THEN frame_host ELSE null END) AS distinct_mainframe_third_party_count, COUNT(DISTINCT CASE WHEN frame_presence = 'iframe-only' AND tp_flag THEN frame_host ELSE null END) AS distinct_iframe_third_party_count, - COUNT(DISTINCT CASE WHEN frame_presence = 'both' AND tp_flag THEN frame_host ELSE null END) AS distinct_both_third_party_count, + COUNT(DISTINCT CASE WHEN frame_presence = 'both' AND tp_flag THEN frame_host ELSE null END) AS distinct_both_third_party_count FROM combined_frame_counts GROUP BY client ) @@ -73,9 +73,9 @@ SELECT distinct_publishers_iframe_only, distinct_publishers_both, distinct_mainframe_third_party_count, - distinct_mainframe_third_party_count/distinct_third_party_count AS pct_tps_in_mainframe_only, + distinct_mainframe_third_party_count / distinct_third_party_count AS pct_tps_in_mainframe_only, distinct_iframe_third_party_count, - distinct_iframe_third_party_count/distinct_third_party_count AS pct_tps_in_iframe_only, + distinct_iframe_third_party_count / distinct_third_party_count AS pct_tps_in_iframe_only, distinct_both_third_party_count, - distinct_both_third_party_count/distinct_third_party_count AS pct_tps_in_both + distinct_both_third_party_count / distinct_third_party_count AS pct_tps_in_both FROM aggregated_counts; diff --git a/sql/2024/third-parties/number_of_third_parties_by_rank.sql 
b/sql/2024/third-parties/number_of_third_parties_by_rank.sql index 3d5ccee75e7..bca7dbf21fe 100644 --- a/sql/2024/third-parties/number_of_third_parties_by_rank.sql +++ b/sql/2024/third-parties/number_of_third_parties_by_rank.sql @@ -21,7 +21,7 @@ pages AS ( `httparchive.all.pages` AS pg WHERE pg.date = '2024-06-01' AND - pg.is_root_page = TRUE + pg.is_root_page = true ), third_party AS ( diff --git a/sql/2024/third-parties/top20_third_parties_by_client_and_frame_location.sql b/sql/2024/third-parties/top20_third_parties_by_client_and_frame_location.sql index dc116024c19..1597d6250b7 100644 --- a/sql/2024/third-parties/top20_third_parties_by_client_and_frame_location.sql +++ b/sql/2024/third-parties/top20_third_parties_by_client_and_frame_location.sql @@ -21,7 +21,7 @@ page_frames AS ( page_host, frame_host, CASE - WHEN frame_host != page_host + WHEN frame_host != page_host THEN true ELSE false END AS tp_flag, @@ -77,29 +77,24 @@ ranked_publishers AS ( SELECT client, frame_host, - rank_mainframe, - num_distinct_publishers_mainframe_only, - rank_iframe, - num_distinct_publishers_iframe_only, - rank_both, - num_distinct_publishers_both, - CASE - WHEN rank_mainframe <= 20 - THEN 'mainframe' - WHEN rank_iframe <= 20 - THEN 'iframe' - WHEN rank_both <= 20 - THEN 'both' - END AS category + num_distinct_publishers_mainframe_only AS num_distinct_publishers, + 'mainframe' AS category FROM ranked_publishers -WHERE rank_mainframe <= 20 OR rank_iframe <= 20 OR rank_both <= 20 -ORDER BY client, category, - CASE category - WHEN 'mainframe' - THEN num_distinct_publishers_mainframe_only - WHEN 'iframe' - THEN num_distinct_publishers_iframe_only - WHEN 'both' - THEN num_distinct_publishers_both - END - DESC; +WHERE rank_mainframe <= 20 AND num_distinct_publishers_mainframe_only > 0 +UNION ALL +SELECT + client, + frame_host, + num_distinct_publishers_iframe_only AS num_distinct_publishers, + 'iframe' AS category +FROM ranked_publishers +WHERE rank_iframe <= 20 AND 
num_distinct_publishers_iframe_only > 0 +UNION ALL +SELECT + client, + frame_host, + num_distinct_publishers_both AS num_distinct_publishers, + 'both' AS category +FROM ranked_publishers +WHERE rank_both <= 20 AND num_distinct_publishers_both > 0 +ORDER BY client, category, num_distinct_publishers DESC; From 4f50122f98ee956f9da5e2c1a317edabe575b639 Mon Sep 17 00:00:00 2001 From: Yash Vekaria Date: Tue, 1 Oct 2024 17:48:11 -0700 Subject: [PATCH 21/23] Added third-party requests by rank sql --- ...number_of_third_party_requests_by_rank.sql | 57 +++++++++++++++++++ 1 file changed, 57 insertions(+) create mode 100644 sql/2024/third-parties/number_of_third_party_requests_by_rank.sql diff --git a/sql/2024/third-parties/number_of_third_party_requests_by_rank.sql b/sql/2024/third-parties/number_of_third_party_requests_by_rank.sql new file mode 100644 index 00000000000..543fe4a2b14 --- /dev/null +++ b/sql/2024/third-parties/number_of_third_party_requests_by_rank.sql @@ -0,0 +1,57 @@ +#standardSQL +# Number of third-party requests by rank +WITH requests AS ( + SELECT + client, + page, + url + FROM + `httparchive.all.requests` AS req + WHERE + req.date = '2024-06-01' AND + req.is_root_page = true +), +pages AS ( + SELECT + client, + page, + rank + FROM + `httparchive.all.pages` AS pg + WHERE + pg.date = '2024-06-01' AND + pg.is_root_page = true +), +third_party AS ( + SELECT + tp.client, + tp.rank, + COUNT(DISTINCT r.url) AS distinct_tp_requests, + COUNT(r.url) AS tp_requests, + rank_grouping + FROM + pages tp + INNER JOIN + requests r + ON NET.HOST(r.page) = NET.HOST(tp.page) AND r.client = tp.client + CROSS JOIN UNNEST([1000, 10000, 100000, 1000000, 10000000]) AS rank_grouping + WHERE + tp.rank <= rank_grouping + GROUP BY + tp.client, + tp.rank, + rank_grouping +) +SELECT + client, + rank_grouping, + APPROX_QUANTILES(distinct_tp_requests, 1000)[OFFSET(500)] AS median_distinct_tp_requests, + APPROX_QUANTILES(tp_requests, 1000)[OFFSET(500)] AS median_tp_requests +FROM + 
third_party
+GROUP BY
+  client,
+  rank_grouping
+ORDER BY
+  client,
+  rank_grouping;
\ No newline at end of file

From 5cf72ec0bcb0ce34fc75f15851dfe4d170f4b560 Mon Sep 17 00:00:00 2001
From: Yash Vekaria
Date: Tue, 1 Oct 2024 18:09:00 -0700
Subject: [PATCH 22/23] lint

---
 .../third-parties/number_of_third_party_requests_by_rank.sql | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sql/2024/third-parties/number_of_third_party_requests_by_rank.sql b/sql/2024/third-parties/number_of_third_party_requests_by_rank.sql
index 543fe4a2b14..ed9aa5c4d12 100644
--- a/sql/2024/third-parties/number_of_third_party_requests_by_rank.sql
+++ b/sql/2024/third-parties/number_of_third_party_requests_by_rank.sql
@@ -54,4 +54,4 @@ GROUP BY
   rank_grouping
 ORDER BY
   client,
-  rank_grouping;
\ No newline at end of file
+  rank_grouping;

From abe326dfb613fe9d7977beed7bda802221a0dab3 Mon Sep 17 00:00:00 2001
From: Yash Vekaria
Date: Tue, 22 Oct 2024 00:26:49 -0700
Subject: [PATCH 23/23] Added third-party requests per page by rank for this year's chapter

---
Review note: the SQL in this patch was amended during review (third-party
filter added, join keyed on client + page). The abbreviated blob id on the
`index` line below predates that change.

 ..._third_party_requests_per_page_by_rank.sql | 79 +++++++++++++++++++
 1 file changed, 79 insertions(+)
 create mode 100644 sql/2024/third-parties/number_of_third_party_requests_per_page_by_rank.sql

diff --git a/sql/2024/third-parties/number_of_third_party_requests_per_page_by_rank.sql b/sql/2024/third-parties/number_of_third_party_requests_per_page_by_rank.sql
new file mode 100644
index 00000000000..3d252a27184
--- /dev/null
+++ b/sql/2024/third-parties/number_of_third_party_requests_per_page_by_rank.sql
@@ -0,0 +1,79 @@
+#standardSQL
+# Number of third-party requests per page by rank
+#
+# A request counts as third-party when its host appears in
+# `httparchive.almanac.third_parties` for the crawl date (hosting excluded).
+# Pages whose requests are all first-party are kept with a count of zero.
+WITH requests AS (
+  SELECT
+    client,
+    page,
+    url
+  FROM
+    `httparchive.all.requests` AS req
+  WHERE
+    req.date = '2024-06-01' AND
+    req.is_root_page = true
+),
+pages AS (
+  SELECT
+    client,
+    page,
+    rank
+  FROM
+    `httparchive.all.pages` AS pg
+  WHERE
+    pg.date = '2024-06-01' AND
+    pg.is_root_page = true
+),
+# Hosts classified as third parties for this crawl.
+third_party_domains AS (
+  SELECT DISTINCT
+    domain
+  FROM
+    `httparchive.almanac.third_parties`
+  WHERE
+    date = '2024-06-01' AND
+    category != 'hosting'
+),
+# One row per (client, page) with its third-party request counts.
+third_party AS (
+  SELECT
+    pg.client,
+    pg.page,
+    pg.rank,
+    COUNT(DISTINCT CASE WHEN tpd.domain IS NOT NULL THEN r.url ELSE null END) AS distinct_tp_requests,
+    COUNTIF(tpd.domain IS NOT NULL) AS tp_requests
+  FROM
+    pages AS pg
+  INNER JOIN
+    requests AS r
+  ON
+    r.client = pg.client AND
+    r.page = pg.page
+  LEFT JOIN
+    third_party_domains AS tpd
+  ON
+    NET.HOST(r.url) = tpd.domain
+  GROUP BY
+    pg.client,
+    pg.page,
+    pg.rank
+)
+SELECT
+  client,
+  rank_grouping,
+  APPROX_QUANTILES(distinct_tp_requests, 1000)[OFFSET(500)] AS p50_distinct_tp_requests_per_page,
+  APPROX_QUANTILES(tp_requests, 1000)[OFFSET(500)] AS p50_tp_requests_per_page
+FROM
+  third_party
+CROSS JOIN
+  UNNEST([1000, 10000, 100000, 1000000, 10000000]) AS rank_grouping
+WHERE
+  rank <= rank_grouping
+GROUP BY
+  client,
+  rank_grouping
+ORDER BY
+  client,
+  rank_grouping;