Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Third-party 2024 queries #3722

Merged
merged 23 commits into from
Oct 30, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
6b20201
A11Y technology usage queries
Yash-Vekaria Aug 13, 2024
4184b12
lighthouse and distribution related queries
Yash-Vekaria Aug 14, 2024
80ab481
Percentage-based analysis queries
Yash-Vekaria Aug 14, 2024
5328a7e
third-party blocking queries
Yash-Vekaria Aug 14, 2024
a5735e2
CSP frequency
Yash-Vekaria Aug 15, 2024
783a67e
Added mainframe vs iframe analysis
Yash-Vekaria Aug 16, 2024
70c33e5
Minor comments
Yash-Vekaria Aug 24, 2024
245e005
Update distribution_of_third_parties_by_frame.sql
mgifford Aug 26, 2024
34d5c78
Update top20_third_parties_by_client_and_frame_location.sql
mgifford Aug 26, 2024
3c19e8f
Update csp_allowed_host_frequency.sql
mgifford Aug 26, 2024
e0faed5
Update top20_third_parties_by_client_and_frame_location.sql
mgifford Aug 26, 2024
d2c6e24
Update top20_third_parties_by_client_and_frame_location.sql
mgifford Aug 26, 2024
ab15601
Update distribution_of_third_parties_by_frame.sql
mgifford Aug 26, 2024
22a0ea2
Update top20_third_parties_by_client_and_frame_location.sql
mgifford Aug 26, 2024
ed117b7
lint
Yash-Vekaria Sep 11, 2024
a72504c
lint
Yash-Vekaria Sep 11, 2024
549e1ef
lint
Yash-Vekaria Sep 11, 2024
8ec5b70
Ported lighthouse_average_unminified_css_by_3p.sql
Yash-Vekaria Sep 11, 2024
62ef475
Bug fixes
Yash-Vekaria Sep 13, 2024
280b93e
lint
Yash-Vekaria Sep 14, 2024
4f50122
Added third-party requests by rank sql
Yash-Vekaria Oct 2, 2024
5cf72ec
lint
Yash-Vekaria Oct 2, 2024
abe326d
Added third-party requests per page by rank for this year's chapter
Yash-Vekaria Oct 22, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
61 changes: 61 additions & 0 deletions sql/2024/third-parties/a11y_overall_tech_usage_by_rank.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
#standardSQL
# Overall A11Y technology usage by domain rank
#
# For every client and rank bucket, reports the number of distinct pages that
# use at least one technology in the 'Accessibility' category (freq), the
# number of pages in the bucket (total) and freq as a percentage of total (pct).

WITH ranked_pages AS (
  # One row per (page, rank bucket). A page is repeated for every bucket whose
  # threshold is greater than or equal to its rank.
  SELECT
    _TABLE_SUFFIX AS client,
    url,
    rank_grouping
  FROM
    `httparchive.summary_pages.2024_06_01_*`,
    UNNEST([1000, 10000, 100000, 1000000, 10000000]) AS rank_grouping
  WHERE
    rank <= rank_grouping
),

bucket_sizes AS (
  # Number of pages per client and rank bucket; the denominator for pct.
  SELECT
    client,
    rank_grouping,
    COUNT(0) AS total
  FROM
    ranked_pages
  GROUP BY
    client,
    rank_grouping
),

a11y_urls AS (
  # Pages with a detected technology in the 'Accessibility' category.
  SELECT
    _TABLE_SUFFIX AS client,
    url
  FROM
    `httparchive.technologies.2024_06_01_*`
  WHERE
    category = 'Accessibility'
)

SELECT
  client,
  rank_grouping AS rank,
  COUNT(DISTINCT url) AS freq,
  total,
  (COUNT(DISTINCT url) / total) * 100 AS pct
FROM
  a11y_urls
# Rows without a matching page could never satisfy the join on rank_grouping
# below, so an inner join yields the same result as an outer join here.
INNER JOIN
  ranked_pages
USING (client, url)
INNER JOIN
  bucket_sizes
USING (client, rank_grouping)
GROUP BY
  client,
  rank_grouping,
  total
ORDER BY
  client,
  rank
35 changes: 35 additions & 0 deletions sql/2024/third-parties/a11y_technology_usage.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
#standardSQL
# A11Y technology usage
#
# Per client: the number of distinct pages with at least one technology in the
# 'Accessibility' category (freq), the number of pages crawled (total) and
# freq as a percentage of total (pct).

WITH page_totals AS (
  # Denominator: all pages per client.
  SELECT
    _TABLE_SUFFIX AS client,
    COUNT(0) AS total
  FROM
    `httparchive.summary_pages.2024_06_01_*`
  GROUP BY
    client
),

a11y_pages AS (
  # Numerator: distinct pages per client with an 'Accessibility' technology.
  SELECT
    _TABLE_SUFFIX AS client,
    COUNT(DISTINCT url) AS freq
  FROM
    `httparchive.technologies.2024_06_01_*`
  WHERE
    category = 'Accessibility'
  GROUP BY
    client
)

SELECT
  client,
  a11y_pages.freq,
  page_totals.total,
  (a11y_pages.freq / page_totals.total) * 100 AS pct
FROM
  a11y_pages
INNER JOIN
  page_totals
USING (client)
65 changes: 65 additions & 0 deletions sql/2024/third-parties/a11y_technology_usage_by_rank.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
#standardSQL
# A11Y technology usage by domain rank
#
# For every client, rank bucket and 'Accessibility' technology (app), reports
# the number of matching rows (freq), the number of pages in the bucket (total)
# and freq as a percentage of total (pct).

WITH ranked_pages AS (
  # One row per (page, rank bucket). A page is repeated for every bucket whose
  # threshold is greater than or equal to its rank.
  SELECT
    _TABLE_SUFFIX AS client,
    url,
    rank_grouping
  FROM
    `httparchive.summary_pages.2024_06_01_*`,
    UNNEST([1000, 10000, 100000, 1000000, 10000000]) AS rank_grouping
  WHERE
    rank <= rank_grouping
),

bucket_sizes AS (
  # Number of pages per client and rank bucket; the denominator for pct.
  SELECT
    client,
    rank_grouping,
    COUNT(0) AS total
  FROM
    ranked_pages
  GROUP BY
    client,
    rank_grouping
),

a11y_apps AS (
  # One row per technology detection in the 'Accessibility' category.
  SELECT
    _TABLE_SUFFIX AS client,
    app,
    url
  FROM
    `httparchive.technologies.2024_06_01_*`
  WHERE
    category = 'Accessibility'
)

SELECT
  client,
  rank_grouping AS rank,
  app,
  # NOTE(review): freq counts joined rows, not distinct URLs (the overall query
  # uses COUNT(DISTINCT url)). Confirm the technologies table holds at most one
  # row per (client, url, app) within a category.
  COUNT(0) AS freq,
  total,
  (COUNT(0) / total) * 100 AS pct
FROM
  a11y_apps
# Rows without a matching page could never satisfy the join on rank_grouping
# below, so an inner join yields the same result as an outer join here.
INNER JOIN
  ranked_pages
USING (client, url)
INNER JOIN
  bucket_sizes
USING (client, rank_grouping)
GROUP BY
  client,
  rank_grouping,
  total,
  app
ORDER BY
  client,
  rank,
  pct DESC
81 changes: 81 additions & 0 deletions sql/2024/third-parties/compressed_images_by_3p.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
#standardSQL
# Compressed images (excluding SVG) by third parties
#
# For each client, content encoding and third-party domain, reports the
# response body size and request count of gzip/Brotli-encoded non-SVG images,
# together with each row's share of the per-client totals. Only the 100
# domains with the most requests per client and content encoding are returned.

WITH requests AS (
  # Image requests served with gzip or br content encoding, excluding SVG
  # (detected by response content type or by a '.svg' URL suffix).
  SELECT
    _TABLE_SUFFIX AS client,
    pageid AS page,
    url,
    resp_content_encoding AS content_encoding,
    type,
    respBodySize AS size
  FROM
    `httparchive.summary_requests.2024_06_01_*`
  WHERE
    type = 'image' AND (
      resp_content_encoding = 'gzip' OR
      resp_content_encoding = 'br'
    ) AND NOT (
      resp_content_type LIKE 'image/svg%' OR
      ENDS_WITH(url, '.svg')
    )
),

third_party AS (
  # Non-hosting third-party domains whose compressed images appear on at
  # least 50 distinct pages.
  SELECT
    NET.HOST(domain) AS domain,
    COUNT(DISTINCT page) AS page_usage
  FROM
    `httparchive.almanac.third_parties` tp
  JOIN
    requests r
  ON NET.HOST(r.url) = NET.HOST(tp.domain)
  WHERE
    date = '2024-06-01' AND
    category != 'hosting'
  GROUP BY
    domain
  HAVING
    page_usage >= 50
)

SELECT
  client,
  content_encoding,
  domain,
  size,
  total_size,
  pct_size,
  num_requests,
  total_requests,
  pct_requests
FROM (
  SELECT
    client,
    content_encoding,
    domain,
    COUNT(0) AS num_requests,
    SUM(size) AS size,
    # Size totals are computed here, before the domain_rank filter, so that
    # pct_size and pct_requests share the same population (all matched
    # third-party requests of the client). Window functions in the outer
    # query would only see the rows that survive the top-100 filter.
    SUM(SUM(size)) OVER (PARTITION BY client) AS total_size,
    SUM(size) / SUM(SUM(size)) OVER (PARTITION BY client) AS pct_size,
    SUM(COUNT(0)) OVER (PARTITION BY client) AS total_requests,
    COUNT(0) / SUM(COUNT(0)) OVER (PARTITION BY client) AS pct_requests,
    RANK() OVER (PARTITION BY client, type, content_encoding ORDER BY COUNT(0) DESC) AS domain_rank
  FROM
    requests
  # Only requests that match a third-party domain are kept, so this is an
  # inner join rather than a LEFT JOIN followed by an IS NOT NULL filter.
  INNER JOIN
    third_party
  ON
    NET.HOST(requests.url) = NET.HOST(third_party.domain)
  GROUP BY
    client,
    type,
    content_encoding,
    domain
)
WHERE
  domain_rank <= 100
ORDER BY
  client,
  content_encoding,
  size DESC
51 changes: 51 additions & 0 deletions sql/2024/third-parties/content_encoding.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
#standardSQL
# Content-encoding of third-party requests
#
# Per client: the number of third-party requests for each response
# content-encoding value and its share of all third-party requests.

WITH requests AS (
  SELECT
    _TABLE_SUFFIX AS client,
    pageid AS page,
    url,
    resp_content_encoding AS content_encoding
  FROM
    `httparchive.summary_requests.2024_06_01_*`
),

third_party AS (
  # Non-hosting third-party domains requested on at least 50 distinct pages.
  SELECT
    NET.HOST(tp.domain) AS domain,
    COUNT(DISTINCT r.page) AS page_usage
  FROM
    `httparchive.almanac.third_parties` AS tp
  INNER JOIN
    requests AS r
  ON
    NET.HOST(r.url) = NET.HOST(tp.domain)
  WHERE
    tp.date = '2024-06-01' AND
    tp.category != 'hosting'
  GROUP BY
    domain
  HAVING
    page_usage >= 50
)

SELECT
  requests.client,
  requests.content_encoding,
  COUNT(0) AS num_requests,
  SUM(COUNT(0)) OVER (PARTITION BY requests.client) AS total,
  COUNT(0) / SUM(COUNT(0)) OVER (PARTITION BY requests.client) AS pct
FROM
  requests
# Only requests whose host matches a third-party domain are counted.
INNER JOIN
  third_party
ON
  NET.HOST(requests.url) = NET.HOST(third_party.domain)
GROUP BY
  requests.client,
  requests.content_encoding
ORDER BY
  client,
  num_requests DESC
55 changes: 55 additions & 0 deletions sql/2024/third-parties/content_encoding_by_content_type.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
#standardSQL
# Content-encoding of third-party requests by content type
#
# Per client and request type: the number of third-party requests for each
# response content-encoding value and its share of the third-party requests
# of that type.

WITH requests AS (
  SELECT
    _TABLE_SUFFIX AS client,
    pageid AS page,
    url,
    resp_content_encoding AS content_encoding,
    type
  FROM
    `httparchive.summary_requests.2024_06_01_*`
),

third_party AS (
  # Non-hosting third-party domains requested on at least 50 distinct pages.
  SELECT
    NET.HOST(tp.domain) AS domain,
    COUNT(DISTINCT r.page) AS page_usage
  FROM
    `httparchive.almanac.third_parties` AS tp
  INNER JOIN
    requests AS r
  ON
    NET.HOST(r.url) = NET.HOST(tp.domain)
  WHERE
    tp.date = '2024-06-01' AND
    tp.category != 'hosting'
  GROUP BY
    domain
  HAVING
    page_usage >= 50
)

SELECT
  requests.client,
  requests.type,
  requests.content_encoding,
  COUNT(0) AS num_requests,
  SUM(COUNT(0)) OVER (PARTITION BY requests.client, requests.type) AS total,
  COUNT(0) / SUM(COUNT(0)) OVER (PARTITION BY requests.client, requests.type) AS pct
FROM
  requests
# Only requests whose host matches a third-party domain are counted.
INNER JOIN
  third_party
ON
  NET.HOST(requests.url) = NET.HOST(third_party.domain)
GROUP BY
  requests.client,
  requests.type,
  requests.content_encoding
ORDER BY
  client,
  type,
  num_requests DESC
Loading