Skip to content

Commit

Permalink
Merge pull request #1324 from DFE-Digital/bigquery-switch-jobs-from-vacancies-to-vacancy
Browse files Browse the repository at this point in the history

Switch BigQuery queries from using old vacancies table to vacancy table
  • Loading branch information
stevenleggdfe authored Feb 17, 2020
2 parents c5cd71c + 8db6d65 commit 52a0076
Show file tree
Hide file tree
Showing 4 changed files with 28 additions and 23 deletions.
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
SELECT
vacancies.id, #unique vacancy ID from the Teaching Vacancies database
vacancies.slug, #human readable dash-separated string that is probably also a unique vacancy ID (but can't trust this)
vacancies.expiry_time,
vacancy.id, #unique vacancy ID from the Teaching Vacancies database
vacancy.slug, #human readable dash-separated string that is probably also a unique vacancy ID (but can't trust this)
PARSE_TIMESTAMP("%e %B %E4Y %R",vacancy.expiry_time) AS expiry_time,
SUM( #series of SUMIF statements that turn various Google Analytics event configurations into counts of the total number of events that occurred on this vacancy's page
IF
(events.event_Action="vacancy_visited",
Expand Down Expand Up @@ -50,7 +50,7 @@ SELECT
events.Unique_Events,
0)) AS twitter_shares,
FROM
`teacher-vacancy-service.production_dataset.vacancies` AS vacancies
`teacher-vacancy-service.production_dataset.vacancy` AS vacancy
LEFT JOIN (
SELECT
SPLIT(SPLIT(Page_path_level_2,"/")[ #Convert the URL part from the Page_path_level_2 which comes in the form /slug into just the slug, which can be joined onto the slug field from the vacancies table in the database
Expand All @@ -64,13 +64,13 @@ LEFT JOIN (
FROM
`teacher-vacancy-service.production_dataset.GA_events_on_vacancies_page`) AS events
ON
vacancies.slug=events.slug #matches the vacancy slug from our database with the vacancy slug from the part of the page URL recorded in Google Analytics - this is the critical part of this query
WHERE vacancies.expiry_time < CURRENT_TIMESTAMP #only obtain vacancies which have expired
AND vacancies.expiry_time > (SELECT MAX(expiry_time) FROM `teacher-vacancy-service.production_dataset.CALCULATED_vacancy_GA_event_counts`) #only select vacancies that expired since we last ran this query
vacancy.slug=events.slug #matches the vacancy slug from our database with the vacancy slug from the part of the page URL recorded in Google Analytics - this is the critical part of this query
WHERE PARSE_TIMESTAMP("%e %B %E4Y %R",vacancy.expiry_time) < CURRENT_TIMESTAMP #only obtain vacancies which have expired
AND PARSE_TIMESTAMP("%e %B %E4Y %R",vacancy.expiry_time) > (SELECT MAX(expiry_time) FROM `teacher-vacancy-service.production_dataset.CALCULATED_vacancy_GA_event_counts`) #only select vacancies that expired since we last ran this query
AND status NOT IN ("trashed","draft")
GROUP BY
vacancies.id,
vacancies.slug,
vacancies.expiry_time
vacancy.id,
vacancy.slug,
vacancy.expiry_time
ORDER BY
vacancies.expiry_time DESC
vacancy.expiry_time DESC
14 changes: 8 additions & 6 deletions bigquery/schools_joined_with_metrics.sql
Original file line number Diff line number Diff line change
Expand Up @@ -14,17 +14,19 @@ WITH
school.urn AS urn,
COUNT(*) AS vacancies_published,
#the total number of vacancies this school published over all time
COUNTIF(CAST(publish_on AS DATE) > DATE_SUB(CURRENT_DATE(), INTERVAL 1 YEAR)) AS vacancies_published_in_the_last_year,
COUNTIF(CAST(publish_on AS DATE) > DATE_SUB(CURRENT_DATE(), INTERVAL 3 MONTH)) AS vacancies_published_in_the_last_quarter,
COUNTIF(PARSE_DATE("%e %B %E4Y",publish_on) > DATE_SUB(CURRENT_DATE(), INTERVAL 1 YEAR)) AS vacancies_published_in_the_last_year,
COUNTIF(PARSE_DATE("%e %B %E4Y",publish_on) > DATE_SUB(CURRENT_DATE(), INTERVAL 3 MONTH)) AS vacancies_published_in_the_last_quarter,
COUNTIF(status="published"
AND CAST(expiry_time AS DATE) > CURRENT_DATE()) AS vacancies_currently_live #count this as vacancies which have been published and have not yet expired
AND PARSE_DATE("%e %B %E4Y",expires_on) > CURRENT_DATE()) AS vacancies_currently_live #count this as vacancies which have been published and have not yet expired
FROM
`teacher-vacancy-service.production_dataset.vacancies` AS vacancies
`teacher-vacancy-service.production_dataset.vacancy` AS vacancy
INNER JOIN `teacher-vacancy-service.production_dataset.school` AS school
ON vacancy.school_id=school.id
WHERE
status != "trashed" #exclude deleted vacancies from the counts above
AND status != "draft" #exclude vacancies which have not (yet) been published from the counts above
GROUP BY
vacancies.school.urn ),
school.urn),
mat_metrics AS ( #make a table of academy trusts (MATs and SATs) with current values of trust related metrics for inclusion in main query later
SELECT
GIAS.Trusts__name_ AS trust_name,
Expand Down Expand Up @@ -134,7 +136,7 @@ ON
LEFT JOIN
school_vacancy_metrics
ON
school_vacancy_metrics.urn=CAST(school.urn AS STRING)
school_vacancy_metrics.urn=school.urn
LEFT JOIN
`teacher-vacancy-service.production_dataset.STATIC_GIAS_manual_download` AS GIAS
ON
Expand Down
8 changes: 4 additions & 4 deletions bigquery/vacancies-published.sql
Original file line number Diff line number Diff line change
Expand Up @@ -15,16 +15,16 @@ WITH
DATE_ADD(year,INTERVAL 8 MONTH)) AS academic_year #converts the month into the corresponding academic year, storing this as the 1st September at the beginning of that academic year (the precise format doesn't matter; we just need a consistent way to represent the academic year so that the PARTITION BY above works)
FROM (
SELECT
CAST(TIMESTAMP_TRUNC(publish_on,MONTH) AS DATE) AS month, #use the first day of the month containing publish_on to represent the month (standard in data studio)
CAST(TIMESTAMP_TRUNC(publish_on,YEAR) AS DATE) AS year #use the first day of the year containing publish_on to represent the year (standard in data studio)
DATE_TRUNC(PARSE_DATE("%e %B %E4Y",publish_on),MONTH) AS month, #use the first day of the month containing publish_on to represent the month (standard in data studio)
DATE_TRUNC(PARSE_DATE("%e %B %E4Y",publish_on),YEAR) AS year #use the first day of the year containing publish_on to represent the year (standard in data studio)
FROM
`teacher-vacancy-service.production_dataset.vacancies`
`teacher-vacancy-service.production_dataset.vacancy`
WHERE
status NOT IN ("trashed",
"deleted",
"draft") #excludes vacancies which were never published, or which were published and then subsequently deleted
AND publish_on IS NOT NULL #also excludes vacancies which were never published (to be safe)
AND publish_on < CURRENT_TIMESTAMP() ) #excludes vacancies which have been published but are not yet visible on the site because their publication date is in the future
AND PARSE_DATE("%e %B %E4Y",publish_on) <= CURRENT_DATE() ) #excludes vacancies which have been published but are not yet visible on the site because their publication date is in the future
GROUP BY
month,
year
Expand Down
7 changes: 5 additions & 2 deletions bigquery/vacancy-feedback-metrics-by-month.sql
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,8 @@ SELECT
feedback_available) AS exclusive_hires_rate_upperbound
FROM (
SELECT
CAST(TIMESTAMP_TRUNC(publish_on,MONTH) AS DATE) AS month,
DATE_TRUNC(PARSE_DATE("%e %B %E4Y",
publish_on),MONTH) AS month,
COUNT(*) AS vacancies_published,
COUNTIF(hired_status IS NOT NULL
AND listed_elsewhere IS NOT NULL) AS feedback_available,
Expand All @@ -28,12 +29,14 @@ FROM (
"listed_free",
"listed_dont_know")) AS exclusive_hires_upperbound
FROM
`teacher-vacancy-service.production_dataset.vacancies`
`teacher-vacancy-service.production_dataset.vacancy`
WHERE
status NOT IN ("trashed",
"deleted",
"draft")
GROUP BY
month)
WHERE
month IS NOT NULL
ORDER BY
month ASC

0 comments on commit 52a0076

Please sign in to comment.