Merge pull request #1324 from DFE-Digital/bigquery-switch-jobs-from-vacancies-to-vacancy

Switch BigQuery queries from using old vacancies table to vacancy table
DFE-Digital · Feb 17, 2020 · 52a0076
2 parents c5cd71c + 8db6d65
commit 52a0076
Show file tree

Hide file tree

Showing 4 changed files with 28 additions and 23 deletions.
diff --git a/bigquery/ga-append-latest-expired-vacancies-to-CALCULATED-vacancy-GA-event-counts.sql b/bigquery/ga-append-latest-expired-vacancies-to-CALCULATED-vacancy-GA-event-counts.sql
@@ -1,7 +1,7 @@
 SELECT
-  vacancies.id, #unique vacancy ID from the Teaching Vacancies database
-  vacancies.slug, #human readable dash-separated string that is probably also a unique vacancy ID (but can't trust this)
-  vacancies.expiry_time,
+  vacancy.id, #unique vacancy ID from the Teaching Vacancies database
+  vacancy.slug, #human readable dash-separated string that is probably also a unique vacancy ID (but can't trust this)
+  PARSE_TIMESTAMP("%e %B %E4Y %R",vacancy.expiry_time) AS expiry_time,
   SUM( #series of SUMIF statements that turn various Google Analytics event configurations into counts of the total number of events that occurred on this vacancy's page
   IF
     (events.event_Action="vacancy_visited",
@@ -50,7 +50,7 @@ SELECT
       events.Unique_Events,
       0)) AS twitter_shares,
 FROM
-  `teacher-vacancy-service.production_dataset.vacancies` AS vacancies
+  `teacher-vacancy-service.production_dataset.vacancy` AS vacancy
 LEFT JOIN (
   SELECT
     SPLIT(SPLIT(Page_path_level_2,"/")[ #Convert the URL part from the Page_path_level_2 which comes in the form /slug into just the slug, which can be joined onto the slug field from the vacancies table in the database
@@ -64,13 +64,13 @@ LEFT JOIN (
   FROM
     `teacher-vacancy-service.production_dataset.GA_events_on_vacancies_page`) AS events
 ON
-  vacancies.slug=events.slug #matches the vacancy slug from our database with the vacancy slug from the part of the page URL recorded in Google Analytics - this is the critical part of this query
-WHERE vacancies.expiry_time < CURRENT_TIMESTAMP #only obtain vacancies which have expired
-AND vacancies.expiry_time > (SELECT MAX(expiry_time) FROM `teacher-vacancy-service.production_dataset.CALCULATED_vacancy_GA_event_counts`) #only select vacancies that expired since we last ran this query
+  vacancy.slug=events.slug #matches the vacancy slug from our database with the vacancy slug from the part of the page URL recorded in Google Analytics - this is the critical part of this query
+WHERE PARSE_TIMESTAMP("%e %B %E4Y %R",vacancy.expiry_time) < CURRENT_TIMESTAMP #only obtain vacancies which have expired
+AND PARSE_TIMESTAMP("%e %B %E4Y %R",vacancy.expiry_time) > (SELECT MAX(expiry_time) FROM `teacher-vacancy-service.production_dataset.CALCULATED_vacancy_GA_event_counts`) #only select vacancies that expired since we last ran this query
 AND status NOT IN ("trashed","draft")
 GROUP BY
-  vacancies.id,
-  vacancies.slug,
-  vacancies.expiry_time
+  vacancy.id,
+  vacancy.slug,
+  vacancy.expiry_time
 ORDER BY
-  vacancies.expiry_time DESC
+  vacancy.expiry_time DESC
diff --git a/bigquery/schools_joined_with_metrics.sql b/bigquery/schools_joined_with_metrics.sql
@@ -14,17 +14,19 @@ WITH
     school.urn AS urn,
     COUNT(*) AS vacancies_published,
     #the total number of vacancies this school published over all time
-    COUNTIF(CAST(publish_on AS DATE) > DATE_SUB(CURRENT_DATE(), INTERVAL 1 YEAR)) AS vacancies_published_in_the_last_year,
-    COUNTIF(CAST(publish_on AS DATE) > DATE_SUB(CURRENT_DATE(), INTERVAL 3 MONTH)) AS vacancies_published_in_the_last_quarter,
+    COUNTIF(PARSE_DATE("%e %B %E4Y",publish_on) > DATE_SUB(CURRENT_DATE(), INTERVAL 1 YEAR)) AS vacancies_published_in_the_last_year,
+    COUNTIF(PARSE_DATE("%e %B %E4Y",publish_on) > DATE_SUB(CURRENT_DATE(), INTERVAL 3 MONTH)) AS vacancies_published_in_the_last_quarter,
     COUNTIF(status="published"
-      AND CAST(expiry_time AS DATE) > CURRENT_DATE()) AS vacancies_currently_live #count this as vacancies which have been published and have not yet expired
+      AND PARSE_DATE("%e %B %E4Y",expires_on) > CURRENT_DATE()) AS vacancies_currently_live #count this as vacancies which have been published and have not yet expired
   FROM
-    `teacher-vacancy-service.production_dataset.vacancies` AS vacancies
+    `teacher-vacancy-service.production_dataset.vacancy` AS vacancy
+  INNER JOIN `teacher-vacancy-service.production_dataset.school` AS school
+    ON vacancy.school_id=school.id
   WHERE
     status != "trashed" #exclude deleted vacancies from the counts above
     AND status != "draft" #exclude vacancies which have not (yet) been published from the counts above
   GROUP BY
-    vacancies.school.urn ),
+    school.urn),
   mat_metrics AS ( #make a table of academy trusts (MATs and SATs) with current values of trust related metrics for inclusion in main query later
   SELECT
     GIAS.Trusts__name_ AS trust_name,
@@ -134,7 +136,7 @@ ON
 LEFT JOIN
   school_vacancy_metrics
 ON
-  school_vacancy_metrics.urn=CAST(school.urn AS STRING)
+  school_vacancy_metrics.urn=school.urn
 LEFT JOIN
   `teacher-vacancy-service.production_dataset.STATIC_GIAS_manual_download` AS GIAS
 ON

diff --git a/bigquery/vacancies-published.sql b/bigquery/vacancies-published.sql
@@ -15,16 +15,16 @@ WITH
         DATE_ADD(year,INTERVAL 8 MONTH)) AS academic_year #converts the month into the corresponding academic year, storing this as the 1st September at the beginning of that academic year (the precise format doesn't matter; we just need a consistent way to represent the academic year so that the PARTITION BY above works)
     FROM (
       SELECT
-        CAST(TIMESTAMP_TRUNC(publish_on,MONTH) AS DATE) AS month, #use the first day of the month containing publish_on to represent the month (standard in data studio)
-        CAST(TIMESTAMP_TRUNC(publish_on,YEAR) AS DATE) AS year #use the first day of the year containing publish_on to represent the year (standard in data studio)
+        DATE_TRUNC(PARSE_DATE("%e %B %E4Y",publish_on),MONTH) AS month, #use the first day of the month containing publish_on to represent the month (standard in data studio)
+        DATE_TRUNC(PARSE_DATE("%e %B %E4Y",publish_on),YEAR) AS year #use the first day of the year containing publish_on to represent the year (standard in data studio)
       FROM
-        `teacher-vacancy-service.production_dataset.vacancies`
+        `teacher-vacancy-service.production_dataset.vacancy`
       WHERE
         status NOT IN ("trashed",
           "deleted",
           "draft") #excludes vacancies which were never published, or which were published and then subsequently deleted
         AND publish_on IS NOT NULL #also excludes vacancies which were never published (to be safe)
-        AND publish_on < CURRENT_TIMESTAMP() ) #excludes vacancies which have been published but are not yet visible on the site because their publication date is in the future
+        AND PARSE_DATE("%e %B %E4Y",publish_on) <= CURRENT_DATE() ) #excludes vacancies which have been published but are not yet visible on the site because their publication date is in the future
     GROUP BY
       month,
       year

diff --git a/bigquery/vacancy-feedback-metrics-by-month.sql b/bigquery/vacancy-feedback-metrics-by-month.sql
@@ -12,7 +12,8 @@ SELECT
     feedback_available) AS exclusive_hires_rate_upperbound
 FROM (
   SELECT
-    CAST(TIMESTAMP_TRUNC(publish_on,MONTH) AS DATE) AS month,
+    DATE_TRUNC(PARSE_DATE("%e %B %E4Y",
+        publish_on),MONTH) AS month,
     COUNT(*) AS vacancies_published,
     COUNTIF(hired_status IS NOT NULL
       AND listed_elsewhere IS NOT NULL) AS feedback_available,
@@ -28,12 +29,14 @@ FROM (
         "listed_free",
         "listed_dont_know")) AS exclusive_hires_upperbound
   FROM
-    `teacher-vacancy-service.production_dataset.vacancies`
+    `teacher-vacancy-service.production_dataset.vacancy`
   WHERE
     status NOT IN ("trashed",
       "deleted",
       "draft")
   GROUP BY
     month)
+WHERE
+  month IS NOT NULL
 ORDER BY
   month ASC