Map JSON schema to BigQuery schema
This makes some assumptions about the content of repeated fields (each is
treated as a list of key/value pairs with string values), but as we control
the event schema we think it is unlikely to break unexpectedly.
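
For illustration, here is a minimal sketch of the array-field mapping. It is
not part of the commit: the `request_query` field name comes from the event
schema, but the JSON schema entry shown is assumed for the example.

```ruby
# Sketch only: an array field in the JSON event schema...
json_schema_entry_name = 'request_query'
json_schema_entry      = { 'type' => 'array' }

# ...is converted into a REPEATED RECORD of key/value pairs with string
# values in the BigQuery schema (the assumption referred to above).
bigquery_schema_entry = {
  'mode'   => 'REPEATED',
  'name'   => json_schema_entry_name,
  'type'   => 'RECORD',
  'fields' => [
    { 'mode' => 'REQUIRED', 'name' => 'key',   'type' => 'STRING' },
    { 'mode' => 'REPEATED', 'name' => 'value', 'type' => 'STRING' }
  ]
}
```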
duncanjbrown committed Jun 16, 2022
1 parent 45b6fe5 commit 4c7c404
Showing 5 changed files with 189 additions and 4 deletions.
5 changes: 3 additions & 2 deletions .rubocop_todo.yml
@@ -1,6 +1,6 @@
# This configuration was generated by
# `rubocop --auto-gen-config`
-# on 2022-05-20 13:06:05 UTC using RuboCop version 1.26.1.
+# on 2022-06-16 10:08:54 UTC using RuboCop version 1.26.1.
# The point is for the user to remove these configuration records
# one by one as the offenses are removed from the code base.
# Note that changes in the inspected code, or installation of new
@@ -14,7 +14,7 @@ Naming/FileName:
Exclude:
- 'spec/dummy/config/initializers/dfe-analytics.rb'

-# Offense count: 12
+# Offense count: 13
# Configuration parameters: AllowedConstants.
Style/Documentation:
Exclude:
@@ -23,6 +23,7 @@ Style/Documentation:
- 'lib/dfe/analytics.rb'
- 'lib/dfe/analytics/entities.rb'
- 'lib/dfe/analytics/event.rb'
- 'lib/dfe/analytics/event_schema.rb'
- 'lib/dfe/analytics/load_entities.rb'
- 'lib/dfe/analytics/requests.rb'
- 'lib/dfe/analytics/send_events.rb'
63 changes: 63 additions & 0 deletions lib/dfe/analytics/event_schema.rb
@@ -5,6 +5,69 @@ def self.as_json
path = "#{Gem.loaded_specs['dfe-analytics'].gem_dir}/config/event-schema.json"
File.read(path)
end

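# Builds a BigQuery table schema (a JSON string containing an array of field
# definitions) from the bundled JSON event schema.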
def self.as_bigquery_schema
schema = JSON.parse(as_json)
required_fields = schema['required']

properties = schema['properties']

schema = properties.keys.reduce([]) do |bq_schema, json_schema_entry_name|
json_schema_entry = properties[json_schema_entry_name]
bigquery_field_type = resolve_bigquery_type(json_schema_entry)

bigquery_schema_entry = {
'mode' => resolve_bigquery_mode(json_schema_entry_name, json_schema_entry, required_fields),
'name' => json_schema_entry_name,
'type' => bigquery_field_type
}

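# Repeated fields are assumed to hold key/value pairs with string values,
# so they are emitted as a RECORD with a required 'key' and a repeated 'value'.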
if bigquery_field_type == 'RECORD'
bigquery_schema_entry['fields'] = [
{
'mode' => 'REQUIRED',
'name' => 'key',
'type' => 'STRING'
},
{
'mode' => 'REPEATED',
'name' => 'value',
'type' => 'STRING'
}
]
end

bq_schema << bigquery_schema_entry
bq_schema
end

schema.to_json
end

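# REQUIRED if the JSON schema lists the field as required, REPEATED for
# arrays, NULLABLE otherwise.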
def self.resolve_bigquery_mode(json_schema_entry_name, json_schema_entry, required_fields)
if required_fields.include?(json_schema_entry_name)
'REQUIRED'
elsif json_schema_entry['type'] == 'array'
'REPEATED'
else
'NULLABLE'
end
end

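# Maps a JSON schema type (and optional format) to a BigQuery column type.
# Only the types that appear in the event schema are handled.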
def self.resolve_bigquery_type(json_schema_entry)
json_type = json_schema_entry['type']
json_format = json_schema_entry['format']

if json_type == 'array'
'RECORD'
elsif json_type == 'string' && json_format == 'date-time'
'TIMESTAMP'
elsif json_type == 'string'
'STRING'
elsif json_type == 'integer'
'INTEGER'
end
end
end
end
end
8 changes: 6 additions & 2 deletions lib/dfe/analytics/tasks/schema.rake
@@ -2,8 +2,12 @@ namespace :dfe do
namespace :analytics do
desc 'Print out the dfe-analytics JSON schema'
task :schema do
path = "#{Gem.loaded_specs['dfe-analytics'].gem_dir}/config/event-schema.json"
puts File.read(path)
puts DfE::Analytics::EventSchema.as_json
end

desc 'Print out the dfe-analytics BigQuery schema'
task :big_query_schema do
puts DfE::Analytics::EventSchema.as_bigquery_schema
end
end
end
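
Usage note (not part of the commit): once the gem's tasks are loaded by the host application, the new task can be run with `bundle exec rake dfe:analytics:big_query_schema`, alongside the existing `dfe:analytics:schema` task for the JSON schema.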
11 changes: 11 additions & 0 deletions spec/dfe/analytics/event_schema_spec.rb
@@ -9,4 +9,15 @@
expect(output).to eq schema_on_disk
end
end

describe '.as_bigquery_schema' do
it 'transforms the JSON schema into a BQ schema' do
bq_schema_on_disk = File.read('spec/examples/bigquery_schema.json')

output = JSON.parse(described_class.as_bigquery_schema)

expect(output).to be_present
expect(output).to match_array JSON.parse(bq_schema_on_disk)
end
end
end
106 changes: 106 additions & 0 deletions spec/examples/bigquery_schema.json
@@ -0,0 +1,106 @@
[
{
"mode": "REQUIRED",
"name": "occurred_at",
"type": "TIMESTAMP"
},
{
"mode": "REQUIRED",
"name": "event_type",
"type": "STRING"
},
{
"mode": "REQUIRED",
"name": "environment",
"type": "STRING"
},
{
"mode": "NULLABLE",
"name": "namespace",
"type": "STRING"
},
{
"mode": "NULLABLE",
"name": "user_id",
"type": "STRING"
},
{
"mode": "NULLABLE",
"name": "request_uuid",
"type": "STRING"
},
{
"mode": "NULLABLE",
"name": "request_method",
"type": "STRING"
},
{
"mode": "NULLABLE",
"name": "request_path",
"type": "STRING"
},
{
"mode": "NULLABLE",
"name": "request_user_agent",
"type": "STRING"
},
{
"mode": "NULLABLE",
"name": "request_referer",
"type": "STRING"
},
{
"fields": [
{
"mode": "REQUIRED",
"name": "key",
"type": "STRING"
},
{
"mode": "REPEATED",
"name": "value",
"type": "STRING"
}
],
"mode": "REPEATED",
"name": "request_query",
"type": "RECORD"
},
{
"mode": "NULLABLE",
"name": "response_content_type",
"type": "STRING"
},
{
"mode": "NULLABLE",
"name": "response_status",
"type": "STRING"
},
{
"fields": [
{
"mode": "REQUIRED",
"name": "key",
"type": "STRING"
},
{
"mode": "REPEATED",
"name": "value",
"type": "STRING"
}
],
"mode": "REPEATED",
"name": "data",
"type": "RECORD"
},
{
"mode": "NULLABLE",
"name": "entity_table_name",
"type": "STRING"
},
{
"mode": "NULLABLE",
"name": "anonymised_user_agent_and_ip",
"type": "STRING"
}
]
