From 61e08c55c94fc42e03a22fc37992b5318ee713af Mon Sep 17 00:00:00 2001 From: Erica Porter Date: Thu, 19 Sep 2024 16:27:32 +0100 Subject: [PATCH 1/2] Remove ability to pseudonymise data --- README.md | 43 +------------------ config/locales/en.yml | 4 -- docs/create-events-table.sql | 2 +- lib/dfe/analytics.rb | 27 +++--------- lib/dfe/analytics/event.rb | 5 +-- lib/dfe/analytics/fields.rb | 26 ----------- lib/dfe/analytics/initialisation_events.rb | 1 - spec/dfe/analytics/entities_spec.rb | 26 +---------- spec/dfe/analytics/event_spec.rb | 15 ------- spec/dfe/analytics/fields_spec.rb | 20 --------- .../analytics/initialisation_events_spec.rb | 2 +- spec/dfe/analytics_spec.rb | 9 ---- 12 files changed, 13 insertions(+), 167 deletions(-) diff --git a/README.md b/README.md index ed080053..f328fe6c 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ **👉 Send every web request and database update to BigQuery** -**✋ Skip or pseudonymise fields containing PII** +**✋ Skip or hide fields containing PII** **✌️ Configure and forget** @@ -341,47 +341,6 @@ Once all the events have been constructed, simply send them to your analytics: DfE::Analytics::SendEvents.do([event, event2, event3]) ``` -## Pseudonymisation - -For an explanation of pseudonymisation, see [ICO Guidance](https://ico.org.uk/media/about-the-ico/consultations/4019579/chapter-3-anonymisation-guidance.pdf). - -### Data Pseudonymisation Algorithm - -Generally all PII data should be pseudonymised, including data that directly or -indirectly references PII, for example database IDs. - -DfE::Analytics also pseudonymises such data, if the relevant fields -appear in `analytics_pii.yml`. If you are pseudonymising -database IDs in your code (in custom events for example), then you should use -the same hashing algorithm for pseudonymisation that the gem uses, in order to -allow joining of pseudonymised data across different database tables. - -To ensure values are consistently pseudonymised, use the following method -whenever manually pseudonoymising: - -```ruby -DfE::Analytics.pseudonymise(value) -``` - -### User ID pseudonymisation - -The `user_id` in the web request event will not be pseudonymised by default. -This can be changed by updating the configuration option in -`config/initializers/dfe_analytics.rb`: - -```ruby -config.pseudonymise_web_request_user_id = true -``` - -**DO** pseudonymise web request `user_id`s if the corresponding field in -the database is in `analytics_pii.yml`: this means the pseudonymised ID will show up -identically for both web requests and database updates, so analysts can join them up. - -**DON'T** pseudonymise web request `user_id`s if the corresponding field in the -database is only in `analytics.yml`. This would result in the web request -version being pseudonymised and the database version not being pseudonymised, -which means they can't be joined up. - ### Entity Table Check Job If you are using a background processing tool or scheduler (such as Sidekiq, Sidekiq-Cron, Resque, Delayed Job or other alternatives), you may want to configure the Entity Table Check Job. This job is designed to ensure the latest version of an entity table in BigQuery is in sync with the database. It is advisable to schedule this job to run on a nightly basis for consistent data verification. diff --git a/config/locales/en.yml b/config/locales/en.yml index a0285ac8..63e5a82a 100644 --- a/config/locales/en.yml +++ b/config/locales/en.yml @@ -53,10 +53,6 @@ en: return the identifier for the user. This is useful for systems with users that don't use the id field. default: proc { |user| user&.id } - pseudonymise_web_request_user_id: - description: | - Whether to pseudonymise the user_id field in the web request event. - default: false entity_table_checks_enabled: description: | Whether to run entity table checksum job. diff --git a/docs/create-events-table.sql b/docs/create-events-table.sql index f129eb1d..f2a24534 100644 --- a/docs/create-events-table.sql +++ b/docs/create-events-table.sql @@ -13,7 +13,7 @@ CREATE TABLE response_content_type STRING OPTIONS(description="Content type of any data that was returned to the browser following this web request, if this event is a web request. For example, 'text/html; charset=utf-8'. Image views, for example, may have a non-text/html content type."), response_status STRING OPTIONS(description="HTTP response code returned by the application in response to this web request, if this event is a web request. See https://developer.mozilla.org/en-US/docs/Web/HTTP/Status."), DATA ARRAY < STRUCT OPTIONS(description="Contents of the field in the database after it was created or updated, or just before it was imported or destroyed.") > > OPTIONS(description="ARRAY of STRUCTs, each with a key and a value. Contains a set of data points appropriate to the event_type of this event. For example, if this event was an entity create, update, delete or import event, data will contain the values of each field in the database after this event took place - according to the settings in the analytics.yml configured for this instance of dfe-analytics. Value be anonymised as a one way hash, depending on configuration settings."), + value ARRAY < STRING > OPTIONS(description="Contents of the field in the database after it was created or updated, or just before it was imported or destroyed.") > > OPTIONS(description="ARRAY of STRUCTs, each with a key and a value. Contains a set of data points appropriate to the event_type of this event. For example, if this event was an entity create, update, delete or import event, data will contain the values of each field in the database after this event took place - according to the settings in the analytics.yml configured for this instance of dfe-analytics."), hidden_DATA ARRAY < STRUCT OPTIONS(description="Contents of the field in the database after it was created or updated, or just before it was imported or destroyed.") > > OPTIONS(description="Defined in the same way as the DATA ARRAY of STRUCTs, except containing fields configured to be hidden in analytics_hidden_pii.yml"), entity_table_name STRING OPTIONS(description="If event_type was an entity create, update, delete or import event, the name of the table in the database that this entity is stored in. NULL otherwise."), diff --git a/lib/dfe/analytics.rb b/lib/dfe/analytics.rb index 95768c50..c086087f 100644 --- a/lib/dfe/analytics.rb +++ b/lib/dfe/analytics.rb @@ -47,7 +47,6 @@ def self.config enable_analytics environment user_identifier - pseudonymise_web_request_user_id entity_table_checks_enabled rack_page_cached bigquery_maintenance_window @@ -78,7 +77,6 @@ def self.configure config.async ||= true config.queue ||= :default config.user_identifier ||= proc { |user| user&.id } - config.pseudonymise_web_request_user_id ||= false config.entity_table_checks_enabled ||= false config.rack_page_cached ||= proc { |_rack_env| false } config.bigquery_maintenance_window ||= ENV.fetch('BIGQUERY_MAINTENANCE_WINDOW', nil) @@ -142,10 +140,6 @@ def self.allowlist Rails.application.config_for(:analytics) end - def self.allowlist_pii - Rails.application.config_for(:analytics_pii) - end - def self.hidden_pii Rails.application.config_for(:analytics_hidden_pii) rescue RuntimeError @@ -203,25 +197,18 @@ def self.extract_model_attributes(model, attributes = nil) exportable_attrs = (allowlist[table_name].presence || []).map(&:to_sym) hidden_pii_attrs = (hidden_pii[table_name].presence || []).map(&:to_sym) - pii_attrs = (allowlist_pii[table_name].presence || []).map(&:to_sym) - - # Validation in fields.rb ensures attributes do not appear on both allowlist_pii and allowlist_hidden_pii - exportable_pii_attrs = exportable_attrs & pii_attrs exportable_hidden_pii_attrs = exportable_attrs & hidden_pii_attrs - # Exclude both pii and hidden attributes from allowed_attributes - allowed_attrs_to_include = exportable_attrs - (exportable_pii_attrs + exportable_hidden_pii_attrs) - + # Exclude hidden pii attributes from allowed_attributes + allowed_attrs_to_include = exportable_attrs - exportable_hidden_pii_attrs allowed_attributes = attributes.slice(*allowed_attrs_to_include&.map(&:to_s)) - obfuscated_attributes = attributes.slice(*exportable_pii_attrs.map(&:to_s)) - .transform_values { |value| pseudonymise(value) } hidden_attributes = attributes.slice(*exportable_hidden_pii_attrs&.map(&:to_s)) - # Allowed attributes (which currently includes the allowlist_pii) must be kept separate from hidden_attributes - model_attributes = {} - model_attributes.merge!(data: allowed_attributes.deep_merge(obfuscated_attributes)) if allowed_attributes.any? || obfuscated_attributes.any? - model_attributes.merge!(hidden_data: hidden_attributes) if hidden_attributes.any? - model_attributes + # Allowed attributes must be kept separate from hidden_attributes + {}.tap do |model_attributes| + model_attributes[:data] = allowed_attributes if allowed_attributes.any? + model_attributes[:hidden_data] = hidden_attributes if hidden_attributes.any? + end end def self.anonymise(value) diff --git a/lib/dfe/analytics/event.rb b/lib/dfe/analytics/event.rb index b460b12e..e11d4bcc 100644 --- a/lib/dfe/analytics/event.rb +++ b/lib/dfe/analytics/event.rb @@ -134,10 +134,7 @@ def ensure_utf8(str) end def user_identifier_for(user) - user_id = DfE::Analytics.user_identifier(user) - user_id = DfE::Analytics.pseudonymise(user_id) if user_id.present? && DfE::Analytics.config.pseudonymise_web_request_user_id - - user_id + DfE::Analytics.user_identifier(user) end end end diff --git a/lib/dfe/analytics/fields.rb b/lib/dfe/analytics/fields.rb index f5eedecb..7886657f 100644 --- a/lib/dfe/analytics/fields.rb +++ b/lib/dfe/analytics/fields.rb @@ -45,15 +45,6 @@ def self.check! HEREDOC end - if overlapping_pii_fields.any? - errors << <<~HEREDOC - PII configuration error detected! The following fields are listed in both hidden_pii and allowlist_pii. - Fields must only be present in one. Please update the configuration to resolve the conflict: - - #{overlapping_pii_fields.to_yaml} - HEREDOC - end - configuration_errors = errors.join("\n\n----------------\n\n") raise(ConfigurationError, configuration_errors) if errors.any? @@ -71,23 +62,6 @@ def self.hidden_pii DfE::Analytics.hidden_pii || {} end - def self.allowlist_pii - DfE::Analytics.allowlist_pii || {} - end - - def self.overlapping_pii_fields - overlapping_fields = [] - hidden_pii.each do |entity, fields| - next if fields.blank? - - if allowlist_pii[entity] - overlapping = fields & allowlist_pii[entity] - overlapping_fields.concat(overlapping) unless overlapping.blank? - end - end - overlapping_fields - end - def self.database DfE::Analytics.all_entities_in_application .reduce({}) do |list, entity| diff --git a/lib/dfe/analytics/initialisation_events.rb b/lib/dfe/analytics/initialisation_events.rb index 86403b60..a8229885 100644 --- a/lib/dfe/analytics/initialisation_events.rb +++ b/lib/dfe/analytics/initialisation_events.rb @@ -41,7 +41,6 @@ def initialise_analytics_data { analytics_version: DfE::Analytics::VERSION, config: { - pseudonymise_web_request_user_id: DfE::Analytics.config.pseudonymise_web_request_user_id, entity_table_checks_enabled: DfE::Analytics.config.entity_table_checks_enabled } } diff --git a/spec/dfe/analytics/entities_spec.rb b/spec/dfe/analytics/entities_spec.rb index 362b0bdd..9dcd8669 100644 --- a/spec/dfe/analytics/entities_spec.rb +++ b/spec/dfe/analytics/entities_spec.rb @@ -2,7 +2,6 @@ RSpec.describe DfE::Analytics::Entities do let(:allowlist_fields) { [] } - let(:pii_fields) { [] } let(:hidden_pii_fields) { [] } with_model :Candidate do @@ -23,10 +22,6 @@ Candidate.table_name.to_sym => allowlist_fields }) - allow(DfE::Analytics).to receive(:allowlist_pii).and_return({ - Candidate.table_name.to_sym => pii_fields - }) - allow(DfE::Analytics).to receive(:hidden_pii).and_return({ Candidate.table_name.to_sym => hidden_pii_fields }) @@ -90,26 +85,9 @@ })]) end - context 'and the specified fields are listed as PII' do - let(:allowlist_fields) { ['email_address'] } - let(:pii_fields) { ['email_address'] } - - it 'hashes those fields' do - Candidate.create(email_address: 'adrienne@example.com') - - expect(DfE::Analytics::SendEvents).to have_received(:perform_later) - .with([a_hash_including({ - 'data' => [ - { 'key' => 'email_address', - 'value' => ['928b126cb77de8a61bf6714b4f6b0147be7f9d5eb60158930c34ef70f4d502d6'] } - ] - })]) - end - end - - context 'and other fields are listed as PII' do + context 'and other fields are listed as hidden PII' do let(:allowlist_fields) { ['id'] } - let(:pii_fields) { ['email_address'] } + let(:hidden_pii) { ['email_address'] } it 'does not include the fields only listed as PII' do Candidate.create(id: 123, email_address: 'adrienne@example.com') diff --git a/spec/dfe/analytics/event_spec.rb b/spec/dfe/analytics/event_spec.rb index f3b87d9e..37d8dd97 100644 --- a/spec/dfe/analytics/event_spec.rb +++ b/spec/dfe/analytics/event_spec.rb @@ -167,21 +167,6 @@ def find_data_pair(output, key) expect(output['user_id']).to eq uuid end end - - context 'pseudonymisation of user_id' do - before do - allow(DfE::Analytics.config).to receive(:pseudonymise_web_request_user_id).and_return(true) - end - - it 'pseudonymises the user id' do - event = described_class.new - uuid = SecureRandom.uuid - pseudonymised_uuid = Digest::SHA2.hexdigest(uuid) - output = event.with_user(regular_user_class.new(uuid)).as_json - - expect(output['user_id']).to eq pseudonymised_uuid - end - end end describe 'with_type' do diff --git a/spec/dfe/analytics/fields_spec.rb b/spec/dfe/analytics/fields_spec.rb index 9fba2cb9..d0d3c5dd 100644 --- a/spec/dfe/analytics/fields_spec.rb +++ b/spec/dfe/analytics/fields_spec.rb @@ -116,12 +116,10 @@ describe 'handling of hidden PII fields' do let(:existing_allowlist) { { Candidate.table_name.to_sym => %w[id dob email_address] } } let(:hidden_pii) { { Candidate.table_name.to_sym => %w[dob] } } - let(:allowlist_pii) { { Candidate.table_name.to_sym => %w[id email_address] } } let(:existing_blocklist) { { Candidate.table_name.to_sym => %w[first_name last_name] } } before do allow(DfE::Analytics).to receive(:hidden_pii).and_return(hidden_pii) - allow(DfE::Analytics).to receive(:allowlist_pii).and_return(allowlist_pii) end describe '.hidden_pii' do @@ -137,24 +135,6 @@ expect { DfE::Analytics::Fields.check! }.not_to raise_error end end - - context 'when there are overlapping fields in hidden_pii and allowlist_pii' do - let(:allowlist_pii) { { Candidate.table_name.to_sym => %w[id dob email_address] } } - - it 'raises an error' do - error_message = /PII configuration error detected! The following fields are listed in both hidden_pii and allowlist_pii/ - expect { DfE::Analytics::Fields.check! }.to raise_error(DfE::Analytics::ConfigurationError, error_message) - end - end - - context 'when hidden PII fields are improperly managed' do - let(:existing_allowlist) { { Candidate.table_name.to_sym => %w[email_address] } } - let(:allowlist_pii) { { Candidate.table_name.to_sym => %w[email_address] } } - - it 'raises an error about hidden PII fields not in allowlist' do - expect { described_class.check! }.to raise_error(DfE::Analytics::ConfigurationError, /New database field detected/) - end - end end end end diff --git a/spec/dfe/analytics/initialisation_events_spec.rb b/spec/dfe/analytics/initialisation_events_spec.rb index 64c07de9..00563598 100644 --- a/spec/dfe/analytics/initialisation_events_spec.rb +++ b/spec/dfe/analytics/initialisation_events_spec.rb @@ -17,7 +17,7 @@ { 'key' => 'analytics_version', 'value' => [DfE::Analytics::VERSION] }, { 'key' => 'config', - 'value' => ['{"pseudonymise_web_request_user_id":false,"entity_table_checks_enabled":true}'] + 'value' => ['{"entity_table_checks_enabled":true}'] } ] } diff --git a/spec/dfe/analytics_spec.rb b/spec/dfe/analytics_spec.rb index 5f5fcd50..96bde30b 100644 --- a/spec/dfe/analytics_spec.rb +++ b/spec/dfe/analytics_spec.rb @@ -5,10 +5,6 @@ expect(DfE::Analytics::VERSION).not_to be nil end - it 'supports the pseudonymise method' do - expect(DfE::Analytics.pseudonymise('foo_bar')).to eq('4928cae8b37b3d1113f5e01e60c967df6c2b9e826dc7d91488d23a62fec715ba') - end - it 'supports the anonymise method for backwards compatibility' do expect(DfE::Analytics.anonymise('foo_bar')).to eq('4928cae8b37b3d1113f5e01e60c967df6c2b9e826dc7d91488d23a62fec715ba') end @@ -206,9 +202,6 @@ allow(DfE::Analytics).to receive(:allowlist).and_return({ Candidate.table_name.to_sym => %w[email_address hidden_data age] }) - allow(DfE::Analytics).to receive(:allowlist_pii).and_return({ - Candidate.table_name.to_sym => %w[email_address] - }) allow(DfE::Analytics).to receive(:hidden_pii).and_return({ Candidate.table_name.to_sym => %w[hidden_data age] }) @@ -220,8 +213,6 @@ result = described_class.extract_model_attributes(candidate) expect(result[:data].keys).to include('email_address') - expect(result[:data]['email_address']).to_not eq(candidate.email_address) - expect(result[:hidden_data]['hidden_data']).to eq('secret') expect(result[:hidden_data]['age']).to eq(50) end From 90cb0b60263370d5ed80be581f06a43b49971d7c Mon Sep 17 00:00:00 2001 From: Erica Porter Date: Mon, 23 Sep 2024 11:51:35 +0100 Subject: [PATCH 2/2] revert to pre-pseudonymisation --- lib/dfe/analytics.rb | 4 ---- lib/dfe/analytics/event.rb | 12 ++++++------ 2 files changed, 6 insertions(+), 10 deletions(-) diff --git a/lib/dfe/analytics.rb b/lib/dfe/analytics.rb index c086087f..5a35a4bb 100644 --- a/lib/dfe/analytics.rb +++ b/lib/dfe/analytics.rb @@ -212,10 +212,6 @@ def self.extract_model_attributes(model, attributes = nil) end def self.anonymise(value) - pseudonymise(value) - end - - def self.pseudonymise(value) # Google SQL equivalent of this is TO_HEX(SHA256(value)) Digest::SHA2.hexdigest(value.to_s) end diff --git a/lib/dfe/analytics/event.rb b/lib/dfe/analytics/event.rb index e11d4bcc..036d0332 100644 --- a/lib/dfe/analytics/event.rb +++ b/lib/dfe/analytics/event.rb @@ -55,7 +55,7 @@ def with_response_details(rack_response) end def with_user(user) - @event_hash.merge!(user_id: user_identifier_for(user)) + @event_hash.merge!(user_id: user_identifier(user)) self end @@ -126,15 +126,15 @@ def hash_to_kv_pairs(hash) end def anonymised_user_agent_and_ip(rack_request) - DfE::Analytics.pseudonymise(rack_request.user_agent.to_s + rack_request.remote_ip.to_s) if rack_request.remote_ip.present? + DfE::Analytics.anonymise(rack_request.user_agent.to_s + rack_request.remote_ip.to_s) if rack_request.remote_ip.present? end - def ensure_utf8(str) - str&.scrub + def user_identifier(user) + DfE::Analytics.user_identifier(user) end - def user_identifier_for(user) - DfE::Analytics.user_identifier(user) + def ensure_utf8(str) + str&.scrub end end end