From 030e0cf432a10ba7706b922d0ba34a6c0ebbafcd Mon Sep 17 00:00:00 2001 From: Ace Nassri Date: Wed, 18 Oct 2017 15:14:51 -0700 Subject: [PATCH] Add DLP samples (BigQuery, DeID, RiskAnalysis) (#474) * Add BigQuery samples + a few minor tweaks * Update comments + fix failing test * Sync w/codegen changes * Add DeID samples * Add DeID tests + remove infoTypes from DeID samples * Remove unused option * Add risk analysis samples * Update README * Add region tags + fix comment --- dlp/deid.js | 163 ++++++++++++++ dlp/inspect.js | 142 +++++++++++-- dlp/metadata.js | 4 +- dlp/package.json | 27 ++- dlp/quickstart.js | 2 +- dlp/redact.js | 4 +- dlp/risk.js | 350 +++++++++++++++++++++++++++++++ dlp/system-test/deid.test.js | 64 ++++++ dlp/system-test/inspect.test.js | 21 +- dlp/system-test/metadata.test.js | 1 + dlp/system-test/risk.test.js | 96 +++++++++ 11 files changed, 842 insertions(+), 32 deletions(-) create mode 100644 dlp/deid.js create mode 100644 dlp/risk.js create mode 100644 dlp/system-test/deid.test.js create mode 100644 dlp/system-test/risk.test.js diff --git a/dlp/deid.js b/dlp/deid.js new file mode 100644 index 0000000000..31e7083664 --- /dev/null +++ b/dlp/deid.js @@ -0,0 +1,163 @@ +/** + * Copyright 2017, Google, Inc. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +'use strict'; + +function deidentifyWithMask (string, maskingCharacter, numberToMask) { + // [START deidentify_masking] + // Imports the Google Cloud Data Loss Prevention library + const DLP = require('@google-cloud/dlp'); + + // Instantiates a client + const dlp = new DLP.DlpServiceClient(); + + // The string to deidentify + // const string = 'My SSN is 372819127'; + + // (Optional) The maximum number of sensitive characters to mask in a match + // If omitted from the request or set to 0, the API will mask any matching characters + // const numberToMask = 5; + + // (Optional) The character to mask matching sensitive data with + // const maskingCharacter = 'x'; + + // Construct deidentification request + const items = [{ type: 'text/plain', value: string }]; + const request = { + deidentifyConfig: { + infoTypeTransformations: { + transformations: [{ + primitiveTransformation: { + characterMaskConfig: { + maskingCharacter: maskingCharacter, + numberToMask: numberToMask + } + } + }] + } + }, + items: items + }; + + // Run deidentification request + dlp.deidentifyContent(request) + .then((response) => { + const deidentifiedItems = response[0].items; + console.log(deidentifiedItems[0].value); + }) + .catch((err) => { + console.log(`Error in deidentifyWithMask: ${err.message || err}`); + }); + // [END deidentify_masking] +} + +function deidentifyWithFpe (string, alphabet, keyName, wrappedKey) { + // [START deidentify_fpe] + // Imports the Google Cloud Data Loss Prevention library + const DLP = require('@google-cloud/dlp'); + + // Instantiates a client + const dlp = new DLP.DlpServiceClient(); + + // The string to deidentify + // const string = 'My SSN is 372819127'; + + // The set of characters to replace sensitive ones with + // For more information, see https://cloud.google.com/dlp/docs/reference/rest/v2beta1/content/deidentify#FfxCommonNativeAlphabet + // const alphabet = 'ALPHA_NUMERIC'; + + // The name of the Cloud KMS key used to encrypt ('wrap') the 
AES-256 key + // const keyName = 'projects/YOUR_GCLOUD_PROJECT/locations/YOUR_LOCATION/keyRings/YOUR_KEYRING_NAME/cryptoKeys/YOUR_KEY_NAME'; + + // The encrypted ('wrapped') AES-256 key to use + // This key should be encrypted using the Cloud KMS key specified above + // const wrappedKey = 'YOUR_ENCRYPTED_AES_256_KEY' + + // Construct deidentification request + const items = [{ type: 'text/plain', value: string }]; + const request = { + deidentifyConfig: { + infoTypeTransformations: { + transformations: [{ + primitiveTransformation: { + cryptoReplaceFfxFpeConfig: { + cryptoKey: { + kmsWrapped: { + wrappedKey: wrappedKey, + cryptoKeyName: keyName + } + }, + commonAlphabet: alphabet + } + } + }] + } + }, + items: items + }; + + // Run deidentification request + dlp.deidentifyContent(request) + .then((response) => { + const deidentifiedItems = response[0].items; + console.log(deidentifiedItems[0].value); + }) + .catch((err) => { + console.log(`Error in deidentifyWithFpe: ${err.message || err}`); + }); + // [END deidentify_fpe] +} + +const cli = require(`yargs`) + .demand(1) + .command( + `mask `, + `Deidentify sensitive data by masking it with a character.`, + { + maskingCharacter: { + type: 'string', + alias: 'c', + default: '' + }, + numberToMask: { + type: 'number', + alias: 'n', + default: 0 + } + }, + (opts) => deidentifyWithMask(opts.string, opts.maskingCharacter, opts.numberToMask) + ) + .command( + `fpe `, + `Deidentify sensitive data using Format Preserving Encryption (FPE).`, + { + alphabet: { + type: 'string', + alias: 'a', + default: 'ALPHA_NUMERIC', + choices: ['NUMERIC', 'HEXADECIMAL', 'UPPER_CASE_ALPHA_NUMERIC', 'ALPHA_NUMERIC'] + } + }, + (opts) => deidentifyWithFpe(opts.string, opts.alphabet, opts.keyName, opts.wrappedKey) + ) + .example(`node $0 mask "My SSN is 372819127"`) + .example(`node $0 fpe "My SSN is 372819127" `) + .wrap(120) + .recommendCommands() + .epilogue(`For more information, see https://cloud.google.com/dlp/docs.`); + +if (module === 
require.main) { + cli.help().strict().argv; // eslint-disable-line +} diff --git a/dlp/inspect.js b/dlp/inspect.js index 2e4cc073fc..b01032e467 100644 --- a/dlp/inspect.js +++ b/dlp/inspect.js @@ -25,7 +25,7 @@ function inspectString (string, minLikelihood, maxFindings, infoTypes, includeQu const DLP = require('@google-cloud/dlp'); // Instantiates a client - const dlp = DLP(); + const dlp = new DLP.DlpServiceClient(); // The string to inspect // const string = 'My name is Gary and my email is gary@example.com'; @@ -37,7 +37,7 @@ function inspectString (string, minLikelihood, maxFindings, infoTypes, includeQu // const maxFindings = 0; // The infoTypes of information to match - // const infoTypes = [{ name: 'US_MALE_NAME', name: 'US_FEMALE_NAME' }]; + // const infoTypes = [{ name: 'PHONE_NUMBER' }, { name: 'EMAIL_ADDRESS' }, { name: 'CREDIT_CARD_NUMBER' }]; // Whether to include the matching string // const includeQuote = true; @@ -85,7 +85,7 @@ function inspectFile (filepath, minLikelihood, maxFindings, infoTypes, includeQu const DLP = require('@google-cloud/dlp'); // Instantiates a client - const dlp = DLP(); + const dlp = new DLP.DlpServiceClient(); // The path to a local file to inspect. Can be a text, JPG, or PNG file. // const fileName = 'path/to/image.png'; @@ -97,7 +97,7 @@ function inspectFile (filepath, minLikelihood, maxFindings, infoTypes, includeQu // const maxFindings = 0; // The infoTypes of information to match - // const infoTypes = [{ name: 'US_MALE_NAME' }, { name: 'US_FEMALE_NAME' }]; + // const infoTypes = [{ name: 'PHONE_NUMBER' }, { name: 'EMAIL_ADDRESS' }, { name: 'CREDIT_CARD_NUMBER' }]; // Whether to include the matching string // const includeQuote = true; @@ -148,7 +148,7 @@ function promiseInspectGCSFile (bucketName, fileName, minLikelihood, maxFindings const DLP = require('@google-cloud/dlp'); // Instantiates a client - const dlp = DLP(); + const dlp = new DLP.DlpServiceClient(); // The name of the bucket where the file resides. 
// const bucketName = 'YOUR-BUCKET'; @@ -164,7 +164,7 @@ function promiseInspectGCSFile (bucketName, fileName, minLikelihood, maxFindings // const maxFindings = 0; // The infoTypes of information to match - // const infoTypes = [{ name: 'US_MALE_NAME' }, { name: 'US_FEMALE_NAME' }]; + // const infoTypes = [{ name: 'PHONE_NUMBER' }, { name: 'EMAIL_ADDRESS' }, { name: 'CREDIT_CARD_NUMBER' }]; // Get reference to the file to be inspected const storageItems = { @@ -222,7 +222,7 @@ function eventInspectGCSFile (bucketName, fileName, minLikelihood, maxFindings, const DLP = require('@google-cloud/dlp'); // Instantiates a client - const dlp = DLP(); + const dlp = new DLP.DlpServiceClient(); // The name of the bucket where the file resides. // const bucketName = 'YOUR-BUCKET'; @@ -238,7 +238,7 @@ function eventInspectGCSFile (bucketName, fileName, minLikelihood, maxFindings, // const maxFindings = 0; // The infoTypes of information to match - // const infoTypes = [{ name: 'US_MALE_NAME' }, { name: 'US_FEMALE_NAME' }]; + // const infoTypes = [{ name: 'PHONE_NUMBER' }, { name: 'EMAIL_ADDRESS' }, { name: 'CREDIT_CARD_NUMBER' }]; // Get reference to the file to be inspected const storageItems = { @@ -307,7 +307,7 @@ function inspectDatastore (projectId, namespaceId, kind, minLikelihood, maxFindi const DLP = require('@google-cloud/dlp'); // Instantiates a client - const dlp = DLP(); + const dlp = new DLP.DlpServiceClient(); // (Optional) The project ID containing the target Datastore // const projectId = process.env.GCLOUD_PROJECT; @@ -326,9 +326,9 @@ function inspectDatastore (projectId, namespaceId, kind, minLikelihood, maxFindi // const maxFindings = 0; // The infoTypes of information to match - // const infoTypes = [{ name: 'US_MALE_NAME' }, { name: 'US_FEMALE_NAME' }]; + // const infoTypes = [{ name: 'PHONE_NUMBER' }, { name: 'EMAIL_ADDRESS' }, { name: 'CREDIT_CARD_NUMBER' }]; - // Get reference to the file to be inspected + // Construct items to be inspected const 
storageItems = { datastoreOptions: { partitionId: { @@ -384,6 +384,86 @@ function inspectDatastore (projectId, namespaceId, kind, minLikelihood, maxFindi // [END inspect_datastore] } +function inspectBigquery (projectId, datasetId, tableId, minLikelihood, maxFindings, infoTypes, includeQuote) { + // [START inspect_bigquery] + // Imports the Google Cloud Data Loss Prevention library + const DLP = require('@google-cloud/dlp'); + + // Instantiates a client + const dlp = new DLP.DlpServiceClient(); + + // (Optional) The project ID to run the API call under + // const projectId = process.env.GCLOUD_PROJECT; + + // The ID of the dataset to inspect, e.g. 'my_dataset' + // const datasetId = 'my_dataset'; + + // The ID of the table to inspect, e.g. 'my_table' + // const tableId = 'my_table'; + + // The minimum likelihood required before returning a match + // const minLikelihood = 'LIKELIHOOD_UNSPECIFIED'; + + // The maximum number of findings to report (0 = server maximum) + // const maxFindings = 0; + + // The infoTypes of information to match + // const infoTypes = [{ name: 'PHONE_NUMBER' }, { name: 'EMAIL_ADDRESS' }, { name: 'CREDIT_CARD_NUMBER' }]; + + // Construct items to be inspected + const storageItems = { + bigQueryOptions: { + tableReference: { + projectId: projectId, + datasetId: datasetId, + tableId: tableId + } + } + }; + + // Construct request for creating an inspect job + const request = { + inspectConfig: { + infoTypes: infoTypes, + minLikelihood: minLikelihood, + maxFindings: maxFindings + }, + storageConfig: storageItems + }; + + // Run inspect-job creation request + dlp.createInspectOperation(request) + .then((createJobResponse) => { + const operation = createJobResponse[0]; + + // Start polling for job completion + return operation.promise(); + }) + .then((completeJobResponse) => { + // When job is complete, get its results + const jobName = completeJobResponse[0].name; + return dlp.listInspectFindings({ + name: jobName + }); + }) + .then((results) => 
{ + const findings = results[0].result.findings; + if (findings.length > 0) { + console.log(`Findings:`); + findings.forEach((finding) => { + console.log(`\tInfo type: ${finding.infoType.name}`); + console.log(`\tLikelihood: ${finding.likelihood}`); + }); + } else { + console.log(`No findings.`); + } + }) + .catch((err) => { + console.log(`Error in inspectBigquery: ${err.message || err}`); + }); + // [END inspect_bigquery] +} + const cli = require(`yargs`) // eslint-disable-line .demand(1) .command( @@ -434,6 +514,26 @@ const cli = require(`yargs`) // eslint-disable-line opts.infoTypes ) ) + .command( + `bigquery `, + `Inspects a BigQuery table using the Data Loss Prevention API.`, + { + projectId: { + type: 'string', + alias: 'p', + default: process.env.GCLOUD_PROJECT + } + }, + (opts) => inspectBigquery( + opts.projectId, + opts.datasetName, + opts.tableName, + opts.minLikelihood, + opts.maxFindings, + opts.infoTypes, + opts.includeQuote + ) + ) .command( `datastore `, `Inspect a Datastore instance using the Data Loss Prevention API.`, @@ -449,7 +549,15 @@ const cli = require(`yargs`) // eslint-disable-line default: '' } }, - (opts) => inspectDatastore(opts.projectId, opts.namespaceId, opts.kind, opts.minLikelihood, opts.maxFindings, opts.infoTypes, opts.includeQuote) + (opts) => inspectDatastore( + opts.projectId, + opts.namespaceId, + opts.kind, + opts.minLikelihood, + opts.maxFindings, + opts.infoTypes, + opts.includeQuote + ) ) .option('m', { alias: 'minLikelihood', @@ -477,15 +585,9 @@ const cli = require(`yargs`) // eslint-disable-line type: 'boolean', global: true }) - .option('l', { - alias: 'languageCode', - default: 'en-US', - type: 'string', - global: true - }) .option('t', { alias: 'infoTypes', - default: [], + default: ['PHONE_NUMBER', 'EMAIL_ADDRESS', 'CREDIT_CARD_NUMBER'], type: 'array', global: true, coerce: (infoTypes) => infoTypes.map((type) => { @@ -496,6 +598,8 @@ const cli = require(`yargs`) // eslint-disable-line .example(`node $0 file 
resources/test.txt`) .example(`node $0 gcsFilePromise my-bucket my-file.txt`) .example(`node $0 gcsFileEvent my-bucket my-file.txt`) + .example(`node $0 bigquery my-dataset my-table`) + .example(`node $0 datastore my-datastore-kind`) .wrap(120) .recommendCommands() .epilogue(`For more information, see https://cloud.google.com/dlp/docs. Optional flags are explained at https://cloud.google.com/dlp/docs/reference/rest/v2beta1/content/inspect#InspectConfig`); diff --git a/dlp/metadata.js b/dlp/metadata.js index be492f5c03..4725d0794d 100644 --- a/dlp/metadata.js +++ b/dlp/metadata.js @@ -21,7 +21,7 @@ function listInfoTypes (category, languageCode) { const DLP = require('@google-cloud/dlp'); // Instantiates a client - const dlp = DLP(); + const dlp = new DLP.DlpServiceClient(); // The category of info types to list. // const category = 'CATEGORY_TO_LIST'; @@ -52,7 +52,7 @@ function listRootCategories (languageCode) { const DLP = require('@google-cloud/dlp'); // Instantiates a client - const dlp = DLP(); + const dlp = new DLP.DlpServiceClient(); // The BCP-47 language code to use, e.g. 
'en-US' // const languageCode = 'en-US'; diff --git a/dlp/package.json b/dlp/package.json index 05a02a77f8..24ac62fa84 100644 --- a/dlp/package.json +++ b/dlp/package.json @@ -20,6 +20,10 @@ "cloud-repo-tools": { "requiresKeyFile": true, "requiresProjectId": true, + "requiredEnvVars": [ + "DLP_DEID_WRAPPED_KEY", + "DLP_DEID_KEY_NAME" + ], "product": "dlp", "samples": [ { @@ -42,15 +46,30 @@ "file": "metadata.js", "docs_link": "https://cloud.google.com/dlp/docs", "usage": "node metadata.js --help" + }, + { + "id": "deid", + "name": "DeID", + "file": "deid.js", + "docs_link": "https://cloud.google.com/dlp/docs", + "usage": "node deid.js --help" + }, + { + "id": "risk", + "name": "Risk Analysis", + "file": "risk.js", + "docs_link": "https://cloud.google.com/dlp/docs", + "usage": "node risk.js --help" } ] }, "dependencies": { + "@google-cloud/bigquery": "^0.9.6", "@google-cloud/dlp": "^0.1.0", "google-auth-library": "0.10.0", - "google-auto-auth": "0.7.1", - "google-proto-files": "0.12.1", - "mime": "1.3.6", + "google-auto-auth": "0.7.2", + "google-proto-files": "0.13.0", + "mime": "1.4.0", "request": "2.81.0", "request-promise": "4.2.1", "safe-buffer": "5.1.1", @@ -58,6 +77,6 @@ }, "devDependencies": { "@google-cloud/nodejs-repo-tools": "1.4.17", - "ava": "0.21.0" + "ava": "0.22.0" } } diff --git a/dlp/quickstart.js b/dlp/quickstart.js index 370348330c..392e8008fb 100644 --- a/dlp/quickstart.js +++ b/dlp/quickstart.js @@ -20,7 +20,7 @@ const DLP = require('@google-cloud/dlp'); // Instantiates a client -const dlp = DLP(); +const dlp = new DLP.DlpServiceClient(); // The string to inspect const string = 'Robert Frost'; diff --git a/dlp/redact.js b/dlp/redact.js index e299592df5..2bc1c23902 100644 --- a/dlp/redact.js +++ b/dlp/redact.js @@ -21,7 +21,7 @@ function redactString (string, replaceString, minLikelihood, infoTypes) { const DLP = require('@google-cloud/dlp'); // Instantiates a client - const dlp = DLP(); + const dlp = new DLP.DlpServiceClient(); // The string to 
inspect // const string = 'My name is Gary and my email is gary@example.com'; @@ -74,7 +74,7 @@ function redactImage (filepath, minLikelihood, infoTypes, outputPath) { const DLP = require('@google-cloud/dlp'); // Instantiates a client - const dlp = DLP(); + const dlp = new DLP.DlpServiceClient(); // The path to a local file to inspect. Can be a JPG or PNG image file. // const fileName = 'path/to/image.png'; diff --git a/dlp/risk.js b/dlp/risk.js new file mode 100644 index 0000000000..6faa1c4a7e --- /dev/null +++ b/dlp/risk.js @@ -0,0 +1,350 @@ +/** + * Copyright 2017, Google, Inc. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +'use strict'; + +function numericalRiskAnalysis (projectId, datasetId, tableId, columnName) { + // [START numerical_risk] + // Imports the Google Cloud Data Loss Prevention library + const DLP = require('@google-cloud/dlp'); + + // Instantiates a client + const dlp = new DLP.DlpServiceClient(); + + // (Optional) The project ID to run the API call under + // const projectId = process.env.GCLOUD_PROJECT; + + // The ID of the dataset to inspect, e.g. 'my_dataset' + // const datasetId = 'my_dataset'; + + // The ID of the table to inspect, e.g. 'my_table' + // const tableId = 'my_table'; + + // The name of the column to compute risk metrics for, e.g. 
'age' + // Note that this column must be a numeric data type + // const columnName = 'age'; + + const sourceTable = { + projectId: projectId, + datasetId: datasetId, + tableId: tableId + }; + + // Construct request for creating a risk analysis job + const request = { + privacyMetric: { + numericalStatsConfig: { + field: { + columnName: columnName + } + } + }, + sourceTable: sourceTable + }; + + // Create helper function for unpacking values + const getValue = (obj) => obj[Object.keys(obj)[0]]; + + // Run risk analysis job + dlp.analyzeDataSourceRisk(request) + .then((response) => { + const operation = response[0]; + return operation.promise(); + }) + .then((completedJobResponse) => { + const results = completedJobResponse[0].numericalStatsResult; + + console.log(`Value Range: [${getValue(results.minValue)}, ${getValue(results.maxValue)}]`); + + // Print unique quantile values + let tempValue = null; + results.quantileValues.forEach((result, percent) => { + const value = getValue(result); + + // Only print new values + if ((tempValue !== value) && + !(tempValue && tempValue.equals && tempValue.equals(value))) { + console.log(`Value at ${percent}% quantile: ${value}`); + tempValue = value; + } + }); + }) + .catch((err) => { + console.log(`Error in numericalRiskAnalysis: ${err.message || err}`); + }); + // [END numerical_risk] +} + +function categoricalRiskAnalysis (projectId, datasetId, tableId, columnName) { + // [START categorical_risk] + // Imports the Google Cloud Data Loss Prevention library + const DLP = require('@google-cloud/dlp'); + + // Instantiates a client + const dlp = new DLP.DlpServiceClient(); + + // (Optional) The project ID to run the API call under + // const projectId = process.env.GCLOUD_PROJECT; + + // The ID of the dataset to inspect, e.g. 'my_dataset' + // const datasetId = 'my_dataset'; + + // The ID of the table to inspect, e.g. 'my_table' + // const tableId = 'my_table'; + + // The name of the column to compute risk metrics for, e.g. 
'firstName' + // const columnName = 'firstName'; + + const sourceTable = { + projectId: projectId, + datasetId: datasetId, + tableId: tableId + }; + + // Construct request for creating a risk analysis job + const request = { + privacyMetric: { + categoricalStatsConfig: { + field: { + columnName: columnName + } + } + }, + sourceTable: sourceTable + }; + + // Create helper function for unpacking values + const getValue = (obj) => obj[Object.keys(obj)[0]]; + + // Run risk analysis job + dlp.analyzeDataSourceRisk(request) + .then((response) => { + const operation = response[0]; + return operation.promise(); + }) + .then((completedJobResponse) => { + const results = completedJobResponse[0].categoricalStatsResult.valueFrequencyHistogramBuckets[0]; + console.log(`Most common value occurs ${results.valueFrequencyUpperBound} time(s)`); + console.log(`Least common value occurs ${results.valueFrequencyLowerBound} time(s)`); + console.log(`${results.bucketSize} unique values total.`); + results.bucketValues.forEach((bucket) => { + console.log(`Value ${getValue(bucket.value)} occurs ${bucket.count} time(s).`); + }); + }) + .catch((err) => { + console.log(`Error in categoricalRiskAnalysis: ${err.message || err}`); + }); + // [END categorical_risk] +} + +function kAnonymityAnalysis (projectId, datasetId, tableId, quasiIds) { + // [START k_anonymity] + // Imports the Google Cloud Data Loss Prevention library + const DLP = require('@google-cloud/dlp'); + + // Instantiates a client + const dlp = new DLP.DlpServiceClient(); + + // (Optional) The project ID to run the API call under + // const projectId = process.env.GCLOUD_PROJECT; + + // The ID of the dataset to inspect, e.g. 'my_dataset' + // const datasetId = 'my_dataset'; + + // The ID of the table to inspect, e.g. 
'my_table' + // const tableId = 'my_table'; + + // A set of columns that form a composite key ('quasi-identifiers') + // const quasiIds = [{ columnName: 'age' }, { columnName: 'city' }]; + + const sourceTable = { + projectId: projectId, + datasetId: datasetId, + tableId: tableId + }; + + // Construct request for creating a risk analysis job + const request = { + privacyMetric: { + kAnonymityConfig: { + quasiIds: quasiIds + } + }, + sourceTable: sourceTable + }; + + // Create helper function for unpacking values + const getValue = (obj) => obj[Object.keys(obj)[0]]; + + // Run risk analysis job + dlp.analyzeDataSourceRisk(request) + .then((response) => { + const operation = response[0]; + return operation.promise(); + }) + .then((completedJobResponse) => { + const results = completedJobResponse[0].kAnonymityResult.equivalenceClassHistogramBuckets[0]; + console.log(`Bucket size range: [${results.equivalenceClassSizeLowerBound}, ${results.equivalenceClassSizeUpperBound}]`); + + results.bucketValues.forEach((bucket) => { + const quasiIdValues = bucket.quasiIdsValues.map(getValue).join(', '); + console.log(` Quasi-ID values: {${quasiIdValues}}`); + console.log(` Class size: ${bucket.equivalenceClassSize}`); + }); + }) + .catch((err) => { + console.log(`Error in kAnonymityAnalysis: ${err.message || err}`); + }); + // [END k_anonymity] +} + +function lDiversityAnalysis (projectId, datasetId, tableId, sensitiveAttribute, quasiIds) { + // [START l_diversity] + // Imports the Google Cloud Data Loss Prevention library + const DLP = require('@google-cloud/dlp'); + + // Instantiates a client + const dlp = new DLP.DlpServiceClient(); + + // (Optional) The project ID to run the API call under + // const projectId = process.env.GCLOUD_PROJECT; + + // The ID of the dataset to inspect, e.g. 'my_dataset' + // const datasetId = 'my_dataset'; + + // The ID of the table to inspect, e.g. 'my_table' + // const tableId = 'my_table'; + + // The column to measure l-diversity relative to, e.g. 
'firstName' + // const sensitiveAttribute = 'name'; + + // A set of columns that form a composite key ('quasi-identifiers') + // const quasiIds = [{ columnName: 'age' }, { columnName: 'city' }]; + + const sourceTable = { + projectId: projectId, + datasetId: datasetId, + tableId: tableId + }; + + // Construct request for creating a risk analysis job + const request = { + privacyMetric: { + lDiversityConfig: { + quasiIds: quasiIds, + sensitiveAttribute: { + columnName: sensitiveAttribute + } + } + }, + sourceTable: sourceTable + }; + + // Create helper function for unpacking values + const getValue = (obj) => obj[Object.keys(obj)[0]]; + + // Run risk analysis job + dlp.analyzeDataSourceRisk(request) + .then((response) => { + const operation = response[0]; + return operation.promise(); + }) + .then((completedJobResponse) => { + const results = completedJobResponse[0].lDiversityResult.sensitiveValueFrequencyHistogramBuckets[0]; + + console.log(`Bucket size range: [${results.sensitiveValueFrequencyLowerBound}, ${results.sensitiveValueFrequencyUpperBound}]`); + results.bucketValues.forEach((bucket) => { + const quasiIdValues = bucket.quasiIdsValues.map(getValue).join(', '); + console.log(` Quasi-ID values: {${quasiIdValues}}`); + console.log(` Class size: ${bucket.equivalenceClassSize}`); + bucket.topSensitiveValues.forEach((valueObj) => { + console.log(` Sensitive value ${getValue(valueObj.value)} occurs ${valueObj.count} time(s).`); + }); + }); + }) + .catch((err) => { + console.log(`Error in lDiversityAnalysis: ${err.message || err}`); + }); + // [END l_diversity] +} + +const cli = require(`yargs`) // eslint-disable-line + .demand(1) + .command( + `numerical `, + `Computes risk metrics of a column of numbers in a Google BigQuery table.`, + {}, + (opts) => numericalRiskAnalysis( + opts.projectId, + opts.datasetId, + opts.tableId, + opts.columnName + ) + ) + .command( + `categorical `, + `Computes risk metrics of a column of data in a Google BigQuery table.`, + {}, + 
(opts) => categoricalRiskAnalysis( + opts.projectId, + opts.datasetId, + opts.tableId, + opts.columnName + ) + ) + .command( + `kAnonymity [quasiIdColumnNames..]`, + `Computes the k-anonymity of a column set in a Google BigQuery table.`, + {}, + (opts) => kAnonymityAnalysis( + opts.projectId, + opts.datasetId, + opts.tableId, + opts.quasiIdColumnNames.map((f) => { + return { columnName: f }; + }) + ) + ) + .command( + `lDiversity [quasiIdColumnNames..]`, + `Computes the l-diversity of a column set in a Google BigQuery table.`, + {}, + (opts) => lDiversityAnalysis( + opts.projectId, + opts.datasetId, + opts.tableId, + opts.sensitiveAttribute, + opts.quasiIdColumnNames.map((f) => { + return { columnName: f }; + }) + ) + ) + .option('p', { + type: 'string', + alias: 'projectId', + default: process.env.GCLOUD_PROJECT, + global: true + }) + .example(`node $0 numerical nhtsa_traffic_fatalities accident_2015 state_number -p bigquery-public-data`) + .example(`node $0 categorical nhtsa_traffic_fatalities accident_2015 state_name -p bigquery-public-data`) + .example(`node $0 kAnonymity nhtsa_traffic_fatalities accident_2015 state_number county -p bigquery-public-data`) + .example(`node $0 lDiversity nhtsa_traffic_fatalities accident_2015 city state_number county -p bigquery-public-data`) + .wrap(120) + .recommendCommands() + .epilogue(`For more information, see https://cloud.google.com/dlp/docs.`); + +if (module === require.main) { + cli.help().strict().argv; // eslint-disable-line +} diff --git a/dlp/system-test/deid.test.js b/dlp/system-test/deid.test.js new file mode 100644 index 0000000000..b7348a9314 --- /dev/null +++ b/dlp/system-test/deid.test.js @@ -0,0 +1,64 @@ +/** + * Copyright 2017, Google, Inc. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +'use strict'; + +const path = require('path'); +const test = require('ava'); +const tools = require('@google-cloud/nodejs-repo-tools'); + +const cmd = 'node deid'; +const cwd = path.join(__dirname, `..`); + +const harmfulString = 'My SSN is 372819127'; +const harmlessString = 'My favorite color is blue'; + +const wrappedKey = process.env.DLP_DEID_WRAPPED_KEY; +const keyName = process.env.DLP_DEID_KEY_NAME; + +test.before(tools.checkCredentials); + +// deidentify_masking +test(`should mask sensitive data in a string`, async (t) => { + const output = await tools.runAsync(`${cmd} mask "${harmfulString}" -c x -n 5`, cwd); + t.is(output, 'My SSN is xxxxx9127'); +}); + +test(`should ignore insensitive data when masking a string`, async (t) => { + const output = await tools.runAsync(`${cmd} mask "${harmlessString}"`, cwd); + t.is(output, harmlessString); +}); + +test(`should handle masking errors`, async (t) => { + const output = await tools.runAsync(`${cmd} mask "${harmfulString}" -n -1`, cwd); + t.regex(output, /Error in deidentifyWithMask/); +}); + +// deidentify_fpe +test(`should FPE encrypt sensitive data in a string`, async (t) => { + const output = await tools.runAsync(`${cmd} fpe "${harmfulString}" ${wrappedKey} ${keyName} -a NUMERIC`, cwd); + t.regex(output, /My SSN is \d{9}/); + t.not(output, harmfulString); +}); + +test(`should ignore insensitive data when FPE encrypting a string`, async (t) => { + const output = await tools.runAsync(`${cmd} fpe "${harmlessString}" ${wrappedKey} ${keyName}`, cwd); + t.is(output, harmlessString); +}); + 
+test(`should handle FPE encryption errors`, async (t) => { + const output = await tools.runAsync(`${cmd} fpe "${harmfulString}" ${wrappedKey} BAD_KEY_NAME`, cwd); + t.regex(output, /Error in deidentifyWithFpe/); +}); diff --git a/dlp/system-test/inspect.test.js b/dlp/system-test/inspect.test.js index 246f52fd40..922f83e96e 100644 --- a/dlp/system-test/inspect.test.js +++ b/dlp/system-test/inspect.test.js @@ -75,7 +75,6 @@ test.serial(`should inspect multiple GCS text files with event handlers`, async t.regex(output, /Processed \d+ of approximately \d+ bytes./); t.regex(output, /Info type: PHONE_NUMBER/); t.regex(output, /Info type: EMAIL_ADDRESS/); - t.regex(output, /Info type: CREDIT_CARD_NUMBER/); }); test.serial(`should handle a GCS file with no sensitive data with event handlers`, async (t) => { @@ -100,7 +99,6 @@ test.serial(`should inspect multiple GCS text files with promises`, async (t) => const output = await tools.runAsync(`${cmd} gcsFilePromise nodejs-docs-samples-dlp *.txt`, cwd); t.regex(output, /Info type: PHONE_NUMBER/); t.regex(output, /Info type: EMAIL_ADDRESS/); - t.regex(output, /Info type: CREDIT_CARD_NUMBER/); }); test.serial(`should handle a GCS file with no sensitive data with promises`, async (t) => { @@ -116,7 +114,6 @@ test.serial(`should report GCS file handling errors with promises`, async (t) => // inspect_datastore test.serial(`should inspect Datastore`, async (t) => { const output = await tools.runAsync(`${cmd} datastore Person --namespaceId DLP`, cwd); - t.regex(output, /Info type: PHONE_NUMBER/); t.regex(output, /Info type: EMAIL_ADDRESS/); }); @@ -125,11 +122,27 @@ test.serial(`should handle Datastore with no sensitive data`, async (t) => { t.is(output, 'No findings.'); }); -test.serial(`should report Datastore file handling errors`, async (t) => { +test.serial(`should report Datastore errors`, async (t) => { const output = await tools.runAsync(`${cmd} datastore Harmless --namespaceId DLP -t BAD_TYPE`, cwd); t.regex(output, /Error 
in inspectDatastore/); }); +// inspect_bigquery +test.serial(`should inspect a Bigquery table`, async (t) => { + const output = await tools.runAsync(`${cmd} bigquery integration_tests_dlp harmful`, cwd); + t.regex(output, /Info type: CREDIT_CARD_NUMBER/); +}); + +test.serial(`should handle a Bigquery table with no sensitive data`, async (t) => { + const output = await tools.runAsync(`${cmd} bigquery integration_tests_dlp harmless `, cwd); + t.is(output, 'No findings.'); +}); + +test.serial(`should report Bigquery table handling errors`, async (t) => { + const output = await tools.runAsync(`${cmd} bigquery integration_tests_dlp harmless -t BAD_TYPE`, cwd); + t.regex(output, /Error in inspectBigquery/); +}); + // CLI options test(`should have a minLikelihood option`, async (t) => { const promiseA = tools.runAsync(`${cmd} string "My phone number is (123) 456-7890." -m POSSIBLE`, cwd); diff --git a/dlp/system-test/metadata.test.js b/dlp/system-test/metadata.test.js index 086ab9cf22..5f088a4620 100644 --- a/dlp/system-test/metadata.test.js +++ b/dlp/system-test/metadata.test.js @@ -27,6 +27,7 @@ test.before(tools.checkCredentials); test(`should list info types for a given category`, async (t) => { const output = await tools.runAsync(`${cmd} infoTypes GOVERNMENT`, cwd); t.regex(output, /US_DRIVERS_LICENSE_NUMBER/); + t.false(output.includes('AMERICAN_BANKERS_CUSIP_ID')); }); test(`should inspect categories`, async (t) => { diff --git a/dlp/system-test/risk.test.js b/dlp/system-test/risk.test.js new file mode 100644 index 0000000000..8481ad911e --- /dev/null +++ b/dlp/system-test/risk.test.js @@ -0,0 +1,96 @@ +/** + * Copyright 2017, Google, Inc. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +'use strict'; + +const path = require('path'); +const test = require('ava'); +const tools = require('@google-cloud/nodejs-repo-tools'); + +const cmd = 'node risk'; +const cwd = path.join(__dirname, `..`); + +const dataset = 'integration_tests_dlp'; +const uniqueField = 'Name'; +const repeatedField = 'Mystery'; +const numericField = 'Age'; + +test.before(tools.checkCredentials); + +// numericalRiskAnalysis +test(`should perform numerical risk analysis`, async (t) => { + const output = await tools.runAsync(`${cmd} numerical ${dataset} harmful ${numericField}`, cwd); + t.regex(output, /Value at 0% quantile: \d{2}/); + t.regex(output, /Value at \d{2}% quantile: \d{2}/); +}); + +test(`should handle numerical risk analysis errors`, async (t) => { + const output = await tools.runAsync(`${cmd} numerical ${dataset} nonexistent ${numericField}`, cwd); + t.regex(output, /Error in numericalRiskAnalysis/); +}); + +// categoricalRiskAnalysis +test(`should perform categorical risk analysis on a string field`, async (t) => { + const output = await tools.runAsync(`${cmd} categorical ${dataset} harmful ${uniqueField}`, cwd); + t.regex(output, /Most common value occurs \d time\(s\)/); +}); + +test(`should perform categorical risk analysis on a number field`, async (t) => { + const output = await tools.runAsync(`${cmd} categorical ${dataset} harmful ${numericField}`, cwd); + t.regex(output, /Most common value occurs \d time\(s\)/); +}); + +test(`should handle categorical risk analysis errors`, async (t) => { + const output = await tools.runAsync(`${cmd} 
categorical ${dataset} nonexistent ${uniqueField}`, cwd); + t.regex(output, /Error in categoricalRiskAnalysis/); +}); + +// kAnonymityAnalysis +test(`should perform k-anonymity analysis on a single field`, async (t) => { + const output = await tools.runAsync(`${cmd} kAnonymity ${dataset} harmful ${numericField}`, cwd); + t.regex(output, /Quasi-ID values: \{\d{2}\}/); + t.regex(output, /Class size: \d/); +}); + +test(`should perform k-anonymity analysis on multiple fields`, async (t) => { + const output = await tools.runAsync(`${cmd} kAnonymity ${dataset} harmful ${numericField} ${repeatedField}`, cwd); + t.regex(output, /Quasi-ID values: \{\d{2}, \d{4} \d{4} \d{4} \d{4}\}/); + t.regex(output, /Class size: \d/); +}); + +test(`should handle k-anonymity analysis errors`, async (t) => { + const output = await tools.runAsync(`${cmd} kAnonymity ${dataset} nonexistent ${numericField}`, cwd); + t.regex(output, /Error in kAnonymityAnalysis/); +}); + +// lDiversityAnalysis +test(`should perform l-diversity analysis on a single field`, async (t) => { + const output = await tools.runAsync(`${cmd} lDiversity ${dataset} harmful ${uniqueField} ${numericField}`, cwd); + t.regex(output, /Quasi-ID values: \{\d{2}\}/); + t.regex(output, /Class size: \d/); + t.regex(output, /Sensitive value James occurs \d time\(s\)/); +}); + +test(`should perform l-diversity analysis on multiple fields`, async (t) => { + const output = await tools.runAsync(`${cmd} lDiversity ${dataset} harmful ${uniqueField} ${numericField} ${repeatedField}`, cwd); + t.regex(output, /Quasi-ID values: \{\d{2}, \d{4} \d{4} \d{4} \d{4}\}/); + t.regex(output, /Class size: \d/); + t.regex(output, /Sensitive value James occurs \d time\(s\)/); +}); + +test(`should handle l-diversity analysis errors`, async (t) => { + const output = await tools.runAsync(`${cmd} lDiversity ${dataset} nonexistent ${uniqueField} ${numericField}`, cwd); + t.regex(output, /Error in lDiversityAnalysis/); +});