From 172d411ac6235ed286618941cf54cb92be5287f4 Mon Sep 17 00:00:00 2001 From: imreACTmd Date: Sun, 17 Oct 2021 18:13:43 +0000 Subject: [PATCH 1/6] move pg_dump and output to streams --- src/index.ts | 49 ++++++++++++++++++++++--------------------------- 1 file changed, 22 insertions(+), 27 deletions(-) diff --git a/src/index.ts b/src/index.ts index 6d26c35..48582ab 100644 --- a/src/index.ts +++ b/src/index.ts @@ -1,9 +1,9 @@ import { Command, flags } from "@oclif/command"; -import { promisify } from "util"; -import { exec } from "child_process"; +import { spawn } from "child_process"; const faker = require("faker"); const fs = require("fs"); const path = require("path"); +const readline = require('readline'); function dieAndLog(message: string, error: any) { console.error(message); @@ -56,23 +56,6 @@ class PgAnonymizer extends Command { }), }; - async originalDump(db: string, memory: number): Promise { - const execPromisified = promisify(exec); - try { - console.log("Launching pg_dump"); - const { stdout, stderr } = await execPromisified(`pg_dump ${db}`, { - maxBuffer: memory * 1024 * 1024, - }); - if (stderr.trim()) { - dieAndLog("pg_dump command failed.", stderr); - } - return stdout; - } catch (e) { - dieAndLog("pg_dump command failed. Are you sure it is installed?", e); - } - return ""; - } - async run() { const { args, flags } = this.parse(PgAnonymizer); @@ -84,10 +67,17 @@ class PgAnonymizer extends Command { ? 
require(path.join(process.cwd(), flags.extension)) : null; - const result = await this.originalDump( - args.database, - Number(flags.pgDumpOutputMemory) - ); + console.log("Launching pg_dump"); + const pg = spawn('pg_dump', [args.database]); + pg.on('exit', function(code) { + if (code != 0) { + dieAndLog("pg_dump command failed with exit code", code); + } + }); + pg.stderr.on('data', function(data) { + dieAndLog("pg_dump command error:", data); + }); + pg.stdout.setEncoding('utf8'); const list = flags.list.split(",").map((l) => { return { @@ -100,11 +90,16 @@ class PgAnonymizer extends Command { let indices: Number[] = []; let cols: string[] = []; - console.log("Command pg_dump done, starting anonymization."); + console.log("Command pg_dump started, running anonymization."); console.log("Output file: " + flags.output); - fs.writeFileSync(flags.output, ""); + let out = fs.createWriteStream(flags.output); + + const inputLineResults = readline.createInterface({ + input: pg.stdout, + crlfDelay: Infinity + }) as any as Iterable; - for (let line of result.split("\n")) { + for await (let line of inputLineResults) { if (line.match(/^COPY .* FROM stdin;$/)) { table = line.replace(/^COPY (.*?) 
.*$/, "$1"); console.log("Anonymizing table " + table); @@ -175,7 +170,7 @@ class PgAnonymizer extends Command { cols = []; } try { - fs.appendFileSync(flags.output, line + "\n"); + out.write(line + "\n"); } catch (e) { dieAndLog("Failed to write file", e); } From bdaac90eb9107ab8c8eb5324ab2657e1674010a6 Mon Sep 17 00:00:00 2001 From: imreACTmd Date: Tue, 28 Dec 2021 15:48:04 +0100 Subject: [PATCH 2/6] now you can specify which table you want to replace a column in also fix bug where fixed word gets overridden with random faker word --- README.md | 10 ++++++---- src/index.ts | 33 ++++++++++++++++++++------------- 2 files changed, 26 insertions(+), 17 deletions(-) diff --git a/README.md b/README.md index 748c37d..db3fa10 100644 --- a/README.md +++ b/README.md @@ -32,6 +32,12 @@ Specifying another list via `--list` replace the default automatically anonymize email,name,description,address,city,country,phone,comment,birthdate ``` +You can also specify the table for a column using the slash notation: + +```csv +public.user/email,public.product/description +``` + #### Customize replacements You can also choose which faker function you want to use to replace data (default is `faker.random.word`): @@ -70,10 +76,6 @@ module.exports = { }; ``` -### Memory limit - -Use `-m` to change `pg_dump` output memory limit (e.g: `512`) - ### Locale (i18n) Use `-l` to change the locale used by faker (default: `en`) diff --git a/src/index.ts b/src/index.ts index 48582ab..40ff1f3 100644 --- a/src/index.ts +++ b/src/index.ts @@ -112,7 +112,8 @@ class PgAnonymizer extends Command { .map((e) => e.toLowerCase()); indices = cols.reduce((acc: Number[], value, key) => { - if (list.find((l) => l.col === value)) acc.push(key); + if (list.find((l) => l.col === value)) acc.push(key) + else if (list.find((l) => l.col === table + '/' + value)) acc.push(key); return acc; }, []); @@ -127,9 +128,14 @@ class PgAnonymizer extends Command { .split("\t") .map((v, k) => { if (indices.includes(k)) { - const 
replacement = list.find( + let replacement = list.find( (l) => l.col === cols[k] )?.replacement; + if (!replacement) { + replacement = list.find( + (l) => l.col === table + '/' + cols[k] + )?.replacement; + } if (replacement) { if (replacement.startsWith("faker.")) { const [_one, two, three] = replacement.split("."); @@ -148,18 +154,19 @@ class PgAnonymizer extends Command { }, extension)(v, table); } return replacement; + } else { + if (cols[k] === "email") return faker.internet.email(); + if (cols[k] === "name") return faker.name.findName(); + if (cols[k] === "description") return faker.random.words(3); + if (cols[k] === "address") return faker.address.streetAddress(); + if (cols[k] === "city") return faker.address.city(); + if (cols[k] === "country") return faker.address.country(); + if (cols[k] === "phone") return faker.phone.phoneNumber(); + if (cols[k] === "comment") return faker.random.words(3); + if (cols[k] === "birthdate") + return postgreSQLDate(faker.date.past()); + return faker.random.word(); } - if (cols[k] === "email") return faker.internet.email(); - if (cols[k] === "name") return faker.name.findName(); - if (cols[k] === "description") return faker.random.words(3); - if (cols[k] === "address") return faker.address.streetAddress(); - if (cols[k] === "city") return faker.address.city(); - if (cols[k] === "country") return faker.address.country(); - if (cols[k] === "phone") return faker.phone.phoneNumber(); - if (cols[k] === "comment") return faker.random.words(3); - if (cols[k] === "birthdate") - return postgreSQLDate(faker.date.past()); - return faker.random.word(); } return v; }) From f53959d0824d53bd60e3b0bdde72fc506e2da423 Mon Sep 17 00:00:00 2001 From: imreACTmd Date: Thu, 30 Dec 2021 16:41:45 +0100 Subject: [PATCH 3/6] updates based on comments --- README.md | 4 ++-- src/index.ts | 31 +++++++++++++++---------------- 2 files changed, 17 insertions(+), 18 deletions(-) diff --git a/README.md b/README.md index db3fa10..e2350a8 100644 --- a/README.md 
+++ b/README.md @@ -32,10 +32,10 @@ Specifying another list via `--list` replace the default automatically anonymize email,name,description,address,city,country,phone,comment,birthdate ``` -You can also specify the table for a column using the slash notation: +You can also specify the table for a column using the dot notation: ```csv -public.user/email,public.product/description +public.user.email,public.product.description,email,name ``` #### Customize replacements diff --git a/src/index.ts b/src/index.ts index 40ff1f3..5a41c10 100644 --- a/src/index.ts +++ b/src/index.ts @@ -51,8 +51,8 @@ class PgAnonymizer extends Command { }), pgDumpOutputMemory: flags.string({ char: "m", - description: "max memory used to get output from pg_dump in MB", - default: "256", + description: "Obsolete, not needed any more: max memory used to get output from pg_dump in MB", + default: "0", }), }; @@ -113,7 +113,7 @@ class PgAnonymizer extends Command { indices = cols.reduce((acc: Number[], value, key) => { if (list.find((l) => l.col === value)) acc.push(key) - else if (list.find((l) => l.col === table + '/' + value)) acc.push(key); + else if (list.find((l) => l.col === table + '.' + value)) acc.push(key); return acc; }, []); @@ -133,7 +133,7 @@ class PgAnonymizer extends Command { )?.replacement; if (!replacement) { replacement = list.find( - (l) => l.col === table + '/' + cols[k] + (l) => l.col === table + '.' 
+ cols[k] )?.replacement; } if (replacement) { @@ -154,19 +154,18 @@ class PgAnonymizer extends Command { }, extension)(v, table); } return replacement; - } else { - if (cols[k] === "email") return faker.internet.email(); - if (cols[k] === "name") return faker.name.findName(); - if (cols[k] === "description") return faker.random.words(3); - if (cols[k] === "address") return faker.address.streetAddress(); - if (cols[k] === "city") return faker.address.city(); - if (cols[k] === "country") return faker.address.country(); - if (cols[k] === "phone") return faker.phone.phoneNumber(); - if (cols[k] === "comment") return faker.random.words(3); - if (cols[k] === "birthdate") - return postgreSQLDate(faker.date.past()); - return faker.random.word(); } + if (cols[k] === "email") return faker.internet.email(); + if (cols[k] === "name") return faker.name.findName(); + if (cols[k] === "description") return faker.random.words(3); + if (cols[k] === "address") return faker.address.streetAddress(); + if (cols[k] === "city") return faker.address.city(); + if (cols[k] === "country") return faker.address.country(); + if (cols[k] === "phone") return faker.phone.phoneNumber(); + if (cols[k] === "comment") return faker.random.words(3); + if (cols[k] === "birthdate") + return postgreSQLDate(faker.date.past()); + return faker.random.word(); } return v; }) From c534a37f714082a5ec76f753fab6c0b2f93245b6 Mon Sep 17 00:00:00 2001 From: imreACTmd Date: Mon, 3 Jan 2022 20:55:02 +0100 Subject: [PATCH 4/6] new option to read replacements from a file --- src/index.ts | 42 ++++++++++++++++++++++++++++++++---------- 1 file changed, 32 insertions(+), 10 deletions(-) diff --git a/src/index.ts b/src/index.ts index 5a41c10..9d9609d 100644 --- a/src/index.ts +++ b/src/index.ts @@ -32,9 +32,11 @@ class PgAnonymizer extends Command { help: flags.help({ char: "h" }), list: flags.string({ char: "l", - description: "list of columns to anonymize", - default: - 
"email,name,description,address,city,country,phone,comment,birthdate", + description: "[default: email,name,description,address,city,country,phone,comment,birthdate] list of columns to anonymize", + }), + configFile: flags.string({ + char: "c", + description: "config file with list of columns to anonymize", }), extension: flags.string({ char: "e", @@ -79,12 +81,32 @@ class PgAnonymizer extends Command { }); pg.stdout.setEncoding('utf8'); - const list = flags.list.split(",").map((l) => { - return { - col: l.replace(/:(?:.*)$/, "").toLowerCase(), - replacement: l.includes(":") ? l.replace(/^(?:.*):/, "") : null, - }; - }); + if (!(flags.list || flags.configFile)) { + flags.list = "email,name,description,address,city,country,phone,comment,birthdate"; + } + + let list: { col: string; replacement: string | null; }[]; + if (flags.configFile) { + list = fs.readFileSync(flags.configFile, "utf8") + .split(/\r?\n/) + .map((l: string) => l.trim()) + .map((l: string) => { + if (l === "") return null; + if (l.startsWith("#")) return null; + return { + col: l.replace(/:(?:.*)$/, "").toLowerCase(), + replacement: l.includes(":") ? l.replace(/^(?:.*):/, "") : null + }; + }) + .filter(Boolean); + } else if (flags.list) { + list = flags.list.split(",").map((l) => { + return { + col: l.replace(/:(?:.*)$/, "").toLowerCase(), + replacement: l.includes(":") ? 
l.replace(/^(?:.*):/, "") : null, + }; + }); + } let table: string | null = null; let indices: Number[] = []; @@ -123,7 +145,7 @@ class PgAnonymizer extends Command { cols.filter((v, k) => indices.includes(k)).join(", ") ); else console.log("No columns to anonymize"); - } else if (table && line.trim()) { + } else if (table && line.trim() && (line !== "\\.")) { line = line .split("\t") .map((v, k) => { From f5029413252339ca5934815318d0de0408a506d4 Mon Sep 17 00:00:00 2001 From: imreACTmd Date: Mon, 3 Jan 2022 21:02:47 +0100 Subject: [PATCH 5/6] document configFile option --- README.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/README.md b/README.md index e2350a8..be02f81 100644 --- a/README.md +++ b/README.md @@ -38,6 +38,13 @@ You can also specify the table for a column using the dot notation: public.user.email,public.product.description,email,name ``` +Alternatively use `--configFile` option to specify a file with a list of column names and optional replacements, one per line: + +```bash +npx pg-anonymizer postgres://localhost/mydb \ + --configFile /path/to/file +``` + #### Customize replacements You can also choose which faker function you want to use to replace data (default is `faker.random.word`): From 589ce373dc7e5257e64479dac4c4a5faf6cb1568 Mon Sep 17 00:00:00 2001 From: imreACTmd Date: Mon, 3 Jan 2022 21:20:10 +0100 Subject: [PATCH 6/6] merge some of upstream --- src/index.ts | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/index.ts b/src/index.ts index 9d9609d..c298b7d 100644 --- a/src/index.ts +++ b/src/index.ts @@ -3,7 +3,7 @@ import { spawn } from "child_process"; const faker = require("faker"); const fs = require("fs"); const path = require("path"); -const readline = require('readline'); +const readline = require("readline"); function dieAndLog(message: string, error: any) { console.error(message); @@ -70,16 +70,16 @@ class PgAnonymizer extends Command { : null; console.log("Launching pg_dump"); - 
const pg = spawn('pg_dump', [args.database]); - pg.on('exit', function(code) { + const pg = spawn("pg_dump", [args.database]); + pg.on("exit", function(code) { if (code != 0) { dieAndLog("pg_dump command failed with exit code", code); } }); - pg.stderr.on('data', function(data) { + pg.stderr.on("data", function(data) { dieAndLog("pg_dump command error:", data); }); - pg.stdout.setEncoding('utf8'); + pg.stdout.setEncoding("utf8"); if (!(flags.list || flags.configFile)) { flags.list = "email,name,description,address,city,country,phone,comment,birthdate"; @@ -100,7 +100,7 @@ class PgAnonymizer extends Command { }) .filter(Boolean); } else if (flags.list) { - list = flags.list.split(",").map((l) => { + list = flags.list.split(",").map((l: string) => { return { col: l.replace(/:(?:.*)$/, "").toLowerCase(), replacement: l.includes(":") ? l.replace(/^(?:.*):/, "") : null, @@ -135,7 +135,7 @@ class PgAnonymizer extends Command { indices = cols.reduce((acc: Number[], value, key) => { if (list.find((l) => l.col === value)) acc.push(key) - else if (list.find((l) => l.col === table + '.' + value)) acc.push(key); + else if (list.find((l) => l.col === table + "." + value)) acc.push(key); return acc; }, []); @@ -155,7 +155,7 @@ class PgAnonymizer extends Command { )?.replacement; if (!replacement) { replacement = list.find( - (l) => l.col === table + '.' + cols[k] + (l) => l.col === table + "." + cols[k] )?.replacement; } if (replacement) {