forked from humanmade/go-anonymize-mysqldump
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path: anonymize-mysqldump.go
523 lines (440 loc) · 13.7 KB
/
anonymize-mysqldump.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
package main
import (
"bufio"
"encoding/json"
"fmt"
"github.com/akamensky/argparse"
"github.com/sirupsen/logrus"
"github.com/xwb1989/sqlparser"
"io"
"io/ioutil"
"os"
"strings"
"sync"
)
// VERSION is the release identifier (timestamp-style, YYYYMMDDHHMM) printed by the -V/--version flag.
const VERSION = "202212071734"
// Config is the root of the JSON configuration file: a list of per-table
// anonymization patterns.
type Config struct {
Patterns []ConfigPattern `json:"patterns"`
}
// ConfigPattern describes the anonymization rules for a single table.
// TableName is compared against the target table of each INSERT statement.
type ConfigPattern struct {
TableName string `json:"tableName"`
Fields []PatternField `json:"fields"`
}
// PatternField describes how one column of a matched table is transformed.
type PatternField struct {
// Field is the column name, used as the key into the uniqueness map.
Field string `json:"field"`
// Position is the 1-indexed column position inside the INSERT's value tuple.
Position int `json:"position"`
// Type selects the generator from transformationFunctionMap (e.g. "email", "static").
Type string `json:"type"`
// Constraints, when non-nil, must all hold for a row before the field is transformed.
Constraints []PatternFieldConstraint `json:"constraints"`
// Unique forces generated values to be unique per table/field (retried up to a limit).
Unique bool `json:"unique"`
// Value is the replacement literal used by the "static" transformation type.
Value string `json:"value"`
}
// PatternFieldConstraint is a predicate over another column of the same row.
// Match is one of "equal", "not_equal", "contains", "not_contains"; any other
// value is treated as always-satisfied (see rowObeysConstraints).
type PatternFieldConstraint struct {
Field string `json:"field"`
// Position is the 1-indexed column position the constraint is evaluated against.
Position int `json:"position"`
Value string `json:"value"`
Match string `json:"match"`
}
// SafeMap is a mutex-guarded two-level set (field name -> generated value -> seen),
// used to enforce per-field uniqueness of generated values within a table.
type SafeMap struct {
v map[string]map[string]bool
m sync.RWMutex
}
var (
// transformationFunctionMap maps the "type" string from the config to the
// generator that produces a replacement value. A missing key means the
// config requested an unknown transformation (logged and skipped in
// modifyValues).
transformationFunctionMap = map[string]func(*sqlparser.SQLVal) *sqlparser.SQLVal {
"id": generateId,
"username": generateUsername,
"password": generatePassword,
"email": generateEmail,
"freeEmail": generateFreeEmail,
"randomEmail": generateRandomEmail,
"safeEmail": generateSafeEmail,
"url": generateURL,
"name": generateName,
"firstName": generateFirstName,
"lastName": generateLastName,
"paragraph": generateParagraph,
"ipv4": generateIPv4,
"customPhoneNumber": generateCustomPhoneNumber,
"customUserAgent": generateCustomUserAgent,
"libPrefix": generateLibPrefix,
"libCompanyName": generateLibCompanyName,
"libStreet": generateLibStreet,
"customStreet": generateCustomStreet,
"libBuildingNumber": generateLibBuildingNumber,
"libCity": generateLibCity,
"libZip": generateLibZip,
"libState": generateLibState,
"libCountry": generateLibCountry,
"customCountry": generateCustomCountry,
"libParagraph": generateLibParagraph,
"customSocialNetwork": generateCustomSocialNetwork,
"customSocialId": generateCustomSocialId,
"customSocialToken": generateCustomSocialToken,
"libInternetUser": generateLibInternetUser,
"customUniqueUser": generateCustomUniqueUser,
"customPassword": generateCustomPassword,
"customRecoverToken": generateCustomRecoverToken,
"customUserToken": generateCustomUserToken,
"libAdditionalAddress": generateLibAdditionalAddress,
"customTitle": generateCustomTitle,
"static": generateStaticValue,
}
// uniqueMap tracks values already emitted for unique fields. It is reset
// whenever the current table changes (see setCurrentTable), which assumes
// the dump lists each table's INSERTs contiguously.
// I expect dump being in order
uniqueMap SafeMap
// currentTable is the table of the most recently processed INSERT; used
// only to detect table changes for the reset above.
currentTable string
)
// Many thanks to https://stackoverflow.com/a/47515580/1454045
func init() {
lvl, ok := os.LookupEnv("LOG_LEVEL")
// LOG_LEVEL not set, let's default to info
if !ok {
lvl = "info"
}
// parse string, this is built-in feature of logrus
ll, err := logrus.ParseLevel(lvl)
if err != nil {
ll = logrus.InfoLevel
}
// set global log level
logrus.SetLevel(ll)
setLocale("de")
}
// main wires STDIN through the anonymization pipeline and prints each
// processed chunk in input order. The channel-of-channels preserves ordering
// while lines are transformed concurrently.
func main() {
	cfg := parseArgs()
	for ch := range setupAndProcessInput(cfg, os.Stdin) {
		fmt.Print(<-ch)
	}
}
// setupAndProcessInput starts the input-processing goroutine and returns the
// ordered stream of result channels. The returned channel is closed once all
// per-line workers spawned by processInput have finished.
func setupAndProcessInput(config Config, input io.Reader) chan chan string {
	out := make(chan chan string, 10)

	var pending sync.WaitGroup
	pending.Add(1)
	go processInput(&pending, input, out, config)

	// Close the stream only after every worker has delivered its result.
	go func() {
		pending.Wait()
		close(out)
	}()

	return out
}
// parseArgs parses the command line. It exits with usage on a parse error or
// when neither --version nor --config was given; with --version it prints the
// version and exits 0. Otherwise it loads and returns the config file.
func parseArgs() Config {
	parser := argparse.NewParser("anonymize-mysqldump", "Reads SQL from STDIN and replaces content for anonymity based on the provided config.")
	configFilePath := parser.String("c", "config", &argparse.Options{Required: false, Help: "Path to config.json"})
	version := parser.Flag("V", "version", &argparse.Options{Required: false, Help: "print the version number and exit"})

	parseErr := parser.Parse(os.Args)
	switch {
	case parseErr != nil || (!*version && *configFilePath == ""):
		// Print the usage text (this also covers -h/--help).
		fmt.Print(parser.Usage(parseErr), "\n")
		os.Exit(1)
	case *version:
		fmt.Print(VERSION, "\n")
		os.Exit(0)
	}

	return readConfigFile(*configFilePath)
}
// readConfigFile reads and decodes the JSON config at filepath.
// Any read or decode failure is fatal: running against a broken config would
// otherwise silently pass the dump through unanonymized.
func readConfigFile(filepath string) Config {
	jsonConfig, err := ioutil.ReadFile(filepath)
	if err != nil {
		logrus.Fatal(err)
	}
	var decoded Config
	// Bug fix: the decode error was previously ignored, so malformed JSON
	// yielded an empty Config and no transformations were applied.
	if err := json.Unmarshal(jsonConfig, &decoded); err != nil {
		logrus.Fatal(err)
	}
	return decoded
}
// processInput reads the dump line by line, stitches multi-line INSERT
// statements back into single statements, and emits results through `lines`.
// Ordering is preserved by pushing a fresh result channel onto `lines` before
// the (possibly concurrent) processing of each chunk; non-INSERT lines are
// forwarded untouched. The WaitGroup tracks this reader plus every spawned
// per-statement worker so the caller knows when output is complete.
func processInput(wg *sync.WaitGroup, input io.Reader, lines chan chan string, config Config) {
defer wg.Done()
// 2 MiB buffer: dump lines (whole INSERT ... VALUES lists) can be very long.
r := bufio.NewReaderSize(input, 2*1024*1024)
var nextLine string
insertStarted := false
continueLooping := true
for continueLooping {
line, err := r.ReadString('\n')
if err == io.EOF {
// continueLooping is used because line might be populated even when we've
// reached the end of the file, so we set a boolean once the last line is
// being processed to end the loop.
continueLooping = false
} else if err != nil {
// log any other errors and break
logrus.Error(err.Error())
break
}
// If the line is shorter than 6 characters, which is the shortest line for
// an insert query, let's skip processing it
if len(line) < 6 {
// TODO I'd love to clean this up so we don't make ch in three different
// places, but that's a task for another day
ch := make(chan string)
lines <- ch
ch <- line
//ch <- line + "\n"
continue
}
// Test if this is an INSERT query. We'll use this to determine if we need
// to concatenate lines together if they're spread apart multiple lines
// instead of on a single line
maybeInsert := strings.ToUpper(line[:6]) == "INSERT"
if maybeInsert {
insertStarted = true
}
// Trim the trailing newline (and any indentation) so accumulated fragments
// join into one parseable statement.
line = strings.TrimSpace(line)
// Now that we've detected this is an INSERT query, let's append the lines
// together to form a single line in the event this spans multiple lines
if insertStarted {
nextLine += line
} else {
// When it's not an insert query, let's add this line and move on without
// processing it
// TODO clean this up too
ch := make(chan string)
lines <- ch
ch <- line + "\n"
continue
}
// A trailing ";" terminates the accumulated INSERT statement.
lastCharacter := line[len(line)-1:]
if lastCharacter == ";" {
insertStarted = false
} else {
// If we haven't reached a query terminator and and insert query has
// begun, let's move on to the next line
continue
}
// Now let's actually process the line!
wg.Add(1)
// The result channel is enqueued *before* spawning the worker so output
// order matches input order even though workers run concurrently.
ch := make(chan string)
lines <- ch
go func(line string) {
defer wg.Done()
line = processLine(line, config)
ch <- line
}(nextLine)
// Now let's reset nextLine to empty so that it doesn't continue
// appending lines forever
nextLine = ""
}
}
// processLine parses one SQL statement, applies the configured anonymization,
// and recompiles it back to SQL. On any failure the original line is returned
// unchanged so the dump stays intact.
func processLine(line string, config Config) string {
	parsed, err := parseLine(line)
	if err != nil {
		// TODO Add line number to log
		logrus.WithFields(logrus.Fields{
			"error": err,
			"line":  line,
		}).Error("Failed parsing line with error: ")
		return line
	}
	processed, err := applyConfigToParsedLine(parsed, config)
	if err != nil {
		// Bug fix: this error was previously never checked — it was
		// immediately overwritten by the recompile call below.
		logrus.WithFields(logrus.Fields{
			"error": err,
		}).Error("Failed applying config to line with error: ")
		return line
	}
	recompiled, err := recompileStatementToSQL(processed)
	if err != nil {
		// TODO Add line number to log
		logrus.WithFields(logrus.Fields{
			"error": err,
		}).Error("Failed recompiling line with error: ")
		return line
	}
	return recompiled
}
// parseLine parses a single SQL statement into its sqlparser AST,
// passing through any parse error to the caller.
func parseLine(line string) (sqlparser.Statement, error) {
	return sqlparser.Parse(line)
}
// applyConfigToParsedLine applies the config to stmt when it is an INSERT;
// every other statement kind is returned untouched. If transformation fails,
// the error is logged and the original statement is returned so processing
// can continue with the rest of the dump.
func applyConfigToParsedLine(stmt sqlparser.Statement, config Config) (sqlparser.Statement, error) {
	insert, isInsertStatement := stmt.(*sqlparser.Insert)
	if !isInsertStatement {
		// Let's skip other statements as we only want to process inserts.
		return stmt, nil
	}
	modified, err := applyConfigToInserts(insert, config)
	if err != nil {
		// Previously this error was silently swallowed (TODO in the original);
		// log it so failed transformations are visible, then fall back to the
		// unmodified statement.
		logrus.WithFields(logrus.Fields{
			"error": err,
		}).Error("Failed applying config to insert statement")
		return stmt, nil
	}
	return modified, nil
}
// applyConfigToInserts looks up the pattern matching the INSERT's target
// table and rewrites its VALUES rows accordingly. Statements whose rows are
// not a plain VALUES list, or whose table has no configured pattern, pass
// through unchanged.
func applyConfigToInserts(stmt *sqlparser.Insert, config Config) (*sqlparser.Insert, error) {
	rows, ok := stmt.Rows.(sqlparser.Values)
	if !ok {
		// This _should_ have type Values, but if it doesn't, let's skip it
		// TODO Perhaps worth logging when this happens?
		return stmt, nil
	}

	tableName := stmt.Table.Name.String()
	// Clean unique map to save resources
	setCurrentTable(tableName)

	// Iterate over the specified configs and see if this statement matches any
	// of the desired changes
	// TODO make this use goroutines
	for _, pattern := range config.Patterns {
		if pattern.TableName != tableName {
			// Config is not for this table, move onto next available config
			continue
		}
		// Ok, now it's time to make some modifications
		modified, err := modifyValues(rows, pattern)
		if err != nil {
			// TODO Perhaps worth logging when this happens?
			return stmt, nil
		}
		stmt.Rows = modified
	}

	return stmt, nil
}
// modifyValues applies pattern's field transformations to every row of an
// INSERT's VALUES list, in place, and returns the (same) Values slice.
// NULLs, empty strings, unknown transformation types, and rows failing the
// field's constraints are skipped. For fields marked Unique, generation is
// retried (up to a fixed limit) until an unseen value is produced.
// TODO we're gonna have to figure out how to retain types if we ever want to
// mask number-based fields
func modifyValues(values sqlparser.Values, pattern ConfigPattern) (sqlparser.Values, error) {
	// TODO make this use goroutines
	for row := range values {
		// TODO make this use goroutines
		for _, fieldPattern := range pattern.Fields {
			// Position is 1 indexed instead of 0, so subtract 1 to line up with
			// the value inside the ValTuple.
			valTupleIndex := fieldPattern.Position - 1

			var (
				attempts             int
				proposedValue, value *sqlparser.SQLVal
				valueString          string
			)
			const uniqueLimit = 100

			switch v := values[row][valTupleIndex].(type) {
			case *sqlparser.NullVal:
				// Ignore NULLs to avoid interface conversion panic
				continue
			case *sqlparser.SQLVal:
				value = v
			default:
				// Bug fix: the original switch had no default, so any other
				// expression type (function call, arithmetic, ...) left value
				// nil and panicked at len(value.Val) below. Skip such cells.
				continue
			}

			// Skip transformation if transforming function doesn't exist
			transform := transformationFunctionMap[fieldPattern.Type]
			if transform == nil {
				// TODO in the event a transformation function isn't correctly defined,
				// should we actually exit? Should we exit or fail softly whenever
				// something goes wrong in general?
				logrus.WithFields(logrus.Fields{
					"type":  fieldPattern.Type,
					"field": fieldPattern.Field,
				}).Error("Failed applying transformation type for field")
				continue
			}

			// Skipping applying a transformation because field is empty
			if len(value.Val) == 0 {
				continue
			}

			// Skip this PatternField if none of its constraints match
			if fieldPattern.Constraints != nil && !rowObeysConstraints(fieldPattern.Constraints, values[row]) {
				continue
			}

			// Set unique map current field
			setCurrentField(fieldPattern.Field)

			for {
				// Static value special case: the generator input is the
				// configured literal rather than the original cell value.
				if fieldPattern.Type == "static" {
					value = sqlparser.NewStrVal([]byte(fieldPattern.Value))
				}
				proposedValue = transform(value)
				valueString = convertSQLValToString(proposedValue)
				exists := checkMapExists(fieldPattern.Field, valueString)
				if !fieldPattern.Unique || !exists {
					values[row][valTupleIndex] = proposedValue
					setMapValue(fieldPattern.Field, valueString)
					break
				} else if attempts >= uniqueLimit {
					// Give up after too many collisions; keep the last
					// generated (non-unique) attempt out of the row.
					logrus.WithFields(logrus.Fields{
						"type":  fieldPattern.Type,
						"field": fieldPattern.Field,
						"value": valueString,
					}).Error("Failed applying unique transformation for field")
					break
				}
				attempts++
			}
		}
	}
	return values, nil
}
// rowObeysConstraints reports whether row satisfies every constraint.
// Match semantics: "equal", "not_equal", "contains", "not_contains"; any
// other Match value is treated as satisfied (the empty default case).
func rowObeysConstraints(constraints []PatternFieldConstraint, row sqlparser.ValTuple) bool {
	for _, constraint := range constraints {
		// Positions are 1-indexed in the config.
		valTupleIndex := constraint.Position - 1
		value, ok := row[valTupleIndex].(*sqlparser.SQLVal)
		if !ok {
			// Bug fix: the original unchecked assertion panicked when the
			// constraint column was NULL or a non-literal expression. Such a
			// cell cannot match a string value, so treat the constraint as
			// unsatisfied.
			return false
		}
		parsedValue := convertSQLValToString(value)
		logrus.WithFields(logrus.Fields{
			"parsedValue":      parsedValue,
			"constraint.value": constraint.Value,
			"constraint.match": constraint.Match,
		}).Trace("Debugging constraint obedience: ")

		switch constraint.Match {
		default:
			// Unknown match type: constraint passes.
		case "equal":
			if parsedValue != constraint.Value {
				return false
			}
		case "not_equal":
			if parsedValue == constraint.Value {
				return false
			}
		case "contains":
			if !strings.Contains(parsedValue, constraint.Value) {
				return false
			}
		case "not_contains":
			if strings.Contains(parsedValue, constraint.Value) {
				return false
			}
		}
	}
	return true
}
// convertSQLValToString renders a SQLVal's raw bytes to a plain string via
// sqlparser's TrackedBuffer/ParsedQuery machinery (so any escaping behaves
// exactly as the library's own SQL formatting does). Returns "" if query
// generation fails.
// NOTE(review): this looks equivalent to string(value.Val) for simple
// literals, but the buffer round-trip is kept deliberately — confirm before
// simplifying.
func convertSQLValToString(value *sqlparser.SQLVal) string {
buf := sqlparser.NewTrackedBuffer(nil)
buf.Myprintf("%s", []byte(value.Val))
pq := buf.ParsedQuery()
bytes, err := pq.GenerateQuery(nil, nil)
if err != nil {
return ""
}
return string(bytes)
}
// recompileStatementToSQL serializes a (possibly modified) statement back to
// SQL text, appending ";\n" so it slots back into the dump as a complete,
// terminated statement on its own line.
func recompileStatementToSQL(stmt sqlparser.Statement) (string, error) {
// TODO Potentially replace with BuildParsedQuery
buf := sqlparser.NewTrackedBuffer(nil)
// %v formats the AST node using sqlparser's own SQL formatter.
buf.Myprintf("%v", stmt)
pq := buf.ParsedQuery()
bytes, err := pq.GenerateQuery(nil, nil)
if err != nil {
return "", err
}
return string(bytes) + ";\n", nil
}
// setCurrentTable records the table currently being processed. On a table
// change the uniqueness map is discarded wholesale, which assumes the dump
// groups each table's INSERTs together (noted at the declaration site).
func setCurrentTable(tableName string) {
	if currentTable == tableName {
		return
	}
	currentTable = tableName

	uniqueMap.m.Lock()
	defer uniqueMap.m.Unlock()
	uniqueMap.v = make(map[string]map[string]bool)
}
// setCurrentField lazily creates the per-field value set inside the
// uniqueness map so later reads/writes for this field cannot hit a nil map.
func setCurrentField(fieldName string) {
	uniqueMap.m.Lock()
	defer uniqueMap.m.Unlock()
	if _, ok := uniqueMap.v[fieldName]; !ok {
		uniqueMap.v[fieldName] = make(map[string]bool)
	}
}
// checkMapExists reports whether value has already been generated for field.
// This is a pure read, so it takes the read lock — the original took the
// exclusive write lock, needlessly serializing concurrent readers of the
// RWMutex-guarded map.
func checkMapExists(field string, value string) bool {
	uniqueMap.m.RLock()
	defer uniqueMap.m.RUnlock()
	_, exists := uniqueMap.v[field][value]
	return exists
}
// setMapValue marks value as used for field in the uniqueness map.
// The field's inner map must already exist (guaranteed by setCurrentField).
func setMapValue(field string, value string) {
	uniqueMap.m.Lock()
	defer uniqueMap.m.Unlock()
	uniqueMap.v[field][value] = true
}