-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathverifyAddress.py
5609 lines (5284 loc) · 315 KB
/
verifyAddress.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#!/usr/bin/env python
# pylint: disable=line-too-long, invalid-name, pointless-string-statement, broad-exception-caught, attribute-defined-outside-init
# pylint: disable=unused-argument, arguments-differ, unused-variable, unnecessary-pass, global-statement, missing-function-docstring
# pylint: disable=missing-class-docstring, too-many-lines
'''
A script to test if an address is plausible, or questionable.
The address can be free text, so we need to look for the normalized parts.
[The address is a dictionary, which can have 'state', 'postcode' and 'suburb', but must have one or more 'addressLines' of data]
This script uses the G-NAF (Geocoded National Address Files) which can be loaded into a MySQL database or read as CSV files.
This script also uses a file of Australia Post suburb, postcode, state data. That data has to be merged with the Australian
Bureau of Statistics SA1 and LGA data; namely the boundaries for SA1 and LGA areas. A pre-processing script assigns SA1 and LGA
codes to each suburb, postcode, state combination.
The core concept here is a locality (a.k.a suburb). States and postcode have one or more localities, although a locality can cross
state and/or postcode boundaries. Localities contain streets which can contain addresses.
A street can cross locality borders, but in G-NAF an address must have a house number, a street and locality.
SYNOPSIS
$ python verifyAddress.py [-I inputDir|--inputDir=inputDir] [-O outputDir|--outputDir=outputDir]
[-C configDir|--configDir=configDir] [-c configFile|--configFile=configFile]
[-H|--hasHeading] [-m headingsMappingFile|--headingsMappingFile=headingsMappingFile]
[-S|--verifyAddressService] [-P verifyAddressPort|--verifyAddressPort=verifyAddressPort]
[-G GNAFdir|--GNAFdir=GNAFdir] [-A ABSdir|--ABSdir=ABSdir]
[-F dataFilesDirectory|--DataFilesDirectory=dataFilesDirectory]
[-N|--NTpostcodes] [-R|--region]
[-D DatabaseType|--DatabaseType=DatabaseType]
[-s server|--server=server]
[-u username|--username=username] [-p password|--password=password]
[-d databaseName|--databaseName=databaseName]
[-x|--addExtras] [-W|configWeights]
[-a|--abbreviate] [-b|--returnBoth] [-i|--indigenious] [-|filename]...
[-v loggingLevel|--verbose=loggingLevel] [-L logDir|--logDir=logDir] [-l logfile|--logfile=logfile]
REQUIRED
OPTIONS
-I inputDir|--inputDir=inputDir
The directory where the input file will be found (default='.')
-O outputDir|--outputDir=outputDir
The directory where the output file will be written (default='.')
-C configDir|--configDir=configDir
The directory where the configuration files can be found (default='.')
-c configFile|--configFile=configFile
The configuration file (default=verifyAddress.json)
-H|--hasHeading
Files of addresses to be verified are CSV files and have a heading line
(mapping of heading to data items is defined in the headingsMappingFile)
Output file will also have headings.
-m headingsMappingFile|--headingsMappingFile=headingsMappingFile
The name of the headings mapping file, which must be in the configuration directory (default='headings.json')
-S|--verifyAddressService
Run verifyAddress as a service (default=False)
-P verifyAddressPort|--verifyAddressPort=verifyAddressPort
The port for the verifyAddress service (default=8088)
-G GNAFdir|--GNAFdir=GNAFdir
Use the standard G-NAF psv files from this folder
-A ABSdir|--ABSdir=ABSdir
The directory where the standard ABS csv files will be found (default='./G-NAF')
-F dataFilesDirectory|--DataFilesDirectory=dataFilesDirectory
The directory containing the compact data files (default='./data')
-N|--NTpostcodes
Assume that 8dd is an NT postcode of 08dd
-R|--region
Assume Australian region (State/Territory) if no state/territory supplied, but one or more suburbs found
-D DatabaseType|--DatabaseType=DatabaseType
The type of database [choice:MSSQL/MySQL]
-s server|--server=server
The address of the database server
-u username|--username=username
The user name required to access the database
-p password|--password=password
The user password required to access the database
-d databaseName|--databaseName=databaseName
The name of the database
-x|--addExtras
Use additional flat text, level text, trims
-W|configWeights
Use suburb/state weights and fuzz levels from the config file
-a|--abbreviate
Return abbreviated street types
-b|--returnBoth
Return both full and abbreviated street types
-i|--indigenious
Search for Indigenous community addresses
-v loggingLevel|--verbose=loggingLevel
Set the level of logging that you want (default INFO).
-L logDir
The directory where the log file will be written (default='.')
-l logfile|--logfile=logfile
The name of a logging file where you want all messages captured (default=None)
This script receives and processes a string of text (called an 'Address').
'''
# Import all the modules that make life easy
import sys
import os
import argparse
import logging
import csv
import json
import collections
import re
import copy
import threading
import socketserver
from urllib.parse import parse_qs
from http.server import BaseHTTPRequestHandler, HTTPServer
import jellyfish
import pandas as pd
from sqlalchemy import create_engine, text
from sqlalchemy.orm import sessionmaker
from sqlalchemy.exc import OperationalError
from sqlalchemy_utils import database_exists
class VerifyData:
    '''
    Per-request working data for one address verification.
    Each request (and therefore each service thread) builds its own instance,
    so the log format, log formatter and structured result are never shared.
    '''

    def __init__(self, thisProgName):
        '''
        Set up the logging format, prefixed with thisProgName, and an empty result.
        '''
        # str.join, like the '+' concatenation it stands in for, insists that thisProgName is a string
        messageLayout = ''.join((thisProgName, ' [%(asctime)s]: %(message)s'))
        self.logfmt = messageLayout
        self.formatter = logging.Formatter(datefmt='%d/%m/%y %H:%M:%S %p', fmt=messageLayout)
        self.result = dict()        # The structured result
# Process exit codes. This section is borrowed from /usr/include/sysexits.h
# Each constant below carries the meaning given in its trailing comment.
EX_OK = 0 # successful termination
EX_WARN = 1 # non-fatal termination with warnings
EX_USAGE = 64 # command line usage error
EX_DATAERR = 65 # data format error
EX_NOINPUT = 66 # cannot open input
EX_NOUSER = 67 # addressee unknown
EX_NOHOST = 68 # host name unknown
EX_UNAVAILABLE = 69 # service unavailable
EX_SOFTWARE = 70 # internal software error
EX_OSERR = 71 # system error (e.g., can't fork)
EX_OSFILE = 72 # critical OS file missing
EX_CANTCREAT = 73 # can't create (user) output file
EX_IOERR = 74 # input/output error
EX_TEMPFAIL = 75 # temp failure; user is invited to retry
EX_PROTOCOL = 76 # remote error in protocol
EX_NOPERM = 77 # permission denied
EX_CONFIG = 78 # configuration error
# The command line arguments and their related globals
# These are module-level defaults; see the OPTIONS section of the module docstring for the matching switches.
inputDir = '.' # The directory where the input files will be found
outputDir = '.' # The directory where the output files will be written
configDir = '.' # The directory where the config files will be found
configFile = 'verifyAddress.json' # The default configuration file
verifyAddressService = None # Run as a service
verifyAddressPort = None # The service port
GNAFdir = None # Use the standard G-NAF psv files from this folder
# NOTE(review): spelt 'ABDdir' here but the command line option is documented as 'ABSdir' - confirm which spelling the rest of the file uses
ABDdir = None # The directory where the standard ABS csv files will be found (default='./G-NAF')
DataDir = '.' # The directory where the data files will be found
NTpostcodes = False # Assume 8xx is NT postcode 08xx
region = False # Assume Australian region (State/Territory) if no state/territory supplied, but suburb(s) found
DatabaseType = None # The database type
engine = None # The database engine
conn = None # The database connection
Session = None # The database session maker
databaseName = None # The database name
addExtras = None # Strip off extra trims
indigenious = None # Look for Indigenous communities
communityCodes = [] # Codes representing the word 'COMMUNITY'
logDir = '.' # The directory where the log files will be written
# Maps a numeric verbosity (0 = least, 4 = most) onto the logging module's levels
logging_levels = {0:logging.CRITICAL, 1:logging.ERROR, 2:logging.WARNING, 3:logging.INFO, 4:logging.DEBUG}
loggingLevel = logging.NOTSET # The default logging level
logFile = None # The name of the logfile (output to stderr if None)
fh = None # The logging handler for file things
sh = None # The logging handler for stream things (NOTE(review): originally described as 'stdin' - presumably stderr; confirm)
abbreviate = False # Output abbreviated street types
returnBoth = False # Output both full and abbreviated street types
# The global data
# These start empty and are filled in by the add*() loader functions in this file (e.g. addPostcode(), addSuburb()).
mydb = None # The database connector for tables
cursor = None # The database cursor for tables
states = {} # The stateAbbrev, regex(stateName), regex(stateAbbrev) for each statePid
postcodes = {} # Postcodes and their states and suburbs
suburbs = {} # Locality and Suburb data
suburbLen = {} # Length of each suburb name, soundex code and list of suburbs
suburbCount = {} # Count of properties within each suburb/state combination
maxSuburbLen = None # Length of the longest suburb
localities = {} # List of tuples of (statePid, localityName, alias) for each localityPid
localityNames = set() # Set of all locality names
localityGeodata = {} # Geolocation data for each locality pid
stateLocalities = {} # Sets of localityPids for each statePid
postcodeLocalities = {} # Postcodes and their set of localityPids
localityPostcodes = {} # Localities and their set of postcodes
neighbours = {} # LocalityPids with their set of neighbouring locality pids
streetNames = {} # Street Name/Type/Suffix, localityPid and alias for each streetPid
streets = {} # Streets by soundCode, streetKey, source and streetPid
streetLen = {} # Length of street name with all the matching streets
shortStreets = {} # Street with no street type and their geocode data
shortTypes = set() # Street types that exist in short streets
shortTypeKeys = {} # Street types with short keys for short streets with that street type in the street name
streetTypes = {} # Street type and list of streetTypeAbbrev, regex(streetType), regex(streetTypeAbbrev)
streetTypeCount = {} # Street type and count of properties with this street type
streetTypeSuburbs = {} # Suburbs containing this street type as part of their name (regex of preceding word and street type)
streetTypeSound = {} # Unique soundex for street types
streetSuffixes = {} # Street suffix and list of regex(streetSuffix), streetSuffixAbbrev)
streetNos = {} # Streets with their houses and geocode data
stateStreets = {} # Sets of streetPids for each statePid
streetLocalities = {} # Sets of localityPid for each streetPid
localityStreets = {} # Sets of streetPids for each localityPid
buildings = {} # Building name, streetPid, regex and details
buildingPatterns = {} # Building name, regular expression for finding building name
flats = [] # List of regular expressions for finding flat types
levels = [] # List of regular expressions for finding unit types
extraTrims = [] # Any extra trims to be removed
services = [] # Postal Delivery Services
SA1map = {} # key=Mesh Block 2016 code, value=SA1 code
LGAmap = {} # key=Mesh Block 2016 code, value=LGA code
# The abbreviations of the Australian States and Territories ('OT' presumably being Other Territories - confirm)
SandTs = ['ACT', 'NSW', 'NT', 'OT', 'QLD', 'SA', 'TAS', 'VIC', 'WA']
# Set up the default configuration for suburb/street weights and fuzz levels
# These can be overridden from the configuration file
# The keys are source codes; addPostcode() files data under 'A' and addSuburb() under 'G', 'C' or 'GA'
suburbSourceWeight = {'G':10, 'GA':9, 'C':8, 'GN':7, 'GS':6, 'GL':5, 'GAS':4, 'GAL':2, 'CL':1, '':0}
streetSourceWeight = {'G':10, 'GA':9, 'C':8, 'GS':6, 'GL':5, 'GAS':4, 'GAL':2, '':0}
# fuzzLevels
fuzzLevels = [ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 ]
# Pre-compiled patterns used by cleanText()
slash = re.compile(r'\\') # A single backslash
oneSpace = re.compile(r'\s\s+') # A run of two or more white space characters
dashSpace = re.compile(r'\s*-\s*') # A hyphen with any surrounding white space
endHyphen = re.compile(r'-$') # A hyphen at the very end of the text
# A delivery number is 1 to 6 digits with an optional 1 or 2 letter prefix and/or suffix.
# The trailing negative look-behind rejects ordinals such as 1ST, 22ND, 3RD, 4TH and 11TH to 13TH.
deliveryNumber = r'\b([A-Z]{1,2})?(\d{1,6})([A-Z]{1,2})?\b(?<!([ 2-9]1ST|[ 2-9]2ND|[ 2-9]3RD|[ 0-9][4-9]TH|1[1-3]TH))'
# A delivery range is one delivery number, optionally followed by a hyphen (with optional spaces) and a second delivery number
deliveryRange = deliveryNumber + r'(( *- *)' + deliveryNumber + r')?'
LOTpattern = re.compile(r'(LOT *)' + deliveryRange) # The word LOT, optional spaces, then a delivery number or range
lastDigit = re.compile(deliveryRange) # A bare delivery number or range
period = re.compile(r'\.') # A literal full stop
# Create the class for handling http requests
class verifyAddressHandler(BaseHTTPRequestHandler):
    '''
    The HTTP request handler for the verifyAddress service.
    GET returns an HTML form for entering an address.
    POST accepts either that form (application/x-www-form-urlencoded) or a JSON object,
    passes the address to verifyAddress() and returns the result as JSON
    (when the Accept header asks for application/json) or as an HTML page.
    '''

    pageHead = '<html><head><title>Geocode an Australian Address</title><link rel="icon" href="data:,"></head><body>'
    anotherAddress = 'Click here to Geocode and Normalize/Standardize another Australian Address'

    def log_message(self, fmt, *logArgs):
        '''
        Suppress the per-request logging that BaseHTTPRequestHandler would otherwise do.
        '''
        return

    @staticmethod
    def _esc(value):
        '''
        Return value as text that is safe to embed in HTML content or a quoted attribute.
        '''
        from html import escape     # local import - nothing else in this class needs the html module
        return escape(str(value), quote=True)

    @classmethod
    def _table(cls, rows, labelWidth, valueWidth):
        '''
        Return an HTML table with one row for each (label, value) pair in rows.
        Values are converted to text and escaped, so non-string values and markup characters are safe.
        '''
        cells = []
        for label, value in rows:
            thisRow = '<td style="width:' + str(labelWidth) + '%;text-align:right">' + cls._esc(label) + '</td>'
            thisRow += '<td style="width:' + str(valueWidth) + '%;text-align:left">' + cls._esc(value) + '</td>'
            cells.append(thisRow)
        return '<table style="width:70%"><tr>' + '</tr><tr>'.join(cells) + '</tr></table>'

    @staticmethod
    def _formParams(body):
        '''
        Convert a URL encoded form body (bytes) into the dictionary that a JSON payload would have held.
        Returns (params, None) on success.
        Returns (None, (logMessage, pageHeading)) when the form has no address lines,
        or has both a single line address and semi-structured fields.
        Raises UnicodeDecodeError if the body is not valid UTF-8.
        '''
        fields = parse_qs(body.decode('utf-8'))

        def field(name):
            return fields[name][0].strip() if name in fields else ''

        line0 = field('line')
        line1 = field('line1')
        line2 = field('line2')
        structured = (('suburb', field('suburb')), ('state', field('state')), ('postcode', field('postcode')))
        if line0 != '':
            # Looks like unstructured data - so nothing else should have been entered
            if (line1 != '') or (line2 != '') or any(value != '' for key, value in structured):
                return None, ('both single line and semi-structured addresses', 'Error - both single line and semi-structured address entered')
            return {'addressLines': [line0]}, None
        # Looks like structured data
        addressLines = [line for line in (line1, line2) if line != '']
        if not addressLines:
            return None, ('no lines', 'Error - no address lines entered')
        params = {'addressLines': addressLines}
        for key, value in structured:
            if value != '':
                params[key] = value
        return params, None

    def _sendPage(self, message):
        '''
        Send message (a complete HTML document) as a successful response.
        '''
        self.send_response(200)
        self.send_header('Content-type', 'text/html; charset=utf-8')
        self.end_headers()
        self.wfile.write(message.encode('utf-8'))

    def _errorPage(self, heading):
        '''
        Return the HTML page that reports a problem with the submitted form.
        '''
        message = self.pageHead
        message += '<h1>Geocoded and Normalized/Standardized Address</h1>'
        message += '<h2>' + self._esc(heading) + '</h2>'
        message += '<h3>Please enter a single line address or a semi-structured address</h3>'
        message += '<br><a href="' + self._esc(self.path) + '">' + self.anotherAddress + '</a><br>'
        message += '</body></html>'
        return message

    def _resultPage(self, result):
        '''
        Return the HTML page that displays result (the dictionary built by verifyAddress()).
        Raises KeyError if result is missing any of the items that are displayed.
        '''
        def trimmed(key):
            # Address lines can end with a comma, which looks odd in a table cell
            value = result[key]
            if (value != '') and (value[-1] == ','):
                return value[:-1]
            return value

        metaRows = [('Latitude', result['latitude']), ('Longitude', result['longitude']),
                    ('Mesh Block', result['Mesh Block']), ('SA1', result['SA1']), ('LGA', result['LGA'])]

        # The building name goes before the address lines for postal services, but after them for everything else
        building = []
        if result['buildingName'] != '':
            building = [('Building Name', result['buildingName'])]
        addressRows = []
        if result['isPostalService']:
            addressRows += building
        addressRows.append(('Address Line 1', trimmed('addressLine1')))
        addressRows.append(('Address Line 2', trimmed('addressLine2')))
        if not result['isPostalService']:
            addressRows += building
        for label, key in (('House Number', 'houseNo'), ('Street', 'street'), ('Suburb', 'suburb'),
                           ('Postcode', 'postcode'), ('State', 'state')):
            addressRows.append((label, result[key]))

        scoreRows = [('G-NAF ID', result['G-NAF ID']), ('Accuracy', result['accuracy']),
                     ('Fuzz Level', result['fuzzLevel']), ('Score', result['score']), ('Status', result['status'])]
        # One table row per message, with the 'Messages' label on the first one only
        for ii, mess in enumerate(result['messages']):
            scoreRows.append(('Messages' if ii == 0 else '', mess))

        message = self.pageHead
        message += '<h1>Geocoded and Normalized/Standardized Address</h1>'
        message += '<h2>Geocoded Meta Data</h2>'
        message += self._table(metaRows, 20, 80)
        message += '<h2>Normalized/Standardized Address</h2>'
        message += self._table(addressRows, 30, 60)
        if returnBoth:
            abbrevRows = [('Abbreviated Address Line 1', trimmed('addressLine1Abbrev')),
                          ('Abbreviated Address Line 2', trimmed('addressLine2Abbrev'))]
            message += '<h2>Abbreviated Normalized/Standardized Address</h2>'
            message += self._table(abbrevRows, 30, 60)
        message += '<h2>G-NAF ID, Accuracy, Score and Messages</h2>'
        message += self._table(scoreRows, 20, 80)
        message += '<p><b><a href="' + self._esc(self.path) + '">' + self.anotherAddress + '</a></b><br>'
        message += '</body></html>'
        return message

    def do_GET(self):
        '''
        Return the HTML form for entering a single line or semi-structured address.
        The form posts back to the path that was requested.
        '''
        inputRows = []
        for label, name in (('Line1', 'line1'), ('Line2', 'line2'), ('Suburb', 'suburb'), ('State', 'state'), ('Postcode', 'postcode')):
            thisRow = '<td style="width:20%;text-align:right">' + label + '</td>'
            thisRow += '<td><input type="text" name="' + name + '" style="width:80%;text-align:left"></input></td>'
            inputRows.append(thisRow)
        message = self.pageHead
        message += '<h1>Geocode and Normalize/Standardize an Australian Address</h1>'
        message += '<form method="post" action ="' + self._esc(self.path) + '">'
        message += '<h2>Paste your Australian Address as a single line below</h2>'
        message += '<input type="text" name="line" style="width:70%"></input>'
        message += '<h1>OR</h1>'
        message += '<h2>Paste your semi-structured Australian Address below - then click the Geocode button</h2>'
        message += '<table style="width:70%"><tr>' + '</tr><tr>'.join(inputRows) + '</tr></table>'
        message += '<h2>then click the Geocode button</h2>'
        message += '<p><input type="submit" value="Geocode this please"/></p>'
        message += '</form></body></html>'
        self._sendPage(message)
        return

    def do_POST(self):
        '''
        Verify the posted address and return the result.
        A fresh VerifyData is created for every request and is always released,
        after flushing the log handlers, no matter how the request ends.
        '''
        # Reset all the globals
        self.data = VerifyData('[verifyAddressService-' + threading.current_thread().name + ']')
        # Set up logging for this new thread
        self.data.logger = logging.getLogger()
        try:
            self._verifyPosted()
        finally:
            # Shutdown logging
            for this_hdlr in self.data.logger.handlers:
                this_hdlr.flush()
            del self.data
        return

    def _verifyPosted(self):
        '''
        Read the posted address, verify it and send the response (the body of do_POST).
        '''
        # Get the address data - the headers are optional, so don't assume they are there
        try:
            content_len = int(self.headers.get('Content-Length') or 0)
        except ValueError:
            content_len = -1
        if content_len < 0:
            self.data.logger.critical('Bad Content-Length')
            self.send_error(400)
            return
        # Only the media type matters - ignore parameters such as '; charset=UTF-8'
        content_type = (self.headers.get('Content-Type') or '').split(';', 1)[0].strip().casefold()
        accept_type = (self.headers.get('Accept') or 'text/html').casefold()
        body = self.rfile.read(content_len)
        if content_type == 'application/x-www-form-urlencoded':
            # Create params to mirror the JSON payload
            try:
                params, problem = self._formParams(body)
            except ValueError:          # includes UnicodeDecodeError
                self.send_error(400)
                return
            if problem is not None:
                self.data.logger.critical(problem[0])
                self._sendPage(self._errorPage(problem[1]))
                return
        else:
            # Read in the JSON payload - which must be an object (dictionary) of address parts
            try:
                params = json.loads(body)
            except ValueError:          # includes json.JSONDecodeError and UnicodeDecodeError
                params = None
            if not isinstance(params, dict):
                self.data.logger.critical('Bad JSON')
                self.send_error(400)
                return
        self.data.params = params

        # Process the request - get the Address to verify
        self.data.Address = dict(params)
        self.data.logger.info('verifyAddress address(%s)', self.data.Address)
        # verify the address
        verifyAddress(self.data)

        # Check if JSON or HTML response required
        if 'application/json' in accept_type:
            # Return the results dictionary
            self.send_response(200)
            self.send_header('Content-type', 'application/json')
            self.end_headers()
            self.wfile.write(json.dumps(self.data.result).encode('utf-8'))
        else:
            # Now output the web page
            self._sendPage(self._resultPage(self.data.result))
        return
class ThreadedHTTPServer(socketserver.ThreadingMixIn, HTTPServer):
    '''
    An HTTPServer that hands each incoming request to its own thread.
    All of the behaviour comes from the two base classes - nothing is overridden here.
    '''
# fork() NOT AVAILABLE ON WINDOWS
# class ForkingHTTPServer(socketserver.ForkingMixIn, HTTPServer) :
# '''
# Handle requests in a separate thread.
# '''
# pass
def cleanText(thisText, removeCommas):
    '''
    Return thisText normalized for matching, or '' if thisText is None.
    The text is converted to a string, upper cased and stripped of colons (and of commas if removeCommas is true).
    Then backslashes become slashes, runs of white space collapse to one space,
    white space around hyphens is removed, a trailing hyphen is dropped and the ends are trimmed.
    '''
    if thisText is None:
        return ''
    cleaned = str(thisText).upper().replace(':', '')
    if removeCommas:
        cleaned = cleaned.replace(',', '')
    # The order matters - each substitution works on the output of the one before it
    substitutions = (
        (slash, '/'),       # backslashes could otherwise crash regular expressions built from this text
        (oneSpace, ' '),    # multiple white space becomes a single space
        (dashSpace, '-'),   # hyphenated streets, suburbs lose the white space around the hyphen
        (endHyphen, ''),    # hyphens at the end of streets, suburbs are dropped
    )
    for pattern, replacement in substitutions:
        cleaned = pattern.sub(replacement, cleaned)
    return cleaned.strip()
def addPostcode(this, postcode, suburb, statePid, sa1, lga, latitude, longitude):
    '''
    Add postcode data from postcodeSA1LGA, postcode_SA1LGA.csv
    Records statePid against the postcode and files the [sa1, lga, latitude, longitude] geodata
    in postcodes, and for a named suburb also in suburbs (under source 'A') and suburbLen.
    Updates the global maxSuburbLen if this suburb name is the longest seen so far.
    '''
    # this.logger.debug('Adding postcode (%s), suburb (%s)', postcode, suburb)
    global maxSuburbLen

    thisPostcode = postcodes.get(postcode)
    if thisPostcode is None:
        thisPostcode = {'states': set()}
        postcodes[postcode] = thisPostcode
    thisPostcode['states'].add(statePid)

    if suburb == '':
        # No suburb name - the geodata is filed directly under the empty name
        # NOTE(review): the original indentation was lost; this assumes the suburb indexing below was
        # only ever done for named suburbs (i.e. it all sat inside the 'else') - confirm against the repository
        thisPostcode[suburb] = [sa1, lga, latitude, longitude]
        return

    # Geodata for this suburb, by state, plus the set of suburbs in this postcode for this state
    thisPostcode.setdefault(suburb, {})[statePid] = [sa1, lga, latitude, longitude]
    thisPostcode.setdefault(statePid, set()).add(suburb)

    # Index the suburb by its soundex code, name, state and source ('A')
    soundCode = jellyfish.soundex(suburb)
    bySource = suburbs.setdefault(soundCode, {}).setdefault(suburb, {}).setdefault(statePid, {})
    bySource.setdefault('A', {})[postcode] = [sa1, lga, latitude, longitude]

    # Index the suburb by the length of its name, then by soundex code
    nameLength = len(suburb)
    if (maxSuburbLen is None) or (maxSuburbLen < nameLength):
        maxSuburbLen = nameLength
    sameLengthAndSound = suburbLen.setdefault(nameLength, {}).setdefault(soundCode, [])
    if suburb not in sameLengthAndSound:
        sameLengthAndSound.append(suburb)
    return
def addSuburb(this, localityPid, statePid, suburb, alias, sa1, lga, latitude, longitude):
    '''
    Add suburb data from localitySA1LGA, locality_SA1LGA.psv

    Updates streetTypeSuburbs, localities, localityNames, suburbs,
    localityGeodata, suburbLen and maxSuburbLen for this suburb.
    The geocode data [sa1, lga, latitude, longitude] is stored in suburbs under
    'G' when alias is 'P', under 'C' when alias is 'C' and under 'GA' otherwise.
    '''

    # this.logger.debug('Adding suburb %s', suburb)

    global maxSuburbLen

    # Add to streetTypeSuburbs if any word in suburb (after the first) is a streetType
    # The pattern is the preceding word, white space, then the street type word
    theseWords = suburb.split(' ')
    for previousWord, word in zip(theseWords, theseWords[1:]):
        if word in streetTypes:
            streetTypeSuburbs.setdefault(word, set()).add(re.compile(previousWord + r'\s+' + word))

    # Add suburb name to localities (if not already there)
    # The set is created on demand (as addLocality does) so an unknown localityPid cannot raise KeyError
    localities.setdefault(localityPid, set()).add((statePid, suburb, alias))
    localityNames.add(suburb)

    # Index the suburb by its soundex code, name and state
    soundCode = jellyfish.soundex(suburb)
    thisSuburb = suburbs.setdefault(soundCode, {}).setdefault(suburb, {}).setdefault(statePid, {})
    if alias == 'P':
        source = 'G'
    elif alias == 'C':
        source = 'C'
    else:
        source = 'GA'
    thisSuburb.setdefault(source, {})[localityPid] = [sa1, lga, latitude, longitude]
    localityGeodata[localityPid] = (sa1, lga, latitude, longitude)

    # Index the suburb by the length of its name, tracking the longest name seen so far
    suburbLength = len(suburb)
    if (maxSuburbLen is None) or (suburbLength > maxSuburbLen):
        maxSuburbLen = suburbLength
    sameSound = suburbLen.setdefault(suburbLength, {}).setdefault(soundCode, [])
    if suburb not in sameSound:
        sameSound.append(suburb)
    return
def addLocality(this, localityPid, suburb, postcode, statePid, alias):
    '''
    Add locality data from LOCALITY, LOCALITY_ALIAS, locality.psv

    Records the (statePid, suburb, alias) name against the locality, the
    locality against its state and, when a postcode is supplied, links the
    locality and the postcode in both directions.  For alias 'P' or 'C' the
    locality is also linked to every postcode already held in suburbs under
    'A' for the same suburb name and state.
    '''

    # this.logger.debug('Adding locality %s with postcode (%s) and statePid (%s)', suburb, postcode, statePid)

    localities.setdefault(localityPid, set()).add((statePid, suburb, alias))
    localityNames.add(suburb)
    stateLocalities.setdefault(statePid, set()).add(localityPid)

    hasPostcode = (postcode is not None) and (postcode != '')
    if hasPostcode:
        postcodeLocalities.setdefault(postcode, set()).add(localityPid)
        localityPostcodes.setdefault(localityPid, set()).add(postcode)
        postcodes.setdefault(postcode, {'states': set()})['states'].add(statePid)

    if alias not in ['P', 'C']:     # Don't clone postcodes for locality aliases
        return

    # Clone the postcodes recorded under 'A' for a suburb of the same name in the same state
    soundCode = jellyfish.soundex(suburb)
    knownPostcodes = suburbs.get(soundCode, {}).get(suburb, {}).get(statePid, {}).get('A', {})
    for knownPostcode in knownPostcodes:
        postcodeLocalities.setdefault(knownPostcode, set()).add(localityPid)
        localityPostcodes.setdefault(localityPid, set()).add(knownPostcode)
    return
def addStreetName(this, streetPid, streetName, streetType, streetSuffix, localityPid, alias):
    '''
    Add street names from STREET_LOCALITY, STREET_LOCALITY_ALIAS, xxx_STREET_LOCALITY_psv.psv, xxx_STREET_LOCALITY_ALIAS_psv.psv, street_details.psv

    Each acceptable spelling of the street is appended to streetNames[streetPid]
    as [name, streetType, streetSuffix, localityPid, alias], with a streetType or
    streetSuffix of None stored as ''.  The street is also linked to its locality
    and to the state(s) of that locality, and streetTypeCount is incremented for
    streetType when the locality is known.
    '''

    # this.logger.debug('Adding street name %s %s %s', streetName, streetType, streetSuffix)

    # Deal with street names that contain abbreviations
    # Build up a list of acceptable equivalent street names
    names = [(streetName, alias)]
    if streetName[:3] == 'MT ':
        names.append(('MOUNT ' + streetName[3:], 'A'))

    # For hyphenated street names we allow both halves as street names aliases
    # Plus the names in the reverse order, plus the parts in either order separated by a space instead of a hyphen
    hyphenParts = streetName.split('-')
    if len(hyphenParts) == 2:
        firstHalf, secondHalf = hyphenParts
        names.extend([
            (firstHalf, 'A'),
            (secondHalf, 'A'),
            (secondHalf + '-' + firstHalf, 'A'),
            (firstHalf + ' ' + secondHalf, 'A'),
            (secondHalf + ' ' + firstHalf, 'A'),
        ])

    # A missing street type or street suffix is stored as an empty string
    typeText = '' if streetType is None else streetType
    suffixText = '' if streetSuffix is None else streetSuffix
    theseNames = streetNames.setdefault(streetPid, [])
    for name, thisAlias in names:
        theseNames.append([name, typeText, suffixText, localityPid, thisAlias])

    # Link the street and the locality - the first locality seen for a street is the one kept
    localityStreets.setdefault(localityPid, set()).add(streetPid)
    streetLocalities.setdefault(streetPid, localityPid)

    if localityPid not in localities:
        return

    # Link the street to every state recorded for this locality
    for thisStatePid, thisSuburb, thisAlias in localities[localityPid]:
        stateStreets.setdefault(thisStatePid, set()).add(streetPid)

    # Count the use of this street type (keyed on the streetType as passed, which may be None)
    streetTypeCount[streetType] = streetTypeCount.get(streetType, 0) + 1
    return
def addStreet(this, streetPid, sa1, lga, latitude, longitude):
    '''
    Add street geocode data from streetSA1LGA, street_SA1LGA.psv

    For every name held in streetNames[streetPid] the geocode data
    [sa1, lga, latitude, longitude] is stored in streets (by soundex code and
    'name~type~suffix' key) under 'G' when the name's alias is 'P' and under 'GA'
    otherwise.  Names with no street type are also stored in shortStreets, with
    a compiled regular expression and the full street key, and any word of the
    name that is itself a street type is noted in shortTypes and shortTypeKeys.
    Every name is indexed by its length in streetLen.

    A streetPid with no entry in streetNames is logged and ignored rather than
    raising KeyError.
    '''

    # this.logger.debug('Adding street sa1 %s', sa1)

    if streetPid not in streetNames:
        # Geocode data for a street we have no names for - nothing can be indexed
        this.logger.warning('No street names for street (%s) - geocode data ignored', streetPid)
        return

    for thisName in streetNames[streetPid]:
        streetName = thisName[0]
        streetType = thisName[1]
        streetSuffix = thisName[2]
        alias = thisName[4]
        soundCode = jellyfish.soundex(streetName)

        # A missing street type/suffix is held as '', so one join yields 'name~~', 'name~~suffix', 'name~type~' or 'name~type~suffix'
        streetKey = '~'.join([streetName, streetType, streetSuffix])
        source = 'G' if alias == 'P' else 'GA'
        thisStreet = streets.setdefault(soundCode, {}).setdefault(streetKey, {})
        thisStreet.setdefault(source, {})[streetPid] = [sa1, lga, latitude, longitude]

        if streetType == '':
            # Streets with no street type are also indexed by their short name
            if streetSuffix == '':
                shortKey = streetName
                shortRegex = streetName
            else:
                shortKey = ' '.join([streetName, streetSuffix]).strip()
                shortRegex = streetName + r'\s+' + streetSuffix
            if shortKey not in shortStreets:
                shortStreets[shortKey] = {
                    'regex': re.compile(r'\b' + shortRegex + r'\b'),
                    'SK': streetKey,
                }
            shortStreets[shortKey].setdefault(source, {})[streetPid] = [sa1, lga, latitude, longitude]

            # Note any word in the street name that is, itself, a street type
            for word in streetName.split(' '):
                if word in streetTypes:
                    shortTypes.add(word)
                    shortTypeKeys.setdefault(word, set()).add(shortKey)

        # Index every street name by its length
        streetLen.setdefault(len(streetName), []).append([soundCode, streetName, streetKey])
    return
def _registerBuilding(buildingName, number, streetPid, localityPid):
    '''
    Record [number, streetPid, localityPid] against a named building and make sure
    the building name has a compiled pattern.  Does nothing if there is no building name.
    '''
    if (buildingName is None) or (buildingName == ''):
        return
    buildings.setdefault(buildingName, []).append([number, streetPid, localityPid])
    if buildingName not in buildingPatterns:
        # Any amount of white space is acceptable between the words of the building name
        buildingPatterns[buildingName] = re.compile(r'\b' + buildingName.replace(' ', r'\s+') + r'\b')


def addStreetNumber(this, buildingName, streetPid, localityPid, postcode, lotNumber, numberFirst, numberLast, mbCode, latitude, longitude, addressPid):
    '''
    Add street number from ADDRESS_DETAIL table, xxx_ADDRESS_DETAIL_psv.psv or address_detail.psv

    Counts the property against each (state, suburb) name of its locality, links
    the locality and the postcode, and stores
    [mbCode, latitude, longitude, isLot, addressPid] in streetNos[streetPid] for
    the lot number (isLot True) and for the street number(s) (isLot False).
    A numberFirst/numberLast range is expanded to individual house numbers,
    stepping by 2 unless the street type of the first name held for the street
    is CLOSE, COURT, PLACE or CUL-DE-SAC, when every number in the range is used.
    Any building name is recorded against each lot/house number stored.
    '''

    # this.logger.debug('Adding street number %s', str(numberFirst))

    if localityPid in localities:       # Count properties in this suburb
        done = set()
        for thisStatePid, thisSuburb, thisAlias in localities[localityPid]:
            thisThing = (thisStatePid, thisSuburb)
            if thisThing in done:       # Count each state/suburb once, however many aliases it has
                continue
            done.add(thisThing)
            stateCounts = suburbCount.setdefault(thisSuburb, {})
            stateCounts[thisStatePid] = stateCounts.get(thisStatePid, 0) + 1

    if (postcode is not None) and (postcode != ''):
        postcodeLocalities.setdefault(postcode, set()).add(localityPid)
        localityPostcodes.setdefault(localityPid, set()).add(postcode)

    if lotNumber is not None:
        _registerBuilding(buildingName, lotNumber, streetPid, localityPid)
        streetNos.setdefault(streetPid, {})[lotNumber] = [mbCode, latitude, longitude, True, addressPid]

    if numberFirst is not None:
        theseNumbers = streetNos.setdefault(streetPid, {})
        if numberLast is None:
            _registerBuilding(buildingName, numberFirst, streetPid, localityPid)
            theseNumbers[numberFirst] = [mbCode, latitude, longitude, False, addressPid]
        else:
            # A range of numbers - step by 2 unless the street type is one of those listed below
            step = 2
            if streetPid in streetNames:
                streetType = streetNames[streetPid][0][1]
                if streetType in ['CLOSE', 'COURT', 'PLACE', 'CUL-DE-SAC']:
                    step = 1
            for houseNo in range(int(numberFirst), int(numberLast) + 1, step):
                _registerBuilding(buildingName, houseNo, streetPid, localityPid)
                theseNumbers[houseNo] = [mbCode, latitude, longitude, False, addressPid]
    return
def addNeighbours(this, localityPid, soundCode, suburb, statePid, done, depth):
    '''
    Add the neighbouring locality_pids, for this locality, to suburbs[soundCode][suburb][statePid]['GN']

    Each neighbour that has geocode data in localityGeodata, and is not already
    present, is stored with a list copy of that data.  While depth is greater
    than zero the neighbours of each neighbour are added as well; done holds the
    localities already expanded so that none is expanded twice.
    '''

    # this.logger.debug('Adding neighbour for suburb(%s), soundCode(%s), locality(%s) in state(%s), depth(%d)', suburb, soundCode, localityPid, states[statePid][0], depth)

    # Assemble the neighbouring localities
    if localityPid not in neighbours:
        return
    done.add(localityPid)
    for neighbour in sorted(neighbours[localityPid]):
        geoNeighbours = suburbs[soundCode][suburb][statePid].setdefault('GN', {})
        if (neighbour in localityGeodata) and (neighbour not in geoNeighbours):
            # this.logger.debug('addNeighbour - adding %s', neighbour)
            geoNeighbours[neighbour] = list(localityGeodata[neighbour])
        # Do neighbours of this neighbour if required
        if depth <= 0:
            continue
        if (neighbour in neighbours) and (neighbour not in done):
            addNeighbours(this, neighbour, soundCode, suburb, statePid, done, depth - 1)
def initData(this):
'''
Read in the G-NAF tables, Australia Post data and any Other data
from the specified database (if any) and build up the data structures used to verify addresses.
'''
this.logger.info('Starting to initialize data')
# Read in the States and compile regular expressions for both the full and abbreviated name
# We use the state_pid as the key so we can use it to join to other tables
this.logger.info('Fetching states')
sts = []
if DatabaseType is not None: # Use the database tables
dfStates = pd.read_sql_query(text('SELECT state_pid, date_retired, state_name, state_abbreviation FROM STATE WHERE date_retired IS NULL'), engine.connect())
results = dfStates.values.tolist()
for (statePid, date_retired, name, state) in results:
if date_retired is not None:
continue
sts.append([statePid, cleanText(name, True), state])
elif GNAFdir is not None: # Use the standard G-NAF PSV files
# STATE_PID|DATE_CREATED|DATE_RETIRED|STATE_NAME|STATE_ABBREVIATION
for SandT in SandTs:
with open(os.path.join(GNAFdir, 'Standard', SandT + '_STATE_psv.psv'), 'rt', newline='', encoding='utf-8') as stateFile:
stateReader = csv.DictReader(stateFile, dialect=csv.excel, delimiter='|')
for rrow in stateReader:
if rrow['DATE_RETIRED'] != '': # Skip if retired
continue
sts.append([rrow['STATE_PID'], cleanText(rrow['STATE_NAME'], True), rrow['STATE_ABBREVIATION']])
else: # Use the optimised PSV files
# STATE_PID|STATE_NAME|STATE_ABBREVIATION
with open(os.path.join(DataDir, 'state.psv'), 'rt', newline='', encoding='utf-8') as stateFile:
stateReader = csv.DictReader(stateFile, dialect=csv.excel, delimiter='|')
for rrow in stateReader:
sts.append([rrow['STATE_PID'], cleanText(rrow['STATE_NAME'], True), rrow['STATE_ABBREVIATION']])
# Now build up states
this.logger.info('Building states')
for state in sts:
statePid = state[0]
stateName = state[1]
stateAbbrev = state[2]
if statePid not in states:
states[statePid] = [] # The stateAbbrev, regex(stateName), regex(stateAbbrev) for each statePid
states[statePid] = [stateAbbrev,
re.compile(r'\b' + stateName.replace(' ', r'\s+') + r'\b'),
re.compile(r'\b' + stateAbbrev.replace(' ', r'\s+') + r'\b')]
# Read in any extra state abbreviation
if addExtras:
if os.path.isfile(os.path.join(DataDir, 'extraStates.psv')):
# stateAbbrev|abbrev
with open(os.path.join(DataDir, 'extraStates.psv'), 'rt', newline='', encoding='utf-8') as stateFile:
stateReader = csv.DictReader(stateFile, dialect=csv.excel, delimiter='|')
for rrow in stateReader:
for statePid, statesInfo in states.items():
if statesInfo[0] == rrow['stateAbbrev']:
abbrev = rrow['abbrev']