-
Notifications
You must be signed in to change notification settings - Fork 13
/
Copy pathutil.py
executable file
·1562 lines (1306 loc) · 71.5 KB
/
util.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
# -*- coding: utf-8 -*-
'''
Collection of short but frequently used routines and classes - shorthands for different daily tasks.
---
This file is part of Nifty python package. Copyright (c) by Marcin Wojnarski.
Nifty is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License
as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.
Nifty is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
You should have received a copy of the GNU General Public License along with Nifty. If not, see <http://www.gnu.org/licenses/>.
'''
from __future__ import absolute_import
from __future__ import print_function
import os, sys, glob, types as _types, copy, re, numbers, json, time, datetime, calendar, itertools
import logging, random, math, collections, unicodedata, heapq, threading, inspect, hashlib, binascii
from six import PY2, PY3, class_types, iterkeys, iteritems
from six.moves import builtins, StringIO, map, range, zip
import six
if PY3:
    # Python 3 compatibility shims: re-create names that are built-ins only on Python 2,
    # so the rest of this module can refer to them on both major versions.
    import io
    from functools import reduce        # reduce() is not a built-in on Python 3
    basestring = str                    # Python 3 has a single text type
    unicode = str
    file = io.IOBase                    # stand-in for the removed built-in `file` type
#####################################################################################################################################################
###
### TYPE CHECKING
###
def isint(x):
    """True if `x` is an instance of numbers.Integral (this includes bool)."""
    return isinstance(x, numbers.Integral)

# alternative spellings of the same predicate
isintegral = isinteger = isint
def isnumber(x):
    """True if `x` is an instance of numbers.Number (int, float, complex, ...)."""
    return isinstance(x, numbers.Number)
def isstring(s):
    """True if `s` is a text string, i.e. an instance of `basestring` (aliased to `str` on Python 3)."""
    return isinstance(s, basestring)
def isdict(x):
    """True if `x` is a dict or an instance of a dict subclass."""
    return isinstance(x, dict)
def istype(x):
    """True if `x` is a class, as defined by six's `class_types` (recognizes Python 2 old-style classes, too)."""
    return isinstance(x, class_types)
def islist(x, orTuple = True):
    """True if `x` is a list; tuples are accepted as well unless orTuple=False."""
    accepted = (list, tuple) if orTuple else list
    return isinstance(x, accepted)
def istuple(x):
    """True if `x` is a tuple or an instance of a tuple subclass."""
    return isinstance(x, tuple)
def iscontainer(x):
    """True if `x` is a container (list, tuple, dict, set, or any object implementing __contains__),
    but NOT a string.

    The abstract base class is taken from `collections.abc` where available: the old alias
    `collections.Container` was removed in Python 3.10.
    """
    try:
        from collections.abc import Container
    except ImportError:                                 # Python 2
        from collections import Container
    return isinstance(x, Container) and not isinstance(x, basestring)
def isiterable(x):
    """True if `x` is *any* iterable: list, tuple, dict, set, string (!), generator, or any object with __iter__.

    Objects that only implement __getitem__ are NOT recognized, because the Iterable ABC checks for __iter__ alone.
    The ABC is taken from `collections.abc` where available: the old alias `collections.Iterable`
    was removed in Python 3.10.
    """
    try:
        from collections.abc import Iterable
    except ImportError:                                 # Python 2
        from collections import Iterable
    return isinstance(x, Iterable)
def isregex(x):
    """True if `x` is a compiled regular expression, under either name the `re` module has used for that type."""
    if x is None:
        return False
    missing = type(None)                    # placeholder when `re` lacks a given attribute
    return (isinstance(x, getattr(re, '_pattern_type', missing))
            or isinstance(x, getattr(re, 'Pattern', missing)))
# def isarray(x) - defined in 'math' module
def isfunction(x, funtypes = (_types.FunctionType, _types.BuiltinFunctionType, _types.MethodType, _types.BuiltinMethodType, getattr(_types, 'UnboundMethodType',_types.MethodType))):
    """True if `x` is a 'syntactic' function: plain function, lambda, method or built-in.
    Other callables (classes, objects with __call__) are NOT functions.
    `funtypes` is the tuple of accepted types, computed once at definition time.
    """
    return isinstance(x, funtypes)
def isgenerator(x):
    """True if `x` is a generator object (from a generator function or a generator expression)."""
    return isinstance(x, _types.GeneratorType)
def ismethod(x):
    """True if `x` is a method: bound, built-in, or (Python 2 only) unbound.

    `types.UnboundMethodType` exists only on Python 2; on Python 3 a function looked up on a class
    is a plain function and is NOT reported as a method. The attribute is therefore looked up
    with a fallback instead of being accessed directly, which raised AttributeError on Python 3.
    """
    unbound = getattr(_types, 'UnboundMethodType', _types.MethodType)
    return isinstance(x, (_types.MethodType, _types.BuiltinMethodType, unbound))
def isbound(method):
    """True if `method` is bound, i.e., attached to an instance (or to a class, for classmethods).

    Reads `__self__`, which bound methods expose on both Python 2.6+ and Python 3; the original
    `im_self` attribute is Python 2 only, so the check always returned False on Python 3.
    Built-in functions on Python 3 carry their module in `__self__`; those are not counted as bound.
    """
    owner = getattr(method, '__self__', None)
    return owner is not None and not isinstance(owner, _types.ModuleType)
# Environment checks:
def islinux():
    """True if os.name reports a posix-type operating system: linux, unix, Mac OS."""
    platform = os.name
    return platform == "posix"
########################################################################################################################################################
###
### CONVERSIONS & COMMAND-LINE
###
# Conversions of un-structured values, typically strings received from the console (sys.argv), to different types of structured objects.
# If an input value is already a structured one (None value in particular), it is returned unchanged.
# If 'default' is given, it is returned in case of an exception, otherwise exceptions are passed to the caller.
RAISE = object()        # a token used in as*() functions to indicate that exceptions should be re-raised

def asbool(s, default = RAISE):
    """Interpret `s` as a boolean.

    None and the string "None" give None. Strings are compared case-insensitively against
    the lists of recognized false/true spellings. For an unrecognized value, `default` is returned
    if given, otherwise an Exception is raised.
    """
    if s in (None, "None"):
        return None
    value = s.lower() if isstring(s) else s
    if value in [False, 0, 0.0, "0", "", "false", "no", "n"]:
        return False
    if value in [True, 1, 1.0, "1", "true", "yes", "y"]:
        return True
    if default is not RAISE:
        return default
    raise Exception("Unrecognized value passed to asbool(): %s" % value)
def asint(s, default = RAISE):
    """Convert string `s` to an int; a non-string value is returned unchanged.

    If the conversion fails, `default` is returned when given, otherwise the original
    exception is re-raised. Only conversion errors are intercepted, so KeyboardInterrupt
    and SystemExit are never swallowed.
    """
    if not isstring(s): return s
    try:
        return int(s)
    except (TypeError, ValueError):
        if default is RAISE: raise
        return default
def asnumber(s, default = RAISE):
    """Convert string `s` to an int if possible, else to a float; a non-string value is returned unchanged.

    If both conversions fail, `default` is returned when given, otherwise the exception from
    float() is re-raised. Only conversion errors are intercepted, so KeyboardInterrupt
    and SystemExit are never swallowed.
    """
    if not isstring(s): return s
    try:
        return int(s)
    except (TypeError, ValueError):
        pass                                # not an integer literal: retry as a float
    try:
        return float(s)
    except (TypeError, ValueError):
        if default is RAISE: raise
        return default
def asdatetime(d, fmt = '%Y-%m-%d %H:%M:%S', default = RAISE):
    """Convert `d` to a datetime.

    - a datetime is returned unchanged;
    - a plain <date> is converted to a datetime at midnight of that day;
    - a string is parsed with `fmt` ('%Y-%m-%d %H:%M:%S' by default);
    - any other value (None in particular) is returned unchanged.
    If parsing fails, `default` is returned when given, otherwise the exception is re-raised.
    """
    # datetime is a subclass of date, so it must be tested first - otherwise the time part would be dropped
    if isinstance(d, datetime.datetime): return d
    if isinstance(d, datetime.date): return datetime.datetime(d.year, d.month, d.day)
    if not isstring(d): return d
    try:
        return datetime.datetime.strptime(d, fmt)
    except (TypeError, ValueError):
        if default is RAISE: raise
        return default
def asobject(name, context = {}, default = RAISE):
    """Look up `name` inside `context` (a dict, or any object/module whose __dict__ is searched).

    A non-string `name` is returned unchanged. When the name is not found, `default` is returned
    if given, otherwise an Exception is raised.
    """
    if not isstring(name):
        return name
    scope = context if isdict(context) else context.__dict__
    if name in scope:
        return scope[name]
    if default is not RAISE:
        return default
    raise Exception("Object can't be found: '%s'" % name)
def asdict(**items):
    """Build a dictionary from keyword arguments: asdict(a=1, b=2) gives {'a': 1, 'b': 2}."""
    return items
def runCommand(context = {}, params = None, fun = None, offset = 1):
    """
    Interpret command-line arguments as a call to a function from 'context' and execute it.

    'context' is a dict (typically globals() of the caller) or a module/object whose attributes are used.
    The arguments are taken from 'params' if given, otherwise from sys.argv[offset:].
    Unless 'fun' is given, the first argument is the command name, which is looked up in 'context'.
    The remaining arguments are passed to the function as strings: plain ones positionally,
    'key=value' ones as keyword arguments (no positional argument may follow a keyword one).
    If the command is not present in 'context' and there are no further arguments, the command
    is passed to eval(), which can execute an arbitrary expression, not only a global-scope function.
    Returns the result of the call or of the evaluation.
    Note: the called function should convert its parameters from strings to proper types by itself.
    """
    if not isdict(context):
        context = {atr: getattr(context, atr) for atr in dir(context)}
    if params is None:
        params = sys.argv[offset:]              # argv[0] is the script name, omit

    # establish the function to call; pull the command name off 'params' if needed
    cmd = None
    if fun is None:
        if not params: raise Exception("Please give a command to execute.")
        cmd = params[0]
        params = params[1:]
        if cmd in context:
            fun = context[cmd]

    # split textual 'params' into positional 'args' and keyword 'kwargs'
    args = []
    kwargs = {}
    for arg in params:
        if '=' not in arg:
            if kwargs: raise Exception("Unnamed argument cannot follow a keyword argument: %s" % arg)
            args.append(arg)
            continue
        k, v = arg.split('=', 1)
        if not k: raise Exception("There can be no spaces around '=' on the argument list: %s" % arg)
        kwargs[k] = v

    if fun:
        return fun(*args, **kwargs)
    if params:
        # when parameters are present, eval() can't be used - there is no way to pass them
        raise Exception("Object can't be found: '%s'" % cmd)

    # NOTE(review): eval() runs arbitrary code taken from the command line - only safe for trusted callers
    return eval(cmd, context)
#####################################################################################################################################################
###
### CLASSES
###
def issubclass(x, cls): #@ReservedAssignment
    """True if `x` is a class and a subclass of `cls`, False otherwise.
    Replaces the built-in issubclass(), which raises an exception when `x` is not a class;
    this variant returns False for non-classes instead.
    """
    if not isinstance(x, type):
        return False
    return builtins.issubclass(x, cls)
def classname(obj = None, full = False, cls = None):
    """Name of the class `cls`, or of the class of `obj` when `cls` is not given.
    With full=True the name is prefixed with the module name: 'module.Class'.
    """
    if cls is None:
        cls = obj.__class__
    name = cls.__name__
    if not full:
        return name
    return cls.__module__ + "." + name
def types(obj):
    """List containing the type of `obj` followed by all base types of that type. Like baseclasses(), but includes own type()."""
    own = type(obj)
    lineage = [own]
    lineage.extend(baseclasses(own))
    return lineage
def baseclasses(cls, include_self=False):
    """List of all base classes of `cls`, direct and indirect, collected by recursion over __bases__.
    Ancestors come before their descendants; `cls` itself is appended last if include_self=True.
    The `object` base is excluded. A base reachable through several paths is listed once per path.
    """
    if cls is object:
        return []
    found = [ancestor for base in cls.__bases__ for ancestor in baseclasses(base, True)]
    if include_self:
        found.append(cls)
    return found

bases = baseclasses         # alias
def subclasses(cls, include_self=False):
    """List of all subclasses of `cls`, direct and indirect, collected by recursion over __subclasses__().
    Descendants come before their ancestors; `cls` itself is appended last if include_self=True.
    """
    found = [desc for child in cls.__subclasses__() for desc in subclasses(child, True)]
    if include_self:
        found.append(cls)
    return found
#####################################################################################################################################################
###
### SEQUENCES & STREAMS
###
def unique(seq):
    """List of the elements of `seq` with duplicates removed; the first occurrence of each is kept, order preserved.
    Elements must be hashable. An empty or None `seq` gives []."""
    if not seq:
        return []
    seen = set()
    result = []
    for x in seq:
        if x in seen:
            continue
        seen.add(x)
        result.append(x)
    return result
def duplicates(seq):
    """Set of those elements of `seq` that occur more than once. Elements must be hashable."""
    seen = set()
    repeated = set()
    for x in seq:
        if x in seen:
            repeated.add(x)
        else:
            seen.add(x)
    return repeated
def duplicate(seq):
    """Any one element that occurs more than once in `seq`; None if there are no duplicates."""
    found = duplicates(seq)
    if not found:
        return None
    return found.pop()
def flatten(*seq):
    """List of all atomic elements of 'seq' (strings treated as atomic)
    together with all elements of sub-iterables of 'seq', recursively.
    When called with a single argument, that argument itself is the sequence to flatten.
    >>> flatten([[[1,2,3], (42,None)], [4,5], [6], 7, ('a','string')], (8, 9))
    [1, 2, 3, 42, None, 4, 5, 6, 7, 'a', 'string', 8, 9]
    """
    count = len(seq)                        # 'seq' is the tuple of positional arguments here
    if count == 0:
        return []
    if count == 1:
        seq = seq[0]
    flat = []
    for x in seq:
        if not hasattr(x, "__iter__") or isinstance(x, basestring):
            flat.append(x)                  # atomic element
        else:
            flat.extend(flatten(x))         # nested iterable: expand recursively
    return flat
def split_where(seq, cut_test, key_func = None):
    """Split a sequence `seq` on every position `pos` between elements where cut_test(pos,a,b) is True,
    given a = seq[pos-1], b = seq[pos], 1 <= pos < len(seq).
    Return a list of subsequences, empty if `seq` is empty. `seq` must support iteration and slicing.

    If optional `key_func` is given, a list of (subseq, key_func(start,stop)) pairs is returned,
    where (start,stop) are start-stop indices of `subseq` in `seq`:
    start = index of subseq[0] in `seq`, stop = 1 + index of subseq[-1] in `seq`,
    so that subseq == seq[start:stop] for every subsequence, the last one included.

    Example of splitting a numeric sequence into monotonic (locally non-decreasing) subsequences:
    >>> split_where([1, 5, 8, 3, 5, 8, 2, 0, 4], lambda i, x, y: x > y)
    [[1, 5, 8], [3, 5, 8], [2], [0, 4]]
    >>> split_where([], None)
    []
    """
    result = []
    prev = None
    start = 0               # index where the current subsequence begins
    last = -1               # index of the most recently visited element; -1 if `seq` is empty

    # test every pair (prev, curr) of neighboring elements, make a split wherever cut_test() is True
    for pos, curr in enumerate(seq):
        if pos >= 1 and cut_test(pos, prev, curr):
            subseq = seq[start:pos]
            result.append(subseq if key_func is None else (subseq, key_func(start, pos)))
            start = pos
        prev = curr
        last = pos

    if last == -1: return []                # `seq` was empty

    # the final subsequence; its exclusive end is one past the last visited index
    stop = last + 1
    subseq = seq[start:]
    result.append(subseq if key_func is None else (subseq, key_func(start, stop)))
    return result
def batch(sequence, size):
    """Generator that splits `sequence` into consecutive batches of at most `size` items, each yielded as a list.
    A full batch is yielded only when the next item arrives; the remainder is yielded at the end."""
    assert size >= 1
    chunk = []
    for item in sequence:
        if len(chunk) >= size:
            yield chunk
            chunk = []
        chunk.append(item)
    if chunk:
        yield chunk
def chain(iterables):
    """Generator over the items of all sequences in `iterables`, one sequence after another.
    Like itertools.chain(), but takes a sequence of sequences as a single argument."""
    for element in (item for seq in iterables for item in seq):
        yield element
def partition(test, sequence):
    """Split `sequence` into a pair of lists (positive, negative) by the truth value of test(x), order preserved.
       `test` is a 1-arg function, or None, which is equivalent to test=bool.
       `test` is called exactly ONCE for each element of `sequence`.
    """
    if test is None:
        test = bool
    negative, positive = [], []
    for x in sequence:
        target = positive if test(x) else negative
        target.append(x)
    return positive, negative
#####################################################################################################################################################
###
### DICTIONARIES & LISTS
###
def printdict(d, sep = ' = ', indent = ' ', end = '\n'):
    """Print the items of dictionary `d`, one 'key<sep>value' entry per item,
    each prefixed with `indent` and terminated with `end`."""
    template = indent + '%s' + sep + '%s' + end
    rows = [template % pair for pair in iteritems(d)]
    print(''.join(rows))
def reversedict(d):
    """Reverse mapping {value: key} of a dict of {key: value} pairs.
    Uniqueness of values is NOT checked: for a repeated value, the key visited last wins."""
    reverse = {}
    for key, value in d.items():
        reverse[value] = key
    return reverse
def list2str(l, sep = " ", f = str):
    """Convert all items of list 'l' into strings by mapping them through function 'f' and joining by separator 'sep'.
    'f' can also be a format string (any non-callable value is treated as one), applied to every item as `f % item`.
    """
    if not callable(f):
        # keep the format under its own name: a lambda referring to `f` would see `f` rebound
        # to the lambda itself and fail with a TypeError on the first item
        fmt = f
        f = lambda x: fmt % x
    return sep.join(map(f, l))
def str2list(s, sep = None):
    """Return s.split(sep), unless 's' is None or already a list/tuple - then it is returned unchanged.
    Lets string lists be written either as real lists or as sep-separated strings of words."""
    if s is None:
        return s
    if islist(s):
        return s
    return s.split(sep)
def list2dict(l, dict_type = dict, invert = False):
    """Convert a list to an {index: value} mapping of type `dict_type`; with invert=True, to {value: index}."""
    if not invert:
        return dict_type((index, value) for index, value in enumerate(l))
    return dict_type((value, index) for index, value in enumerate(l))
def obj2dict(obj):
    """Recursively convert a tree of nested objects into nested dictionaries.

    - strings (text and bytes) are atomic and returned unchanged;
    - any other iterable is converted to a list of converted elements
      (note: iterating a dict yields its keys, so a dict becomes a list of keys);
    - an object with __dict__ is converted to a dict of its attributes, skipping callables
      and attributes whose names start with '_';
    - anything else is returned unchanged.
    """
    # On Python 3 strings do have __iter__ and iterate into 1-character strings,
    # so without this guard the recursion below would never terminate.
    if isinstance(obj, (str, bytes)):
        return obj
    if hasattr(obj, "__iter__"):
        return [obj2dict(v) for v in obj]
    if hasattr(obj, "__dict__"):
        return dict((k, obj2dict(v)) for k, v in obj.__dict__.items()
                    if not callable(v) and not k.startswith('_'))
    return obj
def dict2obj(d, cls, obj = None):
    """Convert dictionary 'd' to an object of class 'cls' (instantiated via cls());
    or, if 'obj' is given, set the attributes on this existing object (like setattrs()).
    Returns the object.
    """
    # test for None explicitly: a supplied object may legitimately evaluate to False
    # (an empty container, or a NoneObject) and must still be used rather than replaced
    if obj is None:
        obj = cls()
    setattrs(obj, d)
    return obj
def class2dict(cls, exclude = "__", methods = False):
    """Collect attributes of class `cls` into a dict, using dir() so that inherited attributes are found, too.
    Names starting with `exclude` are skipped; functions and methods are skipped unless methods=True.
    Similar to getattrs() called with names=None, but detects inherited class attributes."""
    collected = {}
    for name in dir(cls):
        if name.startswith(exclude):
            continue
        value = getattr(cls, name)
        if methods or not isfunction(value):
            collected[name] = value
    return collected
def subdict(d, keys, strict = False, default = False):
    """Create a sub-dictionary of dict 'd' containing only the given 'keys' (a list, or a space-separated string).
    If strict=True, all 'keys' must be present in 'd' (KeyError otherwise).
    If default=True, missing keys are included with value None. Otherwise missing keys are skipped."""
    if isstring(keys):
        keys = keys.split()
    if strict:
        return {k: d[k] for k in keys}
    if default:
        return {k: d.get(k) for k in keys}
    if len(keys) <= len(d):
        # fewer keys than items: iterate over the keys
        return {k: d[k] for k in keys if k in d}
    # fewer items than keys: iterate over the items
    return {k: v for k, v in iteritems(d) if k in keys}
def splitkeys(d):
    """Return a new dictionary where every multi-name key of 'd' is split into separate keys.
    A string key of the form 'key1 key2 key3 ...' (several names sharing one value) is split on whitespace
    and each name is assigned the same value. All keys in 'd' must be strings, or an exception is raised."""
    return {name: val for key, val in iteritems(d) for name in key.split()}
def lowerkeys(d):
    """Copy of dictionary 'd' with all keys converted to lowercase. The class of 'd' is preserved (can be other than dict)."""
    mapping_type = d.__class__
    lowered = ((key.lower(), value) for key, value in iteritems(d))
    return mapping_type(lowered)
def getattrs(obj, names = None, exclude = "__", default = None, missing = True, usedict = False):
    """
    Like the built-in getattr(), but returns many attributes at once, as a dict of {name: value} pairs.
    Attribute names are given in 'names' as a list of strings, or a string with 1+ space-separated names.
    If names=None, all attributes present in obj.__dict__ are retrieved, except those whose names
    start with 'exclude' (no exclusion if exclude=None).

    By default, attributes are retrieved using getattr(), which detects class attributes,
    fires up descriptors (if any) and returns methods as <unbound method> not <function>.
    Only if names=None and usedict=True, attributes are taken directly from __dict__,
    which can be faster, but in some cases behaves differently than getattr().

    For missing attributes: uses 'default' (None) if missing=True (default), skips them if missing=None,
    raises an exception if missing=False. This applies to a single name given as a string as well.
    """
    if names is None:                                   # retrieving all attributes?
        if usedict:                                     # use faster but less correct approach: __dict__
            d = obj.__dict__.copy()
            if exclude is None: return d
            for k in list(d):                           # iterate over a snapshot: 'd' is modified inside the loop
                if k.startswith(exclude): del d[k]
            return d

        # proceed to a slower but fully correct approach: getattr() ...
        names = [n for n in obj.__dict__.keys() if exclude is None or not n.startswith(exclude)]

    elif isstring(names):                               # an explicit list of attributes, given as a string?
        names = names.split()

    d = {}
    if missing:
        for k in names: d[k] = getattr(obj, k, default)
    elif missing == False:
        for k in names: d[k] = getattr(obj, k)
    else:
        for k in names:
            if hasattr(obj, k): d[k] = getattr(obj, k)
    return d
def setattrs(obj, d, values = None):
    """Similar to built-in setattr(), but sets many attributes at once. Returns 'obj'.

    Either 'd' is a dictionary of {name: value} pairs; or 'd' is a list of names and 'values' is a list
    of values, paired with the names in order. 'values' can also be a single non-list value,
    in which case it is assigned to all attributes (falsy scalars like 0 or '' included).
    values=None means "no values given", so None itself cannot be assigned this way.
    """
    # A falsy 'values' together with a dict 'd' keeps the established meaning "take the values from 'd'";
    # a falsy 'values' with a plain list of names is a real value (or an empty list) to assign.
    if values or (values is not None and not isdict(d)):
        if not islist(values): values = [values] * len(d)
        pairs = list(zip(d, values))
    else:
        pairs = iteritems(d)
    for k, v in pairs:
        setattr(obj, k, v)
    return obj
def copyattrs(dest, src, names = None, missing = False):
    """Set attributes of 'dest' by shallow-copying them from 'src', which is another object or a dict.
    'names' selects the attributes to copy (all if None); 'missing' is passed to getattrs()
    when 'src' is an object. Returns 'dest'."""
    if not isdict(src):
        attrs = getattrs(src, names, missing = missing)
    elif names is None:
        attrs = src
    else:
        attrs = subdict(src, names)
    setattrs(dest, attrs)
    return dest
#def retype(obj, newtype):
# ""
# obj2 = newtype()
def setdefaults(d, keys = '', default = ''):
    '''
    For every key in 'keys' (a list, or a space-separated string), insert 'default' into dictionary 'd'
    if the key is missing or its value is None. Modifies 'd' in place and returns it.
    '''
    names = keys if islist(keys) else keys.split()
    for name in names:
        d.setdefault(name, default)
        if d[name] is None:
            d[name] = default
    return d
def get(d, key, default = ''):
    '''
    Similar to dict.get(), but returns 'default' also when the key is present
    with an empty (falsy) value, e.g. None, 0 or ''.
    '''
    return d.get(key) or default
class _ObjDictKeyError(AttributeError, KeyError):
    """Raised by ObjDict attribute access when the key is missing.
    It is an AttributeError, so hasattr() and getattr(obj, name, default) behave correctly,
    and also a KeyError, so code that catches KeyError around attribute access keeps working."""


class ObjDict(dict):
    """A dictionary whose items can be accessed like object properties (d.key), in addition to standard access (d['key']).
    Be careful with keys named like standard dict properties.
    Keys starting with '__' can't be accessed in this way.
    Reading or deleting a missing key through attribute syntax raises an exception that is both
    an AttributeError and a KeyError."""

    def __getattr__(self, name):
        # invoked only when regular attribute lookup fails, so real dict attributes take precedence
        if name.startswith('__'): raise AttributeError(name)
        try:
            return self[name]
        except KeyError:
            raise _ObjDictKeyError(name)

    def __setattr__(self, name, value):
        if name.startswith('__'): raise AttributeError(name)
        self[name] = value

    def __delattr__(self, name):
        if name.startswith('__'): raise AttributeError(name)
        try:
            del self[name]
        except KeyError:
            raise _ObjDictKeyError(name)

    def __getstate__(self):
        "State for serialization: a plain dict copy of the items."
        return dict(self)
class ComparableMixin:
    """Mixin that derives all remaining comparison operators from __lt__(), which the subclass must provide.
    Two objects are equal when neither is less than the other."""

    def __eq__(self, other):
        return not (self < other or other < self)

    def __ne__(self, other):
        if self < other:
            return True
        return other < self

    def __gt__(self, other):
        return other < self

    def __ge__(self, other):
        return not (self < other)

    def __le__(self, other):
        return not (other < self)
from heapq import heappush, heappop
class Heap(object):
    """Object-oriented wrapper around the standard heapq module, with an optional custom comparison key.
    When `key` is given, the heap stores (key(item), item) pairs, so items are ordered by their key values
    (and by the items themselves when keys are equal)."""

    def __init__(self, items = None, key = None):
        self.key = key                      # function key(item) that generates the key value, or None
        self.items = []                     # the underlying heap list
        if not items:
            return
        if key:
            self.items = [(key(item), item) for item in items]
        else:
            self.items = list(items)        # copy, so the caller's list is not reordered
        heapq.heapify(self.items)

    def push(self, item):
        "Insert `item` into the heap."
        entry = (self.key(item), item) if self.key else item
        heappush(self.items, entry)

    def pop(self):
        "Remove and return the smallest item."
        top = heappop(self.items)
        return top[1] if self.key else top

    def __len__(self):
        return len(self.items)
def heapmerge(*inputs):
    """Like heapq.merge(), merge multiple sorted inputs (any iterables) into a single sorted output,
    but with labelled inputs: each input is a pair (iterable, label) and each yielded result
    is a pair (item, label of the input) - so it's known what input a given item originates from.
    Labels can be any objects (e.g., the object that produced the input stream)."""

    def tagged(stream, label):
        # attach the input's label to every item of that input
        for item in stream:
            yield (item, label)

    streams = []
    for inp in inputs:
        streams.append(tagged(*inp))
    return heapq.merge(*streams)
def sizeof(obj):
    """Estimate total memory usage of a (possibly nested) `obj`: sys.getsizeof() of the object itself
       plus, for list/tuple/set/frozenset/dict containers, the recursively computed sizes of their members
       (both keys and values for a dict). Does NOT handle circular object references!
    """
    total = sys.getsizeof(obj)
    if isinstance(obj, dict):
        members = list(obj.keys()) + list(obj.values())
    elif isinstance(obj, (list, tuple, set, frozenset)):
        members = obj
    else:
        return total
    for member in members:
        total += sizeof(member)
    return total
#####################################################################################################################################################
###
### OBJECTS
###
class __Labelled__(type):
    """Metaclass that implements labels for the actual class: inheritable lists of attributes that exhibit a special behavior.

    A label is a class attribute (e.g. '__transient__') that holds a list of attribute names.
    In a class definition it can be written as a list, as a space-separated string of names,
    or as an inner class whose attributes are copied to the outer class and become the labelled ones.
    """

    def __init__(cls, *args):
        cls.__labels__ = []                     # names of attributes that represent labels in this class

    def labels(cls, names): #@NoSelf
        "Declare several labels at once; 'names' is a list or a space-separated string of label names."
        for name in str2list(names): cls.label(name)

    def label(cls, name): #@NoSelf
        "Declare 'name' as a label and set up the list of labelled attributes, the list to be stored under 'name'."
        cls.normLabel(name)                     # convert cls's own labelling to canonical representation
        cls.inheritList(name)                   # inherit labellings from superclasses
        cls.__labels__.append(name)             # mark 'name' as a label

    def normLabel(cls, label): #@NoSelf
        "Normalize a list of labelled attributes declared in this class, by converting it from a string or inner class if necessary."
        attrs = getattr(cls, label, [])         # list of names of attributes labelled by 'label'
        if istype(attrs):                       # inner class?
            inner = attrs
            vals = getattrs(inner)
            for name in iterkeys(vals):         # check that all attrs can be safely copied to top class, without overwriting regular attr
                if name in cls.__dict__:
                    raise Exception("Attribute %s appears twice in %s: as a regular attribute and inside label class %s" %
                                    (name, cls, label))
            setattrs(cls, vals)                 # copy all attrs from the inner class to top class level
            attrs = cls._getattrs(inner)        # names of the copied attributes, in source-code order
        elif isstring(attrs):                   # space-separated list of attribute names?
            attrs = attrs.split()
        setattr(cls, label, attrs)

    def inheritList(cls, label): #@NoSelf
        "If 'label' is the name of a special attribute containing a list of items, append lists from base classes to cls's list."
        baseitems = [getattr(base, label, []) for base in cls.__bases__]    # get lists defined in base classes
        baseitems = reduce(lambda x,y:x+y, baseitems, [])                   # combine into one list
        items = getattr(cls, label)
        combined = unique(items + baseitems)                                # add cls's items at the BEGINNING and uniqify
        setattr(cls, label, combined)

    def _getattrs(outer, cls): #@NoSelf
        """Get names of all attributes of the inner class 'cls', arranged in the same ORDER as in the source code,
        and return them as a list of names. Attributes that can't be located in the source are appended at the end.
        Warning: only the attributes that appear at the beginning of their line are detected in the source.
        For example, if attributes are defined like this:
            x = y = 0
        only 'x' will be located, and 'y' will be placed at the end.
        """
        from tokenize import generate_tokens
        import token
        src = outer._getsource(cls.__name__)
        tokens = generate_tokens(StringIO(src).readline)
        tokens = [(t[1], t[4]) for t in tokens if t[0] == token.NAME]               # pairs (name, line) for all NAME tokens
        attrs = [name for (name,line) in tokens if line.strip().startswith(name)]   # only take names that start the line
        attrs = unique(attrs)                                                       # remove duplicates

        attrdict = getattrs(cls)
        attrs = [name for name in attrs if name in attrdict]    # remove names that don't appear in 'attrdict'
        for name in attrdict:                                   # append names that don't appear in 'attrs'
            if name not in attrs: attrs.append(name)
        return attrs

    def _getsource(outer, name): #@NoSelf
        """Improved variant of inspect.getsource(), corrected for inner classes: returns the source of the body
        of the inner class 'name', extracted from the source of the class 'outer'.
        Standard getsource() works incorrectly when two outer classes in the same file have inner classes with the same name.
        """
        outsrc = inspect.getsource(outer)       # all source of the outer class, contains somewhere the inner class 'name'
        pat = re.compile(r'^(\s*)class\s+' + re.escape(name) + r'\b')
        lines = outsrc.splitlines()

        for i, header in enumerate(lines):      # find the 1st line with "class 'name'"
            match = pat.match(header)
            if not match: continue
            indent = match.group(1)
            deeper = (indent + ' ', indent + '\t')      # prefixes of lines indented deeper than the header

            j = i + 1
            while j < len(lines):               # extract all lines of the block following "class 'name'"
                line = lines[j]
                sline = line.strip()
                if line.startswith(deeper) or sline == '' or sline.startswith('#'): j += 1
                else: break
            return '\n'.join(lines[i+1:j])

        return outsrc                           # as a fallback, return all 'outsrc'
class __Object__(__Labelled__):
    "Metaclass for Object. Implements the __transient__ and __shared__ labels."

    def __init__(cls, *args): #@NoSelf
        # super() is already bound to 'cls', so 'cls' must not be passed again
        super(__Object__, cls).__init__(*args)
        cls.label('__transient__')      # declare '__transient__' as a label and set up the list of labelled attributes, cls.__transient__
        cls.label('__shared__')         # declare '__shared__' as a label and set up the list of labelled attributes, cls.__shared__
class Object(six.with_metaclass(__Object__, object)):
"""For easy creation of objects that can have assigned any attributes, unlike <object> instances. For example:
obj = Object(); obj.x = 21
obj = Object(x = 21, y = 'ala')
obj = Object({'x':21, 'y':'ala'})
With base <object> this is impossible - a subclass, even if with empty implementation, is required to assign to attributes.
Additionally, Object implements:
- equality '==' operator __eq__ that performs deep comparison by comparing __dict__ dictionaries, not only object IDs.
- __str__ that prints the class name and its __dict__, with full recursion like for nested dict's (__repr__ == __str__) - if __verbose__=True.
- __getstate__ that understands the __transient__ list of attributes and excludes them from serialization.
- copy() and deepcopy() also honor __transient__, since they utilize __getstate__ unless custom __copy__/__deepcopy__
is implemented in a subclass.
When subclassing Object:
- Some attributes can be labelled as "transient", by adding their names to subclass'es __transient__ list.
__transient__ can also be given as a space-separated string "name1 name2 ...", which will be converted automatically
into a list by the metaclass, after subclass definition. Additionally, the metaclass automatically extends the list
with names declared as transient in superclasses.
__transient__ is typically a class-level attribute, but can be overriden in instances to modify
serialization behavior on per-instance basis.
- If you provide custom metaclass for your Object subclass, remember to inherit that metaclass from __Object__ and call
super(X, cls).__init__ in your __init__(cls).
- Subclasses can easily add their own labels, by implementing a metaclass that subclasses __Object__
and invokes cls.label('labelName') in __init__. New labels will be automatically provided with
conversions and inheritance, like __transient__ is.
"""
__transient__ = [] # list of names of attributes to be excluded from serialization and (deep-)copying
__shared__ = [] # list of names of attributes that should be shallow-copied (shared between copies) in deepcopy
# __verbose__ = True # in __str__, shall we print recursively all properties of the object? if False, standard __str__ is used
def __init__(self, __dict__ = {}, **kwargs): #@ReservedAssignment
    """Populate instance attributes from the `__dict__` mapping, then from keyword arguments.

    Keyword arguments are applied last, so they win when a name occurs in both sources.
    The `{}` default is only read here, never modified.
    """
    attrs = self.__dict__
    for source in (__dict__, kwargs):
        attrs.update(source)
def __eq__(self, other):
    """Compare by content: True when `other` has a __dict__ equal to this object's __dict__.

    An `other` without a __dict__ attribute is compared as None.
    """
    mine = self.__dict__
    theirs = getattr(other, '__dict__', None)
    return mine == theirs
# def __str__(self):
# if not self.__verbose__: return object.__str__(self)
# items = ["%s = %s" % (k,repr(v)) for k,v in iteritems(self.__dict__)]
# return "%s(%s)" % (self.__class__.__name__, ', '.join(items)) #str(self.__dict__)
# __repr__ = __str__
def __getstate__(self):
    """State of this object for serialization.

    When __transient__ is empty the original __dict__ is returned as-is (not a copy);
    otherwise a shallow copy of __dict__ with all transient names removed is returned.
    """
    transient = self.__transient__
    if not transient:
        return self.__dict__
    state = dict(self.__dict__)
    for name in transient:
        state.pop(name, None)           # names that are not present are silently skipped
    return state
def __setstate__(self, state):
    """Restore the object from `state` by installing it directly as the instance __dict__ (no copy is made)."""
    self.__dict__ = state
def __deepcopy__(self, memo):
    """Deep-copy this object, honouring the __shared__ specifier.

    A new instance is created with __new__ (so __init__ is not run) and filled with the
    attributes returned by __getstate__(). Attributes named in __shared__, as well as
    generator objects, are assigned by reference; everything else is deep-copied.
    """
    klass = self.__class__
    clone = klass.__new__(klass)
    memo[id(self)] = clone              # registered up-front, so self-references map onto the clone
    shared = self.__shared__
    for name, value in iteritems(self.__getstate__()):
        by_reference = (bool(shared) and name in shared) or isgenerator(value)
        setattr(clone, name, value if by_reference else copy.deepcopy(value, memo))
    return clone
class NoneObject(Object):
    """Mock-up object that, like None, is falsy in bool(), but can additionally hold any data
    or provide other custom behavior."""

    def __bool__(self):
        # truth-value hook (Python 3 name): instances are always falsy
        return False

    __nonzero__ = __bool__              # the same hook under its Python 2 name
#####################################################################################################################################################
###
### STRINGS & TEXT
###
def merge_spaces(s, pat = re.compile(r'\s+')):
    """Collapse every run matched by `pat` (by default: any whitespace, including newlines and tabs)
    into a single space, then strip leading/trailing whitespace. Similar to normalize-space() in XPath."""
    collapsed = pat.sub(' ', s)
    return collapsed.strip()
def flat_spaces(s, pat = re.compile(r'[\n\r\t]')):
    r"""Replace each match of `pat` (by default: a single \n, \r or \t character) with a regular space.
    Runs are not merged and nothing is stripped."""
    flattened = pat.sub(' ', s)
    return flattened
def escape(s):
    r"""Slash-escape (or encode) non-printable characters, including \n and \t.
    Returns the result of s.encode('unicode_escape'), which is a byte string."""
    escaped = s.encode('unicode_escape')
    return escaped
def ascii(text):
    """ASCII-fication of 'text': national characters are replaced with their non-accented ASCII analogs.

    Byte strings are first decoded as UTF-8. The text is NFKD-normalized and every character that
    still cannot be encoded as ASCII is dropped. Returns a text string on Python 3, a byte string otherwise.
    See http://stackoverflow.com/a/1383721/1202674, function bu(), for possible improvements.
    """
    if isinstance(text, six.binary_type):       # byte string in both Py2 and Py3
        text = text.decode("UTF-8")
    decomposed = unicodedata.normalize('NFKD', text)
    stripped = decomposed.encode('ASCII', 'ignore')
    if six.PY3:
        return stripped.decode('ASCII')
    return stripped
def prefix(sep, string):
    """Return 'sep' followed by str(string) when 'string' is truthy; otherwise return an empty string."""
    return sep + str(string) if string else ''
def indent(text, spaces=8, fill=None, strip=False):
    """Insert 'fill' (or, when fill is None, 'spaces' space characters) at the beginning of each line of 'text'.
    With strip=True the text is stripped of surrounding whitespace beforehand."""
    pad = ' ' * spaces if fill is None else fill
    body = text.strip() if strip else text
    return pad + ('\n' + pad).join(body.split('\n'))
### See also jsonpickle (http://jsonpickle.github.com)
class JsonObjEncoder(json.JSONEncoder):
    """Extends JSON serialization to custom classes. Serializes any non-json-serializable object by
    outputting its __json__() or __getstate__() or __dict__, or str(obj) as the last resort.
    Sets are converted to lists. Good for printing, but not reversible: info about class of the object gets lost."""

    def default(self, obj):
        """Return a JSON-serializable stand-in for `obj`; called by json for objects it cannot encode itself."""
        if isinstance(obj, set):
            return list(obj)
        if hasattr(obj, '__json__'):
            return obj.__json__()
        # Since Python 3.11 every object inherits a default object.__getstate__, which returns None
        # for objects having neither __dict__ nor __slots__ values. Only a __getstate__ that differs
        # from that built-in default is honoured here, so the __dict__ / str() fallbacks stay reachable.
        if hasattr(obj, '__getstate__') and not self._inherits_object_getstate(obj):
            return obj.__getstate__()
        try:
            return obj.__dict__
        except AttributeError:          # no instance __dict__ (built-in or __slots__-only type)
            return str(obj)

    @staticmethod
    def _inherits_object_getstate(obj):
        """True if the class of `obj` uses the built-in object.__getstate__ (it exists on Python 3.11+ only)."""
        builtin = getattr(object, '__getstate__', None)
        return builtin is not None and getattr(type(obj), '__getstate__', None) is builtin
def dumpJson(obj):
    """Serialize 'obj' to a JSON string, with JsonObjEncoder handling objects that json cannot encode natively."""
    encoder_cls = JsonObjEncoder
    return json.dumps(obj, cls = encoder_cls)
def printJson(*objs):
    """Print each of 'objs' as JSON indented by 4 spaces, one print() call per object, encoded with JsonObjEncoder."""
    for obj in objs:
        text = json.dumps(obj, indent = 4, cls = JsonObjEncoder)
        print(text)
# Alternative spellings of dumpJson / printJson; every alias is bound to the very same function object.
jsondump = dumpjson = jsonDump = dumpJson
jsonprint = printjson = jsonPrint = printJson
class JsonReversibleEncoder(json.JSONEncoder):          ### DRAFT
    """Draft of an encoder that keeps class information in the output.

    Instances of the classes registered as values of `classesHandled` are encoded as their
    __dict__ extended with a '/cls' entry produced by classname(obj, full=True); the classname()
    helper is not defined in this class and is expected to exist at module level.
    Every other object is passed on to the standard JSONEncoder.default(), which raises TypeError.
    """

    # Registry of handled classes; only its values are used (as the isinstance() targets).
    # Empty by default. Previously it was re-created on every default() call, so nothing could be registered.
    classesHandled = {}

    def default(self, obj):
        """Return a dict representation for handled classes, else delegate to JSONEncoder.default()."""
        handled = tuple(self.classesHandled.values())   # isinstance() needs a tuple; a list raises TypeError
        if handled and isinstance(obj, handled):
            d = {'/cls': classname(obj, full=True)}
            d.update(obj.__dict__)
            return d
        return json.JSONEncoder.default(self, obj)
class JsonReversibleDecoder(json.JSONDecoder):          ### DRAFT
    """Draft placeholder: adds nothing to json.JSONDecoder yet."""
class JsonDict(dict):
    """A dictionary linked to a JSON file on disk.

    Initial contents are read from the file in __init__ (if `load` is true and the file exists).
    sync() and close() write the dictionary back by re-opening the file and rewriting all of
    its contents. The file is not kept open between these operations.
    """

    def __init__(self, filename, load = True, indent = 2, **json_kwargs):
        """Remember the file name and the json.dump() options; optionally load existing contents.

        `indent` and any extra keyword arguments are passed to json.dump() on every save.
        """
        super(JsonDict, self).__init__()
        self.filename = filename
        json_kwargs['indent'] = indent
        self.json_kwargs = json_kwargs
        if load and os.path.exists(filename):
            self.load()

    def load(self):
        """Read the JSON file and merge its contents into this dictionary (existing keys are overwritten)."""
        with open(self.filename, 'rt') as source:
            loaded = json.load(source)
        self.update(loaded)

    def save(self):
        """Rewrite the whole file with the current contents of the dictionary."""
        with open(self.filename, 'wt') as target:
            json.dump(self, target, **self.json_kwargs)

    def sync(self):
        """Save the contents to the file; same as save()."""
        self.save()

    def close(self):
        """Save the contents to the file; there is no open file handle to release."""
        self.save()
class JSON(object):                 ### DRAFT
    """JSON printer & parser. Customizable through the class-level (or per-instance) settings below.

    Draft: `metadata` and `handlers` are declared but not used by any method yet, and
    encode()/decode() are empty placeholders that return None.
    """

    metadata   = False              # if True, printing includes additional info in JSON output, to enable proper restoring of all classes and objects from primitives
    handlers   = {}                 # custom handlers for specific types, implemented as external functions rather than __getstate__/__setstate__
    indent     = None               # passed to json.dumps() as `indent`
    separators = (', ', ': ')       # passed to json.dumps() as `separators`
    sort_keys  = False              # passed to json.dumps() as `sort_keys`

    def dumps(self, obj):
        """Serialize 'obj' to a JSON string with JsonObjEncoder, applying indent, separators and sort_keys."""
        return json.dumps(obj, cls = JsonObjEncoder, indent = self.indent,
                          separators = self.separators, sort_keys = self.sort_keys)

    def loads(self, s):
        """Parse JSON string 's' into primitives; JSON objects become OrderedDicts that keep the key order."""
        primitive = json.loads(s, object_pairs_hook = collections.OrderedDict)
        return primitive

    def encode(self, obj):
        "Encode 'obj' into a primitive (but possibly complex) value. Not implemented yet: returns None."

    def decode(self, primitive):
        "Decode primitive value with metadata into a complex object. Not implemented yet: returns None."
        # obj = cls.__initstate__(state)
### DAST printing ###
def dumpdast(obj, **kwargs):
    """Return 'obj' encoded by dast.encode() from the package's `data` module; kwargs are forwarded to it."""
    from .data import dast          # imported locally, at call time
    encoded = dast.encode(obj, **kwargs)
    return encoded
def printdast(obj, **kwargs):
    """Print 'obj' encoded by dast.encode() from the package's `data` module; kwargs are forwarded to it."""
    from .data import dast          # imported locally, at call time
    encoded = dast.encode(obj, **kwargs)
    print(encoded)
### Hashes & checksums
def hashmd5(s, n = 4):
    """Stable cross-platform hash function for strings. Will always return the same output for a given input (suitable for DB storage),
    in contrast to the standard hash() function whose implementation varies between platforms and can change in the future.
    Calculates MD5 digest and returns the first 'n' bytes (2*n hex digits) converted to an unsigned n-byte long integer, 1 <= n <= 16.

    's' can be a text string, which is encoded as UTF-8 before hashing, or a byte string (bytes/bytearray),
    which is hashed as-is. A text string and its UTF-8 encoded form therefore give the same hash.

    >>> hashmd5("Ala ma kota", 3), hashmd5("Ala ma kota", 7)
    (9508390, 40838224789264552)

    This function works also for Unicode strings and should return the same value on Python 2 & 3,
    however on Python 2, doctests cannot handle unicode properly and return a different value during test:
    > > > hashmd5(u"ąśężźćółńĄŚĘŻŹĆÓŁŃ") == 886848614
    True
    """
    # Only text needs encoding; byte strings have no usable .encode() on Python 3, and on Python 2
    # calling it would trigger an implicit ASCII decode that fails for non-ASCII bytes.
    if not isinstance(s, (bytes, bytearray)):
        s = s.encode('utf-8')
    return int(hashlib.md5(s).hexdigest()[:2*n], 16)
def crc32(data):
    """
    CRC-32 checksum of 'data', computed in a way compatible across all Python versions and hardware platforms.
    Text strings are encoded as UTF-8 first; any other input is passed to binascii.crc32() unchanged.
    The returned value is an unsigned 4-byte integer in the range: [0, 2**32-1].
    """
    raw = data.encode('utf-8') if isinstance(data, six.text_type) else data
    checksum = binascii.crc32(raw)
    return checksum & 0xffffffff        # the mask turns a possibly negative result into an unsigned one
#####################################################################################################################################################
###
### NUMBERS & COUNTING
###
class Counter(object):
    """Accumulator for a weighted stream of numbers or numpy arrays.

    add() collects the values, mean() returns their weighted mean and can be called
    at the end or at any point during accumulation.
    """

    def __init__(self):
        """Start with an empty accumulator."""
        self.total, self.count = 0, 0   # sum of weighted values and sum of weights; their types follow the inputs

    def add(self, x, weight = 1):
        """Add value 'x' with the given 'weight' to the accumulator."""
        self.total += x * weight
        self.count += weight

    def mean(self):
        """Weighted mean of all values added so far: total divided by float(count)."""
        weight_sum = float(self.count)
        return self.total / weight_sum
def minmax(*args):
    """Return the (min, max) pair of the given values.

    Values can be passed as separate arguments or as a single argument containing a sequence.
    A 2-element input that is already ordered is returned as-is (the very same object),
    any other input produces a new tuple.
    """
    values = args[0] if len(args) == 1 else args
    if len(values) != 2:
        return (min(values), max(values))
    first, second = values[0], values[1]
    if first <= second:
        return values
    return (second, first)
def percent(x, y, ifzero = 0):
    '''
    Returns percentage value of x in y, computed as 100 * x / y, or 'ifzero' when y==0.
    The numeric type of the result follows the rules of the '/' operator for the given arguments.
    '''
    if y != 0:
        return 100 * x / y
    return ifzero
def bound(val, minn = 0.0, maxx = 1.0):
    """Restrict 'val' to the range [minn,maxx]: the upper limit is applied first, then the lower one."""
    capped = maxx if maxx < val else val
    return minn if minn > capped else capped
def divup(x, y):
    """Divide x by y rounding the result up: the smallest whole number not less than x/y.

    Floor division '//' is used explicitly, so that the result is the same on Python 2 and 3
    and stays exact for arbitrarily large integers; the plain '/' operator is true division
    on Python 3 and returned e.g. 3.5 for divup(7, 2).
    """
    div = x // y
    if div * y == x: return div         # x is an exact multiple of y
    return div + 1
def noise(scale = 0.1):
    """Symmetric uniform random noise in the range: [-scale, +scale)"""
    unit = (random.random() - 0.5) * 2      # uniform in [-1, 1)
    return unit * scale
def mnoise(scale = 1.1):
    """Multiplicative random noise in the range: [1/scale, scale); uniform and symmetric in log-scale.
    'scale' should be >1.0. For example, mnoise(2) is in the range [0.5,2.0)"""
    log_scale = math.log(scale)
    exponent = noise(log_scale)             # uniform in [-ln(scale), +ln(scale))
    return math.exp(exponent)
def parseint(s):
    """Flexible parsing of integers from real-world strings.

    The string may contain thousand separators (spaces, commas, dots), newlines or parentheses;
    all of them are removed before the conversion. Returns None for an empty string or None.
    Raises ValueError if what remains is not a valid integer literal.
    """
    if not s: return None
    # Characters are filtered one by one instead of s.translate(None, chars): the two-argument
    # translate() exists only for Python 2 byte strings and raises TypeError for any other string type.
    cleaned = ''.join(ch for ch in s if ch not in ',. \n()')
    return int(cleaned)
def enumerate_limit(sequence, limit, start = 0):
    """Like enumerate(), but reads at most `limit` number of items from `sequence`. Numbering begins at `start`."""
    head = itertools.islice(sequence, limit)
    return enumerate(head, start)
#####################################################################################################################################################
###
### DATE & TIME
###
class Timer(object):
    """Create a Timer object once, then read many times the amount of time that elapsed so far."""

    def __init__(self):
        """Start measuring from the moment of creation."""
        self.start = time.time()

    def reset(self):
        """Restart measuring from the current moment."""
        self.start = time.time()

    def elapsed(self):
        """Time elapsed until now, in seconds, floating-point."""
        return time.time() - self.start

    # floating-point results

    def seconds(self):
        return self.elapsed()

    def minutes(self):
        return self.elapsed() / 60

    def hours(self):
        return self.elapsed() / (60 * 60)

    # integer results (rounded down)

    def iseconds(self):
        return int(self.seconds())

    def iminutes(self):
        return int(self.minutes())

    def ihours(self):
        return int(self.hours())

    def __str__(self):
        """Elapsed time with 1 decimal digit: in seconds below 10 minutes, in minutes below 10 hours, otherwise in hours."""
        secs = self.elapsed()
        ten_minutes = 60 * 10
        ten_hours = 60 * 60 * 10
        if secs < ten_minutes:
            return "%.1f s" % secs
        if secs < ten_hours:
            return "%.1f min" % (secs / 60)
        return "%.1f hours" % (secs / (60 * 60))
"""
Different ways to represent date and time in python:
- timestamp (1360847012.038727): no. of seconds since the Epoch
- time.time() returns the current timestamp; it is counted from the Epoch (UTC), so the value does not depend on the local timezone
- time.struct_time - equivalent of C struct tm. No timezone info, but keeps day of week, day of year, daylight saving.
- datetime.datetime
- datetime.timedelta: difference between two 'datetime' objects
"""
# different time periods in seconds; for use with functions that operate on seconds, like time.time() or time.sleep()
MINUTE = 60
HOUR   = 60 * MINUTE
DAY    = 24 * HOUR
WEEK   = 7 * DAY
YEAR   = DAY * 365.2425         # a float: 365.2425 days
# current date in structural form, as datetime.date
def today():
    """Current date as a datetime.date object."""
    return datetime.date.today()
def todayString(fmt = '%Y-%m-%d'):
    """Current date formatted with strftime() according to 'fmt'."""
    current = datetime.date.today()
    return current.strftime(fmt)
# current date+time in structural form, as datetime.datetime; use time.time() for flat form of #seconds from Epoch
def now():
    """Current date and time as a datetime.datetime object, as returned by datetime.datetime.now()."""
    return datetime.datetime.now()
def nowString(fmt = '%Y-%m-%d %H:%M:%S'):
    """Current date and time formatted with strftime() according to 'fmt'."""
    current = datetime.datetime.now()
    return current.strftime(fmt)
def utcnow():
    """Current UTC date and time as a datetime.datetime object, as returned by datetime.datetime.utcnow()."""
    return datetime.datetime.utcnow()