-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathfile-readin
executable file
·357 lines (305 loc) · 12.1 KB
/
file-readin
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
#!/usr/bin/python3
''' Meant as a one-shot OS disk cache warmer, e.g. for search indexes, data that gets read in unusual seeky ways, and whatnot.
Note that when these files are larger than what the cache can hold,
this is sort of pointless (particularly doing it more than once).
Shows read speed (except for small files, since those speeds are likely to be misleading).
'''
import sys,os,stat,time,getopt,re
# The amount to read at once, in bytes (the size passed to each read() call in readin()).
# 8MB seems fast and not very harsh on temporary memory use; that setting is kept
# below for reference, the value currently in effect is 64KiB.
#chunksize = 8*1024*1024
chunksize = 64*1024
# We don't show read speed on small files: readin() only reports a speed
# when more than this many bytes were read from a file.
#   should probably be at least the disk's read size to be anywhere near accurate
#   a lowish multiple of 64k is probably okay
min_report = 512*1024
# Running total of bytes read, added to by readin() and summarized at the end of main().
global_total_bytes = 0
def kmg(amount, kilo=1000, append='', thresh=15, nextup=0.9, rstrip0=True, extradigits=0, i_for_1024=True):
    """ For more easily skimmable sizes

        e.g.
             kmg(3429873278462) == '3.4T'
             kmg(342987327)     == '343M'
             kmg(34298)         == '34K'
             '%sB'%kmg(2342342324)                           == '2.3GB'
             '%sB'%kmg(2342342324,  kilo=1024)               == '2.2GiB'
             '%sB'%kmg(2342342324,  kilo=1024, extradigits=1) == '2.18GiB'
             '%sB'%kmg(19342342324, kilo=1024)               == '18GiB'
             '%sB'%kmg(19342342324, kilo=1024, extradigits=1) == '18GiB'  (because of rstrip0)

        Decimal/SI kilos by default, so useful beyond bytes.
        Specify kilo=1024 if you want binary kilos. By default this also adds the i
        (controlled by i_for_1024).

        thresh controls where we take one digit away, e.g. for 1.3GB but 16GB.
            Default is at 15, which is entirely arbitrary.
            Disable using None.

        nextup makes us switch to the next higher prefix earlier, e.g. 700GB but 0.96TB.
            An amount of exactly nextup*prefix already uses that prefix (900 -> '0.9K').
            Disable using None.

        extradigits=1 (or maybe more) to unconditionally see a less-rounded number
            (though note rstrip0 can still apply)

        rstrip0     whether to take off trailing decimal zeroes, and then a bare '.' (defaults to true)

        append      is mostly meant for an optional space between number and unit.

        Returns a string. Amounts below nextup*kilo are shown without a prefix;
        anything beyond peta is shown as (large) peta amounts.
    """
    if nextup is None:
        nextup = 1.0
    if thresh is None:
        thresh = 1000
    nextup = float(nextup)

    # Largest first, so that the first prefix the amount reaches is the one to use.
    # Exa, zetta and yotta are deliberately left out: too large to comprehend anyway.
    prefixes = (
        (kilo**5, 'P'),
        (kilo**4, 'T'),
        (kilo**3, 'G'),
        (kilo**2, 'M'),
        (kilo,    'K'),
    )

    # Default: less than a (nextup'th of a) kilo; omits multiplier and i
    showval = amount
    showdigits = 0

    # Both comparisons are >= so that an amount sitting exactly on a boundary
    # (e.g. 900 with the defaults, or 1000 with nextup=None) always gets a prefix;
    # previously such amounts fell between the two tests and left showval unassigned.
    if abs(amount) >= nextup*kilo:
        for csize, mchar in prefixes:
            if abs(amount) >= nextup*csize:
                showval = amount/float(csize)
                # abs() so that negative amounts get the same number of digits as positive ones
                if abs(showval) < thresh:
                    showdigits = 1 + extradigits
                else:
                    showdigits = 0 + extradigits
                append += mchar
                if i_for_1024 and kilo == 1024:
                    append += 'i'
                break

    ret = ("%%.%df" % (showdigits)) % showval

    # Only strip when there is a decimal point, or we would eat significant zeroes ('100' -> '1')
    if rstrip0 and '.' in ret:
        ret = ret.rstrip('0').rstrip('.')

    ret += append
    return ret
def parse_kmg(s, kilo=1000, listen_to_i=False):
    """ e.g. parse_kmg('1k')                == 1000
             parse_kmg('2 MiB', kilo=1024)  == 2097152
             parse_kmg('5 bytes')           == 5

        Quick and dirty implementation, may need work:
        all letters are removed and what is left is parsed as a number
        (a single comma is taken to be a decimal comma);
        the first letter following the number, if it is one of kmgtp
        (either case), picks the multiplier.

        Kilo defaults to decimal kilos.
        If you want binary kilos, specify kilo=1024
        ...OR set listen_to_i=True for things like 4.5KiB
        (any 'i' in the string then switches to kilo=1024).
        This is false by default because you ought to
        know the amount of preformatting you need to do.

        Returns an int (truncated towards zero).
        Raises ValueError when no number can be parsed from the string.
    """
    if listen_to_i and 'i' in s:
        kilo = 1024
    multipliers = {
        'k': kilo,
        'm': kilo**2,
        'g': kilo**3,
        't': kilo**4,
        'p': kilo**5,
    }

    ns = re.sub(r'[A-Za-z]', '', s)
    if ns.count(',') == 1:         # pseudo-relocalization,
        ns = ns.replace(',', '.')  # e.g. for dutch people.

    try:
        ret = float(ns)
    except ValueError as e:
        raise ValueError("Didn't understand value %r" % ns) from e

    # Only the letter directly after the number (optionally separated by whitespace) counts
    # as the unit. Looking for kmgtp anywhere in the string made words mess things up,
    # e.g. the 't' in '5 bytes' turned it into 5 tera.
    unit = re.search(r'[0-9.,]\s*([A-Za-z])', s)
    if unit is not None:
        ret *= multipliers.get(unit.group(1).lower(), 1)

    return int(ret)
def min_sec(sec, second_digits=1, left_pad=2):
    """ takes float value, represents as minutes, seconds, e.g.
          min_sec(62.33242) == '1m02.3s'
          min_sec(1.3)      == '0m01.3s'
          min_sec(13)       == '0m13.0s'

        second_digits refers to the digits after the decimal point to print,
          min_sec(5.3,0)    == '0m05s'

        left_pad to the left (zero) padding on the seconds (there by default for things to line up).
          min_sec(5.3,0,0)  == '0m5s'

        The value is rounded to second_digits once, before it is split up,
        so that rounding carries over properly:
          min_sec(59.96)    == '1m00.0s'

        Negative values are shown as their absolute value with a leading '-'.
        Returns a string.
    """
    second_digits = max(0, int(second_digits))
    scale = 10 ** second_digits

    # Work in whole units of the smallest shown digit. Formatting the whole and
    # fractional seconds separately lost the carry when the fraction rounded up to 1.
    total_units = int(round(abs(sec) * scale))
    whole_secs, frac_units = divmod(total_units, scale)
    mins, secs = divmod(whole_secs, 60)

    ret = []
    if sec < 0 and total_units > 0:  # no sign on something that rounds to zero
        ret.append('-')
    ret.append('%dm' % mins)
    ret.append('%0*d' % (left_pad, secs))
    if second_digits > 0:
        ret.append('.%0*d' % (second_digits, frac_units))
    ret.append('s')
    return ''.join(ret)
def readin(fn, first_bytes=None, last_bytes=None, printstuff=1):
    """ Read a file and discard the data, so that it passes through the OS's cache.

        fn:          path of the file to read. Anything that is not a regular file is skipped.
        first_bytes: when > 0, stop after this many bytes from the start of the file.
                     None or 0 means: read the whole file.
        last_bytes:  when > 0, and first_bytes is > 0 as well, also read this many bytes
                     from the end of the file. Has no effect when the whole file is read anyway.
        printstuff:  when true, report progress, read speed and problems on stdout.

        Adds the number of bytes read to the module-level global_total_bytes.

        Returns None. OSErrors (file vanished, no permission, ...) are reported
        instead of raised, so that one unreadable file does not stop a larger run.
    """
    global global_total_bytes

    # None and non-positive values both mean "no limit asked for".
    # (Comparing a None default against 0 is a TypeError on python3.)
    head_limit = first_bytes if (first_bytes is not None and first_bytes > 0) else 0
    tail_limit = last_bytes if (last_bytes is not None and last_bytes > 0) else 0

    announced = False  # whether the "Reading: ..." line (without newline) is on screen
    try:
        stob = os.stat(fn)
        size = stob.st_size
        if not stat.S_ISREG(stob.st_mode):
            if printstuff:
                sys.stdout.write("Not regular file: %s\n" % fn)
                sys.stdout.flush()
            return

        if printstuff:
            sys.stdout.write("Reading: %-50s" % fn)
            sys.stdout.flush()
            announced = True

        t = time.time()
        readtotal = 0
        with open(fn, 'rb') as f:
            # Start of the file, or all of it when there is no head limit.
            # Always read in chunks, so that a large first_bytes does not become one huge buffer.
            while True:
                want = chunksize
                if head_limit:
                    want = min(chunksize, head_limit - readtotal)
                readnow = len(f.read(want))
                if readnow == 0:  # EOF
                    break
                readtotal += readnow
                if head_limit and readtotal >= head_limit:
                    break

            if head_limit and tail_limit:  # we didn't just read all, and want to read from the end
                # Start no earlier than where the head stopped, to not read the overlap twice
                f.seek(max(readtotal, size - tail_limit))
                while True:
                    readnow = len(f.read(chunksize))
                    if readnow == 0:  # EOF
                        break
                    readtotal += readnow  # count the tail too, so reported speeds are not too low

        dtime = time.time() - t
        global_total_bytes += readtotal

        # Speeds on small files are misleading; and a zero duration cannot be divided by
        if readtotal > min_report and dtime > 0:
            mbpersec = (readtotal/dtime)/(1024*1024)
            if mbpersec > 600:
                extra = ' (probably mostly cached in memory)'
            elif mbpersec > 150:
                extra = ' (probably partly cached in memory)'
            else:  # slower than that probably came from disk; not commented on
                extra = ''
            if printstuff:
                print(" -- %d MB/s%s" % (mbpersec, extra))
                sys.stdout.flush()
        else:
            if printstuff:
                print('')  # for its newline

    except OSError as e:
        # Best-effort: carry on with the next file, but do say what happened
        if printstuff:
            if announced:
                print(" -- failed: %s" % e)  # also ends the pending "Reading:" line
            else:
                print("Could not read: %s (%s)" % (fn, e))
            sys.stdout.flush()
def usage():
    """ Print the command line help text to stdout, one line per entry. """
    help_lines = (
        "Reads data from files (and discards)",
        "Written as a user-controlled informed readahead thing ahead of real use, to exploit the OS's page cache",
        "",
        " usage: readin [options] paths",
        "",
        " If you do not use -r, you must specify filenames",
        "",
        " -r recursive. If you dont use this, specify all filenames you want read",
        " -w use filename whitelist (instead of reading everything), and add to that whitelist",
        " (substring; will add globs later)",
        " -o only do stat(), don't read",
        " -s <bytes> how many bytes to read at the start of the file.",
        " -e <bytes> how many bytes to read at the end of the file.",
        " If neither is specified, reads the whole file.",
        " -h this help",
        "",
        " Example: readin -r -w .jpg -w .png .",
        " Example: readin -r -w .mp3 -s 2k -e 128 /data/MusicBulk",
        "",
    )
    for help_line in help_lines:
        print(help_line)
def main():
    """ Command line entry point: parse options from sys.argv, read (or only stat)
        the requested files, then print overall throughput.

        Options are the ones described by usage().
        Exits with status 0 after -h, and with -1 (after printing the problem
        and the usage text) when the options cannot be parsed.
        Ctrl-C stops the reading, but the summary for the work done so far is still printed.
    """
    recursive   = False
    stat_only   = False
    first_bytes = 0
    last_bytes  = 0
    whitelist   = []
    count_files = 0

    def whitelist_match(s):
        " True when there is no whitelist, or when any whitelist entry is a substring of s "
        if not whitelist:
            return True
        for ss in whitelist:
            if ss in s:
                print("file %s matches %s" % (s, ss))
                return True
        return False

    def stat_file(path):
        " stat() for its side effect only; a failure is reported, and does not stop the run "
        try:
            os.stat(path)
        except OSError as e:
            print("Could not stat: %s (%s)" % (path, e))

    try:
        optlist, args = getopt.getopt(sys.argv[1:], 'rs:he:w:o')
        for o, v in optlist:
            if o == '-r':
                recursive = True
            if o == '-w':
                whitelist.append(v)
            if o == '-s':
                first_bytes = parse_kmg(v, kilo=1024)
            if o == '-o':
                stat_only = True
            if o == '-e':
                last_bytes = parse_kmg(v, kilo=1024)
            if o.startswith('-h'):
                usage()
                sys.exit(0)
    except Exception as e:  # bad option or unparseable size; sys.exit() above is not caught by this
        print(e)
        print()
        usage()
        sys.exit(-1)

    start_time = time.time()
    try:
        for fn in args:
            fn = os.path.realpath(fn)
            if not os.path.exists(fn):
                print("Does not exist: %r" % fn)
            elif os.path.isfile(fn):  # directly specified file?
                print(fn)
                if whitelist_match(fn):
                    if stat_only:
                        stat_file(fn)
                    else:
                        readin(fn, first_bytes, last_bytes)
                    count_files += 1
            elif os.path.isdir(fn):  # directory
                if not recursive:
                    print("Skipping directory (use -r to recurse): %r" % fn)
                    continue
                # NOTE: followlinks=True means a symlink pointing back up the tree can make this walk endless
                for curdir, dirs, files in os.walk(fn, followlinks=True):
                    for filename in files:
                        fullpath = os.path.join(curdir, filename)
                        if whitelist_match(filename):
                            count_files += 1
                            if stat_only:
                                # the file itself; this used to stat the top directory (fn) over and over
                                stat_file(fullpath)
                            else:
                                readin(fullpath, first_bytes, last_bytes)
    except KeyboardInterrupt:
        print("\n\n Interrupted")

    took_time = time.time() - start_time

    # Show more decimals on short runs, which would otherwise all read as 0.0s
    sec_digits = 1
    if took_time < 0.5:  # fast
        sec_digits = 3
    if took_time < 0.1:  # near-instant, probably all in RAM
        sec_digits = 4

    if took_time > 0:  # the rates below divide by it
        if not stat_only:
            print("\nOverall speed:\n %sB in %s (~%sB/sec)" % (
                kmg(global_total_bytes),
                min_sec(took_time, sec_digits),
                kmg(global_total_bytes/took_time),
            ))
        print(" %sfiles / sec" % (
            kmg(count_files / took_time, append=' '),
        ))
# Run the command line interface only when executed as a script, not when imported.
if __name__=='__main__':
    main()