-
Notifications
You must be signed in to change notification settings - Fork 54
/
Copy pathchunk.py
44 lines (32 loc) · 1.16 KB
/
chunk.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
'''
Split a file into a given number of chunks randomly, line by line.
Usage: chunk.py <input file> <number of chunks> [-s <random seed>] [--verbose]
'''
import sys
import random
import os
import argparse
def _parse_args(argv=None):
    """Parse and validate the command-line arguments.

    argv is the list of argument strings to parse; None makes argparse
    fall back to sys.argv[1:]. Exits with a usage error when num_chunks
    is smaller than 1.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("input_file", help="path to the input file")
    parser.add_argument(
        "num_chunks",
        help="number of chunks to split the input file into",
        type=int,
    )
    parser.add_argument(
        "-s", "--seed",
        help="sets a seed for the random number generator",
        default=None,
    )
    parser.add_argument(
        "-v", "--verbose",
        help="will write counts during process to standard out",
        action="store_true",
        default=False,
    )
    args = parser.parse_args(argv)
    # random.randint(0, num_chunks - 1) needs a non-empty range; report a
    # usage error up front rather than failing on the first input line.
    if args.num_chunks < 1:
        parser.error("num_chunks must be at least 1")
    return args


def main(argv=None):
    """Randomly distribute the lines of the input file over num_chunks files.

    Each line is written to one output file chosen uniformly at random.
    Output files are created in the current working directory and named
    "<basename>_<n><ext>", where <basename> and <ext> come from the input
    file name and <n> runs from 0 to num_chunks - 1.

    argv is the list of command-line argument strings; None means
    sys.argv[1:].
    """
    args = _parse_args(argv)

    # Compare against None so an explicitly supplied (even empty) seed is used.
    if args.seed is not None:
        print("seeding: %s" % (args.seed,))
        random.seed(args.seed)

    basename, ext = os.path.splitext(os.path.basename(args.input_file))

    # Binary mode on both sides: lines are copied byte-for-byte, with no
    # newline translation and no str/bytes mismatch on Python 3.
    with open(args.input_file, "rb") as source:
        outputs = []
        try:
            for n in range(args.num_chunks):
                outputs.append(open("%s_%s%s" % (basename, n, ext), "wb"))

            counter = 0
            for line in source:
                target = random.randint(0, args.num_chunks - 1)
                outputs[target].write(line)
                counter += 1
                if args.verbose and counter % 100000 == 0:
                    print(counter)
        finally:
            # Close whatever was opened, even if a later open() or a write failed.
            for handle in outputs:
                handle.close()


if __name__ == "__main__":
    main()