-
Notifications
You must be signed in to change notification settings - Fork 1
/
Kmer.snakefile
124 lines (78 loc) · 2.96 KB
/
Kmer.snakefile
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
#################################################################################
# FUNCTIONS #
#################################################################################
#################################################################################
# GLOBALS #
#################################################################################
#################################################################################
# RULES #
#################################################################################
# Aggregate target rule: its only input is the filtered k-mer matrix archive,
# which is the declared output of rule `filter`.
rule all:
    input: "data/interim/kmers/kmer_matrix.npz"
# Build the k-mer table by running src/data/make_kmer_table.py on the list of
# FASTA files. The script is not visible here, so the parameter notes below
# are hedged.
rule kmers:
    input:
        "data/interim/fasta_files_list_no_ecoli.txt"
    output:
        # Both outputs are directories (rule `filter` opens the first with
        # zarr.DirectoryStore and the second with plyvel.DB), so they must be
        # flagged with directory() for Snakemake to track them correctly.
        directory("data/interim/kmers/kmer_ohe_zarr"),
        directory("data/interim/kmers/kmer_index_leveldb")
    params:
        # Same k as rule `filter`, which uses it as the k-mer string width.
        k=11,
        # NOTE(review): mins, v, cores and d are consumed only by the script;
        # presumably minimum count, verbosity, worker count and a working
        # directory for DSK -- confirm against make_kmer_table.py.
        mins=1,
        v=0,
        cores=12,
        d="data/interim/dsk"
    script:
        "src/data/make_kmer_table.py"
# Filter the k-mer matrix by how many rows each k-mer occurs in, then save the
# filtered matrix together with the k-mer order and genome order as one .npz.
#
# Inputs (positional):
#   input[0]  zarr directory store holding the array 'ohe' (axis 1 = k-mers)
#   input[1]  text file with one FASTA path per line
#   input[2]  LevelDB mapping k-mer string -> column index (decimal text)
# Params:
#   min, maj  exclusive lower / upper bound on the number of rows in which a
#             k-mer is non-zero; only k-mers with min < count < maj are kept
#   k         k-mer length, used as the width of the unicode dtype
rule filter:
    input:
        "data/interim/kmers/kmer_ohe_zarr",
        "data/interim/fasta_files_list_no_ecoli.txt",
        "data/interim/kmers/kmer_index_leveldb"
    params:
        maj=2260,
        min=0,
        k=11
    output:
        "data/interim/kmers/kmer_matrix.npz"
    run:
        from os.path import splitext, basename

        import dask
        import dask.array as da
        import numpy as np
        import plyvel
        import zarr

        # Genome names are the FASTA basenames without extension, kept in the
        # order they are listed in the input file.
        # NOTE(review): this assumes the rows of 'ohe' follow the same order --
        # confirm against make_kmer_table.py.
        with open(input[1], 'r') as infh:
            fasta_files = infh.read().splitlines()
        garray = np.array([splitext(basename(f))[0] for f in fasta_files])
        print("Genome array built")

        # Open the input store read-only so this rule can never create or
        # modify anything inside one of its own inputs.
        store = zarr.DirectoryStore(input[0])
        group = zarr.open_group(store=store, mode='r')
        za = group['ohe']
        a = da.from_array(za, chunks=za.chunks)
        nkmers = a.shape[1]

        # Rebuild the column -> k-mer lookup from the LevelDB index.
        # np.zeros (not np.empty) so a column missing from the index becomes
        # an empty string instead of uninitialized memory.
        ko = np.zeros(nkmers, dtype='U' + str(params['k']))
        db = plyvel.DB(input[2])
        try:
            for key, value in db:
                ko[int(value.decode('utf-8'))] = key.decode('utf-8')
        finally:
            # Release the LevelDB lock/handle even if decoding fails.
            db.close()
        print("Kmer order array built")

        print("Start of filtering")
        # dask.config.set replaces the removed dask.set_options API.
        with dask.config.set(scheduler='threads'):
            # Number of rows in which each k-mer is non-zero.
            frequencies = (a != 0).sum(axis=0)
            # Materialize the mask once: it is needed both for the dask
            # column selection and for the NumPy k-mer order array, and a
            # lazy mask would be re-evaluated for each use.
            mask = (
                (frequencies > params['min']) & (frequencies < params['maj'])
            ).compute()
            karray = a[:, mask].compute()
        print(karray.shape)
        koarray = ko[mask]
        print("End of filtering")

        np.savez(output[0], kmers=karray, kmer_order=koarray, genome_order=garray)