-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathAGPcorrect.py
105 lines (81 loc) · 3.97 KB
/
AGPcorrect.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
#!/usr/bin/env python3
import sys
import gzip
from Bio import SeqIO
from binascii import hexlify
# Command-line handling.
#
# The script takes exactly two positional arguments: the reference FASTA
# (optionally gzip-compressed) and the AGP file to correct.  The corrected
# AGP is written to the fixed path "corrected.agp" in the current working
# directory, not to stdout, so the usage text says so explicitly instead of
# suggesting a shell redirect that would only produce an empty file.
USAGE = (
    "Usage: AGPCorrect ref.fa(.gz) scaffs.agp\n"
    "The corrected AGP is written to ./corrected.agp"
)

# No arguments, or an explicit help flag: show usage and exit successfully.
if len(sys.argv) == 1 or sys.argv[1] in ("-h", "--help"):
    print(USAGE, file=sys.stderr)
    sys.exit(0)

# Any other argument count is an error: sys.exit() with a string prints it
# to stderr and exits with status 1.
if len(sys.argv) != 3:
    sys.exit(USAGE)
def Open(file_name):
    """Open *file_name* for reading as text, decompressing gzip on the fly.

    The first two bytes of the file are compared against the gzip magic
    number (0x1f 0x8b).  On a match the file is reopened through
    ``gzip.open`` in text mode; otherwise it is reopened as a plain text
    file.  The caller is responsible for closing the returned handle.
    """
    gzip_magic = b"\x1f\x8b"
    with open(file_name, "rb") as probe:
        header = probe.read(2)
    if header == gzip_magic:
        return gzip.open(file_name, "rt")
    return open(file_name, "r")
print("Reading fasta...", file=sys.stderr)

# Build a dict mapping every FASTA record id to the length of its sequence.
# These lengths are used later as the authoritative component lengths.
seqs = {}
with Open(sys.argv[1]) as fasta_handle:
    for record in SeqIO.parse(fasta_handle, "fasta"):
        seqs[record.id] = len(record)
## All expected sequences are present.
# print(
# f"Read fasta, {len(seqs)} sequences",
# *(f"{s}: {n} bp" for s, n in seqs.items()),
# "\n",
# file=sys.stderr,
# sep="\n",
# )
# First pass over the AGP file: for every component id that appears on a
# "W" line (column 5 == "W"), record the largest value found in column 8
# (the component end coordinate) across the whole file.  The second pass
# uses this to recognise the line holding the final piece of each component.
seen = {}
with open(sys.argv[2], "r") as f:
    for line in f:
        # Skip comment/header lines, and also blank lines: a blank line
        # splits into a single empty field and would raise IndexError below.
        if line.startswith("#") or not line.strip():
            continue
        fields = line.split("\t")
        if fields[4] == "W":
            component = fields[5]
            seen[component] = max(seen.get(component, 0), int(fields[7]))
## everything printed here is expected
# Second pass: write the corrected AGP to "corrected.agp".
#
# For each "W" line whose component end (column 8) equals the largest end
# recorded for that component in `seen`, the component end is replaced by the
# sequence length taken from the FASTA (`seqs`).  The difference is
# accumulated in `correct` and added to the object start/end (columns 2 and
# 3) of the following lines of the same scaffold, so coordinates stay
# contiguous.
#
# Output goes through an explicit file handle inside a `with` block instead
# of rebinding sys.stdout, so the file is always flushed and closed, even
# when sys.exit() aborts the run, and sys.stdout is never left redirected.
with open("corrected.agp", "w") as out:
    curr_scaff = None
    correct = 0
    maxn = 1
    with open(sys.argv[2], "r") as f:
        for line in f:
            # Strip only the newline.  Slicing off the last character would
            # remove real data when the final line has no trailing newline.
            line = line.rstrip("\n")
            if not line.strip():
                continue  # tolerate blank lines instead of crashing

            if line.startswith("#"):
                if line.startswith("# DESCRIPTION"):
                    line += "\tModified by PretextView_AGPCorrect"
                # Copy header lines through so the annotated description
                # (and every other header) actually reaches the output.
                print(line, file=out)
                continue

            fields = line.split("\t")

            if curr_scaff != fields[0]:
                # A new scaffold (column 1) starts: reset the running offset.
                curr_scaff = fields[0]
                correct = 0

            # The object start is shifted only by corrections accumulated
            # from earlier lines of this scaffold.
            fields[1] = str(int(fields[1]) + correct)

            if fields[4] == "W" and (this_l := int(fields[7])) == seen[fields[5]]:
                # This line carries the last piece of the component; extend
                # it to the true sequence length from the FASTA.
                acc_l = seqs[fields[5]]
                if int(fields[6]) >= acc_l:
                    # The component start lies at or beyond the real
                    # sequence length: the AGP and FASTA are inconsistent.
                    sys.exit(
                        "Error with line: {}\n{} > {}".format(
                            "\t".join(fields), fields[6], acc_l
                        )
                    )
                correct += acc_l - this_l
                fields[7] = str(acc_l)

            # The object end includes the correction made on this line.
            fields[2] = str(int(fields[2]) + correct)
            print("\t".join(fields), file=out)

            # Track the highest numeric suffix of the scaffold names (the
            # text after the last "_"); int() raises ValueError if a name
            # does not end in a number.
            maxn = max(maxn, int(fields[0].split("_")[-1]))

    # Append every FASTA sequence that never appeared as a component, each
    # as its own single-component scaffold.  Filtering happens before
    # enumerate() so the new scaffold numbers are consecutive, and `seen` is
    # queried directly rather than rebuilding a set on every iteration.
    maxn += 1
    unplaced = ((s, n) for s, n in seqs.items() if s not in seen)
    for k, (s, n) in enumerate(unplaced):
        print(f"Scaffold_{maxn+k}\t1\t{n}\t1\tW\t{s}\t1\t{n}\t+", file=out)