Skip to content

Commit

Permalink
Python 3 fixes (#26)
Browse files Browse the repository at this point in the history
* Read gzipped or uncompressed files with try/except.
* Use explicit int() for divisions that produced integers in Python 2 but produce floats in Python 3 -- e.g. in binary search algorithms, where the result is used as an index.
* Exit the top-level script with an error code if any component script fails.
  • Loading branch information
MariaNattestad authored Jan 20, 2020
1 parent 594bf1c commit 58fb525
Show file tree
Hide file tree
Showing 4 changed files with 52 additions and 44 deletions.
10 changes: 6 additions & 4 deletions scripts/Assemblytics
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ then
then
echo "BETWEEN,DONE,Step 2: Assemblytics_between_alignments.pl completed successfully. Now finding variants within alignments." >> $LOG_FILE
>&2 echo "3. Finding variants within alignments"
$SCRIPT_PATH/Assemblytics_within_alignment.py --delta $OUTPUT_PREFIX.Assemblytics.unique_length_filtered_l$UNIQUE_LENGTH.delta.gz --min $MINIMUM_SIZE > $OUTPUT_PREFIX.variants_within_alignments.bed
$SCRIPT_PATH/Assemblytics_within_alignment.py --delta $OUTPUT_PREFIX.Assemblytics.unique_length_filtered_l$UNIQUE_LENGTH.delta.gz --min $MINIMUM_SIZE --output $OUTPUT_PREFIX.variants_within_alignments.bed

if [ -e $OUTPUT_PREFIX.variants_within_alignments.bed ];
then
Expand Down Expand Up @@ -123,19 +123,21 @@ then
echo "SUMMARY,DONE,Step 5: Assemblytics_summary.py completed successfully" >> $LOG_FILE
else
echo "SUMMARY,FAIL,Step 5: Assemblytics_summary.py failed" >> $LOG_FILE
exit 1
fi

else
echo "COMBINE,FAIL,Step 4: combining variants failed" >> $LOG_FILE
exit 1
fi
else

echo "WITHIN,FAIL,Step 3: Assemblytics_within_alignment.py failed: Possible problem before this step or with Python on server." >> $LOG_FILE
exit 1
fi
else
echo "BETWEEN,FAIL,Step 2: Assemblytics_between_alignments.pl failed: Possible problem with Perl or show-coords on server." >> $LOG_FILE
exit 1
fi
else
echo "UNIQFILTER,FAIL,Step 1: Assemblytics_uniq_anchor.py failed: Possible problem with Python or Python packages on server." >> $LOG_FILE
exit 1
fi

55 changes: 28 additions & 27 deletions scripts/Assemblytics_uniq_anchor.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,21 +25,21 @@ def run(args):
if unique_length == 10000:
print("Use --unique-length X to set the unique anchor length requirement. Default is 10000, such that each alignment must have at least 10000 bp from the query that are not included in any other alignments.")


print("header:")

f = open(filename)
header1 = f.readline()
# the first two bytes show whether a file is gzipped
if header1[0:2]=="\x1f\x8b":
f.close()
f = gzip.open(filename)
print(f.readline().strip())
else:
print(header1.strip())

try:
f = gzip.open(filename, 'rt')
header1 = f.readline().strip()
print("Detected gzipped delta file. Reading...")
except:
f = open(filename, 'r')
header1 = f.readline().strip()
print("Detected uncompressed delta file. Reading...")

# Ignore the first two lines for now
print("\n")
print("Header (2 lines):")
print(header1)
print(f.readline().strip())
print("\n")

linecounter = 0

Expand All @@ -56,7 +56,6 @@ def run(args):

for line in f:
if line[0]==">":

fields = line.strip().split()
current_query_name = fields[1]
current_header = line.strip()
Expand Down Expand Up @@ -84,9 +83,9 @@ def run(args):
num_queries = len(lines_by_query)
print("Filtering alignments of %d queries" % (num_queries))

num_query_step_to_report = num_queries/100
num_query_step_to_report = int(num_queries/100)
if num_queries < 100:
num_query_step_to_report = num_queries/10
num_query_step_to_report = int(num_queries/10)
if num_queries < 10:
num_query_step_to_report = 1

Expand All @@ -97,22 +96,24 @@ def run(args):

query_counter += 1
if (query_counter % num_query_step_to_report) == 0:
print("Progress: %d%%" % (query_counter*100/num_queries))
print("Progress: %d%%" % (int(query_counter*100/num_queries)))
print("Progress: 100%")

print("Deciding which alignments to keep: %d seconds for %d queries" % (time.time()-before,num_queries))
before = time.time()

fout = gzip.open(output_filename + ".Assemblytics.unique_length_filtered_l%d.delta.gz" % (unique_length),'w')
fout = gzip.open(output_filename + ".Assemblytics.unique_length_filtered_l%d.delta.gz" % (unique_length),'wt')

f = open(filename)
header1 = f.readline()
if header1[0:2]=="\x1f\x8b":
f.close()
f = gzip.open(filename)
header1 = f.readline()

fout.write(header1) # write the first line that we read already
try:
f = gzip.open(filename, 'rt')
header1 = f.readline().strip()
print("Detected gzipped delta file. Reading...")
except:
f = open(filename, 'r')
header1 = f.readline().strip()
print("Detected uncompressed delta file. Reading...")

fout.write(header1)
fout.write(f.readline())

linecounter = 0
Expand Down Expand Up @@ -329,7 +330,7 @@ def binary_search(query, numbers, left, right):

if left >= right:
return right
mid = (right+left)/2
mid = int((right+left)/2)


if query == numbers[mid]:
Expand Down
2 changes: 0 additions & 2 deletions scripts/Assemblytics_variant_charts.R
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,6 @@ comma_format<-function(num) {
formatC(abs(num),format="f",big.mark=",",drop0trailing = TRUE)
}



# Prep data for log-scaled plot
alt <- bed

Expand Down
29 changes: 18 additions & 11 deletions scripts/Assemblytics_within_alignment.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,16 +12,21 @@ def run(args):
filename = args.delta
minimum_variant_size = args.minimum_variant_size

f = open(filename)
header1 = f.readline()
# check first 2 bytes to see if the file is gzipped
if header1[0:2]=="\x1f\x8b":
f.close()
f = gzip.open(filename)
header1 = f.readline()

try:
f = gzip.open(filename, 'rt')
header1 = f.readline().strip()
print("Detected gzipped delta file. Reading...")
except:
f = open(filename, 'r')
header1 = f.readline().strip()
print("Detected uncompressed delta file. Reading...")

# Ignore the first two lines for now
f.readline()
print("\n")
print("Header (2 lines):")
print(header1)
print(f.readline().strip())
print("\n")

linecounter = 0

Expand Down Expand Up @@ -74,18 +79,20 @@ def run(args):

f.close()

fout = open(args.output_path, 'w')
newcounter = 1
for line in variants:
if line[4] >= minimum_variant_size:
line[3] = "Assemblytics_w_%d" % (newcounter)
print("\t".join(map(str,line[0:10])) + ":" + str(line[11]) + "-" + str(line[12]) + ":+\t" + line[10])
fout.write("\t".join(map(str,line[0:10])) + ":" + str(line[11]) + "-" + str(line[12]) + ":+\t" + line[10] + "\n")
newcounter += 1

fout.close()

def main():
    """Parse the command-line options and dispatch them to ``run``.

    Recognised options:
      --delta   path of the delta file to read (required)
      --min     minimum variant size in bp to report (int, default 50)
      --output  path of the BED file to write (required)
    """
    cli = argparse.ArgumentParser(
        description="Outputs MUMmer coordinates annotated with length of unique sequence for each alignment"
    )
    cli.add_argument(
        "--delta",
        dest="delta",
        type=str,
        required=True,
        help="delta file",
    )
    cli.add_argument(
        "--min",
        dest="minimum_variant_size",
        type=int,
        default=50,
        help="Minimum size (bp) of variant to include, default = 50",
    )
    cli.add_argument(
        "--output",
        dest="output_path",
        type=str,
        required=True,
        help="Output file with variants in bed format.",
    )
    # The handler is registered as a default so it travels with the parsed
    # namespace and is invoked through it below.
    cli.set_defaults(func=run)

    parsed = cli.parse_args()
    parsed.func(parsed)
Expand Down

0 comments on commit 58fb525

Please sign in to comment.