From 58fb52567b402ab54bc8c4f1126ac1565d46f61f Mon Sep 17 00:00:00 2001 From: Maria Nattestad Date: Mon, 20 Jan 2020 15:42:29 -0800 Subject: [PATCH] Python 3 fixes (#26) * Read gzipped or uncompressed files with try/except. * Use explicit int() for divisions that used to produce integers in python2 -- e.g. in binary search algorithms where the result is used for indexing. * Exit the top-level script with an error code if any component script fails. --- scripts/Assemblytics | 10 +++-- scripts/Assemblytics_uniq_anchor.py | 55 ++++++++++++------------ scripts/Assemblytics_variant_charts.R | 2 - scripts/Assemblytics_within_alignment.py | 29 ++++++++----- 4 files changed, 52 insertions(+), 44 deletions(-) diff --git a/scripts/Assemblytics b/scripts/Assemblytics index 635f0de..5a458ee 100755 --- a/scripts/Assemblytics +++ b/scripts/Assemblytics @@ -86,7 +86,7 @@ then then echo "BETWEEN,DONE,Step 2: Assemblytics_between_alignments.pl completed successfully. Now finding variants within alignments." >> $LOG_FILE >&2 echo "3. Finding variants within alignments" - $SCRIPT_PATH/Assemblytics_within_alignment.py --delta $OUTPUT_PREFIX.Assemblytics.unique_length_filtered_l$UNIQUE_LENGTH.delta.gz --min $MINIMUM_SIZE > $OUTPUT_PREFIX.variants_within_alignments.bed + $SCRIPT_PATH/Assemblytics_within_alignment.py --delta $OUTPUT_PREFIX.Assemblytics.unique_length_filtered_l$UNIQUE_LENGTH.delta.gz --min $MINIMUM_SIZE --output $OUTPUT_PREFIX.variants_within_alignments.bed if [ -e $OUTPUT_PREFIX.variants_within_alignments.bed ]; then @@ -123,19 +123,21 @@ then echo "SUMMARY,DONE,Step 5: Assemblytics_summary.py completed successfully" >> $LOG_FILE else echo "SUMMARY,FAIL,Step 5: Assemblytics_summary.py failed" >> $LOG_FILE + exit 1 fi - else echo "COMBINE,FAIL,Step 4: combining variants failed" >> $LOG_FILE + exit 1 fi else - echo "WITHIN,FAIL,Step 3: Assemblytics_within_alignment.py failed: Possible problem before this step or with Python on server." >> $LOG_FILE + exit 1 fi else echo "BETWEEN,FAIL,Step 2: Assemblytics_between_alignments.pl failed: Possible problem with Perl or show-coords on server." >> $LOG_FILE + exit 1 fi else echo "UNIQFILTER,FAIL,Step 1: Assemblytics_uniq_anchor.py failed: Possible problem with Python or Python packages on server." >> $LOG_FILE + exit 1 fi - diff --git a/scripts/Assemblytics_uniq_anchor.py b/scripts/Assemblytics_uniq_anchor.py index ff4bebe..9ef2088 100755 --- a/scripts/Assemblytics_uniq_anchor.py +++ b/scripts/Assemblytics_uniq_anchor.py @@ -25,21 +25,21 @@ def run(args): if unique_length == 10000: print("Use --unique-length X to set the unique anchor length requirement. Default is 10000, such that each alignment must have at least 10000 bp from the query that are not included in any other alignments.") - - print("header:") - - f = open(filename) - header1 = f.readline() - # the first two bytes show whether a file is gzipped - if header1[0:2]=="\x1f\x8b": - f.close() - f = gzip.open(filename) - print(f.readline().strip()) - else: - print(header1.strip()) - + try: + f = gzip.open(filename, 'rt') + header1 = f.readline().strip() + print("Detected gzipped delta file. Reading...") + except: + f = open(filename, 'r') + header1 = f.readline().strip() + print("Detected uncompressed delta file. Reading...") + # Ignore the first two lines for now + print("\n") + print("Header (2 lines):") + print(header1) print(f.readline().strip()) + print("\n") linecounter = 0 @@ -56,7 +56,6 @@ def run(args): for line in f: if line[0]==">": - fields = line.strip().split() current_query_name = fields[1] current_header = line.strip() @@ -84,9 +83,9 @@ def run(args): num_queries = len(lines_by_query) print("Filtering alignments of %d queries" % (num_queries)) - num_query_step_to_report = num_queries/100 + num_query_step_to_report = int(num_queries/100) if num_queries < 100: - num_query_step_to_report = num_queries/10 + num_query_step_to_report = int(num_queries/10) if num_queries < 10: num_query_step_to_report = 1 @@ -97,22 +96,24 @@ def run(args): query_counter += 1 if (query_counter % num_query_step_to_report) == 0: - print("Progress: %d%%" % (query_counter*100/num_queries)) + print("Progress: %d%%" % (int(query_counter*100/num_queries))) print("Progress: 100%") print("Deciding which alignments to keep: %d seconds for %d queries" % (time.time()-before,num_queries)) before = time.time() - fout = gzip.open(output_filename + ".Assemblytics.unique_length_filtered_l%d.delta.gz" % (unique_length),'w') + fout = gzip.open(output_filename + ".Assemblytics.unique_length_filtered_l%d.delta.gz" % (unique_length),'wt') - f = open(filename) - header1 = f.readline() - if header1[0:2]=="\x1f\x8b": - f.close() - f = gzip.open(filename) - header1 = f.readline() - - fout.write(header1) # write the first line that we read already + try: + f = gzip.open(filename, 'rt') + header1 = f.readline().strip() + print("Detected gzipped delta file. Reading...") + except: + f = open(filename, 'r') + header1 = f.readline().strip() + print("Detected uncompressed delta file. Reading...") + + fout.write(header1) fout.write(f.readline()) linecounter = 0 @@ -329,7 +330,7 @@ def binary_search(query, numbers, left, right): if left >= right: return right - mid = (right+left)/2 + mid = int((right+left)/2) if query == numbers[mid]: diff --git a/scripts/Assemblytics_variant_charts.R b/scripts/Assemblytics_variant_charts.R index 396b93c..9532318 100755 --- a/scripts/Assemblytics_variant_charts.R +++ b/scripts/Assemblytics_variant_charts.R @@ -34,8 +34,6 @@ comma_format<-function(num) { formatC(abs(num),format="f",big.mark=",",drop0trailing = TRUE) } - - # Prep data for log-scaled plot alt <- bed diff --git a/scripts/Assemblytics_within_alignment.py b/scripts/Assemblytics_within_alignment.py index bfc50eb..3e07f51 100755 --- a/scripts/Assemblytics_within_alignment.py +++ b/scripts/Assemblytics_within_alignment.py @@ -12,16 +12,21 @@ def run(args): filename = args.delta minimum_variant_size = args.minimum_variant_size - f = open(filename) - header1 = f.readline() - # check first 2 bytes to see if the file is gzipped - if header1[0:2]=="\x1f\x8b": - f.close() - f = gzip.open(filename) - header1 = f.readline() - + try: + f = gzip.open(filename, 'rt') + header1 = f.readline().strip() + print("Detected gzipped delta file. Reading...") + except: + f = open(filename, 'r') + header1 = f.readline().strip() + print("Detected uncompressed delta file. Reading...") + # Ignore the first two lines for now - f.readline() + print("\n") + print("Header (2 lines):") + print(header1) + print(f.readline().strip()) + print("\n") linecounter = 0 @@ -74,18 +79,20 @@ def run(args): f.close() + fout = open(args.output_path, 'w') newcounter = 1 for line in variants: if line[4] >= minimum_variant_size: line[3] = "Assemblytics_w_%d" % (newcounter) - print("\t".join(map(str,line[0:10])) + ":" + str(line[11]) + "-" + str(line[12]) + ":+\t" + line[10]) + fout.write("\t".join(map(str,line[0:10])) + ":" + str(line[11]) + "-" + str(line[12]) + ":+\t" + line[10] + "\n") newcounter += 1 - + fout.close() def main(): parser=argparse.ArgumentParser(description="Outputs MUMmer coordinates annotated with length of unique sequence for each alignment") parser.add_argument("--delta",help="delta file" ,dest="delta", type=str, required=True) parser.add_argument("--min",help="Minimum size (bp) of variant to include, default = 50" ,dest="minimum_variant_size",type=int, default=50) + parser.add_argument("--output", help="Output file with variants in bed format.", dest="output_path", type=str, required=True) parser.set_defaults(func=run) args=parser.parse_args() args.func(args)