# -*- coding: utf-8 -*-
"""
Created on Wed May 11 17:58:38 2016

@author: lpsmith

Read SNP-array BAF data, correct it for dye bias, and write it out again in
EXPANDS format.

For every sample three tab-separated files are written, each one a subset of
the previous:

* ``<sample>[_dbcorr]_diff_filter.txt`` -- SNPs whose BE and blood
  frequencies differ by at least ``blood_be_diff_threshold``.
* ``<sample>[_dbcorr]_basic_filters.txt`` -- additionally, the blood
  frequency must be callable as homozygous or heterozygous.
* ``<sample>[_dbcorr]_basic_and_exome_filters.txt`` -- additionally, the SNP
  must be present in the exome data and agree with it.

Run as a script; importing the module only defines the constants and helper
functions and performs no file I/O.
"""

dye_median = 0.5237  # from running median_calc.py; June 8th, 2016
blood_be_diff_threshold = 0.03  # from running FilterFinder.py; June 17, 2016
het_threshold = 0.18  # from running FilterFinder_blood_call.py; June 17, 2016
ex_snp_diff_threshold = 0.4  # from running FilterFinder_exome_diff.py; June 17, 2016

# When true, apply the dye-bias correction and tag output names "_dbcorr".
dbcorr = True

# The probeset file, which correlates probe name to position.
infilename = "probe_set_build37_forpartek.txt"

samples = ["1047", "163", "222", "230", "312", "391", "611", "660", "729",
           "732", "824", "930"]
#samples = ["1047"]

# Column header shared by all three output files.
OUTPUT_HEADER = "chr\tstartpos\tendpos\tAF_Tumor\tPN_B\tAF_Blood\n"


def correct_dye_bias(baf, median=dye_median):
    """Rescale *baf* piecewise-linearly so that *median* maps to 0.5.

    Values below *median* are scaled onto [0, 0.5); values at or above it
    are scaled onto [0.5, 1].
    """
    if baf < median:
        return baf * 0.5 / median
    return 0.5 + 0.5 * (baf - median) / (1 - median)


def blood_is_callable(snp_blood):
    """Return False when *snp_blood* lies in an ambiguous zone.

    The ambiguous zones are the open intervals
    (het_threshold, 0.5 - het_threshold) and
    (0.5 + het_threshold, 1 - het_threshold), where the blood sample can be
    called neither homozygous nor heterozygous.
    """
    if 0.0 + het_threshold < snp_blood < 0.5 - het_threshold:
        return False
    if 0.5 + het_threshold < snp_blood < 1.0 - het_threshold:
        return False
    return True


def evaluate_snp(snp_be, snp_blood, exome=None, apply_dye_correction=True):
    """Run one SNP through the filter cascade.

    Parameters:
        snp_be, snp_blood: B-allele frequencies (floats) for the BE and
            blood columns of the SNP file.
        exome: ``None`` when the position is absent from the exome data,
            otherwise a ``(BE, blood)`` pair of values convertible by
            ``float``.
        apply_dye_correction: whether to dye-bias-correct both frequencies
            first.

    Returns:
        ``(snp_be, snp_blood, ishet, passed)``.  The frequencies are the
        corrected (and possibly allele-flipped) values to write out,
        ``ishet`` is ``"1"`` or ``"0"``, and ``passed`` counts how many
        consecutive filters the SNP passed: 0 = written nowhere,
        1 = diff-filter file only, 2 = also the basic-filters file,
        3 = all three files.
    """
    if apply_dye_correction:
        snp_be = correct_dye_bias(snp_be)
        snp_blood = correct_dye_bias(snp_blood)

    # Call for whether blood is heterozygous or not.
    ishet = "1" if 0.25 < snp_blood < 0.75 else "0"

    # If the two are not significantly different, don't write it anywhere.
    if abs(snp_blood - snp_be) < blood_be_diff_threshold:
        return snp_be, snp_blood, ishet, 0

    # If the *called* blood frequency isn't significantly different from BE,
    # don't write that out either.
    if ishet == "0":
        if (snp_be > 1 - blood_be_diff_threshold
                or snp_be < blood_be_diff_threshold):
            return snp_be, snp_blood, ishet, 0
    elif abs(snp_be - 0.5) < blood_be_diff_threshold:
        return snp_be, snp_blood, ishet, 0

    # If need be, flip which allele is 'B' so that the blood is the lower
    # frequency.
    if snp_blood > snp_be:
        snp_be = 1 - snp_be
        snp_blood = 1 - snp_blood

    # If the blood cannot be called as heterozygous or homozygous, it only
    # goes to the diff-filter file.
    if not blood_is_callable(snp_blood):
        return snp_be, snp_blood, ishet, 1

    # Now try to find the same data in the exome.
    if exome is None:
        return snp_be, snp_blood, ishet, 2
    exome_be = float(exome[0])
    exome_blood = float(exome[1])

    # If the two measures significantly differ, stop at the basic filters.
    # NOTE(review): the SNP values may have been allele-flipped above while
    # the exome values are used as read -- confirm the exome file uses the
    # same allele orientation.
    disagreement = abs(snp_be - exome_be) + abs(snp_blood - exome_blood)
    if disagreement > ex_snp_diff_threshold:
        return snp_be, snp_blood, ishet, 2

    return snp_be, snp_blood, ishet, 3


def format_expands_line(chrom, pos, snp_be, ishet, snp_blood):
    """Return one output row; *pos* is used as both start and end position."""
    fields = [chrom, pos, pos, str(snp_be), ishet, str(snp_blood)]
    return "\t".join(fields) + "\n"


def read_probe_labels(path):
    """Parse the probeset file into ``(labels, rev_labels)``.

    ``labels`` maps probe id -> ``(chr, pos)`` and ``rev_labels`` maps
    ``(chr, pos)`` -> probe id.  Only lines with exactly three
    whitespace-separated fields are used; chromosomes "X" and "Y" are
    renamed "23" and "24".
    """
    labels = {}
    rev_labels = {}
    with open(path, "r") as infile:
        for line in infile:
            fields = line.rstrip().split()
            if len(fields) != 3:
                continue
            probe_id, chrom, pos = fields
            if chrom == "X":
                chrom = "23"
            if chrom == "Y":
                chrom = "24"
            labels[probe_id] = (chrom, pos)
            rev_labels[(chrom, pos)] = probe_id
    return labels, rev_labels


def read_snp_bafs(path, labels):
    """Read a sample's SNP file into ``{(chr, pos): (BE, blood)}``.

    The first three lines of the file are tab-separated rows holding the
    probe ids, the BE values and the blood values respectively.  Columns
    whose probe id is not in *labels* are ignored.  Values are kept as the
    strings found in the file.
    """
    with open(path, "r") as infile:
        indata = infile.readlines()
    labelrow = indata[0].rstrip().split("\t")
    be_row = indata[1].rstrip().split("\t")
    blood_row = indata[2].rstrip().split("\t")

    snps = {}
    for col, probe_id in enumerate(labelrow):
        if probe_id in labels:
            # Indexed access on purpose: a data row shorter than the label
            # row raises instead of silently dropping columns.
            snps[labels[probe_id]] = (be_row[col], blood_row[col])
    return snps


def read_exome_bafs(path):
    """Read a sample's exome file into ``{(chr, pos): (col3, col5)}``.

    Rows are tab-separated; the key is built from columns 0 and 1 and the
    value from columns 3 and 5, all kept as strings.  The header row
    (second column equal to "startpos") and blank lines are skipped.
    """
    exomes = {}
    with open(path, "r") as infile:
        for exline in infile:
            stripped = exline.rstrip()
            if not stripped:
                continue
            fields = stripped.split("\t")
            if fields[1] == "startpos":
                continue
            exomes[(fields[0], fields[1])] = (fields[3], fields[5])
    return exomes


def process_sample(sample, labels, rev_labels):
    """Filter one sample's SNPs and write its three output files."""
    snps = read_snp_bafs("SNP_data_orig/" + sample + "_BAFs", labels)
    exomes = read_exome_bafs(
        "exome_data_reformatted/" + sample + "_full.baf")

    dbcorrstr = "_dbcorr" if dbcorr else ""
    prefix = "SNP_data_filtered/" + sample + dbcorrstr

    # The context manager guarantees all three files are flushed and closed,
    # including when a malformed value raises part-way through.
    with open(prefix + "_diff_filter.txt", "w") as outf_0f, \
            open(prefix + "_basic_filters.txt", "w") as outf_1f, \
            open(prefix + "_basic_and_exome_filters.txt", "w") as outf_2f:
        outputs = (outf_0f, outf_1f, outf_2f)
        for outf in outputs:
            outf.write(OUTPUT_HEADER)

        for position, (raw_be, raw_blood) in snps.items():
            # "?" marks a value that is not available in the SNP file.
            if raw_be == "?" or raw_blood == "?":
                continue
            chrom, pos = position
            snpid = rev_labels[position]
            # Probes whose id starts with "cnv" are never dye-bias corrected.
            correct = dbcorr and not snpid.startswith("cnv")
            snp_be, snp_blood, ishet, passed = evaluate_snp(
                float(raw_be), float(raw_blood),
                exomes.get(position), correct)
            outline = format_expands_line(
                chrom, pos, snp_be, ishet, snp_blood)
            # Each file is a subset of the previous one, so write to the
            # first `passed` files.
            for outf in outputs[:passed]:
                outf.write(outline)


def main():
    """Read the probeset file, then process every sample in ``samples``."""
    labels, rev_labels = read_probe_labels(infilename)
    for sample in samples:
        process_sample(sample, labels, rev_labels)


if __name__ == "__main__":
    main()