// bamsubset.h
// Helpers for subsetting BAM records by whitelisted barcodes.
#ifndef BAMSUBSET_H_
#define BAMSUBSET_H_
#include <seqan/basic.h>
#include <seqan/sequence.h>
#include <seqan/bam_io.h>
#include <fstream>
#include <iostream>
#include <string>
#include <unordered_set>
using namespace seqan;
// Tally of the records seen while subsetting: how many were filtered out and how many passed.
struct Stats
{
    unsigned filteredReads; // number of records counted as filtered
    unsigned passedReads;   // number of records counted as passed

    Stats(): filteredReads(0), passedReads(0){}

    // Print a summary (total, filtered and passed counts with percentages) to std::cout.
    inline void report() const
    {
        // Add the counters in a wider type so the total cannot wrap around.
        const unsigned long long total = static_cast<unsigned long long>(filteredReads) + passedReads;

        // With no records at all there is nothing to divide by; report 0% instead of NaN.
        const double filteredPct = (total != 0) ? static_cast<double>(filteredReads) / total * 100 : 0.0;
        const double passedPct = (total != 0) ? static_cast<double>(passedReads) / total * 100 : 0.0;

        std::cout << "\nSUMMARY" << std::endl;
        std::cout << "Total records:\t\t" << total << std::endl;
        std::cout << "Filtered records:\t" << filteredReads << "\t(" << filteredPct << "%)"
                  << "\nPassed records:\t\t" << passedReads << "\t(" << passedPct << "%)" << std::endl;
    }
};
// Load whitelisted barcodes from a text file into a set of strings.
// Barcodes are read as whitespace-separated tokens and added to wlBarcodes;
// the resulting size of the set is then printed to std::cout.
// Returns false if the file cannot be opened (an error is written to std::cerr)
// or if wlBarcodes is still empty after reading.
// Returns true otherwise.
// Marked inline because it is defined in a header and must be safe to include
// from more than one translation unit.
inline bool readWhitelist(std::unordered_set<std::string> & wlBarcodes, const CharString & bcWlFileName)
{
    std::ifstream wlIn(toCString(bcWlFileName));
    if (!wlIn.is_open())
    {
        std::cerr << "ERROR: Could not open " << bcWlFileName << " for reading.\n";
        return false;
    }

    // operator>> skips whitespace, so one barcode per line and several per line both work.
    std::string barcode;
    while (wlIn >> barcode)
    {
        wlBarcodes.emplace(barcode);
    }

    std::cout << "\n[bcsubset] Loaded " << wlBarcodes.size() << " barcodes from \'" << bcWlFileName << "\'." << std::endl;

    // An empty whitelist cannot match any record, so it is reported as a failure.
    return !wlBarcodes.empty();
}
// Append a @PG (program) record for this run to the BAM header and write the header out.
// The record gets three tags, in this order:
//   ID = "bcsubset"
//   VN = VERSION (defined outside this header)
//   CL = "bcsubset" followed by argv[1], argv[2], ... separated by single spaces
inline void processHeader(BamHeader & header, BamFileOut & bamFileOut, char const ** argv)
{
    // Reconstruct the command line, substituting the fixed program name for argv[0].
    CharString commandLine;
    append(commandLine, "bcsubset");

    // argv is not modified below, so its length only needs to be evaluated once.
    // NOTE(review): presumably length() counts entries up to argv's terminating
    // null pointer -- confirm against the SeqAn pointer adaptor.
    const auto argCount = length(argv);
    for (unsigned j = 1; j < argCount; ++j)
    {
        appendValue(commandLine, ' ');
        append(commandLine, argv[j]);
    }

    // Assemble the @PG record.
    BamHeaderRecord programRecord;
    programRecord.type = BamHeaderRecordType::BAM_HEADER_PROGRAM;
    appendValue(programRecord.tags, Pair<CharString>("ID", "bcsubset"));
    appendValue(programRecord.tags, Pair<CharString>("VN", VERSION));
    appendValue(programRecord.tags, Pair<CharString>("CL", commandLine));

    // Add the record to the header, then emit the complete header.
    appendValue(header, programRecord);
    writeHeader(bamFileOut, header);
}
// Remove the last toTrim characters from barcode, in place.
// If toTrim is greater than or equal to the barcode length, the barcode becomes empty.
inline void trimBarcode(CharString & barcode, const unsigned toTrim)
{
    const auto currentLength = length(barcode);
    // Clamp at zero: subtracting a larger toTrim would wrap around the unsigned
    // length and request an enormous resize.
    resize(barcode, toTrim < currentLength ? currentLength - toTrim : 0);
}
// Get barcode from tags in BAM records
inline bool getBarcodeFromTags(std::string & barcode, const BamAlignmentRecord & record, const CharString & bctag, const unsigned toTrim)
{
unsigned idx = 0;
BamTagsDict tagsDict(record.tags);
bool keyFound = findTagKey(idx, tagsDict, bctag);
if (keyFound)
{
CharString tag;
if (!extractTagValue(tag, tagsDict, idx))
{
std::cerr << "WARNING: There was an error extracting barcode from tag " << bctag << " of record: " << record.qName << "\n";
return false;
}
else
{
trimBarcode(tag, toTrim);
move(barcode, tag);
return true;
}
}
else
{
return false;
}
}
// Decide whether a BAM record should be kept.
// Returns true only if a barcode can be obtained from tag bctag (after trimming
// toTrim trailing characters) and that barcode is present in wlBarcodes.
inline bool isGoodRecord(const BamAlignmentRecord & record, const std::unordered_set<std::string> & wlBarcodes, const CharString & bctag, const unsigned toTrim)
{
    std::string recordBarcode;
    // The whitelist lookup is only evaluated when a barcode was obtained.
    return getBarcodeFromTags(recordBarcode, record, bctag, toTrim)
        && wlBarcodes.count(recordBarcode) != 0;
}
// Read every record from inFile; write those accepted by isGoodRecord to bamFileOut.
// stats.passedReads is incremented for each written record and
// stats.filteredReads for each record that is dropped.
inline void processBam(BamFileIn & inFile, BamFileOut & bamFileOut, const std::unordered_set<std::string> & wlBarcodes, const CharString & bctag, const unsigned toTrim, Stats & stats)
{
    while (!atEnd(inFile))
    {
        // A fresh record object is used for every read.
        BamAlignmentRecord current;
        readRecord(current, inFile);

        if (!isGoodRecord(current, wlBarcodes, bctag, toTrim))
        {
            ++stats.filteredReads;
            continue;
        }

        writeRecord(bamFileOut, current);
        ++stats.passedReads;
    }
}
#endif /* BAMSUBSET_H_ */