-
Notifications
You must be signed in to change notification settings - Fork 2
/
sgclient.h
234 lines (194 loc) · 8.56 KB
/
sgclient.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
/*
* Copyright (C) 2015 by Glenn Hickey (hickey@soe.ucsc.edu)
*
* Released under the MIT license, see LICENSE.cactus
*/
#ifndef _SGCLIENT_H
#define _SGCLIENT_H
#include <cassert>
#include <limits>
#include <map>
#include <sstream>
#include <stdexcept>
#include <string>
#include <vector>
#include "sidegraph.h"
#include "sgsegment.h"
#include "download.h"
/**
All logic for reading side graph data from the GA4GH client,
namely Sequences, Joins and AllelePaths. Uses libcurl,
which is left as a dependency (maybe move to a submodule?) for now.
The design is that everything gets downloaded into a single SideGraph
object (_sg), which SGClient is responsible for deleting.
*/
class SGClient
{
public:
/** Default value of the pageSize argument of the paged download methods */
static const int DefaultPageSize;
SGClient();
~SGClient();
/** Free up all memory stored in the side graph */
void erase();
/** Set the URL to be used by all the other methods */
void setURL(const std::string& baseURL);
/** Print a few messages to this stream, if specified */
void setOS(std::ostream* os);
/** Set the page size for POST requests */
void setPageSize(int pageSize);
/** Toggle whether paths are downloaded */
void setSkipPaths(bool skipPaths);
/** Download a whole Side Graph into memory. Topology gets stored
* internally in the (returned) SideGraph; paths and bases get stored in
* the given vectors */
const SideGraph* downloadGraph(std::vector<std::string>& outBases,
std::vector<SGNamedPath>& outPaths);
/** Download sequences into the Side Graph. Returns the Next Page Token.
* Call after downloadReferences(). In order to get sequence names,
* pass nameIdMap as downloaded by downloadReferences(). outBases
* supports the newer interface that downloads bases together with
* sequences; it is optional. */
int downloadSequences(std::vector<const SGSequence*>& outSequences,
std::vector<std::string>* outBases = NULL,
const std::map<int, std::string>* nameIdMap = NULL,
int pageToken = 0,
int pageSize = DefaultPageSize,
int referenceSetID = -1,
int variantSetID = -1);
/** Download reference ids and build the sequence id -> reference name
* map, ignoring everything else. Returns the Next Page Token. */
int downloadReferences(std::map<int, std::string>& outIdMap,
int pageToken = 0,
int pageSize = DefaultPageSize,
int referenceSetID = -1);
/** Download the DNA bases for a given sequence. Note the ID here is
* the mapped ID (i.e. the one used by the SideGraph class) */
int downloadBases(sg_int_t sgSeqID, std::string& outBases, int start = 0,
int end = -1);
/** Download joins into the Side Graph. Returns the Next Page Token.
* Sequences must be downloaded first! Sequence IDs in joins are
* automatically mapped to in-memory Side Graph ids. */
int downloadJoins(std::vector<const SGJoin*>& outJoins,
int pageToken = 0,
int pageSize = DefaultPageSize,
int referenceSetID = -1,
int variantSetID = -1);
/** Download alleles (only saving named paths for now) */
int downloadAllelePaths(std::vector<SGNamedPath>& outPaths,
int pageToken = 0,
int pageSize = DefaultPageSize,
int sequenceID = -1,
const std::vector<int>* variantSetIDs = NULL,
int start = 0,
int end = std::numeric_limits<int>::max());
/** Download a single allele path. Returns -1 if the path is not found */
int downloadAllele(int alleleID, std::vector<SGSegment>& outPath,
int& outVariantSetID, std::string& outName);
/** The SideGraph class, as currently implemented, only works with
* sequences with ids in [0, n), and it happily changes input ids
* to enforce this. We therefore keep a little map to get back the
* ID of the graph server sequence, given a side graph sequence.
* The id must have been registered with addSeqIDMapping(). */
sg_int_t getOriginalSeqID(sg_int_t sgID) const;
/** Other direction: original (graph server) id -> Side Graph id.
* Returns -1 if not found. Note that the argument is the ORIGINAL id,
* even though the parameter is named sgID in this declaration. */
sg_int_t getSGSeqID(sg_int_t sgID) const;
/** Apply the mapping (original->sg) to both sides of a join */
void mapSeqIDsInJoin(SGJoin& join) const;
/** Apply the mapping (original->sg) to every segment in a path */
void mapSeqIDsInPath(std::vector<SGSegment>& path) const;
/** Add a mapping between an original id and a Side Graph id
* (recorded in both directions) */
void addSeqIDMapping(sg_int_t originalID, sg_int_t sgID);
/** Get access to the Side Graph that's been downloaded so far */
const SideGraph* getSideGraph() const;
protected:
/** Make sure the input join connects to positions that exist */
void verifyInJoin(const SGJoin& joine) const;
/** Make sure every input segment spans a range that exists */
void verifyInPath(int alleleID, const std::vector<SGSegment>& path) const;
/** Build the JSON string for sequence download options */
std::string getSequencePostOptions(int pageToken,
int pageSize,
int referenceSetID,
int variantSetID,
bool getBases) const;
/** Build the JSON string for reference download options */
std::string getReferencePostOptions(int pageToken,
int pageSize,
int referenceSetID,
const std::vector<int>& seqIDs,
const std::vector<std::string>& md5s,
const std::vector<std::string>& accs,
const std::vector<std::string>& rnames)
const;
/** Build the JSON string for join download options */
std::string getJoinPostOptions(int pageToken,
int pageSize,
int referenceSetID,
int variantSetID) const;
/** Build the JSON string for allele download options */
std::string getAllelePostOptions(int pageToken,
int pageSize,
int sequenceID,
const std::vector<int>* variantSetIDs,
int start,
int end) const;
/** Stream that logging messages are printed to */
std::ostream& os();
// NOTE(review): presumably the content-type header sent with requests;
// confirm against the definition in the .cpp
static const std::string CTHeader;
// Side Graph that downloaded data is stored in (see getSideGraph())
SideGraph* _sg;
std::string _url;
Download _download;
// Workaround. TODO: fix SideGraph and lookup to let sequences
// have arbitrary ids.
// _toOrigSeqId:   Side Graph id -> original id
// _fromOrigSeqId: original id   -> Side Graph id
std::map<sg_int_t, sg_int_t> _toOrigSeqId;
std::map<sg_int_t, sg_int_t> _fromOrigSeqId;
std::ostream* _os;
// NOTE(review): presumably the sink returned by os() when no output
// stream has been set; confirm against the .cpp
std::stringstream _ignore;
int _pageSize;
bool _skipPaths;
};
/** Map a Side Graph sequence id back to the original (graph server) id.
 *
 * @param sgID  Side Graph id previously registered with addSeqIDMapping()
 * @return      the original id that sgID was registered with
 * @throws std::runtime_error if sgID has no mapping (release builds);
 *         debug builds still trip the assertion first, as before.
 */
inline sg_int_t SGClient::getOriginalSeqID(sg_int_t sgID) const
{
  // Single lookup: the iterator is reused for both the check and the result.
  std::map<sg_int_t, sg_int_t>::const_iterator i = _toOrigSeqId.find(sgID);
  assert(i != _toOrigSeqId.end());
  if (i == _toOrigSeqId.end())
  {
    // With NDEBUG the assert above vanishes; dereferencing end() would be
    // undefined behaviour, so fail loudly instead.
    std::stringstream msg;
    msg << "SGClient::getOriginalSeqID: no mapping for Side Graph "
        << "sequence id " << sgID;
    throw std::runtime_error(msg.str());
  }
  return i->second;
}
/** Map an original (graph server) sequence id to its Side Graph id.
 * Returns -1 when no mapping has been registered for origID. */
inline sg_int_t SGClient::getSGSeqID(sg_int_t origID) const
{
  typedef std::map<sg_int_t, sg_int_t>::const_iterator IdMapIt;
  const IdMapIt hit = _fromOrigSeqId.find(origID);
  if (hit != _fromOrigSeqId.end())
  {
    return hit->second;
  }
  // unknown original id
  return -1;
}
/** Rewrite both sides of a join so that their sequence ids are Side Graph
 * ids instead of original ids. Position and orientation are kept as-is.
 * A sequence id with no mapping becomes whatever getSGSeqID() returns
 * for it (-1). */
inline void SGClient::mapSeqIDsInJoin(SGJoin& join) const
{
  // The join has no in-place setter for the sequence id, so each side is
  // rebuilt from scratch with the translated id.
  const SGSide& oldSide1 = join.getSide1();
  const SGPosition newBase1(getSGSeqID(oldSide1.getBase().getSeqID()),
                            oldSide1.getBase().getPos());
  join.setSide1(SGSide(newBase1, oldSide1.getForward()));

  const SGSide& oldSide2 = join.getSide2();
  const SGPosition newBase2(getSGSeqID(oldSide2.getBase().getSeqID()),
                            oldSide2.getBase().getPos());
  join.setSide2(SGSide(newBase2, oldSide2.getForward()));
}
inline void SGClient::mapSeqIDsInPath(std::vector<SGSegment>& path) const
{
for (int i = 0; i < path.size(); ++i)
{
// man, that crappy SideGraph write interface is coming to bite me.
path[i].setSide(SGSide(SGPosition(
getSGSeqID(path[i].getSide().getBase().getSeqID()),
path[i].getSide().getBase().getPos()),
path[i].getSide().getForward()));
}
}
/** Register a mapping between an original (graph server) sequence id and
 * its Side Graph id, in both directions. Existing entries with the same
 * key are left untouched (std::map::insert does not overwrite). */
inline void SGClient::addSeqIDMapping(sg_int_t originalID, sg_int_t sgID)
{
  typedef std::map<sg_int_t, sg_int_t>::value_type IdPair;
  // Side Graph id -> original id
  _toOrigSeqId.insert(IdPair(sgID, originalID));
  // original id -> Side Graph id
  _fromOrigSeqId.insert(IdPair(originalID, sgID));
}
/** Read-only access to the Side Graph held by this client (the _sg
 * member). */
inline const SideGraph* SGClient::getSideGraph() const
{
  const SideGraph* const graph = _sg;
  return graph;
}
#endif