-
Notifications
You must be signed in to change notification settings - Fork 3
/
readsparse.hpp
375 lines (352 loc) · 15.1 KB
/
readsparse.hpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
/* Read and write sparse matrices in text format:
* <label(s)> <column>:<value> <column>:<value> ...
*
* BSD 2-Clause License
* Copyright (c) 2021, David Cortes
* All rights reserved.
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/* Guard against multiple inclusion: the function templates declared in this
header carry default template arguments and default function arguments, which
must not be specified again if the header is included a second time in the
same translation unit. */
#pragma once
#include <iostream>
#include <vector>
#include <stdio.h>
#include <cstdint>
#include <cinttypes>
/* Note: 'size_t' is normally a typedef rather than a macro, so this check is
effectively always true and <stddef.h> is always included. */
#ifndef size_t
# include <stddef.h>
#endif
/* EXPORTABLE marks the public functions for DLL export/import on Windows.
It expands to nothing when building for Python or R, or on non-Windows
platforms. On Windows, define READSPARSE_COMPILE when building the library
itself (dllexport); leave it undefined when consuming it (dllimport). */
#if defined(_FOR_PYTHON) || defined(_FOR_R) || !defined(_WIN32)
#define EXPORTABLE
#else
#ifdef READSPARSE_COMPILE
#define EXPORTABLE __declspec(dllexport)
#else
#define EXPORTABLE __declspec(dllimport)
#endif
#endif
/* Functions for reading and writing sparse CSR matrices in text format.
See SVMLight's webpage for some details about the format:
http://svmlight.joachims.org
Datasets in this format can be downloaded from:
- LibSVM datasets:
https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets
- Extreme Classification Repository:
http://manikvarma.org/downloads/XC/XMLRepository.html
The function templates below will be available for the following types:
- int_t : [int, int64_t, size_t]
- real_t : [float, double]
- label_t : [int, int64_t, size_t, float, double]
Each function has a variant that uses C++ streams and another that uses C file pointers.
The ones with C file pointers are faster, but might be less robust to errors.
Note that the code paths are different and might produce slightly different
results for the same file or same input.
The functions assume the inputs and outputs are a sparse matrix 'X' in CSR format,
and either a vector of labels or a binary CSR matrix of labels 'y'. In the case
of CSR labels, only column indices are used, not values.
In general, a CSR matrix consists of the following structure:
- indptr is an array of size=nrows+1, in which the entry at position
[row] denotes where that row starts in the indices and values
arrays, and the entry at position [row+1] denotes where that
row ends.
- indices is an array of size=nnz containing the non-missing
column indices, with rows delimited by indptr.
- values is an array of size=nnz which has the associated values
for each entry in indices.
Parameters
==========
- input_file
Input source, either as a C++ input stream, or as a C file pointer in non-binary read mode.
- output_file
Output destination, either as a C++ output stream, or as a C file pointer in non-binary write mode.
- indptr
Array/vector of size nrows+1, containing the index pointer for the 'X' matrix.
For the reading functions, this is an output variable and the data that it
contains will be overwritten.
- indices
Array/vector of size nnz, containing the non-missing column indices for the
'X' matrix. For the reading functions, this is an output variable and the
data that it contains will be overwritten.
- values
Array/vector of size nnz, containing the associated values to each entry
in 'indices'. For the reading functions, this is an output variable and the
data that it contains will be overwritten.
- labels
Array/vector of size nrows, containing the 'y' values (labels for each row
in 'X'). For the reading functions, this is an output variable and the
data that it contains will be overwritten.
- indptr_lab
Array/vector of size nrows+1, containing the indptr of the 'y' values.
For the reading functions, this is an output variable and the
data that it contains will be overwritten.
- indices_lab
Array/vector of size nnz_y, containing the indices of the 'y' values.
For the reading functions, this is an output variable and the
data that it contains will be overwritten.
- qid
Optional array/vector of size nrows. This will contain secondary label
information used for ranking under 'qid' tag. Will not be read unless passing
'assume_no_qid=false', and will not be written unless passing 'has_qid=true'.
For the reading functions, this is an output variable and the
data that it contains will be overwritten.
- missing_qid
Value that denotes that a given qid is missing. Missing qids will not
be written into the data (such rows will not have qid:<value>). When
reading the data, missing qids will have value SIZE_MAX for size_t
types and -INT_MAX for other types. Note that in general, other software
does not work with a mixture of missing and non-missing qid.
- missing_label
Value that denotes that a given label is missing, when the labels are
integer types (for numeric types, will be assumed that NAN means missing).
Missing labels will not be written into the data, so those rows will start
with a space. When reading the data, missing labels will have a value of
NAN for numeric types, SIZE_MAX for size_t, and -INT_MAX for other types.
Note that, in general, other software does not work with missing labels.
- has_qid
Whether the input data has qid field. If passing 'has_qid=false', will
not write the qid. See 'assume_no_qid' for the reading functions.
- nrows
Number of rows in the data. When reading the data, this is an output variable.
When writing the data, it is a non-optional input variable.
Note that, on 32-bit systems and lower, this variable will be an unsigned 64-bit
integer, but the number of rows cannot exceed the maximum value for size_t
(SIZE_MAX, which is 2^32-1 on 32-bit systems). The number of columns and classes
can however still be larger.
- ncols
Number of columns in the data. When reading the data, this is an output variable,
and if the data has a header row that suggests a larger number of columns,
this larger number is what will be output. When writing the data, this is
optional and only used when passing 'add_header=true'.
- nclasses
Number of classes in the 'y' data. When reading the data, this is an output variable,
and if the data has a header row that suggests a larger number of classes,
this larger number is what will be output. When writing the data, this is
optional and only used when passing 'add_header=true'.
- ignore_zero_valued
Whether to ignore features that have a value of zero. When writing the data,
a value is heuristically considered to be non-zero if it satisfies
std::abs(x) >= pow(10., -decimal_places)/2., which might not match with how
the libc/libc++ functions round them.
- sort_indices
Whether to sort the column indices in the CSR matrix after reading or before writing.
When writing, the input data will be modified in-place. It is recommended
to pass true as other software might assume that they are sorted.
- text_is_base1
Whether the indices in the text have or should have numeration starting at 1.
The library uses 0-based numeration, so if passing 'text_is_base1=true',
it will subtract 1 from the indices after reading the data
(unless it finds an index of zero or less), and will add +1 to the indices
when writing. Most software assumes the text is base-1.
- assume_no_qid
Whether to assume that the data has no qid field. If passing true and the
data does turn out to have qid, the features will not be read. See
'has_qid' for the writing functions.
- assume_trailing_ws
Whether to assume that lines in the input can have extra whitespaces at the
end before a newline. For large files which do not have any extra whitespace
at the end, parsing with this set to 'false' is typically 1.5x faster,
but if the file does turn out to have e.g. spaces at the end, the result
will be incorrect.
- add_header
Whether to add a header with metadata (number of rows, columns, and classes)
as the first row. The reading functions will automatically detect if the
data has a header as first row.
- decimal_places
Number of decimal places after the point with which to write the numeric
values. This applies also to labels when they are numeric types.
Note that the values will be rounded.
Returns
=======
success
Will return 'true' if the operation completes successfully, 'false' otherwise.
When it fails, it might print an error message to stderr, but will not
be able to tell what failed exactly.
*/
/* Note: the functions here are not really templated, but are rather pre-compiled
for all possible combinations of valid types (see documentation above for valid
types for each template).
The header includes at the end a longer header with the de-templated prototypes,
which is automatically generated.
Thus, do not rely on the default arguments when using this header. These are
only for documentation purposes as suggested values. */
/* 'size_large' is the integer type used below for the row, column and class
counts ('nrows', 'ncols', 'nclasses'). It is 'size_t' when size_t can hold
every uint64_t value, and 'uint64_t' otherwise (e.g. on 32-bit systems), so
the counts are always at least 64 bits wide. */
#if SIZE_MAX >= UINT64_MAX
# define size_large size_t
#else
# define size_large uint64_t
#endif
/* Reads sparse text data with one label per row from a C++ input stream.
Outputs (previous contents are overwritten): the 'X' matrix in CSR format
('indptr', 'indices', 'values'), the per-row 'labels', the optional 'qid',
and the dimensions 'nrows', 'ncols', 'nclasses'.
Returns 'true' if the operation completes successfully, 'false' otherwise.
See the parameter documentation earlier in this header for each argument.
NOTE(review): 'limit_nrows' is not described in that documentation;
presumably it caps the number of rows to read, with 0 meaning no limit -
confirm against the implementation.
The default arguments are suggested values for documentation purposes only;
do not rely on them. */
template <class int_t=int64_t, class real_t=double, class label_t=double>
EXPORTABLE bool read_single_label
(
std::istream &input_file,
std::vector<int_t> &indptr,
std::vector<int_t> &indices,
std::vector<real_t> &values,
std::vector<label_t> &labels,
std::vector<int_t> &qid,
size_large &nrows,
size_large &ncols,
size_large &nclasses,
const size_t limit_nrows = 0,
const bool ignore_zero_valued = true,
const bool sort_indices = true,
const bool text_is_base1 = true,
const bool assume_no_qid = true,
const bool assume_trailing_ws = true
);
/* Reads sparse text data with one label per row from a C file pointer, which
should be opened in non-binary read mode.
Per the documentation earlier in this header, the file-pointer variants are
faster than the stream variants but might be less robust to errors, and
might produce slightly different results for the same input.
Outputs (previous contents are overwritten): the 'X' matrix in CSR format
('indptr', 'indices', 'values'), the per-row 'labels', the optional 'qid',
and the dimensions 'nrows', 'ncols', 'nclasses'.
Returns 'true' if the operation completes successfully, 'false' otherwise.
NOTE(review): 'limit_nrows' is not described in the parameter documentation;
presumably it caps the number of rows to read, with 0 meaning no limit -
confirm against the implementation.
The default arguments are suggested values for documentation purposes only;
do not rely on them. */
template <class int_t=int64_t, class real_t=double, class label_t=double>
EXPORTABLE bool read_single_label
(
FILE *input_file,
std::vector<int_t> &indptr,
std::vector<int_t> &indices,
std::vector<real_t> &values,
std::vector<label_t> &labels,
std::vector<int_t> &qid,
size_large &nrows,
size_large &ncols,
size_large &nclasses,
const size_t limit_nrows = 0,
const bool ignore_zero_valued = true,
const bool sort_indices = true,
const bool text_is_base1 = true,
const bool assume_no_qid = true,
const bool assume_trailing_ws = true
);
/* Reads sparse text data with multiple labels per row from a C++ input stream.
Outputs (previous contents are overwritten): the 'X' matrix in CSR format
('indptr', 'indices', 'values'), the labels 'y' as a binary CSR matrix
('indptr_lab', 'indices_lab' - only column indices, no values), the optional
'qid', and the dimensions 'nrows', 'ncols', 'nclasses'.
Returns 'true' if the operation completes successfully, 'false' otherwise.
See the parameter documentation earlier in this header for each argument.
NOTE(review): 'limit_nrows' is not described in that documentation;
presumably it caps the number of rows to read, with 0 meaning no limit -
confirm against the implementation.
The default arguments are suggested values for documentation purposes only;
do not rely on them. */
template <class int_t=int64_t, class real_t=double>
EXPORTABLE bool read_multi_label
(
std::istream &input_file,
std::vector<int_t> &indptr,
std::vector<int_t> &indices,
std::vector<real_t> &values,
std::vector<int_t> &indptr_lab,
std::vector<int_t> &indices_lab,
std::vector<int_t> &qid,
size_large &nrows,
size_large &ncols,
size_large &nclasses,
const size_t limit_nrows = 0,
const bool ignore_zero_valued = true,
const bool sort_indices = true,
const bool text_is_base1 = true,
const bool assume_no_qid = true,
const bool assume_trailing_ws = true
);
/* Reads sparse text data with multiple labels per row from a C file pointer,
which should be opened in non-binary read mode.
Per the documentation earlier in this header, the file-pointer variants are
faster than the stream variants but might be less robust to errors, and
might produce slightly different results for the same input.
Outputs (previous contents are overwritten): the 'X' matrix in CSR format
('indptr', 'indices', 'values'), the labels 'y' as a binary CSR matrix
('indptr_lab', 'indices_lab' - only column indices, no values), the optional
'qid', and the dimensions 'nrows', 'ncols', 'nclasses'.
Returns 'true' if the operation completes successfully, 'false' otherwise.
NOTE(review): 'limit_nrows' is not described in the parameter documentation;
presumably it caps the number of rows to read, with 0 meaning no limit -
confirm against the implementation.
The default arguments are suggested values for documentation purposes only;
do not rely on them. */
template <class int_t=int64_t, class real_t=double>
EXPORTABLE bool read_multi_label
(
FILE *input_file,
std::vector<int_t> &indptr,
std::vector<int_t> &indices,
std::vector<real_t> &values,
std::vector<int_t> &indptr_lab,
std::vector<int_t> &indices_lab,
std::vector<int_t> &qid,
size_large &nrows,
size_large &ncols,
size_large &nclasses,
const size_t limit_nrows = 0,
const bool ignore_zero_valued = true,
const bool sort_indices = true,
const bool text_is_base1 = true,
const bool assume_no_qid = true,
const bool assume_trailing_ws = true
);
/* Writes a CSR matrix 'X' ('indptr', 'indices', 'values') together with one
label per row ('labels') to a C++ output stream in the sparse text format.
'qid' is only written when 'has_qid' is true; entries equal to 'missing_qid'
and labels equal to 'missing_label' are not written.
'nrows' is required; 'ncols' and 'nclasses' are only used when
'add_header' is true.
When 'sort_indices' is true, the input data is modified in-place, which is
why the array arguments are taken as non-const pointers.
Returns 'true' if the operation completes successfully, 'false' otherwise.
The default arguments are suggested values for documentation purposes only;
do not rely on them. */
template <class int_t=int64_t, class real_t=double, class label_t=double>
EXPORTABLE bool write_single_label
(
std::ostream &output_file,
int_t *indptr,
int_t *indices,
real_t *values,
label_t *labels,
int_t *qid,
const int_t missing_qid,
const label_t missing_label,
const bool has_qid,
const size_large nrows,
const size_large ncols,
const size_large nclasses,
const bool ignore_zero_valued = true,
const bool sort_indices = true,
const bool text_is_base1 = true,
const bool add_header = false,
const int decimal_places = 8
);
/* Writes a CSR matrix 'X' ('indptr', 'indices', 'values') together with one
label per row ('labels') to a C file pointer, which should be opened in
non-binary write mode, in the sparse text format.
Per the documentation earlier in this header, the file-pointer variants are
faster than the stream variants but might be less robust to errors, and
might produce slightly different results for the same input.
'qid' is only written when 'has_qid' is true; entries equal to 'missing_qid'
and labels equal to 'missing_label' are not written.
'nrows' is required; 'ncols' and 'nclasses' are only used when
'add_header' is true.
When 'sort_indices' is true, the input data is modified in-place, which is
why the array arguments are taken as non-const pointers.
Returns 'true' if the operation completes successfully, 'false' otherwise.
The default arguments are suggested values for documentation purposes only;
do not rely on them. */
template <class int_t=int64_t, class real_t=double, class label_t=double>
EXPORTABLE bool write_single_label
(
FILE *output_file,
int_t *indptr,
int_t *indices,
real_t *values,
label_t *labels,
int_t *qid,
const int_t missing_qid,
const label_t missing_label,
const bool has_qid,
const size_large nrows,
const size_large ncols,
const size_large nclasses,
const bool ignore_zero_valued = true,
const bool sort_indices = true,
const bool text_is_base1 = true,
const bool add_header = false,
const int decimal_places = 8
);
/* Writes a CSR matrix 'X' ('indptr', 'indices', 'values') together with
multi-label data given as a binary CSR matrix ('indptr_lab', 'indices_lab' -
only column indices are used) to a C++ output stream in the sparse text format.
'qid' is only written when 'has_qid' is true; entries equal to 'missing_qid'
are not written.
'nrows' is required; 'ncols' and 'nclasses' are only used when
'add_header' is true.
When 'sort_indices' is true, the input data is modified in-place, which is
why the array arguments are taken as non-const pointers.
Returns 'true' if the operation completes successfully, 'false' otherwise.
The default arguments are suggested values for documentation purposes only;
do not rely on them. */
template <class int_t=int64_t, class real_t=double>
EXPORTABLE bool write_multi_label
(
std::ostream &output_file,
int_t *indptr,
int_t *indices,
real_t *values,
int_t *indptr_lab,
int_t *indices_lab,
int_t *qid,
const int_t missing_qid,
const bool has_qid,
const size_large nrows,
const size_large ncols,
const size_large nclasses,
const bool ignore_zero_valued = true,
const bool sort_indices = true,
const bool text_is_base1 = true,
const bool add_header = false,
const int decimal_places = 8
);
/* Writes a CSR matrix 'X' ('indptr', 'indices', 'values') together with
multi-label data given as a binary CSR matrix ('indptr_lab', 'indices_lab' -
only column indices are used) to a C file pointer, which should be opened in
non-binary write mode, in the sparse text format.
Per the documentation earlier in this header, the file-pointer variants are
faster than the stream variants but might be less robust to errors, and
might produce slightly different results for the same input.
'qid' is only written when 'has_qid' is true; entries equal to 'missing_qid'
are not written.
'nrows' is required; 'ncols' and 'nclasses' are only used when
'add_header' is true.
When 'sort_indices' is true, the input data is modified in-place, which is
why the array arguments are taken as non-const pointers.
Returns 'true' if the operation completes successfully, 'false' otherwise.
The default arguments are suggested values for documentation purposes only;
do not rely on them. */
template <class int_t=int64_t, class real_t=double>
EXPORTABLE bool write_multi_label
(
FILE *output_file,
int_t *indptr,
int_t *indices,
real_t *values,
int_t *indptr_lab,
int_t *indices_lab,
int_t *qid,
const int_t missing_qid,
const bool has_qid,
const size_large nrows,
const size_large ncols,
const size_large nclasses,
const bool ignore_zero_valued = true,
const bool sort_indices = true,
const bool text_is_base1 = true,
const bool add_header = false,
const int decimal_places = 8
);
#include "readsparse_detemplated.hpp"