-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathHIGGS_dataset_to_binary.c
127 lines (96 loc) · 2.92 KB
/
HIGGS_dataset_to_binary.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
/* Number of rows written to the training binary file. */
#define TRAIN_SIZE 10500000
/* Number of rows written to the testing binary file. */
#define TEST_SIZE 500000
/* Number of feature values per row; each stored row holds FEATURE_DIM + 1
 * doubles because the class label is stored ahead of the features. */
#define FEATURE_DIM 28
/**
 * Read a comma-separated text dataset into a flat array of doubles.
 *
 * Data is defined as: Class Label (1 for signal or 0 for background) | FEATURES
 *
 * Each row holds FEATURE_DIM + 1 values; the first FEATURE_DIM values of a row
 * are each followed by a comma, and the last one is followed by whitespace
 * (the line break).
 *
 * @param file_path  Path of the text file to parse.
 * @param size       Number of rows to read.
 * @return           Newly allocated array of size * (FEATURE_DIM + 1) doubles in
 *                   row-major order. The caller owns it and must free() it.
 *
 * The process exits with status 1 if the file cannot be opened, memory cannot
 * be allocated, or the file ends early / contains a field that is not a number.
 */
double* read_dataset(char *file_path, int size)
{
    FILE *f = fopen(file_path, "r");
    if(f == NULL) {
        perror("Error");
        exit(1);
    }

    const size_t row_len = FEATURE_DIM + 1;
    const size_t total = size > 0 ? (size_t) size * row_len : 0;

    double *data = malloc(total * sizeof *data);
    if(data == NULL && total > 0) {
        fprintf(stderr, "Error: could not allocate memory for %zu values\n", total);
        fclose(f);
        exit(1);
    }

    /* 255 characters plus the terminating NUL; the scan widths below must
     * stay one less than the buffer size. */
    char value[256];

    for(size_t k = 0; k < total; k++) {
        int matched;
        if(k % row_len == row_len - 1) {
            // Last data value of a row has no comma
            matched = fscanf(f, "%255s", value);
        }
        else {
            // Read in data to value; data separated by a comma
            matched = fscanf(f, "%255[^,],", value);
        }
        if(matched != 1) {
            fprintf(stderr, "Error: %s: input ended or was malformed at value %zu of %zu\n",
                    file_path, k, total);
            free(data);
            fclose(f);
            exit(1);
        }

        /* strtod skips leading whitespace, which covers the line break that
         * the scanset leaves at the front of each row's first field. */
        char *end;
        data[k] = strtod(value, &end);
        if(end == value) {
            fprintf(stderr, "Error: %s: value %zu is not a number\n", file_path, k);
            free(data);
            fclose(f);
            exit(1);
        }
    }

    fclose(f);
    return data;
}
/**
 * Write a dataset to a file as raw doubles.
 *
 * @param file_path  Path of the binary file to create (or overwrite).
 * @param data       Array holding at least size * (FEATURE_DIM + 1) doubles.
 * @param size       Number of rows to write.
 *
 * Prints the number of elements written on success. The process exits with
 * status 1 if the file cannot be opened or not every element could be written.
 */
void write_binary_dataset(char *file_path, double *data, size_t size)
{
    FILE *bf = fopen(file_path, "wb");
    if(bf == NULL) {
        perror("Error");
        exit(1);
    }

    const size_t expected = size * (FEATURE_DIM + 1);
    const size_t elements_written = fwrite(data, sizeof *data, expected, bf);
    if(elements_written != expected) {
        fprintf(stderr, "Error: wrote only %zu of %zu elements to %s\n",
                elements_written, expected, file_path);
        fclose(bf);
        exit(1);
    }

    /* Buffered data is flushed by fclose, so a late write error shows up here. */
    if(fclose(bf) != 0) {
        perror("Error");
        exit(1);
    }

    printf("\n%zu elements written to %s\n", elements_written, file_path);
}
/**
 * Read a dataset of raw doubles from a binary file into a caller-supplied buffer.
 *
 * @param file_path  Path of the binary file to read.
 * @param size       Number of rows to read.
 * @param data       Destination buffer with room for size * (FEATURE_DIM + 1)
 *                   doubles.
 *
 * The process exits with status 1 if the file cannot be opened or holds fewer
 * elements than requested, so the buffer is never left partially filled.
 */
void read_binary_dataset(char *file_path, int size, double *data)
{
    FILE *bf = fopen(file_path, "rb");
    if(bf == NULL) {
        perror("Error");
        exit(1);
    }

    const size_t expected = size > 0 ? (size_t) size * (FEATURE_DIM + 1) : 0;
    size_t elements_read = 0;
    if(expected > 0) {
        /* Rows are contiguous in the file, so one bulk read fills the buffer. */
        elements_read = fread(data, sizeof *data, expected, bf);
    }
    fclose(bf);

    if(elements_read != expected) {
        fprintf(stderr, "Error: read only %zu of %zu elements from %s\n",
                elements_read, expected, file_path);
        exit(1);
    }
}
/**
 * Convert HIGGS.csv into two binary files: HIGGS_train.bin and HIGGS_test.bin.
 *
 * All TRAIN_SIZE + TEST_SIZE rows are parsed into one array. The first
 * TRAIN_SIZE rows are written as the training set and the TEST_SIZE rows that
 * follow them are written as the testing set, so the two files do not overlap.
 */
void generate_HIGGS_binary_datasets()
{
    printf("\nReading in training and testing datasets...\n");
    double *HIGGS_data = read_dataset("HIGGS.csv", TRAIN_SIZE + TEST_SIZE);
    if(HIGGS_data == NULL) {
        fprintf(stderr, "Error: could not load HIGGS.csv into memory\n");
        exit(1);
    }

    /* The testing rows start right after the last training row; they are
     * written straight from the loaded array, so no second buffer is needed. */
    double *HIGGS_test_data = HIGGS_data + (size_t) TRAIN_SIZE * (FEATURE_DIM + 1);

    printf("Converting to binary...\n");
    write_binary_dataset("HIGGS_train.bin", HIGGS_data, TRAIN_SIZE);
    write_binary_dataset("HIGGS_test.bin", HIGGS_test_data, TEST_SIZE);

    free(HIGGS_data);
}
/**
 * Convert the text dataset to binary, then read the binary files back as a
 * check, and report the processor time used (as measured by clock()).
 *
 * @return 0 on success, 1 if the read-back buffers cannot be allocated.
 */
int main(void)
{
    clock_t begin = clock();

    generate_HIGGS_binary_datasets();

    double *HIGGS_train_data = malloc((size_t) TRAIN_SIZE * (FEATURE_DIM + 1) * sizeof *HIGGS_train_data);
    double *HIGGS_test_data = malloc((size_t) TEST_SIZE * (FEATURE_DIM + 1) * sizeof *HIGGS_test_data);
    if(HIGGS_train_data == NULL || HIGGS_test_data == NULL) {
        fprintf(stderr, "Error: could not allocate memory for the read-back buffers\n");
        free(HIGGS_train_data);
        free(HIGGS_test_data);
        return 1;
    }

    printf("\nTest reading in data from binary files...\n");
    read_binary_dataset("HIGGS_train.bin", TRAIN_SIZE, HIGGS_train_data);
    read_binary_dataset("HIGGS_test.bin", TEST_SIZE, HIGGS_test_data);

    clock_t end = clock();

    free(HIGGS_train_data);
    free(HIGGS_test_data);

    printf("\nDone\n\n");
    printf("Total time to read in data, convert to binary and read in data from binary: %.2f secs\n",
           (double) (end - begin) / CLOCKS_PER_SEC);
    return 0;
}