-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathMakefile
129 lines (106 loc) · 4.74 KB
/
Makefile
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
# TODO: ADD CI OF SOME MINIMAL KIND:
# - Spellcheck
# - markdown lint
# TODO: new make target for smart stats
# For each serial, lookup failure time or max time.
# Find the data file the day before failure/the day of max time
# Pull smart stats from that file
# write out to a csv file per serial
# combine together
# consider as covariates
# TODO: new make target to grab model prices on ebay to compute cost effectiveness of each drive
# Default target
.PHONY: help
# Self-documenting usage listing. Keep this text in sync with the phony
# targets defined at the bottom of this file.
help:
	@echo "Usage: make [target]"
	@echo
	@echo "Available targets:"
	@echo "  help           Display this help message."
	@echo "  all            Download all data, process it, run the analysis, and knit the report."
	@echo "  clean          Remove processed files, but keep downloaded zip files."
	@echo "  install        Install needed R and python packages"
	@echo "  print_files    Print a list of the files that will be processed, for debugging."
	@echo "  download_data  Download the zip files from Backblaze."
	@echo "  unzip_data     Unzip each downloaded zip file and combine it into a single csv file."
	@echo "  combine_data   Combine all csv files into a single csv file."
	@echo "  analyze_data   Run the survival analysis and knit the Readme.md file"
	@echo
	@echo "Run make clean && make -j16 all to download/process the data in parallel."
# Configuration
# Public Backblaze bucket that hosts the drive-stats zip releases.
BASE_URL := https://f001.backblazeb2.com/file/Backblaze-Hard-Drive-Data/
# Most recent release to fetch: quarterly zips stop at END_QUARTER of END_YEAR.
END_YEAR := 2024
END_QUARTER := Q2
# Directories
DOWNLOAD_DIR := zip_data
DATA_DIR := data
RESULTS_DIR := results
# Generate list of yearly data files (2013-2015)
# Early releases are one zip per year: data_2013.zip .. data_2015.zip.
YEARLY_FILES := $(addprefix $(DOWNLOAD_DIR)/data_,$(addsuffix .zip,$(shell seq 2013 2015)))
# Generate list of quarterly data files
# $(call QUARTER_FILES,<year>) expands to that year's zip names:
#   - for END_YEAR: only Q1..END_QUARTER (seq -f "Q%g" builds the quarter names)
#   - for any earlier year: all four quarters
# NOTE: no comments inside the define — they would become part of its value.
define QUARTER_FILES
$(if $(filter $(1),$(END_YEAR)),\
$(addprefix $(DOWNLOAD_DIR)/data_,$(addsuffix _$(1).zip,$(shell seq -f "Q%g" 1 $(subst Q,,$(END_QUARTER))))),\
$(addprefix $(DOWNLOAD_DIR)/data_,$(addsuffix _$(1).zip,Q1 Q2 Q3 Q4)))
endef
QUARTERLY_FILES := $(foreach year,$(shell seq 2016 $(END_YEAR)),$(call QUARTER_FILES,$(year)))
# Static pattern rule for downloading files
ALL_FILES := $(YEARLY_FILES) $(QUARTERLY_FILES)
# Static pattern rule: download each zip in ALL_FILES from the Backblaze bucket.
# -f makes curl exit non-zero on HTTP errors (without it, a 404 error page is
# saved as the "zip" and looks up to date forever); -L follows redirects.
# Downloading to $@.tmp and renaming makes the target atomic, so an interrupted
# or failed download never leaves a truncated zip with a fresh timestamp.
$(ALL_FILES): $(DOWNLOAD_DIR)/%.zip:
	@mkdir -p $(dir $@)
	curl -fL -o $@.tmp $(BASE_URL)$(@F) && mv -f $@.tmp $@
# Pattern rule for processing zip files to CSV
# Only keeps the first 5 columns: date,serial_number,model,capacity_bytes,failure
# Note that for the smart stats, different files have different columns
# So if we want to process smart stats, we need much more complicated logic.
# For this script we just want to analyze failure rates, so we drop the smart stats
# The EXIT trap removes the per-zip temp dir whether the recipe succeeds or fails.
# Takes about 4 minutes with 8 cores
# ...Bad input files: data/Q1_2017/2017-01-30.csv — ???
# ...Bad input files: data/2014/2014-11-02.csv — DST ends lol
# ...Bad input files: data/2015/2015-11-01.csv — DST ends lol
CSV_FILES := $(patsubst $(DOWNLOAD_DIR)/data_%.zip,$(DATA_DIR)/%.csv,$(ALL_FILES))
$(DATA_DIR)/%.csv: $(DOWNLOAD_DIR)/data_%.zip code/unzip_data.R
	@echo "Processing $< ... $$(date '+%Y-%m-%d %H:%M:%S')"
	@TEMP_DIR="$(DATA_DIR)/$*"; \
	trap 'rm -rf "$$TEMP_DIR"' EXIT; \
	mkdir -p "$$TEMP_DIR"; \
	unzip -n -qq -j $< -d "$$TEMP_DIR" -x "__MACOSX/*" "*.DS_Store" 2>/dev/null && \
	Rscript code/unzip_data.R \
		--input "$$TEMP_DIR" \
		--output $@
# Combine every per-period CSV into one drive-level dataset.
# No --output flag is passed, so code/combine_data.R is assumed to write
# results/drive_dates.csv itself — TODO confirm against the script.
# mkdir -p guards against the results directory being absent on a fresh tree.
results/drive_dates.csv: $(CSV_FILES) code/combine_data.R
	@mkdir -p $(@D)
	Rscript code/combine_data.R --input $(CSV_FILES)
# Survival-analysis step. code/survival.R takes no CLI arguments; presumably it
# reads results/drive_dates.csv and writes results/survival.csv itself — TODO
# confirm against the script, since the recipe never names $@ explicitly.
results/survival.csv: results/drive_dates.csv code/survival.R
	Rscript code/survival.R
# Knit the report. rmarkdown::render() writes README.md alongside README.Rmd;
# it re-runs whenever the Rmd, the survival results, or the price table change.
README.md: README.Rmd results/survival.csv prices.csv
	Rscript -e "rmarkdown::render('README.Rmd', 'github_document', clean=TRUE)"
# Only need to do this once to get the models to price check
# Order-only prerequisite (after |): results/survival.csv must EXIST before this
# recipe reads it — without it, `make -j` could run this recipe before the
# survival results are built and fread would fail. Being order-only, a newer
# survival.csv does NOT trigger a rebuild, so the hand-filled price column is
# never clobbered.
prices.csv: | results/survival.csv
	Rscript -e "data.table::fwrite(data.table::fread('results/survival.csv')[surv_5yr_lower > 0.70,][order(-surv_5yr_lower), list(model, price=NA)], 'prices.csv')"
# Define make targets
.PHONY: all
# Each stage alias below resolves to real file targets with full prerequisite
# chains (zips -> csvs -> drive_dates -> survival -> README), so this list is
# safe under -j: make orders the work via the dependency graph, not this line.
all: download_data unzip_data combine_data analyze_data
.PHONY: clean
# Remove everything under data/ and results/ except each directory's
# .gitignore (keeps the directories tracked in git). Downloaded zips in
# zip_data/ are deliberately left alone.
clean:
	find $(DATA_DIR) $(RESULTS_DIR) -mindepth 1 ! -name '.gitignore' -delete
.PHONY: install_r
# Install the CRAN packages used by the processing/analysis scripts.
install_r:
	Rscript -e "install.packages(c('data.table', 'survival', 'survminer', 'collapse', 'stringi', 'argparser', 'pbapply', 'fastmatch', 'knitr'), repos='https://cran.rstudio.com/')"
.PHONY: install_py
# Install Python dependencies from requirements.txt (not visible in this file).
install_py:
	pip3 install -r requirements.txt
.PHONY: install
# Convenience target: install both the R and Python dependencies.
install: install_r install_py
.PHONY: print_files
# Debug helper: print the computed download/CSV file lists.
# printf is used because `echo "\n"` is shell-dependent: dash expands it to a
# newline while bash prints a literal backslash-n. printf '%s\n\n' gives the
# "list plus blank line" output consistently in every POSIX shell.
print_files:
	@printf 'Yearly files: %s\n\n' "$(YEARLY_FILES)"
	@printf 'Quarterly files: %s\n\n' "$(QUARTERLY_FILES)"
	@printf 'CSV files: %s\n\n' "$(CSV_FILES)"
.PHONY: download_data
# Alias: fetch every yearly + quarterly zip from Backblaze.
download_data: $(ALL_FILES)
.PHONY: unzip_data
# Alias: extract each zip and reduce it to one per-period CSV.
unzip_data: $(CSV_FILES)
.PHONY: combine_data
# Alias: merge the per-period CSVs into results/drive_dates.csv.
combine_data: results/drive_dates.csv
.PHONY: analyze_data
# Alias: run the survival analysis and knit the README report.
analyze_data: results/survival.csv README.md