-
Notifications
You must be signed in to change notification settings - Fork 1
/
dplyr.r
39 lines (34 loc) · 1.04 KB
/
dplyr.r
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
library(arrow)
library(dplyr)
#' Run the dplyr/arrow aggregation benchmark and report the elapsed time.
#'
#' Reads the first `n_parts` parquet files matching `data_glob`, stacks them
#' into one data frame, derives `sales = price * quantity`, aggregates per
#' `member-id`, writes the summary to `output_path`, and prints the wall-clock
#' time of the whole run (reading, aggregating and writing).
#'
#' @param n_parts Number of matching parquet files to read. Must be a single
#'   whole number >= 1. Defaults to 1.
#' @param data_glob Glob pattern locating the input parquet files.
#' @param output_path Path of the summary parquet file to write.
#' @return The elapsed time as a `difftime`, invisibly (it is also printed).
main <- function(n_parts = 1L,
                 data_glob = "/data/performance-benchmark-data/*.parquet",
                 output_path = "dplyr.parquet") {
  stopifnot(
    is.numeric(n_parts),
    length(n_parts) == 1L,
    !is.na(n_parts),
    n_parts >= 1,
    n_parts %% 1 == 0
  )

  start_time <- Sys.time()

  files <- Sys.glob(data_glob)
  # Fail early with a clear message: indexing past the end of `files` would
  # otherwise hand NA paths to read_parquet(), and silently reading fewer
  # parts than requested would make the reported timing misleading.
  if (length(files) < n_parts) {
    stop(
      "Expected at least ", n_parts, " parquet file(s) matching '",
      data_glob, "' but found ", length(files), ".",
      call. = FALSE
    )
  }

  dataframes <- lapply(files[seq_len(n_parts)], read_parquet)
  dataframe <- bind_rows(dataframes)

  # One output row per member; write_parquet() is the final pipeline step so
  # the time to persist the result is included in the measurement.
  dataframe %>%
    mutate(sales = price * quantity) %>%
    group_by(`member-id`) %>%
    summarise(
      total_spend = sum(sales),
      avg_basket_size = mean(sales),
      avg_price = mean(price),
      n_transactions = n(),
      n_visits = n_distinct(date),
      n_brands = n_distinct(`brand-id`),
      n_styles = n_distinct(`style-id`)
    ) %>%
    write_parquet(output_path)

  end_time <- Sys.time()
  print(end_time - start_time)
}
# Run the benchmark defined above.
main()
# Read the parquet file "dplyr.parquet" back in for inspection.
df <- read_parquet("dplyr.parquet")
print(df)
# NOTE(review): the two calls below are not wrapped in print(), so they only
# show output where top-level results are auto-printed (e.g. Rscript or an
# interactive session), not under a plain source() -- confirm how this
# script is run.
head(df)
colnames(df)
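# Recorded run times.
# NOTE: the first figure in each pair is presumably the run inside Docker,
# given the "Without Docker" label on the second -- confirm.
#
# 1 part
#   Time difference of 7.683979 mins
#   Without Docker
#   Time difference of 7.343145 mins
# 12 parts
#   Time difference of 16.54721 mins
#   Without Docker
#   Time difference of 15.86956 mins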