-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathjson2csv.R
84 lines (72 loc) · 2.23 KB
/
json2csv.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
library(tidyverse)
# Build `result`: a long tibble assembled from every JSON file in data/.
# Each sentence contributes rows that follow its words; sentence-level values
# of length one are recycled across those rows by tibble().

# list.files() interprets `pattern` as a regular expression, so it is anchored
# to the extension; a bare "json" would match any name containing those letters.
files <- list.files("data", pattern = "\\.json$")

# Extract `field` from every element of `items`, substituting NA where the
# field is absent so the flattened vector keeps one entry per element.
extract_or_na <- function(items, field) {
  items %>%
    map(field) %>%
    modify_if(is.null, ~ NA) %>%
    unlist()
}

result <- map_dfr(files, function(json) {
  js <- jsonlite::read_json(str_c("data/", json))

  map_dfr(js$sentences, function(sentence) {
    words <- sentence$words

    # Only the first entry of each word's "ana" list is consulted.
    first_ana <- words %>%
      map("ana") %>%
      map(1)

    alignment <- sentence$src_alignment

    tibble(
      filename = map_chr(alignment, "src"),
      time_start = map_chr(alignment, "off_start_src"),
      time_end = map_chr(alignment, "off_end_src"),
      speaker = sentence$meta$speaker,
      recorded = js$meta$year,
      lang = sentence$lang,
      text = sentence$text,
      word_forms = extract_or_na(words, "wf"),
      morphonology = extract_or_na(first_ana, "parts"),
      gloss = extract_or_na(first_ana, "gloss"),
      language = "abaz1241",
      dataset_creator = "Anastasia Panova, Anna Sorokina, Peter Arkadiev, Elena Sokur",
      dataset_provider = "George Moroz"
    )
  })
})
# Build a lookup that puts the lang-0 and lang-1 versions of a sentence side by
# side. Distinct texts are numbered in order of appearance within each
# (filename, lang) group, and that running number is what lines the two
# languages up after pivoting.
# NOTE(review): this pairing assumes both languages yield the same number of
# distinct texts per file, in the same order -- confirm against the data.
translation_pairs <- result %>%
  distinct(filename, lang, text) %>%
  group_by(filename, lang) %>%
  mutate(sentence_id = row_number()) %>%
  # Drop the grouping before reshaping so the lookup carries no leftover groups.
  ungroup() %>%
  pivot_wider(names_from = lang, values_from = text) %>%
  rename(
    text = `0`,
    translation = `1`
  )
# Export the lang-0 token rows, each enriched with its sentence id and
# translation, to data_oral_abaza_corpus.csv.
result %>%
  # Keep lang-0 rows only and drop tokens that are empty or a bare newline.
  # Rows whose word form is NA are dropped as well, because the comparison
  # yields NA and filter() keeps only TRUE.
  filter(
    lang == 0,
    word_forms != "\n",
    word_forms != ""
  ) %>%
  # Join keys are spelled out: these are the two columns the tables share.
  left_join(translation_pairs, by = c("filename", "text")) %>%
  select(
    filename,
    time_start,
    time_end,
    speaker,
    sentence_id,
    text,
    translation,
    word_forms,
    morphonology,
    gloss,
    language,
    dataset_creator,
    dataset_provider
  ) %>%
  write_csv("data_oral_abaza_corpus.csv")
## RUN encrypt.R