-
Notifications
You must be signed in to change notification settings - Fork 2
/
1_auto_split.R
111 lines (103 loc) · 3.63 KB
/
1_auto_split.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
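# Only change `file_name` below: the input path (0_raw/) and the output path
# (1_auto_split/, written at the end of the script) are both built from it.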
library(stringr)
library(tidyverse)
# Input transcript. Edit only this file name; the file itself is looked up
# inside the 0_raw/ folder.
file_name <- "NCCU-TM039-CN-FM.csv"
TM <- paste0("0_raw/", file_name) %>%
  read_csv()
# ... separation
# Split every utterance into segments at each ellipsis marker.
#
# A marker is two or three consecutive dots, optionally preceded by "[" and/or
# followed by "]" (e.g. "..", "...", "[..]", "[...]"). The single-character
# Unicode ellipsis is normalised to "..." before matching. A new segment
# starts at every marker (and at the beginning of the utterance), so each
# marker stays attached to the text that follows it.
#
# Args:
#   d: data frame (or tibble) with the columns Turn, Speaker and Utterance.
#
# Returns:
#   A data.frame with the columns Turn, Speaker and Utterance holding one row
#   per non-empty segment, in input order. Turn and Speaker are copied from
#   the row the segment came from. Rows whose Utterance is NA are passed
#   through unchanged (nothing to split); rows whose Utterance is the empty
#   string yield no output row. Empty input yields a 0-row data.frame.
dot3_sep <- function(d) {
  needed <- c("Turn", "Speaker", "Utterance")
  absent <- setdiff(needed, names(d))
  if (length(absent) > 0) {
    stop(
      "dot3_sep(): missing column(s): ", paste(absent, collapse = ", "),
      call. = FALSE
    )
  }

  marker <- "\\[{0,1}\\.{2,3}\\]{0,1}"

  # Cut one utterance at the start position of every marker.
  split_one <- function(txt) {
    if (is.na(txt)) {
      # Keep the row instead of failing on an NA comparison further down.
      return(NA_character_)
    }
    hits <- gregexpr(marker, txt)[[1]]
    # gregexpr() reports "no match" as -1; keep real positions only.
    cuts <- c(1L, hits[hits > 0L], nchar(txt) + 1L)
    segments <- substring(txt, cuts[-length(cuts)], cuts[-1L] - 1L)
    # A marker at position 1 produces an empty leading segment; drop it.
    segments[nzchar(segments)]
  }

  # "\u2026" is the horizontal-ellipsis character, written as an escape so the
  # script does not depend on the encoding the source file is read with.
  utterances <- gsub("\u2026", "...", as.character(d$Utterance), fixed = TRUE)
  pieces <- lapply(utterances, split_one)

  # Index of the source row for every output segment; built once so the
  # result is assembled in a single step rather than grown row by row.
  src_row <- rep(seq_along(pieces), times = lengths(pieces))

  data.frame(
    Turn = d$Turn[src_row],
    Speaker = d$Speaker[src_row],
    Utterance = as.character(unlist(pieces, use.names = FALSE)),
    stringsAsFactors = FALSE
  )
}
# Run the ellipsis split over the whole transcript.
x <- dot3_sep(TM)
# # change "[[[">"[3"
# a= x %>%
# mutate(Utterance=ifelse(str_detect(Utterance, "\\[{3}"),
# gsub('\\[{3}', '[3', Utterance),Utterance))%>%
# mutate(Utterance=ifelse(str_detect(Utterance, "\\[{2}"),
# gsub('\\[{2}', '[2', Utterance),Utterance))%>%
# mutate(Utterance=ifelse(str_detect(Utterance, "\\]{3}"),
# gsub('\\]{3}', '3]', Utterance),Utterance))%>%
# mutate(Utterance=ifelse(str_detect(Utterance, "\\]{2}"),
# gsub('\\]{2}', '2]', Utterance),Utterance))
# bracket overlap
# brkt_overlap<-function(d,n){ # (data, max number of bracket)
# re_list1=c("\\[\\D","\\[2","\\[3","\\[4","\\[5","\\[6","\\[7","\\[8","\\[9")
# re_list2=c("\\D\\]","2\\]","3\\]","4\\]","5\\]","6\\]","7\\]","8\\]","9\\]")
#
# for (m in (1:n)){
# i=0
# while(i<length((d[,3]))){
# i=i+1
# if(str_detect(d[i,3],re_list1[m])){
# for(j in i:length((d[,3]))){
# if(str_detect(d[j,3],re_list2[m])){
# for(k in (j+1):length((d[,3]))){
# if(str_detect(d[k,3],re_list1[m])){
# for(l in (k):length((d[,3]))){
# if(str_detect(d[l,3],re_list2[m])){
# if ((j+1)<=(k-1)){
# d=rbind(d[1:j, ],
# d[k:l, ],
# d[(j+1):(k-1), ],
# d[(l+1):nrow(d), ])
# i=i+(l-k)+1
# }else{
# d=rbind(d[1:j, ],d[k:l, ],
# d[(l+1):nrow(d), ])
# i=i+1+j-i
# }
# # 1:j + k:l + (j+1:k) + (l+1:n)
#
# break
# }
# }
# break
# }
# }
# break
# }
# }
# }
# }
# }
# return(d)
# }
# o=brkt_overlap(a,3)
# result= a %>%
# mutate(Utterance=ifelse(str_detect(Utterance, "\\[3"),
# gsub('\\[3', '[[[', Utterance),Utterance))%>%
# mutate(Utterance=ifelse(str_detect(Utterance, "\\[2"),
# gsub('\\[2', '[[', Utterance),Utterance))%>%
# mutate(Utterance=ifelse(str_detect(Utterance, "3\\]"),
# gsub('3\\]', ']]]', Utterance),Utterance))%>%
# mutate(Utterance=ifelse(str_detect(Utterance, "2\\]"),
# gsub('2\\]', ']]', Utterance),Utterance))
# Save the split transcript under 1_auto_split/ using the input file's name.
out_path <- paste0("1_auto_split/", file_name)
write_csv(x, out_path)