#!/bin/bash
# usage : bible.sh $languages $data_type
# data_type : mono or para or mono,para
# Download and preprocess the Bible data.
# Transform (tokenize, lowercase and remove accents, load codes and vocab, learn and apply
# BPE tokenization, binarize...) the data contained in the text files into pth files
# understandable by the framework.
# Note: processing time grows with the dataset size, nCodes and shuf_n_samples.
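# Example invocation (a sketch; the language names are illustrative and must match the
# column names of your csvs):
#   chmod +x bible.sh
#   ./bible.sh Francais,Anglais,MKPAMAN_AMVOE_Ewondo mono,para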
set -e
# languages
lgs=$1
# path containing the csvs folder
# zip_file_link (csvs) = https://drive.google.com/file/d/1NuSJ-NT_BsU1qopLu6avq6SzUEf6nVkk/view?usp=sharing
# download and unzip in $csv_path
csv_path=/content
# where to store the txt files
output_dir=/content/data
# path where processed files will be stored
OUTPATH=/content/processed
# If parallel data is available and you need to preprocess it
PARA=True
# If you want to process monolingual data (if the monolingual data is unavailable and you
# leave this parameter set to True, the parallel data will be used to build the monolingual data)
MONO=True
# folder containing the parallel data
PARA_PATH=$output_dir
# folder containing the monolingual data
MONO_PATH=$output_dir
# whether all languages should share the same vocabulary (leave to True)
SAME_VOCAB=True
# When independent monolingual and parallel data are available, the following parameter
# controls whether the parallel data is also added to the monolingual data (False here).
add_para_data_to_mono_data=False
# Learn nCodes BPE codes on the training data
nCodes=20000
# Generate shuf_n_samples random permutations of the training data to learn BPE
shuf_n_samples=10000
# Preferably a power of two
threads_for_tokenizer=16
# Percentage of data to use as test data (%)
test_size=10
# Percentage of data to use as validation data (%)
val_size=10
# tools paths
TOOLS_PATH=tools
TOKENIZE=$TOOLS_PATH/tokenizer_our.sh
LOWER_REMOVE_ACCENT=$TOOLS_PATH/lowercase_and_remove_accent.py
FASTBPE=$TOOLS_PATH/fastBPE/fast
#PROCESSED_FILE=../scripts/build_meta_data_multixlm.sh
# The n_samples parameter is optional; when it is not passed or exceeds the dataset size, the whole dataset is used
n_samples=-1
# If you don't have any other data on which to fine-tune your model for a specific sub-task,
# specify the percentage of the sub-task metadata to consider, or -1 to ignore it.
#sub_tasks=en-fr:10,de-en:-1,de-fr:-1
# If you want the sub-tasks to be built from the pairwise combinations of your languages, set the three dots
sub_tasks=...
tasks_n_samples=-1
##############################################
function abrev() {
    if [[ $1 = "Francais" ]]; then
        result="fr"
    elif [[ $1 = "Anglais" ]]; then
        result="en"
    elif [[ $1 = "KALATA_KO_SC_Gbaya" ]] || [[ $1 = "KALATA_KO_DC_Gbaya" ]]; then
        result="Gbay"
    elif [[ $1 = "BIBALDA_TA_PELDETTA" ]]; then
        result="MASS"
    elif [[ $1 = "MKPAMAN_AMVOE_Ewondo" ]]; then
        result="Ewon"
    else
        length=${#1}
        if [[ $length -le 4 ]]; then
            result=$1
        else
            # keep only the first four characters
            result=$(echo "$1" | cut -c1-4)
        fi
    fi
}
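# Illustration of abrev (not run by the script): known corpora map to fixed abbreviations,
# anything else is truncated to its first four characters, e.g.
#   abrev "Francais"             # result="fr"
#   abrev "MKPAMAN_AMVOE_Ewondo" # result="Ewon"
#   abrev "Guidar"               # result="Guid" (hypothetical name, generic truncation)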
if [ "$sub_tasks" = "..." ]; then
    sub_tasks=""
    IFS=', ' read -r -a langs_array <<< "$lgs"
    # TODO: sort the array in alphabetical order
    array_length=${#langs_array[*]}
    for (( i=0; i<$array_length; ++i )); do
        for (( j=$(($i+1)); j<$array_length; ++j )); do
            abrev "${langs_array[$i]}"
            a=$result
            abrev "${langs_array[$j]}"
            b=$result
            sub_tasks=$sub_tasks,$a-$b:$tasks_n_samples
        done
    done
    # Remove the leading comma
    sub_tasks=$(echo $sub_tasks | cut -c2-)
fi
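# e.g. with lgs="Francais,Anglais,MKPAMAN_AMVOE_Ewondo" and tasks_n_samples=-1, the loop
# above yields sub_tasks="fr-en:-1,fr-Ewon:-1,en-Ewon:-1"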
echo $sub_tasks
# create output path
mkdir -p $OUTPATH
# avoid permission error
chmod +x $FASTBPE
chmod +x $TOOLS_PATH/mosesdecoder/scripts/tokenizer/*.perl
echo "======================="
echo "Extract texts files"
echo "======================="
for data_type in $(echo ${2-'mono,para'} | sed -e 's/\,/ /g'); do
python ../scripts/bible.py --csv_path $csv_path --output_dir $output_dir --data_type $data_type --languages $lgs
done
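# Note: ${2-'mono,para'} falls back to "mono,para" when no second argument is given, and the
# sed call replaces the comma with a space, so bible.py runs once per data type.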
echo "======================="
echo "Processed"
echo "======================="
chmod +x ../scripts/build_meta_data_multixlm.sh
. ../scripts/build_meta_data_multixlm.sh $sub_tasks $n_samples $add_para_data_to_mono_data
# TODO: make this dynamic, like so:
#chmod +x $PROCESSED_FILE
#$PROCESSED_FILE
echo "======================="
echo "End"
echo "======================="