-
Notifications
You must be signed in to change notification settings - Fork 7
/
encipher.sh
120 lines (96 loc) · 3.58 KB
/
encipher.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
# ---- site-specific paths: edit these for your machine ----
# Root project location.
LOC="/cs/lab-folder/"
# LOC='/local-scratch'
# Project directory underneath LOC.
ROOT="$LOC/username/cipherdaug-nmt"
# Data root; SRC and TGT (given on the command line) select the
# language-pair folder underneath it.
DATAROOT="$ROOT/data/iwslt14"
# Side that is enciphered when -x is not given on the command line.
DEF_SIDE='src'
# Parse command-line options:
#   -s <lang>  source language  -> SRC
#   -t <lang>  target language  -> TGT
#   -x <side>  side to encipher -> SIDE
# For an unknown option or a missing option argument, getopts prints its own
# diagnostic and sets FLAG to '?'. Abort in that case instead of silently
# carrying on with a half-parsed command line.
while getopts "s:t:x:" FLAG; do
  case "${FLAG}" in
    s) SRC="${OPTARG}" ;;
    t) TGT="${OPTARG}" ;;
    x) SIDE="${OPTARG}" ;;
    # f) SPLITT=${OPTARG};;
    *)
      echo "usage: bash encipher.sh -s de -t en -x src" >&2
      exit 1
      ;;
  esac
done
# Validate the parsed options before any work is done.
#   -s (SRC) and -t (TGT) are mandatory: exit with status 1 if either is empty.
#   -x (SIDE) is optional: warn and fall back to DEF_SIDE when it is empty.
# Diagnostics are written to stderr.

# Print the one-line usage example, spelled with the flags getopts accepts.
print_usage() {
  echo "usage: bash encipher.sh -s de -t en -x src" >&2 # -f train"
}

if [[ -z "${SRC:-}" ]]; then
  echo "-s arg must be provided" >&2
  print_usage
  exit 1
fi

if [[ -z "${TGT:-}" ]]; then
  echo "-t arg must be provided" >&2
  print_usage
  exit 1
fi

# if [ ! ${#SPLITT} -gt 0 ]; then
# echo "-f (split) arg must be provided"
# echo "usage: bash encipher.sh -s de -t en -x src -f train"
# exit 1
# fi

if [[ -z "${SIDE:-}" ]]; then
  echo "warning: -x (side) arg not provided: should be either 'src' or 'tgt'" >&2
  print_usage
  echo "default fallback to '${DEF_SIDE}'" >&2
  SIDE="${DEF_SIDE}"
fi
#####################################
###### cipher naming convention #####
#####################################
# for each src lang, create a
# corresponding cipher dir "srcx"
# e.g. src = de -> cipher dir = dex
# in file names, x is replaced by the key
# e.g. de with keys [2,3]
# output --> de2, de3
####### don't change this ###########
## multiling train depends on this ##
#####################################
# Generate the enciphered copies of the source side for every key and split.
#
# Reads:  ROOT, DATAROOT, SRC, TGT, SIDE
# Writes, for each KEY and SPLIT, into ${DATAROOT}/${SRC}x-${TGT}:
#   <split>.<src><key>-<tgt>.<src><key>   enciphered source text
#   <split>.<src><key>-<tgt>.<tgt>        copy of the parallel (target) side
#   <split>.<src><key>-<src>.<src><key>   copy of the enciphered text
#   <split>.<src><key>-<src>.<src>        copy of the plain source text
# A (KEY, SPLIT) pair is skipped when its enciphered file already exists.
# Exits with status 1 on an unsupported/invalid SIDE or on any failed step.
KEYS=(1 2 3 4 5)
SPLITS=("train" "valid" "test")
ENCIPHER="${ROOT}/cipher/encipher.py"

# SIDE does not change inside the loops, so check it once up front. Anything
# other than 'src' is rejected here; otherwise the loop body would run with
# unset output paths.
case "${SIDE}" in
  src)
    echo "-x (side) : 'src'"
    ;;
  tgt)
    echo "-x (side) : 'tgt' not supported yet. Exiting now.." >&2
    exit 1
    ;;
  *)
    echo "-x (side) : '${SIDE}' is invalid: should be either 'src' or 'tgt'" >&2
    exit 1
    ;;
esac

IN_DIR="${DATAROOT}/${SRC}-${TGT}"
SELF_OUT="${DATAROOT}/${SRC}x-${TGT}"
OUT_DIR="${DATAROOT}/${SRC}x-${TGT}"
mkdir -p -- "${SELF_OUT}" "${OUT_DIR}" || exit 1

# Remove the in-progress cipher file, report the failure and stop.
abort_split() {
  if [[ -n "${PARTIAL:-}" ]]; then
    rm -f -- "${PARTIAL}"
  fi
  echo "error: $*" >&2
  exit 1
}

for KEY in "${KEYS[@]}"; do
  for SPLIT in "${SPLITS[@]}"; do
    # infer input and output filenames
    FILE="${SPLIT}.${SRC}-${TGT}.${SRC}"
    CIPHER="${SPLIT}.${SRC}${KEY}-${TGT}.${SRC}${KEY}"
    # the parallel side of input file
    PARL="${SPLIT}.${SRC}-${TGT}.${TGT}"
    COPY_PARL="${SPLIT}.${SRC}${KEY}-${TGT}.${TGT}"
    # self copy [dex - de automatically]
    SELF_SRC="${OUT_DIR}/${CIPHER}"
    SELF_TGT="${IN_DIR}/${FILE}"
    COPY_SELF_SRC="${SPLIT}.${SRC}${KEY}-${SRC}.${SRC}${KEY}"
    COPY_SELF_TGT="${SPLIT}.${SRC}${KEY}-${SRC}.${SRC}"

    if [[ -f "${SELF_SRC}" ]]; then
      echo "Found ${SRC}-${TGT} ${SIDE} - ${KEY} cipher. Not generating!"
      continue
    fi

    echo ""
    echo "Generating ** ${SPLIT} ** ${SRC}-${TGT} ${SIDE} cipher .."
    # The cipher file doubles as the "already generated" marker, so write to
    # a side file first and only move it into place once every step below
    # has succeeded. A failed run then never leaves a marker behind.
    PARTIAL="${SELF_SRC}.partial"
    python "${ENCIPHER}" -i "${IN_DIR}/${FILE}" --keys "${KEY}" \
      --char-dict-path "${IN_DIR}/chardict.train.${SRC}" > "${PARTIAL}" \
      || abort_split "enciphering ${IN_DIR}/${FILE} with key ${KEY} failed"

    echo "Generating real parallel data for cipher .."
    # generate parallel data by copying
    cat "${IN_DIR}/${PARL}" > "${OUT_DIR}/${COPY_PARL}" \
      || abort_split "copying ${IN_DIR}/${PARL} failed"

    echo "Generating self parallel data for cipher .."
    # generate self parallel data by copying
    cat "${PARTIAL}" > "${SELF_OUT}/${COPY_SELF_SRC}" \
      || abort_split "writing ${SELF_OUT}/${COPY_SELF_SRC} failed"
    cat "${SELF_TGT}" > "${SELF_OUT}/${COPY_SELF_TGT}" \
      || abort_split "copying ${SELF_TGT} failed"

    # publish the cipher file last
    mv -f -- "${PARTIAL}" "${SELF_SRC}" \
      || abort_split "moving ${PARTIAL} into place failed"
    echo "Done!"
  done
done

echo
echo "Check dirs:"
echo "${OUT_DIR}"
echo "${SELF_OUT}"