-
Notifications
You must be signed in to change notification settings - Fork 0
/
tagger.sh
89 lines (80 loc) · 3.1 KB
/
tagger.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
#!/bin/bash
# Command-line client for the tagger web service.
# Usage: tagger.sh METHOD LANG USER PASS < input-text
#   METHOD - tokenize, break, moses, treetagger or xces
#   LANG   - value sent to the service in the "lang" field
#   USER   - HTTP user name handed to wget
#   PASS   - HTTP password handed to wget
METHOD=$1
# NOTE(review): LANG and USER are also standard environment variables (LANG
# selects the locale). If they are already exported, these assignments change
# what every child process sees. The names are kept because the rest of the
# script reads them.
LANG=$2
USER=$3
PASS=$4
# Number of characters that are processed in one request
CHUNK_SIZE=5000
SERVICE_URI=https://www.tilde.com/tagger/Service.asmx
# Create temporary POST data file and make sure it is removed again when the
# script exits (it used to be left behind on every run)
POST_FILE=$(mktemp) || exit 1
trap 'rm -f -- "$POST_FILE"' EXIT
# Options for every wget call. This is a plain string that is word-split where
# it is used, so values containing whitespace are not supported.
# NOTE(review): the credentials end up on wget's command line, where other
# local users can see them in the process list.
WGET_PARAM="-O - --http-user=$USER --http-passwd=$PASS --post-file=$POST_FILE -nv"
# Process text function
# Param/-s - text to process
processText()
{
TEXT="$*"
# Substitute UNIX type line-ends with Windows (The web service processes Windows type line-ends)
# remove CR characters (in case there are some) and convert to DOS line-ends
#TEXT=$(echo "$TEXT" | tr -d '\r' | sed 's/$'"/`echo \\\r`/")
# Ignore empty lines
if [[ -z $TEXT ]]
then
echo "$TEXT"
return
fi
# URLEscape input data & write to POST file
TEXT=`echo "$TEXT" | perl -wple 's/([^\w])/sprintf("%%%02X", ord($1))/eg'`
echo "lang=$LANG&text=$TEXT">$POST_FILE
# Execute Web Service method
case $METHOD in
tokenize)
RESULT=`wget $WGET_PARAM "$SERVICE_URI/Tokenize"`
;;
break)
RESULT=`wget $WGET_PARAM "$SERVICE_URI/BreakSentences"`
;;
moses)
echo "lang=$LANG&outputFormat=moses&text=$TEXT">$POST_FILE
RESULT=`wget $WGET_PARAM "$SERVICE_URI/PosTagger"`
;;
treetagger)
echo "lang=$LANG&outputFormat=treetagger&text=$TEXT">$POST_FILE
RESULT=`wget $WGET_PARAM "$SERVICE_URI/PosTagger"`
;;
xces)
echo "lang=$LANG&outputFormat=xces&text=$TEXT">$POST_FILE
RESULT=`wget $WGET_PARAM "$SERVICE_URI/PosTagger"`
;;
*)
echo "ERROR: Unknown method '$METHOD'!" >&2
;;
esac
# Remove XML markup
RESULT=`echo "$RESULT" | tail --lines=+2 | sed -r 's/<string [^>]*>//; s/<\/string>//' | sed -e 's/\</</g; s/\>/>/g; s/\ / /g; s/\¢/¢/g; s/\£/£/g; s/\¥/?/g; s/\€/<80>/g; s/\§/§/g; s/\©/©/g; s/\®/®/g; s/\™/<99>/g; s/\&/\&/g;'`
# Output result
echo "$RESULT" | tr -d '\r'
}
# Main script
# Reads the text to process from standard input and hands it to processText,
# either in one piece (xces) or in chunks of whole lines (all other methods).
# Globals:   METHOD, CHUNK_SIZE (read)
# Returns:   the status of the last processText call (0 if there was none)
main()
{
  local line chunk

  # Process all text at once when requesting XCES format
  if [[ $METHOD == "xces" ]]; then
    chunk=$(cat)
    # Quoted: unquoted, the text would be glob-expanded and all of its line
    # breaks and repeated blanks would collapse into single spaces.
    processText "$chunk"
    return
  fi

  # For as many lines as the STDIN has: start a chunk with one line and keep
  # adding whole lines until the chunk holds at least CHUNK_SIZE characters.
  # "read ... || [[ -n $line ]]" also accepts a last line that has no new-line
  # character; -r keeps backslashes in the text intact.
  while IFS= read -r line || [[ -n $line ]]; do
    chunk=$line
    while (( ${#chunk} < CHUNK_SIZE )) \
      && { IFS= read -r line || [[ -n $line ]]; }; do
      chunk+=$'\n'"$line"
    done
    processText "$chunk"
  done
}

main "$@"