-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcollocs.jsp
230 lines (218 loc) · 8.19 KB
/
collocs.jsp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
<%@ page language="java" contentType="text/html; charset=UTF-8"
pageEncoding="UTF-8"%>
<%@include file="common.jsp"%>
<%
  // ---- Request parameters ----
  // "bibcode" selects a text through the seltext()/text() helpers; "text" is free text pasted
  // in the form. A non-blank pasted text takes precedence, so the bibcode is discarded.
  String bibcode = request.getParameter( "bibcode" );
  String text = request.getParameter( "text" );
  if (text != null && !text.trim().isEmpty()) {
    bibcode = null;
  }

  // Number of words in a collocation key (the form offers 2 to 5).
  int gramwidth = 2;
  String gramwidthParam = request.getParameter( "gramwidth" );
  if (gramwidthParam != null) {
    try {
      gramwidth = Integer.parseInt( gramwidthParam );
    } catch (NumberFormatException ignored) {
      // Not a number: deliberately keep the default instead of failing the page.
    }
  }
  // NOTE(review): an absent/unparsable value yields 2 but an out-of-range value yields 3;
  // kept as in the original, confirm which default is intended.
  if (gramwidth < 2 || gramwidth > 5) {
    gramwidth = 3;
  }

  // Option defaults, used on first display (no "go" parameter in the request).
  boolean stoplist = true;
  boolean np = true;
  boolean locs = true;
  boolean lem = false;
  boolean sent = false;
  boolean reps = false;
  boolean bag = false;

  // The hidden "go" field marks a submitted form: only then are the options read from the
  // request through bool(), so that an unchecked box is not overridden by its default.
  if (request.getParameter( "go" ) != null) {
    stoplist = bool( pageContext, "stoplist" );
    np = bool( pageContext, "np" );
    locs = bool( pageContext, "locs" );
    lem = bool( pageContext, "lem" );
    sent = bool( pageContext, "sent" );
    reps = bool( pageContext, "reps" );
    bag = bool( pageContext, "bag" );
  }
%>
<!DOCTYPE html>
<html>
<head>
<%-- Page title and the stylesheet referenced by this page. --%>
<title>Collocations</title>
<link rel="stylesheet" type="text/css" href="alix.css" />
</head>
<body>
<%-- menu.jsp is pulled in with the include directive, i.e. merged into this page at translation time. --%>
<%@include file="menu.jsp"%>
<article id="article">
<%-- Heading: "." links to the containing directory, "?" links back to this page with an empty query string. --%>
<h1>
<a href=".">Alix</a> : <a href="?">collocations</a> (locutions et
cooccurrences fréquentes)
</h1>
<form
  onsubmit="if (!this.text.value) return true; this.method = 'post'; this.action='?'; "
  method="get" action="">
  <%-- Submission strategy: with an empty textarea the form is sent as GET; when text has been
       pasted, the onsubmit handler switches to POST and resets the action to "?". --%>
  <%-- Choosing a text empties the textarea and submits as GET. The method is set on the form
       ("this" is the select element here, which has no method of its own). --%>
  <select name="bibcode"
    onchange="this.form.text.value = ''; this.form.method = 'get'; this.form.submit()">
<%
  // Options are written by the seltext() helper, given the current bibcode.
  seltext( pageContext, bibcode );
%>
  </select> <label> <select name="gramwidth"
    onchange="this.form.submit()">
<%
  // One option per allowed width (2 to 5); the current width is pre-selected.
  for (int width = 2; width <= 5; width++) {
    String selected = (width == gramwidth) ? " selected=\"selected\"" : "";
    out.println( "<option" + selected + " value=\"" + width + "\">" + width + "</option>" );
  }
  // Attribute appended to every checkbox whose option is currently on.
  String checked = " checked=\"checked\"";
%>
  </select> mots
  </label> <br /> <label> <input name="stoplist" type="checkbox"
    value="1" <%if (stoplist) { out.print( checked ); }%> /> mots vides
  </label> <label> <input name="np" type="checkbox"
    <%if (np) { out.print( checked ); }%> /> noms propres
  </label> <label> <input name="lem" type="checkbox"
    <%if (lem) { out.print( checked ); }%> /> lemmes
  </label> <label> <input name="locs" type="checkbox"
    <%if (locs) { out.print( checked ); }%> /> locutions
  </label> <label> <input name="sent" type="checkbox"
    <%if (sent) { out.print( checked ); }%> /> couper aux phrases
  </label> <label> <input name="reps" type="checkbox"
    <%if (reps) { out.print( checked ); }%> /> répétitions
  </label> <label> <input name="bag" type="checkbox"
    <%if (bag) { out.print( checked ); }%> /> sac de mots
  </label> <input type="hidden" name="go" value="go" />
  <button type="submit">Envoyer</button>
  <br />
  <textarea name="text" style="width: 100%; height: 10em;" cols=""
    rows="">
<%
  // Echo the submitted text back, HTML-escaped: it comes straight from the request, and an
  // unescaped "</textarea>" would let a crafted link inject markup or script into the page.
  // The single newline after the opening tag is dropped by the HTML parser, and the closing
  // tag follows immediately, so no whitespace is added to the value: an untouched textarea
  // stays empty (the onsubmit test relies on that) and the text does not grow on each round trip.
  if (text != null) {
    out.print( text.replace( "&", "&amp;" ).replace( "<", "&lt;" ).replace( ">", "&gt;" ) );
  }
%></textarea>
</form>
<%
  // ---- Collocation extraction ----
  // When a bibcode is still set (no pasted text), the text is fetched through the text() helper.
  // NOTE(review): wordflow, wordmarks and dfppm are used below but not declared in this page;
  // presumably they come from common.jsp — TODO confirm.
  if (bibcode != null) {
    text = text( pageContext, bibcode );
  }
  if (text != null && !text.isEmpty()) {
    DicFreq words = new DicFreq();
    DicPhrase phrases = new DicPhrase();
    // Placeholder entries used to replace every proper name and every number.
    final int NAME = words.add( "NOM" );
    final int NUM = words.add( "NUM" );

    // "Sense level": the stoplist is inserted in the dictionary first, and the highest code
    // it received is remembered; any word whose code is <= senselevel is treated as empty.
    int senselevel = -1;
    if (stoplist) {
      java.io.InputStream stopStream = Lexik.class.getResourceAsStream( "dic/stop.csv" );
      if (stopStream == null) {
        // Fail with an explicit message rather than an anonymous NullPointerException.
        throw new IllegalStateException( "Stoplist resource dic/stop.csv not found (relative to Lexik)" );
      }
      BufferedReader buf = new BufferedReader( new InputStreamReader( stopStream, StandardCharsets.UTF_8 ) );
      try {
        String l;
        while ((l = buf.readLine()) != null) {
          int stopCode = words.add( l.trim() );
          if (stopCode > senselevel) {
            senselevel = stopCode;
          }
        }
      } finally {
        // Always release the resource stream, even if reading fails.
        buf.close();
      }
      // A few more words added to the stoplist.
      for (String w : new String[] { "chère", "dire", "dis", "dit", "jeune", "jeunes", "yeux" }) {
        int stopCode = words.add( w );
        if (stopCode > senselevel) {
          senselevel = stopCode;
        }
      }
    }

    // Only the tokenizing/counting phase is timed and reported.
    long time = System.nanoTime();
    // Rolling window holding the codes of the last significant words: the collocation key.
    IntRoller gram = new IntRoller( 0, gramwidth - 1 );
    int code;
    StringBuilder label = new StringBuilder();
    Occ occ = new Occ(); // pointer on the current occurrence in the tokenizer flow
    Tokenizer toks = new Tokenizer( text );
    int occs = 0;
    while (true) {
      // With "locs" the tokenizer is read through word(), otherwise through token();
      // the loop ends when word() returns null or token() returns false.
      if (locs) {
        occ = toks.word();
        if (occ == null) {
          break;
        }
      } else if (!toks.token( occ )) {
        break;
      }
      // Optionally restart the windows at each sentence punctuation.
      if (sent && occ.tag().equals( Tag.PUNsent )) {
        wordflow.clear();
        gram.clear();
        wordmarks.clear();
        continue;
      }
      if (occ.tag().isPun()) {
        continue; // punctuation is neither recorded nor counted
      }
      occs++;
      if (occ.tag().isNum()) {
        code = NUM; // simplify numbers
      } else if (np && occ.tag().isName()) {
        code = NAME; // simplify names
      } else if (!lem) {
        code = words.add( occ.orth() ); // no lemmatization
      } else if (occ.tag().isVerb() || occ.tag().isAdj() || occ.tag().isSub()) {
        code = words.add( occ.lem() ); // lemma for verbs, adjectives and substantives only
      } else {
        code = words.add( occ.orth() );
      }
      // Optionally restart the windows when the same code comes again, to avoid repetitions
      // such as « Voulez vous sortir, grand pied de grue, grand pied de grue, grand pied de grue ».
      if (reps && code == wordflow.first()) {
        wordflow.clear();
        gram.clear();
        wordmarks.clear();
        continue;
      }
      wordflow.push( code ); // add this token to the word flow
      wordmarks.dec(); // decrement positions of the recorded plain words
      if (wordflow.get( 0 ) <= senselevel) {
        continue; // do not record empty words
      }
      wordmarks.push( 0 ); // record the position of a new full word
      gram.push( wordflow.get( 0 ) ); // store a significant word in the collocation key
      if (gram.get( 0 ) == 0) {
        continue; // the collocation key is not complete yet
      }
      // Count this collocation; on first sight, build and attach its display label.
      int count = phrases.inc( gram );
      if (count == 1) {
        label.setLength( 0 );
        for (int i = wordmarks.get( 0 ); i <= 0; i++) {
          label.append( words.label( wordflow.get( i ) ) );
          // Words are separated by a space, except after the last one and after an apostrophe.
          boolean afterApos = label.length() > 1 && label.charAt( label.length() - 1 ) == '\'';
          if (i != 0 && !afterApos) {
            label.append( ' ' );
          }
        }
        phrases.label( gram, label.toString() );
      }
    }
    out.print( "<p>" + dfppm.format( occs ) + " occurrences, " + dfppm.format( phrases.occs() ) + " collocations, "
        + dfppm.format( phrases.size() ) + " différentes, en " + ((System.nanoTime() - time) / 1000000)
        + " ms.</p>\n" );
    phrases.html( out, 200, words );
  }
%>
</article>
</body>
</html>