-
Notifications
You must be signed in to change notification settings - Fork 20
/
pom.xml
281 lines (274 loc) · 12.7 KB
/
pom.xml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
<!--
  Maven build descriptor for the cc.mallet:PCPLDA artifact.
  The namespace URI below is an identifier and must stay exactly as is;
  only the schema *location* hint uses HTTPS, so tools that download the XSD
  do so over an encrypted connection.
-->
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>
  <!-- Artifact coordinates (groupId:artifactId); the version follows the changelog below. -->
  <groupId>cc.mallet</groupId>
  <artifactId>PCPLDA</artifactId>
<!-- v1.1.9: First public release -->
<!-- v2.0.0: Major API re-design for better subclassing -->
<!-- v2.1.1: Fixed small bug when trying to print top words when there are none. Added simple support for gracefully aborting the sampler -->
<!-- v2.1.2: Target class is now a proper MALLET label in a "target Alphabet". Phi is settable. -->
<!-- v2.2.2: Added API method for setting Phi "safely", which ensures that Alphabets match -->
<!-- v2.3.2: Added API method sampling Z with Phi fixed -->
<!-- v3.0.0: Added API and implementation for setting priors on topics for specific words -->
<!-- v3.1.0: Added API method for extracting the document topic means (z_bar) -->
<!-- v3.1.1: More robust implementation of setPhi and getZbar -->
<!-- v3.1.2: Get stop list and keep_numbers from config file -->
<!-- v3.1.3: Support for saving document topic means -->
<!-- v3.1.4: Cleaner implementation of getBooleanProperty -->
<!-- v3.1.5: Added SimpleTokenizerLarge with a (settable) larger buffer -->
<!-- v3.1.6: Fixed bug in config. Doing trim on null property -->
<!-- v3.1.7: Added ability to save and export Phi means from model -->
<!-- v3.1.8: Added XValidationCreator -->
<!-- v3.1.9: Fixed bug in XValidationCreator which used an all zero phi to sample testset -->
<!-- v3.2.0: Saving Phi means is now default true, memory requirements went up with one extra Phi -->
<!-- v3.2.1: Fork Join Pool should not be static -->
<!-- v3.2.2: XValidationCreator fix -->
<!-- v3.2.3: SimpleTokenizerLarge must override deepClone. -->
<!-- v3.3.3: Major rework of distributed prototype -->
<!-- v3.4.3: Added doInitialLogging method so we can do some logging BEFORE the run starts (facilitates tracing if crash) -->
<!-- v3.4.4: Changed from non-daemon threads to daemon threads -->
<!-- v3.4.5: Changed BACK from daemon threads to non-daemon threads. This should be looked at later -->
<!-- v3.5.0: Added classifier based on Symmetric Kullback Leibler divergence. -->
<!-- v3.5.1: Improved Symmetric Kullback Leibler classifier better defaults, save fold data. -->
<!-- v3.5.2: In KL-Classifier, use theta estimate instead of doc-topic means that can contain 0's. -->
<!-- v3.5.3: In KL-Classifier select on commandline to run multi- or single corpus version. Ensure consistent priors. -->
<!-- v3.5.4: Added ParallelLDATrainTest -->
<!-- v3.5.5: Fixed bug in theta estimate calc. -->
<!-- v3.5.6: Back to using zBar in KLClassifier, since this gives the best prediction accuracy. Don't know why yet. -->
<!-- v4.0.0: Added implementation of LightPCLDA! A Metropolis Hastings version of LDA which easily handles MANY topics -->
<!-- v4.0.1: Updated implementation after derivation update -->
<!-- v4.1.0: Extracted and pushed down implementation of type priors to reduce mem and improve cache in std versions -->
<!-- v4.1.1: Updated variable selection -->
<!-- v4.1.2: Small corrections in types/VSDirichlet.java -->
<!-- v4.1.3: Fixed OBOE introduced in v2.0.0 -->
<!-- v4.2.0: Added (collapsed) Light AD-LDA -->
<!-- v4.2.1: Fixed OBOE introduced in v2.0.0 in LightCollapsed. Improved test suite -->
<!-- v4.2.2: Fixed common seed -->
<!-- v4.3.0: First Collapsed Light version with thread local (AD-LDA) behaviour -->
<!-- v4.4.0: Word, instead of Phi, proposal version of Light PCLDA -->
<!-- v4.4.1: Improved config by moving SubConfig to PCLDA from DOLDA -->
<!-- v4.4.2: Fixed bugs in Light PC-LDA and Light LDA -->
<!-- v4.4.3: Added implementation of getCorpusSize to ADLDA -->
<!-- v4.5.0: Added support for TF-IDF pruning of vocabulary -->
<!-- v4.5.1: Some cleanups and more unit tests on Utils. Added smoke test for LightPCLDAtypeTopicProposal -->
<!-- v4.6.1: Added possibility to configure the "nr_top_words" (default=20) to print and "max_doc_buf_size" (default 10000) to adjust the maximum document size. -->
<!-- v4.6.2: Fix max_doc_size bug, used wrong default value. -->
<!-- v4.6.3: Fixed bug in light collapsed acceptance calc -->
<!-- v4.6.4: Fixed potential bug parallel updates of batchLocalTopicTypeUpdates -->
<!-- v4.6.5: Possible fix for tokenizer bug -->
<!-- v4.6.6: Added config options to produce more stats, doc lengths and term frequencies -->
<!-- v4.7.0: Bumped POM with new Version. This version contains API Changes. Removed getTopWords from LDASampler API. These are now residing in LDAUtils. -->
<!-- v4.7.1: Corrected probability calculations when displaying relevance words -->
<!-- v4.7.2: Added option to output theta estimate -->
<!-- v4.7.3: Word Salience and KR1 word re-weighting working. (It is printed, but there is no option to save it yet.) -->
<!-- v4.7.4: Minor fixes, saves TopWords and RelevanceWords as CSV rather than text file-->
<!-- v5.0.0: Added Polya Urn based Dirichlet sampling-->
<!-- v5.0.1: Use normal approximation when not drawing using alias table -->
<!-- v5.1.0: Added functionality for loading instances directly from a directory rather than from a file -->
<!-- v5.1.1: Bugfix for crash on getTypeTopicCounts from ADLDA model -->
<!-- v5.2.1: Test set + held out LL calc + Topic Model Diagnostics -->
<!-- v5.2.2: Small refactoring -->
<!-- v6.0.0: Hyper parameter optimization support added -->
<!-- v6.0.1: Fixed bug which caused PolyaUrn not to set the correct Phi when called from sampleZGivenPhi. -->
<!-- v6.0.2: Added some convenience facilities for loading instances from string data -->
<!-- v7.0.0: Support for parallel HDP -->
<!-- v7.1.0: Pretty convinced we have a working implementation of the HDP now, and it is efficient -->
<!-- v7.2.0: Some interface cleanups -->
<!-- v7.2.1: Bugfix in Dirichlet normalization -->
<!-- v7.2.2: Reduce sensitivity in Binomial sampler test -->
<!-- v7.2.3: Config updates + disable HDP LL -->
<!-- v7.2.4: Sample Psi from prior -->
<!-- v7.2.5: Small cleanup of sample l. Start loop at nrTopicIndicators == 2 -->
<!-- v7.2.6: Calculate nominator in sample "l" on log scale to minimize risk of underflow -->
<!-- v8.0.0: Separated distributed version into separate repo -->
<!-- v8.0.1: Also calculate p in sample "l" on log scale to minimize risk of underflow -->
<!-- v8.0.2: Bugfix that ensures that PolyaUrn handles zero length documents properly -->
<!-- v8.0.3: Replaced broken retrospective sampler -->
<!-- v8.0.4: Finalized sampling from PPU prior -->
<!-- v8.0.5: Fix phi sampling for Polya Urn -->
<!-- v8.0.6: Minor bug fix in model log likelihood calculation, logging of topic indicators -->
<!-- v8.0.7: Minor bug fix sparse samplers when document is one word long + various cleanup-->
<!-- v8.0.8: Similarity updates-->
<!-- v8.0.9: Likelihood distances working -->
<!-- v8.0.10: Made time critical toString methods use StringBuilder -->
<!-- v8.0.11: Fixed setting subconfig when reading serialized sampler + removed set no topics in parsed config -->
<!-- v8.0.12: Fixed "zero length document" bug in Spalias with priors -->
<!-- v8.1.0: Added Polya Urn with seeding (fixed small bug in Spalias with prior)-->
<!-- v8.1.1: Added 'raw' mode, i.e no pre-processing of input data-->
<!-- v8.2.1: Added 'continue sampling' and 'init from' functionality -->
<!-- v8.2.2: Support 'raw' mode when reading corpus from string -->
<!-- v8.3.0: Support saving sampler from main Java application -->
<!-- v8.4.0: Added support for continued sampling in main, controlled by 'continue' CMD line option -->
<!-- v8.5.0: Support for compressed input files (zip, gz). Improved iteration callback support -->
<!-- v8.5.1: Fixed resource leaks in file loading and 'continued' sampling -->
<!-- v9.0.0: Major cleanups -->
<!-- v9.1.0: Reverted back to default loading using file (instead of streams), fixed various small issues -->
<!-- v9.2.0: Added support for MALLET format in topic indicator logging -->
<!-- v9.2.1: Added support source for MALLET format in topic indicator logging, fixed base_output_dir bug -->
<!-- v9.2.2: Progress bar -->
<!-- Current release; matches the newest entry of the changelog comments above. -->
<version>9.2.2</version>
<name>Partially Collapsed Parallel LDA</name>

<!-- Build-wide properties: source files are treated as UTF-8. -->
<properties>
  <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
</properties>

<!--
  SCM connection for this project.
  NOTE(review): this is a file:// URL into one developer's home directory, so
  it will not resolve on other machines. Confirm whether it should point at
  the shared repository instead.
-->
<scm>
  <connection>scm:git:file:///home/eralljn/workspace/PCLDA</connection>
</scm>
<!--
  Third-party libraries. Declaration order is kept as is because Maven uses
  it to break ties when building the classpath.
  Every dependency is pinned to one exact version so that two builds of the
  same commit resolve the same jars.
-->
<dependencies>
  <dependency>
    <groupId>it.unimi.dsi</groupId>
    <artifactId>fastutil</artifactId>
    <version>7.2.1</version>
  </dependency>
  <dependency>
    <groupId>com.google.guava</groupId>
    <artifactId>guava</artifactId>
    <!--
      Previously the open-ended range [24.1.1,), which silently picked up
      whatever Guava release was newest at build time. Pinned to a fixed
      release above the old lower bound that still supports the Java 8
      target used by this build.
      NOTE(review): confirm the code compiles against this exact release.
    -->
    <version>32.1.3-jre</version>
  </dependency>
  <dependency>
    <groupId>commons-cli</groupId>
    <artifactId>commons-cli</artifactId>
    <version>1.2</version>
  </dependency>
  <dependency>
    <groupId>org.apache.commons</groupId>
    <artifactId>commons-math3</artifactId>
    <version>3.3</version>
  </dependency>
  <dependency>
    <groupId>commons-configuration</groupId>
    <artifactId>commons-configuration</artifactId>
    <version>1.9</version>
  </dependency>
  <dependency>
    <groupId>cc.mallet</groupId>
    <artifactId>mallet</artifactId>
    <version>2.0.7</version>
  </dependency>
  <dependency>
    <groupId>org.apache.commons</groupId>
    <artifactId>commons-pool2</artifactId>
    <version>2.2</version>
  </dependency>
  <dependency>
    <groupId>com.fasterxml.jackson.core</groupId>
    <artifactId>jackson-databind</artifactId>
    <version>2.12.6.1</version>
  </dependency>
  <!-- https://mvnrepository.com/artifact/org.projectlombok/lombok -->
  <dependency>
    <groupId>org.projectlombok</groupId>
    <artifactId>lombok</artifactId>
    <version>1.18.20</version>
    <!-- "provided": on the compile classpath only, not passed on to consumers. -->
    <scope>provided</scope>
  </dependency>
  <!-- https://mvnrepository.com/artifact/org.ejml/ejml-all -->
  <dependency>
    <groupId>org.ejml</groupId>
    <artifactId>ejml-all</artifactId>
    <version>0.41</version>
  </dependency>
  <dependency>
    <groupId>me.tongfei</groupId>
    <artifactId>progressbar</artifactId>
    <version>0.9.4</version>
  </dependency>
</dependencies>
<build>
<sourceDirectory>src/main/java</sourceDirectory>
<plugins>
<!--
  Runs the buildnumber "create" goal during the validate phase. The jar
  plugin configuration in this file reads the resulting ${buildNumber}
  property for the manifest.
-->
<plugin>
  <groupId>org.codehaus.mojo</groupId>
  <artifactId>buildnumber-maven-plugin</artifactId>
  <version>1.2</version>
  <executions>
    <execution>
      <phase>validate</phase>
      <goals>
        <goal>create</goal>
      </goals>
    </execution>
  </executions>
  <configuration>
    <!-- Both switches are off: no local-modification check and no SCM update before building. -->
    <doCheck>false</doCheck>
    <doUpdate>false</doUpdate>
  </configuration>
</plugin>
<!-- Jar packaging: customises the MANIFEST.MF written into the project jar. -->
<plugin>
  <groupId>org.apache.maven.plugins</groupId>
  <artifactId>maven-jar-plugin</artifactId>
  <version>2.3.2</version>
  <configuration>
    <archive>
      <manifest>
        <!-- Emit the standard Implementation-* entries derived from the POM. -->
        <addDefaultImplementationEntries>true</addDefaultImplementationEntries>
      </manifest>
      <manifestEntries>
        <!-- Extra entry carrying the ${buildNumber} property. -->
        <Implementation-Build>${buildNumber}</Implementation-Build>
      </manifestEntries>
    </archive>
  </configuration>
</plugin>
<!--
  Shading: runs the "shade" goal in the package phase to build a single jar
  that bundles the project together with its dependencies.
-->
<plugin>
  <groupId>org.apache.maven.plugins</groupId>
  <artifactId>maven-shade-plugin</artifactId>
  <version>1.6</version>
  <configuration>
    <createDependencyReducedPom>true</createDependencyReducedPom>
    <filters>
      <!--
        For every bundled artifact, leave out the jar signature files.
        Presumably so stale signatures do not invalidate the merged jar.
      -->
      <filter>
        <artifact>*:*</artifact>
        <excludes>
          <exclude>META-INF/*.SF</exclude>
          <exclude>META-INF/*.DSA</exclude>
          <exclude>META-INF/*.RSA</exclude>
        </excludes>
      </filter>
    </filters>
  </configuration>
  <executions>
    <execution>
      <phase>package</phase>
      <goals>
        <goal>shade</goal>
      </goals>
      <configuration>
        <transformers>
          <!-- Handles META-INF/services resources that occur in several jars. -->
          <transformer
              implementation="org.apache.maven.plugins.shade.resource.ServicesResourceTransformer" />
          <!-- Sets the entry point recorded in the shaded jar's manifest. -->
          <transformer
              implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
            <mainClass>cc.mallet.topics.tui.ParallelLDA</mainClass>
          </transformer>
          <!-- Concatenates all reference.conf files instead of keeping only one. -->
          <transformer
              implementation="org.apache.maven.plugins.shade.resource.AppendingTransformer">
            <resource>reference.conf</resource>
          </transformer>
        </transformers>
      </configuration>
    </execution>
  </executions>
</plugin>
<!-- Compiler settings: source and bytecode level are both Java 8. -->
<plugin>
  <!--
    groupId is spelled out to match the other Apache plugin entries in this
    file; Maven assumes this same value when the element is left out, so the
    effective build is unchanged.
  -->
  <groupId>org.apache.maven.plugins</groupId>
  <artifactId>maven-compiler-plugin</artifactId>
  <version>3.1</version>
  <configuration>
    <source>1.8</source>
    <target>1.8</target>
  </configuration>
</plugin>
<!-- Source jar: attaches a jar of the project sources to the build output. -->
<plugin>
  <groupId>org.apache.maven.plugins</groupId>
  <artifactId>maven-source-plugin</artifactId>
  <!--
    Version pinned explicitly. Without it Maven prints a
    "'build.plugins.plugin.version' ... is missing" warning and resolves
    whichever plugin release it finds, which makes builds non-reproducible.
  -->
  <version>3.2.1</version>
  <executions>
    <execution>
      <id>attach-sources</id>
      <goals>
        <goal>jar</goal>
      </goals>
    </execution>
  </executions>
</plugin>
</plugins>
</build>
</project>