diff --git a/README.md b/README.md new file mode 100644 index 0000000..4ba2d9d --- /dev/null +++ b/README.md @@ -0,0 +1,94 @@ +## Persianp Processing Toolbox + +Persianp is a text processing tool developed in Java to accomplish preprocessing tasks in Persian texts. The toolbox accomplishes the following tasks: +* Character-level normalization +* Tokenization +* Lemmatization +* POS tagging +* Stopword detection +* Noun phrase chunking + +### Using Persianp from the command line +Make sure the 'res' folder is next to the 'jar' file. + +```bash +$ java -cp persianp-toolbox-1.0.jar com.persianp.nlp.process.Process -input inputfile.txt -output outputfile.txt -task (tokenize|tag|lemmatize|taglemmatize) [-nostopword] [-prop propertyFile.properties] +``` + +At the moment NP chunking is not supported from the command line. + +### Using the Persianp API +Add the API to the libraries of your program. The following example shows how to use the toolbox. + +```java +public class TestPersianp { + + public static void main(String[] args) { + TestPersianp testPersianp = new TestPersianp(); + testPersianp.process(); + } + + private void process() { + try { + Properties properties = new Properties(); + properties.load(this.getClass().getClassLoader().getResourceAsStream("persianp.properties")); + Process process = new Process(properties); + InputStream in = this.getClass().getClassLoader().getResourceAsStream("testText.txt"); + BufferedReader br = new BufferedReader(new InputStreamReader(in, "UTF-8")); + String line; + while ((line = br.readLine()) != null) { + process.process(line); + + System.out.println(process.getText()); +// process.getTokens(); +// process.getTokensText(); +// process.getTags(); +// process.getChunkTag(); +// process.getLemmas(); +// process.getNonStopwordTokens(); + + int sentenceSize = process.getSentencesSize(); + for (int j = 0; j < sentenceSize; ++j) { +// List tokensText = process.getTokensTextInSentence(j); +// List tags = process.getTagsInSentence(j); +// List lemmas = 
process.getLemmasInSentence(j); + List tokens = process.getTokensInSentence(j); + for (int k = 0; k < tokens.size(); ++k) { + System.out.println(tokens.get(k).getText() + "\t\t\t" + tokens.get(k).getLemma() + "\t\t\t" + tokens.get(k).getTag()); + } + } + } + in.close(); + br.close(); + } catch (Exception e){ + e.printStackTrace(); + } + } +} + +``` + +### More Information / Citing This Toolbox +Please cite the paper below if you use the Persianp toolbox in your research. It also provides more information about the toolbox. + +> Mahdi Mohseni, Javad Ghofrani, Heshaam Faili +> Persianp: A Persian Text Processing Toolbox +> International Conference on Intelligent Text Processing and Computational Linguistics +> CICLing 2016: Computational Linguistics and Intelligent Text Processing, pp. 75-87 + +BibTeX citation: + +```bibtex +@InProceedings{Persianp2016, +author="Mohseni, Mahdi +and Ghofrani, Javad +and Faili, Heshaam", +title="Persianp: A Persian Text Processing Toolbox", +booktitle="Computational Linguistics and Intelligent Text Processing", +year="2018", +publisher="Springer International Publishing", +pages="75--87", +isbn="978-3-319-75477-2" +} +``` + diff --git a/persianp-public-1.0-SNAPSHOT.jar b/persianp-public-1.0-SNAPSHOT.jar new file mode 100755 index 0000000..0c9dc47 Binary files /dev/null and b/persianp-public-1.0-SNAPSHOT.jar differ diff --git a/res/c.m b/res/c.m new file mode 100755 index 0000000..6705dbb Binary files /dev/null and b/res/c.m differ diff --git a/res/fi.t b/res/fi.t new file mode 100755 index 0000000..865689b Binary files /dev/null and b/res/fi.t differ diff --git a/res/fr.t b/res/fr.t new file mode 100755 index 0000000..7a16bba --- /dev/null +++ b/res/fr.t @@ -0,0 +1,2 @@ + +>Ix!L_V} >YBi.c*O@pX \ No newline at end of file diff --git a/res/l.m b/res/l.m new file mode 100755 index 0000000..54cfaf5 Binary files /dev/null and b/res/l.m differ diff --git a/res/lr.t b/res/lr.t new file mode 100755 index 0000000..648f978 Binary files /dev/null and 
b/res/lr.t differ diff --git a/res/m.t b/res/m.t new file mode 100755 index 0000000..026940a Binary files /dev/null and b/res/m.t differ diff --git a/res/p.m b/res/p.m new file mode 100755 index 0000000..1364859 Binary files /dev/null and b/res/p.m differ diff --git a/res/pn.t b/res/pn.t new file mode 100755 index 0000000..c2bdc76 Binary files /dev/null and b/res/pn.t differ diff --git a/res/rr.t b/res/rr.t new file mode 100755 index 0000000..3fab3f6 --- /dev/null +++ b/res/rr.t @@ -0,0 +1,2 @@ +Xlɛ cs D3xc +I]nvK݃ޤ@$ޣR">kj'<loPa$d.fwZ@5,L D8-bG%8c22^ \ No newline at end of file diff --git a/res/s.t b/res/s.t new file mode 100755 index 0000000..cf13845 Binary files /dev/null and b/res/s.t differ diff --git a/res/v.t b/res/v.t new file mode 100755 index 0000000..af705da Binary files /dev/null and b/res/v.t differ diff --git a/res/vr.t b/res/vr.t new file mode 100755 index 0000000..fc2480f Binary files /dev/null and b/res/vr.t differ diff --git a/res/wl.t b/res/wl.t new file mode 100755 index 0000000..d3a4df4 --- /dev/null +++ b/res/wl.t @@ -0,0 +1 @@ +S5SCB!g,0 '݄Z݆-w \ No newline at end of file diff --git a/res/wt.t b/res/wt.t new file mode 100755 index 0000000..59f0771 Binary files /dev/null and b/res/wt.t differ diff --git a/res/wtl.t b/res/wtl.t new file mode 100755 index 0000000..5f0efea Binary files /dev/null and b/res/wtl.t differ