SastrawiJs is a JavaScript package for stemming words in the Indonesian language. It is based on Sastrawi for PHP by Andy Librian.
According to Wikipedia, stemming is the process of reducing inflected (or sometimes derived) words to their word stem, base or root form—generally a written word form. Examples:
- menahan => tahan
- pewarna => warna
For browser/client-side JavaScript:
<script src="stemmer.js"></script>
<script src="tokenizer.js"></script>
For Node.js:
npm install sastrawijs
Then, in your file:
var sastrawi = require("sastrawijs");
Web/client
var sentence =
"Perekonomian Indonesia sedang dalam pertumbuhan yang membanggakan";
var stemmed = [];
var stemmer = new Stemmer();
var tokenizer = new Tokenizer();
words = tokenizer.tokenize(sentence);
for (word of words) {
stemmed.push(stemmer.stem(word));
}
console.log(stemmed);
Node
var sentence =
"Perekonomian Indonesia sedang dalam pertumbuhan yang membanggakan";
var stemmed = [];
var stemmer = new sastrawi.Stemmer();
var tokenizer = new sastrawi.Tokenizer();
words = tokenizer.tokenize(sentence);
for (word of words) {
stemmed.push(stemmer.stem(word));
}
console.log(stemmed);
Besides using the default dictionary, users can supply their own custom dictionary:
var custom = ["hancur", "benar", "apa", "siapa", "jubah",
"baju", "beli", "celana", "hantu", "jual", "buku", "milik", "kulit",
"sakit", "kasih", "buang", "suap", "nilai", "beri", "rambut", "adu",
"suara", "daerah", "ajar", "kerja", "ternak", "asing", "raup", "gerak",
"puruk", "terbang", "lipat", "ringkas", "warna", "yakin", "bangun",
"fitnah", "vonis", "baru", "ajar", "tangkap", "kupas", "minum", "pukul",
"cinta", "dua", "jauh", "ziarah", "nuklir", "gila", "hajar", "qasar",
"udara", "populer", "warna", "yoga", "adil", "rumah", "muka", "labuh",
"tarung", "tebar", "indah", "daya", "untung", "sepuluh", "ekonomi",
"makmur", "telah", "serta", "percaya", "pengaruh", "kritik", "seko",
"sekolah", "tahan", "capa", "capai", "mula", "mulai", "petan", "tani",
"aba", "abai", "balas", "balik", "peran", "medan", "syukur", "syarat",
"bom", "promosi", "proteksi", "prediksi", "kaji", "sembunyi", "langgan",
"laku", "baik", "terang", "iman", "bisik", "taat", "puas", "makan",
"nyala", "nyanyi", "nyata", "nyawa", "rata", "lembut", "ligas",
"budaya", "karya", "ideal", "final", "taat", "tiru", "sepak", "kuasa",
"malaikat", "nikmat", "lewat", "nganga", "allah"];
var stemmer = new Stemmer(custom);
- The Nazief and Adriani algorithm
- Asian, J. 2007. Effective Techniques for Indonesian Text Retrieval. PhD thesis, School of Computer Science and Information Technology, RMIT University, Australia. (PDF and Amazon)
- Arifin, A.Z., I.P.A.K. Mahendra dan H.T. Ciptaningtyas. 2009. Enhanced Confix Stripping Stemmer and Ants Algorithm for Classifying News Document in Indonesian Language, Proceeding of International Conference on Information & Communication Technology and Systems (ICTS). (PDF)
- A. D. Tahitoe, D. Purwitasari. 2010. Implementasi Modifikasi Enhanced Confix Stripping Stemmer Untuk Bahasa Indonesia dengan Metode Corpus Based Stemming, Institut Teknologi Sepuluh Nopember (ITS) – Surabaya, 60111, Indonesia. (PDF)
- Additional stemming rules from Sastrawi contributors.
Sastrawi relies heavily on a root-word dictionary. The dictionary is based on kateglo.com, with some modifications.
Like Sastrawi for PHP, SastrawiJs is released under the MIT license. The kateglo dictionary data is licensed under CC-BY-NC-SA 3.0.
- Sastrawi - PHP
- JSastrawi - Java
- cSastrawi - C
- PySastrawi - Python
- Go-Sastrawi - Go
- Sastrawi-Ruby - Ruby