-
Notifications
You must be signed in to change notification settings - Fork 0
/
Synopsis.kt
203 lines (187 loc) · 7.85 KB
/
Synopsis.kt
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
// Written by Jack Abdo
package com.experienceapplications.synopsis
//import java.io.*
import android.util.Log
import java.util.*
import kotlin.math.log
import kotlin.math.round
/*
fun main(args: Array<String>) {
if (args.size == 0) {
println("Please specify a file.")
exitProcess(0)
}
for (i in args) {
val inputarticle = Article(File(i).readText())
println(inputarticle.tagcloud)
println(inputarticle.synopsis+"\n\n\n")
//println(inputarticle.wordscore)
}
}*/
/**
 * Builds a short extractive synopsis and a tag cloud for a piece of text.
 *
 * Construction does all the work: the ignore list is loaded from the
 * `R.raw.ignorelist` raw resource, every remaining word of the text is counted
 * into [wordscore], and the highest scoring sentences are appended to
 * [synopsis] until it reaches a target length derived from the size of the
 * text. Sentences are the pieces obtained by splitting the text on `.`.
 *
 * @param _article the raw text to summarise
 * @throws IllegalStateException if `App.context` has not been set
 */
class Article(_article: String) {
    /** The raw text supplied to the constructor. */
    var article: String = _article

    /** Words that are never counted or scored; one entry per line of the raw resource. */
    private val ignorelist: Set<String> = fillignorelist()

    /** Number of occurrences of each lower-cased word of [article] that is not ignored. */
    var wordscore = HashMap<String, Int>()

    /** The selected sentences, in selection order, each followed by a full stop. */
    var synopsis: String = ""

    /**
     * Top scoring words of the selected sentences. Every selected sentence
     * contributes exactly [TAGS_PER_SENTENCE] entries, padded with empty
     * strings when it has fewer scoring words.
     */
    var tagcloud: List<String> = listOf()

    init {
        wordscore = cleanandcountwords()
        getshortsynopsis()
    }

    /** One candidate sentence together with its score and its best words. */
    private class ScoredSentence(
        val text: String,
        val score: Int,
        val tags: List<String>
    )

    /** Reads the ignore list resource, closing the underlying stream when done. */
    private fun fillignorelist(): Set<String> {
        val context = checkNotNull(App.context) {
            "App.context must be set before an Article is created"
        }
        return context.resources.openRawResource(R.raw.ignorelist)
            .bufferedReader()
            .use { it.readLines() }
            .toSet()
    }

    /**
     * Replaces every character that is not an ASCII letter, digit or space
     * with a space and collapses runs of spaces into a single space.
     * Leading and trailing spaces are not removed.
     */
    private fun removewhitespaceandpunctuation(textstring: String): String =
        SPACE_RUN.replace(NON_ALPHANUMERIC.replace(textstring, " "), " ")

    /** Splits [text] into its non-empty words after stripping punctuation. */
    private fun wordsof(text: String): List<String> =
        removewhitespaceandpunctuation(text).split(" ").filter { it.isNotEmpty() }

    /** Returns the number of words in [textstring]. */
    private fun sentencelength(textstring: String): Int = wordsof(textstring).size

    /**
     * Single place where words are normalised before they are compared or
     * looked up. Uses a fixed locale so the result does not depend on the
     * device language.
     */
    private fun strcln(upperstring: String): String = upperstring.toLowerCase(Locale.ROOT)

    /**
     * Counts how often each word occurs in [purearray], skipping empty
     * strings and words on the ignore list.
     */
    private fun wordfreq(purearray: List<String>): HashMap<String, Int> {
        val counts = HashMap<String, Int>()
        for (word in purearray) {
            if (word.isEmpty() || word in ignorelist) continue
            counts[word] = (counts[word] ?: 0) + 1
        }
        return counts
    }

    /** Normalises [article] and returns the frequency of each of its words. */
    private fun cleanandcountwords(): HashMap<String, Int> =
        wordfreq(wordsof(article).map { strcln(it) })

    /** Splits [purestring] on spaces and returns the pieces in natural string order. */
    private fun sortwords(purestring: String): List<String> = purestring.split(" ").sorted()

    /** Frequency of [word] in the article, or 0 when the word was never counted. */
    private fun scoreOf(word: String): Int = wordscore[strcln(word)] ?: 0

    /**
     * Scores one already cleaned sentence.
     *
     * The score is the sum of the article-wide frequencies of the distinct
     * words of the sentence that are neither ignored nor already present in
     * the synopsis. The tags are the best [TAGS_PER_SENTENCE] of those words.
     *
     * @param text the sentence text to store in the result
     * @param puresentence the sentence with punctuation already removed
     * @param synopsiswords normalised words that are already in the synopsis
     */
    private fun sentencescore(
        text: String,
        puresentence: String,
        synopsiswords: Set<String>
    ): ScoredSentence {
        // Insertion ordered, so that equally scored words keep their order of appearance.
        val fresh = LinkedHashSet<String>()
        for (raw in puresentence.split(" ")) {
            val word = strcln(raw)
            if (word.isEmpty() || word in synopsiswords || word in ignorelist) continue
            fresh.add(word)
        }
        val total = fresh.sumBy { scoreOf(it) }
        // The sort is stable: on a tie the word seen first stays first.
        val best = fresh.sortedByDescending { scoreOf(it) }.take(TAGS_PER_SENTENCE)
        val padding = List(TAGS_PER_SENTENCE - best.size) { "" }
        return ScoredSentence(text, total, best + padding)
    }

    /** Scores every sentence of [article] against the current [synopsis]. */
    private fun countworddensity(): List<ScoredSentence> {
        // Computed once per pass instead of once per word.
        val synopsiswords = wordsof(synopsis).map { strcln(it) }.toSet()
        return article.split(".").map { sentence ->
            sentencescore(sentence + ". ", removewhitespaceandpunctuation(sentence), synopsiswords)
        }
    }

    /**
     * Returns the highest scoring sentence; on a tie the earliest one wins.
     * [sentencevalues] must not be empty.
     */
    private fun gettopsentence(sentencevalues: List<ScoredSentence>): ScoredSentence {
        var best = sentencevalues.first()
        for (candidate in sentencevalues) {
            if (candidate.score > best.score) best = candidate
        }
        return best
    }

    /** Trims [synopsis] and collapses consecutive line breaks into one. */
    private fun formatSynopsis() {
        // NOTE(review): this pattern only matches a line break followed by the
        // literal text "a-z"; kept unchanged, confirm what it was meant to strip.
        val stripped = STRAY_MARKER.replace(synopsis.trim(), "")
        synopsis = BLANK_LINES.replace(stripped, "\n")
    }

    /**
     * Fills [synopsis] and [tagcloud] by repeatedly taking the best sentence
     * until the synopsis is long enough or no sentence adds a new word.
     */
    //TODO: redefine what synopsis considers a sentence
    private fun getshortsynopsis() {
        val wordcount = sentencelength(article)
        // Nothing to summarise; this also keeps log() away from zero.
        if (wordcount == 0) return
        val finallength = round(5 * log(wordcount.toDouble(), 3.0)).toInt()
        while (synopsis.split(" ").size < finallength + 2) {
            val best = gettopsentence(countworddensity())
            // Once something has been picked, a zero score means every remaining
            // sentence would only repeat words, so stop instead of duplicating.
            if (best.score <= 0 && synopsis.isNotEmpty()) break
            synopsis += best.text
            tagcloud += best.tags
        }
        formatSynopsis()
    }

    /**
     * Returns a shortened version of the article. Not implemented yet.
     *
     * @throws NotImplementedError always
     */
    //TODO: Finish this in the future
    //TODO: Compare this with existing synopsis algorithms
    fun longsynopsis() {
        throw NotImplementedError()
    }

    private companion object {
        /** Number of tag cloud entries contributed by each selected sentence. */
        const val TAGS_PER_SENTENCE = 5

        val NON_ALPHANUMERIC = Regex("[^A-Za-z0-9 ]")
        val SPACE_RUN = Regex(" {2,}")
        val STRAY_MARKER = Regex("[\n]a-z")
        val BLANK_LINES = Regex("\n{2,}")
    }
}