# fitness_function.R
# Fitness function (to maximize) for tuning LDA: the mean Silhouette
# Coefficient of the documents clustered by their dominant topic(s).
# The coefficient takes values in [-1; 1]; a higher value denotes better
# clustering quality.
#
# Arguments:
#   x  numeric vector of LDA parameters:
#        x[1] = number of topics k (rounded to an integer)
#        x[2] = number of Gibbs iterations (rounded to an integer)
#        x[3] = alpha
#        x[4] = delta (beta)
#
# Reads the document-term matrix from the variable `dtm` found in the
# calling environment chain (it is not passed as an argument).
#
# Returns: a single numeric, the mean per-document silhouette value, where
#   cohesion(i)   = largest distance from i to a document of its own cluster
#   separation(i) = smallest distance from i to a document of another cluster
#   sil(i)        = (separation - cohesion) / max(separation, cohesion)
# Documents that are alone in their cluster, or that have no other cluster
# to be compared against, score 0.
fitness_LDA <- function(x = c()) {
  numero_topic <- round(x[1]) # x[1] = number of topics k
  iteration <- round(x[2])    # x[2] = number of Gibbs iterations
  pAlpha <- x[3]              # x[3] = alpha
  pDelta <- x[4]              # x[4] = delta (beta)

  # Apply LDA to the term-by-document matrix.
  ldm <- LDA(dtm, method = "Gibbs", control = list(
    alpha = pAlpha,
    delta = pDelta,
    iter = iteration,
    verbose = 1,
    seed = 5,
    nstart = 1), k = numero_topic)
  pldm <- posterior(ldm)

  # Document-by-topic probability matrix, labelled with document/topic names.
  doc_topics <- pldm$topics
  dimnames(doc_topics) <- list(dtm$dimnames$Docs, names(terms(ldm)))
  n_docs <- nrow(doc_topics)

  # Pairwise Euclidean distances between documents in the topic space.
  distances <- as.matrix(
    dist(doc_topics, method = "euclidean", diag = TRUE, upper = TRUE)
  )

  # Each document belongs to the cluster of its most probable topic; ties are
  # kept as a combined label such as "1_3".
  labels <- vapply(seq_len(n_docs), function(i) {
    probs <- doc_topics[i, ]
    paste(which(probs == max(probs)), collapse = "_")
  }, character(1))

  # Map labels to consecutive integer ids in order of first appearance.
  # match() is used instead of overwriting labels in place, because a freshly
  # written id (e.g. "1") could equal a not-yet-processed topic label and
  # silently merge two distinct clusters.
  cluster_id <- match(labels, unique(labels))

  # Silhouette value of every document.
  sil <- vapply(seq_len(n_docs), function(i) {
    same <- cluster_id == cluster_id[i]
    # Singleton cluster, or no other cluster at all: the value is 0 (the
    # latter would otherwise give min() of an empty set and a NaN fitness).
    if (sum(same) <= 1 || all(same)) {
      return(0)
    }
    cohesion <- max(distances[same, i])
    separation <- min(distances[!same, i])
    (separation - cohesion) / max(separation, cohesion)
  }, numeric(1))

  mean(sil)
}
# Fit LDA with the given (optimized) parameters and export the result.
#
# Arguments:
#   x  numeric vector of LDA parameters:
#        x[1] = number of topics k (rounded to an integer)
#        x[2] = number of Gibbs iterations (rounded to an integer)
#        x[3] = alpha
#        x[4] = delta (beta)
#
# Reads the document-term matrix from the variable `dtm` found in the
# calling environment chain (it is not passed as an argument).
#
# Side effects: writes the tidied "gamma" and "beta" matrices of the fitted
# model to ./Results/OptimizedLDAGamma.csv and ./Results/OptimizedLDABeta.csv
# (relative to the current working directory), creating ./Results if needed.
#
# Returns: NULL, invisibly.
OptimizedLDA <- function(x) {
  numero_topic <- round(x[1]) # x[1] = number of topics k
  iteration <- round(x[2])    # x[2] = number of Gibbs iterations
  pAlpha <- x[3]              # x[3] = alpha
  pDelta <- x[4]              # x[4] = delta (beta)

  # Make sure the output directory exists before the (potentially long) fit,
  # so the run cannot be lost to a missing folder at write time.
  if (!dir.exists("./Results")) {
    dir.create("./Results", showWarnings = FALSE, recursive = TRUE)
  }

  ldm <- LDA(dtm, method = "Gibbs", control = list(
    alpha = pAlpha,
    delta = pDelta,
    iter = iteration,
    seed = 5,
    nstart = 1), k = numero_topic)

  ap_topics1 <- tidy(ldm, matrix = "gamma")
  ap_topics2 <- tidy(ldm, matrix = "beta")
  write.csv(ap_topics1, file = "./Results/OptimizedLDAGamma.csv")
  write.csv(ap_topics2, file = "./Results/OptimizedLDABeta.csv")
  invisible(NULL)
}