forked from theodos/biotextquest_v2
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtop2vecTest.R
143 lines (106 loc) · 4.85 KB
/
top2vecTest.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
# Private helper: pull the text of every <Id> element out of raw esearch XML.
# `xmlLines` is the character vector returned by readLines(); lines that do not
# start with an (optionally tab-indented) <Id> tag are ignored.
.extract_pubmed_ids <- function(xmlLines) {
  idLines <- xmlLines[grepl("^(\\t){0,1}<Id>", xmlLines)]
  ids <- lapply(idLines, function(idLine) {
    as.character(custom_grep(idLine, tag = "Id", format = "char")[1])
  })
  as.character(unlist(ids))
}

# Private helper: build the SELECT statement against EntityWords for the given
# PMIDs and entity-type filter. The marker "0>" in `entities` means
# "entityType>0"; every other element must be numeric because it is pasted
# into the SQL text verbatim.
.entity_words_query <- function(pmids, entities) {
  query <- paste("SELECT * FROM EntityWords WHERE pmid IN (", toString(pmids), ")")
  if (length(entities) == 0) {
    return(query)
  }
  wantsPositive <- "0>" %in% entities
  explicitTypes <- entities[!(entities %in% "0>")]
  isNumber <- is.finite(suppressWarnings(as.numeric(explicitTypes)))
  if (!all(isNumber)) {
    stop("Entity types must be numeric or \"0>\"; got: ",
         toString(explicitTypes[!isNumber]), call. = FALSE)
  }
  if (wantsPositive && length(explicitTypes) > 0) {
    typeFilter <- paste0("(entityType>0 OR entityType IN ( ",
                         toString(explicitTypes), " ))")
  } else if (wantsPositive) {
    typeFilter <- "entityType>0"
  } else {
    typeFilter <- paste0("entityType IN ( ", toString(explicitTypes), " )")
  }
  paste(query, "AND", typeFilter)
}

# Query PubMed for `my_query`, load the matching rows of the EntityWords table
# from the "extract" MySQL database and turn them into a document-feature
# matrix plus helper tables.
#
# Arguments:
#   my_query    PubMed search string; the date filter "1:2019[dp]" is appended.
#   entities    Entity types to keep. Empty/NULL keeps every type; the marker
#               "0>" selects rows with entityType>0; other values must be
#               numeric.
#   maxArticles Value sent as esearch `retmax` (maximum number of PMIDs).
#
# Returns a list with:
#   articles_dfm        dfm built from the "."-separated words of each article
#   term_frequency      data.table with columns Term and Frequency
#   wordTypeTb          the Word and EntityType columns of the fetched rows
#   untokenizedExtract  one row per PMID with columns pmid, Type, New, Id
#
# Stops with an error when the search yields no PMIDs or when `entities`
# contains a non-numeric value other than "0>".
#
# Database credentials are read from the environment variables
# EXTRACT_DB_USER and EXTRACT_DB_PASSWORD.
get_extract_data <- function(my_query, entities, maxArticles=10000) {
  print(my_query)
  my_query <- paste(my_query, "1:2019[dp]", sep = " ")

  # get_pubmed_ids() is used only for its processed query string; the IDs are
  # re-fetched below so that `retmax` can be controlled.
  res <- get_pubmed_ids(my_query)
  print("Work")
  searchUrl <- paste0("https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?",
                      "db=pubmed&&term=", res$OriginalQuery,
                      "&usehistory=n&retmax=", maxArticles)
  res_url <- url(searchUrl, open = "rb", encoding = "UTF8")
  on.exit(close(res_url), add = TRUE)
  idXML <- readLines(res_url, warn = FALSE, encoding = "UTF8")

  myIDlist <- .extract_pubmed_ids(idXML)
  # The IDs are interpolated into SQL, so keep only purely numeric ones.
  myIDlist <- myIDlist[grepl("^[0-9]+$", myIDlist)]
  if (length(myIDlist) == 0) {
    # An empty "IN ( )" list is not valid SQL; fail with a readable message.
    stop("No PubMed IDs found for query: ", my_query, call. = FALSE)
  }

  # Build (and thereby validate) the query before opening the connection.
  entityQuery <- .entity_words_query(myIDlist, entities)

  # SECURITY NOTE(review): the literal fallbacks keep existing deployments
  # working, but credentials do not belong in source control. Rotate this
  # password and drop the fallbacks once the environment variables are set.
  con <- dbConnect(RMySQL::MySQL(), dbname = "extract",
                   user = Sys.getenv("EXTRACT_DB_USER", unset = "theodos"),
                   password = Sys.getenv("EXTRACT_DB_PASSWORD", unset = "89Lilu#1"))
  # Release the connection on every exit path, including errors.
  on.exit(dbDisconnect(con), add = TRUE)

  allData <- dbGetQuery(conn = con, entityQuery)
  print(colnames(allData))

  # Word -> entity type lookup, one row per fetched entity mention.
  wordTypeTable <- allData[, c('Word', 'EntityType')]
  print(wordTypeTable)

  # Collapse to one row per article; words, types and ids are joined with "."
  # so they can be split again by the tokenizer below.
  df <- allData %>%
    group_by(PMID) %>%
    summarize(Type = paste(EntityType, collapse = "."),
              New = paste(Word, collapse = "."),
              Id = paste(ID, collapse = ".")) %>%
    rename(pmid = PMID)
  print(df)
  print(dim(df))

  topDf <- df[, c("pmid", "New")]
  colnames(topDf) <- c('doc_id', 'text')
  print (topDf)
  # top2vecr() is defined outside this function and its return value is not
  # used here; the call is kept so the run still exercises it.
  top2vecr(topDf)

  extract_corpus <- corpus(df, text_field = 'New', docid_field = 'pmid')
  corpus_tokens <- extract_corpus %>% tokenize_regex(pattern = "\\.")
  articles_dfm <- dfm(tokens(corpus_tokens))

  # Term-frequency table: column sums of the dfm, with readable column names.
  sumVec <- colSums(articles_dfm)
  termFreq <- data.table::as.data.table(sumVec, keep.rownames = TRUE)
  colnames(termFreq) <- c('Term', 'Frequency')

  list(
    articles_dfm = articles_dfm,
    term_frequency = termFreq,
    wordTypeTb = wordTypeTable,
    untokenizedExtract = df
  )
}
# Driver: fetch the data for one author query with no entity-type filter and
# keep the document-feature matrix for the clustering step.
extractResults <- get_extract_data("pavlopoulos g [AU]", NULL)
allResults <- list()
allResults[["article_dfm_forClustering"]] <- extractResults[["articles_dfm"]]
# Run top2vecr() on the per-article table.
#
# `df` must have the columns "pmid" and "New"; they are copied into a
# two-column table named doc_id/text, which is printed and passed to
# top2vecr(). The top2vecr() result is printed.
#
# NOTE(review): `pmid` and `cluster` in the last statement are not defined
# inside this function, so tibble() fails unless objects with those names
# exist in an enclosing environment. Presumably they were meant to come from
# `result` -- confirm against what top2vecr() returns.
# NOTE(review): the last statement is an assignment, so the function's value
# is the assigned tibble (returned invisibly), not the `results` list.
top2vec_clusetring <- function(df){
# No effect: the value is neither stored nor printed inside a function.
colnames(df)
topDf <- df[, c("pmid", "New")]
colnames(topDf) <- c('doc_id','text')
print (topDf)
result<-top2vecr(topDf)
print(result)
results <- list()
#results$df <- data.frame(pmid, cluster)
results$df <- tibble(pmid, cluster)
}
# Run the clustering helper on the one-row-per-article table fetched above.
top2vec_clusetring(extractResults$untokenizedExtract)
# Disconnect every connection currently registered with the MySQL driver.
#
# Prints the list of connections found, disconnects each one, then prints a
# summary message. The message string is the function's value (invisible,
# because it comes from print()).
killDbConnections <- function () {
  all_cons <- dbListConnections(MySQL())
  print(all_cons)
  for (con in all_cons) {
    # Call dbDisconnect() directly; the previous version applied a stray unary
    # "+" to its result, which only works while that result is numeric/logical.
    dbDisconnect(con)
  }
  print(paste(length(all_cons), " connections killed."))
}
# Close any MySQL connections still open at the end of the script.
killDbConnections()