-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathtestExtract2.R
117 lines (90 loc) · 4.39 KB
/
testExtract2.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
# library(RMySQL)
# library(dplyr)
# library(quanteda)
# library(tokenizers)
# Fetch entity annotations for the PubMed articles matching a query and turn
# them into a document-feature matrix and a term-frequency table.
#
# The query is resolved to PMIDs through the NCBI esearch endpoint, the
# matching rows are read from the `EntityWords` table of the `extract_2023`
# MySQL database, and the words of each article are joined, tokenized and
# counted.
#
# Args:
#   my_query:    PubMed search string, passed to get_pubmed_ids().
#   entities:    Entity types to keep. An empty value keeps every type. The
#                literal string "0>" keeps all rows with entityType > 0; every
#                other value must be a numeric literal and is matched with
#                SQL `IN (...)`.
#   maxArticles: Maximum number of PMIDs requested from esearch (`retmax`).
#
# Returns:
#   A list with the elements
#     articles_dfm        document-feature matrix built from the tokens,
#     term_frequency      table with columns `Term` and `Frequency`,
#     wordTypeIDTb        columns Word, EntityType, Ext_db_id of the DB rows,
#     untokenizedExtract  one row per PMID with "."-joined Type, New and Id.
#
# Errors:
#   Stops when the search yields no PMIDs, or when a PMID / entity type is not
#   a numeric literal (such values must never reach the SQL text).
get_extract_data <- function(my_query, entities, maxArticles = 10000) {
  # The SQL below is assembled as text, so every interpolated value is checked
  # to be a plain numeric literal first. The pattern also accepts scientific
  # notation because that is what as.character() yields for e.g. 1e5.
  sql_number_list <- function(values, what) {
    values <- trimws(as.character(values))
    is_number <- grepl(
      "^[+-]?[0-9]+(\\.[0-9]+)?([eE][+-]?[0-9]+)?$",
      values
    )
    if (!all(is_number)) {
      stop(
        "Non-numeric ", what, " cannot be used in the SQL query: ",
        toString(unique(values[!is_number])),
        call. = FALSE
      )
    }
    toString(values)
  }

  # Get all PMIDs for the query.
  res <- get_pubmed_ids(my_query)
  print("Work")
  runjs("console.log('After query 1')")
  esearch_url <- paste0(
    "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?",
    "db=pubmed&&term=", res$OriginalQuery,
    "&usehistory=n&retmax=",
    # Avoid "1e+05"-style rendering of large limits inside the URL.
    format(maxArticles, scientific = FALSE, trim = TRUE)
  )
  runjs("console.log('After query 2')")
  res_url <- url(esearch_url, open = "rb", encoding = "UTF8")
  on.exit(close(res_url), add = TRUE)
  idXML <- readLines(res_url, warn = FALSE, encoding = "UTF8")

  # Keep only the <Id> lines (optionally preceded by one tab) and pull the
  # first value out of each of them.
  id_lines <- idXML[grepl("^(\\t){0,1}<Id>", idXML)]
  myIDlist <- as.character(unlist(lapply(id_lines, function(id_line) {
    as.character(custom_grep(id_line, tag = "Id", format = "char")[1])
  })))
  if (length(myIDlist) == 0) {
    # An empty list would otherwise produce the invalid SQL `IN ( )`.
    stop("No PubMed IDs found for query: ", my_query, call. = FALSE)
  }

  # Build the SQL text before connecting so that invalid input never opens a
  # database connection.
  sql <- paste(
    "SELECT * FROM EntityWords WHERE pmid IN (",
    sql_number_list(myIDlist, "PMID"),
    ")"
  )
  # Check which entity types are required by the user.
  if (length(entities) > 0 && "0>" %in% entities) {
    editedEntities <- entities[entities != "0>"]
    print(length(editedEntities))
    if (length(editedEntities) > 0) {
      # "0>" combined with explicit types: positive types OR the listed ones.
      sql <- paste(
        sql,
        "AND (entityType>0 OR entityType IN (",
        sql_number_list(editedEntities, "entity type"),
        "))"
      )
    } else {
      sql <- paste(sql, "AND entityType>0")
    }
  } else if (length(entities) > 0) {
    sql <- paste(
      sql,
      "AND entityType IN (",
      sql_number_list(entities, "entity type"),
      ")"
    )
  }

  # Connect and read from the extract database.
  # NOTE(review): credentials are hard-coded in source; consider moving them
  # to environment variables or a config file outside version control.
  con <- dbConnect(
    RMySQL::MySQL(),
    dbname = "extract_2023",
    user = "theodos",
    password = "89Lilu#1"
  )
  # Release the connection on every exit path (normal return or error).
  on.exit(dbDisconnect(con), add = TRUE)
  runjs("console.log('Connected to extract')")
  allData <- dbGetQuery(conn = con, sql)

  # Word / type / external-id lookup table.
  wordTypeIDTable <- allData[, c("Word", "EntityType", "Ext_db_id")]

  # One row per article; the individual values are joined with "." so that
  # the words can be split again on "." below.
  df <- allData %>%
    group_by(PMID) %>%
    summarize(
      Type = paste(EntityType, collapse = "."),
      New = paste(Word, collapse = "."),
      Id = paste(ID, collapse = ".")
    ) %>%
    rename(pmid = PMID)
  runjs("console.log('DF created (testExtract2')")

  extract_corpus <- corpus(df, text_field = "New", docid_field = "pmid")
  corpus_tokens <- extract_corpus %>% tokenize_regex(pattern = "\\.")
  articles_dfm <- dfm(tokens(corpus_tokens))

  # Column sums of the dfm give the overall frequency of every term.
  sumVec <- colSums(articles_dfm)
  termFreq <- data.table::as.data.table(sumVec, keep.rownames = TRUE)
  colnames(termFreq) <- c("Term", "Frequency")

  list(
    articles_dfm = articles_dfm,
    term_frequency = termFreq,
    wordTypeIDTb = wordTypeIDTable,
    untokenizedExtract = df
  )
}
#get_extract_data("pavlopoulos g [AU]", c())
#get_extract_data("cancer 1:2019[dp]", c(">0"))
#get_extract_data("theodosiou t [AU]", c())
# Close every MySQL connection that is currently open in this R session.
#
# Prints the list of open connections, disconnects each of them and prints
# how many were closed. Takes no arguments; the value of the final print()
# call (the summary message) is returned invisibly.
killDbConnections <- function() {
  all_cons <- dbListConnections(MySQL())
  print(all_cons)
  for (con in all_cons) {
    # The original line started with a stray "+" (a console continuation
    # prompt pasted into the file) which applied unary plus to the result.
    dbDisconnect(con)
  }
  print(paste(length(all_cons), " connections killed."))
}
# Top-level call: executed whenever this file is run or sourced. It lists and
# disconnects all MySQL connections that are open at that moment.
killDbConnections()