.Rhistory

# Split every document into sections, one per numbered speaker paragraph
# ("12. Mr ...", "12. Ms ...", "12. Mrs ...").
# Assumes `files` is a list of data frames whose second column holds the
# document text (as produced by split(folder, ...) later in this history)
# -- TODO confirm.
docs <- vector("list", length(files))
for (i in seq_along(files)) {
  # `[[2]]` extracts the text column as a character vector; `[2]` would hand
  # strsplit() a one-column data frame, which it rejects.
  docs[[i]] <- strsplit(
    files[[i]][[2]],
    split = "\\n(?=[0-9]+\\.\\s((Mr|Ms)|Mrs))",
    perl = TRUE
  )[[1]]
}
# Build one data frame per transcript (one row per speaker section) and
# collect them in the list `d`, keyed by file name.
fls <- list.files("~/Desktop/work/caste UN/UN_meeting_records/txt")
d <- list()
for (f in fls) {
  message(f)
  # The directory needs a trailing "/" before the file name; without it the
  # path points at ".../txt<file>" and nothing is found.
  # The docvar is named "job.number" so that files$job.number below resolves.
  files <- readtext(
    paste0("~/Desktop/work/caste UN/UN_meeting_records/txt/", f),
    docvarsfrom = "filenames",
    docvarnames = "job.number"
  ) # read the file
  # splitting the file into sections by person
  docs <- strsplit(
    files$text,
    split = "\\n(?=[0-9]+\\.\\s((Mr|Ms)|Mrs))",
    perl = TRUE
  )
  title <- docs[[1]][1] # the first section is the title page
  cleandocs <- docs[[1]][-1] # removing the first section - the title page
  if (length(cleandocs) == 0) {
    # With a single section, 2:length(x) would count backwards (2, 1) and
    # produce bogus rows, so skip the file instead.
    warning("no speaker sections found in ", f, "; skipping", call. = FALSE)
    next
  }
  # creating the necessary entries for the data frame
  id <- files$doc_id # not used in the data frame below
  jobno <- files$job.number
  doc_id <- str_extract_all(cleandocs, "((Mr|Ms)|Mrs)(.*?)\\)+") # not used below
  # Take the first "Mr/Ms/Mrs ... (" match per section, mirroring how `orgs`
  # takes the first match. str_extract() yields NA when there is no match,
  # instead of the list-coerced strings ("character(0)", "c(...)") that
  # sub() produced from str_extract_all().
  names <- sub("\\s\\(", "", str_extract(cleandocs, "((Mr|Ms)|Mrs)(.*?)\\("))
  # Text between the outer parentheses of the first "Mr ... (...)" match.
  orgs <- str_extract(cleandocs, "((Mr|Ms)|Mrs)(.*?)\\)+")
  orgs <- str_extract(orgs, "\\(.*\\)")
  orgs <- sub("\\)$", "", sub("^\\(", "", orgs))
  d[[f]] <- data.frame(f, cleandocs, jobno, names, orgs, title, stringsAsFactors = FALSE)
}
fls <- list.files("~/Desktop/work/caste UN/UN_meeting_records/txt")
for (f in fls){
message(f)
files <- readtext(paste0("~/Desktop/work/caste UN/UN_meeting_records/txt", f),
docvarsfrom = "filenames",
docvarnames = "job number"
) #read the files
docs <- strsplit(files$text, split="\\n(?=[0-9]+\\.\\s((Mr|Ms)|Mrs))", perl=TRUE) #splitting the file into sections by person
cleandocs <- docs[[1]][2:length(docs[[1]])] #removing the first section- the title page
#creating the necessary entries for the data frame
title <- docs[[1]][1]
id <- files$doc_id
jobno <- files$job.number
doc_id <- str_extract_all(cleandocs, "((Mr|Ms)|Mrs)(.*?)\\)+")
names <- sub("\\s\\(","",(str_extract_all(cleandocs, "((Mr|Ms)|Mrs)(.*?)\\(")))
orgs <- sub("\\)$","",sub("^\\(","",str_extract(str_extract(cleandocs, "((Mr|Ms)|Mrs)(.*?)\\)+"),"\\(.*\\)")))
d[[f]] <- data.frame(f, cleandocs, jobno, names, orgs, title, stringsAsFactors = FALSE)
}
fls
fls[1]
paste0("~/Desktop/work/caste UN/UN_meeting_records/txt", f)
files <- readtext(paste0("~/Desktop/work/caste UN/UN_meeting_records/txt", f),
docvarsfrom = "filenames",
docvarnames = "job number"
) #read the files
for (f in fls){
message(f)
files <- readtext(paste0("~/Desktop/work/caste UN/UN_meeting_records/txt/", f),
docvarsfrom = "filenames",
docvarnames = "job number"
) #read the files
docs <- strsplit(files$text, split="\\n(?=[0-9]+\\.\\s((Mr|Ms)|Mrs))", perl=TRUE) #splitting the file into sections by person
cleandocs <- docs[[1]][2:length(docs[[1]])] #removing the first section- the title page
#creating the necessary entries for the data frame
title <- docs[[1]][1]
id <- files$doc_id
jobno <- files$job.number
doc_id <- str_extract_all(cleandocs, "((Mr|Ms)|Mrs)(.*?)\\)+")
names <- sub("\\s\\(","",(str_extract_all(cleandocs, "((Mr|Ms)|Mrs)(.*?)\\(")))
orgs <- sub("\\)$","",sub("^\\(","",str_extract(str_extract(cleandocs, "((Mr|Ms)|Mrs)(.*?)\\)+"),"\\(.*\\)")))
d[[f]] <- data.frame(f, cleandocs, jobno, names, orgs, title, stringsAsFactors = FALSE)
}
library(stringr)
# read the files
# Read every file in the txt folder into one readtext data frame; the file
# name is stored as the docvar "job number".
folder <- readtext(
  file = file.path(
    "~/Desktop/work/caste UN/UN_meeting_records/txt",
    list.files("~/Desktop/work/caste UN/UN_meeting_records/txt")
  ),
  docvarsfrom = "filenames",
  docvarnames = "job number"
)
# One list element per document (row). seq_len() stays empty for zero rows.
files <- split(folder, seq_len(nrow(folder)))
# Split each document into per-speaker sections. Results are stored per
# document rather than overwriting `docs` on every pass.
docs <- vector("list", length(files))
for (i in seq_along(files)) {
  # `[[2]]` extracts the text column as a character vector; `[2]` would hand
  # strsplit() a one-column data frame, which it rejects.
  docs[[i]] <- strsplit(
    files[[i]][[2]],
    split = "\\n(?=[0-9]+\\.\\s((Mr|Ms)|Mrs))",
    perl = TRUE
  )[[1]]
}
fls <- list.files("~/Desktop/work/caste UN/UN_meeting_records/txt")
d <- list()
for (f in fls){
message(f)
files <- readtext(paste0("~/Desktop/work/caste UN/UN_meeting_records/txt/", f),
docvarsfrom = "filenames",
docvarnames = "job number"
) #read the files
docs <- strsplit(files$text, split="\\n(?=[0-9]+\\.\\s((Mr|Ms)|Mrs))", perl=TRUE) #splitting the file into sections by person
cleandocs <- docs[[1]][2:length(docs[[1]])] #removing the first section- the title page
#creating the necessary entries for the data frame
title <- docs[[1]][1]
id <- files$doc_id
jobno <- files$job.number
doc_id <- str_extract_all(cleandocs, "((Mr|Ms)|Mrs)(.*?)\\)+")
names <- sub("\\s\\(","",(str_extract_all(cleandocs, "((Mr|Ms)|Mrs)(.*?)\\(")))
orgs <- sub("\\)$","",sub("^\\(","",str_extract(str_extract(cleandocs, "((Mr|Ms)|Mrs)(.*?)\\)+"),"\\(.*\\)")))
d[[f]] <- data.frame(f, cleandocs, jobno, names, orgs, title, stringsAsFactors = FALSE)
}
d <- do.call(rbind, d)
d
View(d)
str(folder)
str(files)
# Chunk 1: setup
knitr::opts_chunk$set(echo = TRUE)
# Chunk 2
setwd("~/Desktop/work/caste UN/UN_meeting_records")
# Chunk 3
#install.packages("rvest")
library(rvest) #load rvest
library(dplyr) #if you want to use the pipe operator, amongst other things
library(tidyr)
library(readtext)
library(stringr)
# Chunk 4
sr <- read.csv("results.csv", header=TRUE, stringsAsFactors = FALSE)
sr <- sr[grep("B03", sr$X89, ignore.case=TRUE),]
# Chunk 5
sr$h <- 0
#In my data frame, the column containing the links is called 'links'.
for (i in 1:length(sr$links)){
h <-
read_html((sr$links)[i])%>%
html_node("strong +a") %>%
html_attrs()
sr$h[i] <- h
}
# Chunk 6
sr <- sr %>% drop_na(h) #remove missing values
# Chunk 7
sr$h <- paste0("https://digitallibrary.un.org", sr$h)
# Chunk 8
for (i in 1:nrow(sr)){
download.file(sr$h[i], paste0("~/Desktop/work/caste UN/UN_meeting_records/pdfs/",sr$X1[i],".pdf"))
}
# Chunk 9
files <- list.files(path = "~/Desktop/work/caste UN/UN_meeting_records/pdfs",
pattern = "pdf",
full.names = TRUE)
lapply(files, function(i) system(paste("/usr/local/bin/pdftotext", paste0('"', i, '"')), wait = FALSE) )
#read the files
fls <- list.files("~/Desktop/work/caste UN/UN_meeting_records/txt")
d <- list()
for (f in fls){
message(f)
files <- readtext(paste0("~/Desktop/work/caste UN/UN_meeting_records/txt/", f),
docvarsfrom = "filenames",
docvarnames = "job number"
) #read the files
docs <- strsplit(files$text, split="\\n(?=[0-9]+\\.\\s((Mr|Ms)|Mrs))", perl=TRUE) #splitting the file into sections by person
cleandocs <- docs[[1]][2:length(docs[[1]])] #removing the first section- the title page
#creating the necessary entries for the data frame
title <- docs[[1]][1]
id <- files$doc_id
jobno <- files$job.number
doc_id <- str_extract_all(cleandocs, "((Mr|Ms)|Mrs)(.*?)\\)+")
names <- sub("\\s\\(","",(str_extract_all(cleandocs, "((Mr|Ms)|Mrs)(.*?)\\(")))
orgs <- sub("\\)$","",sub("^\\(","",str_extract(str_extract(cleandocs, "((Mr|Ms)|Mrs)(.*?)\\)+"),"\\(.*\\)")))
d[[f]] <- data.frame(f, cleandocs, jobno, names, orgs, title, stringsAsFactors = FALSE)
}
d <- do.call(rbind, d)
View(d)
# Chunk 1: setup
knitr::opts_chunk$set(echo = TRUE)
# Chunk 2
setwd("~/Desktop/work/caste UN/UN_meeting_records")
# Chunk 3
#install.packages("rvest")
library(rvest) #load rvest
library(dplyr) #if you want to use the pipe operator, amongst other things
library(tidyr)
library(readtext)
library(stringr)
# Chunk 4
sr <- read.csv("results.csv", header=TRUE, stringsAsFactors = FALSE)
sr <- sr[grep("B03", sr$X89, ignore.case=TRUE),]
# Chunk 5
sr$h <- 0
#In my data frame, the column containing the links is called 'links'.
for (i in 1:length(sr$links)){
h <-
read_html((sr$links)[i])%>%
html_node("strong +a") %>%
html_attrs()
sr$h[i] <- h
}
# Chunk 6
sr <- sr %>% drop_na(h) #remove missing values
# Chunk 7
sr$h <- paste0("https://digitallibrary.un.org", sr$h)
# Chunk 8
for (i in 1:nrow(sr)){
download.file(sr$h[i], paste0("~/Desktop/work/caste UN/UN_meeting_records/pdfs/",sr$X1[i],".pdf"))
}
# Chunk 9
files <- list.files(path = "~/Desktop/work/caste UN/UN_meeting_records/pdfs",
pattern = "pdf",
full.names = TRUE)
lapply(files, function(i) system(paste("/usr/local/bin/pdftotext", paste0('"', i, '"')), wait = FALSE) )
?rbind
#read the files
fls <- list.files("~/Desktop/work/caste UN/UN_meeting_records/txt")
d <- list()
for (f in fls){
message(f)
files <- readtext(paste0("~/Desktop/work/caste UN/UN_meeting_records/txt/", f),
docvarsfrom = "filenames",
docvarnames = "job number"
) #read the files
docs <- strsplit(files$text, split="\\n(?=[0-9]+\\.\\s((Mr|Ms)|Mrs))", perl=TRUE) #splitting the file into sections by person
cleandocs <- docs[[1]][2:length(docs[[1]])] #removing the first section- the title page
#creating the necessary entries for the data frame
title <- docs[[1]][1]
id <- files$doc_id
jobno <- files$job.number
doc_id <- str_extract_all(cleandocs, "((Mr|Ms)|Mrs)(.*?)\\)+")
names <- sub("\\s\\(","",(str_extract_all(cleandocs, "((Mr|Ms)|Mrs)(.*?)\\(")))
orgs <- sub("\\)$","",sub("^\\(","",str_extract(str_extract(cleandocs, "((Mr|Ms)|Mrs)(.*?)\\)+"),"\\(.*\\)")))
d[[f]] <- data.frame(f, cleandocs, jobno, names, orgs, title, stringsAsFactors = FALSE)
}
d <- do.call(rbind, d)
View(d)
View(d)
str(fls)
str(files)
remove(d)
remove(fls)
remove(files)
#read the files
fls <- list.files("~/Desktop/work/caste UN/UN_meeting_records/txt")
d <- list()
for (f in fls){
message(f)
files <- readtext(paste0("~/Desktop/work/caste UN/UN_meeting_records/txt/", f),
docvarsfrom = "filenames",
docvarnames = "job.number"
) #read the files
docs <- strsplit(files$text, split="\\n(?=[0-9]+\\.\\s((Mr|Ms)|Mrs))", perl=TRUE) #splitting the file into sections by person
cleandocs <- docs[[1]][2:length(docs[[1]])] #removing the first section- the title page
#creating the necessary entries for the data frame
title <- docs[[1]][1]
id <- files$doc_id
jobno <- files$job.number
doc_id <- str_extract_all(cleandocs, "((Mr|Ms)|Mrs)(.*?)\\)+")
names <- sub("\\s\\(","",(str_extract_all(cleandocs, "((Mr|Ms)|Mrs)(.*?)\\(")))
orgs <- sub("\\)$","",sub("^\\(","",str_extract(str_extract(cleandocs, "((Mr|Ms)|Mrs)(.*?)\\)+"),"\\(.*\\)")))
d[[f]] <- data.frame(f, cleandocs, jobno, names, orgs, title, stringsAsFactors = FALSE)
}
d <- do.call(rbind, d)
View(d)
str(d)
#read the files
fls <- list.files("~/Desktop/work/caste UN/UN_meeting_records/txt")
d <- list()
for (f in fls){
message(f)
files <- readtext(paste0("~/Desktop/work/caste UN/UN_meeting_records/txt/", f),
docvarsfrom = "filenames",
docvarnames = "job.number"
) #read the files
docs <- strsplit(files$text, split="\\n(?=[0-9]+\\.\\s((Mr|Ms)|Mrs))", perl=TRUE) #splitting the file into sections by person
cleandocs <- docs[[1]][2:length(docs[[1]])] #removing the first section- the title page
#creating the necessary entries for the data frame
title <- docs[[1]][1]
id <- files$doc_id
jobno <- files$job.number
names <- sub("\\s\\(","",(str_extract_all(cleandocs, "((Mr|Ms)|Mrs)(.*?)\\(")))
orgs <- sub("\\)$","",sub("^\\(","",str_extract(str_extract(cleandocs, "((Mr|Ms)|Mrs)(.*?)\\)+"),"\\(.*\\)")))
d[[f]] <- data.frame(f, cleandocs, jobno, names, orgs, title, stringsAsFactors = FALSE)
}
d <- do.call(rbind, d)
View(d)
#read the files
fls <- list.files("~/Desktop/work/caste UN/UN_meeting_records/txt")
d <- list()
for (f in fls){
message(f)
files <- readtext(paste0("~/Desktop/work/caste UN/UN_meeting_records/txt/", f),
docvarsfrom = "filenames",
docvarnames = "job.number"
) #read the files
docs <- strsplit(files$text, split="\\n(?=[0-9]+\\.\\s((Mr|Ms)|Mrs))", perl=TRUE) #splitting the file into sections by person
cleandocs <- docs[[1]][2:length(docs[[1]])] #removing the first section- the title page
#creating the necessary entries for the data frame
title <- docs[[1]][1]
id <- files$doc_id
jobno <- files$job.number
doc_id <- str_extract_all(cleandocs, "((Mr|Ms)|Mrs)(.*?)\\)+")
names <- sub("\\s\\(","",(str_extract_all(cleandocs, "((Mr|Ms)|Mrs)(.*?)\\(")))
orgs <- sub("\\)$","",sub("^\\(","",str_extract(str_extract(cleandocs, "((Mr|Ms)|Mrs)(.*?)\\)+"),"\\(.*\\)")))
d[[f]] <- data.frame(f, id, cleandocs, jobno, names, orgs, title, stringsAsFactors = FALSE)
}
d <- do.call(rbind, d)
View(d)
#read the files
fls <- list.files("~/Desktop/work/caste UN/UN_meeting_records/txt")
d <- list()
for (f in fls){
message(f)
files <- readtext(paste0("~/Desktop/work/caste UN/UN_meeting_records/txt/", f),
docvarsfrom = "filenames",
docvarnames = "job.number"
) #read the files
docs <- strsplit(files$text, split="\\n(?=[0-9]+\\.\\s((Mr|Ms)|Mrs))", perl=TRUE) #splitting the file into sections by person
cleandocs <- docs[[1]][2:length(docs[[1]])] #removing the first section- the title page
#creating the necessary entries for the data frame
title <- docs[[1]][1]
jobno <- files$job.number
doc_id <- str_extract_all(cleandocs, "((Mr|Ms)|Mrs)(.*?)\\)+")
names <- sub("\\s\\(","",(str_extract_all(cleandocs, "((Mr|Ms)|Mrs)(.*?)\\(")))
orgs <- sub("\\)$","",sub("^\\(","",str_extract(str_extract(cleandocs, "((Mr|Ms)|Mrs)(.*?)\\)+"),"\\(.*\\)")))
d[[f]] <- data.frame(f, cleandocs, jobno, names, orgs, title, stringsAsFactors = FALSE)
}
d <- do.call(rbind, d)
?dictionary
# read the files
# Build one data frame per transcript (one row per speaker section), collect
# them in the list `df`, then bind them into a single data frame.
fls <- list.files("~/Desktop/work/caste UN/UN_meeting_records/txt")
df <- list()
for (f in fls) {
  message(f)
  files <- readtext(
    paste0("~/Desktop/work/caste UN/UN_meeting_records/txt/", f),
    docvarsfrom = "filenames",
    docvarnames = "job.number"
  ) # read the file
  # splitting the file into sections by person
  docs <- strsplit(
    files$text,
    split = "\\n(?=[0-9]+\\.\\s((Mr|Ms)|Mrs))",
    perl = TRUE
  )
  title <- docs[[1]][1] # the first section is the title page
  cleandocs <- docs[[1]][-1] # removing the first section - the title page
  if (length(cleandocs) == 0) {
    # With a single section, 2:length(x) would count backwards (2, 1) and
    # produce bogus rows, so skip the file instead.
    warning("no speaker sections found in ", f, "; skipping", call. = FALSE)
    next
  }
  # creating the necessary entries for the data frame
  jobno <- files$job.number
  doc_id <- str_extract_all(cleandocs, "((Mr|Ms)|Mrs)(.*?)\\)+") # not used below
  # First "Mr/Ms/Mrs ... (" match per section, consistent with `orgs`; NA
  # when nothing matches (instead of list-coerced "character(0)" strings).
  names <- sub("\\s\\(", "", str_extract(cleandocs, "((Mr|Ms)|Mrs)(.*?)\\("))
  # Text between the outer parentheses of the first "Mr ... (...)" match.
  orgs <- str_extract(cleandocs, "((Mr|Ms)|Mrs)(.*?)\\)+")
  orgs <- str_extract(orgs, "\\(.*\\)")
  orgs <- sub("\\)$", "", sub("^\\(", "", orgs))
  # Store into `df`, the list initialised above. Writing to `d` here would
  # append to (and then bind) whatever an earlier run left behind.
  df[[f]] <- data.frame(f, cleandocs, jobno, names, orgs, title, stringsAsFactors = FALSE)
}
df <- do.call(rbind, df)
#read the files
fls <- list.files("~/Desktop/work/caste UN/UN_meeting_records/txt")
df <- list()
for (f in fls){
message(f)
files <- readtext(paste0("~/Desktop/work/caste UN/UN_meeting_records/txt/", f),
docvarsfrom = "filenames",
docvarnames = "job.number"
) #read the files
docs <- strsplit(files$text, split="\\n(?=[0-9]+\\.\\s((Mr|Ms)|Mrs))", perl=TRUE) #splitting the file into sections by person
cleandocs <- docs[[1]][2:length(docs[[1]])] #removing the first section- the title page
#creating the necessary entries for the data frame
title <- docs[[1]][1]
jobno <- files$job.number
doc_id <- str_extract_all(cleandocs, "((Mr|Ms)|Mrs)(.*?)\\)+")
names <- sub("\\s\\(","",(str_extract_all(cleandocs, "((Mr|Ms)|Mrs)(.*?)\\(")))
orgs <- sub("\\)$","",sub("^\\(","",str_extract(str_extract(cleandocs, "((Mr|Ms)|Mrs)(.*?)\\)+"),"\\(.*\\)")))
df[[f]] <- data.frame(f, cleandocs, jobno, names, orgs, title, stringsAsFactors = FALSE)
}
df <- do.call(rbind, df)
View(df)
View(df)
# Spot-check the dictionary with keyword-in-context output.
# Statements are ordered so that each object exists before it is used:
# package, dictionary, cleaned data frame, corpus, kwic.
library(quanteda)
dict <- dictionary(list(racism=c("race", "racis*")))
df <- df[!is.na(df$names),] #remove NA in names
df <- df[df$names!="character(0)",] #remove some sort of missing values in names
df <- df[!is.na(df$orgs),] #remove NA in orgs
df <- df[which(nchar(df$names)<20),] #remove names longer than 20 characters (error in processing)
corp <- corpus(df, text_field="cleandocs") #create a corpus
# Compute the kwic table once and reuse it.
# NOTE(review): with valuetype = "regex" the pattern "race" is a regular
# expression and may also hit tokens that merely contain it; earlier attempts
# used valuetype = "glob" -- confirm which is intended.
k <- kwic(corp, dict, window=5, valuetype="regex")
# Show up to 10 random hits; min() avoids an error with fewer than 10 matches.
k[sample.int(nrow(k), min(10L, nrow(k))),] #check the dictionary
# Chunk 1: setup
knitr::opts_chunk$set(echo = TRUE)
# Chunk 2: set working directory
setwd("~/Desktop/work/caste UN/UN_meeting_records")
# Chunk 3
#install.packages("rvest")
library(rvest) #load rvest
library(dplyr) #if you want to use the pipe operator, amongst other things
library(tidyr)
library(readtext)
library(stringr)
library(quanteda)
# Chunk 4: read csv
sr <- read.csv("results.csv", header=TRUE, stringsAsFactors = FALSE)
sr <- sr[grep("B03", sr$X89, ignore.case=TRUE),]
# Chunk 5: scrape download links
sr$h <- 0
#In my data frame, the column containing the links is called 'links'.
for (i in 1:length(sr$links)){
h <-
read_html((sr$links)[i])%>%
html_node("strong +a") %>%
html_attrs()
sr$h[i] <- h
}
# Chunk 6: drop na
sr <- sr %>% drop_na(h) #remove missing values
# Chunk 7: get full url
sr$h <- paste0("https://digitallibrary.un.org", sr$h)
# Chunk 8: download files
for (i in 1:nrow(sr)){
download.file(sr$h[i], paste0("~/Desktop/work/caste UN/UN_meeting_records/pdfs/",sr$X1[i],".pdf"))
}
# Chunk 9
files <- list.files(path = "~/Desktop/work/caste UN/UN_meeting_records/pdfs",
pattern = "pdf",
full.names = TRUE)
lapply(files, function(i) system(paste("/usr/local/bin/pdftotext", paste0('"', i, '"')), wait = FALSE) )
# Chunk 10: make the data frame
# read the files
# Build one data frame per transcript (one row per speaker section), collect
# them in the list `df`, then bind them into a single data frame.
fls <- list.files("~/Desktop/work/caste UN/UN_meeting_records/txt")
df <- list()
for (f in fls) {
  message(f)
  files <- readtext(
    paste0("~/Desktop/work/caste UN/UN_meeting_records/txt/", f),
    docvarsfrom = "filenames",
    docvarnames = "job.number"
  ) # read the file
  # splitting the file into sections by person
  docs <- strsplit(
    files$text,
    split = "\\n(?=[0-9]+\\.\\s((Mr|Ms)|Mrs))",
    perl = TRUE
  )
  title <- docs[[1]][1] # the first section is the title page
  cleandocs <- docs[[1]][-1] # removing the first section - the title page
  if (length(cleandocs) == 0) {
    # With a single section, 2:length(x) would count backwards (2, 1) and
    # produce bogus rows, so skip the file instead.
    warning("no speaker sections found in ", f, "; skipping", call. = FALSE)
    next
  }
  # creating the necessary entries for the data frame
  jobno <- files$job.number
  doc_id <- str_extract_all(cleandocs, "((Mr|Ms)|Mrs)(.*?)\\)+") # not used below
  # First "Mr/Ms/Mrs ... (" match per section, consistent with `orgs`; NA
  # when nothing matches (instead of list-coerced "character(0)" strings).
  names <- sub("\\s\\(", "", str_extract(cleandocs, "((Mr|Ms)|Mrs)(.*?)\\("))
  # Text between the outer parentheses of the first "Mr ... (...)" match.
  orgs <- str_extract(cleandocs, "((Mr|Ms)|Mrs)(.*?)\\)+")
  orgs <- str_extract(orgs, "\\(.*\\)")
  orgs <- sub("\\)$", "", sub("^\\(", "", orgs))
  df[[f]] <- data.frame(f, cleandocs, jobno, names, orgs, title, stringsAsFactors = FALSE)
}
df <- do.call(rbind, df)
# Chunk 11: creating dictionary
dict <- dictionary(list(racism=c("race", "racis*")))
# Chunk 12
df <- df[!is.na(df$names),] #remove NA in names
df <- df[df$names!="character(0)",] #remove some sort of missing values in names
df <- df[!is.na(df$orgs),] #remove NA in orgs
df <- df[which(nchar(df$names)<20),] #remove names longer than 20 characters (error in processing)
# Chunk 13: create a corpus
corp <- corpus(df, text_field="cleandocs") #create a corpus
# Chunk 14
# Keyword-in-context table for the dictionary patterns, 5 tokens either side.
k <- kwic(corp, dict, window=5, valuetype="regex")
# Show up to 10 random hits; min() avoids an error with fewer than 10 matches.
k[sample.int(nrow(k), min(10L, nrow(k))),] #check the dictionary
# Chunk 15
# Document-feature matrix grouped by the "orgs" docvar, then reduced to
# dictionary counts and sorted so the highest-scoring groups come first.
# NOTE(review): passing remove=/groups= to dfm() on a corpus may be
# deprecated in newer quanteda releases -- confirm against the installed
# version.
corpdfm <- dfm(corp, tolower = TRUE, remove_numbers = TRUE, remove_punct = TRUE, remove=stopwords("en"), groups = "orgs")
dictdfm <- dfm_lookup(corpdfm, dict, valuetype="regex", case_insensitive = TRUE) #apply the dictionary
dictdfm <- dfm_sort(dictdfm, decreasing=TRUE, margin="documents")
# Up to 20 rows: indexing 1:20 directly is out of bounds with fewer groups.
dictdfm[seq_len(min(20L, nrow(dictdfm))),] #see the top 20
citation()
# Chunk 1: setup
knitr::opts_chunk$set(echo = TRUE)
# Chunk 2: set working directory
setwd("~/Desktop/work/caste UN/UN_meeting_records")
# Chunk 3
#install.packages("rvest")
library(rvest)
library(dplyr)
library(tidyr)
library(readtext)
library(stringr)
library(quanteda)
# Chunk 4: read csv
# Load the search results and keep only the rows whose X89 field contains
# "B03" (matched case-insensitively).
sr <- read.csv("results.csv", header = TRUE, stringsAsFactors = FALSE)
sr <- sr[grepl("B03", sr$X89, ignore.case = TRUE), ]
# Chunk 5: scrape download links
# In my data frame, the column containing the links is called 'links'.
# For every record page, find the <a> that directly follows a <strong> and
# store its href in sr$h; rows without a link stay NA.
sr$h <- rep(NA_character_, nrow(sr))
for (i in seq_along(sr$links)) {
  h <- tryCatch(
    read_html(sr$links[i]) %>%
      html_node("strong +a") %>%
      # html_attr("href") returns just the link; html_attrs() returns every
      # attribute of the node, of which only the first would be stored.
      html_attr("href"),
    error = function(e) {
      # One unreachable page should not abort the whole scrape.
      warning("could not read ", sr$links[i], ": ", conditionMessage(e),
              call. = FALSE)
      NA_character_
    }
  )
  sr$h[i] <- h
}
# Chunk 6: drop na
# Discard rows for which no download link was found.
sr <- drop_na(sr, h)
# Chunk 7: get full url
# The scraped links are site-relative, so prefix the host.
sr[["h"]] <- paste0("https://digitallibrary.un.org", sr[["h"]])
# Chunk 8: download files
# Download each linked PDF into the pdfs folder, named after the X1 column.
for (i in seq_len(nrow(sr))) {
  dest <- paste0("~/Desktop/work/caste UN/UN_meeting_records/pdfs/", sr$X1[i], ".pdf")
  tryCatch(
    # mode = "wb": PDFs are binary, and text mode can corrupt them on Windows.
    download.file(sr$h[i], dest, mode = "wb"),
    error = function(e) {
      # Keep going when a single download fails.
      warning("download failed for ", sr$h[i], ": ", conditionMessage(e),
              call. = FALSE)
    }
  )
}
# Chunk 9
# The first argument of list.files() is the file path of the folder containing all your pdf documents.
# Convert every PDF in the folder to text with the pdftotext command-line tool.
files <- list.files(path = "~/Desktop/work/caste UN/UN_meeting_records/pdfs",
                    pattern = "\\.pdf$", # only names ending in .pdf
                    full.names = TRUE)
# NOTE(review): no output path is passed to pdftotext, while the next chunk
# reads from the txt/ folder -- confirm how the .txt files get there.
invisible(lapply(files, function(pdf) {
  # shQuote() handles spaces and quotes in the path. path.expand() resolves
  # "~" first, because the shell does not expand it inside quotes.
  # system2() waits for each conversion, so later chunks never read
  # half-written output.
  status <- system2("/usr/local/bin/pdftotext", args = shQuote(path.expand(pdf)))
  if (!identical(status, 0L)) {
    warning("pdftotext failed for ", pdf, " (exit status ", status, ")",
            call. = FALSE)
  }
  status
}))
# Chunk 10
# Build one data frame per transcript (one row per speaker section), collect
# them in the list `df`, then bind them into a single data frame.
fls <- list.files("~/Desktop/work/caste UN/UN_meeting_records/txt")
df <- list()
for (f in fls) {
  message(f)
  files <- readtext(
    paste0("~/Desktop/work/caste UN/UN_meeting_records/txt/", f),
    docvarsfrom = "filenames",
    docvarnames = "job.number"
  )
  # Split at every newline followed by a numbered "Mr/Ms/Mrs" paragraph.
  docs <- strsplit(
    files$text,
    split = "\\n(?=[0-9]+\\.\\s((Mr|Ms)|Mrs))",
    perl = TRUE
  )
  title <- docs[[1]][1] # the first section is the title page
  cleandocs <- docs[[1]][-1] # I refer to the text of the speeches as 'cleandocs' - completely arbitrary
  if (length(cleandocs) == 0) {
    # With a single section, 2:length(x) would count backwards (2, 1) and
    # produce bogus rows, so skip the file instead.
    warning("no speaker sections found in ", f, "; skipping", call. = FALSE)
    next
  }
  # creating the necessary entries for the data frame
  jobno <- files$job.number
  doc_id <- str_extract_all(cleandocs, "((Mr|Ms)|Mrs)(.*?)\\)+") # not used below
  # First "Mr/Ms/Mrs ... (" match per section, consistent with `orgs`; NA
  # when nothing matches (instead of list-coerced "character(0)" strings).
  names <- sub("\\s\\(", "", str_extract(cleandocs, "((Mr|Ms)|Mrs)(.*?)\\("))
  # Text between the outer parentheses of the first "Mr ... (...)" match.
  orgs <- str_extract(cleandocs, "((Mr|Ms)|Mrs)(.*?)\\)+")
  orgs <- str_extract(orgs, "\\(.*\\)")
  orgs <- sub("\\)$", "", sub("^\\(", "", orgs))
  df[[f]] <- data.frame(f, cleandocs, jobno, names, orgs, title, stringsAsFactors = FALSE)
}
df <- do.call(rbind, df)
# Chunk 11: creating dictionary
# A quanteda dictionary with a single key, "racism", holding two patterns.
# NOTE(review): "racis*" reads like a glob wildcard; as a regular expression
# it means "raci" followed by zero or more "s". Confirm which valuetype this
# dictionary is applied with.
dict <- dictionary(list(racism=c("race", "racis*"))) #using regular expressions
# Chunk 12
# Clean the speaker table step by step; each filter sees the previous result.
df <- subset(df, !is.na(names)) # drop rows with a missing name
df <- subset(df, names != "character(0)") # drop the "character(0)" placeholders
df <- subset(df, !is.na(orgs)) # drop rows with a missing organisation
df <- subset(df, nchar(names) < 20) # drop names of 20+ characters (error in processing)
sr <- read.csv("results.csv", header=TRUE, stringsAsFactors = FALSE)
View(sr)