-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathmergeDatasets.R
67 lines (42 loc) · 2.83 KB
/
mergeDatasets.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
# Load the four raw input tables from the Data/ directory.
# All four are comma-separated files whose first row holds the column names.
covid <- read.table(
  "Data/statewide_cases.csv",
  header = TRUE,
  sep = ","
)
density <- read.table(
  "Data/Average_Household_Size_and_Population_Density_-_County.csv",
  header = TRUE,
  sep = ","
)
demographics <- read.csv(
  "Data/demographic information.csv",
  header = TRUE
)
ages <- read.csv("Data/age_urban_counties.csv")
# ages: keep only the rows whose State is "CA", then the columns used later.
ages <- ages[ages$State == "CA", ]
ages <- ages[, c(
  "State",
  "Area_Name",
  "POP_ESTIMATE_2018",
  "Urban_Influence_Code_2013",
  "Total_age65plus",
  "Density.per.square.mile.of.land.area...Population",
  "Density.per.square.mile.of.land.area...Housing.units",
  "ICU.Beds"
)]
# Rename positionally; the order here must match the selection above.
# Area_Name becomes "county", which is the join key used further down.
names(ages) <- c(
  "State",
  "county",
  "Pop_Estimate",
  "Urban_Influence_Code_2013",
  "Total_age65plus",
  "PopDensity_per_square_mile_of_land_area",
  "HouseDensity.per.square.mile.of.land.area",
  "ICU_Beds"
)

# demographics: keep the rows with Postal.Code "CA" and the six columns
# needed downstream (row filter and column selection done in one step).
demographics <- demographics[
  demographics$Postal.Code == "CA",
  c(
    "County.FIPS.Code",
    "Postal.Code",
    "Name",
    "Poverty.Estimate..All.Ages",
    "Poverty.Percent..All.Ages",
    "Median.Household.Income"
  )
]
# Give every table a common join key named "county".
# The rename is positional: column 6 of density and column 3 of demographics
# (the latter is "Name" when the column selection above has been applied).
names(density)[6] <- "county"
names(demographics)[3] <- "county"

# Join the case data with the California rows of density, then with
# demographics. merge() keeps only counties present on both sides.
merged <- merge(
  covid,
  density[density$State == "California", ],
  by = "county"
)
merged <- merge(merged, demographics, by = "county")

# Keep the response and predictor columns used by the models.
merged <- merged[, c(
  "county",
  "totalcountconfirmed",
  "totalcountdeaths",
  "newcountconfirmed",
  "newcountdeaths",
  "date",
  "B01001_calc_PopDensity",
  "Poverty.Estimate..All.Ages",
  "Poverty.Percent..All.Ages",
  "Median.Household.Income"
)]

# Shorter names for the last four columns; order matches the selection above.
names(merged) <- c(
  "county",
  "totalcountconfirmed",
  "totalcountdeaths",
  "newcountconfirmed",
  "newcountdeaths",
  "date",
  "PopDensity",
  "Poverty_Estimate",
  "Poverty_Percent",
  "Median_Household_Income"
)

# Finally attach the age / urban-influence columns from `ages`.
merged <- merge(merged, ages, by = "county")
# Fix column types that came out wrong after loading and merging.
# To inspect the current classes interactively:
# sapply(merged, class)

# Numeric -> factor: the urban influence code is a category, not a quantity.
#merged$Rural_urban_Continuum_Code_2013=as.factor(merged$Rural_urban_Continuum_Code_2013)
merged[["Urban_Influence_Code_2013"]] <-
  as.factor(merged[["Urban_Influence_Code_2013"]])

# Text/factor -> numeric. Commas are removed first where present; gsub()
# already returns a character vector, so the values are parsed from their
# printed form rather than from factor level indices.
merged[["Poverty_Estimate"]] <-
  as.numeric(gsub(",", "", merged[["Poverty_Estimate"]], fixed = TRUE))
merged[["Poverty_Percent"]] <-
  as.numeric(as.character(merged[["Poverty_Percent"]]))
merged[["Median_Household_Income"]] <-
  as.numeric(gsub(",", "", merged[["Median_Household_Income"]], fixed = TRUE))

# Derived columns.
# elder_ratio: Total_age65plus as a share of Pop_Estimate.
merged[["elder_ratio"]] <-
  merged[["Total_age65plus"]] / merged[["Pop_Estimate"]]
# new_casesrate: new confirmed cases per 10,000 of Pop_Estimate.
merged[["new_casesrate"]] <-
  merged[["newcountconfirmed"]] / merged[["Pop_Estimate"]] * 10000
# MetCode: "1" -> metropolitan, "0" -> nonmetropolitan
# (urban influence codes below 3 are treated as metropolitan).
# Urban_Influence_Code_2013 has been converted to a factor earlier in this
# script, and as.numeric() applied directly to a factor returns the internal
# level index (1, 2, 3, ...) rather than the code itself. Going through
# as.character() first recovers the real code values, so the threshold is
# applied to the codes even when some code values are absent from the data.
merged$MetCode <- as.factor(
  ifelse(
    as.numeric(as.character(merged$Urban_Influence_Code_2013)) < 3,
    "1",
    "0"
  )
)
# Treat the identifier-like columns (county, date) as factors.
merged[c("county", "date")] <- lapply(merged[c("county", "date")], as.factor)

# Export the working dataset; `final` stays available for the rest of the
# session. write.csv() keeps its default of writing row names.
final <- merged
write.csv(final, file = "Data/final.csv")