-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathTidy_data.do
292 lines (205 loc) · 8.25 KB
/
Tidy_data.do
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
/*-------------------------------------------------------------------------------
# Name: TidyData
# Purpose: Show how to fix untidy data using Stata code
# Author: Tim Essam, Ph.D. / USAID GeoCenter
# Created: 2016/07
# License: MIT
# Notes: Code for Stata 13 or 14
#-------------------------------------------------------------------------------
*/
clear
capture log close
* Creating Tidy Data
cd "C:/Users/Tim/Documents/TidyData"
log using "TidyData.txt", replace
copy "https://github.com/GeoCenter/resources/blob/master/untidy_data.xlsx?raw=true" untidy_data.xlsx, replace
* Example 1: Merged cells
***********************************************************************************
/* Import the file, use the firstrow option to have the first
row read in as variables
*/
import excel untidy_data.xlsx, sheet("merged_cells") firstrow
list
/* Notice that Stata does not assign the value from the merged cells to
multiple variable names. Instead, we have the variable names in the first
row of the dataframe. So we'll need to create some new variable names
based on the contents of the first row. But, State does not like variable
names to start with numbers so we will have to use a combination of strings
and numbers. */
* Create a loop to iterate over all the variables in data frame.
foreach x of varlist _all {
/* First, create a place holder variable called newnew that takes the
value of the 1st observation in the dataframe */
local newname = `x'[1]
/* Convert each value in the newname local into an allowed Stata name */
local newname2 = strtoname("`newname'")
/* Finally, rename each variable */
rename `x' `newname2'
}
*end
* Drop the 1st row as this information is now caputred in the variable names
drop in 1
* Now, rename the variables
rename (_2012 _2014) (cholera_cases_2012 cholera_cases_2014)
* create a unique id based on the region
egen id = group(region)
* Reshape the data into a fully tidy dataset
reshape long cholera_cases_@, i(id) j(year)
rename cholera_cases_ cholera_cases
egen uniqueid = group(id year)
la var year "year"
la var region "Ethiopia region (administrative unit 1)"
la var cholera_cases "Number of cholera cases reported"
* write the data frame to the window
clist, noo
* optional -- if you want to save the data
save "tidyData_ex1.dta", replace
* Example 2: Multiple values per cell
***********************************************************************************
clear
import excel untidy_data.xlsx, sheet("multiple_values_cell") firstrow
* Use the split command with a parse option (parsing on commas) to creat new variables
split region, p(,)
* Create unique id for projects and reshape the data in fully tidy dataset
rename region admin1_old
egen projectid = group(project)
* Reshape the data frame into a tidy data set and drop extra observations for which
* the admin1 region is missing
reshape long region@, i(projectid) j(regionCode)
drop if region == ""
egen uniqueid = group(projectid regionCode)
isid uniqueid
* Add a few variable labels
la var regionCode "region code"
la var region "Ethiopia region (administrative unit 1)"
clist, noo
* Example 3: No unique id
***********************************************************************************
clear
import excel untidy_data.xlsx, sheet("no_unique_id") firstrow
* Check if the combination of project and region make a variable unique
isid project region
egen id = group(project region)
sort id
clist, noo
* Example 4: Variable names not meaningful
***********************************************************************************
clear
import excel untidy_data.xlsx, sheet("varnames_not_meaningful") firstrow
rename (variable1 variable2) (cholera TB)
la var cholera "Number of cholera cases reported"
la var TB "Number of tuberculosis cases reported
egen regionid = group(region)
isid regionid
clist, noo
* Example 5: Variable contain measurements
***********************************************************************************
clear
import excel untidy_data.xlsx, sheet("varnames_contain_measurement") firstrow
rename (cholera C) (cholera2012 cholera2014)
egen regionid = group(region)
reshape long cholera@, i(region) j(year)
egen id = group(regionid year)
isid id
clist, noo
* Example 5: Variable contain measurements
***********************************************************************************
clear
import excel untidy_data.xlsx, sheet("inconsistent_data") firstrow
* Notice that everything imports as a string b/c of inconsistencies
d
clist, noo
* First, let's remove row 4 as this is duplicative of row 3
drop if project == "project3" & start_date == "01012014"
* now let's fix the start_dates
replace start_date = "01-01-16" if project == "project3"
replace start_date = "25-12-16" if project == "project2"
* Split out dates into components
split start_date, p(-) destring
rename (start_date1 start_date2 start_date3)(day month year)
replace year = year + 2000
drop start_date
* Use Stata's date function to convert the values to dates
* I always forgot how to do this so I visit: http://www.ats.ucla.edu/stat/stata/modules/dates.htm
g start_date = mdy(month, day, year)
format start_date %td
list
* Next, let's correct the funding amounts; First, strip out special characters
replace funding = subinstr(funding, "$", "",.)
replace funding = subinstr(funding, "M", "",.)
* Destring to convert to numeric values
g funding2 = real(funding)
replace funding2 = 75*1000000 if funding2 == 75
* Fix up the region names
tab region
replace region = "Afar" if regexm(region, "Affar")
* Label all of the variables
la var day "start date day"
la var month "start date month"
la var year "start date year"
la var start_date "start_date"
drop funding
ren funding2 funding
la var funding "total funding by project"
* Format the funding variable for readability below
format funding %14.0fc
* Create a unique id for projects
egen projectid = group(project)
clist, noo
* Example 7: Missing value not explicit
***********************************************************************************
clear
import excel untidy_data.xlsx, sheet("missing_values") firstrow
describe
* notice that the TB_cases variable read in as strings, "-" should be 0
* fix this up and then destring the cases;
* NOTE: Stata treats blank cells as missing values, "." represents a missing value;
replace TB_cases = subinstr(TB_cases, "-", "0",.)
g TB_cases_num = real(TB_cases)
misstable sum TB_cases_num
* Create a unique ID and label values
egen regionid = group(region)
la var TB_cases_num "tuberculosis cases"
order region regionid TB_cases_num cholera_cases
clist, noo
* Example 8: Not computer readable
***********************************************************************************
clear
import excel untidy_data.xlsx, sheet("not_comp_readable") firstrow
list
* Notice that all the cells are blank -- Stata doesn't read colors
rename (A projectstatus) (project status)
tostring status, replace
replace status = "kinda okay" if inlist(project, "project1", "project3")
replace status = "good" if project == "project2"
replace status = "really not okay" if project == "project4"
* Now, let's encode the status so it can be used as a factor
encode status, gen(status_enc)
tab status_enc, nolabel
tab status_enc
* create a unique project id
egen projectid = group(project)
isid projectid
la var project "project number"
clist, noo
* Example 9: Undocumented, vague data
***********************************************************************************
clear
import excel untidy_data.xlsx, sheet("undocumented") firstrow
clist, noo
* Need to add value labels to the region so we know what 5, 1, and 7 mean
label define reglab 1 "Oromia" 5 "Afar" 7 "Somali"
label values region reglab
tab region
tab region, nol
* Label variables
la var funding "funding in USD"
la var project "project numbeR"
la var region "Ethiopia region"
* Add a note about the funding variable
notes funding: funding amount is in 2015 USD
* create a unique project id
egen projectid = group(project)
isid projectid
clist, noo
capture log close