# non_image_data_processing.py (forked from epierson9/pain-disparities)
from constants_and_util import *
import os
import numpy as np  # np is used throughout; imported explicitly in case the star import above does not provide it
import pandas as pd
import copy
from scipy.stats import pearsonr
from collections import Counter
import datetime
class NonImageData():
"""
Class for loading the non-image data.
    Requires an argument specifying which split to use: train, val, test, or BLINDED_HOLD_OUT_DO_NOT_USE.
"""
def __init__(self,
what_dataset_to_use,
timepoints_to_filter_for,
seed_to_further_shuffle_train_test_val_sets=None,
i_promise_i_really_want_to_use_the_blinded_hold_out_set=False,
filter_out_special_values_in_mri_data=False):
"""
Load raw data, turn it into processed data, and do some validations. Checked.
Raw data was downloaded from https://ndar.nih.gov/oai/full_downloads.html
        Minor note: this method raises a "DtypeWarning: Columns (5) have mixed types." warning. This is caused by a column we do not use in a timepoint we do not use. It could be fixed by using
pd.read_csv('/dfs/dataset/tmp/20180910-OAI/data/emma_downloaded_oai_data_9112018/MRI MetaAnalysis_ASCII/MRI10.txt',
sep='|',
dtype={'V10MQCCMNT':str})
"""
assert what_dataset_to_use in ['train', 'val', 'test', 'BLINDED_HOLD_OUT_DO_NOT_USE', 'all']
if not i_promise_i_really_want_to_use_the_blinded_hold_out_set:
assert what_dataset_to_use not in ['BLINDED_HOLD_OUT_DO_NOT_USE', 'all'] # just a sanity check to make sure we don't accidentally use these.
self.seed_to_further_shuffle_train_test_val_sets = seed_to_further_shuffle_train_test_val_sets
self.what_dataset_to_use = what_dataset_to_use
self.clinical_base_dir = os.path.join(BASE_NON_IMAGE_DATA_DIR, 'AllClinical_ASCII')
self.semiquantitative_xray_dir = os.path.join(BASE_NON_IMAGE_DATA_DIR,
'X-Ray Image Assessments_ASCII',
'Semi-Quant Scoring_ASCII')
self.semiquantitative_mri_dir = os.path.join(BASE_NON_IMAGE_DATA_DIR,
'MRI Image Assessment_ASCII',
'Semi-Quant Scoring_ASCII')
self.xray_metadata_dir = os.path.join(BASE_NON_IMAGE_DATA_DIR, 'X-Ray MetaAnalysis_ASCII')
self.mri_metadata_dir = os.path.join(BASE_NON_IMAGE_DATA_DIR, 'MRI MetaAnalysis_ASCII')
self.original_dataframes = {} # store the original CSVs
self.processed_dataframes = {} # store the processed data
self.col_mappings = {}
self.missing_data_val = '.: Missing Form/Incomplete Workbook'
self.filter_out_special_values_in_mri_data = filter_out_special_values_in_mri_data
# From the OAI quantitative x-ray notes:
# The variable SIDE denotes whether the row of data is for a right side image (SIDE=1) or a left side image (SIDE=2)
self.side_mappings = {1:'right', 2:'left'}
if timepoints_to_filter_for is None:
self.timepoints_to_filter_for = TIMEPOINTS_TO_FILTER_FOR
print("Set timepoints to filter for to", TIMEPOINTS_TO_FILTER_FOR)
else:
self.timepoints_to_filter_for = timepoints_to_filter_for
# load various dataframes
self.load_clinical_data()
self.load_semiquantitative_xray_data()
self.load_xray_metadata()
self.load_semiquantitative_mri_data()
self.load_mri_metadata()
# make processed dataframes.
self.make_nonstandard_interventions_dataframe()
self.make_medications_dataframe()
self.make_400m_walk_dataframe()
self.make_redundant_knee_xray_variable_dataframe()
self.make_knee_pain_dataframe()
self.make_other_koos_subscores_dataframe()
self.make_per_person_controls_dataframe()
self.make_previous_injury_dataframe()
self.make_previous_surgery_dataframe()
self.make_previous_knee_replacement_dataframe()
self.make_bmi_dataframe()
self.make_drinking_and_smoking_dataframe()
self.make_medical_history_dataframe()
self.make_pain_dataframe_for_all_other_types_of_pain()
self.make_age_dataframe()
self.make_dominant_leg_dataframe()
self.make_previous_fracture_or_fall_dataframe()
self.make_processed_mri_data()
# some validation.
self.validate_processed_data()
self.validate_ids()
if self.what_dataset_to_use != 'all':
self.filter_for_correct_set()
self.filter_out_timepoints()
self.filter_out_visits_too_far_from_xray_imaging()
print("Successfully loaded non-image data.")
def filter_out_timepoints(self):
"""
Remove datapoints from processed dataframes if they're not in timepoints_to_filter_for.
"""
print("Filtering for timepoints", self.timepoints_to_filter_for)
for k in sorted(self.processed_dataframes.keys()):
if 'visit' in self.processed_dataframes[k].columns:
print("\nLength of %s prior to filtering: %i" % (k, len(self.processed_dataframes[k])))
assert pd.isnull(self.processed_dataframes[k]['visit']).sum() == 0
print("Values of visit prior to filtering", sorted(list(set(self.processed_dataframes[k]['visit']))))
if not all([a in list(set(self.processed_dataframes[k]['visit'].dropna())) for a in self.timepoints_to_filter_for]):
raise Exception("There is a problem with the visit column in %s: not all the timepoints we want are present." % k)
if not all([a in CLINICAL_WAVES_TO_FOLLOWUP.values() for a in list(set(self.processed_dataframes[k]['visit'].dropna()))]):
raise Exception("There is a problem with the visit column in %s: not all values in the column are valid visits." % k)
self.processed_dataframes[k] = self.processed_dataframes[k].loc[self.processed_dataframes[k]['visit'].map(
(lambda x:x in self.timepoints_to_filter_for))]
self.processed_dataframes[k].index = range(len(self.processed_dataframes[k]))
print("Length of %s after filtering: %i" % (k, len(self.processed_dataframes[k])))
print("Values of visit after filtering", sorted(list(set(self.processed_dataframes[k]['visit']))))
else:
print("Not filtering for visit for dataframe %s because no visit column" % k)
def filter_for_correct_set(self):
"""
        Make sure our data contains only IDs from the desired split (eg, the train set). Checked.
"""
print("Filtering for %s set." % self.what_dataset_to_use)
ids = make_train_val_test_hold_out_set(seed_to_further_shuffle_train_test_val_sets=self.seed_to_further_shuffle_train_test_val_sets)
ids = ids[self.what_dataset_to_use + '_ids']
self.all_ids = sorted(ids)
id_set = set(ids)
print('****Filtering unprocessed data for %s set.' % self.what_dataset_to_use)
for k in sorted(self.original_dataframes.keys()):
assert 'ID' not in self.original_dataframes[k].columns
if 'id' in self.original_dataframes[k].columns:
orig_length = len(self.original_dataframes[k])
self.original_dataframes[k] = self.original_dataframes[k].loc[self.original_dataframes[k]['id'].map(lambda x:x in id_set)]
print("After filtering, number of rows in %s goes from %i -> %i" % (k, orig_length, len(self.original_dataframes[k])))
assert orig_length != len(self.original_dataframes[k])
print('\n****Filtering processed data for %s set.' % self.what_dataset_to_use)
for k in sorted(self.processed_dataframes.keys()):
assert 'ID' not in self.processed_dataframes[k].columns
if 'id' in self.processed_dataframes[k].columns:
orig_length = len(self.processed_dataframes[k])
self.processed_dataframes[k] = self.processed_dataframes[k].loc[self.processed_dataframes[k]['id'].map(lambda x:x in id_set)]
print("After filtering, number of rows in %s goes from %i -> %i" % (k, orig_length, len(self.processed_dataframes[k])))
assert orig_length != len(self.processed_dataframes[k])
def validate_processed_data(self):
"""
Make sure there are no missing data values in the processed data. Checked.
"""
for k in self.processed_dataframes:
assert 'id' in self.processed_dataframes[k].columns
print("Validating id column for %s" % k)
assert pd.isnull(self.processed_dataframes[k]['id']).sum() == 0
if 'visit' in self.processed_dataframes[k].columns:
print("Validating visit column for %s" % k)
assert pd.isnull(self.processed_dataframes[k]['visit']).sum() == 0
assert self.processed_dataframes[k]['visit'].map(lambda x:x in CLINICAL_WAVES_TO_FOLLOWUP.values()).all()
if 'side' in self.processed_dataframes[k].columns:
print("Validating side column for %s" % k)
assert pd.isnull(self.processed_dataframes[k]['side']).sum() == 0
assert self.processed_dataframes[k]['side'].map(lambda x:x in ['left', 'right']).all()
for c in self.processed_dataframes[k].columns:
assert self.processed_dataframes[k][c].map(lambda x:str(x) == self.missing_data_val).sum() == 0
def load_all_text_files_in_directory(self, base_dir, datasets_to_skip):
"""
Given a base directory, and datasets to skip, loads in the relevant datasets to self.original_dataframes.
Column names + dataset names are stored in lowercase.
Checked.
"""
print("Base directory: %s" % base_dir)
skipped_datasets = [] # make sure we actually skipped all the datasets we want to skip.
for filename in sorted(os.listdir(base_dir)):
if filename[-4:] == '.txt':
dataset_name = filename.replace('.txt', '').lower()
if dataset_name in datasets_to_skip:
skipped_datasets.append(dataset_name)
continue
full_path = os.path.join(base_dir, filename)
d = pd.read_csv(full_path, sep='|')
d.columns = d.columns.map(lambda x:x.lower())
assert len(d.columns) == len(set(d.columns))
print("%s has %i columns, %i rows" % (filename, len(d.columns), len(d)))
assert dataset_name not in self.original_dataframes # don't add same dataset twice.
self.original_dataframes[dataset_name] = d
self.col_mappings[dataset_name] = {} # in case we want to map column names to anything else, this is a data dictionary.
assert sorted(datasets_to_skip) == sorted(skipped_datasets)
def concatenate_dataframes_from_multiple_timepoints(self, dataset_substring, columns_to_subset_on=None, visit_numbers_to_skip=None):
"""
Takes all datasets in original_dataframes that contain dataset_substring, takes the columns in columns_to_subset_on,
and adds a column called "visit" which denotes which visit it is.
Checked.
"""
print('Combining dataframes with substring %s' % dataset_substring)
dataframes_to_concatenate = []
expected_columns = None
for dataset_name in sorted(self.original_dataframes):
if dataset_substring in dataset_name:
visit_number = dataset_name.replace(dataset_substring, '') # this should be something like 00.
if visit_numbers_to_skip is not None and visit_number in visit_numbers_to_skip:
continue
visit = CLINICAL_WAVES_TO_FOLLOWUP[visit_number]
print("Adding visit=%s to dataframe %s" % (visit, dataset_name))
dataset_copy = copy.deepcopy(self.original_dataframes[dataset_name])
# make sure each field has a consistent prefix (eg, v00) indicating that it comes from the right timepoint.
# there are some exceptions: fields like id, and fields with p01 or p02, which indicate pre-enrollment measurements.
assert all(['v%s' % visit_number in a for a in dataset_copy.columns if a not in ['id', 'side', 'readprj', 'version'] and a[:3] not in ['p01', 'p02']])
dataset_copy.columns = dataset_copy.columns.map(lambda x:x.replace('v%s' % visit_number, ''))
# if desired, subset the columns.
if columns_to_subset_on is not None:
dataset_copy = dataset_copy[columns_to_subset_on]
# make sure columns stay consistent.
if expected_columns is None:
expected_columns = list(dataset_copy.columns)
else:
assert expected_columns == list(dataset_copy.columns)
dataset_copy['visit'] = visit
dataframes_to_concatenate.append(dataset_copy)
combined_data = pd.concat(dataframes_to_concatenate)
combined_data.index = range(len(combined_data))
print("Number of rows in combined data: %i" % len(combined_data))
return combined_data
def load_clinical_data(self):
print("\n***Loading all clinical data.")
# skip allclinical02 and allclinical04 because they have very little data.
self.load_all_text_files_in_directory(self.clinical_base_dir, datasets_to_skip=['allclinical02', 'allclinical04'])
def map_to_date(self, x):
        # sometimes X-ray dates are missing because, as documentation notes,
        # "In addition, x-ray date and all QC variables have been set to missing (.A for numeric variables,
        # blank for text variables) when an x-ray was acquired, but is not available."
# So this date is fairly often NA. But that's okay, because that only occurs (confirmed this)
# if the ACCEPT variable is NA anyway, so the data gets filtered out subsequently in find_image_barcodes_that_pass_qc
if x is not None and str(x) != 'nan':
return datetime.datetime.strptime(x, '%m/%d/%Y')
return None
def filter_out_visits_too_far_from_xray_imaging(self):
print("\n\n***Filtering out visits too far from x-rays.")
THRESHOLD_IN_DAYS = 90
visits_to_bad_ids = {}
for visit_substring in ['00', '01', '03', '05', '06']:
allclinical_df = copy.deepcopy(self.original_dataframes['allclinical%s' % visit_substring])
xray_df = copy.deepcopy(self.original_dataframes['xray%s' % visit_substring])
xray_df = xray_df.loc[xray_df['v%sexamtp' % visit_substring] == 'Bilateral PA Fixed Flexion Knee']
xray_date_dict = dict(zip(xray_df['id'].values,
xray_df['v%sxrdate' % visit_substring].values))
def return_date_in_dict_if_possible(xray_date_dict, x):
if x in xray_date_dict:
return xray_date_dict[x]
else:
print("Warning! ID %i not in dict." % x) # this happens only once.
return '01/01/1900'
allclinical_df['v%sxrdate' % visit_substring] = allclinical_df['id'].map(lambda x:return_date_in_dict_if_possible(xray_date_dict, x))
# xrdate: Date x-ray completed (calc).
# p01svdate: Date Screening Visit completed.
# v00evdate: Date Enrollment Visit completed.
# v01fvdate: Follow-up visit date.
if visit_substring == '00':
all_date_cols = ['p01svdate', 'v00evdate', 'v00xrdate']
else:
all_date_cols = ['v%sfvdate' % visit_substring, 'v%sxrdate' % visit_substring]
print("\n\n%s visit" % CLINICAL_WAVES_TO_FOLLOWUP[visit_substring])
# At followup, there are some people missing dates for x-rays because they didn't have them.
# We don't filter them out at this stage because they are filtered out subsequently.
# We do verify that a) very few people are missing any date data at the initial timepoint (00) and
# b) everyone missing allclinical data is also missing x-ray data, so should be filtered out.
if visit_substring != '00':
xr_missing_date = pd.isnull(allclinical_df['v%sxrdate' % visit_substring].map(lambda x:self.map_to_date(x)))
allclinical_missing_date = pd.isnull(allclinical_df['v%sfvdate' % visit_substring].map(lambda x:self.map_to_date(x)))
assert (allclinical_missing_date & (~xr_missing_date)).sum() == 0 # make sure there's no one who has x-rays without coming in for followup in allclinical.
else:
for k in all_date_cols:
assert pd.isnull(allclinical_df[k].map(lambda x:self.map_to_date(x))).mean() < .005
bad_ids = None
assert len(set(allclinical_df['id'])) == len(allclinical_df)
for i in range(len(all_date_cols)):
print("Fraction of date column %s which cannot be mapped to a date: %2.3f" %
(all_date_cols[i],
pd.isnull(allclinical_df[all_date_cols[i]].map(lambda x:self.map_to_date(x))).mean()))
for j in range(i):
print('***gaps between %s and %s' % (all_date_cols[i], all_date_cols[j]))
days_between = np.abs((allclinical_df[all_date_cols[i]].map(lambda x:self.map_to_date(x)) -
allclinical_df[all_date_cols[j]].map(lambda x:self.map_to_date(x))).map(lambda x:x.days))
print("Mean: %2.3f; median %2.3f; greater than 30 days %2.3f; greater than 60 days %2.3f; greater than 90 days %2.5f; missing data %2.5f" % (
days_between.mean(),
days_between.median(),
(days_between > 30).mean(),
(days_between > 60).mean(),
(days_between > 90).mean(),
np.isnan(days_between).mean()))
if bad_ids is None:
bad_ids = set(allclinical_df.loc[days_between > THRESHOLD_IN_DAYS, 'id'].values)
else:
bad_ids = bad_ids.union(set(allclinical_df.loc[days_between > THRESHOLD_IN_DAYS, 'id'].values))
visits_to_bad_ids[visit_substring] = bad_ids
print("Total number of IDs filtered out for visit: %i/%i" % (len(bad_ids), len(allclinical_df)))
self.visits_too_far_from_xray_screening = visits_to_bad_ids
for k in self.processed_dataframes:
if 'visit' in self.processed_dataframes[k].columns:
rows_to_filter_out = None
for visit in self.visits_too_far_from_xray_screening:
bad_rows_for_visit = (self.processed_dataframes[k]['id'].map(lambda x:x in self.visits_too_far_from_xray_screening[visit]) &
(self.processed_dataframes[k]['visit'] == CLINICAL_WAVES_TO_FOLLOWUP[visit]))
if rows_to_filter_out is None:
rows_to_filter_out = bad_rows_for_visit
else:
rows_to_filter_out = rows_to_filter_out | bad_rows_for_visit
self.processed_dataframes[k] = self.processed_dataframes[k].loc[~rows_to_filter_out]
print("For dataframe %s, filtered out %i/%i rows as too far from x-ray date" % (k, rows_to_filter_out.sum(), len(rows_to_filter_out)))
def make_drinking_and_smoking_dataframe(self):
"""
Risk factors at baseline.
"""
df = copy.deepcopy(self.original_dataframes['allclinical00'])
# cigarette smoking.
df['cigarette_smoker'] = df['v00smoker']
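        # collapse '3: Current, but never regular' smokers into the '1: Current' category.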
df.loc[df['cigarette_smoker'] == '3: Current, but never regular', 'cigarette_smoker'] = '1: Current'
df.loc[df['cigarette_smoker'] == self.missing_data_val, 'cigarette_smoker'] = None
print('Cigarette smoker: ', Counter(df['cigarette_smoker']))
# drinks per week
df['drinks_per_week'] = df['v00drnkamt']
df.loc[df['drinks_per_week'] == self.missing_data_val, 'drinks_per_week'] = None
print('Drinks per week: ', Counter(df['drinks_per_week']))
self.processed_dataframes['drinking_and_smoking'] = df[['id', 'drinks_per_week', 'cigarette_smoker']]
def make_medical_history_dataframe(self):
"""
Used to replicate David's regressions as a sanity check, but not actually for any analysis in the paper.
        Someone is coded as 1 if they report having a disease prior to the timepoint;
        as missing if they are missing disease data at baseline and don't report having it subsequently;
        and as 0 otherwise.
Not entirely sure this is the right way to do this. There's a lot of missing data for RA at baseline. Regarding RA: people are supposed to be excluded if they have it for sure. But I guess v00ra may or may not indicate RA, as defined by the study -- perhaps they think some people are giving unreliable answers, and that accounts for the missing data?
"Participants who report that a doctor has told them they have RA, SLE, psoriatic arthritis, ankylosing spondylitis or another inflammatory arthritis will be asked about use of specific medications that are used primarily for RA and other forms of inflammatory arthritis: e.g. gold, methotrexate, etanercept, infliximab, leflunamide, plaquenil, etc. If the person has ever used any of these medications, they will be excluded. If the participant reports having RA or inflammatory arthritis but none of these medications have been used, they will be asked about symptoms of RA and excluded if the responses are suggestive of RA"
This includes a couple of other covariates David actually doesn't use in his regression.
"""
print("\n\n***Making dataframe of medical history.")
all_dfs = []
medical_conditions = ['hrtat', 'hrtfail', 'bypleg', 'stroke', 'asthma', 'lung',
'ulcer', 'diab', 'kidfxn', 'ra', 'polyrh', 'livdam', 'cancer']
        # we omit ALZDZ even though it's in David's script because it doesn't appear to be in our data.
all_ids = list(self.original_dataframes['allclinical00']['id'])
has_disease = {}
nas_at_baseline = {}
for condition in medical_conditions:
has_disease[condition] = set([])
nas_at_baseline[condition] = set([])
for visit in WAVES_WE_ARE_USING:
df = copy.deepcopy(self.original_dataframes['allclinical%s' % visit])
for condition in medical_conditions:
if visit == '00':
has_disease_idxs = df['v%s%s' % (visit, condition)] == '1: Yes'
self.validate_col(df['v%s%s' % (visit, condition)], ['1: Yes', '0: No', self.missing_data_val])
nas_at_baseline_idxs = df['v%s%s' % (visit, condition)] == self.missing_data_val
nas_at_baseline[condition] = set(df.loc[nas_at_baseline_idxs, 'id'])
print('Proportion missing data for %-10s at visit 00: %2.3f' % (condition, nas_at_baseline_idxs.mean()))
elif visit in ['03', '06']:
has_disease_idxs = df['v%s%s' % (visit, condition)] == 1.0
self.validate_col(df['v%s%s' % (visit, condition)], [0, 1])
print("Proportion missing data for %-10s at visit %s: %2.3f" % (condition, visit, pd.isnull(df['v%s%s' % (visit, condition)]).mean()))
else:
# unfortunately, don't appear to have data for these visits.
continue
has_disease_ids = set(df.loc[has_disease_idxs, 'id'])
has_disease[condition] = has_disease[condition].union(has_disease_ids)
df_for_visit = pd.DataFrame({'id':all_ids, 'visit':CLINICAL_WAVES_TO_FOLLOWUP[visit]})
for condition in medical_conditions:
has_disease_idxs = df_for_visit['id'].map(lambda x:x in has_disease[condition])
df_for_visit[condition] = has_disease_idxs.values * 1.
nas_at_baseline_idxs = df_for_visit['id'].map(lambda x:x in nas_at_baseline[condition])
df_for_visit.loc[nas_at_baseline_idxs & (~has_disease_idxs), condition] = None
all_dfs.append(df_for_visit)
combined_df = pd.concat(all_dfs)
combined_df.index = range(len(combined_df))
print(combined_df.groupby('visit').mean())
self.processed_dataframes['medical_history'] = combined_df
def make_previous_fracture_or_fall_dataframe(self):
"""
        Fractures are cumulatively defined: someone is coded as 1 if they report having a fracture prior to the timepoint.
Defined as missing if they are missing data at baseline and don't report having it subsequently.
Defined as false otherwise.
Falls occur in the last 12 months and are thus not cumulatively defined.
"""
print("Making fracture and fall dataframe!")
all_ids = list(self.original_dataframes['allclinical00']['id'])
have_fracture = {}
nas_at_baseline = {}
all_dfs = []
for condition in ['fractured_bone', 'fractured_hip', 'fractured_spine']:
have_fracture[condition] = set([])
nas_at_baseline[condition] = set([])
for visit in WAVES_WE_ARE_USING:
# get the DF we need data from
df = copy.deepcopy(self.original_dataframes['allclinical%s' % visit])
# construct df for visit.
df_for_visit = pd.DataFrame({'id':all_ids})
df_for_visit['visit'] = CLINICAL_WAVES_TO_FOLLOWUP[visit]
# Do falls. This is different from fractures because it's non-cumulative.
fall_col = 'v%sfall' % visit
if visit in ['00', '01']:
self.validate_col(df[fall_col], ['1: Yes', '0: No', self.missing_data_val])
fell_ids = set(df.loc[df[fall_col] == '1: Yes', 'id'].values)
fall_missing_data_ids = set(df.loc[df[fall_col] == self.missing_data_val, 'id'].values)
else:
fell_ids = set(df.loc[df[fall_col] == 1.0, 'id'].values)
self.validate_col(df[fall_col], [0, 1])
fall_missing_data_ids = set(df.loc[pd.isnull(df[fall_col]), 'id'].values)
df_for_visit['fell_in_last_12_months'] = df_for_visit['id'].map(lambda x:x in fell_ids)
df_for_visit.loc[df_for_visit['id'].map(lambda x:x in fall_missing_data_ids), 'fell_in_last_12_months'] = None
# Do fractures.
got_fracture_at_timepoint = {}
for condition in have_fracture.keys():
got_fracture_at_timepoint[condition] = set([])
if condition == 'fractured_bone':
if visit == '00':
col = 'v00bonefx'
else:
col = 'v%sbonfx' % visit
if visit in ['01', '00']:
got_fracture_at_timepoint[condition] = df.loc[df[col] == '1: Yes', 'id'].values
self.validate_col(df[col], ['1: Yes', '0: No', self.missing_data_val])
else:
got_fracture_at_timepoint[condition] = df.loc[df[col] == 1.0, 'id'].values
self.validate_col(df[col], [0, 1])
if visit == '00':
nas_at_baseline[condition] = df.loc[df[col] == self.missing_data_val, 'id'].values
elif condition == 'fractured_hip':
if visit == '00':
col = 'v00hipfx'
got_fracture_at_timepoint[condition] = df.loc[df[col] == '1: Yes', 'id'].values
nas_at_baseline[condition] = df.loc[df[col] == self.missing_data_val, 'id'].values
self.validate_col(df[col], ['1: Yes', '0: No', self.missing_data_val])
else:
# can't find hip fracture data at subsequent timepoints.
continue
elif condition == 'fractured_spine':
if visit == '00':
col = 'v00spnfx'
else:
col = 'v%sbonfx6' % visit
if visit in ['01', '00']:
got_fracture_at_timepoint[condition] = df.loc[df[col] == '1: Yes', 'id'].values
self.validate_col(df[col], ['1: Yes', '0: No', self.missing_data_val])
else:
got_fracture_at_timepoint[condition] = df.loc[df[col] == 1.0, 'id'].values
self.validate_col(df[col], [0, 1])
if visit == '00':
nas_at_baseline[condition] = df.loc[df[col] == self.missing_data_val, 'id'].values
else:
raise Exception("not a valid disease")
for condition in have_fracture.keys():
have_fracture[condition] = have_fracture[condition].union(got_fracture_at_timepoint[condition])
df_for_visit[condition] = df_for_visit['id'].map(lambda x:x in have_fracture[condition])
na_idxs = df_for_visit['id'].map(lambda x:x in nas_at_baseline[condition] )
df_for_visit.loc[na_idxs & (~df_for_visit[condition]), condition] = None
all_dfs.append(df_for_visit)
combined_df = pd.concat(all_dfs)
combined_df.index = range(len(combined_df))
print("Average values by visit")
print(combined_df[[a for a in combined_df.columns if a != 'id']].groupby('visit').mean())
print("NAs by visit")
print(combined_df[[a for a in combined_df.columns if a != 'id']].groupby('visit').agg(lambda x:np.mean(pd.isnull(x))))
self.processed_dataframes['fractures_and_falls'] = combined_df
def make_400m_walk_dataframe(self):
"""
Stats about how quickly they can walk. Only have data for three timepoints.
"""
walk_cols = ['400mtr', '400excl', '400mcmp', '400mtim']
walk_df = self.concatenate_dataframes_from_multiple_timepoints(dataset_substring='allclinical',
columns_to_subset_on=['id'] + walk_cols,
visit_numbers_to_skip=['01', '05', '07', '08', '09','10', '11'])
ids = sorted(list(set(walk_df['id'])))
print(Counter(walk_df['400excl'].dropna()))
print(Counter(walk_df['400mcmp'].dropna()))
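        # Note: with the string comparisons below, NaN ('nan') maps to True for 400excl
        # (counted as excluded) and to False for 400mcmp (counted as not completing the test).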
walk_df['400excl'] = walk_df['400excl'].map(lambda x:str(x) not in ['0.0', '0: Not excluded'])
walk_df['400mcmp'] = walk_df['400mcmp'].map(lambda x:str(x) in ['1.0', '1: Completed test without stopping'])
print("After processing")
print(Counter(walk_df['400excl'].dropna()))
print(Counter(walk_df['400mcmp'].dropna()))
for c in walk_df.columns:
assert (walk_df[c].astype(str) == self.missing_data_val).sum() == 0
print(walk_df.head())
# Add timepoints for '01' and '05' for consistency with other processing (just fill out other columns with None).
for timepoint in ['01', '05']:
timepoint_df = pd.DataFrame({'id':ids, 'visit':CLINICAL_WAVES_TO_FOLLOWUP[timepoint]})
for col in walk_cols:
timepoint_df[col] = None
timepoint_df = timepoint_df[walk_df.columns]
walk_df = pd.concat([walk_df, timepoint_df])
self.processed_dataframes['400m_walk'] = walk_df
def make_redundant_knee_xray_variable_dataframe(self):
"""
        A couple of extra variables that Sendhil noticed at baseline and wanted to pull just in case.
"""
cols = ['P01SV%sKOST', 'P01SV%sKJSL', 'P01SV%sKJSM']
new_col_names = ['knee_osteophytes',
'knee_lateral_joint_space_narrowing',
'knee_medial_joint_space_narrowing']
cols = [col.lower() for col in cols]
left_cols = [col % 'l' for col in cols]
right_cols = [col % 'r' for col in cols]
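        # Reshape from wide (left/right columns side by side) to long format: one row per knee, tagged with a 'side' column.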
left_df = self.original_dataframes['allclinical00'][['id'] + left_cols].copy()
right_df = self.original_dataframes['allclinical00'][['id'] + right_cols].copy()
left_df.columns = ['id'] + new_col_names
right_df.columns = ['id'] + new_col_names
left_df['side'] = 'left'
right_df['side'] = 'right'
redundant_knee_xray_clinical_features = pd.concat([left_df, right_df])
redundant_knee_xray_clinical_features.index = range(len(redundant_knee_xray_clinical_features))
for c in new_col_names:
if c == 'id':
continue
print(c)
assert pd.isnull(redundant_knee_xray_clinical_features[c]).sum() == 0
redundant_knee_xray_clinical_features.loc[
redundant_knee_xray_clinical_features[c] == self.missing_data_val,
c] = None
print(redundant_knee_xray_clinical_features[c].value_counts())
print("Missing data fraction: %2.3f" % pd.isnull(redundant_knee_xray_clinical_features[c]).mean())
self.processed_dataframes['redundant_knee_xray_clinical_features'] = redundant_knee_xray_clinical_features
def make_dominant_leg_dataframe(self):
"""
Checked.
        Don't use timepoint info (ie, we define this using allclinical00 only) because there is a lot of missing data at
        subsequent timepoints and there seem to be causality problems.
"""
print("\n\n***Making dominant leg dataframe")
right_leg_df = copy.deepcopy(self.original_dataframes['allclinical00'][['id', 'v00kikball']])
right_leg_df.columns = ['id', 'dominant_leg']
missing_data_idxs = (right_leg_df['dominant_leg'] == self.missing_data_val).values
left_leg_df = copy.deepcopy(right_leg_df)
right_leg_df['dominant_leg'] = right_leg_df['dominant_leg'].map(lambda x:'right' in x.lower())
left_leg_df['dominant_leg'] = left_leg_df['dominant_leg'].map(lambda x:'left' in x.lower())
left_leg_df.loc[missing_data_idxs, 'dominant_leg'] = None
right_leg_df.loc[missing_data_idxs, 'dominant_leg'] = None
left_leg_df['side'] = 'left'
right_leg_df['side'] = 'right'
combined_df = pd.concat([left_leg_df, right_leg_df])
combined_df.index = range(len(combined_df))
print(combined_df[['side', 'dominant_leg']].groupby('side').agg(['mean', 'size']))
print("Missing data: %2.3f" % pd.isnull(combined_df['dominant_leg']).mean())
self.processed_dataframes['dominant_leg'] = combined_df
def make_bmi_dataframe(self):
"""
Computes current and max BMI as categorical variables. Only uses baseline numbers.
Checked.
"""
print("\n\nComputing current amd max BMI.")
current_weight_col = 'p01weight'
max_weight_col = 'v00wtmaxkg'
current_height_col = 'p01height'
desired_cols = ['id'] + [current_weight_col, max_weight_col, current_height_col]
bmi_df = copy.deepcopy(self.original_dataframes['allclinical00'][desired_cols])
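        # Weights are in kg and heights appear to be recorded in mm, hence the /1000 below to get meters (BMI = kg / m^2).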
bmi_df['current_bmi'] = bmi_df[current_weight_col] / ((bmi_df[current_height_col] / 1000.) ** 2)
bmi_df['max_bmi'] = bmi_df[max_weight_col] / ((bmi_df[current_height_col] / 1000.) ** 2)
bmi_df = bmi_df[['id', 'current_bmi', 'max_bmi']]
def map_bmi_to_david_cats(x):
if x < 18.5:
return '<18.5'
elif x < 25:
return '18.5-25'
elif x < 30:
return '25-30'
elif x < 35:
return '30-35'
elif x >= 35:
return '>=35'
else:
return None
bmi_not_nan = (~pd.isnull(bmi_df['current_bmi'])) & (~pd.isnull(bmi_df['max_bmi']))
bmi_max_smaller_than_current = bmi_not_nan & (bmi_df['current_bmi'] > bmi_df['max_bmi'])
print('Warning: proportion %2.3f of rows have current BMI > max BMI. Setting max to current.' %
              bmi_max_smaller_than_current.mean())  # likely caused by the fact that max BMI is self-reported, while current BMI is presumably measured at the site.
bmi_df.loc[bmi_max_smaller_than_current, 'max_bmi'] = bmi_df.loc[bmi_max_smaller_than_current, 'current_bmi'].values
assert (bmi_not_nan & (bmi_df['current_bmi'] > bmi_df['max_bmi'])).sum() == 0
print(bmi_df[['current_bmi', 'max_bmi']].describe())
bmi_df['current_bmi'] = bmi_df['current_bmi'].map(map_bmi_to_david_cats)
bmi_df['max_bmi'] = bmi_df['max_bmi'].map(map_bmi_to_david_cats)
print('Counts of values for current BMI are', Counter(bmi_df['current_bmi']))
print('Counts of values for max BMI are', Counter(bmi_df['max_bmi']))
self.processed_dataframes['bmi'] = bmi_df
def make_previous_knee_replacement_dataframe(self):
print("\n\nComputing previous knee replacements/arthroplasties")
# "ever have replacement where all or part of joint was replaced"
self.processed_dataframes['knee_replacement'] = self.make_previous_injury_or_surgery_dataframe(
baseline_substring='krs',
followup_substring='krs',
col_name='knee_replacement',
set_missing_baseline_to_0=True,
waves_to_skip='06'
)
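        # Wave '06' is skipped above (the krs column does not appear to be available there);
        # the 36-month values are carried forward as the 48-month rows below.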
df_to_concat = self.processed_dataframes['knee_replacement'].loc[self.processed_dataframes['knee_replacement']['visit'] == '36 month follow-up'].copy()
df_to_concat['visit'] = '48 month follow-up'
self.processed_dataframes['knee_replacement'] = pd.concat([self.processed_dataframes['knee_replacement'], df_to_concat])
self.processed_dataframes['knee_replacement'].index = range(len(self.processed_dataframes['knee_replacement']))
def make_previous_injury_dataframe(self):
print("\n\nComputing previous injuries to knees!")
self.processed_dataframes['knee_injury'] = self.make_previous_injury_or_surgery_dataframe(
baseline_substring='inj',
followup_substring='inj',
col_name='knee_injury')
def make_previous_surgery_dataframe(self):
print("\n\nComputing previous surgeries to knees!")
self.processed_dataframes['knee_surgery'] = self.make_previous_injury_or_surgery_dataframe(
baseline_substring='ksurg',
followup_substring='ksrg',
col_name='knee_surgery')
def make_age_dataframe(self):
print("\n\n***Creating combined age dataframe")
combined_df = []
for visit in WAVES_WE_ARE_USING:
age_df = copy.deepcopy(self.original_dataframes['allclinical%s' % visit][['id', 'v%sage' % visit]])
age_df.columns = ['id', 'age_at_visit']
age_df['visit'] = CLINICAL_WAVES_TO_FOLLOWUP[visit]
combined_df.append(age_df)
def convert_age_to_categorical_variable(age):
assert not (age < 45)
assert not (age > 85)
if age < 50 and age >= 45:
return '45-49'
if age < 55:
return '50-54'
if age < 60:
return '55-59'
if age < 65:
return '60-64'
if age < 70:
return '65-69'
if age < 75:
return '70-74'
if age < 80:
return '75-79'
if age < 85:
return '80-84'
assert np.isnan(age)
return None
combined_df = pd.concat(combined_df)
combined_df['age_at_visit'] = combined_df['age_at_visit'].map(convert_age_to_categorical_variable)
print(Counter(combined_df['age_at_visit']))
self.processed_dataframes['age_at_visit'] = combined_df
def make_other_pain_dataframe(self, type_of_pain):
"""
Helper method to make the combined pain dataframe.
Returns things as strings.
"""
assert type_of_pain in ['hip', 'back',
'foot', 'ankle', 'shoulder', 'elbow', 'wrist', 'hand']
combined_df = []
for visit in WAVES_WE_ARE_USING:
# first have to identify cols of interest.
if type_of_pain == 'hip':
if visit == '00':
cols_of_interest = ['p01hp%s12cv' % side for side in ['l', 'r']]
else:
cols_of_interest = ['v%shp%s12cv' % (visit, side) for side in ['l', 'r']]
col_names_to_use = ['id',
'left_hip_pain_more_than_half_of_days',
'right_hip_pain_more_than_half_of_days']
elif type_of_pain == 'back':
if visit == '00':
cols_of_interest = ['p01bp30oft']
else:
cols_of_interest = ['v%sbp30oft' % visit]
col_names_to_use = ['id', 'how_often_bothered_by_back_pain']
elif type_of_pain in ['foot', 'ankle', 'shoulder', 'elbow', 'wrist', 'hand']:
pain_abbrv = type_of_pain[0]
if visit == '00':
cols_of_interest = ['p01ojpn%s%s' % (side, pain_abbrv) for side in ['l', 'r']]
else:
cols_of_interest = ['v%sojpn%s%s' % (visit, side, pain_abbrv) for side in ['l', 'r']]
col_names_to_use = ['id',
'left_%s_pain_more_than_half_of_days' % type_of_pain,
'right_%s_pain_more_than_half_of_days' % type_of_pain]
else:
raise Exception("Your pain is invalid :(")
# select columns.
pain_df = copy.deepcopy(self.original_dataframes['allclinical%s' % visit][['id'] + cols_of_interest])
# do mapping.
if type_of_pain == 'hip':
if visit == '00' or visit == '01':
for col in cols_of_interest:
self.validate_col(pain_df[col], ['1: Yes', '0: No', self.missing_data_val])
else:
for col in cols_of_interest:
self.validate_col(pain_df[col], [0, 1])
pain_df[col] = pain_df[col].replace({np.nan:self.missing_data_val,
1:'1: Yes',
0:'0: No'}).astype(str)
for col in cols_of_interest:
self.validate_col(pain_df[col], [self.missing_data_val, '1: Yes', '0: No'])
elif type_of_pain == 'back':
if visit == '00' or visit == '01':
for col in cols_of_interest:
self.validate_col(pain_df[col], ['1: Some of the time', '0: Rarely',
'2: Most of the time', '3: All of the time', self.missing_data_val])
else:
for col in cols_of_interest:
self.validate_col(pain_df[col], [0, 1, 2, 3])
pain_df[col] = pain_df[col].replace({1:'1: Some of the time',
0:'0: Rarely',
2:'2: Most of the time',
3:'3: All of the time',
np.nan:self.missing_data_val}).astype(str)
for col in cols_of_interest:
self.validate_col(pain_df[col], ['0: Rarely', '1: Some of the time', '2: Most of the time', '3: All of the time', self.missing_data_val])
elif type_of_pain in ['foot', 'ankle', 'shoulder', 'elbow', 'wrist', 'hand']:
if visit == '00' or visit == '01':
for col in cols_of_interest:
self.validate_col(pain_df[col], ['1: Yes', '0: No', self.missing_data_val])
else:
for col in cols_of_interest:
self.validate_col(pain_df[col], [0, 1])
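                        # These columns appear to be checkbox-coded at later visits (only 1 or NaN,
                        # no explicit 0 in practice), so the remap below handles only 1 and missing.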
pain_df[col] = pain_df[col].replace({None:self.missing_data_val,
1:'1: Yes'}).astype(str)
for col in cols_of_interest:
self.validate_col(pain_df[col], [self.missing_data_val, '1: Yes'])
pain_df.columns = col_names_to_use
pain_df['visit'] = CLINICAL_WAVES_TO_FOLLOWUP[visit]
combined_df.append(pain_df)
combined_df = pd.concat(combined_df)
combined_df.index = range(len(combined_df))
# Set missing values to None for consistency with the rest of data processing.
for col in combined_df.columns:
if col == 'visit' or col == 'id':
continue
assert type(combined_df[col].iloc[0]) is str
            assert pd.isnull(combined_df[col]).sum() == 0  # no nulls expected in the concatenated frame before remapping the missing sentinel
print("Setting values of %s in column %s to None" % (self.missing_data_val, col))
combined_df.loc[combined_df[col] == self.missing_data_val, col] = None
return combined_df
def make_nonstandard_interventions_dataframe(self):
"""
        Make dataframe of 0-1 indicators of whether someone has had other interventions for pain
        which are not standard in medical practice.
"""
print("Processing interventions data")
interventions = ["V00ACUTCV", "V00ACUSCV", "V00CHELCV", "V00CHIRCV",
"V00FOLKCV", "V00HOMECV", "V00MASSCV", "V00DIETCV",
"V00VITMCV", "V00RUBCV", "V00CAPSNCV", "V00BRACCV",
"V00YOGACV", "V00HERBCV", "V00RELACV", "V00SPIRCV",
"V00OTHCAMC", "V00OTHCAM"]
cols = ['id'] + [a.lower() for a in interventions]
df = self.original_dataframes['allclinical00'][cols].copy()
for c in df.columns:
if c != 'id':
self.validate_col(df[c], ['0: No', '1: Yes', self.missing_data_val])
                nan_idxs = df[c].map(lambda x:x == self.missing_data_val).values  # exact match against the missing-data sentinel
intervention_idxs = df[c] == '1: Yes'
df[c] = 0.
df.loc[intervention_idxs, c] = 1.
df.loc[nan_idxs, c] = None
print("Missing data")
print(df.agg(lambda x:np.mean(pd.isnull(x))))
print("Fraction with other interventions")
print(df.mean())
self.processed_dataframes['nonstandard_interventions'] = df
def make_medications_dataframe(self):
"""
        Make dataframe of 0-1 indicators of whether someone is taking each medication.
"""
print("Processing medications data")
medications = ["V00RXACTM", "V00RXANALG", "V00RXASPRN", "V00RXBISPH",
"V00RXCHOND", "V00RXCLCTN", "V00RXCLCXB", "V00RXCOX2",
"V00RXFLUOR", "V00RXGLCSM", "V00RXIHYAL", "V00RXISTRD",
"V00RXMSM", "V00RXNARC", "V00RXNSAID", "V00RXNTRAT",
"V00RXOSTRD", "V00RXOTHAN", "V00RXRALOX", "V00RXRFCXB",
"V00RXSALIC", "V00RXSAME", "V00RXTPRTD", "V00RXVIT_D", "V00RXVLCXB"]
medications = [a.replace('V00', '').lower() for a in medications]
med_df = self.concatenate_dataframes_from_multiple_timepoints(dataset_substring='allclinical',
columns_to_subset_on=['id'] + medications,
visit_numbers_to_skip=['07', '08', '09', '10', '11'])
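        # Codings are mixed across waves (strings like '1: Used in last 30 days' at some visits,
        # numeric 0/1 at others); both are normalized below to 0./1. with missing values set to None.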
for c in med_df.columns:
if c != 'id' and c != 'visit':
self.validate_col(med_df[c].map(lambda x:str(x)), ['1.0', '0.0',
'0: Not used in last 30 days',
'1: Used in last 30 days',
self.missing_data_val,
'nan'])
nan_idxs = med_df[c].map(lambda x:str(x) in [self.missing_data_val, 'nan']).values
took_idxs = med_df[c].map(lambda x:str(x) in ['1: Used in last 30 days', '1.0']).values
med_df[c] = 0.
med_df.loc[took_idxs, c] = 1.
med_df.loc[nan_idxs, c] = None
print("Missing data")
print(med_df.groupby('visit').agg(lambda x:np.mean(pd.isnull(x))))
print("Fraction taking medication")
print(med_df.groupby('visit').mean())
self.processed_dataframes['medications'] = med_df
def make_pain_dataframe_for_all_other_types_of_pain(self):
print("\n\n\n***Creating dataframe for all other types of pain")
for i, other_type_of_pain in enumerate(['hip', 'back',
'foot', 'ankle', 'shoulder', 'elbow', 'wrist', 'hand']):
if i == 0:
combined_pain_df = self.make_other_pain_dataframe(other_type_of_pain)
original_len = len(combined_pain_df)
else:
combined_pain_df = pd.merge(combined_pain_df,
self.make_other_pain_dataframe(other_type_of_pain),
how='inner',
on=['id', 'visit'])
assert len(combined_pain_df) == original_len
                assert len(combined_pain_df[['id', 'visit']].drop_duplicates()) == original_len
print("Missing data by timepoint")
print(combined_pain_df.groupby('visit').agg(lambda x:np.mean(pd.isnull(x))))
self.processed_dataframes['other_pain'] = combined_pain_df
def validate_col(self, col, expected_values):
if not (col.dropna().map(lambda x:x not in expected_values).sum() == 0):
print("Error: unexpected value in column. Expected values:")
print(expected_values)
print("Actual values")
print(sorted(list(set(col.dropna()))))
assert False
def make_previous_injury_or_surgery_dataframe(self, baseline_substring, followup_substring, col_name, set_missing_baseline_to_0=False, waves_to_skip=None):
"""
While the code in this method refers to "injury", we actually use it to define both injuries + surgeries.
baseline_substring identifies the column used in allclinical00
followup_substring identifies the column in subsequent clinical dataframes
col_name is the name we want to give the column.
Set someone to True if they report an injury at any previous timepoint.
Set them to NA if they don't report an injury and are missing data for the first timepoint
Set them to False otherwise.
        (some people are missing followup data, so we may have a few false negatives who had an injury but did not report it; the number should be small).
Checked.
"""
ids_who_report_injury_at_any_timepoint = {'left':set([]), 'right':set([])}
ids_with_nas_at_first_timepoint = {'left':set([]), 'right':set([])}
all_dfs = []
if waves_to_skip is None:
waves_to_skip = []
for visit in WAVES_WE_ARE_USING:
if visit in waves_to_skip:
continue
if visit == '00':
left_col = 'p01%sl' % baseline_substring
right_col = 'p01%sr' % baseline_substring
else:
left_col = 'v%s%sl12' % (visit, followup_substring)
right_col = 'v%s%sr12' % (visit, followup_substring)
df_to_use = copy.deepcopy(self.original_dataframes['allclinical%s' % visit][['id', left_col, right_col]])
df_to_use.columns = ['id', 'left_side', 'right_side']
assert len(set(df_to_use['id'])) == len(df_to_use)
df_to_use['visit'] = CLINICAL_WAVES_TO_FOLLOWUP[visit]
if visit == '00':
all_ids = set(df_to_use['id'])
else:
assert set(df_to_use['id']) == all_ids
dfs_by_knee = {}
for side in ['left', 'right']:
dfs_by_knee[side] = copy.deepcopy(df_to_use[['id', 'visit', '%s_side' % side]])
dfs_by_knee[side].columns = ['id', 'visit', col_name]
dfs_by_knee[side]['side'] = side
# map to bools.
if visit == '00' or visit == '01':
self.validate_col(dfs_by_knee[side][col_name], ['1: Yes', '0: No', self.missing_data_val])
knee_injury_at_this_timepoint = set(dfs_by_knee[side]['id'].loc[
dfs_by_knee[side][col_name] == '1: Yes'])
else:
knee_injury_at_this_timepoint = set(dfs_by_knee[side]['id'].loc[
dfs_by_knee[side][col_name] == 1])
self.validate_col(dfs_by_knee[side][col_name], [0, 1])
if visit == '00':
na_ids = set(dfs_by_knee[side]['id'].loc[dfs_by_knee[side][col_name] == self.missing_data_val])
if set_missing_baseline_to_0:
ids_with_nas_at_first_timepoint[side] = set([])
print("Warning: setting %i missing datapoints for baseline to 0" % len(na_ids))
else:
ids_with_nas_at_first_timepoint[side] = na_ids
# update list of people who report an injury.
ids_who_report_injury_at_any_timepoint[side] = ids_who_report_injury_at_any_timepoint[side].union(knee_injury_at_this_timepoint)
# set people to True if report injury at any timepoint.
dfs_by_knee[side][col_name] = dfs_by_knee[side]['id'].map(lambda x:x in ids_who_report_injury_at_any_timepoint[side])
# set people to NA if False and missing data at initial timepoint
dfs_by_knee[side].loc[dfs_by_knee[side]['id'].map(lambda x:(x in ids_with_nas_at_first_timepoint[side]) &
(x not in ids_who_report_injury_at_any_timepoint[side])),
col_name] = None
dfs_by_knee[side].index = range(len(dfs_by_knee[side]))
all_dfs.append(dfs_by_knee[side].copy())
print("At timepoint %s, rate for %s leg: %i=1, %i=0, %i are missing" % (CLINICAL_WAVES_TO_FOLLOWUP[visit],
side,
(dfs_by_knee[side][col_name] == 1).sum(),
(dfs_by_knee[side][col_name] == 0).sum(),
pd.isnull(dfs_by_knee[side][col_name]).sum()))
combined_df = pd.concat(all_dfs)
combined_df.index = range(len(combined_df))
assert len(combined_df[['id', 'visit', 'side']].drop_duplicates()) == len(combined_df)
print("Average values")
print(combined_df[[col_name, 'visit', 'side']].groupby(['side', 'visit']).agg(['mean', 'size']))
print("Missing data")
print(combined_df[[col_name, 'visit', 'side']].groupby(['side', 'visit']).agg(lambda x:np.mean(pd.isnull(x))))
return combined_df
def make_other_koos_subscores_dataframe(self):
"""
Make dataframe of other Koos pain subscores.
Each row is one visit for one side for one id.
        koos_symptoms_score is knee-specific; everything else is the same for both knees.