-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathdata_analysis.py
2513 lines (2163 loc) · 140 KB
/
data_analysis.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
import os
import shutil
import sys
from typing import FrozenSet
from dbApp.models import AnalysisType, ReferenceSequence, CladeCollectionType, CladeCollection
import itertools
from collections import defaultdict
import virtual_objects
import numpy as np
from scipy.stats import gaussian_kde
import symportal_utils
from general import ThreadSafeGeneral
import string
import re
import sp_config
import json
from django import db
class SPDataAnalysis:
def __init__(self, workflow_manager_parent, data_analysis_obj, force_basal_lineage_separation):
    """Collect the settings and in-memory objects that the analysis works on.

    The temporary working directory (``<symportal_root_directory>/temp``) is
    wiped and recreated as part of construction.

    :param workflow_manager_parent: supplies ``symportal_root_directory``,
        ``within_clade_cutoff`` and ``args.num_proc``.
    :param data_analysis_obj: supplies ``get_clade_collections()`` and the
        comma separated ``list_of_data_set_uids`` string.
    :param force_basal_lineage_separation: stored on the instance and handed
        on unchanged to the VirtualObjectManager.
    """
    self.workflow_manager = workflow_manager_parent
    self.force_basal_lineage_separation = force_basal_lineage_separation

    symportal_root = self.workflow_manager.symportal_root_directory
    self.temp_wkd = os.path.join(symportal_root, 'temp')
    self._del_and_remake_temp_wkd()

    self.data_analysis_obj = data_analysis_obj
    # The abundance that a given DIV must be found at when it has been considered 'unlocked'
    # https://github.com/didillysquat/SymPortal_framework/wiki/The-SymPortal-logic#type-profile-assignment---logic
    self.unlocked_abundance = 0.0001
    self.clade_list = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I']
    self.ccs_of_analysis = self.data_analysis_obj.get_clade_collections()

    # One dictionary per clade, in the same order as self.clade_list.
    # key = footprint (set of sequences)
    # value = [[], []] where [0] = list of cladeCollections containing the given footprint
    # and [1] = list of majority sequence for the given sample
    self.clade_footp_dicts_list = [dict() for _ in self.clade_list]
    self.list_of_initial_types_after_collapse = None
    self.current_clade = None

    uid_strings = self.data_analysis_obj.list_of_data_set_uids.split(',')
    self.list_of_data_set_uids = [int(uid_string) for uid_string in uid_strings]

    self.virtual_object_manager = virtual_objects.VirtualObjectManager(
        within_clade_cutoff=self.workflow_manager.within_clade_cutoff,
        num_proc=self.workflow_manager.args.num_proc,
        list_of_data_set_uids=self.list_of_data_set_uids,
        force_basal_lineage_separation=self.force_basal_lineage_separation)
    self.thread_safe_general = ThreadSafeGeneral()
def analyse_data(self):
    """Run the whole analysis, one phase after another.

    Phase one is profile discovery, phase two is profile assignment plus
    naming and species association, and the final phase writes the
    VirtualAnalysisTypes to the database. The order of the steps matters.
    """
    print('\n\nBeginning profile discovery')
    discovery_steps = (
        self._populate_clade_fp_dicts_list,
        self._collapse_footprints_and_make_analysis_types,
        self._associate_vats_to_vccs,
        self._check_for_artefacts,
    )
    for discovery_step in discovery_steps:
        discovery_step()
    print('TYPE DISCOVERY COMPLETE')

    assignment_steps = (
        self._reset_vcc_vat_rep_abund_dicts,
        self._profile_assignment,
        self._update_grand_tot_attribute_for_vats,
        self._name_divs,
        self._associate_species_designations,
        self._del_and_remake_temp_wkd,
    )
    for assignment_step in assignment_steps:
        assignment_step()
    print('DATA ANALYSIS COMPLETE')

    self._make_analysis_type_objects_from_vats()
def _update_grand_tot_attribute_for_vats(self):
    """Populate grand_tot_num_instances_of_vat_in_analysis on every VirtualAnalysisType.

    The value is the number of clade collections held in the vat's
    clade_collection_obj_set_profile_assignment, so this has to run after
    the profile assignment.
    """
    vat_dict = self.virtual_object_manager.vat_manager.vat_dict
    for vat in vat_dict.values():
        num_assigned_ccs = len(vat.clade_collection_obj_set_profile_assignment)
        vat_dict[vat.id].grand_tot_num_instances_of_vat_in_analysis = num_assigned_ccs
def _reset_vcc_vat_rep_abund_dicts(self):
    """Give every VirtualCladeCollection a fresh, empty
    analysis_type_obj_to_representative_rel_abund_in_cc_dict."""
    vcc_dict = self.virtual_object_manager.vcc_manager.vcc_dict
    for virtual_cc in vcc_dict.values():
        virtual_cc.analysis_type_obj_to_representative_rel_abund_in_cc_dict = {}
def _make_analysis_type_objects_from_vats(self):
    """Write the VirtualAnalysisTypes to the database.

    First an AnalysisType row is created for every vat and the vat takes on
    the id of its new row (the vat dict is re-keyed afterwards). Then one
    CladeCollectionType is created for every (vat, assigned clade
    collection) combination; these are inserted in chunks with bulk_create.
    """
    print('\nConverting VirtualAnalysisTypes to database AnalysisTypes')
    vat_manager = self.virtual_object_manager.vat_manager
    for vat in vat_manager.vat_dict.values():
        sys.stdout.write(f'\r{vat.name}')
        new_at = self._create_analysis_type_from_vat(vat)
        self._update_uid_of_vat(new_at, vat)
    self._update_keys_of_vat_dict()

    # now create the clade_collection_types
    # we will need a cct for each vat, vcc combination
    clade_collection_type_list_for_bulk_create = []
    # vat_dict was replaced by _update_keys_of_vat_dict, so read it afresh.
    for vat in vat_manager.vat_dict.values():
        assigned_vccs = vat.clade_collection_obj_set_profile_assignment
        if not assigned_vccs:
            # Nothing to link, so do not query the database for this vat.
            continue
        # The AnalysisType is the same for every vcc of this vat: fetch it
        # once per vat rather than once per (vat, vcc) pair.
        analysis_type = AnalysisType.objects.get(id=vat.id)
        for vcc in assigned_vccs:
            clade_collection_type_list_for_bulk_create.append(
                CladeCollectionType(
                    analysis_type_of=analysis_type,
                    clade_collection_found_in=CladeCollection.objects.get(id=vcc.id)))

    for cct_chunk in self.thread_safe_general.chunks(clade_collection_type_list_for_bulk_create):
        CladeCollectionType.objects.bulk_create(cct_chunk)
def _update_keys_of_vat_dict(self):
    """Replace the vat dict with one that is keyed by the current id of each vat."""
    vat_manager = self.virtual_object_manager.vat_manager
    vat_manager.vat_dict = {vat.id: vat for vat in vat_manager.vat_dict.values()}
def _update_uid_of_vat(self, new_at, vat):
    """Give the vat held in the vat dict (looked up by vat.id) the id of new_at.

    Only the id attribute changes; the key in the vat dict is left alone.
    """
    database_uid = new_at.id
    vat_in_manager = self.virtual_object_manager.vat_manager.vat_dict[vat.id]
    vat_in_manager.id = database_uid
def _create_analysis_type_from_vat(self, vat):
    """Build an AnalysisType from a VirtualAnalysisType, save it and return it.

    Id collections are stored as comma separated strings; the two
    DataFrames are stored as JSON encoded nested lists of their values.

    :param vat: the VirtualAnalysisType to persist.
    :return: the saved AnalysisType.
    """
    def comma_join(items):
        return ','.join(str(item) for item in items)

    rel_abund_df = vat.multi_modal_detection_rel_abund_df
    text_fields = {
        # list(df) yields the column labels of the DataFrame
        'ordered_footprint_list': comma_join(list(rel_abund_df)),
        'majority_reference_sequence_set': comma_join(vat.majority_reference_sequence_uid_set),
        'list_of_clade_collections': comma_join(
            cc.id for cc in vat.clade_collection_obj_set_profile_assignment),
        'footprint_sequence_abundances': json.dumps(
            vat.abs_abund_of_ref_seqs_in_assigned_vccs_df.values.tolist()),
        'footprint_sequence_ratios': json.dumps(rel_abund_df.values.tolist()),
        'artefact_intras': comma_join(vat.artefact_ref_seq_uid_set),
    }
    new_at = AnalysisType(
        data_analysis_from=self.data_analysis_obj,
        clade=vat.clade,
        co_dominant=vat.co_dominant,
        name=vat.name,
        species=vat.species,
        **text_fields)
    new_at.save()
    return new_at
def _del_and_remake_temp_wkd(self):
    """Leave self.temp_wkd as an existing, empty directory.

    Anything currently at that path is removed with shutil.rmtree before
    the directory is created again.
    """
    temp_dir = self.temp_wkd
    already_there = os.path.exists(temp_dir)
    if already_there:
        shutil.rmtree(temp_dir)
    os.makedirs(temp_dir, exist_ok=True)
def _associate_species_designations(self):
    """Run a SpeciesAssociation over every VirtualAnalysisType.

    For the time being species association is disabled. It may be
    reimplemented if a separate species description platform gets up and
    running. The disabling itself is done in the
    _associate_species_info_to_vat method of the SpeciesAssociation class,
    so the association objects are still created and run here.
    """
    all_vats = self.virtual_object_manager.vat_manager.vat_dict.values()
    for vat in all_vats:
        self.SpeciesAssociation(vat=vat).assign_species()
class SpeciesAssociation:
    """Works out which species names are suggested by the sequences of one
    VirtualAnalysisType.

    The suggestions are collected in self.assigned_species. They are
    currently NOT written to the vat: species association is disabled and
    vat.species is always set to the string 'None'.
    """

    def __init__(self, vat):
        self.vat = vat
        self.assigned_species = []
        # Names of the majority sequences, and of every sequence in the footprint
        self.maj_seq_names = [ref_seq.name for ref_seq in vat.majority_reference_sequence_obj_set]
        self.all_seq_names = [ref_seq.name for ref_seq in vat.footprint_as_ref_seq_objs_set]

    def assign_species(self):
        """Collect the species suggestions for the vat, then set vat.species."""
        self._designate_species()
        self._associate_species_info_to_vat()

    def _designate_species(self):
        """Call the association method that belongs to the clade of the vat.

        Clades G, H and I have no associations, and neither does any other
        clade that is not listed here.
        """
        association_method_by_clade = {
            'A': self._clade_a_associations,
            'B': self._clade_b_associations,
            'C': self._clade_c_associations,
            'D': self._clade_d_associations,
            'E': self._clade_e_associations,
            'F': self._clade_f_associations,
        }
        association_method = association_method_by_clade.get(self.vat.clade)
        if association_method is not None:
            association_method()

    def _associate_species_info_to_vat(self):
        # Disable species association
        self.vat.species = 'None'
        # if not self.assigned_species:  # If no suggested species have been associated
        #     self.vat.species = 'None'
        # else:
        #     self.vat.species = ','.join(self.assigned_species)

    def _add_species_for_majority_seqs(self, rules):
        """Apply (trigger names, species) rules in order: the species of a rule
        are added when any of its trigger names is a majority sequence."""
        for trigger_names, species in rules:
            if any(seq_name in self.maj_seq_names for seq_name in trigger_names):
                self.assigned_species.extend(species)

    def _clade_f_associations(self):
        self._add_species_for_majority_seqs((
            (('F1',), ('S. kawagutii',)),
        ))

    def _clade_e_associations(self):
        # Every clade E type gets the same suggestion
        self.assigned_species.append('S. voratum')

    def _clade_d_associations(self):
        # I have decided that we are not going to take into account the abundance of non-maj intragenomic
        # defining sequences. e.g. D6 when calling associated species.
        # This is because there can be a large difference sample to sample in the abundance of the sequences
        # Rather we will assign both clade D species if the required sequences are present
        # We are also giving the researcher the average abundances and SDs for each output type
        if 'D1' in self.maj_seq_names:
            if 'D4' not in self.all_seq_names:
                d1_species = ['S. glynnii']
            elif 'D6' in self.all_seq_names:
                # D1 with both D4 and D6
                d1_species = ['S. glynnii', 'S. trenchii']
            else:
                # There is D1, D4 but not D6
                d1_species = ['S. trenchii']
            self.assigned_species.extend(d1_species)
        self._add_species_for_majority_seqs((
            (('D8', 'D12', 'D13'), ('S. eurythalpos',)),
            (('D15',), ('S. boreum',)),
        ))

    def _clade_c_associations(self):
        self._add_species_for_majority_seqs((
            (('C1',), ('S. goreaui',)),
        ))
        # This one looks at the whole footprint and needs both sequences
        needed_for_thermophilum = ('C3', 'C3gulf')
        if all(seq_name in self.all_seq_names for seq_name in needed_for_thermophilum):
            self.assigned_species.append('S. thermophilum')

    def _clade_b_associations(self):
        self._add_species_for_majority_seqs((
            (('B1',), ('S. minutum', 'S. antillogorgium', 'S. pseudominutum')),
            (('B2',), ('S. psygmophilum',)),
            (('B4',), ('S. muscatinei',)),
            (('B7', 'B13'), ('S. endomadracis',)),
            (('B2a',), ('S. aenigmaticum',)),
        ))

    def _clade_a_associations(self):
        self._add_species_for_majority_seqs((
            (('A1',), ('S. microadriaticum',)),
            (('A2',), ('S. pilosum',)),
            (('A3',), ('S. natans', 'S. tridacnidorum')),
            (('A4',), ('S. linucheae',)),
        ))
def _name_divs(self):
    """Name the DIV sequences that have no name yet, on 'remote' systems only.

    On any other system type nothing is named and a message explaining why
    is printed instead.
    """
    if sp_config.system_type != 'remote':
        print('Automatic sequence name generation is currently disabled for local instances of SymPortal.\n'
              'This is to prevent naming conlifcts between the remote and the '
              'local instances of SymPortal from arising\n')
        return

    print('Naming unamed DIVs')
    self.DIVNamer(parent_sp_data_analysis=self).name_unamed_div_seqs()
    print('\nDIV naming complete')
class DIVNamer:
    """Generates names for the DIV ReferenceSequences that have no name yet.

    Each unnamed DIV is blasted against all of the named reference
    sequences. The new name is the clade letter and number of the closest
    match followed by the first lower case letter suffix that is not yet in
    use (a..z, then aa..zz, then aaa..zzz).
    """

    def __init__(self, parent_sp_data_analysis):
        self.thread_safe_general = ThreadSafeGeneral()
        self.sp_data_analysis = parent_sp_data_analysis
        # All working files live in <symportal_root_directory>/symbiodiniaceaeDB
        db_dir = os.path.join(
            self.sp_data_analysis.workflow_manager.symportal_root_directory, 'symbiodiniaceaeDB')
        self.query_fasta_as_list = []
        self.query_fasta_path = os.path.join(db_dir, 'unnamedRefSeqs.fasta')
        self.db_fasta_as_list = []
        self.db_fasta_path = os.path.join(db_dir, 'named_seqs_in_SP_remote_db.fa')
        self.blast_output_path = os.path.join(db_dir, 'blast.out')
        self.blast_analysis_object = symportal_utils.BlastnAnalysis(
            input_file_path=self.query_fasta_path, output_file_path=self.blast_output_path,
            db_path=self.db_fasta_path, output_format_string='6 qseqid sseqid evalue pident qcovs',
            num_threads=20)
        # query id (str) -> remaining blast output fields, set in name_unamed_div_seqs
        self.blast_output_dict = None
        self.list_of_sequence_names_that_already_exist = self._set_exist_seq_names()
        self.unamed_div_uid_to_div_obj = {}

    def _set_exist_seq_names(self):
        """Return the names of every ReferenceSequence that has a name, plus 'D1a'."""
        # This query has given us the strange SSL EOF error:
        # django.db.utils.OperationalError: SSL SYSCALL error: EOF detected
        # The error could be reproduced by filling up the RAM of the linode server from python
        # (https://stackoverflow.com/questions/6317818/eat-memory-using-python) and then running
        # this same query, so at least we know what the problem is.
        # The query itself doesn't seem to use much memory, so it must be other parts of the script
        # that use up the memory. In support of this, the error is not raised for a smaller analysis.
        # So one thing to try is to reset all of the db connections before running the query.
        db.connections.close_all()
        list_of_sequence_names_that_already_exist = [
            ref_seq.name for ref_seq in ReferenceSequence.objects.filter(has_name=True)]
        # NOTE(review): 'D1a' is always treated as taken; the reason is not visible here.
        list_of_sequence_names_that_already_exist.append('D1a')
        return list_of_sequence_names_that_already_exist

    def name_unamed_div_seqs(self):
        """Generate names for the DIV ReferenceSequences that currently have no names.

        This will only happen on 'remote' type systems. Nothing is done
        when every DIV already has a name.

        :raises AssertionError: when not every unnamed DIV got a blast match.
        """
        for vat in self.sp_data_analysis.virtual_object_manager.vat_manager.vat_dict.values():
            for unamed_div in [rs for rs in vat.footprint_as_ref_seq_objs_set if not rs.has_name]:
                self.unamed_div_uid_to_div_obj[unamed_div.id] = unamed_div
        if not self.unamed_div_uid_to_div_obj:
            return

        self._create_and_write_query_fasta()
        self._create_and_write_db_fasta()
        self.blast_analysis_object.make_db(title_for_db='name_ref_seqs')
        self.blast_analysis_object.execute_blastn_analysis()

        # First tab separated field is the query id, the rest is kept as a list.
        # A later line with the same query id replaces an earlier one.
        self.blast_output_dict = {}
        for blast_line in self.blast_analysis_object.return_blast_output_as_list():
            blast_fields = blast_line.split('\t')
            self.blast_output_dict[blast_fields[0]] = blast_fields[1:]

        # It will be possible that some of the queries did not return a match
        # as such, we should assert that all did
        matched_uids = {int(query_id) for query_id in self.blast_output_dict}
        expected_uids = set(self.unamed_div_uid_to_div_obj)
        assert matched_uids == expected_uids, (
            f'blast matches do not cover the unnamed DIVs exactly; '
            f'differing uids: {sorted(matched_uids ^ expected_uids)}')

        self._generate_and_assign_new_names()
        self._regenerate_vat_names()

    def _regenerate_vat_names(self):
        """Regenerate the name of every VirtualAnalysisType.

        Some of the DIVs were not named and so their ID was being used in
        the vat name. Now that all DIVs have names, use these names.
        """
        print('Regenerating VirtualAnalysisType names')
        for vat in self.sp_data_analysis.virtual_object_manager.vat_manager.vat_dict.values():
            vat.generate_name(at_df=vat.multi_modal_detection_rel_abund_df)
            sys.stdout.write(f'\r{vat.name}')

    def _generate_and_assign_new_names(self):
        """Name, and save, each ReferenceSequence in the blast output that still has no name."""
        # NB although the db object and the instance of the ref seq we hold here are updated,
        # any other in-memory instances of the same ReferenceSequence are not updated by this.
        for no_name_ref_seq_id, output_items in self.blast_output_dict.items():
            ref_seq_in_question = self.unamed_div_uid_to_div_obj[int(no_name_ref_seq_id)]
            if ref_seq_in_question.has_name:
                continue
            # output_items[0] is the sseqid field: the name of the closest named sequence
            new_name = self._create_new_reference_sequence_name(output_items[0])
            ref_seq_in_question.name = new_name
            ref_seq_in_question.has_name = True
            ref_seq_in_question.save()
            self.list_of_sequence_names_that_already_exist.append(new_name)

    def _create_new_reference_sequence_name(self, closest_match):
        """Return the first unused name derived from the name of the closest match.

        :param closest_match: name of the closest named sequence. Its leading
            clade letter (A-I) and up to three digits become the base name.
        :return: base name plus the first lower case suffix that is not in
            self.list_of_sequence_names_that_already_exist.
        :raises ValueError: when closest_match does not start with a clade
            letter and a number, or when every suffix up to 'zzz' is taken.
        """
        match_object = re.match(r"^[A-I]{1}[0-9]{1,3}", closest_match)
        if match_object is None:
            # Previously this surfaced as an AttributeError on None
            raise ValueError(
                f'Cannot derive a base name from closest match {closest_match!r}')
        base_name = match_object.group(0)
        # https://stackoverflow.com/questions/23686398/iterate-a-to-zzz-in-python
        for suffix_length in range(1, 4):
            for combo in itertools.product(string.ascii_lowercase, repeat=suffix_length):
                candidate = base_name + ''.join(combo)
                if candidate not in self.list_of_sequence_names_that_already_exist:
                    return candidate
        # Previously False was returned here and then saved as the name of the sequence
        raise ValueError(f'No unused name is left for base name {base_name!r}')

    def _create_and_write_db_fasta(self):
        """Write every named ReferenceSequence to the fasta that is blasted against."""
        for rs in ReferenceSequence.objects.filter(has_name=True):
            self.db_fasta_as_list.extend(['>{}'.format(rs.name), rs.sequence])
        self.thread_safe_general.write_list_to_destination(
            destination=self.db_fasta_path, list_to_write=self.db_fasta_as_list)

    def _create_and_write_query_fasta(self):
        """Write the unnamed DIVs to the query fasta, using their ids as the sequence names."""
        for rs in self.unamed_div_uid_to_div_obj.values():
            self.query_fasta_as_list.extend([f'>{rs.id}', rs.sequence])
        self.thread_safe_general.write_list_to_destination(
            destination=self.query_fasta_path, list_to_write=self.query_fasta_as_list)
class ProfileAssigner:
    """Responsible for searching a given VirtualCladeCollection for VirtualAnalysisTypes and associating
    the found VirtualAnalysisTypes to the VirtualCladeCollection"""

    def __init__(self, virtual_clade_collection, parent_sp_data_analysis):
        self.sp_data_analysis = parent_sp_data_analysis
        self.vcc = virtual_clade_collection
        # The accepted matches; no two of them share a DIV
        self.vat_match_object_list = []
        # transient objects updated during vat checks
        self.potential_match_object = None

    def assign_profiles(self):
        """Find the VirtualAnalysisTypes that match the vcc and associate them with it."""
        print(f'\nAssigning ITS2 type profiles to {self.vcc}:')
        list_of_vats_to_search = self._get_list_of_vats_to_search()
        self._find_vats_in_vcc(list_of_vats_to_search)
        # # TODO here we want to make sure that the most abundant sequence of the
        # # VCC is represented by one of the profiles in the self.vat_match_objects_list
        # # If it is not, then we should create a new 1 DIV VAT that is
        # # the most abundant sequence and assign this to the sample.
        # if not self._maj_seq_is_represented_in_matched_vats():
        #     self._create_vat_of_maj_seq()
        #     self._add_maj_seq_vat_to_matched_list()
        self._associate_vcc_to_vats()

    def _associate_vcc_to_vats(self):
        """Record every accepted match on both the vat and the vcc of the match."""
        for vat_match in self.vat_match_object_list:
            print(f'Assigning {vat_match.at.name}')
            vat_match.at.clade_collection_obj_set_profile_assignment.add(vat_match.cc)
            vat_match.cc.analysis_type_obj_to_representative_rel_abund_in_cc_dict[
                vat_match.at] = vat_match.rel_abund_of_at_in_cc

    def _find_vats_in_vcc(self, list_of_vats_to_search):
        """Check each candidate vat against the vcc and keep the ones that match.

        A match that shares DIVs with matches accepted earlier is only
        kept when none of those represents more of the vcc.
        """
        for vat in list_of_vats_to_search:
            if not self._search_for_vat_in_vcc(vat=vat):
                continue
            if self._vat_has_divs_in_common_with_other_vats(vat=vat):
                self._add_new_vat_to_list_if_highest_rel_abund_representative()
            else:
                self.vat_match_object_list.append(self.potential_match_object)

    def _get_list_of_vats_to_search(self):
        """Return the vats whose sequences are all present in the footprint of the vcc."""
        vcc_footprint = self.vcc.footprint_as_frozen_set_of_ref_seq_uids
        all_vats = self.sp_data_analysis.virtual_object_manager.vat_manager.vat_dict.values()
        return [vat for vat in all_vats if vat.ref_seq_uids_set.issubset(vcc_footprint)]

    def _add_new_vat_to_list_if_highest_rel_abund_representative(self):
        """Get a list of the current matches that have refseqs in common with the potential match.
        Iter through this list and compare the represented abundances of the current matches to potential match.
        If the potential match has a lower abundance than any of them, do not accept. If it has a higher abundance
        than all of then, accept and be sure to remove the current matches that shared a div with it from
        the current matches of the VirtualCladeCollection.
        """
        shared_div_match_list = [
            match_obj for match_obj in self.vat_match_object_list
            if self._vats_have_divs_in_common(vat_one=match_obj.at, vat_two=self.potential_match_object.at)]

        # if any one of the current matches reps a greater proportion then do not accept the potential match
        potential_abund = self.potential_match_object.rel_abund_of_at_in_cc
        if any(potential_abund < match_obj.rel_abund_of_at_in_cc for match_obj in shared_div_match_list):
            return

        # if we reach here then we should delete all of the matches in the shared_div_match list and
        # add the potential new type in their place.
        for match_obj in shared_div_match_list:
            self.vat_match_object_list.remove(match_obj)
        self.vat_match_object_list.append(self.potential_match_object)

    def _vats_have_divs_in_common(self, vat_one, vat_two):
        """Return the set of ref seq uids shared by the two vats (empty, so falsy, when none)."""
        return vat_one.ref_seq_uids_set.intersection(vat_two.ref_seq_uids_set)

    def _vat_has_divs_in_common_with_other_vats(self, vat):
        """Return True when vat shares at least one ref seq uid with an accepted match."""
        return any(
            vat.ref_seq_uids_set.intersection(vat_match_obj.at.ref_seq_uids_set)
            for vat_match_obj in self.vat_match_object_list)

    def _search_for_vat_in_vcc(self, vat):
        """This will check whether the DIV relative abundance requirements of
        a vat are met by the vcc in question. If met we will keep track of what proportion of the CladeCollection
        this set of refseqs represents. If the VAT is a single sequence VAT then we will require it to
        be present at an abundance of at least 0.05.

        :param vat: the VirtualAnalysisType to look for.
        :return: True when the vat matches, in which case self.potential_match_object
            holds the match; False otherwise.
        """
        vcc_rel_abund_dict = self.vcc.ref_seq_id_to_rel_abund_dict

        if len(vat.ref_seq_uids_set) == 1:
            abund_of_vat_in_vcc = vcc_rel_abund_dict[list(vat.ref_seq_uids_set)[0]]
            if abund_of_vat_in_vcc > 0.05:
                self.potential_match_object = CCToATMatchInfoHolder(
                    vat=vat, vcc=self.vcc, rel_abund_of_at_in_cc=abund_of_vat_in_vcc)
                return True
            return False

        # NB here, when looking to see if the DIVs are found at the right proportions in the CladeCollection
        # to match the VirtualAnalysisType we need to work with the seq abundances as a proportion of
        # only the sequences in the CC that are found in the VAT. However, when we want to get an
        # idea of whether this VAT match is better than another one, then we need to work with the VAT DIV
        # rel abundances as a proportion of all of the sequences in the CladeCollection
        total_abundance_of_seqs_of_vat = sum([
            rel_abund for ref_seq_uid, rel_abund in vcc_rel_abund_dict.items()
            if ref_seq_uid in vat.ref_seq_uids_set])
        vcc_rel_abund_dict_for_vat = {
            ref_seq_uid: rel_abund / total_abundance_of_seqs_of_vat
            for ref_seq_uid, rel_abund in vcc_rel_abund_dict.items()
            if ref_seq_uid in vat.ref_seq_uids_set}

        total_seq_rel_abund_for_cc = []
        for ref_seq_id, ref_seq_req_abund_obj in vat.prof_assignment_required_rel_abund_dict.items():
            rel_abund_of_div_in_vat_seqs = vcc_rel_abund_dict_for_vat[ref_seq_id]
            total_seq_rel_abund_for_cc.append(vcc_rel_abund_dict[ref_seq_id])
            # The requirement is only met inside [min_abund, max_abund]. The test used to be
            # max_abund <= x <= min_abund, which practically never rejected anything.
            within_required_range = (
                ref_seq_req_abund_obj.min_abund
                <= rel_abund_of_div_in_vat_seqs
                <= ref_seq_req_abund_obj.max_abund)
            if not within_required_range:
                return False

        self.potential_match_object = CCToATMatchInfoHolder(
            vat=vat, vcc=self.vcc, rel_abund_of_at_in_cc=sum(total_seq_rel_abund_for_cc))
        return True
def _profile_assignment(self):
    """Assign profiles to every VirtualCladeCollection, then refresh the vats.

    A ProfileAssigner is run for each vcc in turn. Once all of them are
    done, the VirtualAnalysisTypes are reinitialised from their assigned
    clade collections and the multimodal detection is run.
    """
    print('\n\nBeginning profile assignment')
    for virtual_clade_collection in self.virtual_object_manager.vcc_manager.vcc_dict.values():
        profile_assigner = self.ProfileAssigner(
            virtual_clade_collection=virtual_clade_collection,
            parent_sp_data_analysis=self)
        profile_assigner.assign_profiles()

    # Reinit the VirtualAnalysisTypes to populate the post-profile assignment objects
    self.reinit_vats_post_profile_assignment()
    self.multimodal_detection()
    print('Profile Assignment Complete')
def multimodal_detection(self):
    """Create a MultiModalDetection for this analysis and run it."""
    self.MultiModalDetection(parent_sp_data_analysis=self).run_multimodal_detection()
class MultiModalDetection:
def __init__(self, parent_sp_data_analysis):
self.sp_data_analysis = parent_sp_data_analysis
self.vat_uids_checked_set = set()
self.restart = True
# attributes that will be updated with each vat checked
self.current_vat = None
# The two lists that will hold the VirtualCladeCollection objects belonging to each of the potential
# new VATs resulting from a splitting occurrence.
self.list_of_vcc_uids_one = []
self.list_of_vcc_uids_two = []
def run_multimodal_detection(self):
print('\nStarting MultiModalDetection')
while self.restart:
self.restart = False
for vat_uid in self.sp_data_analysis.virtual_object_manager.vat_manager.vat_dict.keys():
self.current_vat = self.sp_data_analysis.virtual_object_manager.vat_manager.vat_dict[vat_uid]
sys.stdout.write(f'\rChecking {self.current_vat.name}')
if self.current_vat.id in self.vat_uids_checked_set:
continue
if len(self.current_vat.ref_seq_uids_set) == 1:
self.vat_uids_checked_set.add(self.current_vat.id)
continue
if len(self.current_vat.clade_collection_obj_set_profile_assignment) < 8:
self.vat_uids_checked_set.add(self.current_vat.id)
continue
for ref_seq_uid_col in list(self.current_vat.multi_modal_detection_rel_abund_df):
if not self.restart:
self._assess_if_div_multimodal(ref_seq_uid_col)
if not self.restart:
self.vat_uids_checked_set.add(self.current_vat.id)
if self.restart:
break
def _assess_if_div_multimodal(self, ref_seq_uid_col):
c, modes, pdf, x_grid = self._find_modes_of_abundances(ref_seq_uid_col)
if modes == 2:
x_diff_valid = self._assess_if_modes_sufficiently_separated(c, pdf, x_grid)
if x_diff_valid:
self._assign_vccs_to_modes(pdf, ref_seq_uid_col, x_grid)
if self._sufficient_support_of_each_mode():
self._split_vat_into_two_new_vats()
def _assess_if_modes_sufficiently_separated(self, c, pdf, x_grid):
# Must be sufficient separation between the peaks in x axis
x_diff_valid = False
if x_grid[c[1]] - x_grid[c[0]] > 0.7:
x_diff_valid = True
# plotHists(pdf, x_grid, listOfRatios, listOfTypesToAnalyse[k].name)
# Must also be sufficient diff between minima y and small peak y
# This represents the x spread and overlap of the two peaks
d = list((np.diff(np.sign(np.diff(pdf))) != 0).nonzero()[0] + 1) # max and min indices
if min([pdf[d[0]], pdf[d[2]]]) == 0:
x_diff_valid = False
else:
if pdf[d[1]] / min([pdf[d[0]], pdf[d[2]]]) > 0.85: # Insufficient separation of peaks
x_diff_valid = False
return x_diff_valid
def _assign_vccs_to_modes(self, pdf, ref_seq_uid_col, x_grid):
# Then we have found modes that are sufficiently separated.
self.list_of_vcc_uids_one = []
self.list_of_vcc_uids_two = []
min_x = x_grid[list(((np.diff(np.sign(np.diff(pdf))) != 0).nonzero()[0] + 1))[1]]
for vcc_uid in self.current_vat.multi_modal_detection_rel_abund_df.index.tolist():
if self.current_vat.multi_modal_detection_rel_abund_df.at[
vcc_uid, ref_seq_uid_col] < min_x:
self.list_of_vcc_uids_one.append(vcc_uid)
else:
self.list_of_vcc_uids_two.append(vcc_uid)
def _sufficient_support_of_each_mode(self):
return len(self.list_of_vcc_uids_one) >= 4 and len(self.list_of_vcc_uids_two) >= 4
def _update_vccs_rep_abund_dict_for_split_type(self, list_of_vcc_objs, resultant_vat):
for vcc in list_of_vcc_objs:
del vcc.analysis_type_obj_to_representative_rel_abund_in_cc_dict[self.current_vat]
rep_rel_abund_of_resultant_type = sum([vcc.ref_seq_id_to_rel_abund_dict[ref_seq_uid] for ref_seq_uid in resultant_vat.ref_seq_uids_set])
vcc.analysis_type_obj_to_representative_rel_abund_in_cc_dict[resultant_vat] = rep_rel_abund_of_resultant_type
def _split_vat_into_two_new_vats(self):
print(f'\n\nMultiModalDetection: Splitting {self.current_vat.name}')
list_of_vcc_objs_one = [
vcc for vcc in self.current_vat.clade_collection_obj_set_profile_assignment if
vcc.id in self.list_of_vcc_uids_one]
resultant_type_one = self.sp_data_analysis.virtual_object_manager.vat_manager. \
make_vat_post_profile_assignment(
clade_collection_obj_list=list_of_vcc_objs_one,
ref_seq_obj_list=self.current_vat.footprint_as_ref_seq_objs_set)
self._update_vccs_rep_abund_dict_for_split_type(
list_of_vcc_objs=list_of_vcc_objs_one, resultant_vat=resultant_type_one)
print(f'Created {resultant_type_one.name}')
list_of_vcc_objs_two = [
vcc for vcc in self.current_vat.clade_collection_obj_set_profile_assignment if
vcc.id in self.list_of_vcc_uids_two]
resultant_type_two = self.sp_data_analysis.virtual_object_manager.vat_manager. \
make_vat_post_profile_assignment(
clade_collection_obj_list=list_of_vcc_objs_two,
ref_seq_obj_list=self.current_vat.footprint_as_ref_seq_objs_set)
self._update_vccs_rep_abund_dict_for_split_type(
list_of_vcc_objs=list_of_vcc_objs_two, resultant_vat=resultant_type_two)
print(f'Created {resultant_type_two.name}')
print(f'Destroyed {self.current_vat.name}\n')
self.sp_data_analysis.virtual_object_manager.vat_manager. \
delete_virtual_analysis_type(self.current_vat)
self.restart = True
def _find_modes_of_abundances(self, ref_seq_uid_col):
rel_abunds_of_ref_seq = self.current_vat.multi_modal_detection_rel_abund_df.loc[
:, ref_seq_uid_col].values.tolist()
x_grid = np.linspace(min(rel_abunds_of_ref_seq) - 1, max(rel_abunds_of_ref_seq) + 1, 2000)
kde = gaussian_kde(rel_abunds_of_ref_seq)
pdf = kde.evaluate(x_grid)
c = list((np.diff(np.sign(np.diff(pdf))) < 0).nonzero()[0] + 1)
modes = len(c)
return c, modes, pdf, x_grid
def reinit_vats_post_profile_assignment(self):
    """Reinitialise every VirtualAnalysisType from the clade collections assigned to it.

    Each vat is passed to the vat manager together with its own
    clade_collection_obj_set_profile_assignment.
    """
    print('\nReinstantiating VirtualAnalysisTypes')
    vat_manager = self.virtual_object_manager.vat_manager
    for vat in vat_manager.vat_dict.values():
        sys.stdout.write(f'\r{vat.name}')
        assigned_ccs = vat.clade_collection_obj_set_profile_assignment
        vat_manager.reinit_vat_post_profile_assignment(
            vat_to_reinit=vat, new_clade_collection_obj_set=assigned_ccs)
    print('\nReinstantiation complete')
def _associate_vats_to_vccs(self):
    """Populate the analysis_type_obj_to_representative_rel_abund_in_cc_dict of the VirtualCladeCollection
    using the VirtualAnalysisTypes.

    For every (CladeCollection, VirtualAnalysisType) pair found in the types' profile-discovery
    CladeCollections, the stored value is the sum of the CladeCollection's relative abundances of
    the reference sequences in the type's footprint.
    """
    print('Populating starting analysis type info to cc info dict')
    # Collect every pairing first, then process them.
    cc_vat_pairs = [
        (cc, vat)
        for vat in self.virtual_object_manager.vat_manager.vat_dict.values()
        for cc in vat.clade_collection_obj_set_profile_discovery
    ]
    for cc, vat in cc_vat_pairs:
        virtual_cc = self.virtual_object_manager.vcc_manager.vcc_dict[cc.id]
        abundance_by_ref_seq_uid = virtual_cc.ref_seq_id_to_rel_abund_dict
        total_rel_abund_of_type = sum(
            [abundance_by_ref_seq_uid[ref_seq.id] for ref_seq in vat.footprint_as_ref_seq_objs_set])
        virtual_cc.analysis_type_obj_to_representative_rel_abund_in_cc_dict[vat] = total_rel_abund_of_type
        sys.stdout.write(f'\rCladeCollection:{cc} AnalysisType:{vat}')
def _check_for_artefacts(self):
    """Run both artefact checks, in order, with one ArtefactAssessor bound to this object."""
    assessor = ArtefactAssessor(parent_sp_data_analysis=self)
    # First the within-clade-cutoff check, then the re-assessment of artefact DIV containing types.
    assessor.assess_within_clade_cutoff_artefacts()
    assessor.reassess_support_of_artefact_div_containing_types()
def _collapse_footprints_and_make_analysis_types(self):
    """Collapse the footprints of each clade in turn and create analysis types from the result.

    ``self.current_clade`` is updated for every clade, including clades without footprints.
    Once all clades have been processed, the association of every CladeCollection to an
    analysis type is verified.
    """
    for clade_index, clade_fp_dict in enumerate(self.clade_footp_dicts_list):
        self.current_clade = self.clade_list[clade_index]
        if not self._there_are_footprints_of_this_clade(clade_fp_dict):
            # Nothing to collapse for this clade.
            continue
        footprint_identifier = SupportedFootPrintIdentifier(
            clade_footprint_dict=clade_fp_dict, parent_sp_data_analysis=self)
        self.list_of_initial_types_after_collapse = footprint_identifier.identify_supported_footprints()
        type_creator = AnalysisTypeCreator(parent_sp_data_analysis=self)
        type_creator.create_analysis_types()
    self._verify_all_ccs_associated_to_analysis_type()
def _verify_all_ccs_associated_to_analysis_type(self):
    """Check that every CladeCollection of the analysis is represented by a VirtualAnalysisType.

    The ids of ``self.ccs_of_analysis`` are compared with the ids of the profile-discovery
    CladeCollections of every VirtualAnalysisType held by the vat manager.

    Raises:
        RuntimeError: if at least one CladeCollection of the analysis is not represented by any
            VirtualAnalysisType.
    """
    print('\nVerifying all CladeCollections have been associated to an AnalysisType...')
    represented_cc_uids = set()
    for vat in self.virtual_object_manager.vat_manager.vat_dict.values():
        represented_cc_uids.update(vcc.id for vcc in vat.clade_collection_obj_set_profile_discovery)
    analysis_cc_uids = {cc.id for cc in self.ccs_of_analysis}

    # NOTE(review): as before, nothing is reported or raised when the types reference a
    # CladeCollection that is not part of the analysis. Confirm that this silent skip is intended.
    if not analysis_cc_uids.issuperset(represented_cc_uids):
        return

    unassociated_cc_uids = analysis_cc_uids - represented_cc_uids
    if unassociated_cc_uids:
        raise RuntimeError(
            f'{len(unassociated_cc_uids)} CladeCollections are unassociated from an AnalysisType')
    print('All CladeCollections successfuly associated to at least one AnalysisType')
def _there_are_footprints_of_this_clade(self, clade_fp_dict):
    """Return ``clade_fp_dict`` itself so that callers can test it for truthiness.

    An empty dict is falsy, i.e. it signals that the clade has no footprints.
    """
    return clade_fp_dict
def _populate_clade_fp_dicts_list(self):
    """Group the VirtualCladeCollections by footprint in the per-clade footprint dictionaries.

    The footprint of a VirtualCladeCollection is its ``above_cutoff_ref_seqs_obj_set``. The first
    collection seen with a given footprint creates a FootprintRepresentative for it. Later
    collections are appended to that representative together with their first ordered
    sequence (``ordered_dsss_objs[0]``).
    """
    for vcc in self.virtual_object_manager.vcc_manager.vcc_dict.values():
        fp_dict_of_clade = self.clade_footp_dicts_list[self.clade_list.index(vcc.clade)]
        footprint = vcc.above_cutoff_ref_seqs_obj_set
        if footprint not in fp_dict_of_clade:
            fp_dict_of_clade[footprint] = FootprintRepresentative(
                cc=vcc, maj_dss_seq_list=vcc.ordered_dsss_objs[0])
            continue
        representative = fp_dict_of_clade[footprint]
        representative.cc_list.append(vcc)
        representative.maj_dss_seq_list.append(vcc.ordered_dsss_objs[0])
class ArtefactAssessor:
    """Run the two artefact checks over the VirtualAnalysisTypes of a data analysis.

    ``assess_within_clade_cutoff_artefacts`` compares pairs of types within each clade.
    ``reassess_support_of_artefact_div_containing_types`` re-checks each type that holds artefact
    sequences. Both loops start over whenever the handler they delegate to returns a truthy value.
    """

    def __init__(self, parent_sp_data_analysis):
        """Cache the virtual objects of ``parent_sp_data_analysis`` and initialise the loop state."""
        self.sp_data_analysis = parent_sp_data_analysis
        virtual_object_manager = self.sp_data_analysis.virtual_object_manager
        self.list_of_vccs = list(virtual_object_manager.vcc_manager.vcc_dict.values())
        # key:VirtualAnalysisType.id, value:VirtualAnalysisType
        self.virtual_analysis_type_dict = virtual_object_manager.vat_manager.vat_dict
        self.analysis_types_of_analysis = list(self.virtual_analysis_type_dict.values())
        self.set_of_clades_from_analysis = self._set_set_of_clades_from_analysis()
        # key = frozenset of ref_seq objects, value = VirtualAnalysisType
        self.ref_seq_fp_set_to_analysis_type_obj_dict = self._init_fp_to_at_dict()

        # Attributes updated on an iterative basis
        self.current_clade = None
        # NB we have the two lists below as we only want to check combinations of the original
        # AnalysisTypes and not the new AnalysisTypes that will be created as part of this process.
        # This is to prevent any infinite loops occurring.
        # A list that will be continually updated
        self.vat_uids_of_clade_dynamic = None
        # A fixed list of the original types that we started with
        self.vat_uids_of_clade_static = None
        # The uids that the loop currently in progress iterates over. Declared here so that the
        # attribute always exists; it is filled by the _set_*_to_check/_compare methods.
        self.vat_uids_to_check_of_clade = []
        # A set holding one frozenset per pair of uids that has already been compared.
        self.already_compared_analysis_type_uid_set = set()
        # Bool whether the pair comparisons need to be restarted.
        # This will be true when we have modified a type in any way
        self.restart_pair_comparisons = True

        # reassess support of artefact DIV containing analysis types attributes
        self.already_checked_vat_uid_list = []

    def _init_fp_to_at_dict(self):
        """Return a dict mapping the frozenset footprint of each type to the type itself."""
        return {
            frozenset(vat.footprint_as_ref_seq_objs_set): vat
            for vat in self.virtual_analysis_type_dict.values()
        }

    def _types_should_be_checked(self, vat_a, vat_b):
        """Return True only if the two types are worth comparing.

        In addition to sharing the same ``basal_seq`` and both having non-artefact sequences:
        1 - the non-artefact_ref_seqs match
        2 - neither of the types is a subset of the other
        3 - there are artefact ref seqs in at least one of the types
        """
        return bool(
            vat_a.basal_seq == vat_b.basal_seq
            and vat_a.non_artefact_ref_seq_uid_set
            and vat_b.non_artefact_ref_seq_uid_set
            and vat_a.non_artefact_ref_seq_uid_set == vat_b.non_artefact_ref_seq_uid_set
            and not set(vat_a.ref_seq_uids_set).issubset(vat_b.ref_seq_uids_set)
            and not set(vat_b.ref_seq_uids_set).issubset(vat_a.ref_seq_uids_set)
            and vat_a.artefact_ref_seq_uid_set.union(vat_b.artefact_ref_seq_uid_set)
        )

    def assess_within_clade_cutoff_artefacts(self):
        """Check through all of the types to see if there are super types that have not been identified due to the
        withincladecutoff. Please see:
        https://github.com/didillysquat/SymPortal_framework/wiki/
        The-SymPortal-logic#artefacts-during-the-its2-type-profile-discovery-phase
        For further details"""
        for clade in self.set_of_clades_from_analysis:
            self.current_clade = clade
            self._set_static_and_dynamic_vat_lists()
            self.restart_pair_comparisons = True
            while self.restart_pair_comparisons:
                self.restart_pair_comparisons = False
                self._set_vat_to_compare_pairwise()
                for uid_a, uid_b in itertools.combinations(self.vat_uids_to_check_of_clade, 2):
                    if frozenset({uid_a, uid_b}) in self.already_compared_analysis_type_uid_set:
                        continue
                    vat_a = self.virtual_analysis_type_dict[uid_a]
                    vat_b = self.virtual_analysis_type_dict[uid_b]
                    if self._types_should_be_checked(vat_a, vat_b):
                        print(f'\n\nChecking {vat_a.name} and {vat_b.name} for additional artefactual profiles')
                        ctph = CheckTypePairingHandler(parent_artefact_assessor=self, vat_a=vat_a, vat_b=vat_b)
                        if ctph.check_type_pairing():
                            # The handler reported a change: refresh the dynamic list and start
                            # the pairwise comparison again without logging this pair.
                            self.restart_pair_comparisons = True
                            self._reset_dynamic_vat_list()
                            break
                    self._log_completed_comparison(uid_a, uid_b)

    def reassess_support_of_artefact_div_containing_types(self):
        """Check to see how the association of VirtualCladeCollections to VirtualAnalysisTypes changes when taking into
        account that some sequences are now 'artefact' sequences and thus 'unlocked'. Please see:
        https://github.com/didillysquat/SymPortal_framework/wiki
        /The-SymPortal-logic#artefacts-during-the-its2-type-profile-assignment-phase
        for further details."""
        for clade in self.set_of_clades_from_analysis:
            self.current_clade = clade
            self.already_checked_vat_uid_list = []
            self.restart_pair_comparisons = True
            while self.restart_pair_comparisons:
                self._set_artefact_div_vat_to_check()
                self.restart_pair_comparisons = False
                for vat_uid in self.vat_uids_to_check_of_clade:
                    if vat_uid in self.already_checked_vat_uid_list:
                        continue
                    vat_to_check = self.virtual_analysis_type_dict[vat_uid]
                    print(f'\n\nChecking associations of VirtualCladeCollections to {vat_to_check.name}')
                    cadivvata = CheckArtefactDIVVATAssociations(
                        parent_artefact_assessor=self, vat_to_check=vat_to_check)
                    associations_changed = cadivvata.check_artefact_div_vat_associations()
                    # The type counts as checked whatever the outcome.
                    self.already_checked_vat_uid_list.append(vat_to_check.id)
                    if associations_changed:
                        self.restart_pair_comparisons = True
                        break

    def _vat_uids_of_current_clade(self):
        """Return a new list of the uids of all current types that belong to ``self.current_clade``."""
        return [
            vat_uid for vat_uid, vat in self.virtual_analysis_type_dict.items()
            if vat.clade == self.current_clade
        ]

    def _set_artefact_div_vat_to_check(self):
        """Queue the types of the current clade that have a non-empty artefact sequence set."""
        self.vat_uids_to_check_of_clade = [
            vat.id for vat in self.virtual_analysis_type_dict.values()
            if vat.clade == self.current_clade and vat.artefact_ref_seq_uid_set
        ]

    def _reset_dynamic_vat_list(self):
        """Rebuild the dynamic uid list from the types that currently exist for the clade."""
        self.vat_uids_of_clade_dynamic = self._vat_uids_of_current_clade()

    def _log_completed_comparison(self, analysis_type_a_uid, analysis_type_b_uid):
        """Record the (unordered) pair of uids as compared."""
        self.already_compared_analysis_type_uid_set.add(
            frozenset({analysis_type_a_uid, analysis_type_b_uid}))

    def _set_vat_to_compare_pairwise(self):
        """Queue the uids that are in the dynamic list and were also in the static (original) list."""
        static_uids = set(self.vat_uids_of_clade_static)
        self.vat_uids_to_check_of_clade = [
            vat_uid for vat_uid in self.vat_uids_of_clade_dynamic if vat_uid in static_uids]

    def _set_static_and_dynamic_vat_lists(self):
        """Initialise both uid lists, as two independent lists, from the types of the current clade."""
        self.vat_uids_of_clade_dynamic = self._vat_uids_of_current_clade()
        self.vat_uids_of_clade_static = list(self.vat_uids_of_clade_dynamic)

    def _set_set_of_clades_from_analysis(self):
        """Store and return the set of clades found across all types."""
        self.set_of_clades_from_analysis = {
            vat.clade for vat in self.virtual_analysis_type_dict.values()}
        return self.set_of_clades_from_analysis
class PotentialNewType:
    """Plain holder describing a candidate type by its reference sequence uids, names and objects."""

    def __init__(
            self, artefact_ref_seq_uid_set, non_artefact_ref_seq_uid_set,
            ref_seq_uids_set, list_of_ref_seq_names, resf_seq_obj_set, force_basal_lineage_separation):
        """Store the given sets, build the comma-joined name and, if requested, derive the basal sequence."""
        self.ref_seq_uids_set = ref_seq_uids_set
        self.artefact_ref_seq_uid_set = artefact_ref_seq_uid_set
        self.non_artefact_ref_seq_uid_set = non_artefact_ref_seq_uid_set
        self.name = ','.join(list_of_ref_seq_names)
        # The basal sequence is only derived when basal lineage separation is being enforced.
        self.basal_seq = (
            self._set_basal_seq(list_of_ref_seq_names) if force_basal_lineage_separation else None)
        self.ref_seq_objects_set = resf_seq_obj_set

    def _set_basal_seq(self, list_of_ref_seq_names):
        """Return the single basal sequence name ('C3', 'C1' or 'C15') found in the names, or None.

        'C3' and 'C1' must match a name exactly; 'C15' matches any other name that contains it.

        Raises:
            RuntimeError: if more than one distinct basal sequence is found.
        """
        basal_names = set()
        for seq_name in list_of_ref_seq_names:
            if seq_name in ('C3', 'C1'):
                basal_names.add(seq_name)
            elif 'C15' in seq_name:
                basal_names.add('C15')
        if not basal_names:
            return None
        if len(basal_names) > 1:
            raise RuntimeError(f'basal seq set {basal_names} contains more than one ref seq')
        return next(iter(basal_names))
class CheckVCCToVATAssociations:
"""Base class for use by the two ArtefactAssessor instances"""
def __init__(self, parent_artefact_assessor):
    """Keep a reference to the parent ArtefactAssessor and create empty bookkeeping containers."""
    self.artefact_assessor = parent_artefact_assessor

    # Not set here; the methods that assess support iterate over this attribute.
    self.list_of_vcc_objs_to_check = None

    # Loss-of-support info holders and, derived from them, the CCs to remove from each type.
    self.list_of_loss_of_support_info_holder_objs = []
    self.at_obj_to_cc_obj_list_to_be_removed = defaultdict(list)

    # State used when rehoming stranded CCs.
    self.stranded_ccs = []
    self.ref_seqs_in_common_for_stranded_ccs = set()
    self.at_matching_stranded_ccs = None
    self.new_analysis_type_from_stranded_ccs = None
def _assess_support_of_pnt_or_vat(self, pnt_or_vat):
    """Run a ``CheckPNTorVATSupportWorker`` for every VirtualCladeCollection queued for checking.

    One worker is built per item of ``self.list_of_vcc_objs_to_check``, with this object as its
    parent, and its ``check_pnt_support`` method is called straight away.
    """
    for vcc_to_check in self.list_of_vcc_objs_to_check:
        self.CheckPNTorVATSupportWorker(
            virtual_clade_collection_object=vcc_to_check,
            parent_check_type_pairing=self,
            pnt_or_vat=pnt_or_vat,
        ).check_pnt_support()
def _pnt_or_vat_has_support(self):
    """Return True when at least four loss-of-support info holders have been collected."""
    number_of_supporting_objs = len(self.list_of_loss_of_support_info_holder_objs)
    return number_of_supporting_objs >= 4
def _update_cc_info_for_ccs_that_support_new_type(self, vat_to_add_to_vcc):
    """Move every collected loss-of-support record over to ``vat_to_add_to_vcc``.

    For each record, in this order: the old type association is removed, the association to
    ``vat_to_add_to_vcc`` is added, and the record is queued in the to-be-removed dictionary.
    """
    for info_obj in self.list_of_loss_of_support_info_holder_objs:
        self._remove_no_longer_supported_type_from_cc_info(info_obj)
        self.add_new_type_to_cc_from_match_obj(info_obj, vat_to_add_to_vcc)
        self._populate_at_obj_to_cc_obj_to_be_removed_dict(info_obj)
def _remove_no_longer_supported_type_from_cc_info(self, loss_of_support_info_obj):
    """Delete the record's type (``.at``) from the type-to-abundance dict of its collection (``.cc``).

    Raises:
        KeyError: if the type is not present in that dictionary.
    """
    print(f'removing association of {loss_of_support_info_obj.cc} from {loss_of_support_info_obj.at}')
    abundance_by_type = loss_of_support_info_obj.cc.analysis_type_obj_to_representative_rel_abund_in_cc_dict
    del abundance_by_type[loss_of_support_info_obj.at]
def add_new_type_to_cc_from_match_obj(self, match_info_obj, vat_to_add_to_vcc):
    """Associate ``vat_to_add_to_vcc`` with the collection of ``match_info_obj``.

    The value stored in the collection's type-to-abundance dict is the record's
    ``rel_abund_of_at_in_cc``.
    """
    print(f'associating {match_info_obj.cc} to {vat_to_add_to_vcc.name}')
    abundance_by_type = match_info_obj.cc.analysis_type_obj_to_representative_rel_abund_in_cc_dict
    abundance_by_type[vat_to_add_to_vcc] = match_info_obj.rel_abund_of_at_in_cc
def _populate_at_obj_to_cc_obj_to_be_removed_dict(self, loss_of_support_info_obj):
    """Queue the record's collection (``.cc``) for removal from the record's type (``.at``)."""
    ccs_to_remove_from_type = self.at_obj_to_cc_obj_list_to_be_removed[loss_of_support_info_obj.at]
    ccs_to_remove_from_type.append(loss_of_support_info_obj.cc)
def _reinit_or_del_affected_types_and_create_stranded_cc_list(self):
    """Reinitialise or delete every type that has CladeCollections queued for removal.

    For each type in ``self.at_obj_to_cc_obj_list_to_be_removed`` the profile-discovery
    CladeCollections that are not queued for removal are worked out. If the type still has
    sufficient support it is reinitialised with those collections; otherwise it is deleted and
    its collections are handed to ``_del_affected_type_and_populate_stranded_cc_list``.

    A failure while deleting a type is reported on stderr and processing continues with the
    next type. Exceptions that do not derive from ``Exception`` (e.g. ``KeyboardInterrupt``)
    are no longer swallowed.
    """
    for vat, ccs_to_remove in self.at_obj_to_cc_obj_list_to_be_removed.items():
        remaining_ccs = [
            cc for cc in vat.clade_collection_obj_set_profile_discovery if cc not in ccs_to_remove]

        # If the analysis type still has support then simply reinitialize it
        if self._afftected_type_still_has_sufficient_support(remaining_ccs):
            print(f'\nType {vat.name} supported by {len(remaining_ccs)} CCs. Reinitiating.')
            self.artefact_assessor.sp_data_analysis.virtual_object_manager.vat_manager.\
                reinit_vat_pre_profile_assignment(
                    vat_to_reinit=vat, new_clade_collection_obj_set=remaining_ccs)
            continue

        try:
            self._del_affected_type_and_populate_stranded_cc_list(
                vat=vat, new_list_of_ccs_to_associate_to=remaining_ccs)
        except Exception as error:
            # Best effort, as before: keep going, but make the failure visible instead of
            # silently discarding it.
            print(
                f'WARNING: deleting type {getattr(vat, "name", vat)} failed: {error!r}',
                file=sys.stderr)
def _afftected_type_still_has_sufficient_support(self, new_list_of_ccs_to_associate_to):
    """Return True when at least four CladeCollections remain associated with the type."""
    number_of_remaining_ccs = len(new_list_of_ccs_to_associate_to)
    return number_of_remaining_ccs >= 4
def _del_affected_type_and_populate_stranded_cc_list(self, vat, new_list_of_ccs_to_associate_to):
    """Remove ``vat`` from the assessor's dictionaries and sort out the collections it leaves behind.

    A collection for which ``_vcc_supports_only_one_vat`` is true is added to
    ``self.stranded_ccs``. Any other collection simply has ``vat`` removed from its
    type-to-abundance dict.
    """
    print(
        f'Type {vat.name} no longer supported. '
        f'Deleting. {len(new_list_of_ccs_to_associate_to)} CCs stranded.')
    assessor = self.artefact_assessor
    del assessor.ref_seq_fp_set_to_analysis_type_obj_dict[frozenset(vat.footprint_as_ref_seq_objs_set)]
    del assessor.virtual_analysis_type_dict[vat.id]
    for vcc in new_list_of_ccs_to_associate_to:
        if not self._vcc_supports_only_one_vat(vcc):
            # vcc was supported by more than one item. Simply remove the at in question from the
            # dict and leave as is.
            del vcc.analysis_type_obj_to_representative_rel_abund_in_cc_dict[vat]
            continue
        self.stranded_ccs.append(vcc)
def _vcc_supports_only_one_vat(self, vcc):
    """Return True when the type-to-abundance dict of ``vcc`` holds exactly one entry."""
    abundance_by_type = vcc.analysis_type_obj_to_representative_rel_abund_in_cc_dict
    return len(abundance_by_type) == 1
def _reassociate_stranded_ccs_if_necessary(self):
if self._sufficient_stranded_ccs_for_new_analysis_type():
print(f'{len(self.stranded_ccs)} VirtualCladeCollections are stranded. Rehoming...')
# Get ref_seqs in common
self._get_ref_seqs_in_common_btw_stranded_ccs()
if self.ref_seqs_in_common_for_stranded_ccs:
if self._analysis_type_already_exists_with_profile_of_seqs_in_common():
self._add_stranded_ccs_to_existing_at_and_update_dicts()
else:
if self.artefact_assessor.sp_data_analysis.force_basal_lineage_separation:
if not self._ref_seqs_in_common_contain_multiple_basal_seqs():
self._add_stranded_ccs_to_new_at_made_from_common_ref_seqs_and_update_dicts()
else:
self._rehome_cc_individually()
else:
self._rehome_cc_individually()
else:
self._rehome_cc_individually()