Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Reserved word check #151

Merged
merged 2 commits into from
Sep 12, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 33 additions & 0 deletions sequence_processing_pipeline/Pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
from collections import defaultdict
from datetime import datetime
from xml.etree import ElementTree as ET
from metapool.prep import PREP_MF_COLUMNS


logging.basicConfig(format='%(asctime)s - %(message)s', level=logging.INFO)
Expand Down Expand Up @@ -235,6 +236,38 @@ def __init__(self, configuration_file_path, run_id, sample_sheet_path,

self._configure_profile()

def identify_reserved_words(self, words):
    '''
    Returns the words that should not appear as column names in any
    project referenced in the Pipeline's sample-sheet/pre-prep file.
    :param words: An iterable of words that may include reserved words.
    :return: A sorted list of the reserved words found in `words`, in
             lower-case form. Upper, lower, and mixed-case spellings of
             a reserved word all map to the same lower-case entry.
    '''

    # Only strings used as column names in pre-prep files are currently
    # considered 'reserved' as loading a pre-prep file containing these
    # column names will fail if one or more of the strings already appears
    # as a column name in a study's sample metadata table.

    # This implementation assumes some understanding of metapool's impl,
    # specifically how the proper set of prep-info file columns are
    # generated. For now the functionality will be defined here as this
    # area of metapool is currently in flux.
    if self.mapping_file is not None:
        candidates = PREP_MF_COLUMNS
    else:
        # results will be dependent on SheetType and SheetVersion of
        # the sample-sheet, which determine the two column collections
        # below.
        candidates = [*self.sample_sheet.CARRIED_PREP_COLUMNS,
                      *self.sample_sheet.GENERATED_PREP_COLUMNS]

    # Since all columns in a prep-info file are lower()ed before writing
    # out to file, a word must be reserved in all case forms. e.g.:
    # 'Sample_Well' and 'Sample_well' are both forms of 'sample_well'.
    # Every source of reserved names is normalized here so that a
    # mixed-case entry in any of them cannot be silently missed.
    reserved = {name.lower() for name in candidates}

    # sorted() makes the result order deterministic; a bare set
    # intersection has no guaranteed ordering.
    return sorted(reserved.intersection(word.lower() for word in words))

def _configure_profile(self):
# extract the instrument type from self.run_dir and the assay type
# from self.sample_sheet (or self.mapping_file).
Expand Down
40 changes: 40 additions & 0 deletions sequence_processing_pipeline/tests/data/mgv90_test_sheet.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
[Header],,,,,,,,,,
IEMFileVersion,4,,,,,,,,,
SheetType,standard_metag,,,,,,,,,
SheetVersion,90,,,,,,,,,
Investigator Name,Caballero,,,,,,,,,
Experiment Name,RKL0042,,,,,,,,,
Date,2/26/20,,,,,,,,,
Workflow,GenerateFASTQ,,,,,,,,,
Application,FASTQ Only,,,,,,,,,
Assay,Metagenomic,,,,,,,,,
Description,,,,,,,,,,
Chemistry,Default,,,,,,,,,
,,,,,,,,,,
[Reads],,,,,,,,,,
150,,,,,,,,,,
150,,,,,,,,,,
,,,,,,,,,,
[Settings],,,,,,,,,,
ReverseComplement,0,,,,,,,,,
,,,,,,,,,,
[Data],,,,,,,,,,
Lane,Sample_ID,Sample_Name,Sample_Plate,Sample_Well,I7_Index_ID,index,I5_Index_ID,index2,Sample_Project,Well_description
1,sample1,sample1,FooBar_666_p1,A1,iTru7_107_07,CCGACTAT,iTru5_01_A,ACCGACAA,Project_1111,s1
1,sample2,sample2,FooBar_666_p1,A2,iTru7_107_08,CCGACTAT,iTru5_01_A,CTTCGCAA,Project_1111,s2
3,sample1,sample1,FooBar_666_p1,A3,iTru7_107_09,GCCTTGTT,iTru5_01_A,AACACCAC,Project_1111,s1
3,sample2,sample2,FooBar_666_p1,A4,iTru7_107_10,AACTTGCC,iTru5_01_A,CGTATCTC,Project_1111,s2
3,sample3,sample3,FooBar_666_p1,A5,iTru7_107_11,CAATGTGG,iTru5_01_A,GGTACGAA,Trojecp_666,s5
3,sample4,sample4,FooBar_666_p1,B6,iTru7_107_12,AAGGCTGA,iTru5_01_A,CGATCGAT,Trojecp_666,s6
3,sample5,sample5,FooBar_666_p1,B8,iTru7_107_13,TTACCGAG,iTru5_01_A,AAGACACC,Trojecp_666,s7
,,,,,,,,,,
[Bioinformatics],,,,,,,,,,
Sample_Project,QiitaID,BarcodesAreRC,ForwardAdapter,ReverseAdapter,HumanFiltering,library_construction_protocol,experiment_design_description,,,
Project_1111,1111,False,AACC,GGTT,False,Knight Lab Kapa HP,Eqiiperiment,,,
Trojecp_666,666,False,AACC,GGTT,False,Knight Lab Kapa HP,SomethingWitty,,,
,,,,,,,,,,
[Contact],,,,,,,,,,
Email,Sample_Project,,,,,,,,,
test@lol.com,Project_1111,,,,,,,,,
tester@rofl.com,Trojecp_666,,,,,,,,,
,,,,,,,,,,
56 changes: 56 additions & 0 deletions sequence_processing_pipeline/tests/test_Pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ def setUp(self):
makedirs(self.output_file_path, exist_ok=True)
self.maxDiff = None
self.good_sample_sheet_path = self.path('good-sample-sheet.csv')
self.good_legacy_sheet_path = self.path('mgv90_test_sheet.csv')
self.mp_sheet_path = self.path('multi-project-sheet.csv')
self.bad_sample_sheet_path = self.path('duplicate_sample-sample-sheet'
'.csv')
Expand Down Expand Up @@ -1630,6 +1631,38 @@ def test_parse_project_name(self):
obs = pipeline._parse_project_name(test, t_set == 'True')
self.assertEqual(obs, exp)

def test_identify_reserved_words(self):
    '''
    Exercise identify_reserved_words() on pipelines built from a current
    sample-sheet and from a legacy (v90) metagenomic sample-sheet.
    '''
    current = Pipeline(self.good_config_file, self.good_run_id,
                       self.good_sample_sheet_path, None,
                       self.output_file_path, self.qiita_id,
                       Pipeline.METAGENOMIC_PTYPE)

    # arbitrary strings must not be reported as reserved.
    arbitrary = ['NOT_A_RESERVED_WORD', 'ANOTHER_WORD']
    self.assertEqual(current.identify_reserved_words(arbitrary), [])

    # 'well_id_384' is reserved for the current sample-sheet; the
    # arbitrary string alongside it is not.
    mixed = ['well_id_384', 'NOT_A_RESERVED_WORD']
    self.assertEqual(current.identify_reserved_words(mixed),
                     ['well_id_384'])

    # build a second pipeline from a legacy (v90) metagenomic
    # sample-sheet.
    legacy = Pipeline(self.good_config_file, self.good_run_id,
                      self.good_legacy_sheet_path, None,
                      self.output_file_path, self.qiita_id,
                      Pipeline.METAGENOMIC_PTYPE)

    # for legacy sample-sheets 'well_id_384' is NOT reserved; both
    # spellings of 'Sample_well' collapse to the single reserved word
    # 'sample_well'.
    legacy_words = ['well_id_384',
                    'NOT_A_RESERVED_WORD',
                    'Sample_well',
                    'Sample_Well']
    self.assertEqual(legacy.identify_reserved_words(legacy_words),
                     ['sample_well'])


class TestAmpliconPipeline(unittest.TestCase):
def setUp(self):
Expand Down Expand Up @@ -2339,6 +2372,29 @@ def test_process_run_info_file(self):
# These are indirectly tested as generate_dummy_sample_sheet() is
# called by Pipeline's constructor.

def test_identify_reserved_words(self):
    '''
    Exercise identify_reserved_words() on a pipeline built from a
    mapping (pre-prep) file rather than a sample-sheet.
    '''
    amplicon = Pipeline(self.good_config_file, self.good_run_id, None,
                        self.good_mapping_file_path,
                        self.output_file_path, self.qiita_id,
                        Pipeline.AMPLICON_PTYPE)

    # arbitrary strings must not be reported as reserved.
    arbitrary = ['NOT_A_RESERVED_WORD', 'ANOTHER_WORD']
    self.assertEqual(amplicon.identify_reserved_words(arbitrary), [])

    # Sample_Well is acceptable for current pre-prep files, whereas
    # well_id_384 is reserved. Every case form of tm300_8_tool is
    # reserved as well and is reported once, in lower-case.
    candidates = ['Sample_Well',
                  'TM300_8_Tool',
                  'tm300_8_tool',
                  'well_id_384']
    expected = {'tm300_8_tool', 'well_id_384'}
    found = amplicon.identify_reserved_words(candidates)
    self.assertEqual(set(found), expected)


class TestInstrumentUtils(unittest.TestCase):
def setUp(self):
Expand Down
Loading