Skip to content

Commit

Permalink
improvements to canonicalize_iupac and log2 option in get_lectin_array
Browse files Browse the repository at this point in the history
  • Loading branch information
Bribak committed May 15, 2024
1 parent f5796a2 commit 814f9ff
Show file tree
Hide file tree
Showing 6 changed files with 32 additions and 13 deletions.
6 changes: 4 additions & 2 deletions build/lib/glycowork/motif/analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -1370,14 +1370,15 @@ def get_roc(df, group1, group2, plot = False, motifs = False, feature_set = ["kn
return sorted_auc_scores


def get_lectin_array(df, group1, group2, paired = False):
def get_lectin_array(df, group1, group2, paired = False, transform = ''):
"""Function for analyzing lectin array data for two or more groups.\n
| Arguments:
| :-
| df (dataframe): dataframe containing samples as rows and lectins as columns [alternative: filepath to .csv or .xlsx]
| group1 (list): list of indices or names for the first group of samples, usually the control
| group2 (list): list of indices or names for the second group of samples (note, if an empty list is provided, group 1 can be used as a list of group identifiers for each column - e.g., [1,1,2,2,3,3...])
| paired (bool): whether samples are paired or not (e.g., tumor & tumor-adjacent tissue from same patient); default:False\n
| paired (bool): whether samples are paired or not (e.g., tumor & tumor-adjacent tissue from same patient); default:False
| transform (string): optional data-processing, "log2" transforms the data with np.log2; default:nothing\n
| Returns:
| :-
| Returns an output dataframe with:
Expand All @@ -1395,6 +1396,7 @@ def get_lectin_array(df, group1, group2, paired = False):
if duplicated_cols:
raise ValueError(f'Analysis aborted due to:\nDuplicates found for the following lectin(s): {", ".join(duplicated_cols)}.\nIf you have multiple copies of the same lectin, rename them by adding a suffix in the form of "_<identifier>" (underscore + an identifier).\nFor example, "SNA" may be renamed "SNA_1", "SNA_batch1", etc. ')
lectin_list = df.columns.tolist()
df = np.log2(df) if transform == "log2" else df
df = df.T
if not isinstance(group1[0], str):
if group1[0] == 1 or group2[0] == 1:
Expand Down
15 changes: 11 additions & 4 deletions build/lib/glycowork/motif/processing.py
Original file line number Diff line number Diff line change
Expand Up @@ -826,7 +826,7 @@ def canonicalize_iupac(glycan):
replace_dic = {'Nac': 'NAc', 'AC': 'Ac', 'Nc': 'NAc', 'NeuAc': 'Neu5Ac', 'NeuNAc': 'Neu5Ac', 'NeuGc': 'Neu5Gc',
'\u03B1': 'a', '\u03B2': 'b', 'N(Gc)': 'NGc', 'GL': 'Gl', 'GaN': 'GalN', '(9Ac)': '9Ac',
'KDN': 'Kdn', 'OSO3': 'S', '-O-Su-': 'S', '(S)': 'S', 'SO3-': 'S', 'SO3(-)': 'S', 'H2PO3': 'P', '(P)': 'P',
'–': '-', ' ': '', ',': '-', 'α': 'a', 'β': 'b', 'ß': 'b', '.': '', '((': '(', '))': ')', '→': '-',
'–': '-', ' ': '', ',': '-', 'α': 'a', 'β': 'b', 'ß': 'b', '.': '', '((': '(', '))': ')', '→': '-', '*': '', 'Ga(': 'Gal(',
'Glcp': 'Glc', 'Galp': 'Gal', 'Manp': 'Man', 'Fucp': 'Fuc', 'Neup': 'Neu', 'a?': 'a1',
'5Ac4Ac': '4Ac5Ac', '(-)': '(?1-?)'}
glycan = multireplace(glycan, replace_dic)
Expand Down Expand Up @@ -897,6 +897,16 @@ def canonicalize_iupac(glycan):
glycan = re.sub(r'([1-9]?[SP])(?!en)([A-Za-z]+)', r'\2\1', glycan)
if bool(re.search(r'[1-9]?[SP]-[A-Za-z]+', glycan)):
glycan = re.sub(r'([1-9]?[SP])-([A-Za-z]+)', r'\2\1', glycan)
# Handle malformed things like Gal-GlcNAc in an otherwise properly formatted string
glycan = re.sub(r'([a-z])\?', r'\1(?', glycan)
glycan = re.sub(r'([c-z])([1-2])-', r'\1(?\2-', glycan)
glycan = re.sub(r'-([\?2-9])([A-Z])', r'-\1)\2', glycan)
glycan = re.sub(r'([\?2-9])([\[\]])', r'\1)\2', glycan)
# Floating bits
if '+' in glycan:
if '-' not in glycan[:glycan.index('+')]:
glycan = glycan.replace('+', '(?1-?)+')
glycan = '{'+glycan.replace('+', '}')
post_process = {'5Ac(?1': '5Ac(a2', '5Gc(?1': '5Gc(a2', '5Ac(a1': '5Ac(a2', '5Gc(a1': '5Gc(a2', 'Fuc(?': 'Fuc(a',
'GalS': 'GalOS', 'GlcNAcS': 'GlcNAcOS', 'GalNAcS': 'GalNAcOS', 'SGal': 'GalOS', 'Kdn(?1': 'Kdn(a2',
'Kdn(a1': 'Kdn(a2'}
Expand All @@ -905,9 +915,6 @@ def canonicalize_iupac(glycan):
if '[' in glycan:
isos = find_isomorphs(glycan)
glycan = choose_correct_isoform(isos)
# Floating bits
if '+' in glycan:
glycan = '{'+glycan.replace('+', '}')
if '{' in glycan:
floating_bits = re.findall(r'\{.*?\}', glycan)
sorted_floating_bits = ''.join(sorted(floating_bits, key = len, reverse = True))
Expand Down
1 change: 1 addition & 0 deletions glycowork.egg-info/requires.txt
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ pubchempy
requests
Pillow
openpyxl
py3Dmol

[draw]
CairoSVG
Expand Down
6 changes: 4 additions & 2 deletions glycowork/motif/analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -1370,14 +1370,15 @@ def get_roc(df, group1, group2, plot = False, motifs = False, feature_set = ["kn
return sorted_auc_scores


def get_lectin_array(df, group1, group2, paired = False):
def get_lectin_array(df, group1, group2, paired = False, transform = ''):
"""Function for analyzing lectin array data for two or more groups.\n
| Arguments:
| :-
| df (dataframe): dataframe containing samples as rows and lectins as columns [alternative: filepath to .csv or .xlsx]
| group1 (list): list of indices or names for the first group of samples, usually the control
| group2 (list): list of indices or names for the second group of samples (note, if an empty list is provided, group 1 can be used as a list of group identifiers for each column - e.g., [1,1,2,2,3,3...])
| paired (bool): whether samples are paired or not (e.g., tumor & tumor-adjacent tissue from same patient); default:False\n
| paired (bool): whether samples are paired or not (e.g., tumor & tumor-adjacent tissue from same patient); default:False
| transform (string): optional data-processing, "log2" transforms the data with np.log2; default:nothing\n
| Returns:
| :-
| Returns an output dataframe with:
Expand All @@ -1395,6 +1396,7 @@ def get_lectin_array(df, group1, group2, paired = False):
if duplicated_cols:
raise ValueError(f'Analysis aborted due to:\nDuplicates found for the following lectin(s): {", ".join(duplicated_cols)}.\nIf you have multiple copies of the same lectin, rename them by adding a suffix in the form of "_<identifier>" (underscore + an identifier).\nFor example, "SNA" may be renamed "SNA_1", "SNA_batch1", etc. ')
lectin_list = df.columns.tolist()
df = np.log2(df) if transform == "log2" else df
df = df.T
if not isinstance(group1[0], str):
if group1[0] == 1 or group2[0] == 1:
Expand Down
15 changes: 11 additions & 4 deletions glycowork/motif/processing.py
Original file line number Diff line number Diff line change
Expand Up @@ -826,7 +826,7 @@ def canonicalize_iupac(glycan):
replace_dic = {'Nac': 'NAc', 'AC': 'Ac', 'Nc': 'NAc', 'NeuAc': 'Neu5Ac', 'NeuNAc': 'Neu5Ac', 'NeuGc': 'Neu5Gc',
'\u03B1': 'a', '\u03B2': 'b', 'N(Gc)': 'NGc', 'GL': 'Gl', 'GaN': 'GalN', '(9Ac)': '9Ac',
'KDN': 'Kdn', 'OSO3': 'S', '-O-Su-': 'S', '(S)': 'S', 'SO3-': 'S', 'SO3(-)': 'S', 'H2PO3': 'P', '(P)': 'P',
'–': '-', ' ': '', ',': '-', 'α': 'a', 'β': 'b', 'ß': 'b', '.': '', '((': '(', '))': ')', '→': '-',
'–': '-', ' ': '', ',': '-', 'α': 'a', 'β': 'b', 'ß': 'b', '.': '', '((': '(', '))': ')', '→': '-', '*': '', 'Ga(': 'Gal(',
'Glcp': 'Glc', 'Galp': 'Gal', 'Manp': 'Man', 'Fucp': 'Fuc', 'Neup': 'Neu', 'a?': 'a1',
'5Ac4Ac': '4Ac5Ac', '(-)': '(?1-?)'}
glycan = multireplace(glycan, replace_dic)
Expand Down Expand Up @@ -897,6 +897,16 @@ def canonicalize_iupac(glycan):
glycan = re.sub(r'([1-9]?[SP])(?!en)([A-Za-z]+)', r'\2\1', glycan)
if bool(re.search(r'[1-9]?[SP]-[A-Za-z]+', glycan)):
glycan = re.sub(r'([1-9]?[SP])-([A-Za-z]+)', r'\2\1', glycan)
# Handle malformed things like Gal-GlcNAc in an otherwise properly formatted string
glycan = re.sub(r'([a-z])\?', r'\1(?', glycan)
glycan = re.sub(r'([c-z])([1-2])-', r'\1(?\2-', glycan)
glycan = re.sub(r'-([\?2-9])([A-Z])', r'-\1)\2', glycan)
glycan = re.sub(r'([\?2-9])([\[\]])', r'\1)\2', glycan)
# Floating bits
if '+' in glycan:
if '-' not in glycan[:glycan.index('+')]:
glycan = glycan.replace('+', '(?1-?)+')
glycan = '{'+glycan.replace('+', '}')
post_process = {'5Ac(?1': '5Ac(a2', '5Gc(?1': '5Gc(a2', '5Ac(a1': '5Ac(a2', '5Gc(a1': '5Gc(a2', 'Fuc(?': 'Fuc(a',
'GalS': 'GalOS', 'GlcNAcS': 'GlcNAcOS', 'GalNAcS': 'GalNAcOS', 'SGal': 'GalOS', 'Kdn(?1': 'Kdn(a2',
'Kdn(a1': 'Kdn(a2'}
Expand All @@ -905,9 +915,6 @@ def canonicalize_iupac(glycan):
if '[' in glycan:
isos = find_isomorphs(glycan)
glycan = choose_correct_isoform(isos)
# Floating bits
if '+' in glycan:
glycan = '{'+glycan.replace('+', '}')
if '{' in glycan:
floating_bits = re.findall(r'\{.*?\}', glycan)
sorted_floating_bits = ''.join(sorted(floating_bits, key = len, reverse = True))
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@
"Pillow", "openpyxl", "py3Dmol"],
'dev':["torch_geometric", "torch", "CairoSVG",
"drawSvg~=2.0", "glyles", "pubchempy", "requests",
"Pillow", "openpyxl"],
"Pillow", "openpyxl", "py3Dmol"],
'ml':["torch_geometric", "torch"],
'draw':["CairoSVG", "drawSvg~=2.0", "Pillow",
"openpyxl"],
Expand Down

0 comments on commit 814f9ff

Please sign in to comment.