improvements to canonicalize_iupac and log2 option in get_lectin_array

BojarLab · May 15, 2024 · 814f9ff · 814f9ff
1 parent f5796a2
commit 814f9ff
Show file tree

Hide file tree

Showing 6 changed files with 32 additions and 13 deletions.
diff --git a/build/lib/glycowork/motif/analysis.py b/build/lib/glycowork/motif/analysis.py
@@ -1370,14 +1370,15 @@ def get_roc(df, group1, group2, plot = False, motifs = False, feature_set = ["kn
   return sorted_auc_scores
 
 
-def get_lectin_array(df, group1, group2, paired = False):
+def get_lectin_array(df, group1, group2, paired = False, transform = ''):
   """Function for analyzing lectin array data for two or more groups.\n
   | Arguments:
   | :-
   | df (dataframe): dataframe containing samples as rows and lectins as columns [alternative: filepath to .csv or .xlsx]
   | group1 (list): list of indices or names for the first group of samples, usually the control
   | group2 (list): list of indices or names for the second group of samples (note, if an empty list is provided, group 1 can be used a list of group identifiers for each column - e.g., [1,1,2,2,3,3...])
-  | paired (bool): whether samples are paired or not (e.g., tumor & tumor-adjacent tissue from same patient); default:False\n
+  | paired (bool): whether samples are paired or not (e.g., tumor & tumor-adjacent tissue from same patient); default:False
+  | transform (string): optional data-processing, "log2" transforms the data with np.log2; default:nothing\n
   | Returns:
   | :-
   | Returns an output dataframe with:
@@ -1395,6 +1396,7 @@ def get_lectin_array(df, group1, group2, paired = False):
   if duplicated_cols:
     raise ValueError(f'Analysis aborted due to:\nDuplicates found for the following lectin(s): {", ".join(duplicated_cols)}.\nIf you have multiple copies of the same lectin, rename them by adding a suffix in the form of "_<identifier>" (underscore + an identifier).\nFor example, "SNA" may be renamed "SNA_1", "SNA_batch1", etc. ')
   lectin_list = df.columns.tolist()
+  df = np.log2(df) if transform == "log2" else df
   df = df.T
   if not isinstance(group1[0], str):
     if group1[0] == 1 or group2[0] == 1:

diff --git a/build/lib/glycowork/motif/processing.py b/build/lib/glycowork/motif/processing.py
@@ -826,7 +826,7 @@ def canonicalize_iupac(glycan):
   replace_dic = {'Nac': 'NAc', 'AC': 'Ac', 'Nc': 'NAc', 'NeuAc': 'Neu5Ac', 'NeuNAc': 'Neu5Ac', 'NeuGc': 'Neu5Gc',
                  '\u03B1': 'a', '\u03B2': 'b', 'N(Gc)': 'NGc', 'GL': 'Gl', 'GaN': 'GalN', '(9Ac)': '9Ac',
                  'KDN': 'Kdn', 'OSO3': 'S', '-O-Su-': 'S', '(S)': 'S', 'SO3-': 'S', 'SO3(-)': 'S', 'H2PO3': 'P', '(P)': 'P',
-                 '–': '-', ' ': '', ',': '-', 'α': 'a', 'β': 'b', 'ß': 'b', '.': '', '((': '(', '))': ')', '→': '-',
+                 '–': '-', ' ': '', ',': '-', 'α': 'a', 'β': 'b', 'ß': 'b', '.': '', '((': '(', '))': ')', '→': '-', '*': '', 'Ga(': 'Gal(',
                  'Glcp': 'Glc', 'Galp': 'Gal', 'Manp': 'Man', 'Fucp': 'Fuc', 'Neup': 'Neu', 'a?': 'a1',
                  '5Ac4Ac': '4Ac5Ac', '(-)': '(?1-?)'}
   glycan = multireplace(glycan, replace_dic)
@@ -897,6 +897,16 @@ def canonicalize_iupac(glycan):
     glycan = re.sub(r'([1-9]?[SP])(?!en)([A-Za-z]+)', r'\2\1', glycan)
   if bool(re.search(r'[1-9]?[SP]-[A-Za-z]+', glycan)):
     glycan = re.sub(r'([1-9]?[SP])-([A-Za-z]+)', r'\2\1', glycan)
+  # Handle malformed things like Gal-GlcNAc in an otherwise properly formatted string
+  glycan = re.sub(r'([a-z])\?', r'\1(?', glycan)
+  glycan = re.sub(r'([c-z])([1-2])-', r'\1(?\2-', glycan)
+  glycan = re.sub(r'-([\?2-9])([A-Z])', r'-\1)\2', glycan)
+  glycan = re.sub(r'([\?2-9])([\[\]])', r'\1)\2', glycan)
+  # Floating bits
+  if '+' in glycan:
+    if '-' not in glycan[:glycan.index('+')]:
+      glycan = glycan.replace('+', '(?1-?)+')
+    glycan = '{'+glycan.replace('+', '}')
   post_process = {'5Ac(?1': '5Ac(a2', '5Gc(?1': '5Gc(a2', '5Ac(a1': '5Ac(a2', '5Gc(a1': '5Gc(a2', 'Fuc(?': 'Fuc(a',
                   'GalS': 'GalOS', 'GlcNAcS': 'GlcNAcOS', 'GalNAcS': 'GalNAcOS', 'SGal': 'GalOS', 'Kdn(?1': 'Kdn(a2',
                   'Kdn(a1': 'Kdn(a2'}
@@ -905,9 +915,6 @@ def canonicalize_iupac(glycan):
   if '[' in glycan:
     isos = find_isomorphs(glycan)
     glycan = choose_correct_isoform(isos)
-  # Floating bits
-  if '+' in glycan:
-    glycan = '{'+glycan.replace('+', '}')
   if '{' in glycan:
     floating_bits = re.findall(r'\{.*?\}', glycan)
     sorted_floating_bits = ''.join(sorted(floating_bits, key = len, reverse = True))

diff --git a/glycowork.egg-info/requires.txt b/glycowork.egg-info/requires.txt
@@ -37,6 +37,7 @@ pubchempy
 requests
 Pillow
 openpyxl
+py3Dmol
 
 [draw]
 CairoSVG

diff --git a/glycowork/motif/analysis.py b/glycowork/motif/analysis.py
@@ -1370,14 +1370,15 @@ def get_roc(df, group1, group2, plot = False, motifs = False, feature_set = ["kn
   return sorted_auc_scores
 
 
-def get_lectin_array(df, group1, group2, paired = False):
+def get_lectin_array(df, group1, group2, paired = False, transform = ''):
   """Function for analyzing lectin array data for two or more groups.\n
   | Arguments:
   | :-
   | df (dataframe): dataframe containing samples as rows and lectins as columns [alternative: filepath to .csv or .xlsx]
   | group1 (list): list of indices or names for the first group of samples, usually the control
   | group2 (list): list of indices or names for the second group of samples (note, if an empty list is provided, group 1 can be used a list of group identifiers for each column - e.g., [1,1,2,2,3,3...])
-  | paired (bool): whether samples are paired or not (e.g., tumor & tumor-adjacent tissue from same patient); default:False\n
+  | paired (bool): whether samples are paired or not (e.g., tumor & tumor-adjacent tissue from same patient); default:False
+  | transform (string): optional data-processing, "log2" transforms the data with np.log2; default:nothing\n
   | Returns:
   | :-
   | Returns an output dataframe with:
@@ -1395,6 +1396,7 @@ def get_lectin_array(df, group1, group2, paired = False):
   if duplicated_cols:
     raise ValueError(f'Analysis aborted due to:\nDuplicates found for the following lectin(s): {", ".join(duplicated_cols)}.\nIf you have multiple copies of the same lectin, rename them by adding a suffix in the form of "_<identifier>" (underscore + an identifier).\nFor example, "SNA" may be renamed "SNA_1", "SNA_batch1", etc. ')
   lectin_list = df.columns.tolist()
+  df = np.log2(df) if transform == "log2" else df
   df = df.T
   if not isinstance(group1[0], str):
     if group1[0] == 1 or group2[0] == 1:

diff --git a/glycowork/motif/processing.py b/glycowork/motif/processing.py
@@ -826,7 +826,7 @@ def canonicalize_iupac(glycan):
   replace_dic = {'Nac': 'NAc', 'AC': 'Ac', 'Nc': 'NAc', 'NeuAc': 'Neu5Ac', 'NeuNAc': 'Neu5Ac', 'NeuGc': 'Neu5Gc',
                  '\u03B1': 'a', '\u03B2': 'b', 'N(Gc)': 'NGc', 'GL': 'Gl', 'GaN': 'GalN', '(9Ac)': '9Ac',
                  'KDN': 'Kdn', 'OSO3': 'S', '-O-Su-': 'S', '(S)': 'S', 'SO3-': 'S', 'SO3(-)': 'S', 'H2PO3': 'P', '(P)': 'P',
-                 '–': '-', ' ': '', ',': '-', 'α': 'a', 'β': 'b', 'ß': 'b', '.': '', '((': '(', '))': ')', '→': '-',
+                 '–': '-', ' ': '', ',': '-', 'α': 'a', 'β': 'b', 'ß': 'b', '.': '', '((': '(', '))': ')', '→': '-', '*': '', 'Ga(': 'Gal(',
                  'Glcp': 'Glc', 'Galp': 'Gal', 'Manp': 'Man', 'Fucp': 'Fuc', 'Neup': 'Neu', 'a?': 'a1',
                  '5Ac4Ac': '4Ac5Ac', '(-)': '(?1-?)'}
   glycan = multireplace(glycan, replace_dic)
@@ -897,6 +897,16 @@ def canonicalize_iupac(glycan):
     glycan = re.sub(r'([1-9]?[SP])(?!en)([A-Za-z]+)', r'\2\1', glycan)
   if bool(re.search(r'[1-9]?[SP]-[A-Za-z]+', glycan)):
     glycan = re.sub(r'([1-9]?[SP])-([A-Za-z]+)', r'\2\1', glycan)
+  # Handle malformed things like Gal-GlcNAc in an otherwise properly formatted string
+  glycan = re.sub(r'([a-z])\?', r'\1(?', glycan)
+  glycan = re.sub(r'([c-z])([1-2])-', r'\1(?\2-', glycan)
+  glycan = re.sub(r'-([\?2-9])([A-Z])', r'-\1)\2', glycan)
+  glycan = re.sub(r'([\?2-9])([\[\]])', r'\1)\2', glycan)
+  # Floating bits
+  if '+' in glycan:
+    if '-' not in glycan[:glycan.index('+')]:
+      glycan = glycan.replace('+', '(?1-?)+')
+    glycan = '{'+glycan.replace('+', '}')
   post_process = {'5Ac(?1': '5Ac(a2', '5Gc(?1': '5Gc(a2', '5Ac(a1': '5Ac(a2', '5Gc(a1': '5Gc(a2', 'Fuc(?': 'Fuc(a',
                   'GalS': 'GalOS', 'GlcNAcS': 'GlcNAcOS', 'GalNAcS': 'GalNAcOS', 'SGal': 'GalOS', 'Kdn(?1': 'Kdn(a2',
                   'Kdn(a1': 'Kdn(a2'}
@@ -905,9 +915,6 @@ def canonicalize_iupac(glycan):
   if '[' in glycan:
     isos = find_isomorphs(glycan)
     glycan = choose_correct_isoform(isos)
-  # Floating bits
-  if '+' in glycan:
-    glycan = '{'+glycan.replace('+', '}')
   if '{' in glycan:
     floating_bits = re.findall(r'\{.*?\}', glycan)
     sorted_floating_bits = ''.join(sorted(floating_bits, key = len, reverse = True))

diff --git a/setup.py b/setup.py
@@ -30,7 +30,7 @@
                            "Pillow", "openpyxl", "py3Dmol"],
                     'dev':["torch_geometric", "torch", "CairoSVG",
                            "drawSvg~=2.0", "glyles", "pubchempy", "requests",
-                           "Pillow", "openpyxl"],
+                           "Pillow", "openpyxl", "py3Dmol"],
                     'ml':["torch_geometric", "torch"],
                     'draw':["CairoSVG", "drawSvg~=2.0", "Pillow",
                             "openpyxl"],