minor speed-ups and version release prep

BojarLab · Aug 3, 2023 · 870af95
1 parent 02666b2
commit 870af95
Show file tree

Hide file tree

Showing 22 changed files with 38,046 additions and 35,803 deletions.
diff --git a/00_core.ipynb b/00_core.ipynb
diff --git a/01_glycan_data.ipynb b/01_glycan_data.ipynb
diff --git a/02_ml.ipynb b/02_ml.ipynb
diff --git a/03_motif.ipynb b/03_motif.ipynb
diff --git a/04_network.ipynb b/04_network.ipynb
diff --git a/05_examples.ipynb b/05_examples.ipynb
diff --git a/README.md b/README.md
@@ -51,10 +51,10 @@ alternative: <br>
 Note that we have optional extra installs for specialized use (even
 further instructions can be found in the `Examples` tab), such as: <br>
 *deep learning* <br> `pip install glycowork[ml]` <br> *drawing glycan
-images with GlycoDraw* <br> `pip install glycowork[draw]` <br>
-*analyzing atomic/chemical properties of glycans* <br>
-`pip install glycowork[chem]` <br> *everything* <br>
-`pip install glycowork[all]` <br>
+images with GlycoDraw (see install instructions in the `Examples` tab)*
+<br> `pip install glycowork[draw]` <br> *analyzing atomic/chemical
+properties of glycans* <br> `pip install glycowork[chem]` <br>
+*everything* <br> `pip install glycowork[all]` <br>
 
 ## Data & Models
 

diff --git a/_proc/.quarto/xref/2c7d6167 b/_proc/.quarto/xref/2c7d6167
@@ -1 +1 @@
-{"entries":[],"headings":["install","data-models","how-to-use"]}
+{"headings":["install","data-models","how-to-use"],"entries":[]}
diff --git a/_proc/00_core.ipynb b/_proc/00_core.ipynb
diff --git a/_proc/01_glycan_data.ipynb b/_proc/01_glycan_data.ipynb
diff --git a/_proc/02_ml.ipynb b/_proc/02_ml.ipynb
diff --git a/_proc/03_motif.ipynb b/_proc/03_motif.ipynb
diff --git a/_proc/04_network.ipynb b/_proc/04_network.ipynb
diff --git a/_proc/05_examples.ipynb b/_proc/05_examples.ipynb
diff --git a/_proc/_docs/index_files/figure-commonmark/cell-3-output-1.svg b/_proc/_docs/index_files/figure-commonmark/cell-3-output-1.svg
diff --git a/_proc/index.ipynb b/_proc/index.ipynb
diff --git a/build/lib/glycowork/motif/processing.py b/build/lib/glycowork/motif/processing.py
@@ -134,13 +134,11 @@ def presence_to_matrix(df, glycan_col_name = 'target', label_col_name = 'Species
   | :-
   | Returns pandas dataframe with labels as rows and glycan occurrences as columns
   """
-  glycans = sorted(set(df[glycan_col_name].values.tolist()))
-  species = sorted(set(df[label_col_name].values.tolist()))
-  # Get a count matrix for each rank - glycan combination
-  mat_dic = {k: [df[df[label_col_name] == j][glycan_col_name].values.tolist().count(k) for j in species] for k in glycans}
-  mat = pd.DataFrame(mat_dic)
-  mat.index = species
-  return mat
+  # Create a grouped dataframe where we count the occurrences of each glycan in each species group
+  grouped_df = df.groupby([label_col_name, glycan_col_name]).size().unstack(fill_value = 0)
+  # Sort the index and columns
+  grouped_df = grouped_df.sort_index().sort_index(axis = 1)
+  return grouped_df
 
 
 def find_matching_brackets_indices(s):
@@ -153,12 +151,12 @@ def find_matching_brackets_indices(s):
       stack.append(i)
       opening_indices[i] = len(stack) - 1
     elif c == ']':
-      if len(stack) > 0:
+      if stack:
         opening_index = stack.pop()
         matching_indices.append((opening_index, i))
         del opening_indices[opening_index]
 
-  if len(stack) > 0:
+  if stack:
     print("Unmatched opening brackets:", [s[i] for i in stack])
     return None
   else:

diff --git a/build/lib/glycowork/motif/tokenization.py b/build/lib/glycowork/motif/tokenization.py
@@ -327,15 +327,15 @@ def mz_to_composition(mz_value, mode = 'negative', mass_value = 'monoisotopic',
       if not filter_out.intersection(c.keys()):
         out = [c]
         break
-  if len(out) > 0:
+  if out:
     return out
   else:
     for m, c in cache.items():
       if abs(m+adduct - mz_value) < mass_tolerance:
         if not filter_out.intersection(c.keys()):
           out = [c]
           break
-    if len(out) > 0:
+    if out:
       return out
     else:
       mz_value = (mz_value+0.5*multiplier)*2+(reduced*1)
@@ -669,11 +669,14 @@ def composition_to_mass(dict_comp, mass_value = 'monoisotopic',
   | :-
   | Returns the theoretical mass of input composition
   """
-  mass_dict = dict(zip(mapping_file.composition, mapping_file[sample_prep + '_' + mass_value]))
+  if sample_prep + '_' + mass_value == "underivatized_monoisotopic":
+    mass_dict_in = mass_dict
+  else:
+    mass_dict_in = dict(zip(mapping_file.composition, mapping_file[sample_prep + '_' + mass_value]))
   for old_key, new_key in {'S': 'Sulphate', 'P': 'Phosphate', 'Me': 'Methyl', 'Ac': 'Acetate'}.items():
     if old_key in dict_comp:
       dict_comp[new_key] = dict_comp.pop(old_key)
-  return sum(mass_dict.get(k, 0) * v for k, v in dict_comp.items()) + mass_dict['red_end']
+  return sum(mass_dict_in.get(k, 0) * v for k, v in dict_comp.items()) + mass_dict_in['red_end']
 
 
 def glycan_to_mass(glycan, mass_value = 'monoisotopic', sample_prep = 'underivatized', stem_libr = None):

diff --git a/glycowork/motif/processing.py b/glycowork/motif/processing.py
@@ -134,13 +134,11 @@ def presence_to_matrix(df, glycan_col_name = 'target', label_col_name = 'Species
   | :-
   | Returns pandas dataframe with labels as rows and glycan occurrences as columns
   """
-  glycans = sorted(set(df[glycan_col_name].values.tolist()))
-  species = sorted(set(df[label_col_name].values.tolist()))
-  # Get a count matrix for each rank - glycan combination
-  mat_dic = {k: [df[df[label_col_name] == j][glycan_col_name].values.tolist().count(k) for j in species] for k in glycans}
-  mat = pd.DataFrame(mat_dic)
-  mat.index = species
-  return mat
+  # Create a grouped dataframe where we count the occurrences of each glycan in each species group
+  grouped_df = df.groupby([label_col_name, glycan_col_name]).size().unstack(fill_value = 0)
+  # Sort the index and columns
+  grouped_df = grouped_df.sort_index().sort_index(axis = 1)
+  return grouped_df
 
 
 def find_matching_brackets_indices(s):
@@ -153,12 +151,12 @@ def find_matching_brackets_indices(s):
       stack.append(i)
       opening_indices[i] = len(stack) - 1
     elif c == ']':
-      if len(stack) > 0:
+      if stack:
         opening_index = stack.pop()
         matching_indices.append((opening_index, i))
         del opening_indices[opening_index]
 
-  if len(stack) > 0:
+  if stack:
     print("Unmatched opening brackets:", [s[i] for i in stack])
     return None
   else:

diff --git a/glycowork/motif/tokenization.py b/glycowork/motif/tokenization.py
@@ -327,15 +327,15 @@ def mz_to_composition(mz_value, mode = 'negative', mass_value = 'monoisotopic',
       if not filter_out.intersection(c.keys()):
         out = [c]
         break
-  if len(out) > 0:
+  if out:
     return out
   else:
     for m, c in cache.items():
       if abs(m+adduct - mz_value) < mass_tolerance:
         if not filter_out.intersection(c.keys()):
           out = [c]
           break
-    if len(out) > 0:
+    if out:
       return out
     else:
       mz_value = (mz_value+0.5*multiplier)*2+(reduced*1)
@@ -669,11 +669,14 @@ def composition_to_mass(dict_comp, mass_value = 'monoisotopic',
   | :-
   | Returns the theoretical mass of input composition
   """
-  mass_dict = dict(zip(mapping_file.composition, mapping_file[sample_prep + '_' + mass_value]))
+  if sample_prep + '_' + mass_value == "underivatized_monoisotopic":
+    mass_dict_in = mass_dict
+  else:
+    mass_dict_in = dict(zip(mapping_file.composition, mapping_file[sample_prep + '_' + mass_value]))
   for old_key, new_key in {'S': 'Sulphate', 'P': 'Phosphate', 'Me': 'Methyl', 'Ac': 'Acetate'}.items():
     if old_key in dict_comp:
       dict_comp[new_key] = dict_comp.pop(old_key)
-  return sum(mass_dict.get(k, 0) * v for k, v in dict_comp.items()) + mass_dict['red_end']
+  return sum(mass_dict_in.get(k, 0) * v for k, v in dict_comp.items()) + mass_dict_in['red_end']
 
 
 def glycan_to_mass(glycan, mass_value = 'monoisotopic', sample_prep = 'underivatized', stem_libr = None):