bigbio · zprobot · Mar 28, 2024 · Mar 28, 2024 · Apr 4, 2024
diff --git a/.gitignore b/.gitignore
@@ -8,3 +8,5 @@ res.csv
 venv
 /venv/
 /compute-all.sh
+/build
+/dist
diff --git a/bin/__pycache__/normalize_methods.cpython-310.pyc b/bin/__pycache__/normalize_methods.cpython-310.pyc
diff --git a/bin/__pycache__/parquet.cpython-310.pyc b/bin/__pycache__/parquet.cpython-310.pyc
diff --git a/bin/normalize_methods.py b/bin/normalize_methods.py
@@ -0,0 +1,111 @@
+import numpy as np
+import pandas as pd
+from sklearn.preprocessing import quantile_transform
+
+def normalize_run(df,sdrf_path,method):
+    """
+    Rescale 'NormIntensity' so that the runs of each sample share a common level.
+
+    Nothing is changed unless get_replicate reports more than one technical
+    replicate value for the SDRF file. For every sample that was measured in
+    more than one run, the intensities of each run are divided by
+    (run statistic / sample base), both obtained from get_normalize_args.
+
+    :param df: table with 'SampleID', 'Run' and 'NormIntensity' columns; modified in place
+    :param sdrf_path: path of the SDRF file, forwarded to get_replicate
+    :param method: name of the statistic, forwarded to get_normalize_args
+    :return: the same df object
+    """
+    if get_replicate(sdrf_path) <= 1:
+        return df
+    for sample_id in df['SampleID'].unique():
+        in_sample = df['SampleID'] == sample_id
+        sample_runs = df.loc[in_sample, 'Run'].unique().tolist()
+        if len(sample_runs) <= 1:
+            continue
+        # The statistics are taken once, before any run of this sample is rescaled.
+        run_stats, base = get_normalize_args(df.loc[in_sample, :], sample_runs, method)
+        for run_name in map(str, sample_runs):
+            rows = in_sample & (df['Run'] == run_name)
+            df.loc[rows, 'NormIntensity'] = df.loc[rows, 'NormIntensity'] / (run_stats[run_name] / base)
+    return df
+
+def get_replicate(sdrf_path):
+    """
+    Count the distinct values of the 'comment[technical replicate]' column.
+
+    :param sdrf_path: path of a tab-separated SDRF file
+    :return: number of unique values found in that column
+    """
+    sdrf_table = pd.read_csv(sdrf_path, sep="\t")
+    replicate_values = sdrf_table["comment[technical replicate]"].unique()
+    return len(replicate_values)
+
+def get_normalize_args(df,runs,method):
+    """
+    Dispatch to the per-run statistic helper selected by ``method``.
+
+    :param df: rows of one sample, passed through to the helper
+    :param runs: run identifiers of that sample, passed through to the helper
+    :param method: 'mean', 'median' or 'iqr'
+    :return: the selected helper's result (a per-run mapping and a base value)
+    :raises ValueError: if ``method`` is not one of the supported names
+    """
+    match method:
+        case 'mean':
+            return normalize_mean(df,runs)
+        case 'median':
+            return normalize_median(df,runs)
+        case 'iqr':
+            return normalize_q(df,runs)
+        case _:
+            # Falling through used to return None, which only surfaced later as
+            # an unpacking TypeError in the caller; fail here with a clear message.
+            raise ValueError(f"unsupported normalization method: {method!r}")
+
+def normalize_mean(df,runs):
+    """
+    Compute the mean 'NormIntensity' of every run and the average of those means.
+
+    :param df: table with 'Run' and 'NormIntensity' columns
+    :param runs: run identifiers; each is converted to str before matching 'Run'
+    :return: (dict of str(run) -> run mean, sum of the run means / len(runs))
+    """
+    run_names = [str(run) for run in runs]
+    run_means = [df.loc[df['Run']==name,'NormIntensity'].mean() for name in run_names]
+    return dict(zip(run_names,run_means)), sum(run_means) / len(runs)
+
+def normalize_median(df,runs):
+    """
+    Compute the median 'NormIntensity' of every run and the average of those medians.
+
+    :param df: table with 'Run' and 'NormIntensity' columns
+    :param runs: run identifiers; each is converted to str before matching 'Run'
+    :return: (dict of str(run) -> run median, sum of the run medians / len(runs))
+    """
+    run_names = [str(run) for run in runs]
+    run_medians = [df.loc[df['Run']==name,'NormIntensity'].median() for name in run_names]
+    return dict(zip(run_names,run_medians)), sum(run_medians) / len(runs)
+
+def normalize_q(df,runs):
+    """
+    Compute, per run, the mean of the 0.75 and 0.25 quantiles of 'NormIntensity'
+    (linear interpolation), plus the average of those per-run values.
+
+    :param df: table with 'Run' and 'NormIntensity' columns
+    :param runs: run identifiers; each is converted to str before matching 'Run'
+    :return: (dict of str(run) -> quartile mean, sum of the per-run values / len(runs))
+    """
+    run_names = [str(run) for run in runs]
+    quartile_means = []
+    for name in run_names:
+        intensities = df.loc[df['Run']==name,'NormIntensity']
+        quartiles = intensities.quantile([0.75,0.25],interpolation='linear')
+        quartile_means.append(quartiles.mean())
+    return dict(zip(run_names,quartile_means)), sum(quartile_means) / len(runs)
+
+def normalize(df,method):
+    """
+    Apply the normalization selected by ``method`` to df.
+
+    :param df: data handed to the selected helper
+    :param method: 'mean', 'median', 'max', 'global' or 'max_min'
+    :return: the helper's result, or -1 when ``method`` is none of the above
+    """
+    if method == 'mean':
+        return mean_normalize(df)
+    if method == 'median':
+        return median_normalize(df)
+    if method == 'max':
+        return max_normalize(df)
+    if method == 'global':
+        return global_normalize(df)
+    if method == 'max_min':
+        return max_min_mormalize(df)
+    # Unknown names are reported with the sentinel -1 rather than an exception.
+    return -1
+
+# mean
+def mean_normalize(df):
+    """Divide df by the result of its own ``mean()``."""
+    centre = df.mean()
+    return df / centre
+
+# median
+def median_normalize(df):
+    """Divide df by the result of its own ``median()``."""
+    centre = df.median()
+    return df / centre
+
+# max
+def max_normalize(df):
+    """Divide df by the result of its own ``max()``."""
+    largest = df.max()
+    return df / largest
+
+# global
+def global_normalize(df):
+    """Divide df by the result of its own ``sum()``."""
+    total = df.sum()
+    return df / total
+
+# max-min
+def max_min_mormalize(df):
+    """Return (df - df.min()) / (df.max() - df.min())."""
+    lowest = df.min()
+    span = df.max() - lowest
+    return (df - lowest) / span
+
+# quantile
+def quantile_normalize(df):
+    """
+    Apply ``quantile_transform`` to df and return the result as a DataFrame.
+
+    :param df: DataFrame to transform
+    :return: new DataFrame holding the transformed values under df's index and columns
+    """
+    # The previous version stored the transformed values in an unused variable
+    # and rebuilt the frame from the untransformed input, so the data came back
+    # unchanged. Build the result from the transformed values instead.
+    transformed = quantile_transform(df)
+    return pd.DataFrame(transformed,columns=df.columns,index=df.index)
diff --git a/bin/parquet.py b/bin/parquet.py
@@ -0,0 +1,46 @@
+import re
+import os
+import duckdb
+class Feature:
+    """
+    Query a parquet file through a DuckDB view named ``parquet_db``.
+    """
+
+    def __init__(self, parquet_path: str):
+        """
+        Open a DuckDB connection and expose the parquet file as the view ``parquet_db``.
+
+        :param parquet_path: path of the parquet file
+        :raises FileNotFoundError: if parquet_path does not exist
+        """
+        if not os.path.exists(parquet_path):
+            raise FileNotFoundError(f'the file {parquet_path} does not exist.')
+        # The path is embedded as a SQL string literal, so single quotes in it
+        # are doubled to keep the statement well-formed.
+        quoted_path = str(parquet_path).replace("'", "''")
+        self.parquet_db = duckdb.connect()
+        self.parquet_db = self.parquet_db.execute(
+            "CREATE VIEW parquet_db AS SELECT * FROM parquet_scan('{}')".format(quoted_path))
+
+    def get_report_from_database(self, samples: list):
+        """
+        Load every row of ``parquet_db`` whose sample_accession is in ``samples``.
+
+        :param samples: sample accessions to select
+        :return: the matching rows, as returned by DuckDB's ``df()``
+        """
+        samples = list(samples)
+        if not samples:
+            # An empty IN list cannot be written; select no rows but keep the columns.
+            return self.parquet_db.execute("select * from parquet_db where 1 = 0").df()
+        # Bind the accessions as parameters instead of formatting a Python tuple
+        # into the SQL text: the tuple repr is "('x',)" for a single sample and
+        # switches to double quotes for values containing an apostrophe.
+        placeholders = ", ".join("?" for _ in samples)
+        query = f"select * from parquet_db where sample_accession IN ({placeholders})"
+        return self.parquet_db.execute(query, samples).df()
+
+    def iter_samples(self,file_num:int=20):
+        """
+        Iterate over the data in batches of ``file_num`` samples.
+
+        :param file_num: number of samples loaded per batch (default 20)
+        :yield: (sample accessions of the batch, result of get_report_from_database for them)
+        """
+        samples = self.get_unique_samples()
+        for start in range(0,len(samples), file_num):
+            refs = samples[start:start+file_num]
+            yield refs,self.get_report_from_database(refs)
+
+    def get_unique_samples(self):
+        """
+        :return: list of the distinct sample_accession values in ``parquet_db``
+        """
+        unique_samples = self.parquet_db.sql("SELECT DISTINCT sample_accession FROM parquet_db").df()
+
+        return unique_samples['sample_accession'].tolist()
-Original file line number
+Diff line change
@@ Expand Up / @@ -8,3 +8,5 @@ res.csv @@
     venv
     /venv/
     /compute-all.sh
+    /build
+    /dist