Merge pull request #2 from levitsky/minihackathon-1

Add materials and update teaching module
hds-sandbox · Nov 8, 2024 · 10f5e78 · 10f5e78
2 parents 30db0a2 + 565fbf0
commit 10f5e78
Show file tree

Hide file tree

Showing 4 changed files with 410 additions and 37 deletions.
diff --git a/Create SDRF.ipynb b/Create SDRF.ipynb
@@ -0,0 +1,268 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "888301cb-544d-45ed-9100-c7893fbb57a4",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "import re"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "e40954cb-ee9c-47d8-a5de-065a175c0797",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df = pd.read_table('sdrf (3).tsv')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "2f5251dd-2e6e-4c3a-9189-b7f463bbe178",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Constant annotations that apply to every row of the SDRF table.\n",
+    "df['comment[instrument]'] = 'Q Exactive'\n",
+    "df['comment[technical replicate]'] = 1\n",
+    "df['characteristics[organism]'] = 'Homo Sapiens'\n",
+    "df['characteristics[organism part]'] = 'breast'\n",
+    "# Fraction number parsed from file names containing 'fr<N>.raw'. The dot is escaped so\n",
+    "# that only a literal '.raw' matches, and expand=False makes extract() return a Series\n",
+    "# instead of a one-column DataFrame.\n",
+    "df['comment[fraction identifier]'] = (\n",
+    "    df['comment[data file]'].str.extract(r'fr(\\d+)\\.raw', expand=False).astype(int)\n",
+    ")\n",
+    "# Download location of each raw file: repository base URL + file name.\n",
+    "df['comment[file uri]'] = 'https://storage.jpostdb.org/JPST000265/' + df['comment[data file]']\n",
+    "df['technology type'] = 'proteomic profiling by mass spectrometry'\n",
+    "df['characteristics[ancestry category]'] = 'not available'\n",
+    "df['characteristics[age]'] = 'not available'\n",
+    "df['characteristics[sex]'] = 'female'\n",
+    "df['characteristics[cell type]'] = 'malignant cell'\n",
+    "df['characteristics[biological replicate]'] = 1"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "578230a6-3ccf-41ff-9731-9eefe0d2b55f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "tumor_id = pd.read_excel('41467_2019_9018_MOESM3_ESM.xlsx', sheet_name='Tumor annotations', usecols=['Tumor ID', 'TMT set nr', 'TMT tag', 'PAM50 subtype'], index_col=(1, 2))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "7beb75d9-bb9b-45bc-8628-a08559b2c220",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "tumor_id.index = pd.MultiIndex.from_tuples([(s, str(label)) for s, label in tumor_id.index], names=tumor_id.index.names)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "ead191c9-aef3-431a-816c-c15273f052df",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "tumor_types = {\n",
+    "    'Basal': 'basal-like breast carcinoma',\n",
+    "    'LumA': 'luminal A breast carcinoma',\n",
+    "    'LumB': 'luminal B breast carcinoma',\n",
+    "    'HER2': 'HER2 Positive Breast Carcinoma',\n",
+    "    'Normal': 'Normal Breast-Like Subtype of Breast Carcinoma'\n",
+    "}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "65f5a20e-4d12-48cc-9a32-cdcecad4f602",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th>Tumor ID</th>\n",
+       "      <th>PAM50 subtype</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>TMT set nr</th>\n",
+       "      <th>TMT tag</th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th rowspan=\"5\" valign=\"top\">1</th>\n",
+       "      <th>126</th>\n",
+       "      <td>OSL.53E</td>\n",
+       "      <td>Basal</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>127N</th>\n",
+       "      <td>OSL.567</td>\n",
+       "      <td>LumA</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>127C</th>\n",
+       "      <td>OSL.3FF</td>\n",
+       "      <td>Basal</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>128N</th>\n",
+       "      <td>OSL.55F</td>\n",
+       "      <td>Basal</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>128C</th>\n",
+       "      <td>OSL.46A</td>\n",
+       "      <td>Basal</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                   Tumor ID PAM50 subtype\n",
+       "TMT set nr TMT tag                       \n",
+       "1          126      OSL.53E         Basal\n",
+       "           127N     OSL.567          LumA\n",
+       "           127C     OSL.3FF         Basal\n",
+       "           128N     OSL.55F         Basal\n",
+       "           128C     OSL.46A         Basal"
+      ]
+     },
+     "execution_count": 7,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "tumor_id.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "8ecf547f-1594-48f2-aa54-67cdc4257620",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Value used for pooled channels: 'SN=' followed by the comma-separated Tumor IDs.\n",
+    "pool_str = 'SN=' + ','.join(tumor_id['Tumor ID'].values)\n",
+    "\n",
+    "def get_info(row):\n",
+    "    \"\"\"Derive the sample-level SDRF values for one row of the table.\n",
+    "\n",
+    "    The pool number is read from the data file name and, together with the\n",
+    "    label (minus its first three characters), used to look the channel up in\n",
+    "    ``tumor_id``. Channels that are not listed there are annotated as the\n",
+    "    pooled sample.\n",
+    "\n",
+    "    Returns a tuple ``(source name, assay name, pooled sample, disease)``.\n",
+    "\n",
+    "    Raises ValueError if the file name contains no ``pool<digit>`` part, and\n",
+    "    KeyError if a tumor's PAM50 subtype has no entry in ``tumor_types``.\n",
+    "    \"\"\"\n",
+    "    data_file = row['comment[data file]']\n",
+    "    pool_match = re.search(r'pool(\\d)', data_file)\n",
+    "    if pool_match is None:\n",
+    "        raise ValueError(f'Cannot determine the pool number from file name {data_file!r}')\n",
+    "    pool = int(pool_match.group(1))\n",
+    "    channel = (pool, row['comment[label]'][3:])\n",
+    "    try:\n",
+    "        sample = tumor_id.loc[channel, 'Tumor ID']\n",
+    "        subtype = tumor_id.loc[channel, 'PAM50 subtype']\n",
+    "    except KeyError:\n",
+    "        # Channel not present in the tumor annotations: treat it as the pooled sample.\n",
+    "        sample = 'pool'\n",
+    "        disease = 'breast cancer'\n",
+    "        pooled = pool_str\n",
+    "    else:\n",
+    "        # Mapped outside the try block so that an unknown subtype raises instead of\n",
+    "        # being silently annotated as the pooled sample.\n",
+    "        disease = tumor_types[subtype]\n",
+    "        pooled = 'not pooled'\n",
+    "    assay = f\"pool {pool}, fraction {row['comment[fraction identifier]']}\"\n",
+    "    return sample, assay, pooled, disease"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "f37d664a-a044-4b38-96de-d2e561819468",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df[['source name', 'assay name', 'characteristics[pooled sample]', 'characteristics[disease]']] = df.apply(get_info, axis=1, result_type='expand')\n",
+    "df['factor value[disease]'] = df['characteristics[disease]']"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "id": "ab2be8a8-237f-4ae1-bdc7-b70024f828bb",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def key(colname):\n",
+    "    \"\"\"Return the sort rank (0-6) of a column, chosen from its name.\n",
+    "\n",
+    "    Exact names are ranked first-hand ('source name' 0, 'assay name' 2,\n",
+    "    'technology type' 3); otherwise the name's prefix decides\n",
+    "    ('characteristics' 1, 'comment' 4, 'factor value' 5). Anything else gets 6.\n",
+    "    \"\"\"\n",
+    "    exact_ranks = {'source name': 0, 'assay name': 2, 'technology type': 3}\n",
+    "    prefix_ranks = (('characteristics', 1), ('comment', 4), ('factor value', 5))\n",
+    "    if colname in exact_ranks:\n",
+    "        return exact_ranks[colname]\n",
+    "    for prefix, rank in prefix_ranks:\n",
+    "        if colname.startswith(prefix):\n",
+    "            return rank\n",
+    "    return 6"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "id": "c95778b3-4230-4890-bf9b-7b276a8667cb",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df = df[sorted(df.columns, key=key)]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "id": "1dde5960-3afc-4f25-a71d-dddd58cedb8f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df.to_csv('PXD008841.sdrf.tsv', sep='\\t', index=False)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/sdrf.qmd b/sdrf.qmd
@@ -0,0 +1,103 @@
+---
+title: "Breast Cancer Proteomics Module: SDRF"
+summary: Teaching module provided for the proteomics module. 
+date: last-modified
+author: Lev Levitsky
+hide:
+  - navigation
+---
+
+
+::: {.question}
+Suppose you want to reproduce the figures from the article. What do you need for that, apart from the experimental data files?
+:::
+
+### SDRF metadata format
+
+Familiarize yourself with the **Sample and Data Relationship Format for Proteomics** (SDRF-Proteomics): read its [general description](https://github.com/bigbio/proteomics-sample-metadata)
+and take a look at the [specification](https://github.com/bigbio/proteomics-sample-metadata/blob/master/sdrf-proteomics/README.adoc). Then answer a few questions:
+
+::: {.question}
+What is the general layout of an SDRF file?
+:::
+
+::: {.question}
+What is the scope of information contained in an SDRF file?
+:::
+
+::: {.question}
+What columns would capture the most important sample characteristics for the dataset you are working with?
+:::
+
+::: {.question}
+How are the valid values defined for different columns?
+:::
+
+
+We will now create an annotation for a small subset of our data according to the SDRF-Proteomics standard. Go to [lesSDRF](https://lessdrf.streamlit.app/),
+then start a new SDRF annotation with the human template. When asked for file names, input only the first RAW file name from the annotation table found in the supporting information.
+After that, proceed to step 2, labeling.
+
+Carefully select the list of labels corresponding to the dataset, then click "Submit selection", and proceed to specify that every label is present in "ALL" files. Then click "Ready" and proceed to step 3.
+
+::: {.question}
+How many rows does the SDRF table have now? How many would it have if we annotated the entire dataset?
+:::
+
+Fill in the first three required columns one by one: source name, organism, and organism part.
+
+::: {.question}
+What would be a good sample identifier for the "source name" column?
+:::
+
+Fill in the next column, cell type. Consider that we are dealing with cancer samples. For the next columns, ancestry category and age, you can select "not available".
+Then, fill in the "sex" column.
+
+It is time to fill in the disease column.
+
+::: {.question}
+How many different values for disease can we possibly have in our annotation? What do they correspond to?
+:::
+
+::: {.question}
+How many different values will we actually use when annotating the selected subset of data?
+:::
+
+Proceed to fill in the disease column, then fill in the rest of the columns, up to and including "instrument".
+
+::: {.question}
+How many different values should you use in the "assay name" column when annotating the subset? Why? How many different values should there be in the entire annotation?
+:::
+
+When you have only four columns left (cleavage agent details, modifications, precursor and fragment mass tolerance), skip to step 4 and fill in the factor value column.
+
+::: {.question}
+What is the meaning of factor value, and what should it be in this case?
+:::
+
+After that, download the resulting file. Copy the cleavage agent, modifications and mass tolerance information from the partial annotation provided by FragPipe into your file,
+using Excel or similar software. Congratulations! You have a complete annotation according to the SDRF standard, but only for one of the raw files in the dataset.
+If you need a grade for this course, submit your SDRF file together with your answers.
+
+::: {.question}
+How would you go about making a full dataset annotation?
+:::
+
+## Personal Details
+
+**Name:**
+<textarea id="name" placeholder="Type your name here..." class="input-box"></textarea>
+
+**Email:**
+<textarea id="email" placeholder="Type your email here..." class="input-box"></textarea>
+
+**Course/Program:**
+<textarea id="course" placeholder="Type your course/program here..." class="input-box"></textarea>
+
+**Date:**
+<textarea id="date" placeholder="Type the date here..." class="input-box"></textarea>
+
+<button id="downloadBtn" class="download-button">Download Your Answers as PDF</button>
+
+<script src="https://cdnjs.cloudflare.com/ajax/libs/html2pdf.js/0.9.3/html2pdf.bundle.min.js"></script>
+