diff --git a/genomicFeatures_test.ipynb b/genomicFeatures_test.ipynb new file mode 100644 index 0000000..8b8263d --- /dev/null +++ b/genomicFeatures_test.ipynb @@ -0,0 +1,241 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "a699ddc4-502f-418e-9f26-99677ad07cff", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "The downloaded binary packages are in\n", + "\t/var/folders/zs/gjblv2b16g3b50jqcq6fw76m0000gq/T//RtmpDxAFv6/downloaded_packages\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "'getOption(\"repos\")' replaces Bioconductor standard repositories, see\n", + "'help(\"repositories\", package = \"BiocManager\")' for details.\n", + "Replacement repositories:\n", + " CRAN: https://cran.r-project.org\n", + "\n", + "Bioconductor version 3.18 (BiocManager 1.30.22), R 4.3.3 (2024-02-29)\n", + "\n", + "Installing package(s) 'BiocVersion'\n", + "\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "The downloaded binary packages are in\n", + "\t/var/folders/zs/gjblv2b16g3b50jqcq6fw76m0000gq/T//RtmpDxAFv6/downloaded_packages\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Old packages: 'boot', 'codetools', 'lattice'\n", + "\n" + ] + } + ], + "source": [ + "if (!require(\"BiocManager\", quietly = TRUE))\n", + " install.packages(\"BiocManager\")\n", + "BiocManager::install(version = \"3.18\")" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "a54fc80d-569d-409d-8163-1ff4215a6a7c", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "'getOption(\"repos\")' replaces Bioconductor standard repositories, see\n", + "'help(\"repositories\", package = \"BiocManager\")' for details.\n", + "Replacement repositories:\n", + " CRAN: https://cran.r-project.org\n", + "\n", + "Bioconductor version 3.18 (BiocManager 1.30.22), R 4.3.3 (2024-02-29)\n", 
+ "\n", + "Installing package(s) 'TxDb.Hsapiens.UCSC.hg38.knownGene'\n", + "\n", + "installing the source package ‘TxDb.Hsapiens.UCSC.hg38.knownGene’\n", + "\n", + "\n", + "Old packages: 'boot', 'codetools', 'lattice'\n", + "\n" + ] + } + ], + "source": [ + "# BiocManager::install(\"GenomicFeatures\")\n", + "BiocManager::install(\"TxDb.Hsapiens.UCSC.hg38.knownGene\")" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "9f2ca2f7-6c28-4a70-ace8-3015e7fbf0b1", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "TxDb object:\n", + "# Db type: TxDb\n", + "# Supporting package: GenomicFeatures\n", + "# Data source: UCSC\n", + "# Genome: hg38\n", + "# Organism: Homo sapiens\n", + "# Taxonomy ID: 9606\n", + "# UCSC Table: knownGene\n", + "# UCSC Track: GENCODE V44\n", + "# Resource URL: http://genome.ucsc.edu/\n", + "# Type of Gene ID: Entrez Gene ID\n", + "# Full dataset: yes\n", + "# miRBase build ID: NA\n", + "# Nb of transcripts: 276905\n", + "# Db created by: GenomicFeatures package from Bioconductor\n", + "# Creation time: 2023-09-20 17:25:17 +0000 (Wed, 20 Sep 2023)\n", + "# GenomicFeatures version at creation time: 1.53.2\n", + "# RSQLite version at creation time: 2.3.1\n", + "# DBSCHEMAVERSION: 1.2" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "library(TxDb.Hsapiens.UCSC.hg38.knownGene)\n", + "txdb <- TxDb.Hsapiens.UCSC.hg38.knownGene\n", + "txdb" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "2ded4ebf-2390-4261-93ee-464bcb7d58db", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 2135 genes were dropped because they have exons located on both strands\n", + " of the same reference sequence or on more than one reference sequence,\n", + " so cannot be represented by a single genomic range.\n", + " Use 'single.strand.genes.only=FALSE' to get all the genes in a\n", + " GRangesList object, or use suppressMessages() to 
suppress this message.\n", + "\n" + ] + }, + { + "data": { + "text/plain": [ + "GRanges object with 30733 ranges and 1 metadata column:\n", + " seqnames ranges strand | gene_id\n", + " | \n", + " 1 chr19 58345178-58362751 - | 1\n", + " 10 chr8 18386311-18401218 + | 10\n", + " 100 chr20 44584896-44652252 - | 100\n", + " 1000 chr18 27932879-28177946 - | 1000\n", + " 100008586 chrX 49551278-49568218 + | 100008586\n", + " ... ... ... ... . ...\n", + " 9990 chr15 34229784-34338060 - | 9990\n", + " 9991 chr9 112217716-112333664 - | 9991\n", + " 9992 chr21 34364006-34371381 + | 9992\n", + " 9993 chr22 19036282-19122454 - | 9993\n", + " 9997 chr22 50523568-50526461 - | 9997\n", + " -------\n", + " seqinfo: 711 sequences (1 circular) from hg38 genome" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "genes(txdb)" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "41f123b2-6f97-4f33-b54f-46c07603a432", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 2135 genes were dropped because they have exons located on both strands\n", + " of the same reference sequence or on more than one reference sequence,\n", + " so cannot be represented by a single genomic range.\n", + " Use 'single.strand.genes.only=FALSE' to get all the genes in a\n", + " GRangesList object, or use suppressMessages() to suppress this message.\n", + "\n" + ] + }, + { + "data": { + "text/plain": [ + "GRanges object with 30733 ranges and 1 metadata column:\n", + " seqnames ranges strand | gene_id\n", + " | \n", + " 1 chr19 58345178-58362751 - | 1\n", + " 10 chr8 18386311-18401218 + | 10\n", + " 100 chr20 44584896-44652252 - | 100\n", + " 1000 chr18 27932879-28177946 - | 1000\n", + " 100008586 chrX 49551278-49568218 + | 100008586\n", + " ... ... ... ... . 
...\n", + " 9990 chr15 34229784-34338060 - | 9990\n", + " 9991 chr9 112217716-112333664 - | 9991\n", + " 9992 chr21 34364006-34371381 + | 9992\n", + " 9993 chr22 19036282-19122454 - | 9993\n", + " 9997 chr22 50523568-50526461 - | 9997\n", + " -------\n", + " seqinfo: 711 sequences (1 circular) from hg38 genome" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "genes(txdb)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "R", + "language": "R", + "name": "ir" + }, + "language_info": { + "codemirror_mode": "r", + "file_extension": ".r", + "mimetype": "text/x-r-source", + "name": "R", + "pygments_lexer": "r", + "version": "4.3.3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/src/genomic_features/__init__.py b/src/genomic_features/__init__.py index 679dc43..b5da4de 100644 --- a/src/genomic_features/__init__.py +++ b/src/genomic_features/__init__.py @@ -1,7 +1,7 @@ from importlib.metadata import version -from . import ensembl, filters +from . 
import ensembl, filters, ucsc -__all__ = ["ensembl"] +__all__ = ["ensembl", "ucsc"] __version__ = version("genomic-features") diff --git a/src/genomic_features/ucsc/__init__.py b/src/genomic_features/ucsc/__init__.py new file mode 100644 index 0000000..a363549 --- /dev/null +++ b/src/genomic_features/ucsc/__init__.py @@ -0,0 +1 @@ +from .ucscdb import UCSCDB, annotation, list_ucscdb_annotations
diff --git a/src/genomic_features/ucsc/ucscdb.py b/src/genomic_features/ucsc/ucscdb.py
new file mode 100644
index 0000000..c37a3ec
--- /dev/null
+++ b/src/genomic_features/ucsc/ucscdb.py
@@ -0,0 +1,296 @@
+from __future__ import annotations
+
+import os
+from functools import cached_property
+from pathlib import Path
+from typing import Literal
+
+import ibis
+import requests
+from ibis import deferred
+from ibis.expr.types import Table as IbisTable
+from pandas import DataFrame, Timestamp
+from requests.exceptions import HTTPError
+
+from genomic_features._core import filters as _filters
+from genomic_features._core.cache import retrieve_annotation
+
+PKG_CACHE_DIR = "genomic-features"
+
+BIOC_ANNOTATION_HUB_URL = (
+    "https://bioconductorhubs.blob.core.windows.net/annotationhub/"
+)
+ANNOTATION_HUB_URL = (
+    "https://annotationhub.bioconductor.org/metadata/annotationhub.sqlite3"
+)
+TIMESTAMP_URL = "https://annotationhub.bioconductor.org/metadata/database_timestamp"
+
+# Names of the TxDb tables read by UCSCDB.
+_TX_TABLE = "transcript"
+_EXONS_TABLE = "exon"
+_GENES_TABLE = "gene"
+
+# Raw TxDb column name -> user-facing column name. Transcript and exon
+# coordinates share target names; the mapping is applied to one table at a time.
+_PRETTY_NAMES = {
+    "_tx_id": "tx_id",
+    "tx_chrom": "chrom",
+    "tx_strand": "strand",
+    "tx_start": "start",
+    "tx_end": "end",
+    "_exon_id": "exon_id",
+    "exon_chrom": "chrom",
+    "exon_strand": "strand",
+    "exon_start": "start",
+    "exon_end": "end",
+}
+
+
+def annotation(
+    species: str, bioc_version: str, assembly: str, ucsc_table: str
+) -> UCSCDB:
+    """Open a UCSC TxDb annotation database.
+
+    Parameters
+    ----------
+    species
+        Species name as spelled in the TxDb file name, e.g. Hsapiens.
+    bioc_version
+        Bioconductor release, e.g. 3.18.
+    assembly
+        Genome assembly, e.g. hg38.
+    ucsc_table
+        UCSC table the TxDb was built from, e.g. knownGene.
+
+    Returns
+    -------
+    A UCSCDB wrapping an ibis SQLite connection to the retrieved file.
+
+    Raises
+    ------
+    ValueError
+        If the server answers 404 for the requested combination.
+    requests.exceptions.HTTPError
+        For any other HTTP error; the original exception is re-raised.
+
+    Usage
+    -----
+    >>> gf.ucsc.annotation("Hsapiens", "3.18", "hg38", "knownGene")
+    """
+    url = os.path.join(
+        BIOC_ANNOTATION_HUB_URL,
+        f"ucsc/standard/{bioc_version}/TxDb.{species}.UCSC.{assembly}.{ucsc_table}.sqlite",
+    )
+    try:
+        ucscdb = UCSCDB(ibis.sqlite.connect(retrieve_annotation(url)))
+    except HTTPError as err:
+        # An HTTPError is not guaranteed to carry a response object.
+        status_code = getattr(err.response, "status_code", None)
+        if status_code == 404:
+            raise ValueError(
+                f"No ucsc TxDb database found for {species} {bioc_version} {assembly} {ucsc_table}. Check available versions with `genomic_features.ucsc.list_ucscdb_annotations`."
+            ) from err
+        # Bare raise keeps the original message and response; `raise HTTPError`
+        # would replace them with an empty exception.
+        raise
+    return ucscdb
+
+
+def list_ucscdb_annotations(species: None | str | list[str] = None) -> DataFrame:
+    """List available UCSC TxDb gene annotations.
+
+    Parameters
+    ----------
+    species
+        Show gene annotations for subset of species E.g. Hsapiens for human, Mmusculus
+        for mouse (optional)
+
+    Returns
+    -------
+    A table of available species, assemblies, UCSC tables and Bioconductor versions.
+
+    Raises
+    ------
+    ValueError
+        If `species` is given and matches no annotation.
+
+    Usage
+    -----
+    >>> gf.ucsc.list_ucscdb_annotations("Mmusculus")
+    """
+    _COL_ORDERS = ["species", "assembly", "ucsc_table", "bioc_version"]
+    # Get latest AnnotationHub timestamp
+    db_path = Path(retrieve_annotation(ANNOTATION_HUB_URL))
+    timestamp = requests.get(TIMESTAMP_URL).text
+    ahdb = ibis.sqlite.connect(db_path)
+    latest_ts = Timestamp(timestamp).replace(tzinfo=None)
+    cached_ts = ahdb.table("timestamp").execute()["timestamp"][0]
+    if latest_ts != cached_ts:
+        # Cached metadata is stale: delete it and retrieve it again.
+        db_path.unlink()
+        ahdb = ibis.sqlite.connect(retrieve_annotation(ANNOTATION_HUB_URL))
+
+    version_table = (
+        ahdb.table("rdatapaths").filter(deferred.rdataclass == "TxDb").execute()
+    )
+    # Keep UCSC entries only; copy so the columns added below go to an
+    # independent frame instead of a slice of the query result.
+    version_table = version_table[
+        version_table["rdatapath"].map(lambda x: x.split("/")[0] == "ucsc")
+    ].copy()
+
+    # rdatapath layout (see `annotation`):
+    # ucsc/standard/{bioc_version}/TxDb.{species}.UCSC.{assembly}.{ucsc_table}.sqlite
+    path_parts = version_table["rdatapath"].str.split("/", expand=True)
+    file_parts = path_parts[3].str.split(".", expand=True)
+    version_table["bioc_version"] = path_parts[2]
+    version_table["species"] = file_parts[1]
+    version_table["assembly"] = file_parts[3]
+    version_table["ucsc_table"] = file_parts[4]
+    # `Athaliana` do not follow the normal name formatting, drop them.
+    version_table = version_table[version_table["ucsc_table"] != "sqlite"]
+
+    if species is not None:
+        # Collect the choices before filtering: once an unknown species has been
+        # filtered, the table is empty and cannot feed the error message.
+        available_species = sorted(version_table["species"].dropna().unique())
+        if isinstance(species, str):
+            version_table = version_table[version_table["species"] == species]
+        else:
+            version_table = version_table[version_table["species"].isin(species)]
+        # check that species exist
+        if version_table.shape[0] == 0:
+            raise ValueError(
+                f'No ucsc database found for {species}. Must be in {" ".join(available_species)}.'
+            )
+
+    return version_table[_COL_ORDERS].sort_values(_COL_ORDERS)
+
+
+class UCSCDB:
+    """UCSC annotation database.
+
+    Wraps an ibis connection to a TxDb SQLite database.
+    """
+
+    def __init__(self, connection: ibis.BaseBackend):
+        self.db = connection
+
+    @cached_property
+    def metadata(self) -> dict:
+        """Rows of the `metadata` table as a name -> value dict (cached)."""
+        metadata_tbl = self.db.table("metadata").execute()
+        return dict(zip(metadata_tbl["name"], metadata_tbl["value"]))
+
+    def __repr__(self) -> str:
+        d = self.metadata
+        return f"UCSCDB(organism='{d['Organism']}', ucsc_track='{d['UCSC Track']}', genome='{d['Genome']}', ucsc_table='{d['UCSC Table']}')"
+
+    def chrominfo(self) -> DataFrame:
+        """Return the `chrominfo` table."""
+        return self.db.table("chrominfo").execute()
+
+    def list_tables(self) -> list:
+        """Return the names of the tables in the database."""
+        return self.db.list_tables()
+
+    def transcripts(
+        self,
+        # cols: list[str] | None = None,
+        # filter: _filters.AbstractFilterExpr = filters.EmptyFilter(),
+    ) -> DataFrame:
+        """Return every transcript, with columns renamed via `_PRETTY_NAMES`."""
+        tx = self.db.table(_TX_TABLE).execute()
+        tx = tx.rename(columns=_PRETTY_NAMES)
+        tx = tx.drop("tx_type", axis=1)  # always None
+        return tx
+
+    def exons(
+        self,
+        # cols: list[str] | None = None,
+        # filter: _filters.AbstractFilterExpr = filters.EmptyFilter(),
+    ) -> DataFrame:
+        """Return every exon, with columns renamed via `_PRETTY_NAMES`."""
+        exons = self.db.table(_EXONS_TABLE).execute()
+        exons = exons.rename(columns=_PRETTY_NAMES)
+        exons = exons.drop("exon_name", axis=1)  # always None
+        return exons
+
+    def genes(
+        self,
+        # cols: list[str] | None = None,
+        # filter: _filters.AbstractFilterExpr = filters.EmptyFilter(),
+    ) -> DataFrame:
+        """Return the `gene` table as stored (column names are not renamed)."""
+        genes = self.db.table(_GENES_TABLE).execute()
+        return genes
+
+    def _execute_query(self, query: IbisTable) -> DataFrame:
+        """Execute `query` after removing duplicate rows."""
+        # TODO: Allow more options for returning results
+        return query.distinct().execute()
+
+    def list_columns(self, tables: str | list[str] | None = None) -> list[str]:
+        """List the column names of `tables` (all tables when None)."""
+        if tables is None:
+            tables = self.db.list_tables()  # list of table names
+        elif isinstance(tables, str):
+            tables = [tables]  # list of tables names (only one)
+        columns = [c for t in tables for c in self.db.table(t).columns]
+        return columns
+
+    def _clean_columns(self, columns: str | list[str]) -> list[str]:
+        """Validate column names and return them as a new list.
+
+        Raises ValueError if any name is unknown or if `columns` is empty.
+        """
+        if isinstance(columns, str):
+            columns = [columns]
+
+        valid_columns = set(self.list_columns())
+        cols = list(filter(lambda c: c in valid_columns, columns))
+        invalid_columns = set(columns) - valid_columns
+        if invalid_columns:
+            raise ValueError(
+                f"The following columns are not found in any database: {invalid_columns}"
+            )
+        if not cols:
+            raise ValueError("No valid columns were found.")
+        return cols
+
+    def _build_query(
+        self,
+        table: Literal["gene", "tx", "exon"],
+        cols: list[str],
+        filter: _filters.AbstractFilterExpr,
+        join_type: Literal["inner", "left"] = "inner",
+    ) -> IbisTable:
+        """Build a query for the genomic features table.
+
+        NOTE(review): `_get_required_tables`, `_tables_for_columns` and
+        `_join_query` are not defined on this class in this file, so calling
+        this method raises AttributeError; no public method calls it yet.
+        NOTE(review): `table` allows "tx" but the table is named "transcript"
+        (`_TX_TABLE`) - confirm which name is intended.
+        """
+        # Finalize cols
+        # Work on the validated copy so the caller's list is not mutated below.
+        cols = self._clean_columns(cols)
+        for col in filter.columns():
+            if col not in cols:
+                cols.append(col)
+
+        # check if join is required
+        tables = self._get_required_tables(self._tables_for_columns(cols))
+
+        # Basically just to make sure exons stay in the query
+        if table not in tables:
+            tables.append(table)
+
+        if len(tables) > 1:
+            query = self._join_query(tables, start_with=table, join_type=join_type)
+        else:
+            query = self.db.table(table)
+        # add filter
+        query = query.filter(filter.convert()).select(cols)
+        return query
diff --git a/ucscdb.ipynb b/ucscdb.ipynb new file mode 100644 index 0000000..82a171b --- /dev/null +++ b/ucscdb.ipynb @@ -0,0 +1,1060 @@ +{ + "cells": [ + { + "cell_type": "code", 
+ "execution_count": 2, + "id": "d1cbfd54-ed0a-475a-9ab4-d990ead4fa21", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/felixraimundo/Library/Application Support/hatch/env/virtual/genomic-features/KcSbK2dP/genomic-features/lib/python3.9/site-packages/urllib3/__init__.py:35: NotOpenSSLWarning: urllib3 v2 only supports OpenSSL 1.1.1+, currently the 'ssl' module is compiled with 'LibreSSL 2.8.3'. See: https://github.com/urllib3/urllib3/issues/3020\n", + " warnings.warn(\n" + ] + } + ], + "source": [ + "import genomic_features as gf" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "6c41a912-e665-44a0-8df5-a670db50973b", + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "828d4119-b66c-460e-8233-0ca84e2c8d17", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "UCSCDB(organism='Homo sapiens', ucsc_track='GENCODE V44', genome='hg38', ucsc_table='knownGene')" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ucscdb = gf.ucsc.annotation(\n", + " species=\"Hsapiens\",\n", + " assembly=\"hg38\",\n", + " ucsc_table=\"knownGene\",\n", + " bioc_version=\"3.18\",\n", + ")\n", + "ucscdb" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "ef0b732b-84af-4528-8714-c3843828f321", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'Db type': 'TxDb',\n", + " 'Supporting package': 'GenomicFeatures',\n", + " 'Data source': 'UCSC',\n", + " 'Genome': 'hg38',\n", + " 'Organism': 'Homo sapiens',\n", + " 'Taxonomy ID': '9606',\n", + " 'UCSC Table': 'knownGene',\n", + " 'UCSC Track': 'GENCODE V44',\n", + " 'Resource URL': 'http://genome.ucsc.edu/',\n", + " 'Type of Gene ID': 'Entrez Gene ID',\n", + " 'Full dataset': 'yes',\n", + " 'miRBase build ID': None,\n", + " 'Nb of transcripts': 
'276905',\n", + " 'Db created by': 'GenomicFeatures package from Bioconductor',\n", + " 'Creation time': '2023-09-20 17:25:17 +0000 (Wed, 20 Sep 2023)',\n", + " 'GenomicFeatures version at creation time': '1.53.2',\n", + " 'RSQLite version at creation time': '2.3.1',\n", + " 'DBSCHEMAVERSION': '1.2'}" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ucscdb.metadata" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "1dca4051-dc86-4654-862e-346a6e578d93", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "cds ibis.Schema {\n", + " _cds_id int32\n", + " cds_name string\n", + " cds_chrom !string\n", + " cds_strand !string\n", + " cds_start !int32\n", + " cds_end !int32\n", + "}\n", + "chrominfo ibis.Schema {\n", + " _chrom_id int32\n", + " chrom !string\n", + " length int32\n", + " is_circular int32\n", + "}\n", + "exon ibis.Schema {\n", + " _exon_id int32\n", + " exon_name string\n", + " exon_chrom !string\n", + " exon_strand !string\n", + " exon_start !int32\n", + " exon_end !int32\n", + "}\n", + "gene ibis.Schema {\n", + " gene_id !string\n", + " _tx_id !int32\n", + "}\n", + "metadata ibis.Schema {\n", + " name string\n", + " value string\n", + "}\n", + "splicing ibis.Schema {\n", + " _tx_id !int32\n", + " exon_rank !int32\n", + " _exon_id !int32\n", + " _cds_id int32\n", + " cds_phase int32\n", + "}\n", + "transcript ibis.Schema {\n", + " _tx_id int32\n", + " tx_name string\n", + " tx_type string\n", + " tx_chrom !string\n", + " tx_strand !string\n", + " tx_start !int32\n", + " tx_end !int32\n", + "}\n" + ] + } + ], + "source": [ + "for tbl_name in ucscdb.db.list_tables():\n", + " print(tbl_name, ucscdb.db.table(tbl_name).schema())" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "id": "c1d5a9cb-c884-43d0-beac-12237e0fb2da", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
_tx_idtx_nametx_typetx_chromtx_strandtx_starttx_end
01ENST00000456328.2Nonechr1+1186914409
12ENST00000450305.2Nonechr1+1201013670
23ENST00000473358.1Nonechr1+2955431097
34ENST00000469289.1Nonechr1+3026731109
45ENST00000607096.1Nonechr1+3036630503
........................
276900276901ENST00000710260.1NonechrX_MU273397v1_alt-239036260095
276901276902ENST00000710028.1NonechrX_MU273397v1_alt-272358282686
276902276903ENST00000710030.1NonechrX_MU273397v1_alt-314193316302
276903276904ENST00000710216.1NonechrX_MU273397v1_alt-314813315236
276904276905ENST00000710031.1NonechrX_MU273397v1_alt-324527324923
\n", + "

276905 rows × 7 columns

\n", + "
" + ], + "text/plain": [ + " _tx_id tx_name tx_type tx_chrom tx_strand \\\n", + "0 1 ENST00000456328.2 None chr1 + \n", + "1 2 ENST00000450305.2 None chr1 + \n", + "2 3 ENST00000473358.1 None chr1 + \n", + "3 4 ENST00000469289.1 None chr1 + \n", + "4 5 ENST00000607096.1 None chr1 + \n", + "... ... ... ... ... ... \n", + "276900 276901 ENST00000710260.1 None chrX_MU273397v1_alt - \n", + "276901 276902 ENST00000710028.1 None chrX_MU273397v1_alt - \n", + "276902 276903 ENST00000710030.1 None chrX_MU273397v1_alt - \n", + "276903 276904 ENST00000710216.1 None chrX_MU273397v1_alt - \n", + "276904 276905 ENST00000710031.1 None chrX_MU273397v1_alt - \n", + "\n", + " tx_start tx_end \n", + "0 11869 14409 \n", + "1 12010 13670 \n", + "2 29554 31097 \n", + "3 30267 31109 \n", + "4 30366 30503 \n", + "... ... ... \n", + "276900 239036 260095 \n", + "276901 272358 282686 \n", + "276902 314193 316302 \n", + "276903 314813 315236 \n", + "276904 324527 324923 \n", + "\n", + "[276905 rows x 7 columns]" + ] + }, + "execution_count": 57, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ucscdb.db.table(\"transcript\").execute()" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "2dfccc7b-df65-4072-9b18-eebbd486d69b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
tx_idtx_namechromstrandstartend
01ENST00000456328.2chr1+1186914409
12ENST00000450305.2chr1+1201013670
23ENST00000473358.1chr1+2955431097
34ENST00000469289.1chr1+3026731109
45ENST00000607096.1chr1+3036630503
.....................
276900276901ENST00000710260.1chrX_MU273397v1_alt-239036260095
276901276902ENST00000710028.1chrX_MU273397v1_alt-272358282686
276902276903ENST00000710030.1chrX_MU273397v1_alt-314193316302
276903276904ENST00000710216.1chrX_MU273397v1_alt-314813315236
276904276905ENST00000710031.1chrX_MU273397v1_alt-324527324923
\n", + "

276905 rows × 6 columns

\n", + "
" + ], + "text/plain": [ + " tx_id tx_name chrom strand start end\n", + "0 1 ENST00000456328.2 chr1 + 11869 14409\n", + "1 2 ENST00000450305.2 chr1 + 12010 13670\n", + "2 3 ENST00000473358.1 chr1 + 29554 31097\n", + "3 4 ENST00000469289.1 chr1 + 30267 31109\n", + "4 5 ENST00000607096.1 chr1 + 30366 30503\n", + "... ... ... ... ... ... ...\n", + "276900 276901 ENST00000710260.1 chrX_MU273397v1_alt - 239036 260095\n", + "276901 276902 ENST00000710028.1 chrX_MU273397v1_alt - 272358 282686\n", + "276902 276903 ENST00000710030.1 chrX_MU273397v1_alt - 314193 316302\n", + "276903 276904 ENST00000710216.1 chrX_MU273397v1_alt - 314813 315236\n", + "276904 276905 ENST00000710031.1 chrX_MU273397v1_alt - 324527 324923\n", + "\n", + "[276905 rows x 6 columns]" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tx = ucscdb.transcripts()\n", + "tx" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "d01654ff-75d6-415a-a37f-3fb1c6e2c02e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
exon_idchromstrandstartend
01chr1+1186912227
12chr1+1201012057
23chr1+1217912227
34chr1+1261312697
45chr1+1261312721
..................
734617734618chrX_MU273397v1_alt-314193314248
734618734619chrX_MU273397v1_alt-314813315236
734619734620chrX_MU273397v1_alt-315258315407
734620734621chrX_MU273397v1_alt-316254316302
734621734622chrX_MU273397v1_alt-324527324923
\n", + "

734622 rows × 5 columns

\n", + "
" + ], + "text/plain": [ + " exon_id chrom strand start end\n", + "0 1 chr1 + 11869 12227\n", + "1 2 chr1 + 12010 12057\n", + "2 3 chr1 + 12179 12227\n", + "3 4 chr1 + 12613 12697\n", + "4 5 chr1 + 12613 12721\n", + "... ... ... ... ... ...\n", + "734617 734618 chrX_MU273397v1_alt - 314193 314248\n", + "734618 734619 chrX_MU273397v1_alt - 314813 315236\n", + "734619 734620 chrX_MU273397v1_alt - 315258 315407\n", + "734620 734621 chrX_MU273397v1_alt - 316254 316302\n", + "734621 734622 chrX_MU273397v1_alt - 324527 324923\n", + "\n", + "[734622 rows x 5 columns]" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "exons = ucscdb.exons()\n", + "exons" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "id": "e9bb67ce-ca75-470f-8d59-520595d6229a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
gene_id_tx_id
06011264219
16011264220
2100130386264224
365265271292
465265271293
.........
2355205565722459
2355215565722460
2355225565722461
2355235565722462
2355245565722463
\n", + "

235525 rows × 2 columns

\n", + "
" + ], + "text/plain": [ + " gene_id _tx_id\n", + "0 6011 264219\n", + "1 6011 264220\n", + "2 100130386 264224\n", + "3 65265 271292\n", + "4 65265 271293\n", + "... ... ...\n", + "235520 55657 22459\n", + "235521 55657 22460\n", + "235522 55657 22461\n", + "235523 55657 22462\n", + "235524 55657 22463\n", + "\n", + "[235525 rows x 2 columns]" + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = ucscdb.genes()\n", + "df" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "id": "d1362962-a610-45f3-95c2-bec594b6f871", + "metadata": {}, + "outputs": [], + "source": [ + "s = ucscdb.db.table(\"splicing\").execute()" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "id": "0652e3d5-8fef-4777-b989-6b4d431d8134", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
_tx_idexon_rank_exon_id_cds_idcds_phase
0111NaNNone
1125NaNNone
2138NaNNone
311639131607NaNNone
43110NaNNone
..................
17889552748472729350NaNNone
17889562748473729351NaNNone
17889572748474729352NaNNone
17889582748475729353NaNNone
17889592748621729407NaNNone
\n", + "

1788960 rows × 5 columns

\n", + "
" + ], + "text/plain": [ + " _tx_id exon_rank _exon_id _cds_id cds_phase\n", + "0 1 1 1 NaN None\n", + "1 1 2 5 NaN None\n", + "2 1 3 8 NaN None\n", + "3 11639 1 31607 NaN None\n", + "4 3 1 10 NaN None\n", + "... ... ... ... ... ...\n", + "1788955 274847 2 729350 NaN None\n", + "1788956 274847 3 729351 NaN None\n", + "1788957 274847 4 729352 NaN None\n", + "1788958 274847 5 729353 NaN None\n", + "1788959 274862 1 729407 NaN None\n", + "\n", + "[1788960 rows x 5 columns]" + ] + }, + "execution_count": 53, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "s" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "id": "d4225b28-7891-464f-ab88-3853eab0746c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "tx_id 1\n", + "tx_name ENST00000456328.2\n", + "chrom chr1\n", + "strand +\n", + "start 11869\n", + "end 14409\n", + "Name: 0, dtype: object" + ] + }, + "execution_count": 55, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tx.loc[0]" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "id": "428fa2df-0960-48b5-9466-fb2f5bfd0b4d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "966235" + ] + }, + "execution_count": 52, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sum(s[\"_cds_id\"].isnull() == False)" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "id": "ab1837e6-9994-4ddc-adb3-097e475af1f9", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 True\n", + "1 True\n", + "2 True\n", + "3 True\n", + "4 True\n", + " ... 
\n", + "1788955 True\n", + "1788956 True\n", + "1788957 True\n", + "1788958 True\n", + "1788959 True\n", + "Name: _cds_id, Length: 1788960, dtype: bool" + ] + }, + "execution_count": 50, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "s[\"_cds_id\"].isnull()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}