diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 6afe6c55eb..5e12caa83c 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -10,7 +10,7 @@ repos: - id: check-toml - id: debug-statements - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.5.6 + rev: v0.6.1 hooks: - id: ruff-format - id: ruff diff --git a/doc/kmers-and-minhash.ipynb b/doc/kmers-and-minhash.ipynb index f4c5150251..fc779f436d 100644 --- a/doc/kmers-and-minhash.ipynb +++ b/doc/kmers-and-minhash.ipynb @@ -49,10 +49,10 @@ "def jaccard_similarity(a, b):\n", " a = set(a)\n", " b = set(b)\n", - " \n", + "\n", " intersection = len(a.intersection(b))\n", " union = len(a.union(b))\n", - " \n", + "\n", " return intersection / union" ] }, @@ -65,9 +65,9 @@ "def jaccard_containment(a, b):\n", " a = set(a)\n", " b = set(b)\n", - " \n", + "\n", " intersection = len(a.intersection(b))\n", - " \n", + "\n", " return intersection / len(a)" ] }, @@ -84,9 +84,9 @@ "metadata": {}, "outputs": [], "source": [ - "a = ['ATGG', 'AACC']\n", - "b = ['ATGG', 'CACA']\n", - "c = ['ATGC', 'CACA']" + "a = [\"ATGG\", \"AACC\"]\n", + "b = [\"ATGG\", \"CACA\"]\n", + "c = [\"ATGC\", \"CACA\"]" ] }, { @@ -270,11 +270,11 @@ "def build_kmers(sequence, ksize):\n", " kmers = []\n", " n_kmers = len(sequence) - ksize + 1\n", - " \n", + "\n", " for i in range(n_kmers):\n", - " kmer = sequence[i:i + ksize]\n", + " kmer = sequence[i : i + ksize]\n", " kmers.append(kmer)\n", - " \n", + "\n", " return kmers" ] }, @@ -307,7 +307,7 @@ } ], "source": [ - "build_kmers('ATGGACCAGATATAGGGAGAGCCAGGTAGGACA', 21)" + "build_kmers(\"ATGGACCAGATATAGGGAGAGCCAGGTAGGACA\", 21)" ] }, { @@ -325,8 +325,8 @@ "metadata": {}, "outputs": [], "source": [ - "seq1 = 'ATGGACCAGATATAGGGAGAGCCAGGTAGGACA'\n", - "seq2 = 'ATGGACCAGATATTGGGAGAGCCGGGTAGGACA'\n", + "seq1 = \"ATGGACCAGATATAGGGAGAGCCAGGTAGGACA\"\n", + "seq2 = \"ATGGACCAGATATTGGGAGAGCCGGGTAGGACA\"\n", "# differences: ^ ^" ] }, @@ -375,13 +375,14 @@ "metadata": {}, "outputs": [], "source": [ - "import screed # a library for reading in FASTA/FASTQ\n", + "import screed # a library for reading in FASTA/FASTQ\n", + "\n", "\n", "def read_kmers_from_file(filename, ksize):\n", " all_kmers = []\n", " for record in screed.open(filename):\n", " sequence = record.sequence\n", - " \n", + "\n", " kmers = build_kmers(sequence, ksize)\n", " all_kmers += kmers\n", "\n", @@ -394,7 +395,7 @@ "metadata": {}, "outputs": [], "source": [ - "akker_kmers = read_kmers_from_file('genomes/akkermansia.fa', 31)" + "akker_kmers = read_kmers_from_file(\"genomes/akkermansia.fa\", 31)" ] }, { @@ -444,8 +445,8 @@ "metadata": {}, "outputs": [], "source": [ - "shew1_kmers = read_kmers_from_file('genomes/shew_os185.fa', 31)\n", - "shew2_kmers = read_kmers_from_file('genomes/shew_os223.fa', 31)" + "shew1_kmers = read_kmers_from_file(\"genomes/shew_os185.fa\", 31)\n", + "shew2_kmers = read_kmers_from_file(\"genomes/shew_os223.fa\", 31)" ] }, { @@ -471,9 +472,9 @@ } ], "source": [ - "print('akker vs shew1', jaccard_similarity(akker_kmers, shew1_kmers))\n", - "print('akker vs shew2', jaccard_similarity(akker_kmers, shew2_kmers))\n", - "print('shew1 vs shew2', jaccard_similarity(shew1_kmers, shew2_kmers))" + "print(\"akker vs shew1\", jaccard_similarity(akker_kmers, shew1_kmers))\n", + "print(\"akker vs shew2\", jaccard_similarity(akker_kmers, shew2_kmers))\n", + "print(\"shew1 vs shew2\", jaccard_similarity(shew1_kmers, shew2_kmers))" ] }, { @@ -492,9 +493,9 @@ } ], "source": [ - "print('akker vs shew1', jaccard_containment(akker_kmers, shew1_kmers))\n", - "print('akker vs shew2', jaccard_containment(akker_kmers, shew2_kmers))\n", - "print('shew1 vs shew2', jaccard_containment(shew1_kmers, shew2_kmers))" + "print(\"akker vs shew1\", jaccard_containment(akker_kmers, shew1_kmers))\n", + "print(\"akker vs shew2\", jaccard_containment(akker_kmers, shew2_kmers))\n", + "print(\"shew1 vs shew2\", jaccard_containment(shew1_kmers, shew2_kmers))" ] }, { @@ -568,20 +569,22 @@ "source": [ "import mmh3\n", "\n", + "\n", "def hash_kmer(kmer):\n", " # calculate the reverse complement\n", " rc_kmer = screed.rc(kmer)\n", - " \n", + "\n", " # determine whether original k-mer or reverse complement is lesser\n", " if kmer < rc_kmer:\n", " canonical_kmer = kmer\n", " else:\n", " canonical_kmer = rc_kmer\n", - " \n", + "\n", " # calculate murmurhash using a hash seed of 42\n", " hash = mmh3.hash64(canonical_kmer, 42)[0]\n", - " if hash < 0: hash += 2**64\n", - " \n", + " if hash < 0:\n", + " hash += 2**64\n", + "\n", " # done\n", " return hash" ] @@ -610,7 +613,7 @@ } ], "source": [ - "hash_kmer('ATGGC')" + "hash_kmer(\"ATGGC\")" ] }, { @@ -637,7 +640,7 @@ } ], "source": [ - "hash_kmer('ATGGC')" + "hash_kmer(\"ATGGC\")" ] }, { @@ -664,7 +667,7 @@ } ], "source": [ - "hash_kmer('GCCAT')" + "hash_kmer(\"GCCAT\")" ] }, { @@ -691,7 +694,7 @@ } ], "source": [ - "hash_kmer('GCCAA')" + "hash_kmer(\"GCCAA\")" ] }, { @@ -836,7 +839,7 @@ " if hash_kmer(kmer) < keep_below:\n", " keep.append(kmer)\n", " # otherwise, discard\n", - " \n", + "\n", " return keep" ] }, @@ -901,8 +904,8 @@ } ], "source": [ - "print('akker vs akker, total', jaccard_similarity(akker_kmers, akker_kmers))\n", - "print('akker vs akker, sub', jaccard_similarity(akker_sub, akker_sub))" + "print(\"akker vs akker, total\", jaccard_similarity(akker_kmers, akker_kmers))\n", + "print(\"akker vs akker, sub\", jaccard_similarity(akker_sub, akker_sub))" ] }, { @@ -920,8 +923,8 @@ } ], "source": [ - "print('akker vs shew1, total', jaccard_similarity(akker_kmers, shew1_kmers))\n", - "print('akker vs shew1, sub', jaccard_similarity(akker_sub, shew1_sub))" + "print(\"akker vs shew1, total\", jaccard_similarity(akker_kmers, shew1_kmers))\n", + "print(\"akker vs shew1, sub\", jaccard_similarity(akker_sub, shew1_sub))" ] }, { @@ -939,8 +942,8 @@ } ], "source": [ - "print('shew1 vs shew2, total', jaccard_similarity(shew1_kmers, shew2_kmers))\n", - "print('shew1 vs shew2, sub', jaccard_similarity(shew1_sub, shew2_sub))" + "print(\"shew1 vs shew2, total\", jaccard_similarity(shew1_kmers, shew2_kmers))\n", + "print(\"shew1 vs shew2, sub\", jaccard_similarity(shew1_sub, shew2_sub))" ] }, { diff --git a/doc/plotting-compare.ipynb b/doc/plotting-compare.ipynb index 6659e170cd..0b5df4b378 100644 --- a/doc/plotting-compare.ipynb +++ b/doc/plotting-compare.ipynb @@ -107,7 +107,7 @@ "metadata": {}, "outputs": [], "source": [ - "matrix, labels = fig.load_matrix_and_labels('compare-demo')" + "matrix, labels = fig.load_matrix_and_labels(\"compare-demo\")" ] }, { @@ -139,8 +139,8 @@ } ], "source": [ - "print('matrix:\\n', matrix)\n", - "print('labels:', labels)" + "print(\"matrix:\\n\", matrix)\n", + "print(\"labels:\", labels)" ] }, { @@ -192,8 +192,8 @@ } ], "source": [ - "print('reordered matrix:\\n', reordered_matrix)\n", - "print('reordered labels:', reordered_labels)" + "print(\"reordered matrix:\\n\", reordered_matrix)\n", + "print(\"reordered labels:\", reordered_labels)" ] }, { @@ -218,8 +218,10 @@ "source": [ "import scipy.cluster.hierarchy as sch\n", "\n", - "def plot_composite_matrix(D, labeltext, show_labels=True,\n", - " vmax=1.0, vmin=0.0, force=False):\n", + "\n", + "def plot_composite_matrix(\n", + " D, labeltext, show_labels=True, vmax=1.0, vmin=0.0, force=False\n", + "):\n", " \"\"\"Build a composite plot showing dendrogram + distance matrix/heatmap.\n", "\n", " Returns a matplotlib figure.\n", @@ -228,25 +230,34 @@ " shown on the plot.\n", " \"\"\"\n", " if D.max() > 1.0 or D.min() < 0.0:\n", - " error('This matrix doesn\\'t look like a distance matrix - min value {}, max value {}', D.min(), D.max())\n", + " error(\n", + " \"This matrix doesn't look like a distance matrix - min value {}, max value {}\",\n", + " D.min(),\n", + " D.max(),\n", + " )\n", " if not force:\n", " raise ValueError(\"not a distance matrix\")\n", " else:\n", - " notify('force is set; scaling to [0, 1]')\n", + " notify(\"force is set; scaling to [0, 1]\")\n", " D -= D.min()\n", " D /= D.max()\n", "\n", " if show_labels:\n", - " show_indices = True\n", + " pass\n", "\n", " fig = pylab.figure(figsize=(11, 8))\n", " ax1 = fig.add_axes([0.09, 0.1, 0.2, 0.6])\n", "\n", " # plot dendrogram\n", - " Y = sch.linkage(D, method='single') # centroid\n", + " Y = sch.linkage(D, method=\"single\") # centroid\n", "\n", - " Z1 = sch.dendrogram(Y, orientation='left', labels=labeltext,\n", - " no_labels=not show_labels, get_leaves=True)\n", + " Z1 = sch.dendrogram(\n", + " Y,\n", + " orientation=\"left\",\n", + " labels=labeltext,\n", + " no_labels=not show_labels,\n", + " get_leaves=True,\n", + " )\n", " ax1.set_xticks([])\n", "\n", " xstart = 0.45\n", @@ -256,8 +267,8 @@ " scale_xstart = xstart + width + 0.01\n", "\n", " # re-order labels along rows, top to bottom\n", - " idx1 = Z1['leaves']\n", - " reordered_labels = [ labeltext[i] for i in idx1 ]\n", + " idx1 = Z1[\"leaves\"]\n", + " reordered_labels = [labeltext[i] for i in idx1]\n", "\n", " # reorder D by the clustering in the dendrogram\n", " D = D[idx1, :]\n", @@ -266,8 +277,9 @@ " # show matrix\n", " axmatrix = fig.add_axes([xstart, 0.1, width, 0.6])\n", "\n", - " im = axmatrix.matshow(D, aspect='auto', origin='lower',\n", - " cmap=pylab.cm.YlGnBu, vmin=vmin, vmax=vmax)\n", + " im = axmatrix.matshow(\n", + " D, aspect=\"auto\", origin=\"lower\", cmap=pylab.cm.YlGnBu, vmin=vmin, vmax=vmax\n", + " )\n", " axmatrix.set_xticks([])\n", " axmatrix.set_yticks([])\n", "\n", diff --git a/doc/sourmash-collections.ipynb b/doc/sourmash-collections.ipynb index 5e5cf1dfeb..f604072681 100644 --- a/doc/sourmash-collections.ipynb +++ b/doc/sourmash-collections.ipynb @@ -405,7 +405,8 @@ ], "source": [ "from IPython.display import Image\n", - "Image(filename='compare_all.mat.matrix.png') " + "\n", + "Image(filename=\"compare_all.mat.matrix.png\")" ] }, { @@ -857,7 +858,8 @@ ], "source": [ "import pandas\n", - "df = pandas.read_csv('podar-lineage.csv')\n", + "\n", + "df = pandas.read_csv(\"podar-lineage.csv\")\n", "df" ] }, diff --git a/doc/using-LCA-database-API.ipynb b/doc/using-LCA-database-API.ipynb index 95c99e0f22..999b0c7a34 100644 --- a/doc/using-LCA-database-API.ipynb +++ b/doc/using-LCA-database-API.ipynb @@ -53,6 +53,7 @@ "outputs": [], "source": [ "import sourmash\n", + "\n", "db = sourmash.lca.LCA_Database(ksize=31, scaled=1000)" ] }, @@ -101,9 +102,9 @@ "metadata": {}, "outputs": [], "source": [ - "sig1 = sourmash.load_one_signature('akkermansia.fa.sig', ksize=31)\n", - "sig2 = sourmash.load_one_signature('shew_os185.fa.sig', ksize=31)\n", - "sig3 = sourmash.load_one_signature('shew_os223.fa.sig', ksize=31)" + "sig1 = sourmash.load_one_signature(\"akkermansia.fa.sig\", ksize=31)\n", + "sig2 = sourmash.load_one_signature(\"shew_os185.fa.sig\", ksize=31)\n", + "sig3 = sourmash.load_one_signature(\"shew_os223.fa.sig\", ksize=31)" ] }, { @@ -123,9 +124,9 @@ } ], "source": [ - "db.insert(sig1, ident='akkermansia')\n", - "db.insert(sig2, ident='shew_os185')\n", - "db.insert(sig3, ident='shew_os223')" + "db.insert(sig1, ident=\"akkermansia\")\n", + "db.insert(sig2, ident=\"shew_os185\")\n", + "db.insert(sig3, ident=\"shew_os223\")" ] }, { @@ -150,6 +151,7 @@ ], "source": [ "from pprint import pprint\n", + "\n", "pprint(db.search(sig1, threshold=0.1))" ] }, @@ -296,7 +298,7 @@ } ], "source": [ - "print('{} hash values total in this database'.format(len(db._hashval_to_idx)))" + "print(f\"{len(db._hashval_to_idx)} hash values total in this database\")" ] }, { @@ -316,7 +318,7 @@ "all_idx = set()\n", "for idx_set in db._hashval_to_idx.values():\n", " all_idx.update(idx_set)\n", - "print('belonging to signatures with idx {}'.format(all_idx))" + "print(f\"belonging to signatures with idx {all_idx}\")" ] }, { @@ -345,7 +347,7 @@ ], "source": [ "for hashval in first_three_hashvals:\n", - " print('hashval {} belongs to idxs {}'.format(hashval, db._hashval_to_idx[hashval]))" + " print(f\"hashval {hashval} belongs to idxs {db._hashval_to_idx[hashval]}\")" ] }, { @@ -369,14 +371,14 @@ "for hashval, idx_set in db._hashval_to_idx.items():\n", " if query_idx in idx_set:\n", " hashval_set.add(hashval)\n", - " \n", - "print('{} hashvals belong to query idx {}'.format(len(hashval_set), query_idx))\n", + "\n", + "print(f\"{len(hashval_set)} hashvals belong to query idx {query_idx}\")\n", "\n", "ident = db._idx_to_ident[query_idx]\n", - "print('query idx {} matches to ident {}'.format(query_idx, ident))\n", + "print(f\"query idx {query_idx} matches to ident {ident}\")\n", "\n", "name = db._ident_to_name[ident]\n", - "print('query idx {} matches to name {}'.format(query_idx, name))" + "print(f\"query idx {query_idx} matches to name {name}\")" ] }, { @@ -401,9 +403,9 @@ "metadata": {}, "outputs": [], "source": [ - "superkingdom = LineagePair('superkingdom', 'Bacteria')\n", - "phylum = LineagePair('phylum', 'Verrucomicrobia')\n", - "klass = LineagePair('class', 'Verrucomicrobiae')\n", + "superkingdom = LineagePair(\"superkingdom\", \"Bacteria\")\n", + "phylum = LineagePair(\"phylum\", \"Verrucomicrobia\")\n", + "klass = LineagePair(\"class\", \"Verrucomicrobiae\")\n", "\n", "lineage = (superkingdom, phylum, klass)" ] @@ -448,14 +450,14 @@ "# by default, the identifier is the signature name --\n", "ident = sig1.name\n", "idx = db._ident_to_idx[ident]\n", - "print(\"ident '{}' has idx {}\".format(ident, idx))\n", + "print(f\"ident '{ident}' has idx {idx}\")\n", "\n", "lid = db._idx_to_lid[idx]\n", - "print(\"lid for idx {} is {}\".format(idx, lid))\n", + "print(f\"lid for idx {idx} is {lid}\")\n", "\n", "lineage = db._lid_to_lineage[lid]\n", "display = sourmash.lca.display_lineage(lineage)\n", - "print(\"lineage for lid {} is {}\".format(lid, display))" + "print(f\"lineage for lid {lid} is {display}\")" ] }, { @@ -519,11 +521,18 @@ } ], "source": [ - "linstr1 = [\"Bacteria\", \"Verrucomicrobia\", \"Verrucomicrobiae\",\n", - " \"Verrucomicrobiales\", \"Akkermansiaceae\", \"Akkermansia\",\n", - " \"Akkermansia muciniphila\", \"Akkermansia muciniphila ATCC BAA-835\"]\n", + "linstr1 = [\n", + " \"Bacteria\",\n", + " \"Verrucomicrobia\",\n", + " \"Verrucomicrobiae\",\n", + " \"Verrucomicrobiales\",\n", + " \"Akkermansiaceae\",\n", + " \"Akkermansia\",\n", + " \"Akkermansia muciniphila\",\n", + " \"Akkermansia muciniphila ATCC BAA-835\",\n", + "]\n", "\n", - "lineage1 = [ LineagePair(*pair) for pair in zip(sourmash.lca.taxlist(), linstr1) ]\n", + "lineage1 = [LineagePair(*pair) for pair in zip(sourmash.lca.taxlist(), linstr1)]\n", "pprint(lineage1)" ] }, @@ -579,18 +588,32 @@ } ], "source": [ - "linstr2 = [\"Bacteria\", \"Proteobacteria\", \"Gammaproteobacteria\",\n", - " \"Alteromonadales\", \"Shewanellaceae\", \"Shewanella\",\n", - " \"Shewanella baltica\", \"Shewanella baltica OS185\"]\n", - "lineage2 = [ LineagePair(*pair) for pair in zip(sourmash.lca.taxlist(), linstr2) ]\n", + "linstr2 = [\n", + " \"Bacteria\",\n", + " \"Proteobacteria\",\n", + " \"Gammaproteobacteria\",\n", + " \"Alteromonadales\",\n", + " \"Shewanellaceae\",\n", + " \"Shewanella\",\n", + " \"Shewanella baltica\",\n", + " \"Shewanella baltica OS185\",\n", + "]\n", + "lineage2 = [LineagePair(*pair) for pair in zip(sourmash.lca.taxlist(), linstr2)]\n", "\n", - "linstr3 = [\"Bacteria\", \"Proteobacteria\", \"Gammaproteobacteria\",\n", - " \"Alteromonadales\", \"Shewanellaceae\", \"Shewanella\",\n", - " \"Shewanella baltica\", \"Shewanella baltica OS223\"]\n", - "lineage3 = [ LineagePair(*pair) for pair in zip(sourmash.lca.taxlist(), linstr3) ]\n", + "linstr3 = [\n", + " \"Bacteria\",\n", + " \"Proteobacteria\",\n", + " \"Gammaproteobacteria\",\n", + " \"Alteromonadales\",\n", + " \"Shewanellaceae\",\n", + " \"Shewanella\",\n", + " \"Shewanella baltica\",\n", + " \"Shewanella baltica OS223\",\n", + "]\n", + "lineage3 = [LineagePair(*pair) for pair in zip(sourmash.lca.taxlist(), linstr3)]\n", "\n", - "print('lineage2 is', sourmash.lca.display_lineage(lineage2))\n", - "print('lineage3 is', sourmash.lca.display_lineage(lineage3))" + "print(\"lineage2 is\", sourmash.lca.display_lineage(lineage2))\n", + "print(\"lineage3 is\", sourmash.lca.display_lineage(lineage3))" ] }, { @@ -724,7 +747,7 @@ ], "source": [ "assignments = sourmash.lca.gather_assignments(sig2.minhash.get_mins(), [db])\n", - "print('num hashvals:', len(assignments))" + "print(\"num hashvals:\", len(assignments))" ] }, { @@ -797,7 +820,7 @@ "\n", "# count_lca_for_assignments returns a collections.Counter object\n", "for lineage, count in counter.most_common():\n", - " print('{} hashes have LCA: {}'.format(count, sourmash.lca.display_lineage(lineage)))" + " print(f\"{count} hashes have LCA: {sourmash.lca.display_lineage(lineage)}\")" ] }, {