From 037fbce120ce7f6bf267ad58db18945aee86845d Mon Sep 17 00:00:00 2001 From: Dave Bunten Date: Wed, 26 Jun 2024 15:49:51 -0600 Subject: [PATCH] Add CLI for `coSMicQC` (#34) * Create qcdataframe.py * linting * add qcdataframe * linting * adding tests * linting * update name, tests * add back compat for self type * back compat for isinstance * linting * add cli for cosmicqc * linting * add tests and wrappers * linting and test adjustment * attempting wrapper * patch python-fire; fix tests * add docstring to top of test * add csv.gz compatibility * add export capabilities * rename file to correct module name * add export capabilities * add output capabilities * Apply suggestions from code review Co-authored-by: Gregory Way * update tests and docs * fix tests * update tests; add constructor path for scdataframe * linting * modify tests * enable pd.series compatibility * update for exports via cli * fix docstring * add return types for test util * fix deps * add to docs on exports * add docs for context * note about ignore rule * remove todo * minor comment about display * retain code comment * correct code comment --------- --- example.csv | 11 +++ poetry.lock | 113 ++++++++++++++++++---------- pyproject.toml | 4 + src/cosmicqc/analyze.py | 45 +++++++++-- src/cosmicqc/cli.py | 112 +++++++++++++++++++++++++++ src/cosmicqc/scdataframe.py | 7 +- tests/__init__.py | 0 tests/test_cli.py | 146 ++++++++++++++++++++++++++++++++++++ tests/utils.py | 23 ++++++ 9 files changed, 416 insertions(+), 45 deletions(-) create mode 100644 example.csv create mode 100644 src/cosmicqc/cli.py create mode 100644 tests/__init__.py create mode 100644 tests/test_cli.py create mode 100644 tests/utils.py diff --git a/example.csv b/example.csv new file mode 100644 index 0000000..08055b3 --- /dev/null +++ b/example.csv @@ -0,0 +1,11 @@ +,example_feature +0,1 +1,2 +2,3 +3,4 +4,5 +5,6 +6,7 +7,8 +8,9 +9,10 diff --git a/poetry.lock b/poetry.lock index a932b27..d12396d 100644 --- a/poetry.lock +++ 
b/poetry.lock @@ -25,6 +25,20 @@ files = [ [package.extras] test = ["pytest (>=6)"] +[[package]] +name = "fire" +version = "0.6.0" +description = "A library for automatically generating command line interfaces." +optional = false +python-versions = "*" +files = [ + {file = "fire-0.6.0.tar.gz", hash = "sha256:54ec5b996ecdd3c0309c800324a0703d6da512241bc73b553db959d98de0aa66"}, +] + +[package.dependencies] +six = "*" +termcolor = "*" + [[package]] name = "iniconfig" version = "2.0.0" @@ -75,47 +89,56 @@ files = [ [[package]] name = "numpy" -version = "1.26.3" +version = "2.0.0" description = "Fundamental package for array computing in Python" optional = false python-versions = ">=3.9" files = [ - {file = "numpy-1.26.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:806dd64230dbbfaca8a27faa64e2f414bf1c6622ab78cc4264f7f5f028fee3bf"}, - {file = "numpy-1.26.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:02f98011ba4ab17f46f80f7f8f1c291ee7d855fcef0a5a98db80767a468c85cd"}, - {file = "numpy-1.26.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6d45b3ec2faed4baca41c76617fcdcfa4f684ff7a151ce6fc78ad3b6e85af0a6"}, - {file = "numpy-1.26.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bdd2b45bf079d9ad90377048e2747a0c82351989a2165821f0c96831b4a2a54b"}, - {file = "numpy-1.26.3-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:211ddd1e94817ed2d175b60b6374120244a4dd2287f4ece45d49228b4d529178"}, - {file = "numpy-1.26.3-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:b1240f767f69d7c4c8a29adde2310b871153df9b26b5cb2b54a561ac85146485"}, - {file = "numpy-1.26.3-cp310-cp310-win32.whl", hash = "sha256:21a9484e75ad018974a2fdaa216524d64ed4212e418e0a551a2d83403b0531d3"}, - {file = "numpy-1.26.3-cp310-cp310-win_amd64.whl", hash = "sha256:9e1591f6ae98bcfac2a4bbf9221c0b92ab49762228f38287f6eeb5f3f55905ce"}, - {file = "numpy-1.26.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = 
"sha256:b831295e5472954104ecb46cd98c08b98b49c69fdb7040483aff799a755a7374"}, - {file = "numpy-1.26.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:9e87562b91f68dd8b1c39149d0323b42e0082db7ddb8e934ab4c292094d575d6"}, - {file = "numpy-1.26.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8c66d6fec467e8c0f975818c1796d25c53521124b7cfb760114be0abad53a0a2"}, - {file = "numpy-1.26.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f25e2811a9c932e43943a2615e65fc487a0b6b49218899e62e426e7f0a57eeda"}, - {file = "numpy-1.26.3-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:af36e0aa45e25c9f57bf684b1175e59ea05d9a7d3e8e87b7ae1a1da246f2767e"}, - {file = "numpy-1.26.3-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:51c7f1b344f302067b02e0f5b5d2daa9ed4a721cf49f070280ac202738ea7f00"}, - {file = "numpy-1.26.3-cp311-cp311-win32.whl", hash = "sha256:7ca4f24341df071877849eb2034948459ce3a07915c2734f1abb4018d9c49d7b"}, - {file = "numpy-1.26.3-cp311-cp311-win_amd64.whl", hash = "sha256:39763aee6dfdd4878032361b30b2b12593fb445ddb66bbac802e2113eb8a6ac4"}, - {file = "numpy-1.26.3-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:a7081fd19a6d573e1a05e600c82a1c421011db7935ed0d5c483e9dd96b99cf13"}, - {file = "numpy-1.26.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:12c70ac274b32bc00c7f61b515126c9205323703abb99cd41836e8125ea0043e"}, - {file = "numpy-1.26.3-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7f784e13e598e9594750b2ef6729bcd5a47f6cfe4a12cca13def35e06d8163e3"}, - {file = "numpy-1.26.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5f24750ef94d56ce6e33e4019a8a4d68cfdb1ef661a52cdaee628a56d2437419"}, - {file = "numpy-1.26.3-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:77810ef29e0fb1d289d225cabb9ee6cf4d11978a00bb99f7f8ec2132a84e0166"}, - {file = "numpy-1.26.3-cp312-cp312-musllinux_1_1_x86_64.whl", hash = 
"sha256:8ed07a90f5450d99dad60d3799f9c03c6566709bd53b497eb9ccad9a55867f36"}, - {file = "numpy-1.26.3-cp312-cp312-win32.whl", hash = "sha256:f73497e8c38295aaa4741bdfa4fda1a5aedda5473074369eca10626835445511"}, - {file = "numpy-1.26.3-cp312-cp312-win_amd64.whl", hash = "sha256:da4b0c6c699a0ad73c810736303f7fbae483bcb012e38d7eb06a5e3b432c981b"}, - {file = "numpy-1.26.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:1666f634cb3c80ccbd77ec97bc17337718f56d6658acf5d3b906ca03e90ce87f"}, - {file = "numpy-1.26.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:18c3319a7d39b2c6a9e3bb75aab2304ab79a811ac0168a671a62e6346c29b03f"}, - {file = "numpy-1.26.3-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0b7e807d6888da0db6e7e75838444d62495e2b588b99e90dd80c3459594e857b"}, - {file = "numpy-1.26.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b4d362e17bcb0011738c2d83e0a65ea8ce627057b2fdda37678f4374a382a137"}, - {file = "numpy-1.26.3-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:b8c275f0ae90069496068c714387b4a0eba5d531aace269559ff2b43655edd58"}, - {file = "numpy-1.26.3-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:cc0743f0302b94f397a4a65a660d4cd24267439eb16493fb3caad2e4389bccbb"}, - {file = "numpy-1.26.3-cp39-cp39-win32.whl", hash = "sha256:9bc6d1a7f8cedd519c4b7b1156d98e051b726bf160715b769106661d567b3f03"}, - {file = "numpy-1.26.3-cp39-cp39-win_amd64.whl", hash = "sha256:867e3644e208c8922a3be26fc6bbf112a035f50f0a86497f98f228c50c607bb2"}, - {file = "numpy-1.26.3-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:3c67423b3703f8fbd90f5adaa37f85b5794d3366948efe9a5190a5f3a83fc34e"}, - {file = "numpy-1.26.3-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:46f47ee566d98849323f01b349d58f2557f02167ee301e5e28809a8c0e27a2d0"}, - {file = "numpy-1.26.3-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:a8474703bffc65ca15853d5fd4d06b18138ae90c17c8d12169968e998e448bb5"}, - {file = 
"numpy-1.26.3.tar.gz", hash = "sha256:697df43e2b6310ecc9d95f05d5ef20eacc09c7c4ecc9da3f235d39e71b7da1e4"}, + {file = "numpy-2.0.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:04494f6ec467ccb5369d1808570ae55f6ed9b5809d7f035059000a37b8d7e86f"}, + {file = "numpy-2.0.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:2635dbd200c2d6faf2ef9a0d04f0ecc6b13b3cad54f7c67c61155138835515d2"}, + {file = "numpy-2.0.0-cp310-cp310-macosx_14_0_arm64.whl", hash = "sha256:0a43f0974d501842866cc83471bdb0116ba0dffdbaac33ec05e6afed5b615238"}, + {file = "numpy-2.0.0-cp310-cp310-macosx_14_0_x86_64.whl", hash = "sha256:8d83bb187fb647643bd56e1ae43f273c7f4dbcdf94550d7938cfc32566756514"}, + {file = "numpy-2.0.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:79e843d186c8fb1b102bef3e2bc35ef81160ffef3194646a7fdd6a73c6b97196"}, + {file = "numpy-2.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6d7696c615765091cc5093f76fd1fa069870304beaccfd58b5dcc69e55ef49c1"}, + {file = "numpy-2.0.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:b4c76e3d4c56f145d41b7b6751255feefae92edbc9a61e1758a98204200f30fc"}, + {file = "numpy-2.0.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:acd3a644e4807e73b4e1867b769fbf1ce8c5d80e7caaef0d90dcdc640dfc9787"}, + {file = "numpy-2.0.0-cp310-cp310-win32.whl", hash = "sha256:cee6cc0584f71adefe2c908856ccc98702baf95ff80092e4ca46061538a2ba98"}, + {file = "numpy-2.0.0-cp310-cp310-win_amd64.whl", hash = "sha256:ed08d2703b5972ec736451b818c2eb9da80d66c3e84aed1deeb0c345fefe461b"}, + {file = "numpy-2.0.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:ad0c86f3455fbd0de6c31a3056eb822fc939f81b1618f10ff3406971893b62a5"}, + {file = "numpy-2.0.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:e7f387600d424f91576af20518334df3d97bc76a300a755f9a8d6e4f5cadd289"}, + {file = "numpy-2.0.0-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:34f003cb88b1ba38cb9a9a4a3161c1604973d7f9d5552c38bc2f04f829536609"}, + 
{file = "numpy-2.0.0-cp311-cp311-macosx_14_0_x86_64.whl", hash = "sha256:b6f6a8f45d0313db07d6d1d37bd0b112f887e1369758a5419c0370ba915b3871"}, + {file = "numpy-2.0.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5f64641b42b2429f56ee08b4f427a4d2daf916ec59686061de751a55aafa22e4"}, + {file = "numpy-2.0.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a7039a136017eaa92c1848152827e1424701532ca8e8967fe480fe1569dae581"}, + {file = "numpy-2.0.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:46e161722e0f619749d1cd892167039015b2c2817296104487cd03ed4a955995"}, + {file = "numpy-2.0.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:0e50842b2295ba8414c8c1d9d957083d5dfe9e16828b37de883f51fc53c4016f"}, + {file = "numpy-2.0.0-cp311-cp311-win32.whl", hash = "sha256:2ce46fd0b8a0c947ae047d222f7136fc4d55538741373107574271bc00e20e8f"}, + {file = "numpy-2.0.0-cp311-cp311-win_amd64.whl", hash = "sha256:fbd6acc766814ea6443628f4e6751d0da6593dae29c08c0b2606164db026970c"}, + {file = "numpy-2.0.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:354f373279768fa5a584bac997de6a6c9bc535c482592d7a813bb0c09be6c76f"}, + {file = "numpy-2.0.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:4d2f62e55a4cd9c58c1d9a1c9edaedcd857a73cb6fda875bf79093f9d9086f85"}, + {file = "numpy-2.0.0-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:1e72728e7501a450288fc8e1f9ebc73d90cfd4671ebbd631f3e7857c39bd16f2"}, + {file = "numpy-2.0.0-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:84554fc53daa8f6abf8e8a66e076aff6ece62de68523d9f665f32d2fc50fd66e"}, + {file = "numpy-2.0.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c73aafd1afca80afecb22718f8700b40ac7cab927b8abab3c3e337d70e10e5a2"}, + {file = "numpy-2.0.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:49d9f7d256fbc804391a7f72d4a617302b1afac1112fac19b6c6cec63fe7fe8a"}, + {file = "numpy-2.0.0-cp312-cp312-musllinux_1_1_x86_64.whl", 
hash = "sha256:0ec84b9ba0654f3b962802edc91424331f423dcf5d5f926676e0150789cb3d95"}, + {file = "numpy-2.0.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:feff59f27338135776f6d4e2ec7aeeac5d5f7a08a83e80869121ef8164b74af9"}, + {file = "numpy-2.0.0-cp312-cp312-win32.whl", hash = "sha256:c5a59996dc61835133b56a32ebe4ef3740ea5bc19b3983ac60cc32be5a665d54"}, + {file = "numpy-2.0.0-cp312-cp312-win_amd64.whl", hash = "sha256:a356364941fb0593bb899a1076b92dfa2029f6f5b8ba88a14fd0984aaf76d0df"}, + {file = "numpy-2.0.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:e61155fae27570692ad1d327e81c6cf27d535a5d7ef97648a17d922224b216de"}, + {file = "numpy-2.0.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:4554eb96f0fd263041baf16cf0881b3f5dafae7a59b1049acb9540c4d57bc8cb"}, + {file = "numpy-2.0.0-cp39-cp39-macosx_14_0_arm64.whl", hash = "sha256:903703372d46bce88b6920a0cd86c3ad82dae2dbef157b5fc01b70ea1cfc430f"}, + {file = "numpy-2.0.0-cp39-cp39-macosx_14_0_x86_64.whl", hash = "sha256:3e8e01233d57639b2e30966c63d36fcea099d17c53bf424d77f088b0f4babd86"}, + {file = "numpy-2.0.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1cde1753efe513705a0c6d28f5884e22bdc30438bf0085c5c486cdaff40cd67a"}, + {file = "numpy-2.0.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:821eedb7165ead9eebdb569986968b541f9908979c2da8a4967ecac4439bae3d"}, + {file = "numpy-2.0.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:9a1712c015831da583b21c5bfe15e8684137097969c6d22e8316ba66b5baabe4"}, + {file = "numpy-2.0.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:9c27f0946a3536403efb0e1c28def1ae6730a72cd0d5878db38824855e3afc44"}, + {file = "numpy-2.0.0-cp39-cp39-win32.whl", hash = "sha256:63b92c512d9dbcc37f9d81b123dec99fdb318ba38c8059afc78086fe73820275"}, + {file = "numpy-2.0.0-cp39-cp39-win_amd64.whl", hash = "sha256:3f6bed7f840d44c08ebdb73b1825282b801799e325bcbdfa6bc5c370e5aecc65"}, + {file = "numpy-2.0.0-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", 
hash = "sha256:9416a5c2e92ace094e9f0082c5fd473502c91651fb896bc17690d6fc475128d6"}, + {file = "numpy-2.0.0-pp39-pypy39_pp73-macosx_14_0_x86_64.whl", hash = "sha256:17067d097ed036636fa79f6a869ac26df7db1ba22039d962422506640314933a"}, + {file = "numpy-2.0.0-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:38ecb5b0582cd125f67a629072fed6f83562d9dd04d7e03256c9829bdec027ad"}, + {file = "numpy-2.0.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:cef04d068f5fb0518a77857953193b6bb94809a806bd0a14983a8f12ada060c9"}, + {file = "numpy-2.0.0.tar.gz", hash = "sha256:cf5d1c9e6837f8af9f92b6bd3e86d513cdc11f60fd62185cc49ec7d1aba34864"}, ] [[package]] @@ -526,6 +549,20 @@ files = [ {file = "six-1.16.0.tar.gz", hash = "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926"}, ] +[[package]] +name = "termcolor" +version = "2.4.0" +description = "ANSI color formatting for output in terminal" +optional = false +python-versions = ">=3.8" +files = [ + {file = "termcolor-2.4.0-py3-none-any.whl", hash = "sha256:9297c0df9c99445c2412e832e882a7884038a25617c60cea2ad69488d4040d63"}, + {file = "termcolor-2.4.0.tar.gz", hash = "sha256:aab9e56047c8ac41ed798fa36d892a37aca6b3e9159f3e0c24bc64a9b3ac7b7a"}, +] + +[package.extras] +tests = ["pytest", "pytest-cov"] + [[package]] name = "tomli" version = "2.0.1" @@ -551,4 +588,4 @@ files = [ [metadata] lock-version = "2.0" python-versions = ">=3.8,<3.13" -content-hash = "a001251df36357dba7b70e5c246a76022a313fb9e120ddc5e95f4cc1efa778b7" +content-hash = "d905f4221fc9dac67766e214aa716fafe85597c04a86494d3bdbbefb70f04fbf" diff --git a/pyproject.toml b/pyproject.toml index 5adde66..f99302b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -19,10 +19,14 @@ scipy = [ ] pyarrow = "^16.0.0" pyyaml = "^6.0.1" +fire = "^0.6.0" [tool.poetry.group.dev.dependencies] pytest = "^8.2.0" +[tool.poetry.scripts] +cosmicqc = "cosmicqc.cli:cli_analyze" + [tool.isort] profile = "black" diff --git a/src/cosmicqc/analyze.py 
b/src/cosmicqc/analyze.py index bf926e3..4c649e4 100644 --- a/src/cosmicqc/analyze.py +++ b/src/cosmicqc/analyze.py @@ -23,6 +23,7 @@ def identify_outliers( feature_thresholds: Union[Dict[str, float], str], feature_thresholds_file: Optional[str] = DEFAULT_QC_THRESHOLD_FILE, include_threshold_scores: bool = False, + export_path: Optional[str] = None, ) -> Union[pd.Series, pd.DataFrame]: """ This function uses z-scoring to format the data for detecting outlier @@ -35,8 +36,6 @@ def identify_outliers( df: Union[SCDataFrame, pd.DataFrame, str] DataFrame or file string-based filepath of a Parquet, CSV, or TSV file with CytoTable output or similar data. - metadata_columns: List[str] - List of metadata columns that should be outputted with the outlier data. feature_thresholds: Dict[str, float] One of two options: A dictionary with the feature name(s) as the key(s) and their assigned @@ -48,6 +47,13 @@ def identify_outliers( feature_thresholds_file: Optional[str] = DEFAULT_QC_THRESHOLD_FILE, An optional feature thresholds file where thresholds may be defined within a file. + include_threshold_scores: bool + Whether to include the threshold scores in addition to whether + the threshold set passes per row. + export_path: Optional[str] = None + An optional path to export the data using SCDataFrame export + capabilities. If None no export is performed. + Note: compatible exports are CSV's, TSV's, and parquet. Returns: Union[pd.Series, pd.DataFrame]: @@ -95,7 +101,7 @@ def identify_outliers( condition = outlier_df[zscore_columns[feature]] < threshold conditions.append(condition) - return ( + result = ( # create a boolean pd.series identifier for dataframe # based on all conditions for use within other functions. 
reduce(operator.and_, conditions) @@ -111,12 +117,18 @@ def identify_outliers( ) ) + if export_path is not None: + SCDataFrame(data=result).export(file_path=export_path) + + return result + def find_outliers( df: Union[SCDataFrame, pd.DataFrame, str], metadata_columns: List[str], feature_thresholds: Union[Dict[str, float], str], feature_thresholds_file: Optional[str] = DEFAULT_QC_THRESHOLD_FILE, + export_path: Optional[str] = None, ) -> pd.DataFrame: """ This function uses identify_outliers to return a dataframe @@ -139,6 +151,10 @@ def find_outliers( feature_thresholds_file: Optional[str] = DEFAULT_QC_THRESHOLD_FILE, An optional feature thresholds file where thresholds may be defined within a file. + export_path: Optional[str] = None + An optional path to export the data using SCDataFrame export + capabilities. If None no export is performed. + Note: compatible exports are CSV's, TSV's, and parquet. Returns: pd.DataFrame: @@ -174,8 +190,14 @@ def find_outliers( # Include metadata columns in the output DataFrame columns_to_include = list(feature_thresholds.keys()) + metadata_columns + result = outliers_df[columns_to_include] + + # export the file if specified + if export_path is not None: + SCDataFrame(data=result).export(file_path=export_path) + # Return outliers DataFrame with specified columns - return outliers_df[columns_to_include] + return result def label_outliers( @@ -183,6 +205,7 @@ def label_outliers( feature_thresholds: Optional[Union[Dict[str, float], str]] = None, feature_thresholds_file: Optional[str] = DEFAULT_QC_THRESHOLD_FILE, include_threshold_scores: bool = False, + export_path: Optional[str] = None, ) -> pd.DataFrame: """ Use identify_outliers to label the original dataset for @@ -206,6 +229,10 @@ def label_outliers( include_threshold_scores: bool = False Whether to include the scores in addition to whether an outlier was detected or not. 
+ export_path: Optional[str] = None + An optional path to export the data using SCDataFrame export + capabilities. If None no export is performed. + Note: compatible exports are CSV's, TSV's, and parquet. Returns: pd.DataFrame: @@ -224,7 +251,7 @@ def label_outliers( feature_thresholds_file=feature_thresholds_file, include_threshold_scores=include_threshold_scores, ) - return pd.concat( + result = pd.concat( [ df, ( @@ -265,7 +292,13 @@ def label_outliers( axis=1, ) # return a dataframe with a deduplicated columns by name - return labeled_df.loc[:, ~labeled_df.columns.duplicated()] + result = labeled_df.loc[:, ~labeled_df.columns.duplicated()] + + # export the file if specified + if export_path is not None: + SCDataFrame(data=result).export(file_path=export_path) + + return result def read_thresholds_set_from_file( diff --git a/src/cosmicqc/cli.py b/src/cosmicqc/cli.py new file mode 100644 index 0000000..e0d0a51 --- /dev/null +++ b/src/cosmicqc/cli.py @@ -0,0 +1,112 @@ +""" +Setup coSMicQC CLI through python-fire +""" + +import inspect +import sys +import types +from typing import Optional + +import fire +from fire import helptext, inspectutils, value_types +from fire.core import Display, FireError, _DictAsString, _OneLineResult +from fire.trace import FireTrace + +from . import analyze + + +# used to avoid bugs with python-fire and pandas string-based repr compatibility. +# referenced from https://github.com/google/python-fire/pull/446 +# to be removed after python-fire merges changes (uncertain of timeline) +def HasCustomRepr(component: object) -> bool: + """Reproduces above HasCustomStr function to determine if component has a + custom __repr__ method. + + ... + + Args: + component: The object to check for a custom __repr__ method. + Returns: + Whether `component` has a custom __repr__ method. 
+ """ + if hasattr(component, "__repr__"): + class_attrs = inspectutils.GetClassAttrsDict(type(component)) or {} + repr_attr = class_attrs.get("__repr__") + if repr_attr and repr_attr.defining_class is not object: + return True + return False + + +# used to avoid bugs with python-fire and pandas string-based repr compatibility. +# referenced with modifications from https://github.com/google/python-fire/pull/446 +# to be removed after python-fire merges changes (uncertain of timeline) +# ignore rule below added to help avoid triggering ruff linting checks on temporary fix. +def _PrintResult( # noqa: C901 + component_trace: FireTrace, verbose: bool = False, serialize: Optional[bool] = None +) -> None: + """Prints the result of the Fire call to stdout in a human readable way.""" + result = component_trace.GetResult() + # Allow users to modify the return value of the component and provide + # custom formatting. + if serialize: + if not callable(serialize): + raise FireError( + "The argument `serialize` must be empty or callable:", serialize + ) + result = serialize(result) + if value_types.HasCustomStr(result): + # If the object has a custom __str__ method, rather than one inherited from + # object, then we use that to serialize the object. + print(str(result)) + return + + elif HasCustomRepr(result): + # Same as above, but for __repr__. 
+ # For pandas.DataFrame, __str__ is inherited from object, but __repr__ has + # a custom implementation (see pandas.core.frame.DataFrame.__repr__) + print(str(result)) + return + + if isinstance(result, (list, set, frozenset, types.GeneratorType)): + for i in result: + print(_OneLineResult(i)) + elif inspect.isgeneratorfunction(result): + raise NotImplementedError + elif isinstance(result, dict) and value_types.IsSimpleGroup(result): + print(_DictAsString(result, verbose)) + elif isinstance(result, tuple): + print(_OneLineResult(result)) + elif isinstance(result, value_types.VALUE_TYPES): + if result is not None: + print(result) + else: + help_text = helptext.HelpText(result, trace=component_trace, verbose=verbose) + output = [help_text] + # used for displaying output through python-fire + Display(output, out=sys.stdout) + + +# replace the _PrintResult function with a fix for pandas dataframes +fire.core._PrintResult = _PrintResult + + +def cli_analyze() -> None: + """ + Run the analyze module functions through python-fire CLI + + This function serves as the CLI entry point for functions + within the analyze module. + """ + + fire.Fire(analyze) + + +if __name__ == "__main__": + """ + Setup the CLI with python-fire for the coSMicQC package. 
+ + This enables running the functions identify_outliers, find_outliers, + and label_outliers from the command line interface through analyze + """ + + cli_analyze() diff --git a/src/cosmicqc/scdataframe.py b/src/cosmicqc/scdataframe.py index b722a50..177683a 100644 --- a/src/cosmicqc/scdataframe.py +++ b/src/cosmicqc/scdataframe.py @@ -71,6 +71,11 @@ def __init__( self.data_source = "pandas.DataFrame" self.data = data + elif isinstance(data, pd.Series): + # if data is a pd.Series, remember this within the data_source attr + self.data_source = "pandas.Series" + self.data = pd.DataFrame(data) + elif isinstance(data, (pathlib.Path, str)): # if the data is a string or a pathlib path, remember the original source # through a data_source attr @@ -137,7 +142,7 @@ def __repr__(self: SCDataFrame_type) -> str: Returns the representation of the underlying pandas DataFrame. Returns: - pd.DataFrame: The data in a pandas DataFrame. + str: The string-based representation of a pandas DataFrame. """ return repr(self.data) diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_cli.py b/tests/test_cli.py new file mode 100644 index 0000000..622bd7e --- /dev/null +++ b/tests/test_cli.py @@ -0,0 +1,146 @@ +""" +Tests cosmicqc cli module +""" + +import pathlib + +from pyarrow import parquet + +from .utils import run_cli_command + + +def test_cli_util(): + """ + Test the run_cli_command for successful output + """ + + command = """echo 'hello world'""" + stdout, stderr, returncode = run_cli_command(command) + + assert returncode == 0 + + +def test_cli_identify_outliers(tmp_path: pathlib.Path, basic_outlier_csv: str): + """ + Test the `identify_outliers` function of the CLI. 
+ """ + + stdout, stderr, returncode = run_cli_command( + ( + f"""cosmicqc identify_outliers --df {basic_outlier_csv}""" + """ --feature_thresholds {"example_feature":1.0}""" + f" --export_path {tmp_path}/identify_outliers_output.parquet" + ) + ) + + print(stderr) + assert returncode == 0 + assert ( + stdout.strip() + == """0 False +1 False +2 False +3 False +4 False +5 False +6 False +7 False +8 True +9 True +Name: Z_Score_example_feature, dtype: bool""".strip() + ) + + assert parquet.read_table( + f"{tmp_path}/identify_outliers_output.parquet" + ).to_pydict() == { + "Z_Score_example_feature": [ + False, + False, + False, + False, + False, + False, + False, + False, + True, + True, + ] + } + + +def test_cli_find_outliers(tmp_path: pathlib.Path, basic_outlier_csv: str): + """ + Test the `find_outliers` function of the CLI. + """ + + stdout, stderr, returncode = run_cli_command( + ( + f"""cosmicqc find_outliers --df {basic_outlier_csv}""" + """ --metadata_columns [] --feature_thresholds {"example_feature":1.0}""" + f" --export_path {tmp_path}/find_outliers_output.parquet" + ) + ) + + assert returncode == 0 + assert ( + stdout.strip() + == """Number of outliers: 2 +Outliers Range: +example_feature Min: 9 +example_feature Max: 10 + example_feature +8 9 +9 10""".strip() + ) + + assert parquet.read_table( + f"{tmp_path}/find_outliers_output.parquet" + ).to_pydict() == {"example_feature": [9, 10], "__index_level_0__": [8, 9]} + + +def test_cli_label_outliers(tmp_path: pathlib.Path, basic_outlier_csv: str): + """ + Test the `label_outliers` function of the CLI. 
+ """ + + stdout, stderr, returncode = run_cli_command( + ( + f"""cosmicqc label_outliers --df {basic_outlier_csv}""" + """ --feature_thresholds {"example_feature":1.0}""" + f" --export_path {tmp_path}/label_outliers_output.parquet" + ) + ) + + assert returncode == 0 + assert ( + stdout.strip() + == """example_feature outlier_custom +0 1 False +1 2 False +2 3 False +3 4 False +4 5 False +5 6 False +6 7 False +7 8 False +8 9 True +9 10 True""".strip() + ) + + assert parquet.read_table( + f"{tmp_path}/label_outliers_output.parquet" + ).to_pydict() == { + "example_feature": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], + "outlier_custom": [ + False, + False, + False, + False, + False, + False, + False, + False, + True, + True, + ], + } diff --git a/tests/utils.py b/tests/utils.py new file mode 100644 index 0000000..a3d6373 --- /dev/null +++ b/tests/utils.py @@ -0,0 +1,23 @@ +""" +Utilities for running pytest tests in coSMicQC +""" + +import subprocess +from typing import Tuple + + +def run_cli_command(command: str) -> Tuple[str, str, int]: + """ + Run a CLI command using subprocess and capture the output and return code. + + Args: + command (str): The command to run as a single space-separated string. + + Returns: + tuple: (str: stdout, str: stderr, int: returncode) + """ + + result = subprocess.run( + command.split(" "), capture_output=True, text=True, check=False + ) + return result.stdout, result.stderr, result.returncode