diff --git a/.gitignore b/.gitignore index a84e64ae..56a1f382 100644 --- a/.gitignore +++ b/.gitignore @@ -118,3 +118,4 @@ tests/input/discrepant_snps[12].csv tests/input/empty.txt tests/input/ftdna.csv.gz tests/input/generic.fa.gz +tests/input/testvcf.vcf.gz diff --git a/Pipfile b/Pipfile index 92a4386b..1eb12647 100644 --- a/Pipfile +++ b/Pipfile @@ -7,6 +7,9 @@ verify_ssl = true pytest = "*" pytest-cov = "*" pytest-watch = "*" +sphinx = "*" +sphinx-rtd-theme = "*" +black = "==19.10b0" [packages] snps = {editable = true,path = "."} diff --git a/Pipfile.lock b/Pipfile.lock index f2124d55..3d7991ea 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -1,7 +1,7 @@ { "_meta": { "hash": { - "sha256": "04d6573b6a023a142b251876572417e84e18882553c682c30b6bcb74cbc983bd" + "sha256": "63ccb11406dcb107cca759a568ffdef146d3e3dc4a9833552f5d84b41a6f7bf2" }, "pipfile-spec": 6, "requires": { @@ -25,44 +25,54 @@ }, "numpy": { "hashes": [ - "sha256:05dbfe72684cc14b92568de1bc1f41e5f62b00f714afc9adee42f6311738091f", - "sha256:0d82cb7271a577529d07bbb05cb58675f2deb09772175fab96dc8de025d8ac05", - "sha256:10132aa1fef99adc85a905d82e8497a580f83739837d7cbd234649f2e9b9dc58", - "sha256:12322df2e21f033a60c80319c25011194cd2a21294cc66fee0908aeae2c27832", - "sha256:16f19b3aa775dddc9814e02a46b8e6ae6a54ed8cf143962b4e53f0471dbd7b16", - "sha256:3d0b0989dd2d066db006158de7220802899a1e5c8cf622abe2d0bd158fd01c2c", - "sha256:438a3f0e7b681642898fd7993d38e2bf140a2d1eafaf3e89bb626db7f50db355", - "sha256:5fd214f482ab53f2cea57414c5fb3e58895b17df6e6f5bca5be6a0bb6aea23bb", - "sha256:73615d3edc84dd7c4aeb212fa3748fb83217e00d201875a47327f55363cef2df", - "sha256:7bd355ad7496f4ce1d235e9814ec81ee3d28308d591c067ce92e49f745ba2c2f", - "sha256:7d077f2976b8f3de08a0dcf5d72083f4af5411e8fddacd662aae27baa2601196", - "sha256:a4092682778dc48093e8bda8d26ee8360153e2047826f95a3f5eae09f0ae3abf", - "sha256:b458de8624c9f6034af492372eb2fee41a8e605f03f4732f43fc099e227858b2", - "sha256:e70fc8ff03a961f13363c2c95ef8285e0cf6a720f8271836f852cc0fa64e97c8", - "sha256:ee8e9d7cad5fe6dde50ede0d2e978d81eafeaa6233fb0b8719f60214cf226578", - "sha256:f4a4f6aba148858a5a5d546a99280f71f5ee6ec8182a7d195af1a914195b21a2" - ], - "version": "==1.17.2" + "sha256:0b0dd8f47fb177d00fa6ef2d58783c4f41ad3126b139c91dd2f7c4b3fdf5e9a5", + "sha256:25ffe71f96878e1da7e014467e19e7db90ae7d4e12affbc73101bcf61785214e", + "sha256:26efd7f7d755e6ca966a5c0ac5a930a87dbbaab1c51716ac26a38f42ecc9bc4b", + "sha256:28b1180c758abf34a5c3fea76fcee66a87def1656724c42bb14a6f9717a5bdf7", + "sha256:2e418f0a59473dac424f888dd57e85f77502a593b207809211c76e5396ae4f5c", + "sha256:30c84e3a62cfcb9e3066f25226e131451312a044f1fe2040e69ce792cb7de418", + "sha256:4650d94bb9c947151737ee022b934b7d9a845a7c76e476f3e460f09a0c8c6f39", + "sha256:4dd830a11e8724c9c9379feed1d1be43113f8bcce55f47ea7186d3946769ce26", + "sha256:4f2a2b279efde194877aff1f76cf61c68e840db242a5c7169f1ff0fd59a2b1e2", + "sha256:62d22566b3e3428dfc9ec972014c38ed9a4db4f8969c78f5414012ccd80a149e", + "sha256:669795516d62f38845c7033679c648903200980d68935baaa17ac5c7ae03ae0c", + "sha256:75fcd60d682db3e1f8fbe2b8b0c6761937ad56d01c1dc73edf4ef2748d5b6bc4", + "sha256:9395b0a41e8b7e9a284e3be7060db9d14ad80273841c952c83a5afc241d2bd98", + "sha256:9e37c35fc4e9410093b04a77d11a34c64bf658565e30df7cbe882056088a91c1", + "sha256:a0678793096205a4d784bd99f32803ba8100f639cf3b932dc63b21621390ea7e", + "sha256:b46554ad4dafb2927f88de5a1d207398c5385edbb5c84d30b3ef187c4a3894d8", + "sha256:c867eeccd934920a800f65c6068acdd6b87e80d45cd8c8beefff783b23cdc462", + "sha256:dd0667f5be56fb1b570154c2c0516a528e02d50da121bbbb2cbb0b6f87f59bc2", + "sha256:de2b1c20494bdf47f0160bd88ed05f5e48ae5dc336b8de7cfade71abcc95c0b9", + "sha256:f1df7b2b7740dd777571c732f98adb5aad5450aee32772f1b39249c8a50386f6", + "sha256:ffca69e29079f7880c5392bf675eb8b4146479d976ae1924d01cd92b04cccbcc" + ], + "version": "==1.17.3" }, "pandas": { "hashes": [ - "sha256:18d91a9199d1dfaa01ad645f7540370ba630bdcef09daaf9edf45b4b1bca0232", - "sha256:3f26e5da310a0c0b83ea50da1fd397de2640b02b424aa69be7e0784228f656c9", - "sha256:4182e32f4456d2c64619e97c58571fa5ca0993d1e8c2d9ca44916185e1726e15", - "sha256:426e590e2eb0e60f765271d668a30cf38b582eaae5ec9b31229c8c3c10c5bc21", - "sha256:5eb934a8f0dc358f0e0cdf314072286bbac74e4c124b64371395e94644d5d919", - "sha256:717928808043d3ea55b9bcde636d4a52d2236c246f6df464163a66ff59980ad8", - "sha256:8145f97c5ed71827a6ec98ceaef35afed1377e2d19c4078f324d209ff253ecb5", - "sha256:8744c84c914dcc59cbbb2943b32b7664df1039d99e834e1034a3372acb89ea4d", - "sha256:c1ac1d9590d0c9314ebf01591bd40d4c03d710bfc84a3889e5263c97d7891dee", - "sha256:cb2e197b7b0687becb026b84d3c242482f20cbb29a9981e43604eb67576da9f6", - "sha256:d4001b71ad2c9b84ff18b182cea22b7b6cbf624216da3ea06fb7af28d1f93165", - "sha256:d8930772adccb2882989ab1493fa74bd87d47c8ac7417f5dd3dd834ba8c24dc9", - "sha256:dfbb0173ee2399bc4ed3caf2d236e5c0092f948aafd0a15fbe4a0e77ee61a958", - "sha256:eebfbba048f4fa8ac711b22c78516e16ff8117d05a580e7eeef6b0c2be554c18", - "sha256:f1b21bc5cf3dbea53d33615d1ead892dfdae9d7052fa8898083bec88be20dcd2" - ], - "version": "==0.25.1" + "sha256:00dff3a8e337f5ed7ad295d98a31821d3d0fe7792da82d78d7fd79b89c03ea9d", + "sha256:22361b1597c8c2ffd697aa9bf85423afa9e1fcfa6b1ea821054a244d5f24d75e", + "sha256:255920e63850dc512ce356233081098554d641ba99c3767dde9e9f35630f994b", + "sha256:26382aab9c119735908d94d2c5c08020a4a0a82969b7e5eefb92f902b3b30ad7", + "sha256:33970f4cacdd9a0ddb8f21e151bfb9f178afb7c36eb7c25b9094c02876f385c2", + "sha256:4545467a637e0e1393f7d05d61dace89689ad6d6f66f267f86fff737b702cce9", + "sha256:52da74df8a9c9a103af0a72c9d5fdc8e0183a90884278db7f386b5692a2220a4", + "sha256:61741f5aeb252f39c3031d11405305b6d10ce663c53bc3112705d7ad66c013d0", + "sha256:6a3ac2c87e4e32a969921d1428525f09462770c349147aa8e9ab95f88c71ec71", + "sha256:7458c48e3d15b8aaa7d575be60e1e4dd70348efcd9376656b72fecd55c59a4c3", + "sha256:78bf638993219311377ce9836b3dc05f627a666d0dbc8cec37c0ff3c9ada673b", + "sha256:8153705d6545fd9eb6dd2bc79301bff08825d2e2f716d5dced48daafc2d0b81f", + "sha256:89f8fdf8c0ff3ed1e2c6f5c8482cf64fcc9645afd49be0a872a22f46d0bee57b", + "sha256:975c461accd14e89d71772e89108a050fa824c0b87a67d34cedf245f6681fc17", + "sha256:9962957a27bfb70ab64103d0a7b42fa59c642fb4ed4cb75d0227b7bb9228535d", + "sha256:adc3d3a3f9e59a38d923e90e20c4922fc62d1e5a03d083440468c6d8f3f1ae0a", + "sha256:bbe3eb765a0b1e578833d243e2814b60c825b7fdbf4cdfe8e8aae8a08ed56ecf", + "sha256:df8864824b1fe488cf778c3650ee59c3a0d8f42e53707de167ba6b4f7d35f133", + "sha256:e45055c30a608076e31a9fcd780a956ed3b1fa20db61561b8d88b79259f526f7", + "sha256:ee50c2142cdcf41995655d499a157d0a812fce55c97d9aad13bc1eef837ed36c" + ], + "version": "==0.25.3" }, "python-dateutil": { "hashes": [ @@ -73,16 +83,10 @@ }, "pytz": { "hashes": [ - "sha256:26c0b32e437e54a18161324a2fca3c4b9846b74a8dccddd843113109e1116b32", - "sha256:c894d57500a4cd2d5c71114aaab77dbab5eabd9022308ce5ac9bb93a60a6f0c7" + "sha256:1c557d7d0e871de1f5ccd5833f60fb2550652da6be2693c1e02300743d21500d", + "sha256:b02c06db6cf09c12dd25137e563b31700d3b80fcc4ad23abb7a315f2789819be" ], - "version": "==2019.2" - }, - "pyvcf": { - "hashes": [ - "sha256:e9d872513d179d229ab61da47a33f42726e9613784d1cb2bac3f8e2642f6f9d9" - ], - "version": "==0.6.8" + "version": "==2019.3" }, "six": { "hashes": [ @@ -97,6 +101,20 @@ } }, "develop": { + "alabaster": { + "hashes": [ + "sha256:446438bdcca0e05bd45ea2de1668c1d9b032e1a9154c2c259092d77031ddd359", + "sha256:a661d72d58e6ea8a57f7a86e37d86716863ee5e92788398526d58b26a4e4dc02" + ], + "version": "==0.7.12" + }, + "appdirs": { + "hashes": [ + "sha256:9e5896d1372858f8dd3344faf4e5014d21849c756c8d5701f78f8a103b372d92", + "sha256:d8b24664561d0d34ddfaec54636d502d7cea6e29c3eaf68f3df6180863e2166e" + ], + "version": "==1.4.3" + }, "argh": { "hashes": [ "sha256:a9b3aaa1904eeb78e32394cd46c6f37ac0fb4af6dc488daa58971bdc7d7fcaf3", @@ -113,10 +131,46 @@ }, "attrs": { "hashes": [ - "sha256:ec20e7a4825331c1b5ebf261d111e16fa9612c1f7a5e1f884f12bd53a664dfd2", - "sha256:f913492e1663d3c36f502e5e9ba6cd13cf19d7fab50aa13239e420fef95e1396" + "sha256:08a96c641c3a74e44eb59afb61a24f2cb9f4d7188748e76ba4bb5edfa3cb7d1c", + "sha256:f7b7ce16570fe9965acd6d30101a28f62fb4a7f9e926b3bbc9b61f8b04247e72" ], - "version": "==19.2.0" + "version": "==19.3.0" + }, + "babel": { + "hashes": [ + "sha256:af92e6106cb7c55286b25b38ad7695f8b4efb36a90ba483d7f7a6628c46158ab", + "sha256:e86135ae101e31e2c8ec20a4e0c5220f4eed12487d5cf3f78be7e98d3a57fc28" + ], + "version": "==2.7.0" + }, + "black": { + "hashes": [ + "sha256:1b30e59be925fafc1ee4565e5e08abef6b03fe455102883820fe5ee2e4734e0b", + "sha256:c2edb73a08e9e0e6f65a0e6af18b059b8b1cdd5bef997d7a0b181df93dc81539" + ], + "index": "pypi", + "version": "==19.10b0" + }, + "certifi": { + "hashes": [ + "sha256:e4f3620cfea4f83eedc95b24abd9cd56f3c4b146dd0177e83a21b4eb49e21e50", + "sha256:fd7c7c74727ddcf00e9acd26bba8da604ffec95bf1c2144e67aff7a8b50e6cef" + ], + "version": "==2019.9.11" + }, + "chardet": { + "hashes": [ + "sha256:84ab92ed1c4d4f16916e05906b6b75a6c0fb5db821cc65e70cbd64a3e2a5eaae", + "sha256:fc323ffcaeaed0e0a02bf4d117757b98aed530d9ed4531e3e15460124c106691" + ], + "version": "==3.0.4" + }, + "click": { + "hashes": [ + "sha256:2335065e6395b9e67ca716de5f7526736bfa6ceead690adf616d925bdc622b13", + "sha256:5b94b49521f6456670fdb30cd82a4eca9412788a93fa6dd6df72c94d5a8ff2d7" + ], + "version": "==7.0" }, "colorama": { "hashes": [ @@ -168,6 +222,28 @@ ], "version": "==0.6.2" }, + "docutils": { + "hashes": [ + "sha256:6c4f696463b79f1fb8ba0c594b63840ebd41f059e92b31957c46b74a4599b6d0", + "sha256:9e4d7ecfc600058e07ba661411a2b7de2fd0fafa17d1a7f7361cd47b1175c827", + "sha256:a2aeea129088da402665e92e0b25b04b073c04b2dce4ab65caaa38b7ce2e1a99" + ], + "version": "==0.15.2" + }, + "idna": { + "hashes": [ + "sha256:c357b3f628cf53ae2c4c05627ecc484553142ca23264e593d327bcde5e9c3407", + "sha256:ea8b7f6188e6fa117537c3df7da9fc686d485087abf6ac197f9c46432f7e4a3c" + ], + "version": "==2.8" + }, + "imagesize": { + "hashes": [ + "sha256:3f349de3eb99145973fefb7dbe38554414e5c30abd0c8e4b970a7c9d09f3a1d8", + "sha256:f3832918bc3c66617f92e35f5d70729187676313caa60c187eb0f28b8fe5e3b5" + ], + "version": "==1.1.0" + }, "importlib-metadata": { "hashes": [ "sha256:aa18d7378b00b40847790e7c27e11673d7fed219354109d0e7b9e5b25dc3ad26", @@ -176,6 +252,46 @@ "markers": "python_version < '3.8'", "version": "==0.23" }, + "jinja2": { + "hashes": [ + "sha256:74320bb91f31270f9551d46522e33af46a80c3d619f4a4bf42b3164d30b5911f", + "sha256:9fe95f19286cfefaa917656583d020be14e7859c6b0252588391e47db34527de" + ], + "version": "==2.10.3" + }, + "markupsafe": { + "hashes": [ + "sha256:00bc623926325b26bb9605ae9eae8a215691f33cae5df11ca5424f06f2d1f473", + "sha256:09027a7803a62ca78792ad89403b1b7a73a01c8cb65909cd876f7fcebd79b161", + "sha256:09c4b7f37d6c648cb13f9230d847adf22f8171b1ccc4d5682398e77f40309235", + "sha256:1027c282dad077d0bae18be6794e6b6b8c91d58ed8a8d89a89d59693b9131db5", + "sha256:24982cc2533820871eba85ba648cd53d8623687ff11cbb805be4ff7b4c971aff", + "sha256:29872e92839765e546828bb7754a68c418d927cd064fd4708fab9fe9c8bb116b", + "sha256:43a55c2930bbc139570ac2452adf3d70cdbb3cfe5912c71cdce1c2c6bbd9c5d1", + "sha256:46c99d2de99945ec5cb54f23c8cd5689f6d7177305ebff350a58ce5f8de1669e", + "sha256:500d4957e52ddc3351cabf489e79c91c17f6e0899158447047588650b5e69183", + "sha256:535f6fc4d397c1563d08b88e485c3496cf5784e927af890fb3c3aac7f933ec66", + "sha256:62fe6c95e3ec8a7fad637b7f3d372c15ec1caa01ab47926cfdf7a75b40e0eac1", + "sha256:6dd73240d2af64df90aa7c4e7481e23825ea70af4b4922f8ede5b9e35f78a3b1", + "sha256:717ba8fe3ae9cc0006d7c451f0bb265ee07739daf76355d06366154ee68d221e", + "sha256:79855e1c5b8da654cf486b830bd42c06e8780cea587384cf6545b7d9ac013a0b", + "sha256:7c1699dfe0cf8ff607dbdcc1e9b9af1755371f92a68f706051cc8c37d447c905", + "sha256:88e5fcfb52ee7b911e8bb6d6aa2fd21fbecc674eadd44118a9cc3863f938e735", + "sha256:8defac2f2ccd6805ebf65f5eeb132adcf2ab57aa11fdf4c0dd5169a004710e7d", + "sha256:98c7086708b163d425c67c7a91bad6e466bb99d797aa64f965e9d25c12111a5e", + "sha256:9add70b36c5666a2ed02b43b335fe19002ee5235efd4b8a89bfcf9005bebac0d", + "sha256:9bf40443012702a1d2070043cb6291650a0841ece432556f784f004937f0f32c", + "sha256:ade5e387d2ad0d7ebf59146cc00c8044acbd863725f887353a10df825fc8ae21", + "sha256:b00c1de48212e4cc9603895652c5c410df699856a2853135b3967591e4beebc2", + "sha256:b1282f8c00509d99fef04d8ba936b156d419be841854fe901d8ae224c59f0be5", + "sha256:b2051432115498d3562c084a49bba65d97cf251f5a331c64a12ee7e04dacc51b", + "sha256:ba59edeaa2fc6114428f1637ffff42da1e311e29382d81b339c1817d37ec93c6", + "sha256:c8716a48d94b06bb3b2524c2b77e055fb313aeb4ea620c8dd03a105574ba704f", + "sha256:cd5df75523866410809ca100dc9681e301e3c27567cf498077e8551b6d20e42f", + "sha256:e249096428b3ae81b08327a63a485ad0878de3fb939049038579ac0ef61e17e7" + ], + "version": "==1.1.1" + }, "more-itertools": { "hashes": [ "sha256:409cd48d4db7052af495b09dec721011634af3753ae1ef92d2b32f73a745f832", @@ -190,6 +306,12 @@ ], "version": "==19.2" }, + "pathspec": { + "hashes": [ + "sha256:e285ccc8b0785beadd4c18e5708b12bb8fcf529a1e61215b3feff1d1e559ea5c" + ], + "version": "==0.6.0" + }, "pathtools": { "hashes": [ "sha256:7c35c5421a39bb82e58018febd90e3b6e5db34c5443aaaf742b3f33d4655f1c0" @@ -210,6 +332,13 @@ ], "version": "==1.8.0" }, + "pygments": { + "hashes": [ + "sha256:71e430bc85c88a430f000ac1d9b331d2407f681d6f6aec95e8bcfbc3df5b0127", + "sha256:881c4c157e45f30af185c1ffe8d549d48ac9127433f2c380c24b84572ad66297" + ], + "version": "==2.4.2" + }, "pyparsing": { "hashes": [ "sha256:6f98a7b9397e206d78cc01df10131398f1c8b8510a2f4d97d9abd82e1aacdd80", @@ -219,11 +348,11 @@ }, "pytest": { "hashes": [ - "sha256:13c1c9b22127a77fc684eee24791efafcef343335d855e3573791c68588fe1a5", - "sha256:d8ba7be9466f55ef96ba203fc0f90d0cf212f2f927e69186e1353e30bc7f62e5" + "sha256:27abc3fef618a01bebb1f0d6d303d2816a99aa87a5968ebc32fe971be91eb1e6", + "sha256:58cee9e09242937e136dbb3dab466116ba20d6b7828c7620f23947f37eb4dae4" ], "index": "pypi", - "version": "==5.2.0" + "version": "==5.2.2" }, "pytest-cov": { "hashes": [ @@ -240,6 +369,13 @@ "index": "pypi", "version": "==4.2.0" }, + "pytz": { + "hashes": [ + "sha256:1c557d7d0e871de1f5ccd5833f60fb2550652da6be2693c1e02300743d21500d", + "sha256:b02c06db6cf09c12dd25137e563b31700d3b80fcc4ad23abb7a315f2789819be" + ], + "version": "==2019.3" + }, "pyyaml": { "hashes": [ "sha256:0113bc0ec2ad727182326b61326afa3d1d8280ae1122493553fd6f4397f33df9", @@ -258,6 +394,31 @@ ], "version": "==5.1.2" }, + "regex": { + "hashes": [ + "sha256:15454b37c5a278f46f7aa2d9339bda450c300617ca2fca6558d05d870245edc7", + "sha256:1ad40708c255943a227e778b022c6497c129ad614bb7a2a2f916e12e8a359ee7", + "sha256:5e00f65cc507d13ab4dfa92c1232d004fa202c1d43a32a13940ab8a5afe2fb96", + "sha256:604dc563a02a74d70ae1f55208ddc9bfb6d9f470f6d1a5054c4bd5ae58744ab1", + "sha256:720e34a539a76a1fedcebe4397290604cc2bdf6f81eca44adb9fb2ea071c0c69", + "sha256:7caf47e4a9ac6ef08cabd3442cc4ca3386db141fb3c8b2a7e202d0470028e910", + "sha256:7faf534c1841c09d8fefa60ccde7b9903c9b528853ecf41628689793290ca143", + "sha256:b4e0406d822aa4993ac45072a584d57aa4931cf8288b5455bbf30c1d59dbad59", + "sha256:c31eaf28c6fe75ea329add0022efeed249e37861c19681960f99bbc7db981fb2", + "sha256:c7393597191fc2043c744db021643549061e12abe0b3ff5c429d806de7b93b66", + "sha256:d2b302f8cdd82c8f48e9de749d1d17f85ce9a0f082880b9a4859f66b07037dc6", + "sha256:e3d8dd0ec0ea280cf89026b0898971f5750a7bd92cb62c51af5a52abd020054a", + "sha256:ec032cbfed59bd5a4b8eab943c310acfaaa81394e14f44454ad5c9eba4f24a74" + ], + "version": "==2019.11.1" + }, + "requests": { + "hashes": [ + "sha256:11e007a8a2aa0323f5a921e9e6a2d7e4e67d9877e85773fba9ba6419025cbeb4", + "sha256:9cf5292fcd0f598c671cfc1e0d7d1a7f13bb8085e9a590f48c010551dc6c4b31" + ], + "version": "==2.22.0" + }, "six": { "hashes": [ "sha256:3350809f0555b11f552448330d0b52d5f24c91a322ea4a15ef22629740f3761c", @@ -265,6 +426,110 @@ ], "version": "==1.12.0" }, + "snowballstemmer": { + "hashes": [ + "sha256:209f257d7533fdb3cb73bdbd24f436239ca3b2fa67d56f6ff88e86be08cc5ef0", + "sha256:df3bac3df4c2c01363f3dd2cfa78cce2840a79b9f1c2d2de9ce8d31683992f52" + ], + "version": "==2.0.0" + }, + "sphinx": { + "hashes": [ + "sha256:31088dfb95359384b1005619827eaee3056243798c62724fd3fa4b84ee4d71bd", + "sha256:52286a0b9d7caa31efee301ec4300dbdab23c3b05da1c9024b4e84896fb73d79" + ], + "index": "pypi", + "version": "==2.2.1" + }, + "sphinx-rtd-theme": { + "hashes": [ + "sha256:00cf895504a7895ee433807c62094cf1e95f065843bf3acd17037c3e9a2becd4", + "sha256:728607e34d60456d736cc7991fd236afb828b21b82f956c5ea75f94c8414040a" + ], + "index": "pypi", + "version": "==0.4.3" + }, + "sphinxcontrib-applehelp": { + "hashes": [ + "sha256:edaa0ab2b2bc74403149cb0209d6775c96de797dfd5b5e2a71981309efab3897", + "sha256:fb8dee85af95e5c30c91f10e7eb3c8967308518e0f7488a2828ef7bc191d0d5d" + ], + "version": "==1.0.1" + }, + "sphinxcontrib-devhelp": { + "hashes": [ + "sha256:6c64b077937330a9128a4da74586e8c2130262f014689b4b89e2d08ee7294a34", + "sha256:9512ecb00a2b0821a146736b39f7aeb90759834b07e81e8cc23a9c70bacb9981" + ], + "version": "==1.0.1" + }, + "sphinxcontrib-htmlhelp": { + "hashes": [ + "sha256:4670f99f8951bd78cd4ad2ab962f798f5618b17675c35c5ac3b2132a14ea8422", + "sha256:d4fd39a65a625c9df86d7fa8a2d9f3cd8299a3a4b15db63b50aac9e161d8eff7" + ], + "version": "==1.0.2" + }, + "sphinxcontrib-jsmath": { + "hashes": [ + "sha256:2ec2eaebfb78f3f2078e73666b1415417a116cc848b72e5172e596c871103178", + "sha256:a9925e4a4587247ed2191a22df5f6970656cb8ca2bd6284309578f2153e0c4b8" + ], + "version": "==1.0.1" + }, + "sphinxcontrib-qthelp": { + "hashes": [ + "sha256:513049b93031beb1f57d4daea74068a4feb77aa5630f856fcff2e50de14e9a20", + "sha256:79465ce11ae5694ff165becda529a600c754f4bc459778778c7017374d4d406f" + ], + "version": "==1.0.2" + }, + "sphinxcontrib-serializinghtml": { + "hashes": [ + "sha256:c0efb33f8052c04fd7a26c0a07f1678e8512e0faec19f4aa8f2473a8b81d5227", + "sha256:db6615af393650bf1151a6cd39120c29abaf93cc60db8c48eb2dddbfdc3a9768" + ], + "version": "==1.1.3" + }, + "toml": { + "hashes": [ + "sha256:229f81c57791a41d65e399fc06bf0848bab550a9dfd5ed66df18ce5f05e73d5c", + "sha256:235682dd292d5899d361a811df37e04a8828a5b1da3115886b73cf81ebc9100e" + ], + "version": "==0.10.0" + }, + "typed-ast": { + "hashes": [ + "sha256:1170afa46a3799e18b4c977777ce137bb53c7485379d9706af8a59f2ea1aa161", + "sha256:18511a0b3e7922276346bcb47e2ef9f38fb90fd31cb9223eed42c85d1312344e", + "sha256:262c247a82d005e43b5b7f69aff746370538e176131c32dda9cb0f324d27141e", + "sha256:2b907eb046d049bcd9892e3076c7a6456c93a25bebfe554e931620c90e6a25b0", + "sha256:354c16e5babd09f5cb0ee000d54cfa38401d8b8891eefa878ac772f827181a3c", + "sha256:48e5b1e71f25cfdef98b013263a88d7145879fbb2d5185f2a0c79fa7ebbeae47", + "sha256:4e0b70c6fc4d010f8107726af5fd37921b666f5b31d9331f0bd24ad9a088e631", + "sha256:630968c5cdee51a11c05a30453f8cd65e0cc1d2ad0d9192819df9978984529f4", + "sha256:66480f95b8167c9c5c5c87f32cf437d585937970f3fc24386f313a4c97b44e34", + "sha256:71211d26ffd12d63a83e079ff258ac9d56a1376a25bc80b1cdcdf601b855b90b", + "sha256:7954560051331d003b4e2b3eb822d9dd2e376fa4f6d98fee32f452f52dd6ebb2", + "sha256:838997f4310012cf2e1ad3803bce2f3402e9ffb71ded61b5ee22617b3a7f6b6e", + "sha256:95bd11af7eafc16e829af2d3df510cecfd4387f6453355188342c3e79a2ec87a", + "sha256:bc6c7d3fa1325a0c6613512a093bc2a2a15aeec350451cbdf9e1d4bffe3e3233", + "sha256:cc34a6f5b426748a507dd5d1de4c1978f2eb5626d51326e43280941206c209e1", + "sha256:d755f03c1e4a51e9b24d899561fec4ccaf51f210d52abdf8c07ee2849b212a36", + "sha256:d7c45933b1bdfaf9f36c579671fec15d25b06c8398f113dab64c18ed1adda01d", + "sha256:d896919306dd0aa22d0132f62a1b78d11aaf4c9fc5b3410d3c666b818191630a", + "sha256:fdc1c9bbf79510b76408840e009ed65958feba92a88833cdceecff93ae8fff66", + "sha256:ffde2fbfad571af120fcbfbbc61c72469e72f550d676c3342492a9dfdefb8f12" + ], + "version": "==1.4.0" + }, + "urllib3": { + "hashes": [ + "sha256:3de946ffbed6e6746608990594d08faac602528ac7015ac28d33cee6a45b7398", + "sha256:9a107b99a5393caf59c7aa3c1249c16e6879447533d0887f4336dde834c7be86" + ], + "version": "==1.25.6" + }, "watchdog": { "hashes": [ "sha256:965f658d0732de3188211932aeb0bb457587f04f63ab4c1e33eab878e9de961d" diff --git a/README.rst b/README.rst index 4619fa9f..dd2f8d4d 100644 --- a/README.rst +++ b/README.rst @@ -34,7 +34,6 @@ Dependencies - `numpy `_ - `pandas `_ - `atomicwrites `_ -- `PyVCF `_ Installation ------------ @@ -48,7 +47,14 @@ Examples -------- Download Example Data ````````````````````` -Let's download some example data from `openSNP `_: +First, let's setup logging to get some helpful output: + +>>> import logging, sys +>>> logger = logging.getLogger() +>>> logger.setLevel(logging.DEBUG) +>>> logger.addHandler(logging.StreamHandler(sys.stdout)) + +Now we're ready to download some example data from `openSNP `_: >>> from snps.resources import Resources >>> r = Resources() @@ -63,7 +69,8 @@ Load a `23andMe `_ raw data file: >>> from snps import SNPs >>> s = SNPs('resources/662.23andme.340.txt.gz') -The loaded SNPs are available via a ``pandas.DataFrame``: +The ``SNPs`` class accepts a path to a file or a bytes object. A ``Reader`` class attempts to +infer the data source and load the SNPs. The loaded SNPs are available via a ``pandas.DataFrame``: >>> df = s.snps >>> df.columns.values diff --git a/docs/conf.py b/docs/conf.py index 1197d4e4..1275ee70 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -126,7 +126,7 @@ def __getattr__(cls, name): # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". -html_static_path = ["_static"] +html_static_path = [] # Custom sidebar templates, must be a dictionary that maps document names # to template names. diff --git a/docs/index.rst b/docs/index.rst index 13ae28d2..8d9790f9 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -15,6 +15,7 @@ README output_files snps_banner + snps Indices and tables ================== diff --git a/docs/modules.rst b/docs/modules.rst deleted file mode 100644 index f459e487..00000000 --- a/docs/modules.rst +++ /dev/null @@ -1,7 +0,0 @@ -snps -==== - -.. toctree:: - :maxdepth: 4 - - snps diff --git a/docs/snps.rst b/docs/snps.rst index 78836ba0..9658411b 100644 --- a/docs/snps.rst +++ b/docs/snps.rst @@ -1,11 +1,19 @@ snps package ============ +Module +------ + +.. automodule:: snps + :members: + :undoc-members: + :show-inheritance: + Submodules ---------- snps\.ensembl module --------------------- +~~~~~~~~~~~~~~~~~~~~ .. automodule:: snps.ensembl :members: @@ -13,7 +21,7 @@ snps\.ensembl module :show-inheritance: snps\.io module ---------------- +~~~~~~~~~~~~~~~ .. automodule:: snps.io :members: @@ -21,7 +29,7 @@ snps\.io module :show-inheritance: snps\.resources module ----------------------- +~~~~~~~~~~~~~~~~~~~~~~ .. automodule:: snps.resources :members: @@ -29,18 +37,9 @@ snps\.resources module :show-inheritance: snps\.utils module ------------------- +~~~~~~~~~~~~~~~~~~ .. automodule:: snps.utils :members: :undoc-members: :show-inheritance: - - -Module contents ---------------- - -.. automodule:: snps - :members: - :undoc-members: - :show-inheritance: diff --git a/setup.cfg b/setup.cfg index c6fee1a8..fb025f86 100644 --- a/setup.cfg +++ b/setup.cfg @@ -21,16 +21,8 @@ addopts = --tb=short # http://coverage.readthedocs.io/en/latest/ -[coverage:paths] -source = - src - */site-packages - [coverage:run] branch = true -source = - src - tests omit = */snps/_version.py [coverage:report] diff --git a/setup.py b/setup.py index f12864c6..1bd247b7 100644 --- a/setup.py +++ b/setup.py @@ -118,7 +118,7 @@ "Issue Tracker": "https://github.com/apriha/snps/issues", }, keywords="snps dna chromosomes bioinformatics", - install_requires=["numpy", "pandas", "atomicwrites", "PyVCF"], + install_requires=["numpy", "pandas", "atomicwrites"], python_requires=">=3.5", platforms=["any"], ) diff --git a/src/snps/__init__.py b/src/snps/__init__.py index f14e8bf4..dcfe84e4 100644 --- a/src/snps/__init__.py +++ b/src/snps/__init__.py @@ -1,4 +1,4 @@ -""" snps +""" `snps` tools for reading, writing, merging, and remapping SNPs @@ -53,6 +53,10 @@ # set version string with Versioneer from snps._version import get_versions +import logging + +logger = logging.getLogger(__name__) + __version__ = get_versions()["version"] del get_versions @@ -67,6 +71,7 @@ def __init__( resources_dir="resources", parallelize=False, processes=os.cpu_count(), + rsids=(), ): """ Object used to read and parse genotype / raw data files. @@ -86,6 +91,8 @@ def __init__( utilize multiprocessing to speedup calculations processes : int processes to launch if multiprocessing + rsids : tuple, optional + rsids to extract if loading a VCF file """ self._file = file self._only_detect_source = only_detect_source @@ -99,7 +106,9 @@ def __init__( if file: - self._snps, self._source = self._read_raw_data(file, only_detect_source) + self._snps, self._source = self._read_raw_data( + file, only_detect_source, rsids + ) if not self._snps.empty: self.sort_snps() @@ -115,7 +124,7 @@ def __init__( self._assign_par_snps() def __repr__(self): - return "SNPs({!r})".format(self._file) + return "SNPs({!r})".format(self._file[0:50]) @property def source(self): @@ -210,6 +219,19 @@ def sex(self): """ return self.determine_sex() + @property + def unannotated_vcf(self): + """ Indicates if VCF file is unannotated. + + Returns + ------- + bool + """ + if self.snp_count == 0 and self.source == "vcf": + return True + + return False + def get_summary(self): """ Get summary of ``SNPs``. @@ -269,23 +291,23 @@ def save_snps(self, filename="", vcf=False, atomic=True, **kwargs): snps=self, filename=filename, vcf=vcf, atomic=atomic, **kwargs ) - def _read_raw_data(self, file, only_detect_source): - return Reader.read_file(file, only_detect_source, self._resources) + def _read_raw_data(self, file, only_detect_source, rsids): + return Reader.read_file(file, only_detect_source, self._resources, rsids) def _assign_par_snps(self): """ Assign PAR SNPs to the X or Y chromosome using SNP position. References ----- - .. [1] National Center for Biotechnology Information, Variation Services, RefSNP, + 1. National Center for Biotechnology Information, Variation Services, RefSNP, https://api.ncbi.nlm.nih.gov/variation/v0/ - .. [2] Yates et. al. (doi:10.1093/bioinformatics/btu613), + 2. Yates et. al. (doi:10.1093/bioinformatics/btu613), ``_ - .. [3] Zerbino et. al. (doi.org/10.1093/nar/gkx1098), https://doi.org/10.1093/nar/gkx1098 - .. [4] Sherry ST, Ward MH, Kholodov M, Baker J, Phan L, Smigielski EM, Sirotkin K. + 3. Zerbino et. al. (doi.org/10.1093/nar/gkx1098), https://doi.org/10.1093/nar/gkx1098 + 4. Sherry ST, Ward MH, Kholodov M, Baker J, Phan L, Smigielski EM, Sirotkin K. dbSNP: the NCBI database of genetic variation. Nucleic Acids Res. 2001 Jan 1; 29(1):308-11. - .. [5] Database of Single Nucleotide Polymorphisms (dbSNP). Bethesda (MD): National Center + 5. Database of Single Nucleotide Polymorphisms (dbSNP). Bethesda (MD): National Center for Biotechnology Information, National Library of Medicine. dbSNP accession: rs28736870, rs113313554, and rs758419898 (dbSNP Build ID: 151). Available from: http://www.ncbi.nlm.nih.gov/SNP/ @@ -319,7 +341,7 @@ def _assign_par_snps(self): break except Exception as err: - print(err) + logger.warning(err) def _assign_snp(self, rsid, alleles, chrom): # only assign SNP if positions match (i.e., same build) @@ -359,13 +381,13 @@ def detect_build(self): References ---------- - .. [1] Yates et. al. (doi:10.1093/bioinformatics/btu613), + 1. Yates et. al. (doi:10.1093/bioinformatics/btu613), ``_ - .. [2] Zerbino et. al. (doi.org/10.1093/nar/gkx1098), https://doi.org/10.1093/nar/gkx1098 - .. [3] Sherry ST, Ward MH, Kholodov M, Baker J, Phan L, Smigielski EM, Sirotkin K. + 2. Zerbino et. al. (doi.org/10.1093/nar/gkx1098), https://doi.org/10.1093/nar/gkx1098 + 3. Sherry ST, Ward MH, Kholodov M, Baker J, Phan L, Smigielski EM, Sirotkin K. dbSNP: the NCBI database of genetic variation. Nucleic Acids Res. 2001 Jan 1;29(1):308-11. - .. [4] Database of Single Nucleotide Polymorphisms (dbSNP). Bethesda (MD): National Center + 4. Database of Single Nucleotide Polymorphisms (dbSNP). Bethesda (MD): National Center for Biotechnology Information, National Library of Medicine. dbSNP accession: rs3094315, rs11928389, rs2500347, rs964481, and rs2341354 (dbSNP Build ID: 151). Available from: http://www.ncbi.nlm.nih.gov/SNP/ @@ -598,7 +620,7 @@ def remap_snps(self, target_assembly, complement_bases=True): References ---------- - .. [1] Ensembl, Assembly Map Endpoint, + 1. Ensembl, Assembly Map Endpoint, http://rest.ensembl.org/documentation/info/assembly_map """ chromosomes_remapped = [] @@ -607,7 +629,7 @@ def remap_snps(self, target_assembly, complement_bases=True): snps = self.snps if snps.empty: - print("No SNPs to remap") + logger.debug("No SNPs to remap") return chromosomes_remapped, chromosomes_not_remapped else: chromosomes = snps["chrom"].unique() @@ -616,7 +638,7 @@ def remap_snps(self, target_assembly, complement_bases=True): valid_assemblies = ["NCBI36", "GRCh37", "GRCh38", 36, 37, 38] if target_assembly not in valid_assemblies: - print("Invalid target assembly") + logger.debug("Invalid target assembly") return chromosomes_remapped, chromosomes_not_remapped if isinstance(target_assembly, int): @@ -655,7 +677,7 @@ def remap_snps(self, target_assembly, complement_bases=True): } ) else: - print( + logger.debug( "Chromosome {} not remapped; " "removing chromosome from SNPs for consistency".format(chrom) ) @@ -712,11 +734,13 @@ def _remapper(self, task): mapped_region = mapping["mapped"]["seq_region_name"] if orig_region != mapped_region: - print("discrepant chroms") + logger.debug("discrepant chroms") continue if orig_range_len != mapped_range_len: - print("discrepant coords") # observed when mapping NCBI36 -> GRCh38 + logger.debug( + "discrepant coords" + ) # observed when mapping NCBI36 -> GRCh38 continue # find the SNPs that are being remapped for this mapping @@ -898,7 +922,7 @@ def _load_snps_helper( discrepant_genotypes_threshold, save_output, ): - print("Loading " + os.path.relpath(file)) + logger.debug("Loading " + os.path.relpath(file)) discrepant_positions, discrepant_genotypes = self._add_snps( SNPs(file), discrepant_snp_positions_threshold, @@ -1044,12 +1068,12 @@ def _add_snps( source = [s.strip() for s in snps._source.split(",")] if not snps._build_detected: - print("build not detected, assuming build {}".format(snps._build)) + logger.debug("build not detected, assuming build {}".format(snps._build)) if not self._build: self._build = build elif self._build != build: - print( + logger.debug( "build / assembly mismatch between current build of SNPs and SNPs being loaded" ) @@ -1073,7 +1097,7 @@ def _add_snps( prefix = "{}_".format(clean_str(self._name)) if 0 < len(discrepant_positions) < discrepant_snp_positions_threshold: - print( + logger.debug( "{} SNP positions were discrepant; keeping original positions".format( str(len(discrepant_positions)) ) @@ -1089,7 +1113,7 @@ def _add_snps( ), ) elif len(discrepant_positions) >= discrepant_snp_positions_threshold: - print( + logger.debug( "too many SNPs differ in position; ensure same genome build is being used" ) return discrepant_positions, discrepant_genotypes @@ -1138,7 +1162,7 @@ def _add_snps( ] if 0 < len(discrepant_genotypes) < discrepant_genotypes_threshold: - print( + logger.debug( "{} SNP genotypes were discrepant; marking those as null".format( str(len(discrepant_genotypes)) ) @@ -1154,7 +1178,7 @@ def _add_snps( ), ) elif len(discrepant_genotypes) >= discrepant_genotypes_threshold: - print( + logger.debug( "too many SNPs differ in their genotype; ensure file is for same " "individual" ) diff --git a/src/snps/ensembl.py b/src/snps/ensembl.py index 15e6b763..03bbdb67 100644 --- a/src/snps/ensembl.py +++ b/src/snps/ensembl.py @@ -6,9 +6,9 @@ References ---------- -.. [1] Yates et. al. (doi:10.1093/bioinformatics/btu613), +1. Yates et. al. (doi:10.1093/bioinformatics/btu613), ``_ -.. [2] Zerbino et. al. (doi.org/10.1093/nar/gkx1098), https://doi.org/10.1093/nar/gkx1098 +2. Zerbino et. al. (doi.org/10.1093/nar/gkx1098), https://doi.org/10.1093/nar/gkx1098 """ diff --git a/src/snps/io.py b/src/snps/io.py index 367ee4df..22a35277 100644 --- a/src/snps/io.py +++ b/src/snps/io.py @@ -1,3 +1,7 @@ +""" Classes for reading and writing SNPs. + +""" + """ BSD 3-Clause License @@ -41,16 +45,19 @@ import numpy as np import pandas as pd -import vcf import snps from snps.utils import save_df_as_csv, clean_str +import logging + +logger = logging.getLogger(__name__) + class Reader: """ Class for reading and parsing raw data / genotype files. """ - def __init__(self, file="", only_detect_source=False, resources=None): + def __init__(self, file="", only_detect_source=False, resources=None, rsids=()): """ Initialize a `Reader`. Parameters @@ -61,10 +68,14 @@ def __init__(self, file="", only_detect_source=False, resources=None): only detect the source of the data resources : Resources instance of Resources + rsids : tuple, optional + rsids to extract if loading a VCF file + """ self._file = file self._only_detect_source = only_detect_source self._resources = resources + self._rsids = rsids def __call__(self): """ Read and parse a raw data / genotype file. @@ -147,7 +158,7 @@ def __call__(self): elif first_line.startswith("rsid"): return self.read_generic_csv(file) elif "vcf" in comments.lower(): - return self.read_vcf(file) + return self.read_vcf(file, self._rsids) elif ("Genes for Good" in comments) | ("PLINK" in comments): return self.read_genes_for_good(file) elif "CODIGO46" in comments: @@ -155,11 +166,11 @@ def __call__(self): else: return pd.DataFrame(), "" except Exception as err: - print(err) + logger.warning(err) return pd.DataFrame(), "" @classmethod - def read_file(cls, file, only_detect_source, resources): + def read_file(cls, file, only_detect_source, resources, rsids): """ Read `file`. Parameters @@ -170,13 +181,15 @@ def read_file(cls, file, only_detect_source, resources): only detect the source of the data resources : Resources instance of Resources + rsids : tuple + rsids to extract if loading a VCF file Returns ------- tuple : (pandas.DataFrame, str) dataframe of parsed SNPs, detected source of SNPs """ - r = cls(file, only_detect_source, resources) + r = cls(file, only_detect_source, resources, rsids) return r() def _extract_comments(self, f, decode): @@ -287,7 +300,6 @@ def read_ftdna(self, file): ) # remove incongruous data - df = df.drop(df.loc[df["chrom"] == "0"].index) df = df.drop( df.loc[df.index == "RSID"].index ) # second header for concatenated data @@ -651,19 +663,28 @@ def read_generic_csv(self, file): return df, "generic" - def read_vcf(self, file): + def read_vcf(self, file, rsids=()): """ Read and parse VCF file. Notes ----- - This function uses the PyVCF python module to parse the genotypes from VCF files: - https://pyvcf.readthedocs.io/en/latest/index.html + This method attempts to read and parse a VCF file or buffer, optionally + compressed with gzip. Some assumptions are made throughout this process: + * SNPs that are not annotated with an RSID are skipped + * If the VCF contains multiple samples, only the first sample is used to + lookup the genotype + * Insertions and deletions are skipped + * If a sample allele is not specified, the genotype is reported as NaN + * If a sample allele refers to a REF or ALT allele that is not specified, + the genotype is reported as NaN Parameters ---------- - file : str - path to file + file : str or bytes + path to file or bytes to load + rsids : tuple, optional + rsids to extract if loading a VCF file Returns ------- @@ -676,51 +697,83 @@ def read_vcf(self, file): if self._only_detect_source: return pd.DataFrame(), "vcf" - df = pd.DataFrame(columns=["rsid", "chrom", "pos", "genotype"]) - df = df.astype( - {"rsid": object, "chrom": object, "pos": np.int64, "genotype": object} - ) + if not isinstance(file, io.BytesIO): + with open(file, "rb") as f: + return self._parse_vcf(f, rsids) + else: + return self._parse_vcf(file, rsids) - with open(file, "r") as f: - vcf_reader = vcf.Reader(f) + def _parse_vcf(self, buffer, rsids): + rows = [] + first_four_bytes = buffer.read(4) + buffer.seek(0) - # snps does not yet support multi-sample vcf. - if len(vcf_reader.samples) > 1: - print( - "Multiple samples detected in the vcf file, please use a single sample vcf." - ) - return df, "vcf" + if self.is_gzip(first_four_bytes): + f = gzip.open(buffer) + else: + f = buffer - for i, record in enumerate(vcf_reader): - # assign null genotypes if either allele is None - # Could capture full genotype, if REF is None, but genotype is 1/1 or - # if ALT is None, but genotype is 0/0 - if record.REF is None or record.ALT[0] is None: - genotype = np.nan + with io.TextIOWrapper(io.BufferedReader(f)) as file: + + for line in file: + + line_strip = line.strip("\n") + if line_strip.startswith("#"): + continue + rsid = line_strip.split("\t")[2] # skip SNPs with missing rsIDs. - elif record.ID is None: + if rsid == ".": continue + if rsids: + if rsid not in rsids: + continue + + line_split = line_strip.split("\t") + + # snps does not yet support multi-sample vcf. + if len(line_split) > 10: + logger.debug("Multiple samples detected in the vcf file") + + ref = line_split[3] + alt = line_split[4] + zygote = line_split[9] + zygote = zygote.split(":")[0] + + ref_alt = [ref] + alt.split(",") + # skip insertions and deletions - elif len(record.REF) > 1 or len(record.ALT[0]) > 1: + if sum(map(len, ref_alt)) > len(ref_alt): continue + + zygote1, zygote2 = zygote.replace("|", " ").replace("/", " ").split(" ") + if zygote1 == "." or zygote2 == ".": + # assign null genotypes if either allele is None + genotype = np.nan + elif (zygote1 == "0" or zygote2 == "0") and ref == ".": + # sample allele specifies REF allele, which is None + genotype = np.nan + elif (zygote1 == "1" or zygote2 == "1") and alt == ".": + # sample allele specifies ALT allele, which is None + genotype = np.nan else: - alleles = record.genotype(vcf_reader.samples[0]).gt_bases - a1 = alleles[0] - a2 = alleles[-1] - genotype = "{}{}".format(a1, a2) - - record_info = { - "rsid": record.ID, - "chrom": "{}".format(record.CHROM).strip("chr"), - "pos": record.POS, - "genotype": genotype, - } - # append the record to the DataFrame - df = df.append( - pd.DataFrame([record_info]), ignore_index=True, sort=False - ) + # Could capture full genotype, if REF is None, but genotype is 1/1 or + # if ALT is None, but genotype is 0/0 + genotype = ref_alt[int(zygote1)] + ref_alt[int(zygote2)] + + record_array = [ + rsid, + "{}".format(line_split[0]).strip("chr"), + line_split[1], + genotype, + ] + rows.append(record_array) + + df = pd.DataFrame(rows, columns=["rsid", "chrom", "pos", "genotype"]) + df = df.astype( + {"rsid": object, "chrom": object, "pos": np.int64, "genotype": object} + ) - df.set_index("rsid", inplace=True, drop=True) + df.set_index("rsid", inplace=True, drop=True) return df, "vcf" @@ -827,7 +880,7 @@ def _write_vcf(self): References ---------- - .. [1] The Variant Call Format (VCF) Version 4.2 Specification, 8 Mar 2019, + 1. The Variant Call Format (VCF) Version 4.2 Specification, 8 Mar 2019, https://samtools.github.io/hts-specs/VCFv4.2.pdf Returns diff --git a/src/snps/resources.py b/src/snps/resources.py index e50c5e58..dbdc7ad2 100644 --- a/src/snps/resources.py +++ b/src/snps/resources.py @@ -2,14 +2,14 @@ References ---------- -.. [1] International Human Genome Sequencing Consortium. Initial sequencing and +1. International Human Genome Sequencing Consortium. Initial sequencing and analysis of the human genome. Nature. 2001 Feb 15;409(6822):860-921. http://dx.doi.org/10.1038/35057062 -.. [2] hg19 (GRCh37): Hiram Clawson, Brooke Rhead, Pauline Fujita, Ann Zweig, Katrina +2. hg19 (GRCh37): Hiram Clawson, Brooke Rhead, Pauline Fujita, Ann Zweig, Katrina Learned, Donna Karolchik and Robert Kuhn, https://genome.ucsc.edu/cgi-bin/hgGateway?db=hg19 -.. [3] Yates et. al. (doi:10.1093/bioinformatics/btu613), +3. Yates et. al. (doi:10.1093/bioinformatics/btu613), ``_ -.. [4] Zerbino et. al. (doi.org/10.1093/nar/gkx1098), https://doi.org/10.1093/nar/gkx1098 +4. Zerbino et. al. (doi.org/10.1093/nar/gkx1098), https://doi.org/10.1093/nar/gkx1098 """ @@ -62,6 +62,10 @@ from snps.ensembl import EnsemblRestClient from snps.utils import create_dir, Singleton +import logging + +logger = logging.getLogger(__name__) + class Resources(metaclass=Singleton): """ Object used to manage resources required by `snps`. """ @@ -131,7 +135,7 @@ def get_reference_sequences( valid_assemblies = ["NCBI36", "GRCh37", "GRCh38"] if assembly not in valid_assemblies: - print("Invalid assembly") + logger.debug("Invalid assembly") return {} if not self._reference_chroms_available(assembly, chroms): @@ -182,7 +186,7 @@ def download_example_datasets(self): References ---------- - .. [1] Greshake B, Bayer PE, Rausch H, Reda J (2014), "openSNP-A Crowdsourced Web Resource + 1. Greshake B, Bayer PE, Rausch H, Reda J (2014), "openSNP-A Crowdsourced Web Resource for Personal Genomics," PLOS ONE, 9(3): e89204, https://doi.org/10.1371/journal.pone.0089204 """ @@ -297,7 +301,7 @@ def _load_assembly_mapping_data(filename): return assembly_mapping_data except Exception as err: - print(err) + logger.warning(err) return {} def _get_paths_reference_sequences( @@ -331,7 +335,7 @@ def _get_paths_reference_sequences( References ---------- - .. [1] Daniel R. Zerbino, Premanand Achuthan, Wasiu Akanni, M. Ridwan Amode, + 1. Daniel R. Zerbino, Premanand Achuthan, Wasiu Akanni, M. Ridwan Amode, Daniel Barrell, Jyothish Bhai, Konstantinos Billis, Carla Cummins, Astrid Gall, Carlos García Giro´n, Laurent Gil, Leo Gordon, Leanne Haggerty, Erin Haskell, Thibaut Hourlier, Osagie G. Izuogu, Sophie H. Janacek, Thomas Juettemann, @@ -346,11 +350,11 @@ def _get_paths_reference_sequences( Ensembl 2018. PubMed PMID: 29155950. doi:10.1093/nar/gkx1098 - .. [2] NCBI 36, Oct 2005, Ensembl release 54, Database version: 54.36p - .. [3] GRCh37.p13 (Genome Reference Consortium Human Reference 37), + 2. NCBI 36, Oct 2005, Ensembl release 54, Database version: 54.36p + 3. GRCh37.p13 (Genome Reference Consortium Human Reference 37), INSDC Assembly GCA_000001405.14, Feb 2009, Ensembl GRCh37 release 96, Database version: 96.37 - .. [4] GRCh38.p12 (Genome Reference Consortium Human Build 38), + 4. GRCh38.p12 (Genome Reference Consortium Human Build 38), INSDC Assembly GCA_000001405.27, Dec 2013, Ensembl release 96, Database version: 96.38 """ @@ -426,9 +430,9 @@ def _get_path_assembly_mapping_data( References ---------- - .. [1] Ensembl, Assembly Information Endpoint, + 1. Ensembl, Assembly Information Endpoint, https://rest.ensembl.org/documentation/info/assembly_info - .. [2] Ensembl, Assembly Map Endpoint, + 2. Ensembl, Assembly Map Endpoint, http://rest.ensembl.org/documentation/info/assembly_map """ @@ -472,14 +476,14 @@ def _get_path_assembly_mapping_data( if not os.path.exists(destination) or not self._all_chroms_in_tar( chroms, destination ): - print("Downloading {}".format(os.path.relpath(destination))) + logger.debug("Downloading {}".format(os.path.relpath(destination))) try: self._download_assembly_mapping_data( destination, chroms, source_assembly, target_assembly, retries ) except Exception as err: - print(err) + logger.warning(err) return "" return destination @@ -527,7 +531,7 @@ def _all_chroms_in_tar(self, chroms, filename): if chrom + ".json" not in members: return False except Exception as err: - print(err) + logger.warning(err) return False return True @@ -554,7 +558,7 @@ def _load_codigo46_resources(self, rsid_map, chrpos_map): return d except Exception as err: - print(err) + logger.warning(err) return {} def _get_path_codigo46_rsid_map(self): @@ -613,7 +617,7 @@ def _download_file(self, url, filename, compress=False, timeout=30): else: f.write(data) except urllib.error.URLError as err: - print(err) + logger.warning(err) destination = "" # try HTTP if an FTP error occurred if "ftp://" in url: @@ -624,7 +628,7 @@ def _download_file(self, url, filename, compress=False, timeout=30): timeout=timeout, ) except Exception as err: - print(err) + logger.warning(err) return "" return destination @@ -638,7 +642,7 @@ def _print_download_msg(path): path : str path to file being downloaded """ - print("Downloading " + os.path.relpath(path)) + logger.debug("Downloading " + os.path.relpath(path)) class ReferenceSequence: @@ -664,7 +668,7 @@ def __init__(self, ID="", url="", path="", assembly="", species="", taxonomy="") References ---------- - .. [1] The Variant Call Format (VCF) Version 4.2 Specification, 8 Mar 2019, + 1. The Variant Call Format (VCF) Version 4.2 Specification, 8 Mar 2019, https://samtools.github.io/hts-specs/VCFv4.2.pdf """ self._ID = ID diff --git a/src/snps/utils.py b/src/snps/utils.py index 502d418e..ac8dc0d9 100644 --- a/src/snps/utils.py +++ b/src/snps/utils.py @@ -1,3 +1,7 @@ +""" Utility classes and functions. + +""" + """ BSD 3-Clause License @@ -41,6 +45,9 @@ import pandas as pd import snps +import logging + +logger = logging.getLogger(__name__) class Parallelizer: @@ -106,7 +113,7 @@ def create_dir(path): try: os.makedirs(path, exist_ok=True) except Exception as err: - print(err) + logger.warning(err) return False if os.path.exists(path): @@ -156,7 +163,7 @@ def save_df_as_csv( destination = filename else: destination = os.path.join(path, filename) - print("Saving " + os.path.relpath(destination)) + logger.debug("Saving " + os.path.relpath(destination)) if prepend_info: s = ( @@ -190,10 +197,10 @@ def save_df_as_csv( return destination except Exception as err: - print(err) + logger.warning(err) return "" else: - print("no data to save...") + logger.debug("no data to save...") return "" diff --git a/tests/input/generic.fa b/tests/input/generic.fa index d7dc0167..1766fe91 100644 --- a/tests/input/generic.fa +++ b/tests/input/generic.fa @@ -1,3 +1,3 @@ ->generic test sequence:1:1:110 +>generic test sequence:1:1:117 NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN -NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAGGCCGGACN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAGGCCGGACNNNNNNNN diff --git a/tests/input/testvcf.vcf b/tests/input/testvcf.vcf index b05e1ed4..d072e89f 100644 --- a/tests/input/testvcf.vcf +++ b/tests/input/testvcf.vcf @@ -4,12 +4,20 @@ ## ##FORMAT= #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT SAMPLEID -1 101 rs1 A G . . . GT 0/0 -1 102 rs2 G C . . . GT 1/1 -1 103 rs3 G T . . . GT 0/0 +1 101 rs1 A . . . . GT 0/0 +1 102 rs2 . C . . . GT 1/1 +1 103 rs3 G T . . . GT 0|0 1 104 rs4 C T . . . GT 1/1 -1 105 rs5 C . . . . GT ./. +1 105 rs5 C . . . . GT ./. 1 106 rs6 G C . . . GT 0/1 1 107 rs7 G T,C . . . GT 1/2 -1 108 rs8 A T . . . GT 0|1 -1 109 . C T . . . GT 0/1 +1 108 rs8 A T . . . GT 0/1 +1 109 . C T . . . GT 0/1 +1 110 rs10 A AGC . . . GT 0/1 +1 111 rs11 AGC A . . . GT 0/1 +1 112 rs12 . A . . . GT 0/1 +1 113 rs13 . A . . . GT 1/0 +1 114 rs14 A . . . . GT 0/1 +1 115 rs15 A . . . . GT 1/0 +1 116 rs16 A A . . . GT 0/. +1 117 rs17 A A . . . GT ./0 diff --git a/tests/input/unannotated_testvcf.vcf b/tests/input/unannotated_testvcf.vcf new file mode 100644 index 00000000..73fcce47 --- /dev/null +++ b/tests/input/unannotated_testvcf.vcf @@ -0,0 +1,15 @@ +##fileformat=VCFv4.1 +##fileDate=20190527 +## +## +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT SAMPLEID +1 101 . A G . . . GT 0/0 +1 102 . G C . . . GT 1/1 +1 103 . G T . . . GT 0/0 +1 104 . C T . . . GT 1/1 +1 105 . C . . . . GT ./. +1 106 . G C . . . GT 0/1 +1 107 . G T,C . . . GT 1/2 +1 108 . A T . . . GT 0/1 +1 109 . C T . . . GT 0/1 diff --git a/tests/test_resources.py b/tests/test_resources.py index 7427a7d1..49841965 100644 --- a/tests/test_resources.py +++ b/tests/test_resources.py @@ -291,7 +291,7 @@ def test_reference_sequence_generic_load_sequence(self): seq.sequence, np.array( bytearray( - "NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAGGCCGGACN", + "NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAGGCCGGACNNNNNNNN", encoding="utf-8", errors="strict", ), @@ -299,7 +299,7 @@ def test_reference_sequence_generic_load_sequence(self): ), ) assert list("AGGCCGGAC") == list(map(chr, seq.sequence[100:109])) - assert seq.md5 == "dc86fbda2f6febd77622407beae66b9a" + assert seq.md5 == "6ac6176535ad0e38aba2d05d786c39b6" assert seq.start == 1 - assert seq.end == 110 - assert seq.length == 110 + assert seq.end == 117 + assert seq.length == 117 diff --git a/tests/test_snps_collection.py b/tests/test_snps_collection.py index 3fc2ce60..9b25fe8a 100644 --- a/tests/test_snps_collection.py +++ b/tests/test_snps_collection.py @@ -46,15 +46,26 @@ from tests import BaseSNPsTestCase -class TestIndividual(BaseSNPsTestCase): +class TestSNPsCollection(BaseSNPsTestCase): def generic_snps(self): return self.create_snp_df( - rsid=["rs1", "rs2", "rs3", "rs4", "rs5", "rs6", "rs7", "rs8"], - chrom=["1", "1", "1", "1", "1", "1", "1", "1"], - pos=[101, 102, 103, 104, 105, 106, 107, 108], + rsid=["rs" + str(i) for i in range(1, 9)], + chrom=["1"] * 8, + pos=list(range(101, 109)), genotype=["AA", "CC", "GG", "TT", np.nan, "GC", "TC", "AT"], ) + def generic_snps_vcf(self): + df = self.generic_snps() + return df.append( + self.create_snp_df( + rsid=["rs" + str(i) for i in range(12, 18)], + chrom=["1"] * 6, + pos=list(range(112, 118)), + genotype=[np.nan] * 6, + ) + ) + def snps_NCBI36(self): return self.create_snp_df( rsid=["rs3094315", "rs2500347", "rsIndelTest", "rs11928389"], @@ -202,7 +213,113 @@ def test_snps_vcf(self): # phased snps, and snps with missing rsID s = SNPs("tests/input/testvcf.vcf") assert s.source == "vcf" - pd.testing.assert_frame_equal(s.snps, self.generic_snps()) + assert not s.unannotated_vcf + pd.testing.assert_frame_equal(s.snps, self.generic_snps_vcf()) + + def test_snps_vcf_rsids(self): + # https://samtools.github.io/hts-specs/VCFv4.2.pdf + # this tests for homozygous snps, heterozygous snps, multiallelic snps, + # phased snps, and snps with missing rsID + rsids = ["rs1", "rs2"] + s = SNPs("tests/input/testvcf.vcf", rsids=rsids) + assert s.source == "vcf" + assert not s.unannotated_vcf + pd.testing.assert_frame_equal(s.snps, self.generic_snps_vcf().loc[rsids]) + + def test_snps_vcf_gz(self): + # https://samtools.github.io/hts-specs/VCFv4.2.pdf + # this tests for homozygous snps, heterozygous snps, multiallelic snps, + # phased snps, and snps with missing rsID + with open("tests/input/testvcf.vcf", "rb") as f_in: + with atomic_write( + "tests/input/testvcf.vcf.gz", mode="wb", overwrite=True + ) as f_out: + with gzip.open(f_out, "wb") as f_gzip: + shutil.copyfileobj(f_in, f_gzip) + + s = SNPs("tests/input/testvcf.vcf.gz") + assert s.source == "vcf" + pd.testing.assert_frame_equal(s.snps, self.generic_snps_vcf()) + + def test_snps_vcf_gz_rsids(self): + # https://samtools.github.io/hts-specs/VCFv4.2.pdf + # this tests for homozygous snps, heterozygous snps, multiallelic snps, + # phased snps, and snps with missing rsID + with open("tests/input/testvcf.vcf", "rb") as f_in: + with atomic_write( + "tests/input/testvcf.vcf.gz", mode="wb", overwrite=True + ) as f_out: + with gzip.open(f_out, "wb") as f_gzip: + shutil.copyfileobj(f_in, f_gzip) + + rsids = ["rs1", "rs2"] + s = SNPs("tests/input/testvcf.vcf.gz", rsids=rsids) + assert s.source == "vcf" + pd.testing.assert_frame_equal(s.snps, self.generic_snps_vcf().loc[rsids]) + + def test_snps_unannotated_vcf(self): + # https://samtools.github.io/hts-specs/VCFv4.2.pdf + # this tests for homozygous snps, heterozygous snps, multiallelic snps, + # phased snps, and snps with missing rsID + s = SNPs("tests/input/unannotated_testvcf.vcf") + assert s.source == "vcf" + assert s.unannotated_vcf + + def test_snps_vcf_buffer(self): + with open("tests/input/testvcf.vcf", "r") as f: + snps_vcf_buffer = SNPs(f.read().encode("utf-8")) + # https://samtools.github.io/hts-specs/VCFv4.2.pdf + # this tests for homozygous snps, heterozygous snps, multiallelic snps, + # phased snps, and snps with missing rsID + assert snps_vcf_buffer.source == "vcf" + pd.testing.assert_frame_equal(snps_vcf_buffer.snps, self.generic_snps_vcf()) + + def test_snps_vcf_buffer_rsids(self): + with open("tests/input/testvcf.vcf", "r") as f: + rsids = ["rs1", "rs2"] + df = SNPs(f.read().encode("utf-8"), rsids=rsids) + # https://samtools.github.io/hts-specs/VCFv4.2.pdf + # this tests for homozygous snps, heterozygous snps, multiallelic snps, + # phased snps, and snps with missing rsID + assert df.source == "vcf" + pd.testing.assert_frame_equal(df.snps, self.generic_snps_vcf().loc[rsids]) + + def test_snps_vcf_buffer_gz(self): + with open("tests/input/testvcf.vcf", "rb") as f_in: + with atomic_write( + "tests/input/testvcf.vcf.gz", mode="wb", overwrite=True + ) as f_out: + with gzip.open(f_out, "wb") as f_gzip: + shutil.copyfileobj(f_in, f_gzip) + + with open("tests/input/testvcf.vcf.gz", "rb") as f: + data = f.read() + s = SNPs(data) + os.remove("tests/input/testvcf.vcf.gz") + # https://samtools.github.io/hts-specs/VCFv4.2.pdf + # this tests for homozygous snps, heterozygous snps, multiallelic snps, + # phased snps, and snps with missing rsID + assert s.source == "vcf" + pd.testing.assert_frame_equal(s.snps, self.generic_snps_vcf()) + + def test_snps_vcf_buffer_gz_rsids(self): + with open("tests/input/testvcf.vcf", "rb") as f_in: + with atomic_write( + "tests/input/testvcf.vcf.gz", mode="wb", overwrite=True + ) as f_out: + with gzip.open(f_out, "wb") as f_gzip: + shutil.copyfileobj(f_in, f_gzip) + + with open("tests/input/testvcf.vcf.gz", "rb") as f: + rsids = ["rs1", "rs2"] + data = f.read() + s = SNPs(data, rsids=rsids) + os.remove("tests/input/testvcf.vcf.gz") + # https://samtools.github.io/hts-specs/VCFv4.2.pdf + # this tests for homozygous snps, heterozygous snps, multiallelic snps, + # phased snps, and snps with missing rsID + assert s.source == "vcf" + pd.testing.assert_frame_equal(s.snps, self.generic_snps_vcf().loc[rsids]) def test_source_lineage_file(self): sc = SNPsCollection("tests/input/GRCh37.csv") @@ -475,7 +592,7 @@ def test_save_snps_vcf(self): assert os.path.relpath(s.save_snps(vcf=True)) == "output/vcf_GRCh37.vcf" s = SNPs("output/vcf_GRCh37.vcf") - pd.testing.assert_frame_equal(s.snps, self.generic_snps()) + pd.testing.assert_frame_equal(s.snps, self.generic_snps_vcf()) def test_save_snps_specify_file(self): s = SNPs("tests/input/GRCh37.csv")