diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..94a9ed0 --- /dev/null +++ b/LICENSE @@ -0,0 +1,674 @@ + GNU GENERAL PUBLIC LICENSE + Version 3, 29 June 2007 + + Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/> + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The GNU General Public License is a free, copyleft license for +software and other kinds of works. + + The licenses for most software and other practical works are designed +to take away your freedom to share and change the works. By contrast, +the GNU General Public License is intended to guarantee your freedom to +share and change all versions of a program--to make sure it remains free +software for all its users. We, the Free Software Foundation, use the +GNU General Public License for most of our software; it applies also to +any other work released this way by its authors. You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +them if you wish), that you receive source code or can get it if you +want it, that you can change the software or use pieces of it in new +free programs, and that you know you can do these things. + + To protect your rights, we need to prevent others from denying you +these rights or asking you to surrender the rights. Therefore, you have +certain responsibilities if you distribute copies of the software, or if +you modify it: responsibilities to respect the freedom of others. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must pass on to the recipients the same +freedoms that you received. You must make sure that they, too, receive +or can get the source code. And you must show them these terms so they +know their rights. 
+ + Developers that use the GNU GPL protect your rights with two steps: +(1) assert copyright on the software, and (2) offer you this License +giving you legal permission to copy, distribute and/or modify it. + + For the developers' and authors' protection, the GPL clearly explains +that there is no warranty for this free software. For both users' and +authors' sake, the GPL requires that modified versions be marked as +changed, so that their problems will not be attributed erroneously to +authors of previous versions. + + Some devices are designed to deny users access to install or run +modified versions of the software inside them, although the manufacturer +can do so. This is fundamentally incompatible with the aim of +protecting users' freedom to change the software. The systematic +pattern of such abuse occurs in the area of products for individuals to +use, which is precisely where it is most unacceptable. Therefore, we +have designed this version of the GPL to prohibit the practice for those +products. If such problems arise substantially in other domains, we +stand ready to extend this provision to those domains in future versions +of the GPL, as needed to protect the freedom of users. + + Finally, every program is threatened constantly by software patents. +States should not allow patents to restrict development and use of +software on general-purpose computers, but in those that do, we wish to +avoid the special danger that patents applied to a free program could +make it effectively proprietary. To prevent this, the GPL assures that +patents cannot be used to render the program non-free. + + The precise terms and conditions for copying, distribution and +modification follow. + + TERMS AND CONDITIONS + + 0. Definitions. + + "This License" refers to version 3 of the GNU General Public License. + + "Copyright" also means copyright-like laws that apply to other kinds of +works, such as semiconductor masks. 
+ + "The Program" refers to any copyrightable work licensed under this +License. Each licensee is addressed as "you". "Licensees" and +"recipients" may be individuals or organizations. + + To "modify" a work means to copy from or adapt all or part of the work +in a fashion requiring copyright permission, other than the making of an +exact copy. The resulting work is called a "modified version" of the +earlier work or a work "based on" the earlier work. + + A "covered work" means either the unmodified Program or a work based +on the Program. + + To "propagate" a work means to do anything with it that, without +permission, would make you directly or secondarily liable for +infringement under applicable copyright law, except executing it on a +computer or modifying a private copy. Propagation includes copying, +distribution (with or without modification), making available to the +public, and in some countries other activities as well. + + To "convey" a work means any kind of propagation that enables other +parties to make or receive copies. Mere interaction with a user through +a computer network, with no transfer of a copy, is not conveying. + + An interactive user interface displays "Appropriate Legal Notices" +to the extent that it includes a convenient and prominently visible +feature that (1) displays an appropriate copyright notice, and (2) +tells the user that there is no warranty for the work (except to the +extent that warranties are provided), that licensees may convey the +work under this License, and how to view a copy of this License. If +the interface presents a list of user commands or options, such as a +menu, a prominent item in the list meets this criterion. + + 1. Source Code. + + The "source code" for a work means the preferred form of the work +for making modifications to it. "Object code" means any non-source +form of a work. 
+ + A "Standard Interface" means an interface that either is an official +standard defined by a recognized standards body, or, in the case of +interfaces specified for a particular programming language, one that +is widely used among developers working in that language. + + The "System Libraries" of an executable work include anything, other +than the work as a whole, that (a) is included in the normal form of +packaging a Major Component, but which is not part of that Major +Component, and (b) serves only to enable use of the work with that +Major Component, or to implement a Standard Interface for which an +implementation is available to the public in source code form. A +"Major Component", in this context, means a major essential component +(kernel, window system, and so on) of the specific operating system +(if any) on which the executable work runs, or a compiler used to +produce the work, or an object code interpreter used to run it. + + The "Corresponding Source" for a work in object code form means all +the source code needed to generate, install, and (for an executable +work) run the object code and to modify the work, including scripts to +control those activities. However, it does not include the work's +System Libraries, or general-purpose tools or generally available free +programs which are used unmodified in performing those activities but +which are not part of the work. For example, Corresponding Source +includes interface definition files associated with source files for +the work, and the source code for shared libraries and dynamically +linked subprograms that the work is specifically designed to require, +such as by intimate data communication or control flow between those +subprograms and other parts of the work. + + The Corresponding Source need not include anything that users +can regenerate automatically from other parts of the Corresponding +Source. + + The Corresponding Source for a work in source code form is that +same work. + + 2. 
Basic Permissions. + + All rights granted under this License are granted for the term of +copyright on the Program, and are irrevocable provided the stated +conditions are met. This License explicitly affirms your unlimited +permission to run the unmodified Program. The output from running a +covered work is covered by this License only if the output, given its +content, constitutes a covered work. This License acknowledges your +rights of fair use or other equivalent, as provided by copyright law. + + You may make, run and propagate covered works that you do not +convey, without conditions so long as your license otherwise remains +in force. You may convey covered works to others for the sole purpose +of having them make modifications exclusively for you, or provide you +with facilities for running those works, provided that you comply with +the terms of this License in conveying all material for which you do +not control copyright. Those thus making or running the covered works +for you must do so exclusively on your behalf, under your direction +and control, on terms that prohibit them from making any copies of +your copyrighted material outside their relationship with you. + + Conveying under any other circumstances is permitted solely under +the conditions stated below. Sublicensing is not allowed; section 10 +makes it unnecessary. + + 3. Protecting Users' Legal Rights From Anti-Circumvention Law. + + No covered work shall be deemed part of an effective technological +measure under any applicable law fulfilling obligations under article +11 of the WIPO copyright treaty adopted on 20 December 1996, or +similar laws prohibiting or restricting circumvention of such +measures. 
+ + When you convey a covered work, you waive any legal power to forbid +circumvention of technological measures to the extent such circumvention +is effected by exercising rights under this License with respect to +the covered work, and you disclaim any intention to limit operation or +modification of the work as a means of enforcing, against the work's +users, your or third parties' legal rights to forbid circumvention of +technological measures. + + 4. Conveying Verbatim Copies. + + You may convey verbatim copies of the Program's source code as you +receive it, in any medium, provided that you conspicuously and +appropriately publish on each copy an appropriate copyright notice; +keep intact all notices stating that this License and any +non-permissive terms added in accord with section 7 apply to the code; +keep intact all notices of the absence of any warranty; and give all +recipients a copy of this License along with the Program. + + You may charge any price or no price for each copy that you convey, +and you may offer support or warranty protection for a fee. + + 5. Conveying Modified Source Versions. + + You may convey a work based on the Program, or the modifications to +produce it from the Program, in the form of source code under the +terms of section 4, provided that you also meet all of these conditions: + + a) The work must carry prominent notices stating that you modified + it, and giving a relevant date. + + b) The work must carry prominent notices stating that it is + released under this License and any conditions added under section + 7. This requirement modifies the requirement in section 4 to + "keep intact all notices". + + c) You must license the entire work, as a whole, under this + License to anyone who comes into possession of a copy. This + License will therefore apply, along with any applicable section 7 + additional terms, to the whole of the work, and all its parts, + regardless of how they are packaged. 
This License gives no + permission to license the work in any other way, but it does not + invalidate such permission if you have separately received it. + + d) If the work has interactive user interfaces, each must display + Appropriate Legal Notices; however, if the Program has interactive + interfaces that do not display Appropriate Legal Notices, your + work need not make them do so. + + A compilation of a covered work with other separate and independent +works, which are not by their nature extensions of the covered work, +and which are not combined with it such as to form a larger program, +in or on a volume of a storage or distribution medium, is called an +"aggregate" if the compilation and its resulting copyright are not +used to limit the access or legal rights of the compilation's users +beyond what the individual works permit. Inclusion of a covered work +in an aggregate does not cause this License to apply to the other +parts of the aggregate. + + 6. Conveying Non-Source Forms. + + You may convey a covered work in object code form under the terms +of sections 4 and 5, provided that you also convey the +machine-readable Corresponding Source under the terms of this License, +in one of these ways: + + a) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by the + Corresponding Source fixed on a durable physical medium + customarily used for software interchange. 
+ + b) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by a + written offer, valid for at least three years and valid for as + long as you offer spare parts or customer support for that product + model, to give anyone who possesses the object code either (1) a + copy of the Corresponding Source for all the software in the + product that is covered by this License, on a durable physical + medium customarily used for software interchange, for a price no + more than your reasonable cost of physically performing this + conveying of source, or (2) access to copy the + Corresponding Source from a network server at no charge. + + c) Convey individual copies of the object code with a copy of the + written offer to provide the Corresponding Source. This + alternative is allowed only occasionally and noncommercially, and + only if you received the object code with such an offer, in accord + with subsection 6b. + + d) Convey the object code by offering access from a designated + place (gratis or for a charge), and offer equivalent access to the + Corresponding Source in the same way through the same place at no + further charge. You need not require recipients to copy the + Corresponding Source along with the object code. If the place to + copy the object code is a network server, the Corresponding Source + may be on a different server (operated by you or a third party) + that supports equivalent copying facilities, provided you maintain + clear directions next to the object code saying where to find the + Corresponding Source. Regardless of what server hosts the + Corresponding Source, you remain obligated to ensure that it is + available for as long as needed to satisfy these requirements. 
+ + e) Convey the object code using peer-to-peer transmission, provided + you inform other peers where the object code and Corresponding + Source of the work are being offered to the general public at no + charge under subsection 6d. + + A separable portion of the object code, whose source code is excluded +from the Corresponding Source as a System Library, need not be +included in conveying the object code work. + + A "User Product" is either (1) a "consumer product", which means any +tangible personal property which is normally used for personal, family, +or household purposes, or (2) anything designed or sold for incorporation +into a dwelling. In determining whether a product is a consumer product, +doubtful cases shall be resolved in favor of coverage. For a particular +product received by a particular user, "normally used" refers to a +typical or common use of that class of product, regardless of the status +of the particular user or of the way in which the particular user +actually uses, or expects or is expected to use, the product. A product +is a consumer product regardless of whether the product has substantial +commercial, industrial or non-consumer uses, unless such uses represent +the only significant mode of use of the product. + + "Installation Information" for a User Product means any methods, +procedures, authorization keys, or other information required to install +and execute modified versions of a covered work in that User Product from +a modified version of its Corresponding Source. The information must +suffice to ensure that the continued functioning of the modified object +code is in no case prevented or interfered with solely because +modification has been made. 
+ + If you convey an object code work under this section in, or with, or +specifically for use in, a User Product, and the conveying occurs as +part of a transaction in which the right of possession and use of the +User Product is transferred to the recipient in perpetuity or for a +fixed term (regardless of how the transaction is characterized), the +Corresponding Source conveyed under this section must be accompanied +by the Installation Information. But this requirement does not apply +if neither you nor any third party retains the ability to install +modified object code on the User Product (for example, the work has +been installed in ROM). + + The requirement to provide Installation Information does not include a +requirement to continue to provide support service, warranty, or updates +for a work that has been modified or installed by the recipient, or for +the User Product in which it has been modified or installed. Access to a +network may be denied when the modification itself materially and +adversely affects the operation of the network or violates the rules and +protocols for communication across the network. + + Corresponding Source conveyed, and Installation Information provided, +in accord with this section must be in a format that is publicly +documented (and with an implementation available to the public in +source code form), and must require no special password or key for +unpacking, reading or copying. + + 7. Additional Terms. + + "Additional permissions" are terms that supplement the terms of this +License by making exceptions from one or more of its conditions. +Additional permissions that are applicable to the entire Program shall +be treated as though they were included in this License, to the extent +that they are valid under applicable law. 
If additional permissions +apply only to part of the Program, that part may be used separately +under those permissions, but the entire Program remains governed by +this License without regard to the additional permissions. + + When you convey a copy of a covered work, you may at your option +remove any additional permissions from that copy, or from any part of +it. (Additional permissions may be written to require their own +removal in certain cases when you modify the work.) You may place +additional permissions on material, added by you to a covered work, +for which you have or can give appropriate copyright permission. + + Notwithstanding any other provision of this License, for material you +add to a covered work, you may (if authorized by the copyright holders of +that material) supplement the terms of this License with terms: + + a) Disclaiming warranty or limiting liability differently from the + terms of sections 15 and 16 of this License; or + + b) Requiring preservation of specified reasonable legal notices or + author attributions in that material or in the Appropriate Legal + Notices displayed by works containing it; or + + c) Prohibiting misrepresentation of the origin of that material, or + requiring that modified versions of such material be marked in + reasonable ways as different from the original version; or + + d) Limiting the use for publicity purposes of names of licensors or + authors of the material; or + + e) Declining to grant rights under trademark law for use of some + trade names, trademarks, or service marks; or + + f) Requiring indemnification of licensors and authors of that + material by anyone who conveys the material (or modified versions of + it) with contractual assumptions of liability to the recipient, for + any liability that these contractual assumptions directly impose on + those licensors and authors. + + All other non-permissive additional terms are considered "further +restrictions" within the meaning of section 10. 
If the Program as you +received it, or any part of it, contains a notice stating that it is +governed by this License along with a term that is a further +restriction, you may remove that term. If a license document contains +a further restriction but permits relicensing or conveying under this +License, you may add to a covered work material governed by the terms +of that license document, provided that the further restriction does +not survive such relicensing or conveying. + + If you add terms to a covered work in accord with this section, you +must place, in the relevant source files, a statement of the +additional terms that apply to those files, or a notice indicating +where to find the applicable terms. + + Additional terms, permissive or non-permissive, may be stated in the +form of a separately written license, or stated as exceptions; +the above requirements apply either way. + + 8. Termination. + + You may not propagate or modify a covered work except as expressly +provided under this License. Any attempt otherwise to propagate or +modify it is void, and will automatically terminate your rights under +this License (including any patent licenses granted under the third +paragraph of section 11). + + However, if you cease all violation of this License, then your +license from a particular copyright holder is reinstated (a) +provisionally, unless and until the copyright holder explicitly and +finally terminates your license, and (b) permanently, if the copyright +holder fails to notify you of the violation by some reasonable means +prior to 60 days after the cessation. + + Moreover, your license from a particular copyright holder is +reinstated permanently if the copyright holder notifies you of the +violation by some reasonable means, this is the first time you have +received notice of violation of this License (for any work) from that +copyright holder, and you cure the violation prior to 30 days after +your receipt of the notice. 
+ + Termination of your rights under this section does not terminate the +licenses of parties who have received copies or rights from you under +this License. If your rights have been terminated and not permanently +reinstated, you do not qualify to receive new licenses for the same +material under section 10. + + 9. Acceptance Not Required for Having Copies. + + You are not required to accept this License in order to receive or +run a copy of the Program. Ancillary propagation of a covered work +occurring solely as a consequence of using peer-to-peer transmission +to receive a copy likewise does not require acceptance. However, +nothing other than this License grants you permission to propagate or +modify any covered work. These actions infringe copyright if you do +not accept this License. Therefore, by modifying or propagating a +covered work, you indicate your acceptance of this License to do so. + + 10. Automatic Licensing of Downstream Recipients. + + Each time you convey a covered work, the recipient automatically +receives a license from the original licensors, to run, modify and +propagate that work, subject to this License. You are not responsible +for enforcing compliance by third parties with this License. + + An "entity transaction" is a transaction transferring control of an +organization, or substantially all assets of one, or subdividing an +organization, or merging organizations. If propagation of a covered +work results from an entity transaction, each party to that +transaction who receives a copy of the work also receives whatever +licenses to the work the party's predecessor in interest had or could +give under the previous paragraph, plus a right to possession of the +Corresponding Source of the work from the predecessor in interest, if +the predecessor has it or can get it with reasonable efforts. + + You may not impose any further restrictions on the exercise of the +rights granted or affirmed under this License. 
For example, you may +not impose a license fee, royalty, or other charge for exercise of +rights granted under this License, and you may not initiate litigation +(including a cross-claim or counterclaim in a lawsuit) alleging that +any patent claim is infringed by making, using, selling, offering for +sale, or importing the Program or any portion of it. + + 11. Patents. + + A "contributor" is a copyright holder who authorizes use under this +License of the Program or a work on which the Program is based. The +work thus licensed is called the contributor's "contributor version". + + A contributor's "essential patent claims" are all patent claims +owned or controlled by the contributor, whether already acquired or +hereafter acquired, that would be infringed by some manner, permitted +by this License, of making, using, or selling its contributor version, +but do not include claims that would be infringed only as a +consequence of further modification of the contributor version. For +purposes of this definition, "control" includes the right to grant +patent sublicenses in a manner consistent with the requirements of +this License. + + Each contributor grants you a non-exclusive, worldwide, royalty-free +patent license under the contributor's essential patent claims, to +make, use, sell, offer for sale, import and otherwise run, modify and +propagate the contents of its contributor version. + + In the following three paragraphs, a "patent license" is any express +agreement or commitment, however denominated, not to enforce a patent +(such as an express permission to practice a patent or covenant not to +sue for patent infringement). To "grant" such a patent license to a +party means to make such an agreement or commitment not to enforce a +patent against the party. 
+ + If you convey a covered work, knowingly relying on a patent license, +and the Corresponding Source of the work is not available for anyone +to copy, free of charge and under the terms of this License, through a +publicly available network server or other readily accessible means, +then you must either (1) cause the Corresponding Source to be so +available, or (2) arrange to deprive yourself of the benefit of the +patent license for this particular work, or (3) arrange, in a manner +consistent with the requirements of this License, to extend the patent +license to downstream recipients. "Knowingly relying" means you have +actual knowledge that, but for the patent license, your conveying the +covered work in a country, or your recipient's use of the covered work +in a country, would infringe one or more identifiable patents in that +country that you have reason to believe are valid. + + If, pursuant to or in connection with a single transaction or +arrangement, you convey, or propagate by procuring conveyance of, a +covered work, and grant a patent license to some of the parties +receiving the covered work authorizing them to use, propagate, modify +or convey a specific copy of the covered work, then the patent license +you grant is automatically extended to all recipients of the covered +work and works based on it. + + A patent license is "discriminatory" if it does not include within +the scope of its coverage, prohibits the exercise of, or is +conditioned on the non-exercise of one or more of the rights that are +specifically granted under this License. 
You may not convey a covered +work if you are a party to an arrangement with a third party that is +in the business of distributing software, under which you make payment +to the third party based on the extent of your activity of conveying +the work, and under which the third party grants, to any of the +parties who would receive the covered work from you, a discriminatory +patent license (a) in connection with copies of the covered work +conveyed by you (or copies made from those copies), or (b) primarily +for and in connection with specific products or compilations that +contain the covered work, unless you entered into that arrangement, +or that patent license was granted, prior to 28 March 2007. + + Nothing in this License shall be construed as excluding or limiting +any implied license or other defenses to infringement that may +otherwise be available to you under applicable patent law. + + 12. No Surrender of Others' Freedom. + + If conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot convey a +covered work so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you may +not convey it at all. For example, if you agree to terms that obligate you +to collect a royalty for further conveying from those to whom you convey +the Program, the only way you could satisfy both those terms and this +License would be to refrain entirely from conveying the Program. + + 13. Use with the GNU Affero General Public License. + + Notwithstanding any other provision of this License, you have +permission to link or combine any covered work with a work licensed +under version 3 of the GNU Affero General Public License into a single +combined work, and to convey the resulting work. 
The terms of this +License will continue to apply to the part which is the covered work, +but the special requirements of the GNU Affero General Public License, +section 13, concerning interaction through a network will apply to the +combination as such. + + 14. Revised Versions of this License. + + The Free Software Foundation may publish revised and/or new versions of +the GNU General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + + Each version is given a distinguishing version number. If the +Program specifies that a certain numbered version of the GNU General +Public License "or any later version" applies to it, you have the +option of following the terms and conditions either of that numbered +version or of any later version published by the Free Software +Foundation. If the Program does not specify a version number of the +GNU General Public License, you may choose any version ever published +by the Free Software Foundation. + + If the Program specifies that a proxy can decide which future +versions of the GNU General Public License can be used, that proxy's +public statement of acceptance of a version permanently authorizes you +to choose that version for the Program. + + Later license versions may give you additional or different +permissions. However, no additional obligations are imposed on any +author or copyright holder as a result of your choosing to follow a +later version. + + 15. Disclaimer of Warranty. + + THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY +APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT +HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY +OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, +THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE. 
THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM +IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF +ALL NECESSARY SERVICING, REPAIR OR CORRECTION. + + 16. Limitation of Liability. + + IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS +THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY +GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE +USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF +DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD +PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), +EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF +SUCH DAMAGES. + + 17. Interpretation of Sections 15 and 16. + + If the disclaimer of warranty and limitation of liability provided +above cannot be given local legal effect according to their terms, +reviewing courts shall apply local law that most closely approximates +an absolute waiver of all civil liability in connection with the +Program, unless a warranty or assumption of liability accompanies a +copy of the Program in return for a fee. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +state the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. 
+ + <one line to give the program's name and a brief idea of what it does.> + Copyright (C) <year> <name of author> + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + +Also add information on how to contact you by electronic and paper mail. + + If the program does terminal interaction, make it output a short +notice like this when it starts in an interactive mode: + + <program> Copyright (C) <year> <name of author> + This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. Of course, your program's commands +might be different; for a GUI interface, you would use an "about box". + + You should also get your employer (if you work as a programmer) or school, +if any, to sign a "copyright disclaimer" for the program, if necessary. +For more information on this, and how to apply and follow the GNU GPL, see +<http://www.gnu.org/licenses/>. + + The GNU General Public License does not permit incorporating your program +into proprietary programs. If your program is a subroutine library, you +may consider it more useful to permit linking proprietary applications with +the library. If this is what you want to do, use the GNU Lesser General +Public License instead of this License. But first, please read +<http://www.gnu.org/philosophy/why-not-lgpl.html>.
diff --git a/src/Makefile b/src/Makefile index 99e29ed..fcc8b5a 100644 --- a/src/Makefile +++ b/src/Makefile @@ -2,11 +2,25 @@ TCFLAGS = -ltcmalloc ARMAFLAGS = -larmadillo +CONDAFLAGS = -fvisibility-inlines-hidden -std=c++17 -fmessage-length=0 -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem ${CONDA_PREFIX}/include all: - $(LINK.cc) -O3 -o ancestry_hmm ancestry_hmm.cpp $(ARMAFLAGS) -### if tcmalloc is installed, this can be linked using the command below instead -# $(LINK.cc) -O3 -o ancestry_hmm ancestry_hmm.cpp $(TCFLAGS) $(ARMAFLAGS) + $(CXX) -O3 $(CXXFLAGS) -o ancestry_hmm ancestry_hmm.cpp $(ARMAFLAGS) + $(CXX) -O3 $(CXXFLAGS) -o ahmm-s ahmms.cpp $(ARMAFLAGS) + +conda: + $(CXX) -O3 $(CONDAFLAGS) -o ahmm-s ahmms.cpp -L ${CONDA_PREFIX}/lib -I ${CONDA_PREFIX}/include $(ARMAFLAGS) + $(CXX) -O3 $(CONDAFLAGS) -o ancestry_hmm ancestry_hmm.cpp -L ${CONDA_PREFIX}/lib -I ${CONDA_PREFIX}/include $(ARMAFLAGS) + +ahmms: + $(CXX) -O3 $(CXXFLAGS) -o ahmm-s ahmms.cpp $(ARMAFLAGS) + +ahmm: + $(CXX) -O3 $(CXXFLAGS) -o ancestry_hmm ancestry_hmm.cpp $(ARMAFLAGS) + + +## if you have a local install of google perftools, please add a TCFlag link. 
+### $(LINK.cc) -std=c++11 -O3 -o ahmm-s ahmms.cpp $(ARMAFLAGS) $(TCFLAGS) ## if you have a local armadillo installation, you will need to provide the directory during compile time and possible also link lblas and lapack ## our recommendation is to use miniconda3 to do the installation @@ -14,8 +28,9 @@ all: ## then you will have the appropriate lib and include files in your home directory under subdirectory miniconda3/ ## so, replace USERNAME with your unix id on the following line and try this - #$(LINK.cc) -O3 -o ancestry_hmm ancestry_hmm.cpp -L /home/USERNAME/miniconda3/lib/ -I /home/USERNAME/miniconda3/include/ $(ARMAFLAGS) +## $(CXX) -std=c++11 -O3 -o ahmm-s ahmms.cpp -L ${CONDA_PREFIX}/lib -I ${CONDA_PREFIX}/include $(ARMAFLAGS) -## if it builds correctly, you will also need to link the library during runtime +## if it builds correctly, you may also need to link the library during runtime ## to do this, add the following line to your ~/.bash_profile or ~/.bashrc -## export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/USERNAME/miniconda3/lib/ +## export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:${CONDA_PREFIX}/lib + diff --git a/src/ahmms.cpp b/src/ahmms.cpp new file mode 100644 index 0000000..eba510e --- /dev/null +++ b/src/ahmms.cpp @@ -0,0 +1,215 @@ +/* + + copyright: Russ Corbett-Detig + rucorbet@ucsc.edu + + Jesper Svedberg + jsvedber@ucsc.edu + + This is software distributed under the gnu public license version 3. 
+ + */ + +/// headers +#include +#include +#include +#include +#include +#include +#include + +// Includes specific for Ancestry_HMM-S +#include +#include +#include +#include +#include +#include // ++++ REQUIRES C++11 ++++ + + +/// linear algebra library is armadillo +#define ARMA_NO_DEBUG +#include +using namespace arma ; +using namespace std ; + +/// our header files in /src directory +#include "selection_print_usage.h" // JS +#include "factorial.h" +#include "nchoosek.h" +#include "selection_subsample.h" +#include "multichoose.h" +#include "multipermute.h" +#include "normalize.h" +#include "ancestry_pulse.h" +#include "ploidy_path.h" +#include "selection_class.h" // JS +#include "selection_markov_chain.h" +#include "read_samples.h" +#include "pulses_to_ancestry.h" +#include "compute_forward.h" +#include "compute_backward.h" +#include "forward_backward.h" +#include "viterbi.h" +#include "transition_information.h" +#include "exponentiate_matrix.h" +#include "selection_cmd_line.h" +#include "create_transition_rates.h" +#include "selection_read_cmd_line.h" +#include "evaluate_vertex.h" +#include "check_vertex.h" +#include "sort_vertices.h" +#include "create_pulses.h" +#include "create_states.h" +#include "input_line.h" +#include "distribute_alleles.h" +#include "binomial.h" +#include "read_emissions.h" +#include "genotype_emissions.h" +#include "selection_read_input.h" +#include "nelder_mead.h" +#include "golden_search.h" +#include "bootstrap.h" + +// Includes specific for Ancestry_HMM-S +#include "selection_get_position.h" +#include "selection_optimize_test_func.h" // Function for testing Nelder-Mead. Remove? 
+#include "selection_fwd_iter.h" +#include "selection_trajectory.h" +#include "selection_split_vector.h" +#include "selection_forward.h" +#include "selection_stochastic_traj.h" +#include "selection_transition_rates.h" + + + + +int main ( int argc, char *argv[] ) { + + /// time tracking + clock_t t = clock() ; + clock_t total = clock() ; + + /// seed prng + srand (t) ; + + // read cmd line + cmd_line options ; + cerr << "reading command line" ; t = clock(); + options.read_cmd_line( argc, argv ) ; + + /// chain objects for each sample + vector markov_chain_information ; + + /// get sample ids and ploidy from input file + cerr << "\t\t\t\t" << (double) (clock() - t) << " ms\n" << "reading sample ids and ploidy" ; t = clock(); + read_samples( markov_chain_information, options.sample_file, options.viterbi ) ; + + /// create states matrix + cerr << "\t\t\t" << (double) (clock() - t) << " ms\n" << "creating states matrix" ; t = clock(); + /// store all possible state space arranged by ploidy and then vector of state counts + map > > state_list ; + /// now create initial state list + for ( int m = 0 ; m < markov_chain_information.size() ; m ++ ) { + for ( int p = 0 ; p < markov_chain_information[m].sample_ploidy_path.size() ; p ++ ) { + create_initial_states( markov_chain_information.at(m).sample_ploidy_path[p].ploidy, options.ancestry_pulses, state_list ) ; + } + } + + /// read in panels and update matrices + cerr << "\t\t\t\t" << (double) (clock() - t) << " ms\n" << "reading data and creating emissions matrices\t" ; t = clock() ; + /// store recombination rates and positions + vector position ; + vector recombination_rate ; + vector chromosomes ; + int sel_pos ; + read_file( options, markov_chain_information, state_list, position, recombination_rate, chromosomes, sel_pos ) ; + + + + /// create basic transition information + cerr << (double) (clock() - t) << " ms" << endl << "computing transition routes\t\t\t" ; t = clock() ; + /// 3d map to look up by ploidy, start 
state, end state, and then relevant transition information + map, double > > > > transition_matrix_information ; + for ( int m = 0 ; m < markov_chain_information.size() ; m ++ ) { + for ( int p = 0 ; p < markov_chain_information[m].sample_ploidy_path.size() ; p ++ ) { + create_transition_information( markov_chain_information.at(m).sample_ploidy_path[p].ploidy, transition_matrix_information, state_list[markov_chain_information.at(m).sample_ploidy_path[p].ploidy] ) ; + } + } + cerr << endl; + + + // Below are ahmm-s specific options + + // If using grid search with --grid flag + if (options.calc_grid == true) { + int p_start = options.grid_pstart; + int p_stop = options.grid_pstop; + int p_step = options.grid_pstep; + + double s_start = options.grid_sstart; + double s_stop = options.grid_sstop; + + if ( options.limit_sel_space == true ) { + s_stop = selection_get_max_sel(options.grid_sstart, options.grid_sstop, options.grid_sstep, options.ancestry_pulses[1].proportion, options.ancestry_pulses[1].time, options.ne); + } + double s_step = options.grid_sstep; + + cerr << "Grid search. Likelihood calculated for values of selection between " << s_start << " and " << s_stop << endl; + + selection_grid(p_start, p_stop, p_step, s_start, s_stop, s_step, markov_chain_information, transition_matrix_information, recombination_rate, position, options, state_list); + return 0; + } + + + + // If testing a single point using --site flag. 
+ if (options.test_point == true) { + cerr << "Evaluating point: " << options.test_pos << ", " << options.test_sel << endl; + + map > sel_trajectories; + vector > split_vecs; + int testpos; + selection point0; + + if (options.is_coord == true) { + testpos = get_position(options.test_pos, position); + + if (testpos == -1) { + cerr << "ERROR: specified site not found on chromosome" << endl; + exit(1); + } + + } + else { + testpos = options.test_pos; + } + + point0.pos = testpos; + point0.sel = 0; + selection_evaluate_point_genotypes( point0, markov_chain_information, transition_matrix_information, recombination_rate, position, options, state_list, split_vecs, sel_trajectories ) ; + + selection point1; + point1.pos = testpos; + point1.sel = options.test_sel; + selection_evaluate_point_genotypes( point1, markov_chain_information, transition_matrix_information, recombination_rate, position, options, state_list, split_vecs, sel_trajectories ) ; + + cout << "lnL for a selected site s=" << options.test_sel << " at position " << position[point0.pos] << " is: " << point1.lnl-point0.lnl << endl; + + return 0; + } + + + + // If using Golden Section Search with --gss flag + if (options.run_gss == true) { + selection_golden_section(markov_chain_information, transition_matrix_information, recombination_rate, position, options, state_list); + return 0; + } + + + return 0 ; +} + + diff --git a/src/read_cmd_line.h b/src/read_cmd_line.h index aedc343..0c92a84 100644 --- a/src/read_cmd_line.h +++ b/src/read_cmd_line.h @@ -139,7 +139,7 @@ void cmd_line::read_cmd_line ( int argc, char *argv[] ) { if ( strcmp(argv[i],"--help") == 0 ) { print_usage() ; - exit(1) ; + exit(0) ; } if ( strcmp(argv[i],"-g") == 0 ) { diff --git a/src/selection_class.h b/src/selection_class.h new file mode 100644 index 0000000..0cf5635 --- /dev/null +++ b/src/selection_class.h @@ -0,0 +1,21 @@ +#ifndef __SELECTION_CLASS_H +#define __SELECTION_CLASS_H + +class selection { +public: + int pos; + double sel; + 
double lnl; + + /// sort pulses by time + friend bool operator < ( const selection &a, const selection &b ) { + return a.lnl < b.lnl ; + } +} ; + +ostream& operator<< (ostream &out, selection const& point) { + out << "Selection point. pos:" << point.pos << " sel:" << setprecision(15) << point.sel << " lnL: " << point.lnl; + return out; +} + +#endif diff --git a/src/selection_cmd_line.h b/src/selection_cmd_line.h new file mode 100644 index 0000000..e8c7f65 --- /dev/null +++ b/src/selection_cmd_line.h @@ -0,0 +1,126 @@ +#ifndef __SELECTION_CMD_LINE_H +#define __SELECTION_CMD_LINE_H + +/// command line information and global parameters +class cmd_line { +public: + + /// terms to bound the optimization + double t_max ; + double t_min ; + + /// to bound proportion search + double p_max ; + double p_min ; + + /// to create intial simplex points + double t_length ; + double p_length ; + + /// number of restarts + int n_restarts ; + + /// proportion ancestry for 0-n must sum to 1 + /// these are therefore the final ancestry proportion, not necessary the proportion that fluxed if additional pulses occured closer to the present + vector ancestry_proportion ; + + /// store relevant ancestry information + vector ancestry_pulses ; + + /// diploid effective population size ( i.e. 
2n ) + double ne ; + + /// tolerance for parameter search + double tolerance ; + + /// minimum recombinational distance between markers + double minimum_distance ; + + /// error rates for reads (if read based) or genotypes (if genotype based) + double error_rate ; + + /// bool sample is expressed as genotypes, not read counts + bool genotype ; + + /// ancestral genotype frequencies are fixed + bool ancestral_fixed ; + + /// viterbi output + /// caution: not recommended for more samples of ploidy > 1 + bool viterbi ; + + /// number of digits of precision to include + int precision ; + + /// output actual pulses rather than ancestry states + bool output_pulses ; + + /// error rates specifed + bool error_rates ; + + /// input file name + string input_file ; + + /// sample file + string sample_file ; + + /// bootstrap + int n_bootstraps ; + int block_size ; + + /// +=+=+=+=+=+=+ selection +=+=+=+=+=+=+ + + + bool is_limit; // limits to only one chromosome + string limit_chr ; // specifies which chromosome + int limit_win_start ; // window to analyse start + int limit_win_end ; // window to analyze end + + // grid search + bool calc_grid; // set grid search + int grid_pstart; // position start + int grid_pstop; + int grid_pstep; + double grid_sstart; // selection coeffient start + double grid_sstop; + double grid_sstep; + + // single site test + bool test_point; + int test_pos; + double test_sel; + + // golden section search + bool run_gss; // set gss search + int gs_pstart; + int gs_pstop; + int gs_pstep; + double gs_sstart; + double gs_sstop; + double gs_sstep; + int gs_max_iterations; + double gs_precision; + + bool is_coord; // use chromosome coordinates + + bool limit_sel_space; // use full search space for s + + int traj_function = 0; // set trajectory function + + // HMM chain window size + string win_unit; + double win_morgan; + double win_percent; + // int win_bp; + + // stochastic trajectory + bool use_stochastic; // use stochastic trajectory function + int 
stochastic_reps; // how many repeats + + /// read relevant information + void read_cmd_line ( int argc, char *argv[] ) ; + +} ; + +#endif + diff --git a/src/selection_forward.h b/src/selection_forward.h new file mode 100644 index 0000000..3acd568 --- /dev/null +++ b/src/selection_forward.h @@ -0,0 +1,141 @@ +#ifndef __SELECTION_FORWARD_H +#define __SELECTION_FORWARD_H + + + +// Forward algoritm modified for selection inferrence. +double markov_chain::selection_forward_probabilities_genotypes( map > &transition_probabilites, vector &interploidy_transitions, selection &point, bool go_downstream, vector &genofreq, vector &position ) { + //cerr << "cp2_1 " << genofreq[0] << endl; + + /// return log likelihood which is sum of cts + double lnl = 0 ; + + /// clear the fw probs matrix + alphas.resize( transition_probabilites[ploidy_switch[0]].size() ) ; + + /// ploidy index to tract where in path we are + int ploidy_index = 0 ; + + //// set all values to zero, but mostly just reize + alphas[0].resize( transition_probabilites[ploidy_switch[0]][1].n_cols ) ; + + /// get initial state set + //alphas[0] = emission_probabilities[point.pos] * start_prob ; + + // Populate starting conditions + + // Check how to specify nn. The current way is a bit of a hack. + double nn = transition_probabilites[ploidy_switch[0]][1].n_cols - 1; + for (int k = nn; k >= 0; k--) { + alphas[0][nn-k] = binomial(nn, k, genofreq[0]); + } + + lnl += normalize( alphas[0] ) ; + + /// do all other sites + /// Checks if going upstream or downstream from the selected site. 
+ if (go_downstream == true) { + selection_forward_loop_reverse(transition_probabilites, interploidy_transitions, point, lnl, ploidy_index, position) ; + } + else { + selection_forward_loop(transition_probabilites, interploidy_transitions, point, lnl, ploidy_index, position) ; + } + + return lnl ; +} + +// Loop in forward algorithm going downstream from the selected site: fills alphas[1..] from emissions at point.pos + i and adds each site's normalization term to lnl +void markov_chain::selection_forward_loop( map > &transition_probabilites, vector &interploidy_transitions, selection &point, double &lnl, int ploidy_index, vector &position) { + // WARNING: Check what +1 index does. May be unnecessary. + //cerr << "Emission probabilities: " << emission_probabilities.size() << " " << point.pos << endl; + + int j = 1; + int k; + double normalpha; + + for ( int i = 1 ; i < transition_probabilites[ploidy_switch[ploidy_index]].size() ; i ++ ) { + k = point.pos + i; + /// if we're at or past the next switch position + bool ploidy_change = false ; + if ( i >= ploidy_switch_position[ploidy_index+1] ) { + ploidy_index ++ ; + if ( ploidy_switch[ploidy_index] != ploidy_switch[ploidy_index-1] ) { + ploidy_change = true ; + } + } + /// resize matrix + alphas[j].resize( transition_probabilites[ploidy_switch[ploidy_index]][1].n_cols ) ; + + /// requires slightly different math if we are transitioning in ploidy between two adjacent sites + if ( ploidy_change == true ) { + /// transitions across a chromosome boundary will have low self-self rates + if ( transition_probabilites[ploidy_switch[ploidy_index]][i](0,0) < 0.75 ) { + alphas[j].fill( 1 ) ; + } + + //// otherwise, this is a transition across ploidy types on the same chromosome use the interploidy transition rates + else { + alphas[j] = interploidy_transitions[ploidy_switch[ploidy_index-1]-1] * alphas[j-1] % emission_probabilities[k] ; + } + } + + /// otherwise business as usual + + else { + alphas[j] = transition_probabilites[ploidy_switch[ploidy_index]][j] * alphas[j-1] % emission_probabilities[k] ; + } + + /// normalize and update likelihood; done for every branch so lnl never receives an unset or stale normalpha + normalpha =
normalize( alphas[j] ) ; + lnl += normalpha ; + j++; + } +} + +// Loop in forward algorithm going upstream from the selected site: fills alphas[1..] from emissions at point.pos - i and adds each site's normalization term to lnl +void markov_chain::selection_forward_loop_reverse( map > &transition_probabilites, vector &interploidy_transitions, selection &point, double &lnl, int ploidy_index, vector &position) { + // WARNING: Check what +1 index does. May be unnecessary. + + int j = 1; + int k; + double normalpha; + + for ( int i = 0 ; i < transition_probabilites[ploidy_switch[ploidy_index]].size()-1 ; i ++ ) { + k = point.pos - i; + /// if we're at or past the next switch position + bool ploidy_change = false ; + if ( i >= ploidy_switch_position[ploidy_index+1] ) { + ploidy_index ++ ; + if ( ploidy_switch[ploidy_index] != ploidy_switch[ploidy_index-1] ) { + ploidy_change = true ; + } + } + /// resize matrix + alphas[j].resize( transition_probabilites[ploidy_switch[ploidy_index]][1].n_cols ) ; + + /// requires slightly different math if we are transitioning in ploidy between two adjacent sites + if ( ploidy_change == true ) { + /// transitions across a chromosome boundary will have low self-self rates + if ( transition_probabilites[ploidy_switch[ploidy_index]][i](0,0) < 0.75 ) { + alphas[j].fill( 1 ) ; + } + + //// otherwise, this is a transition across ploidy types on the same chromosome use the interploidy transition rates + else { + alphas[j] = interploidy_transitions[ploidy_switch[ploidy_index-1]-1] * alphas[j-1] % emission_probabilities[k] ; + } + } + + /// otherwise business as usual + else { + alphas[j] = transition_probabilites[ploidy_switch[ploidy_index]][j] * alphas[j-1] % emission_probabilities[k] ; + } + + /// normalize and update likelihood; done for every branch so lnl never receives an unset or stale normalpha + normalpha = normalize( alphas[j] ) ; + lnl += normalpha ; + j++; + } +} + +#endif diff --git a/src/selection_fwd_iter.h b/src/selection_fwd_iter.h new file mode 100644 index 0000000..4e4daa7 --- /dev/null +++ b/src/selection_fwd_iter.h @@ -0,0 +1,451 @@ +#ifndef
__FWD_ITER_H +#define __FWD_ITER_H + +// Calculates transition rate when selection is 0: returns 2*n*(1-mm)*(1-exp(-gen/(2*n))) +double neutral_rate(int n, double mm, double gen) { + double m = 1 - mm; + return 2*n*m*(1-exp(-gen/(2*n))); +} + +// Return a flat/constant vector for the case when selection is 0: one 2x2 matrix per entry of recombination_rate, each row summing to 1 +vector neutral_rates_vector(vector &recombination_rate, double m, int n, int generations) { + vector transition_rates ; + double r; // must be floating point: an int here truncated every fractional recombination rate to 0 + + double tr01 = neutral_rate(n, m, generations); + double tr10 = neutral_rate(n, 1-m, generations); + + for ( int site = 0 ; site < recombination_rate.size() ; site ++ ) { + r = recombination_rate[site] ; + mat tr_mat(2,2); + + tr_mat(0,0) = 1-r*tr01; + tr_mat(0,1) = r*tr01; + tr_mat(1,0) = r*tr10; + tr_mat(1,1) = 1-r*tr10; // complement of tr_mat(1,0) so that row 1 sums to 1 (was tr01) + + transition_rates.push_back(tr_mat); + } + + return transition_rates; + +} + +// forward iteration algorithm for calculating transition rates across a chromosome +// takes vector of recombination rates (sites) and vector of allele frequency change of the selected site over time +// uses the 2-site version with back coalescence.
+// Not used at the moment +vector fwd_iter(vector &recombination_rate, vector &basefreq, double m, int n) +{ + vector freq(basefreq) ; + vector freq_(basefreq); + vector transition_rates ; + double sum ; + double h11; + double h12; + double h21; + double h22; + double r; + + double a1; + double a1_; + + double h11_; + double h12_; + double h21_; + double h22_; + + double p_coal; + + for ( int site = 0 ; site < recombination_rate.size() ; site ++ ) { + r = recombination_rate[site] ; + mat tr_mat(2,2); + + h11 = m; + h12 = 0; + h21 = 0; + h22 = 1-m; + + p_coal = 1 ; + + for ( int t = 0 ; t < basefreq.size()-1 ; t++ ) { + a1 = h11 + h12; + a1_ = h11 + h21; + + h11_ = h11*(1-r) + a1*r*a1_*p_coal; + h12_ = h12*(1-r) + a1*r*(1-a1_)*p_coal; + h21_ = h21*(1-r) + (1-a1)*r*a1_*p_coal; + h22_ = h22*(1-r) + (1-a1)*r*(1-a1_)*p_coal; + + freq_[t] = ( h11_ + h21_ )/( h11 + h12 + h21 + h22 ); + + h11 = h11_ ; + h12 = h12_ ; + h21 = h21_ ; + h22 = h22_ ; + + h11 = h11/freq[t]*freq[t+1]; + h12 = h12/freq[t]*freq[t+1]; + + h21 = h21/(1-freq[t])*(1-freq[t+1]); + h22 = h22/(1-freq[t])*(1-freq[t+1]); + + sum = h11 + h12 + h21 + h22; + h11 = h11/sum; + h12 = h12/sum; + h21 = h21/sum; + h22 = h22/sum; + + freq_[t+1] = (h11 + h21) ; + p_coal *= ( 1 - 1.0/(2.0*n) ) ; // floating-point division: 1/(2*n) on ints is 0, which froze p_coal at 1 + + } + + // matrix with transition rates + tr_mat(0,0) = 1 - h12/(h11+h12); + tr_mat(0,1) = h12/(h11+h12); + tr_mat(1,0) = h21/(h21+h22); + tr_mat(1,1) = 1 - h21/(h21+h22); + + transition_rates.push_back(tr_mat) ; + + freq = freq_ ; + } + + return transition_rates; +} + +// Forward iteration function for calculating transition rates; same recurrence as fwd_iter, and additionally appends freq_.back() to genotype_freqs once per site +vector fwd_iter_genotype_freq(vector &recombination_rate, vector &basefreq, double m, int n, vector &genotype_freqs) +{ + + vector freq(basefreq) ; + vector freq_(basefreq); + vector transition_rates ; + double sum ; + double h11; + double h12; + double h21; + double h22; + double r; + + double a1; + double a1_; + + double h11_; + double h12_; + double h21_; + double h22_; + + double p_coal; + + +
for ( int site = 0 ; site < recombination_rate.size() ; site ++ ) { + r = recombination_rate[site] ; + mat tr_mat(2,2); + + h11 = m; + h12 = 0; + h21 = 0; + h22 = 1-m; + + p_coal = 1 ; + for ( int t = 0 ; t < basefreq.size()-1 ; t++ ) { + a1 = h11 + h12; + a1_ = h11 + h21; + + h11_ = h11*(1-r) + a1*r*a1_*p_coal; + h12_ = h12*(1-r) + a1*r*(1-a1_)*p_coal; + h21_ = h21*(1-r) + (1-a1)*r*a1_*p_coal; + h22_ = h22*(1-r) + (1-a1)*r*(1-a1_)*p_coal; + + freq_[t] = ( h11_ + h21_ )/( h11 + h12 + h21 + h22 ); + + h11 = h11_ ; + h12 = h12_ ; + h21 = h21_ ; + h22 = h22_ ; + + h11 = h11/freq[t]*freq[t+1]; + h12 = h12/freq[t]*freq[t+1]; + + h21 = h21/(1-freq[t])*(1-freq[t+1]); + h22 = h22/(1-freq[t])*(1-freq[t+1]); + + sum = h11 + h12 + h21 + h22; + h11 = h11/sum; + h12 = h12/sum; + h21 = h21/sum; + h22 = h22/sum; + + freq_[t+1] = (h11 + h21) ; + p_coal *= ( 1 - 1.0/(2.0*n) ) ; // floating-point division: 1/(2*n) on ints is 0, which froze p_coal at 1 + + } + + // matrix with transition rates + tr_mat(0,0) = 1 - h12/(h11+h12); + tr_mat(0,1) = h12/(h11+h12); + tr_mat(1,0) = h21/(h21+h22); + tr_mat(1,1) = 1 - h21/(h21+h22); + + genotype_freqs.push_back(freq_.back()); // For outputting genotype frequencies. Maybe remove??? + + transition_rates.push_back(tr_mat) ; + + freq = freq_ ; + + } + + return transition_rates; +} + +// forward iteration algorithm for calculating transition rates across a chromosome +// 3-site version.
Is called from fwd_curve() and only calculates a single site +// takes a distance from the selected site and a vector of allele frequency at the selected site over time +mat iterate_site(mat &tr_mat, vector &freqlist, double r, double site, double m) { + double h111 = m; + double h112 = 0; + double h121 = 0; + double h122 = 0; + double h211 = 0; + double h212 = 0; + double h221 = 0; + double h222 = 1 - m; + double m11; + double m12; + double m21; + double m22; + double a1; + double h111_; + double h112_; + double h121_; + double h122_; + double h211_; + double h212_; + double h221_; + double h222_; + double summ; + + for (int t = 0; t < freqlist.size()-1; t++) { + /// get marginal haplotype frequencies + /// Haplotype = specific combination of alleles at the two neighboring sites + m11 = h111 + h211; + m12 = h112 + h212; + m21 = h121 + h221; + m22 = h222 + h122; + + // marginal allele freqs + // Frequency of selected allele + a1 = (h111+h112+h121+h122); + + // now compute change associated with recombination between selected site and first marker + h111_ = h111*(1-site) + a1*site*m11; // = h111 - site*(h111-a1*m11) + h112_ = h112*(1-site) + a1*site*m12; + h121_ = h121*(1-site) + a1*site*m21; + h122_ = h122*(1-site) + a1*site*m22; + h211_ = h211*(1-site) + (1-a1)*site*m11; + h212_ = h212*(1-site) + (1-a1)*site*m12; + h221_ = h221*(1-site) + (1-a1)*site*m21; + h222_ = h222*(1-site) + (1-a1)*site*m22; + + /// then do the same thing for second set of possible recombination events + // marginal haplotype frequencies + m11 = h111_ + h112_; + m12 = h121_ + h122_; + m21 = h211_ + h212_; + m22 = h221_ + h222_; + + // marginal allele frequencies + a1 = h111 + h121 + h211 + h221; + + /// now second allele change associated with recombination + h111 = h111_*(1-r) + m11*r*a1; // = h111_ - r*(h111_-m11*a1) + h112 = h112_*(1-r) + m11*r*(1-a1); + h121 = h121_*(1-r) + m12*r*a1; + h122 = h122_*(1-r) + m12*r*(1-a1); + h211 = h211_*(1-r) + m21*r*a1; + h212 = h212_*(1-r) + 
m21*r*(1-a1); + h221 = h221_*(1-r) + m22*r*a1; + h222 = h222_*(1-r) + m22*r*(1-a1); + + // update frequencies as a consequence of selection + h111 = h111/freqlist[t]*freqlist[t+1]; // = h111 * (freq[t+1]/freq[t]) + h112 = h112/freqlist[t]*freqlist[t+1]; + h121 = h121/freqlist[t]*freqlist[t+1]; + h122 = h122/freqlist[t]*freqlist[t+1]; + + // nonselected alleles go down in frequency + h211 = h211/(1-freqlist[t])*(1-freqlist[t+1]); + h212 = h212/(1-freqlist[t])*(1-freqlist[t+1]); + h221 = h221/(1-freqlist[t])*(1-freqlist[t+1]); + h222 = h222/(1-freqlist[t])*(1-freqlist[t+1]); + + /// normalize b/c rounding errors + summ = h111 + h112 + h121 + h211 + h122 + h212 + h221 + h222; + h111 = h111/summ; + h112 = h112/summ; + h121 = h121/summ; + h122 = h122/summ; + h211 = h211/summ; + h212 = h212/summ; + h221 = h221/summ; + h222 = h222/summ; + } + + tr_mat(0,0) = 1 - (h112+h212)/(h111+h112+h211+h212); + tr_mat(0,1) = (h112+h212)/(h111+h112+h211+h212); + tr_mat(1,0) = (h121+h221)/(h121+h122+h221+h222); + tr_mat(1,1) = 1 - (h121+h221)/(h121+h122+h221+h222); + + //cerr << site << "\t" << r << "\t" << tr_mat(0,1) << "\t" << tr_mat(1,0) << "\t" << tr_mat(0,1)/r << "\t" << tr_mat(1,0)/r << endl; + + return tr_mat; +} + +// forward iteration algoritm for calculating transition rates across a chromosome +// 3-site version. Loops over vector of recombination distances and calles iterate_site() +// to calculate the transition rate at each site. +// Not used at the moment. +vector fwd_curve(vector &recombination_rate, vector &basefreq, double m) { + vector transition_rates ; + double r; + mat tr_mat(2,2); + double cumulative_r = 0; + + for ( int site = 0 ; site < recombination_rate.size() ; site ++ ) { + r = recombination_rate[site] ; + iterate_site(tr_mat, basefreq, r, cumulative_r, m); + transition_rates.push_back(tr_mat) ; + cumulative_r = cumulative_r + r; + } + + return transition_rates; +} + +// Vladimir's 4-point approximative algorithm for calculating transition rates. 
+vector approx_curve(vector &recombination_rate, vector &basefreq, double m) { + mat V1_mat(2,2); + mat V2_mat(2,2); + mat M_mat(2,2); + mat U_mat(2,2); + double L1_sel; + double L2_sel; + double L1_non; + double L2_non; + double p_sel; + double p_non; + double alpha_sel; + double alpha_non; + double r_est = 0.0001; + double x1 = 0.01; + double x2 = 0.1; + + // compute 4 anchor points to generate curve + + iterate_site(V1_mat, basefreq, r_est, x1, m); + iterate_site(V2_mat, basefreq, r_est, x2, m); + iterate_site(M_mat, basefreq, r_est, 2, m); + iterate_site(U_mat, basefreq, r_est, 0, m); + + + // for selected allele + L1_sel = log((V1_mat(0,1)-M_mat(0,1))/(U_mat(0,1)-M_mat(0,1))); + L2_sel = log((V2_mat(0,1)-M_mat(0,1))/(U_mat(0,1)-M_mat(0,1))); + p_sel = log(L1_sel/L2_sel)/log(x1/x2); + alpha_sel = -L1_sel/pow(x1,p_sel); + + // for non-selected allele + L1_non = log((V1_mat(1,0)-M_mat(1,0))/(U_mat(1,0)-M_mat(1,0))); + L2_non = log((V2_mat(1,0)-M_mat(1,0))/(U_mat(1,0)-M_mat(1,0))); + p_non = log(L1_non/L2_non)/log(x1/x2); + alpha_non = -L1_non/pow(x1,p_non); + + + vector transition_rates ; + double r; + mat tr_mat(2,2); + double cumulative_r = 0; + double sel_trans; + double non_trans; + //double diff = 0; + + // loops over each site and calculates the transition rates + for ( int site = 0 ; site < recombination_rate.size() ; site ++ ) { + r = recombination_rate[site] ; + cumulative_r += r; + //r=0.0001; + + sel_trans = M_mat(0,1)+(U_mat(0,1)-M_mat(0,1))*exp(-alpha_sel*pow(cumulative_r,p_sel)); + non_trans = M_mat(1,0)+(U_mat(1,0)-M_mat(1,0))*exp(-alpha_non*pow(cumulative_r,p_non)); + + tr_mat(0,0) = 1 -r*sel_trans/r_est; + tr_mat(0,1) = r*sel_trans/r_est; + tr_mat(1,0) = r*non_trans/r_est; + tr_mat(1,1) = 1 - r*non_trans/r_est; + + transition_rates.push_back(tr_mat); + } + + return transition_rates; +} + +// Vladimir's earlier 3-point approximation of the transition rates +// Not recommended. 
Use 4-point approximation instead +vector approx_curve_3point(vector &recombination_rate, vector &basefreq, double m) { + + //k = x1 / ((1/((M/U)-1)) - (1/((M/V)-1))) + //r0 = k / ((M/V)-1) + //return lambda r: M / (1 + (k/(r+r0))) + + mat V_mat(2,2); + mat M_mat(2,2); + mat U_mat(2,2); + + double r_est = 0.0001; + double r_u = 0.05; + + double k_sel; + double k_non; + + double r0_sel; + double r0_non; + + // compute 3 anchor points to generate curve + iterate_site(U_mat, basefreq, r_est, r_u, m); + iterate_site(M_mat, basefreq, r_est, 2, m); + iterate_site(V_mat, basefreq, r_est, 0, m); + + k_non = r_u / (1/(M_mat(1,0)/U_mat(1,0))-(1/(M_mat(1,0)/V_mat(1,0)))); + k_sel = r_u / (1/(M_mat(0,1)/U_mat(0,1))-(1/(M_mat(0,1)/V_mat(0,1)))); + + r0_non = k_non / ((M_mat(1,0)/V_mat(1,0))-1); + r0_sel = k_sel / ((M_mat(0,1)/V_mat(0,1))-1); + + vector transition_rates ; + double r; + mat tr_mat(2,2); + double cumulative_r = 0; + double sel_trans; + double non_trans; + + // loops over each site and calculates the transition rates + for ( int site = 0 ; site < recombination_rate.size() ; site ++ ) { + r = recombination_rate[site] ; + cumulative_r += r; + + sel_trans = M_mat(0,1) / (1+(k_sel/(cumulative_r+r0_sel))); + non_trans = M_mat(1,0) / (1+(k_non/(cumulative_r+r0_non))); + + tr_mat(0,0) = 1 -r*sel_trans/r_est; + tr_mat(0,1) = r*sel_trans/r_est; + tr_mat(1,0) = r*non_trans/r_est; + tr_mat(1,1) = 1 - r*non_trans/r_est; + + transition_rates.push_back(tr_mat); + } + + return transition_rates; +} + +#endif diff --git a/src/selection_get_position.h b/src/selection_get_position.h new file mode 100644 index 0000000..e566b9b --- /dev/null +++ b/src/selection_get_position.h @@ -0,0 +1,13 @@ +#ifndef __GET_POSITION_H +#define __GET_POSITION_H + +int get_position(int pos, vector &positions) { + for (int i = 0; i < positions.size(); i++) { + if ( positions[i] >= pos ) { + return i; + } + } + return -1; +} + +#endif \ No newline at end of file diff --git a/src/selection_markov_chain.h 
b/src/selection_markov_chain.h new file mode 100644 index 0000000..9dc3447 --- /dev/null +++ b/src/selection_markov_chain.h @@ -0,0 +1,59 @@ +#ifndef __SELECTION_MARKOV_CHAIN_H +#define __SELECTION_MARKOV_CHAIN_H + +/// will include all basic information for input data and functions to compute forward, backward, forward-backward, and viterbi +class markov_chain { +public: + + /// sample attributes + string output_file ; + double number_chromosomes ; + + /// file describing ploidy path across the genome + string path_file ; + + /// data object storing ploidy paths to be looked up during emissions computation with chromosome as key + vector sample_ploidy_path ; + vector ploidy_switch_position ; + vector ploidy_switch ; + + /// read from input file + vector emission_probabilities ; + + /// create initial states to be stored + double start_prob ; + double end_prob ; + + /// forward probs + vector alphas ; + double compute_forward_probabilities( map > &transition_matrix, vector &interploidy_transitions ) ; + + + + /// For selection + //vector genotype_freqs; + + double selection_forward_probabilities( map > &transition_probabilites, vector &interploidy_transitions, selection &point, bool go_downstream ) ; + + void selection_forward_loop_reverse( map > &transition_probabilites, vector &interploidy_transitions, selection &point, double &lnl, int ploidy_index, vector &position) ; + + void selection_forward_loop( map > &transition_probabilites, vector &interploidy_transitions, selection &point, double &lnl, int ploidy_index, vector &position) ; + + double selection_forward_probabilities_genotypes( map > &transition_probabilites, vector &interploidy_transitions, selection &point, bool go_downstream, vector &genofreq , vector &position) ; + + + + + /// backward probs + vector betas ; + void compute_backward_probabilities( map > &transition_matrix, vector &interploidy_transitions ) ; + + /// combine probs + void combine_prob( vector &position, map > > &states, vector &chrom, 
bool output_pulses, vector &pulses ) ; + + /// output viterbi paths + void viterbi( vector &position, vector &recombination_rate, map > > &states, vector &chrom, map > &transition_probabilites, vector &interploidy_transitions, bool output_pulses, vector &pulses ) ; + +} ; + +#endif diff --git a/src/selection_nelder_mead.h b/src/selection_nelder_mead.h new file mode 100644 index 0000000..e8f0e8a --- /dev/null +++ b/src/selection_nelder_mead.h @@ -0,0 +1,346 @@ +#ifndef __SELECTION_NELDER_MEAD_H +#define __SELECTION_NELDER_MEAD_H + + +// The Nelder mead method is no longer used in AHMM-S +// This file is therefor not necessary and is included out of laziness + +// sorts vertex. can probably be optimized +void selection_sort_vertex( vector &v ) { + sort(v.begin(), v.end()); + reverse(v.begin(), v.end()); +} + +// generates random double between fmin and fmax +double double_rand(double fmin, double fmax) { + double f = (double)rand() / RAND_MAX; + return fmin + f * (fmax - fmin); +} + +/* +// calculates 2 vectors with transition rates going away from the site of interest +vector> selection_transition_rates(selection point, vector &recombination_rate, cmd_line &options) { + vector vecf ; + vector vecb ; + //split_vector(point.pos, recombination_rate, vecb, vecf, options) ; + + double m = options.ancestry_pulses[1].proportion; + int generations = options.ancestry_pulses[1].time ; + int n = options.ne ; /// DOUBLE CHECK HAPLOID/DIPLOID!! + int tt = 0; + + // generates vector with allele frequencies of selected allele over time + vector sel_traject ; + //double halfsel = 0.5 * point.sel; // test. 
remove + selection_trajectory(sel_traject, point.sel, tt, m, generations, n) ; // change tt + + //cerr << endl << "fwd_iter" << endl; + + //vector fwd_trans = fwd_iter(vecf, sel_traject, tt, m, generations, n) ; + //vector back_trans = fwd_iter(vecb, sel_traject, tt, m, generations, n) ; + + // generates two vectors of transition rates, both going away from the site of interest in different directions + + //cerr << "fwd_vector" << endl; + vector fwd_trans = fwd_iter(vecf, sel_traject, m, options.ne) ; //options.ne + //cerr << endl << "back_vector" << endl; + vector back_trans = fwd_iter(vecb, sel_traject, m, options.ne) ; + + vector> tr_vector; + tr_vector.push_back(fwd_trans); + tr_vector.push_back(back_trans); + return tr_vector; +} + + +// function for calculating likelihood of parameters +double selection_evaluate_point(selection &point, vector &markov_chain_information, map, double > > > > &transition_matrix_information, vector &recombination_rate, vector &position, cmd_line &options, map > > &state_changes ) { + cerr << "BP2: Before transition rates." << endl; + vector> t_rates = selection_transition_rates(point, recombination_rate, options); + //vector> t_rates = selection_transition_rates_genotypes(point, recombination_rate, options, position); // test. remove + + //cerr << "BP3: After transition rates." 
<< endl; + + double comb_lnl = 0; + bool go_backwards = false; + //go_backwards = true; + + for (int i=0 ; i < 2 ; i++) { + // transition matrix + map > transition_matrix ; + for ( int m = 0 ; m < markov_chain_information.size() ; m ++ ) { + selection_transition_matrix( transition_matrix, transition_matrix_information[markov_chain_information.at(m).number_chromosomes], recombination_rate, position, markov_chain_information.at(m).number_chromosomes, t_rates[i] ) ; + // Delete maybe + for ( int p = 0 ; p < markov_chain_information[m].ploidy_switch.size() ; p ++ ) { + selection_transition_matrix( transition_matrix, transition_matrix_information[markov_chain_information[m].ploidy_switch[p]], recombination_rate, position, markov_chain_information[m].ploidy_switch[p], t_rates[i] ) ; + } + } + //cerr << "BP4: After transition matrix." << endl; + /// compute transitions within a state + vector interploidy_transitions ; + //interploidy_transitions = create_interploidy_transitions( state_changes, vertex, options.ancestry_proportion ) ; + + /// now compute the forward probabilities + double lnl = 0 ; + cerr << "markov_chain_information.size() " << markov_chain_information.size() << endl; + for ( int m = 0 ; m < markov_chain_information.size() ; m ++ ) { + //cerr << "Sample#: " << m << endl; + lnl += markov_chain_information[m].selection_forward_probabilities( transition_matrix, interploidy_transitions, point, go_backwards ) ; + } + //cerr << "BP5: After compute forward. " << i << " " << lnl << endl; + comb_lnl += lnl; + go_backwards = true; + } + point.lnl = comb_lnl; + return comb_lnl ; + // forward probabilities + // other probabilities ?? +} + */ + + +// function for determining if point is within bounds +// iterates until it is. (Somewhat dangerous. 
May cause slow-down or infinite loop if peak is outsite of bounds) +void selection_check_point(selection &point, cmd_line &options) { + double pos_range_term = 0.1 ; + double sel_range_term = 0.05 ; + bool changed; + int reps = 0; + do { + changed = false; + if (point.pos > options.pos_max) { + cout << "pos too large: " << point.pos << "\t"; + point.pos = point.pos - ((double) rand() / (RAND_MAX))*pos_range_term * ( options.pos_max - options.pos_min ) ; + cout << point.pos << endl; + changed = true; + } + else if (point.pos < options.pos_min) + { + cout << "pos too small: " << point.pos << "\t"; + point.pos = point.pos + ((double) rand() / (RAND_MAX))*pos_range_term * ( options.pos_max - options.pos_min ) ; + cout << point.pos << endl; + changed = true; + } + + if (point.sel > options.sel_max) { + cout << "sel too large: " << point.sel << "\t"; + point.sel = point.sel - ((double) rand() / (RAND_MAX))*sel_range_term * ( options.sel_max - options.sel_min ) ; + cout << point.sel << endl; + changed = true; + } + else if (point.sel < options.sel_min) { + cout << "sel too small: " << point.sel << "\t"; + point.sel = point.sel + ((double) rand() / (RAND_MAX))*sel_range_term * ( options.sel_max - options.sel_min ) ; + cout << point.sel << endl; + changed = true; + } + if ( reps > 20) { + cout << "selection_check_point run more than 20 times. Breaking." << endl; + break; + } + reps++; + } while (changed == true); +} + +// function for reflecting, extending or contracting vertex. 
What it does depends on "mod" argument +selection selection_reflection(vector &vertex, double mod, vector &markov_chain_information, map, double > > > > &transition_matrix_information, vector &recombination_rate, vector &position, cmd_line &options, map > > &state_changes ) { + selection centroid; + selection newpoint; + + // calculate centroid point + centroid.sel = vertex[0].sel - (vertex[0].sel - vertex[1].sel)/2; + centroid.pos = vertex[0].pos - (vertex[0].pos - vertex[1].pos)/2; + + // calculate new point to be used + newpoint.sel = centroid.sel + mod*(centroid.sel - vertex[2].sel); + newpoint.pos = centroid.pos + mod*(centroid.pos - vertex[2].pos); + + // checks if new point is within bounds + selection_check_point(newpoint, options); + + // calculates likelihood + vector > split_vecs; + map > sel_trajectories; + selection_evaluate_point_genotypes( newpoint, markov_chain_information, transition_matrix_information, recombination_rate, position, options, state_changes, split_vecs, sel_trajectories ) ; + + return newpoint; +} + +// function for shinking vertex +vector selection_shrink(vector &vertex, double mod, vector &markov_chain_information, map, double > > > > &transition_matrix_information, vector &recombination_rate, vector &position, cmd_line &options, map > > &state_changes ) { + vector new_vertex; + new_vertex.push_back(vertex[0]); + + selection point1; + selection point2; + + point1.sel = vertex[0].sel - (vertex[0].sel - vertex[1].sel)/2; + point1.pos = vertex[0].pos - (vertex[0].pos - vertex[1].pos)/2; + + point2.sel = vertex[0].sel - (vertex[0].sel - vertex[2].sel)/2; + point2.pos = vertex[0].pos - (vertex[0].pos - vertex[2].pos)/2; + + selection_check_point(point1, options); + vector > split_vecs; + map > sel_trajectories; + selection_evaluate_point_genotypes( point1, markov_chain_information, transition_matrix_information, recombination_rate, position, options, state_changes, split_vecs, sel_trajectories ) ; + new_vertex.push_back(point1); + + 
selection_check_point(point2, options); + vector > split_vecs2; + map > sel_trajectories2; + selection_evaluate_point_genotypes( point2, markov_chain_information, transition_matrix_information, recombination_rate, position, options, state_changes, split_vecs2, sel_trajectories2 ) ; + new_vertex.push_back(point2); + + return new_vertex; +} + +// generates start vertex +vector selection_start_vertex(cmd_line &options) { + vector vertex; + + // Chromosomal position of first point in vertex is within 0.2 old_selection_start_vertex(cmd_line &options) { + vector vertex; + + for (int i; i < 3; i++) { + selection point; + point.pos = rand()%(options.pos_max-options.pos_min + 1) + options.pos_min; + point.sel = double_rand(options.sel_min, options.sel_max); + vertex.push_back(point); + } + return vertex; +} +*/ + + +// function for using Nelder-Mead to find optimal values for selection and position +selection selection_nelder_mead(cmd_line &options, vector &markov_chain_information, map, double > > > > &transition_matrix_information, vector &recombination_rate, vector &position, map > > &state_changes) { + + int r = 1; + + double alpha = 1 ; // for reflecting vertex + double gamma = 2 ; // for extension vertex + double rho = -0.5 ; // used for contracting vertex. 
Note negative value + double sigma = 0.5 ; // for shrinking vertex + + double best_lnl = -1.7976931348623157E+308; + selection best_optimum; + vector vertex; + + // loop to repeat nelder mead search 20 times (should be changed) + for (int reps=0 ; reps<20 ; reps++) { + + // generates random start vertex + vertex = selection_start_vertex(options); + best_optimum = vertex[0]; + + // calculates likelihood for start vertex + // Check parameters + for ( int v = 0 ; v < vertex.size() ; v ++ ) { + vector > split_vecs; + map > sel_trajectories; // WARNING: should probably only be defined once in the beginning of Nelder-Mead and then sent as an argument to all subfunctions + vertex[v].lnl = selection_evaluate_point_genotypes( vertex[v], markov_chain_information, transition_matrix_information, recombination_rate, position, options, state_changes, split_vecs, sel_trajectories ) ; + cout << vertex[v] << endl; + } + + // sorts start vertex + selection_sort_vertex(vertex) ; + + int iteration = 0 ; + + // loop for Nelder-Mead search. Breaks when the change between each iteration is smaller than a tolerance + while ( vertex[0].lnl-vertex.back().lnl > options.tolerance ) { + /*if (vertex[0].pos == vertex[1].pos && vertex[0].pos == vertex[2].pos) { + cout << "ABORTING. 
Degenerate position: " << vertex[0].pos << endl; + break; + }*/ + + cout << "Vertex: " << r << "\t" << iteration << "\t" << vertex[0].lnl-vertex.back().lnl << "\t" ; + iteration ++ ; + + cout << vertex[0] << "\t" << vertex[1] << "\t" << vertex[2] << "\t" << endl; + + // creates reflected vertex + selection reflection = selection_reflection(vertex, alpha, markov_chain_information, transition_matrix_information, recombination_rate, position, options, state_changes); + //cout << "R1" << endl; + + // checks new vertex and if an extension should be tried instead + if (reflection < vertex[0] && vertex[1] < reflection) { + vertex.back() = reflection ; + cout << "Reflection1" << endl; + selection_sort_vertex(vertex) ; + continue ; + } + else if (vertex[0] < reflection) { + selection extension = selection_reflection(vertex, gamma, markov_chain_information, transition_matrix_information, recombination_rate, position, options, state_changes); + if ( reflection < extension ) { + vertex.back() = extension ; + cout << "Extension" << endl; + } + else { + vertex.back() = reflection ; + cout << "Reflection2" << endl; + } + selection_sort_vertex(vertex) ; + continue; + } + + // if reflected or extended vertex is not good enough, tries contracting or shrinking vertex + selection contraction = selection_reflection(vertex, rho, markov_chain_information, transition_matrix_information, recombination_rate, position, options, state_changes); + + if (vertex.back() < contraction) { + vertex.back() = contraction ; + cout << "Contraction" << endl; + } + else { + vertex = selection_shrink(vertex, sigma, markov_chain_information, transition_matrix_information, recombination_rate, position, options, state_changes); + cout << "Shrink" << endl; + } + selection_sort_vertex(vertex) ; + } + + selection_sort_vertex(vertex) ; + + // outputs best point. 
This will be compared between each iteration of the N-M algorithm + if ( best_lnl < vertex[0].lnl ) { + best_optimum = vertex[0] ; + best_lnl = vertex[0].lnl ; + } + cout << "Max point: " << vertex[0] << " best_lnl: " << best_lnl << endl; + } + + cout << "END" << endl; + cout << vertex[0] << "\t" << vertex[1] << "\t" << vertex[2] << "\t" << endl; + return best_optimum; + +} + + +#endif \ No newline at end of file diff --git a/src/selection_optimize_test_func.h b/src/selection_optimize_test_func.h new file mode 100644 index 0000000..284b648 --- /dev/null +++ b/src/selection_optimize_test_func.h @@ -0,0 +1,21 @@ +#ifndef __OPTIMIZE_TEST_FUNC_H +#define __OPTIMIZE_TEST_FUNC_H + +double optimize_test_func(int x, double y) { + return 2000-1000*(pow(sin(x/1000), 10) + cos(10 + y * x/1000) * cos(x/1000)); + //2000-1000*(np.sin(x/1000) ** 10 + np.cos(10 + y * x/1000) * np.cos(x/1000)) + +} +double optimize_test_func2(int x, double y) { + double xx = x; + return 1000 - 50*(pow((xx/1000)-3,2) + pow(y-3,2)); +} + +double selection_evaluate_point2(selection &point, vector &markov_chain_information, map, double > > > > &transition_matrix_information, vector &recombination_rate, vector &position, cmd_line &options, map > > &state_changes ) { + //cout << "Evaluate point1: " << point<< endl; + point.lnl = optimize_test_func2(point.pos, point.sel); + //cout << "Evaluate point2: " << point<< endl; + return point.lnl; +} + +#endif \ No newline at end of file diff --git a/src/selection_print_usage.h b/src/selection_print_usage.h new file mode 100644 index 0000000..e8a8e03 --- /dev/null +++ b/src/selection_print_usage.h @@ -0,0 +1,66 @@ +#ifndef __SELECTION_PRINT_USAGE_H +#define __SELECTION_PRINT_USAGE_H + +void print_usage() { + + cerr << endl << endl << "ahmm-s usage:" << endl << endl ; + cerr << "\trequired:" << endl ; + cerr << "\t\t-i [string]\n\t\t\tinput file name" << endl ; + cerr << "\t\t-s [string]\n\t\t\tsample id and ploidy file" << endl ; + + cerr << "\t\t-p [int] [int] 
[float]" << endl ; + cerr << "\t\t\t ancestry pulse with format, ancestral population, time," << endl ; + cerr << "\t\t\t and proportion of final ancestry from this pulse" << endl ; + cerr << "\t\t--ne [int]\n\t\t\teffective population size of the admixed population" << endl ; + + cerr << "\n\tselect one of the following working modes:" << endl ; + + cerr << "\t\t--gss [int] [int] [int] [float] [float]" << endl ; + cerr << "\t\t\t golden section search for optimal selection coeffient at each site." << endl ; + cerr << "\t\t\t parameters: chromosomal position start, stop, step, selection coefficient start, stop" << endl ; + + cerr << "\t\t--grid [int] [int] [int] [float] [float] [float]" << endl ; + cerr << "\t\t\t calculate likelihood ratios in a grid." << endl ; + cerr << "\t\t\t parameters: chromosomal position start, stop, step, selection coefficient start, stop, step." << endl ; + + cerr << "\t\t--site [int] [float]" << endl ; + cerr << "\t\t\t calculate likelihood ratios for a single value of s at a single site." << endl ; + cerr << "\t\t\t parameters: chromosomal position, selective coeffient" << endl ; + + + cerr << "\n\toptional:" << endl ; + cerr << "\t\t--help\n\t\t\tprint this help statement" << endl ; + cerr << "\t\t-g\n\t\t\tsamples are specified with genotypes rather than read counts" << endl ; + + cerr << "\t\t--chr [string]" << endl ; + cerr << "\t\t\t specify chromosome that will be analyzed" << endl ; + cerr << "\t\t\t (only necessary when there are multiple chromosomes in input file)" << endl ; + cerr << "\t\t--chr_win [int] [int]" << endl ; + cerr << "\t\t\t limit region on chromosome that will be analyzed" << endl ; + + cerr << "\t\t--gss_precision [float]" << endl ; + cerr << "\t\t\t specify precision in finding optimal value of s using golden section search. 
default: 1e-5" << endl ; + cerr << "\t\t--unit_coords" << endl ; + cerr << "\t\t\t unit for start and stop position in grid and gss search can be defined as chromosome" << endl ; + cerr << "\t\t\t coordinates rather than as line number in input file. default off" << endl ; + cerr << "\t\t--window [string] [float]" << endl ; + cerr << "\t\t\t specify size of Markov chain in percent or Morgans." << endl ; + cerr << "\t\t\t \"p 10\" extends the markov chain 10% of chromosome length on each side of selected site." << endl ; + cerr << "\t\t\t \"m 0.1\" extends the windows 0.1 Morgan on each side of the selected site." << endl ; + cerr << "\t\t\t default: \"p 100\"" << endl ; + cerr << "\t\t--traj [int]" << endl ; + cerr << "\t\t\t change algorithm for generating selection trajectories." << endl ; + cerr << "\t\t\t 4: 4-point approximation, 3: 3-point approximation (legacy option, not recommended)." << endl ; + cerr << "\t\t\t default: forward iteration." << endl ; + cerr << "\t\t--stochastic" << endl ; + cerr << "\t\t\t enables the stochastic method for generation selection trajectory." << endl ; + cerr << "\t\t\t (Experimental. Slow. Use for small values of s.)" << endl ; + cerr << "\t\t--stochastic_reps [int]" << endl ; + cerr << "\t\t\t specifies number of simulations for the stochastic trajectory algorithm." << endl ; + cerr << "\t\t\t default: 10000" << endl ; + cerr << "\t\t--full_selection_space" << endl ; + cerr << "\t\t\t turns off optimization of the selection coeffient search space. 
(Experimental)" << endl ; +} + +#endif + diff --git a/src/selection_read_cmd_line.h b/src/selection_read_cmd_line.h new file mode 100644 index 0000000..ea2a11b --- /dev/null +++ b/src/selection_read_cmd_line.h @@ -0,0 +1,367 @@ +#ifndef __SELECTION_READ_CMD_LINE_H +#define __SELECTION_READ_CMD_LINE_H + +void cmd_line::read_cmd_line ( int argc, char *argv[] ) { + + ///defaults + ancestral_fixed = false ; /// set to true for qtl or experimental evolution application if ancestral genotypes are known and at fixed frequencies. + + /// ideally we recommend pruning LD in advance + minimum_distance = 0 ; /// minimum distance in morgans between sites to consider + ne = 2e4 ; /// actually 2ne + + // time params to bound our search + t_max = 10000 ; + t_min = 1 ; + p_max = 0.99999 ; + p_min = 0.00001 ; + t_length = 0.8 ; + p_length = 0.8 ; + + /// if set, we clear once + bool clear = false ; + + /// error rates + error_rates = false ; + + // the default behavior is a single pulse of ancestry 1 into ancestry 0 + ancestry_pulses.resize( 2 ) ; + ancestry_pulses[0].type = 0 ; + ancestry_pulses[1].type = 1 ; + + /// default is 50:50 with single pulse of 1 into 0 + ancestry_pulses[0].proportion = 0.5 ; + ancestry_pulses[1].proportion = 0.5 ; + ancestry_pulses[0].proportion_fixed = true ; + ancestry_pulses[1].proportion_fixed = true ; + + /// also the ancestry proportions are known + ancestry_proportion.assign(2,0.5) ; + + /// time is not fixed by default, pulse of 1 into 0 + /// does not matter, really since 0>1 would be identical in formulation + ancestry_pulses[0].time = 3000 ; + ancestry_pulses[0].time_fixed = true ; + ancestry_pulses[1].time = 10 ; + ancestry_pulses[1].time_fixed = false ; + + /// end parameter this will be in lnl units + /// i.e. 
must obtain <= this amount of improvement between all vertices to quit + tolerance = 1e-5 ; + + /// restart number + n_restarts = -1 ; + + /// per site per read error rate + error_rate = 0.01 ; + + /// genotype data rather than read data? + genotype = false ; + + // viterbi + viterbi = false ; + + /// output pulses rather than ancestry counts + output_pulses = true ; + + /// set output precision + precision = 10 ; + + /// sample file + sample_file = "null" ; + + // intput file + input_file = "null" ; + + /// bootstraps + n_bootstraps = 0 ; + block_size = 0 ; + + // selection + is_limit = false ; + calc_grid = false; + test_point = false; + is_coord = false; + + // if --chr_win is not set, read whole chromosome + limit_win_start = 0; + limit_win_end = 1000000000; + + win_unit = "p"; // set default window size unit to percent + win_percent = 100; // default window size in percent + //win_morgan = 0.1; // default window size in morgans + + // golden section search (gss) parameters + gs_precision = 1e-5; // minimum recision in estimation of selection coeffient in gss + gs_sstep = 0.001; + + // number of runs for the stochastic trajectory function + stochastic_reps = 1000; + + // optimizes upper bound for the selection coefficient search space. Makes things faster. + // Default true. 
Can be turned off with --full_selection_space + limit_sel_space = true; + + /// accept command line parameters + for (int i=1; i 0 ) { + new_ancestry_pulse.time_fixed = true ; + } + else { + new_ancestry_pulse.time = new_ancestry_pulse.time * -1 ; + new_ancestry_pulse.time_fixed = false ; + } + + // if proporion is set, we are not estimating it + ////// set proporiton with a negative number to provide the starting guess for this parameter + if ( new_ancestry_pulse.proportion > 0 ) { + new_ancestry_pulse.proportion_fixed = true ; + } + else { + new_ancestry_pulse.proportion_fixed = false ; + new_ancestry_pulse.proportion = -1 * new_ancestry_pulse.proportion ; + } + ancestry_pulses.push_back( new_ancestry_pulse ) ; + ancestry_pulses.back().entry_order = ancestry_pulses.size() - 1 ; + } + + + + + + //// for each ancestry type, set the total ancestry fraction + //// this must be set and equal to all the ancestry types listed above + if ( strcmp(argv[i],"-a") == 0 ) { + ancestry_proportion.clear() ; + int stop = atoi(argv[++i]) ; + float sum = 0 ; + for ( int l = 0 ; l < stop ; l ++ ) { + ancestry_proportion.push_back( atof(argv[++i]) ) ; + sum += ancestry_proportion.back() ; + } + + //// check that ancestry proportions sum to one + if ( sum < 0.9999 || sum > 1.0001 ) { + cerr << "\n\n\t\t ERROR: ancestry proportions must sum to one\n\n" ; + print_usage() ; + exit(1) ; + } + } + + if ( strcmp(argv[i],"--help") == 0 ) { + print_usage() ; + exit(0) ; + } + + if ( strcmp(argv[i],"-g") == 0 ) { + genotype = true ; + } + + if ( strcmp(argv[i],"--output-ancestry") == 0 ) { + output_pulses = false ; + } + if ( strcmp(argv[i],"--precision") == 0 ) { + precision = atoi(argv[++i]) ; + cout.precision(precision) ; + cerr.precision(precision) ; + } + + if ( strcmp(argv[i],"-v") == 0 ) { + viterbi = true ; + } + + if ( strcmp(argv[i],"-r") == 0 ) { + n_restarts = atoi(argv[++i]) ; + } + + /// bootstraps supplied as '-b + if ( strcmp(argv[i],"-b") == 0 ) { + n_bootstraps = 
atoi(argv[++i]) ; + block_size = atoi(argv[++i]) ; + } + + /// to bound possible pulse times + if ( strcmp(argv[i],"--tmax") == 0 ) { + t_max = atof(argv[++i]) ; + } + if ( strcmp(argv[i],"--tmin") == 0 ) { + t_min = atof(argv[++i]) ; + } + if ( strcmp(argv[i],"--pmin") == 0 ) { + p_min = atof(argv[++i]) ; + } + if ( strcmp(argv[i],"--pmax") == 0 ) { + p_max = atof(argv[++i]) ; + } + if ( strcmp(argv[i],"--tlength") == 0 ) { + t_length = atof(argv[++i]) ; + } + if ( strcmp(argv[i],"--plength") == 0 ) { + p_length = atof(argv[++i]) ; + } + if ( strcmp(argv[i],"--tolerance") == 0 ) { + tolerance = atof(argv[++i]) ; + } + if ( strcmp(argv[i], "-e" ) == 0 ) { + error_rate = atof(argv[++i]) ; + } + if ( strcmp(argv[i], "-E" ) == 0 ) { + error_rates = true ; + } + if ( strcmp(argv[i],"--ne") == 0 ) { + ne = 2 * atof(argv[++i]) ; + } + + /// this version will allow inputting all samples in a single file with separate posterior output files + if ( strcmp(argv[i],"-i") == 0 ) { + input_file = string(argv[++i]) ; + } + + /// sample file + if ( strcmp(argv[i],"-s") == 0 ) { + sample_file = string(argv[++i]) ; + } + + if ( strcmp(argv[i],"-d") == 0 ) { + minimum_distance = atof(argv[++i]) ; + } + if ( strcmp(argv[i],"--fix") == 0 ) { + ancestral_fixed = true ; + } + + + + ///// Adaptive introgression stuff below + + + /// activate selection detection module + /// uses the following format -j chromosome_of_interest (str) site_of_interest start_window (int) + /// window_start (int) window_end (int) + if ( strcmp(argv[i],"--chr") == 0 ) { + is_limit = true ; + limit_chr = string(argv[++i]) ; + } + + if ( strcmp(argv[i],"--chr_win") == 0 ) { + limit_win_start = atoi(argv[++i]) ; + limit_win_end = atoi(argv[++i]) ; + + cerr << endl << limit_chr << "\t" << limit_win_start << "\t" << limit_win_end << "\t" << endl ; + + /// check if win_start < win_end and site is located within window + if ( limit_win_end <= limit_win_start ) { + cerr << "\n\n\t\t ERROR: formatting for window is 
wrong\n\n" ; + print_usage() ; + exit(1) ; + } + } + + if ( strcmp(argv[i],"--grid") == 0 ) { + calc_grid = true; + grid_pstart = atoi(argv[++i]); + grid_pstop = atoi(argv[++i]); + grid_pstep = atoi(argv[++i]); + grid_sstart = atof(argv[++i]); + grid_sstop = atof(argv[++i]); + grid_sstep = atof(argv[++i]); + } + + if ( strcmp(argv[i],"--gss") == 0 ) { + run_gss = true; + gs_pstart = atoi(argv[++i]); + gs_pstop = atoi(argv[++i]); + gs_pstep = atoi(argv[++i]); + gs_sstart = atof(argv[++i]); + gs_sstop = atof(argv[++i]); + } + + if ( strcmp(argv[i],"--gss_precision") == 0 ) { + gs_precision = atof(argv[++i]); + } + + if ( strcmp(argv[i], "--unit_coords" ) == 0 ) { + is_coord = true ; + } + + if ( strcmp(argv[i],"--site") == 0 ) { + test_point = true; + test_pos = atoi(argv[++i]); + test_sel = atof(argv[++i]); + } + + // control window size for selection + if ( strcmp(argv[i],"--window") == 0 ) { + win_unit = string(argv[++i]); + + if ( win_unit == "m") { + win_morgan = atof(argv[++i]); + if (win_morgan <= 0) { + cerr << "\n\n\t\tERROR: windows size has to be specified with positive value.\n\n" ; + exit(1) ; + } + } + else if (win_unit == "p") { + win_percent = atof(argv[++i]); + if (win_percent <= 0 || win_percent > 100) { + cerr << "\n\n\t\tERROR: windows size has to be specified in percent (1-100).\n\n" ; + exit(1) ; + } + } + else { + cerr << "\n\n\t\tERROR: wrong unit for window size.\n\n" ; + print_usage() ; + exit(1) ; + } + } + + if ( strcmp(argv[i],"--traj") == 0 ) { + traj_function = atoi(argv[++i]) ; + } + + if ( strcmp(argv[i],"--stochastic") == 0 ) { + use_stochastic = true; + } + + if ( strcmp(argv[i],"--stochastic_reps") == 0 ) { + stochastic_reps = atoi(argv[++i]) ; + } + + if ( strcmp(argv[i],"--full_selection_space") == 0 ) { + limit_sel_space = false; + } + } + + if ( input_file == "null" ) { + cerr << "\n\n\t\tERROR: must provide input file\n\n\t\t\t-i [path/to/input_file]\n\n" ; + print_usage() ; + exit(1) ; + } + if ( sample_file == "null" ) { + 
cerr << "\n\n\t\tERROR: must provide sample file\n\n\t\t\t-s [path/to/sample_file]\n\n" ; + print_usage() ; + exit(1) ; + } + + return ; +} + +#endif + diff --git a/src/selection_read_input.h b/src/selection_read_input.h new file mode 100644 index 0000000..d96ca8e --- /dev/null +++ b/src/selection_read_input.h @@ -0,0 +1,162 @@ +#ifndef __SELECTION_READ_INPUT_H +#define __SELECTION_READ_INPUT_H + +void read_file ( cmd_line &options, vector &markov_chain_information, map > > &state_list, vector &position, vector &recombination_rate, vector &chromosomes, int &sel_pos ) { + + /// vector to hold index of inbred path if we have variable ploidy + vector path_index( markov_chain_information.size(), 0 ) ; + + /// stream in file + ifstream in ( options.input_file.c_str() ) ; + + //// since the first site transition matrix does not matter, we can print anything + double extra_recombination = 1 ; + string last_chrom = "" ; + + /// iterator to find selected_site + int ipos = 0; + + while( !in.eof() ) { + + input_line new_line ; + in >> new_line.chrom >> new_line.pos ; + + + /// Limits the genomic region to be parsed ifa range of interest is specified + ///(so far only for selection analysis) + if (options.is_limit == true) { + if ( new_line.chrom != options.limit_chr) { + getline( in, new_line.chrom ) ; + continue ; + } + if ( new_line.pos < options.limit_win_start) { + getline( in, new_line.chrom ) ; + continue ; + } + if ( new_line.pos > options.limit_win_end) { + break ; + } + } + + + /// if two adjacent sites have the same positions, skip second + if ( ( position.size() > 0 && new_line.pos == position.back() ) ) { + getline( in, new_line.chrom ) ; + continue ; + } + + // read reference panel genotype counts + new_line.reference_counts.resize( options.ancestry_proportion.size() ) ; + int count = 0 ; + for ( int p = 0 ; p < options.ancestry_proportion.size() ; p ++ ) { + double count1, count2 ; + in >> count1 >> count2 ; + new_line.reference_counts[p].push_back(count1) ; + 
new_line.reference_counts[p].push_back(count2) ; + new_line.reference_counts[p].push_back(count1+count2) ; + } + + /// read recombination rate + in >> new_line.recombination_rate ; + + /// if line specific error rates are provided + new_line.error_1 = options.error_rate ; + new_line.error_2 = options.error_rate ; + if ( options.error_rates == true ) { + in >> new_line.error_1 >> new_line.error_2 ; + } + // read sample panel read counts + new_line.sample_counts.resize( markov_chain_information.size() ) ; + for ( int m = 0 ; m < markov_chain_information.size() ; m ++ ) { + double count1, count2 ; + in >> count1 >> count2 ; + + /// subsample reads to a maximum depth so we can compute multinomial probs without overflow errors + if ( count1 + count2 > 170 ) { + subsample_reads( count1, count2 ) ; + } + + /// now store counts and total for sample + new_line.sample_counts[m].push_back(count1) ; + new_line.sample_counts[m].push_back(count2) ; + new_line.sample_counts[m].push_back(count1+count2) ; + } + + if ( new_line.chrom != last_chrom ) { + recombination_rate.push_back( 0.5 ) ; + last_chrom = new_line.chrom ; + extra_recombination = 0 ; + } + + /// ignore lines where recombination may not be suffiicent to make sites independent + /// this might be useful in place of LD pruning + else { + extra_recombination += new_line.recombination_rate ; + if ( extra_recombination < options.minimum_distance ) { + continue ; + } + new_line.recombination_rate = extra_recombination ; + extra_recombination = 0 ; + + /// Changing recombination rate to no longer be specified per basepair. 
JS + recombination_rate.push_back( new_line.recombination_rate ) ; + } + + ipos++ ; + + /// record position + position.push_back( new_line.pos ) ; + chromosomes.push_back( new_line.chrom ) ; + + /// check all path indexes and update as needed + for ( int m = 0 ; m < markov_chain_information.size() ; m ++ ) { + + if ( markov_chain_information[m].path_file != "null" ) { + + /// record previous ploidy + int previous_ploidy = markov_chain_information[m].sample_ploidy_path[path_index[m]].ploidy ; + + /// check to make sure we're on the right ploidy tract + while ( new_line.chrom != markov_chain_information[m].sample_ploidy_path[path_index[m]].chrom ) { + path_index[m] ++ ; + } + while ( new_line.pos > markov_chain_information[m].sample_ploidy_path[path_index[m]].stop ) { + path_index[m] ++ ; + } + + /// record switches + if ( previous_ploidy != markov_chain_information[m].sample_ploidy_path[path_index[m]].ploidy ) { + + markov_chain_information[m].ploidy_switch_position.push_back( position.size() - 1 ) ; + markov_chain_information[m].ploidy_switch.push_back( markov_chain_information[m].sample_ploidy_path[path_index[m]].ploidy ) ; + } + } + } + /// + if ( options.genotype == false ) { + for ( int m = 0 ; m < markov_chain_information.size() ; m ++ ) { + vec emissions ; + create_emissions_matrix( markov_chain_information[m].sample_ploidy_path[path_index[m]].ploidy, new_line, options.ancestral_fixed, state_list[markov_chain_information.at(m).sample_ploidy_path[path_index[m]].ploidy], m, options.ancestry_pulses, emissions ) ; + markov_chain_information[m].emission_probabilities.push_back( emissions ) ; + } + } + + /// create emissions matrix with genotypes + else { + for ( int m = 0 ; m < markov_chain_information.size() ; m ++ ) { + vec emissions ; + create_emissions_matrix_genotype( markov_chain_information[m].sample_ploidy_path[path_index[m]].ploidy, new_line, options.ancestral_fixed, state_list[markov_chain_information.at(m).sample_ploidy_path[path_index[m]].ploidy], m, 
options.ancestry_pulses, emissions ) ; + markov_chain_information[m].emission_probabilities.push_back( emissions ) ; + } + } + } + + + /// to avoid lookahead errors + for ( int m = 0 ; m < markov_chain_information.size() ; m ++ ) { + markov_chain_information[m].ploidy_switch_position.push_back( position.size() ) ; + } +} + +#endif + diff --git a/src/selection_split_vector.h b/src/selection_split_vector.h new file mode 100644 index 0000000..7326f45 --- /dev/null +++ b/src/selection_split_vector.h @@ -0,0 +1,76 @@ +#ifndef __SPLIT_VECTOR_H +#define __SPLIT_VECTOR_H + +/// splits vector (of recombination rates) into two vectors at the selected site. +/// the back vector is generated in reverse order + +// min function because of namespace collision between std and arma +int int_min(int a, int b){ + int minout; + if (a < b) { + minout = a; + } + else { + minout = b; + } + return minout; +} + +// splits a vector of chromosomal positions into two vectors going away from a focal site. +// one vector is reversed going back to 0 +// also trims the hmm window used +vector > split_vector(int sel_site, vector &whole_vec, cmd_line &options) +{ + vector > split_vecs; + vector fwd_vec; + vector back_vec; + + // trim vector if size is specified in morgans + if (options.win_unit == "m") { + double sum_morgans_fwd; + double sum_morgans_back; + for (int i = sel_site; i < whole_vec.size(); i++) { + sum_morgans_fwd += whole_vec[i]; + if (sum_morgans_fwd <= options.win_morgan) { + fwd_vec.push_back(whole_vec[i]) ; + } + else { + break; + } + } + for (int i = sel_site; i > 0; i--) { + sum_morgans_back += whole_vec[i]; + if (sum_morgans_back <= options.win_morgan) { + back_vec.push_back(whole_vec[i]) ; + } + else { + break; + } + } + } + + // trim vector if size is specified in percent + else if (options.win_unit == "p") { + int percent_size = whole_vec.size() * (options.win_percent/100); + int trim_size_fwd = int_min((whole_vec.size() - sel_site) , percent_size); // Check for off by 1 
error + int trim_size_back = int_min(sel_site, percent_size); + + for (int i = sel_site; i < (sel_site + trim_size_fwd); i++) { + fwd_vec.push_back(whole_vec[i]) ; + } + + for (int i = sel_site; i > (sel_site - trim_size_back); i--) { + back_vec.push_back(whole_vec[i]) ; + } + + } + + split_vecs.push_back(fwd_vec); + split_vecs.push_back(back_vec); + return split_vecs; + //cout << "Split vector lengths: " << fwd_vec.size() << ", " << back_vec.size() << endl; +} + + + +#endif \ No newline at end of file diff --git a/src/selection_stochastic_traj.h b/src/selection_stochastic_traj.h new file mode 100644 index 0000000..0f26901 --- /dev/null +++ b/src/selection_stochastic_traj.h @@ -0,0 +1,66 @@ +#ifndef __SELECTION_STOCHASTIC_TRAJECTORY_H +#define __SELECTION_STOCHASTIC_TRAJECTORY_H + + +// generates a frequency trajectory of the selected site using the stochastic method +// the stochastic method will generate many random trajectories and calculate and return the average trajectory +void selection_stochastic_trajectory(vector &trajectory, double s, double m, int generations, int ne, int reps) +{ + vector traj_sum(generations+1,0); //// +1?????? 
+ default_random_engine rand_gen; + + double fixed_freq; + double n_sel; + double n_nonsel; + int fix_gen = 0; + double ns_fit; + double nns_fit; + double ns_freq; + double new_ns; + + bool not_lost_fixed = true; + int rcount = 0; + + + while (rcount < reps) { + n_sel = ne * m; + n_nonsel = ne * (1 - m); + + fixed_freq = 0; + vector traj(generations+1,0); + traj[0] += m; + + for (int g = 1; g <= generations; g++) { //// not sure about start and end + ns_fit = n_sel * (1 + s); + nns_fit = n_nonsel; + ns_freq = ns_fit / (ns_fit + nns_fit); + + if ( ns_freq == 0.0) { + not_lost_fixed = false; + break; + } + + binomial_distribution<> repopulate(ne, ns_freq); + new_ns = repopulate(rand_gen); + traj[g] = new_ns/ne; + + n_sel = new_ns; + n_nonsel = ne - new_ns; + } + + if (not_lost_fixed = true) { + for (int f = 0; f < traj.size(); f++) { + traj_sum[f] += traj[f]; + } + rcount++; + } + not_lost_fixed = true; + } + + for (int i = 0; i < traj_sum.size(); i++) { + trajectory.push_back(traj_sum[i]/reps); + } + +} + +#endif \ No newline at end of file diff --git a/src/selection_subsample.h b/src/selection_subsample.h new file mode 100644 index 0000000..8204acd --- /dev/null +++ b/src/selection_subsample.h @@ -0,0 +1,16 @@ +#ifndef __SUBSAMPLE_H +#define __SUBSAMPLE_H + +void subsample_reads ( double &c1, double &c2 ) { + while ( c1 + c2 > 170 ) { + double r = ((double) rand() / (RAND_MAX)) ; + if ( r < c1/(c1+c2) ) { + c1 -- ; + } + else { + c2 -- ; + } + } +} + +#endif diff --git a/src/selection_trajectory.h b/src/selection_trajectory.h new file mode 100644 index 0000000..f0aa072 --- /dev/null +++ b/src/selection_trajectory.h @@ -0,0 +1,84 @@ +#ifndef __SELECTION_TRAJECTORY_H +#define __SELECTION_TRAJECTORY_H + + +// generates vector with allele frequency of selected allele over time/generations +void selection_trajectory(vector &freq, double s, int tt, double m, int generations, int n) +{ + // returns flat vector if selection is 0 (ie, no change in ellele frequency over 
time) + if ( s == 0) { + freq.assign(generations,m); + return; + } + + int t0 = 0; + int t0min = 0; + bool found = false; + double f; + + // loops over generations until initial frequency (m) is reached + number of generations (gen) has passed + while (found == false) { + f = 1 / (1 + 2 * n * s * exp(-s*t0)); + if (f > m) { + if (tt == 0) { + tt = t0; + } + else if (t0 == (tt + generations)) { + found = true; + } + freq.push_back(f); + } + t0++; + } + +} + +// checks if selective coeffient causes site to go to fixation in the time since introgression +bool selection_reaches_fixation(double s, double m, int generations, int n) +{ + double max_freq = 0.99; + int tt = 0; + int t0 = 0; + int t0min = 0; + bool found = false; + double f; + + s *= 0.5; + + // loops over generations until initial frequency (m) is reached + number of generations (gen) has passed + while (found == false) { + f = 1 / (1 + 2 * n * s * exp(-s*t0)); + if (f > max_freq) { + cerr << "Frequency " << f << " generation " << t0-tt << " selection " << s < m) { + if (tt == 0) { + tt = t0; + } + else if (t0 == (tt + generations)) { + break; + } + } + t0++; + } + return found; +} + +// returns selection coeffient that reaches 0.99 in the time since introgression +// used to speed up calculations and prevent division errors +double selection_get_max_sel(double min_s, double max_s, double step_s, double m, int generations, int n) +{ + cerr << "selection_get_max_sel " << min_s << " " << max_s << " " << step_s << " " << m << " " << generations << " " << n << endl; + double last_s = 0; + for (double s = min_s; s <= max_s; s += step_s) { + if (selection_reaches_fixation(s, m, generations, n) == true) { + return last_s; + } + last_s = s; + } + return max_s; +} + +#endif diff --git a/src/selection_transition_rates.h b/src/selection_transition_rates.h new file mode 100644 index 0000000..2fb643c --- /dev/null +++ b/src/selection_transition_rates.h @@ -0,0 +1,314 @@ +#ifndef __SELECTION_TRANSITION_RATES_H +#define 
__SELECTION_TRANSITION_RATES_H + + +// generates transition matrix. +void selection_transition_matrix(map > &transition_matrix , vector, double > > > &transition_info, vector &recombination_rate, vector &positions, double &number_chromosomes, vector &transition_rates ) { + + /// check if we already computed this for this sample ploidy + if ( transition_matrix.find( number_chromosomes ) != transition_matrix.end() ) { + return ; + } + + /// else, have to create entire matrix + /// first create data object of approporate size + //transition_matrix[number_chromosomes].resize(recombination_rate.size()) ; + transition_matrix[number_chromosomes].resize(transition_rates.size()) ; + + //// iterate across all positions and compute transition matrixes + for ( int p = 0 ; p < transition_rates.size() ; p ++ ) { + + /// create actual transition matrix + transition_matrix[number_chromosomes][p].set_size( transition_info.size(), transition_info.size() ) ; + transition_matrix[number_chromosomes][p].fill( 0 ) ; + + /// population transitions by summing across all routes + for ( int i = 0 ; i < transition_info.size() ; i ++ ) { + for ( int j = 0 ; j < transition_info[i].size() ; j ++ ) { + for ( std::map,double>::iterator t = transition_info[i][j].begin() ; t != transition_info[i][j].end() ; ++ t ) { + double prob_t = 1 ; + for ( int r = 0 ; r < t->first.size() ; r ++ ) { + prob_t *= pow( transition_rates[p](t->first[r].start_state,t->first[r].end_state), t->first[r].transition_count ) ; + } + transition_matrix[number_chromosomes][p](j,i) += prob_t * t->second ; + } + } + } + } +} + +// as above, but to use when printing expected genotype frequences across the chromosome +vector > selection_transition_rates_genotypes(selection point, vector &recombination_rate, cmd_line &options, vector &position, vector < vector > &genofreqs, vector > &split_vecs, map > &sel_trajectories) { + + // The selection coefficient is halved, since the model uses the haploid coefficient, + // but the user 
specifies the diploid coefficient from the command line + point.sel = 0.5 * point.sel; + + // splits the chromosome into two vectors and trims them based on window size + if (split_vecs.size() == 0) { + split_vecs = split_vector(point.pos, recombination_rate, options) ; + } + + double m = options.ancestry_pulses[1].proportion; + int generations = options.ancestry_pulses[1].time ; + int n = options.ne ; /// DOUBLE CHECK HAPLOID/DIPLOID!! + int tt = 0; + + // generates vector with allele frequencies of selected allele over time + vector sel_traject ; + map >::iterator it; + it = sel_trajectories.find(point.sel); + + if (options.use_stochastic == true) { + if (it == sel_trajectories.end()) { + selection_stochastic_trajectory(sel_traject, point.sel, m, generations, n, options.stochastic_reps) ; // change tt + sel_trajectories[point.sel] = sel_traject; + } + else { + sel_traject = it->second; + } + } + else { + if (it == sel_trajectories.end()) { + selection_trajectory(sel_traject, point.sel, tt, m, generations, n) ; // change tt + sel_trajectories[point.sel] = sel_traject; + } + else { + sel_traject = it->second; + } + } + + + vector gf1; + vector gf2; + genofreqs.push_back(gf1); + genofreqs.push_back(gf2); + + vector fwd_trans; + vector back_trans; + + // checks which trajectory function to use (4point, 3point or forward iteration) + if (options.traj_function == 4) { + if (point.sel == 0.0) { + fwd_trans = fwd_iter_genotype_freq(split_vecs[0], sel_traject, m, options.ne, genofreqs[0]) ; //options.ne + back_trans = fwd_iter_genotype_freq(split_vecs[1], sel_traject, m, options.ne, genofreqs[1]) ; + } + else { + genofreqs[0].push_back(sel_traject.back()); + genofreqs[1].push_back(sel_traject.back()); + fwd_trans = approx_curve(split_vecs[0], sel_traject, m) ; //options.ne + back_trans = approx_curve(split_vecs[1], sel_traject, m) ; + } + } + else if (options.traj_function == 3) { + genofreqs[0].push_back(sel_traject.back()); + genofreqs[1].push_back(sel_traject.back()); 
+ fwd_trans = approx_curve_3point(split_vecs[0], sel_traject, m) ; //options.ne + back_trans = approx_curve_3point(split_vecs[1], sel_traject, m) ; + } + else { + fwd_trans = fwd_iter_genotype_freq(split_vecs[0], sel_traject, m, options.ne, genofreqs[0]) ; //options.ne + back_trans = fwd_iter_genotype_freq(split_vecs[1], sel_traject, m, options.ne, genofreqs[1]) ; + } + + vector > tr_vector; + tr_vector.push_back(fwd_trans); + tr_vector.push_back(back_trans); + return tr_vector; +} + +double selection_evaluate_point_genotypes(selection &point, vector &markov_chain_information, map, double > > > > &transition_matrix_information, vector &recombination_rate, vector &position, cmd_line &options, map > > &state_changes, vector > &split_vecs, map > &sel_trajectories) { + + vector < vector > genofreqs ; + + vector > t_rates = selection_transition_rates_genotypes(point, recombination_rate, options, position, genofreqs, split_vecs, sel_trajectories); // test. remove + + double comb_lnl = 0; + bool go_backwards = false; + + for (int i=0 ; i < 2 ; i++) { + // transition matrix + map > transition_matrix ; + for ( int m = 0 ; m < markov_chain_information.size() ; m ++ ) { + selection_transition_matrix( transition_matrix, transition_matrix_information[markov_chain_information.at(m).number_chromosomes], recombination_rate, position, markov_chain_information.at(m).number_chromosomes, t_rates[i] ) ; + // Delete maybe + for ( int p = 0 ; p < markov_chain_information[m].ploidy_switch.size() ; p ++ ) { + selection_transition_matrix( transition_matrix, transition_matrix_information[markov_chain_information[m].ploidy_switch[p]], recombination_rate, position, markov_chain_information[m].ploidy_switch[p], t_rates[i] ) ; + } + } + /// compute transitions within a state + vector interploidy_transitions ; + //interploidy_transitions = create_interploidy_transitions( state_changes, vertex, options.ancestry_proportion ) ; + + /// now compute the forward probabilities + double lnl = 0 ; + for ( 
int m = 0 ; m < markov_chain_information.size() ; m ++ ) { + lnl += markov_chain_information[m].selection_forward_probabilities_genotypes( transition_matrix, interploidy_transitions, point, go_backwards, genofreqs[i], position ) ; + } + comb_lnl += lnl; + go_backwards = true; + } + + // tries to handle the the situation where the likelihood is undefined = nan + if ( options.limit_sel_space == false ) { + if (isnan(comb_lnl) == true ) { + comb_lnl = -1000000000; + } + } + point.lnl = comb_lnl; + return comb_lnl ; +} + +// function for calculating likelihoods in a grid +// takes start, stop and step values for selection and position +void selection_grid(int p_start, int p_stop, int p_step, double s_start, double s_stop, double s_step, vector &markov_chain_information, map, double > > > > &transition_matrix_information, vector &recombination_rate, vector &position, cmd_line &options, map > > &state_changes ) { + + map > sel_trajectories; + + // check if limits are specified in chromosome coordinates or per site (ie from 1000bp to 2000bp or SNP #1 to SNP #10) + if (options.is_coord == true) { + int p_start = get_position(options.grid_pstart, position); + int p_stop = get_position(options.grid_pstop, position); + + cerr << "p_start " << p_start << "p_stop " << p_stop << "p_step " << p_step << endl; + + if (p_start == -1) { + cerr << "ERROR: specified start coordinate for grid not found on chromosome" << endl; + exit(1); + } + if (p_stop == -1) { + cerr << "ERROR: specified STOP coordinate for Golden section search not found on chromosome" << endl; + exit(1); + } + } + if ( p_start > p_stop ) { + cerr << "ERROR: specified stop coordinate for grid is located before start coordinate." 
<< endl; + exit(1); + } + + + // loop over all sites in specified region with p_step steps + for (int p = p_start; p < p_stop; p+=p_step) { + + vector > split_vecs; + + // generate neutral transition rate for normalization / calculating likelihood ratio + selection point0; + point0.pos = p; + point0.sel = 0; + selection_evaluate_point_genotypes( point0, markov_chain_information, transition_matrix_information, recombination_rate, position, options, state_changes, split_vecs, sel_trajectories ) ; + + // loop over selective coeffients + for (double s = s_start; s < s_stop; s=s+s_step) { + selection point; + point.pos = p; + point.sel = s; + selection_evaluate_point_genotypes( point, markov_chain_information, transition_matrix_information, recombination_rate, position, options, state_changes, split_vecs, sel_trajectories ) ; + + cout << position[point.pos] << "\t" << point.sel << "\t" << setprecision(12) << point.lnl-point0.lnl << endl; + } + } +} + + +void selection_golden_section(vector &markov_chain_information, map, double > > > > &transition_matrix_information, vector &recombination_rate, vector &position, cmd_line &options, map > > &state_changes) { + map > sel_trajectories; + double GR = (sqrt(5) + 1) / 2; + + int pstart; + int pstop; + + double gs_start = options.gs_sstart; + double gs_stop = options.gs_sstop; + + // check if using full selection space, if not use s that reaches 0.99 frequency in specified number of generations + if ( options.limit_sel_space == true ) { + gs_stop = selection_get_max_sel(options.gs_sstart, options.gs_sstop, options.gs_sstep, options.ancestry_pulses[1].proportion, options.ancestry_pulses[1].time, options.ne); + } + cerr << "Golden section search search. 
Likelihood calculated for values of selection between " << gs_start << " and " << gs_stop << endl; + + // check which coordinate format is used, and convert bp to snp number + if (options.is_coord == true) { + pstart = get_position(options.gs_pstart, position); + pstop = get_position(options.gs_pstop, position); + + if (pstart == -1) { + cerr << "ERROR: specified START coordinate for Golden section search not found on chromosome" << endl; + exit(1); + } + if (pstop == -1) { + cerr << "ERROR: specified STOP coordinate for Golden section search not found on chromosome" << endl; + exit(1); + } + } + + else { + pstart = options.gs_pstart; + pstop = options.gs_pstop; + } + + if ( pstart > pstop ) { + cerr << "ERROR: specified STOP coordinate for Golden section search is located before START coordinate." << endl; + exit(1); + } + + // Golden section searchj + for (int p = pstart; p < pstop; p+=options.gs_pstep) { + vector > split_vecs; + + // calculate likelihood for neutral case + selection point0; + point0.pos = p; + point0.sel = 0; + selection_evaluate_point_genotypes( point0, markov_chain_information, transition_matrix_information, recombination_rate, position, options, state_changes, split_vecs, sel_trajectories ) ; + + // do golden section search + selection point1; + selection point2; + selection point3; + selection point4; + + point1.pos = p; + point2.pos = p; + point3.pos = p; + point4.pos = p; + + point1.sel = gs_start; + point2.sel = gs_stop; + point3.sel = point2.sel - (point2.sel - point1.sel) / GR; + point4.sel = point1.sel + (point2.sel - point1.sel) / GR; + + selection_evaluate_point_genotypes( point1, markov_chain_information, transition_matrix_information, recombination_rate, position, options, state_changes, split_vecs, sel_trajectories ) ; + selection_evaluate_point_genotypes( point2, markov_chain_information, transition_matrix_information, recombination_rate, position, options, state_changes, split_vecs, sel_trajectories ) ; + 
selection_evaluate_point_genotypes( point3, markov_chain_information, transition_matrix_information, recombination_rate, position, options, state_changes, split_vecs, sel_trajectories ) ; + selection_evaluate_point_genotypes( point4, markov_chain_information, transition_matrix_information, recombination_rate, position, options, state_changes, split_vecs, sel_trajectories ) ; + + int i = 0; + + while (abs(point3.sel - point4.sel) > options.gs_precision) { + if (point3.lnl > point4.lnl) { + point2 = point4; + point4 = point3; + point3.sel = point2.sel - (point2.sel - point1.sel) / GR; + selection_evaluate_point_genotypes( point3, markov_chain_information, transition_matrix_information, recombination_rate, position, options, state_changes, split_vecs, sel_trajectories ) ; + } + else { + point1 = point3; + point3 = point4; + point4.sel = point1.sel + (point2.sel - point1.sel) / GR; + selection_evaluate_point_genotypes( point4, markov_chain_information, transition_matrix_information, recombination_rate, position, options, state_changes, split_vecs, sel_trajectories ) ; + } + i++; + } + + // verbose output. used for debugging + //cout << position[point0.pos] << "\t" << (point3.sel+point4.sel)/2 << "\t" << setprecision(12) << ((point3.lnl+point4.lnl)/2)-point0.lnl << "\t" << i << "\t" << point3.lnl << "\t" << point4.lnl << "\t" << (point3.lnl+point4.lnl)/2 << "\t" << point0.lnl << endl; + + // cout << position[point0.pos] << "\t" << (point3.sel+point4.sel)/2 << "\t" << setprecision(12) << ((point3.lnl+point4.lnl)/2)-point0.lnl << "\tsel_trajectories.size() " << sel_trajectories.size() << endl; + + cout << position[point0.pos] << "\t" << (point3.sel+point4.sel)/2 << "\t" << setprecision(12) << ((point3.lnl+point4.lnl)/2)-point0.lnl << endl; + } +} + + +#endif \ No newline at end of file