diff --git a/CAI/CAI.py b/CAI/CAI.py index 98f0f6c..c1a0c6c 100644 --- a/CAI/CAI.py +++ b/CAI/CAI.py @@ -1,5 +1,5 @@ from itertools import chain -from . import genetic_codes +from genetic_codes import genetic_codes from scipy.stats.mstats import gmean def _synonymous_codons(genetic_code_dict): @@ -31,12 +31,12 @@ def RSCU(sequences, genetic_code=1): if len(sequence) % 3 != 0: raise ValueError("Input sequence not divisible by three") if len(sequence) == 0: - raise ValueError("Cannot include empty sequence in input") + raise ValueError("Input sequence cannot be empty") # count the number of each codon in the sequences sequences = [[sequence[i:i+3].upper() for i in range(0, len(sequence), 3)] for sequence in sequences] - codons = list(chain.from_iterable(sequences)) - counts = {i: codons.count(i) for i in set(genetic_code.keys())} + codons = list(chain.from_iterable(sequences)) # flat list of all codons (to be used for counting) + counts = {i: codons.count(i) for i in genetic_code.keys()} # "if a certain codon is never used in the reference set... 
assign [it] a value of 0.5" (page 1285) for codon in counts: @@ -103,9 +103,12 @@ def CAI(sequence, weights=[], RSCUs=[], sequences=[], genetic_code=1): # determine the synonymous codons in the genetic code synonymous_codons = _synonymous_codons(genetic_codes[genetic_code]) + # find codons without synonyms + non_synonymous_codons = [codon for codon in synonymous_codons.keys() if len(synonymous_codons[codon]) == 1] + # create a list of the weights for the sequqence, not counting codons without synonyms (page 1285) try: - sequence_weights = [weights[codon] for codon in sequence if (len(synonymous_codons[codon]) != 1)] + sequence_weights = [weights[codon] for codon in sequence if codon not in non_synonymous_codons] except KeyError, e: raise KeyError("Bad weights dictionary passed: missing weight for codon " + str(e)) diff --git a/README.md b/README.md index a63ac98..c8c6029 100644 --- a/README.md +++ b/README.md @@ -7,6 +7,8 @@ An implementation of Sharp and Li's 1987 formulation of the codon adaption index Sharp, P. M., & Li, W. H. (1987). The codon adaptation index--a measure of directional synonymous codon usage bias, and its potential applications. _Nucleic Acids Research_, 15(3), 1281–1295. ## Installation +This module is available from PyPI and can be downloaded with the following command: + pip install CAI ## Usage @@ -23,14 +25,57 @@ print CAI("ATGGATTAC...", sequences=["ATGTTTGCTAAA", "ATGCGATACAGC",...]) ### Advanced Usage If you have already computed the weights or RSCU values of the reference set, you can supply `CAI()` with one or the other as arguments. They must be formatted as a dictionary and contain values for every codon. -To calculate the RSCU without calculating the CAI, you can use `RSCU()`. `RSCU()`s only required parameter a list of sequences. 
+**_N.B._ If you are computing large numbers of CAIs with the same reference sequences, first calculate their weights and then pass them to `CAI()` to eliminate redundant computation.** + +To calculate RSCU without calculating CAI, you can use `RSCU()`. `RSCU()`'s only required argument is a list of sequences. -Similarly, to calculate the weights of a reference set, you can use `relative_adaptiveness()`. `relative_adaptiveness()` takes either a list of sequences as the `sequences` parameter or a dictionary of RSCUs as the `RSCUs` parameter. +Similarly, to calculate the weights of reference sequences, you can use `relative_adaptiveness()`. `relative_adaptiveness()` takes either a list of sequences as the `sequences` parameter or a dictionary of RSCUs as the `RSCUs` parameter. ### Other Genetic Codes All functions in CAI support an optional `genetic_code` parameter, which is set by default to 1 (the standard genetic code). You may set it to any genetic code within [gc.prt](/gc.prt). +## API Reference +### `RSCU(sequences, genetic_code=1)` + +Argument | Details +--------- | ------- +sequences | List of DNA sequence strings. Required. +genetic_code | Integer containing the genetic code ID. Optional. + +#### Output +A dictionary containing every codon as the key and its RSCU as the value. + +### `relative_adaptiveness(sequences=[], RSCUs={}, genetic_code=1)` + +Argument | Details +--------- | ------- +sequences | List of DNA sequence strings. Optional. +RSCUs | Dictionary of RSCU values for each codon. Optional. +genetic_code | Integer containing the genetic code ID. Optional. + +#### Note +One of `sequences` or `RSCUs` is required. + +#### Output +A dictionary containing every codon as the key and its weight as the value. + +### `CAI(sequence, weights=[], RSCUs=[], sequences=[], genetic_code=1)` + +Argument | Details +--------- | ------- +sequence | String of DNA sequence to calculate CAI for. Required. +weights | Dictionary of weight values for each codon. Optional. 
+RSCUs | Dictionary of RSCU values for each codon. Optional. +sequences | List of DNA sequence strings. Optional. +genetic_code | Integer containing the genetic code ID. Optional. + +#### Note +One of `sequences`, `RSCUs`, or `weights` is required. + +#### Output +A float of the CAI of the sequence. + ## Contributing Feel free to contribute, open issues, or let me know about bugs. Anything is welcome! diff --git a/setup.py b/setup.py index 135bc0b..f1eed50 100644 --- a/setup.py +++ b/setup.py @@ -3,7 +3,7 @@ setup( name = 'CAI', packages = ["CAI"], - version = '0.1.6', + version = '0.1.7', description = 'Python implementation of codon adaptation index', author = 'Benjamin Lee', author_email = 'benjamin_lee@college.harvard.edu',