updated readme and improved performance

Benjamin-Lee · Aug 18, 2017 · ed68cce · ed68cce
1 parent 671617d
commit ed68cce
Show file tree

Hide file tree

Showing 3 changed files with 56 additions and 8 deletions.
diff --git a/CAI/CAI.py b/CAI/CAI.py
@@ -1,5 +1,5 @@
 from itertools import chain
-from . import genetic_codes
+from genetic_codes import genetic_codes
 from scipy.stats.mstats import gmean
 
 def _synonymous_codons(genetic_code_dict):
@@ -31,12 +31,12 @@ def RSCU(sequences, genetic_code=1):
         if len(sequence) % 3 != 0:
             raise ValueError("Input sequence not divisible by three")
         if len(sequence) == 0:
-            raise ValueError("Cannot include empty sequence in input")
+            raise ValueError("Input sequence cannot be empty")
 
     # count the number of each codon in the sequences
     sequences = [[sequence[i:i+3].upper() for i in range(0, len(sequence), 3)] for sequence in sequences]
-    codons = list(chain.from_iterable(sequences))
-    counts = {i: codons.count(i) for i in set(genetic_code.keys())}
+    codons = list(chain.from_iterable(sequences)) # flat list of all codons (to be used for counting)
+    counts = {i: codons.count(i) for i in genetic_code.keys()}
 
     # "if a certain codon is never used in the reference set... assign [it] a value of 0.5" (page 1285)
     for codon in counts:
@@ -103,9 +103,12 @@ def CAI(sequence, weights=[], RSCUs=[], sequences=[], genetic_code=1):
     # determine the synonymous codons in the genetic code
     synonymous_codons = _synonymous_codons(genetic_codes[genetic_code])
 
+    # find codons without synonyms
+    non_synonymous_codons = [codon for codon in synonymous_codons.keys() if len(synonymous_codons[codon]) == 1]
+
     # create a list of the weights for the sequence, not counting codons without synonyms (page 1285)
     try:
-        sequence_weights = [weights[codon] for codon in sequence if (len(synonymous_codons[codon]) != 1)]
+        sequence_weights = [weights[codon] for codon in sequence if codon not in non_synonymous_codons]
     except KeyError, e:
         raise KeyError("Bad weights dictionary passed: missing weight for codon " + str(e))
 

diff --git a/README.md b/README.md
@@ -7,6 +7,8 @@ An implementation of Sharp and Li's 1987 formulation of the codon adaption index
 Sharp, P. M., & Li, W. H. (1987). The codon adaptation index--a measure of directional synonymous codon usage bias, and its potential applications. _Nucleic Acids Research_, 15(3), 1281–1295.
 
 ## Installation 
+This module is available from PyPI and can be installed with the following command:
+
 	pip install CAI
 
 ## Usage
@@ -23,14 +25,57 @@ print CAI("ATGGATTAC...", sequences=["ATGTTTGCTAAA", "ATGCGATACAGC",...])
 ### Advanced Usage
 If you have already computed the weights or RSCU values of the reference set, you can supply `CAI()` with one or the other as arguments. They must be formatted as a dictionary and contain values for every codon.
 
-To calculate the RSCU without calculating the CAI, you can use `RSCU()`. `RSCU()`s only required parameter a list of sequences.
+**_N.B._ if you are computing large numbers of CAIs with the same reference sequences, first calculate their weights and then pass that to `CAI()` to eliminate redundant computation.**
+
+To calculate RSCU without calculating CAI, you can use `RSCU()`. `RSCU()`'s only required argument is a list of sequences.
 
-Similarly, to calculate the weights of a reference set, you can use `relative_adaptiveness()`. `relative_adaptiveness()` takes either a list of sequences as the `sequences` parameter or a dictionary of RSCUs as the `RSCUs` parameter. 
+Similarly, to calculate the weights of reference sequences, you can use `relative_adaptiveness()`. `relative_adaptiveness()` takes either a list of sequences as the `sequences` parameter or a dictionary of RSCUs as the `RSCUs` parameter. 
 
 ### Other Genetic Codes
 
 All functions in CAI support an optional `genetic_code` parameter, which is set by default to 1 (the standard genetic code). You may set it to any genetic code within [gc.prt](/gc.prt). 
 
+## API Reference
+### `RSCU(sequences, genetic_code=1)`
+
+Argument  | Details
+--------- | -------
+sequences | List of DNA sequence strings. Required.
+genetic_code | Integer containing the genetic code ID. Optional.
+
+#### Output
+A dictionary containing every codon as the key and its RSCU as the value.
+
+### `relative_adaptiveness(sequences=[], RSCUs={}, genetic_code=1)`
+
+Argument  | Details
+--------- | -------
+sequences | List of DNA sequence strings. Optional.
+RSCUs | Dictionary of RSCU values for each codon. Optional.
+genetic_code | Integer containing the genetic code ID. Optional.
+
+#### Note
+One of `sequences` or `RSCUs` is required. 
+
+#### Output
+A dictionary containing every codon as the key and its weight as the value.
+
+### `CAI(sequence, weights=[], RSCUs=[], sequences=[], genetic_code=1)`
+
+Argument  | Details
+--------- | -------
+sequence | String of DNA sequence to calculate CAI for. Required.
+weights | Dictionary of weight values for each codon. Optional.
+RSCUs | Dictionary of RSCU values for each codon. Optional.
+sequences | List of DNA sequence strings. Optional.
+genetic_code | Integer containing the genetic code ID. Optional.
+
+#### Note
+One of `sequences`, `RSCUs`, or `weights` is required.
+
+#### Output
+A float of the CAI of the sequence.
+
 ## Contributing
 Feel free to contribute, open issues, or let me know about bugs. Anything is welcome!
 

diff --git a/setup.py b/setup.py
@@ -3,7 +3,7 @@
 setup(
     name = 'CAI',
     packages = ["CAI"],
-    version = '0.1.6',
+    version = '0.1.7',
     description = 'Python implementation of codon adaptation index',
     author = 'Benjamin Lee',
     author_email = 'benjamin_lee@college.harvard.edu',