Skip to content

Commit

Permalink
Added option --consecutive to goalign sample sites
Browse files Browse the repository at this point in the history
  • Loading branch information
fredericlemoine committed Jan 5, 2021
1 parent 7006d8a commit da9722e
Show file tree
Hide file tree
Showing 4 changed files with 50 additions and 22 deletions.
36 changes: 28 additions & 8 deletions align/align.go
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ type Alignment interface {
NumMutationsUniquePerSequence(profile *CountProfile) (numuniques []int, numnew []int, nummuts []int, err error)
Pssm(log bool, pseudocount float64, normalization int) (pssm map[uint8][]float64, err error) // Normalization: PSSM_NORM_NONE, PSSM_NORM_UNIF, PSSM_NORM_DATA
Rarefy(nb int, counts map[string]int) (Alignment, error) // Take a new rarefied sample taking into accounts weights
RandSubAlign(length int) (Alignment, error) // Extract a random subalignment with given length from this alignment
RandSubAlign(length int, consecutive bool) (Alignment, error) // Extract a random subalignment with given length from this alignment
Recombine(rate float64, lenprop float64)
// converts coordinates on the given sequence to coordinates on the alignment
RefCoordinates(name string, refstart, refend int) (alistart, aliend int, err error)
Expand Down Expand Up @@ -1465,20 +1465,40 @@ func (a *align) InversePositions(sites []int) (invsites []int, err error) {
return
}

// Extract a subalignment with given length and a random start position from this alignment
func (a *align) RandSubAlign(length int) (Alignment, error) {
// RandSubAlign extracts a subalignment of given length from this alignment
// If consecutive is true, then a start position is randomly chosen, and the next "length" positions are extracted
// Otherwise, if consecutive is false, then length positions are sampled without replacement from the original alignment
func (a *align) RandSubAlign(length int, consecutive bool) (Alignment, error) {
var tmpseq []uint8
var permutation []int
var i, p, start int
var subalign *align
var seq *seq

if length > a.Length() {
return nil, errors.New("sub alignment is larger than original alignment ")
}
if length <= 0 {
return nil, errors.New("sub alignment cannot have 0 or negative length")
}

subalign := NewAlign(a.alphabet)
start := rand.Intn(a.Length() - length + 1)
for i := 0; i < a.NbSequences(); i++ {
seq := a.seqs[i]
subalign.AddSequenceChar(seq.name, seq.SequenceChar()[start:start+length], seq.Comment())
subalign = NewAlign(a.alphabet)
if consecutive {
start = rand.Intn(a.Length() - length + 1)
for i = 0; i < a.NbSequences(); i++ {
seq = a.seqs[i]
subalign.AddSequenceChar(seq.name, seq.SequenceChar()[start:start+length], seq.Comment())
}
} else {
permutation = rand.Perm(a.Length())
for i = 0; i < a.NbSequences(); i++ {
tmpseq = make([]uint8, length)
seq = a.seqs[i]
for p = 0; p < length; p++ {
tmpseq[p] = seq.SequenceChar()[permutation[p]]
}
subalign.AddSequenceChar(seq.name, tmpseq, seq.Comment())
}
}
return subalign, nil
}
Expand Down
25 changes: 16 additions & 9 deletions cmd/samplesites.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,16 +12,24 @@ import (
var siteout string
var sitelength int
var sitenb int
var siteconsecutive bool

// samplesitesCmd represents the samplesites command
var samplesitesCmd = &cobra.Command{
Use: "sites",
Short: "Take a random subalignment",
Long: `Take a random subalignment.
It take a random start position, and extract the alignment starting at that position
and with a given length.
`,
goalign sample sites extracts a subalignment of given length from the input alignment.
* If --consecutive is true, then a start position is randomly chosen, and the next "length"
positions are extracted.
* Otherwise, if consecutive is false, then "length" positions are sampled without replacement
from the original alignment (any order).
If more than 2 samples are requested (-n x, x>1), then alignemnts are written in output files
with suffix _0.ext, _1.ext, etc. (with ext in [ph,fa,clustal,nx] ).The only exception is when
format is phylip and output file name is stdout or -, then output alignemnts are written on stdout.
`,
RunE: func(cmd *cobra.Command, args []string) (err error) {
var aligns *align.AlignChannel
var f *os.File
Expand All @@ -40,12 +48,10 @@ and with a given length.
}

var name string = siteout
var extension string = "fa"
if rootphylip {
extension = "phy"
}
var extension string = alignExtension()

for i := 0; i < sitenb; i++ {
if sitenb > 1 {
if sitenb > 1 && !(rootphylip && (siteout == "stdout" || siteout == "-")) {
name = fmt.Sprintf("%s_%d.%s", siteout, i, extension)
}
if f, err = openWriteFile(name); err != nil {
Expand All @@ -54,7 +60,7 @@ and with a given length.
}
defer closeWriteFile(f, name)

if subalign, err = al.RandSubAlign(sitelength); err != nil {
if subalign, err = al.RandSubAlign(sitelength, siteconsecutive); err != nil {
io.LogError(err)
return
}
Expand All @@ -69,4 +75,5 @@ func init() {
samplesitesCmd.PersistentFlags().StringVarP(&siteout, "output", "o", "stdout", "Alignment output file")
samplesitesCmd.PersistentFlags().IntVarP(&sitelength, "length", "l", 10, "Length of the random sub alignment")
samplesitesCmd.PersistentFlags().IntVarP(&sitenb, "nsamples", "n", 1, "Number of samples to generate")
samplesitesCmd.PersistentFlags().BoolVar(&siteconsecutive, "consecutive", true, "If sampled sites are consecutive (inactivate with --consecutive=false)")
}
7 changes: 4 additions & 3 deletions docs/commands/sample.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

### sample
This command samples sites or sequences from an input alignment (fasta by default or phylip with `-p`):
1. `goalign sample sites`: take a random subsequence starting at a random position and with the given length from the input alignment;
1. `goalign sample sites`: take a random subalignment from the input alignment. If `--consecutive` is true (the default), a start position is randomly chosen and the next "length" positions are extracted. Otherwise, if `--consecutive=false`, "length" positions are sampled without replacement from the original alignment (in any order);
2. `goalign sample seqs`: take a random subset of the sequences from an input alignment;
3. `goalign sample rarefy`: take a new sample taking counts into account. Each sequence in the alignment has associated counts. The sum s of the counts represents the number of sequences in the underlying initial dataset. The goal is to downsample (rarefy) the initial dataset by sampling n sequences from s (n<s) and taking the alignment corresponding to this new sample, i.e., by keeping only the unique (different) sequences from it.

Expand All @@ -14,9 +14,9 @@ If the input alignment contains several alignments (phylip), will process all of
* general command:
```
Available Commands:
rarefy Take a new sample taking into accounts weights
rarefy Takes a new sample taking into accounts weights
seqs Samples a subset of sequences from the input alignment
sites Take a random subalignment
sites Takes a random subalignment
Flags:
-h, --help help for sample
Expand Down Expand Up @@ -66,6 +66,7 @@ Usage:
goalign sample sites [flags]
Flags:
--consecutive If sampled sites are consecutive (inactivate with --consecutive=false) (default true)
-h, --help help for sites
-l, --length int Length of the random sub alignment (default 10)
-n, --nsamples int Number of samples to generate (default 1)
Expand Down
4 changes: 2 additions & 2 deletions docs/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -96,8 +96,8 @@ Command | Subcommand |
[replace](commands/replace.md) ([api](api/replace.md)) | | Replace characters in sequences of input alignment
[sample](commands/sample.md) ([api](api/sample.md)) | | Samples sequences or sites from an input alignment
-- | seqs | Samples a subset of sequences from the input alignment
-- | sites | Take a random subalignment
-- | rarefy | Take a sample taking into accounts weights
-- | sites | Takes a random subalignment
-- | rarefy | Takes a sample taking into accounts weights
[shuffle](commands/shuffle.md) ([api](api/shuffle.md)) | | A set of commands to shuffle an alignment
-- | recomb | Recombines sequences in the input alignment (copy/paste)
-- | rogue | Simulates rogue taxa
Expand Down

0 comments on commit da9722e

Please sign in to comment.