Skip to content

Commit

Permalink
Added ignore case to clean seqs and clean sites and --char to clean s…
Browse files Browse the repository at this point in the history
…eqs #5
  • Loading branch information
fredericlemoine committed Jul 7, 2020
1 parent 450cc0f commit e6a3635
Show file tree
Hide file tree
Showing 5 changed files with 147 additions and 21 deletions.
56 changes: 43 additions & 13 deletions align/align.go
Original file line number Diff line number Diff line change
Expand Up @@ -71,10 +71,16 @@ type Alignment interface {
RefCoordinates(name string, refstart, refend int) (alistart, aliend int, err error)
// converts sites on the given sequence to coordinates on the alignment
RefSites(name string, sites []int) (refsites []int, err error)
RemoveGapSeqs(cutoff float64) // Removes sequences having >= cutoff gaps
RemoveGapSites(cutoff float64, ends bool) (first, last int) // Removes sites having >= cutoff gaps, returns the number of consecutive removed sites at start and end of alignment
RemoveCharacterSites(c rune, cutoff float64, ends bool) (first, last int) // Removes sites having >= cutoff character, returns the number of consecutive removed sites at start and end of alignment
RemoveMajorityCharacterSites(cutoff float64, ends bool) (first, last int) // Removes sites having >= cutoff of the main character at these sites, returns the number of consecutive removed sites at start and end of alignment
// Removes sequences having >= cutoff gaps, returns number of removed sequences
RemoveGapSeqs(cutoff float64) int
// Removes sequences having >= cutoff character, returns number of removed sequences
RemoveCharacterSeqs(c rune, cutoff float64, ignoreCase bool) int
// Removes sites having >= cutoff gaps, returns the number of consecutive removed sites at start and end of alignment
RemoveGapSites(cutoff float64, ends bool) (first, last int)
// Removes sites having >= cutoff character, returns the number of consecutive removed sites at start and end of alignment
RemoveCharacterSites(c rune, cutoff float64, ends bool, ignoreCase bool) (first, last int)
// Removes sites having >= cutoff of the main character at these sites, returns the number of consecutive removed sites at start and end of alignment
RemoveMajorityCharacterSites(cutoff float64, ends bool) (first, last int)
// Replaces match characters (.) by their corresponding characters on the first sequence
ReplaceMatchChars()
Sample(nb int) (Alignment, error) // generate a sub sample of the sequences
Expand Down Expand Up @@ -263,7 +269,7 @@ func (a *align) ShuffleSites(rate float64, roguerate float64, randroguefirst boo
//
// Returns the number of consecutive removed sites at start and end of alignment
func (a *align) RemoveGapSites(cutoff float64, ends bool) (first, last int) {
return a.RemoveCharacterSites(GAP, cutoff, ends)
return a.RemoveCharacterSites(GAP, cutoff, ends, false)
}

// RemoveCharacterSites Removes positions constituted of [cutoff*100%,100%] of the given character
Expand All @@ -272,13 +278,15 @@ func (a *align) RemoveGapSites(cutoff float64, ends bool) (first, last int) {
// 0 means that positions with > 0 of the given character will be removed
// other cutoffs : ]0,1] mean that positions with >= cutoff of this character will be removed
//
// if ignoreCase then the search is case insensitive
//
// If ends is true: then only removes consecutive positions that match the cutoff
// from start or from end of alignment.
// Example with a cutoff of 0.3 and ends and with the given proportion of this character:
// 0.4 0.5 0.1 0.5 0.6 0.1 0.8 will remove positions 0,1 and 6
//
// Returns the number of consecutive removed sites at start and end of alignment
func (a *align) RemoveCharacterSites(c rune, cutoff float64, ends bool) (first, last int) {
func (a *align) RemoveCharacterSites(c rune, cutoff float64, ends bool, ignoreCase bool) (first, last int) {
var nbchars int
if cutoff < 0 || cutoff > 1 {
cutoff = 0
Expand All @@ -293,7 +301,7 @@ func (a *align) RemoveCharacterSites(c rune, cutoff float64, ends bool) (first,
nbchars = 0

for seq := 0; seq < a.NbSequences(); seq++ {
if a.seqs[seq].sequence[site] == c {
if (a.seqs[seq].sequence[site] == c) || (ignoreCase && unicode.ToLower(a.seqs[seq].sequence[site]) == unicode.ToLower(c)) {
nbchars++
}
}
Expand Down Expand Up @@ -498,25 +506,47 @@ func (a *align) RefSites(name string, sites []int) (refsites []int, err error) {
// Cutoff must be between 0 and 1, otherwise set to 0.
// 0 means that sequences with > 0 gaps will be removed
// other cutoffs : ]0,1] mean that sequences with >= cutoff gaps will be removed
func (a *align) RemoveGapSeqs(cutoff float64) {
var nbgaps int
//
// Returns the number of removed sequences
func (a *align) RemoveGapSeqs(cutoff float64) int {
return a.RemoveCharacterSeqs(GAP, cutoff, false)
}

// RemoveCharacterSeqs Removes sequences constituted of [cutoff*100%,100%] of the given character
// Exception for a cutoff of 0: does not remove sequences with 0% of this character
// Cutoff must be between 0 and 1, otherwise set to 0.
// 0 means that sequences with > 0 of the given character will be removed
// other cutoffs : ]0,1] mean that sequences with >= cutoff of this character will be removed
//
// if ignoreCase then the search is case insensitive
//
// Returns the number of removed sequences
func (a *align) RemoveCharacterSeqs(c rune, cutoff float64, ignoreCase bool) int {
var nbseqs int
if cutoff < 0 || cutoff > 1 {
cutoff = 0
}
oldseqs := a.seqs
length := a.Length()
a.Clear()
nbremoved := 0

for _, seq := range oldseqs {
nbgaps = 0
nbseqs = 0

for site := 0; site < length; site++ {
if seq.sequence[site] == GAP {
nbgaps++
if (seq.sequence[site] == c) || (ignoreCase && unicode.ToLower(seq.sequence[site]) == unicode.ToLower(c)) {
nbseqs++
}
}
if !((cutoff > 0.0 && float64(nbgaps) >= cutoff*float64(length)) || (cutoff == 0 && nbgaps > 0)) {
if (cutoff > 0.0 && float64(nbseqs) >= cutoff*float64(length)) || (cutoff == 0 && nbseqs > 0) {
nbremoved++
} else {
a.AddSequenceChar(seq.name, seq.sequence, seq.comment)
}
}

return nbremoved
}

// Swaps a rate of the sequences together
Expand Down
4 changes: 4 additions & 0 deletions cmd/clean.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@ import (
var cleanOutput string
var cleanCutoff float64
var cleanQuiet bool
var cleanChar string
var cleanIgnoreCase bool

// cleanCmd represents the clean command
var cleanCmd = &cobra.Command{
Expand All @@ -32,5 +34,7 @@ func init() {
RootCmd.AddCommand(cleanCmd)
cleanCmd.PersistentFlags().StringVarP(&cleanOutput, "output", "o", "stdout", "Cleaned alignment output file")
cleanCmd.PersistentFlags().Float64VarP(&cleanCutoff, "cutoff", "c", 0, "Cutoff for gap deletion : 0 remove sites/sequences with > 0 gap, 1 remove sites/sequences with 100% gaps)")
cleanCmd.PersistentFlags().StringVar(&cleanChar, "char", "GAP", "The character the cutoff is applied to. May be GAP, MAJ, or any other character")
cleanCmd.PersistentFlags().BoolVar(&cleanIgnoreCase, "ignore-case", false, "Ignore case of given character (--char) if non special character (GAP/-)")
cleanCmd.PersistentFlags().BoolVarP(&cleanQuiet, "quiet", "q", false, "Do not print results on stderr")
}
13 changes: 12 additions & 1 deletion cmd/cleanseqs.go
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,18 @@ will be removed.`,
i := 0
for al := range aligns.Achan {
before := al.NbSequences()
al.RemoveGapSeqs(cleanCutoff)
if cleanChar == string(align.GAP) || cleanChar == "GAP" {
al.RemoveGapSeqs(cleanCutoff)
} else {
//single character
c := []rune(cleanChar)
if len(c) != 1 {
err = fmt.Errorf("--char should be a single character")
io.LogError(err)
return
}
al.RemoveCharacterSeqs(c[0], cleanCutoff, cleanIgnoreCase)
}
after := al.NbSequences()
writeAlign(al, f)
if !cleanQuiet {
Expand Down
5 changes: 1 addition & 4 deletions cmd/cleansites.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@ import (
)

var cleanEnds bool
var cleanChar string

// cleansitesCmd represents the cleansites command
var cleansitesCmd = &cobra.Command{
Expand Down Expand Up @@ -69,7 +68,7 @@ will be removed.`,
return
}
char = string(c[0])
nbstart, nbend = al.RemoveCharacterSites(c[0], cleanCutoff, cleanEnds)
nbstart, nbend = al.RemoveCharacterSites(c[0], cleanCutoff, cleanEnds, cleanIgnoreCase)
}
afterlength := al.Length()
writeAlign(al, f)
Expand All @@ -92,7 +91,5 @@ will be removed.`,

func init() {
cleansitesCmd.PersistentFlags().BoolVar(&cleanEnds, "ends", false, "If true, then only remove consecutive gap positions from alignment start and end")
cleansitesCmd.PersistentFlags().StringVar(&cleanChar, "char", "GAP", "The character the cutoff is applied to. May be GAP, MAJ, or any other character")

cleanCmd.AddCommand(cleansitesCmd)
}
90 changes: 87 additions & 3 deletions test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,38 @@ rm -f expected result mapfile input


echo "->goalign clean sites"
cat > input <<EOF
>Seq0000
GATTAATTT-CCGTAGGCCAGAA-CTGAA---CGAA-ACTTTAAGTTTTCACTT-TAATGGAGAGGACTAGTTCATACTT
-TT-A-CACTTTTACATCGA
>Seq0001
-GTCGGACCTAAG-ATTGAGTACAACGGTG-ATTCCAGCGG-GGAGAGGTCT-TTTTTCCGG-TGAAGGACTCTAG-GCT
GTAAA-G-TATGGC-ATGTG
>Seq0002
CTAA-CGCGGGCGGATT-CTGT-GGA-CAAGGTTAAATAC-CGGCAATGC-CCATGATCCCCCA-GGAC-ATAAGAGCGA
AGTT--AACAAATGAACCCC
>Seq0003
GAGT-GA-GCTTTATGGCA-AAGGTATTAG-GACTGA----CACCCCGGCATGGTAAGCAGGAGCCA-CGCGAAGGCTTC
AGGTATCTTCCTGT-TTACC
>Seq0004
CATAGCCCC--ATG-CC-GACC-GTGTCGCGGCAACGTCTACA-TTCACGATAAA-ACTCCG-TGCTAGTCGGCTC-AGA
TGCTTTTCTTCCA-ATCTGG
>Seq0005
-GTTTGA-TAT---CGCCGGCTTAGTGCTGACA-TGATGCTCCGTT-TAAGGGTCCTGATGTT-TTGTGCTC-CGCATAT
TA-AGCTGAGTTTCCCAAAG
>Seq0006
TCGC-ACGG-GTGGAATGTACGTTAT-GCA-TAATCAGCG-CTTTC-CCGACATGCCCCCTCCGTGGCTCCTT-CG-CCA
-CGGCGGACCTGCGGT-TCG
>Seq0007
CT-G-AAT-CCTGCGCTATTTCGTCAGTTCGTG-ACGGGTAAC-A--GCGGTTAATGCTT-TTCCGATCAGCTCA-ACCC
ATGAAGGTGGCTCTGG-GCC
>Seq0008
TCGTTAACCCACTCTAACCACCTC-TG--GCGACATCGG-T-CTCGGCTTGGATACCTTC-TC-TATTGGACCCCAGG-C
TCAA-CTCGTGAGCTCTCT-
>Seq0009
ACCT-CGGCTCTAGACAGCTGAA--CCGGTTCCGAGCACTGTA-GGAAACTTG-AAAGGCTCGACGGA-G-TTGTTCC-C
AGAGTGGGA-TATAACATA-
EOF
cat > expected <<EOF
>Seq0000
ATATGGCGATCAAAGTTCCAATGAGATACTTCCTTTACG
Expand Down Expand Up @@ -94,10 +126,22 @@ Alignment (0) number of gaps=61
Alignment (0) number of start gaps=1
Alignment (0) number of end gaps=1
EOF
${GOALIGN} random --seed 10 | ${GOALIGN} mutate gaps -n 1 -r 0.1 --seed 10 | ${GOALIGN} clean sites > result 2>log
cat > expectedlog2 <<EOF
Alignment (0) length before cleaning=100
Alignment (0) length after cleaning=39
Alignment (0) number of n=61
Alignment (0) number of start n=1
Alignment (0) number of end n=1
EOF
${GOALIGN} clean sites -i input > result 2>log
diff -q -b result expected
diff -q -b log expectedlog
rm -f expected result mapfile log expectedlog
rm -f result mapfile log expectedlog

${GOALIGN} replace -s - -n N -i input | ${GOALIGN} clean sites --char n --ignore-case > result 2>log
diff -q -b result expected
diff -q -b log expectedlog2
rm -f expected result mapfile log expectedlog2


echo "->goalign clean sites --ends"
Expand Down Expand Up @@ -225,6 +269,38 @@ diff -q -b log expectedlog3
rm -f expected3 result log expectedlog3

echo "->goalign clean seqs"
cat > input <<EOF
>Seq0000
GATTAATTTGCCGTAGGCCAGAATCTGAAGATCGAACACTTTAAGTTTTCACTTCTAATGGAGAGGACTAGTTCATACTT
TTTAAACACTTTTACATCGA
>Seq0001
--T-------A------G---A-AA-G--------C---G------A-G--T-T---T-C-G--GA---AC----G---T
-T-A-------G---ATGTG
>Seq0002
-T-----C-GG---A-T---G------CAAGG-T-A---------A--GC--C---A-C---C-----C-----G---G-
AG-------A--TG----C-
>Seq0003
GAGTGGAGGCTTTATGGCACAAGGTATTAGAGACTGAGGGGCACCCCGGCATGGTAAGCAGGAGCCATCGCGAAGGCTTC
AGGTATCTTCCTGTGTTACC
>Seq0004
--T--C-----AT--C-------G------G---A-G--TAC--T--AC-A-----AC---G--G--A--CG-CT-----
T---T---T------T-T--
>Seq0005
AGTTTGACTATGAGCGCCGGCTTAGTGCTGACAGTGATGCTCCGTTGTAAGGGTCCTGATGTTCTTGTGCTCGCGCATAT
TAGAGCTGAGTTTCCCAAAG
>Seq0006
T----A----G----A-GT-CG-------A--AA-CAG-G-----C--C---A-------T-------T-CT--C--C-A
---G----C----G-T--C-
>Seq0007
CTGGTAATACCTGCGCTATTTCGTCAGTTCGTGTACGGGTAACGATAGCGGTTAATGCTTATTCCGATCAGCTCACACCC
ATGAAGGTGGCTCTGGAGCC
>Seq0008
T---T--CCCA--------A-CT--------G--A---G---C-C----TGG-----T---T-------G-C-C--G---
-C------GT-A-CT-T---
>Seq0009
ACCTACGGCTCTAGACAGCTGAAGTCCGGTTCCGAGCACTGTACGGAAACTTGAAAAGGCTCGACGGAGGCTTGTTCCGC
AGAGTGGGACTATAACATAC
EOF
cat > expected <<EOF
>Seq0000
GATTAATTTGCCGTAGGCCAGAATCTGAAGATCGAACACTTTAAGTTTTCACTTCTAATGGAGAGGACTAGTTCATACTT
Expand All @@ -247,7 +323,15 @@ cat > expectedlog <<EOF
[Warning] in cmd/cleanseqs.go (line 37), message: Alignment (0) #seqs after cleaning=5
[Warning] in cmd/cleanseqs.go (line 38), message: Alignment (0) removed sequences=5
EOF
${GOALIGN} random --seed 10 | ${GOALIGN} mutate gaps -n 0.5 -r 0.7 --seed 10 | ${GOALIGN} clean seqs > result 2>log
${GOALIGN} clean seqs -i input > result 2>log
diff -q -b result expected
rm -f result mapfile log expectedlog

${GOALIGN} clean seqs --char GAP -i input > result 2>log
diff -q -b result expected
rm -f result mapfile log expectedlog

${GOALIGN} replace -s "-" -n "N" -i input | ${GOALIGN} clean seqs --char n --ignore-case > result 2>log
diff -q -b result expected
rm -f expected result mapfile log expectedlog

Expand Down

0 comments on commit e6a3635

Please sign in to comment.