From e6a3635a49cf9340c28d4d96263ed5b97eb1485e Mon Sep 17 00:00:00 2001 From: Frederic Lemoine Date: Tue, 7 Jul 2020 12:03:44 +0200 Subject: [PATCH] Added ignore case to clean seqs and clean sites and --char to clean seqs #5 --- align/align.go | 56 ++++++++++++++++++++++------- cmd/clean.go | 4 +++ cmd/cleanseqs.go | 13 ++++++- cmd/cleansites.go | 5 +-- test.sh | 90 +++++++++++++++++++++++++++++++++++++++++++++-- 5 files changed, 147 insertions(+), 21 deletions(-) diff --git a/align/align.go b/align/align.go index f7bafea..7c02248 100644 --- a/align/align.go +++ b/align/align.go @@ -71,10 +71,16 @@ type Alignment interface { RefCoordinates(name string, refstart, refend int) (alistart, aliend int, err error) // converts sites on the given sequence to coordinates on the alignment RefSites(name string, sites []int) (refsites []int, err error) - RemoveGapSeqs(cutoff float64) // Removes sequences having >= cutoff gaps - RemoveGapSites(cutoff float64, ends bool) (first, last int) // Removes sites having >= cutoff gaps, returns the number of consecutive removed sites at start and end of alignment - RemoveCharacterSites(c rune, cutoff float64, ends bool) (first, last int) // Removes sites having >= cutoff character, returns the number of consecutive removed sites at start and end of alignment - RemoveMajorityCharacterSites(cutoff float64, ends bool) (first, last int) // Removes sites having >= cutoff of the main character at these sites, returns the number of consecutive removed sites at start and end of alignment + // Removes sequences having >= cutoff gaps, returns number of removed sequences + RemoveGapSeqs(cutoff float64) int + // Removes sequences having >= cutoff character, returns number of removed sequences + RemoveCharacterSeqs(c rune, cutoff float64, ignoreCase bool) int + // Removes sites having >= cutoff gaps, returns the number of consecutive removed sites at start and end of alignment + RemoveGapSites(cutoff float64, ends bool) (first, last int) + // Removes 
sites having >= cutoff character, returns the number of consecutive removed sites at start and end of alignment + RemoveCharacterSites(c rune, cutoff float64, ends bool, ignoreCase bool) (first, last int) + // Removes sites having >= cutoff of the main character at these sites, returns the number of consecutive removed sites at start and end of alignment + RemoveMajorityCharacterSites(cutoff float64, ends bool) (first, last int) // Replaces match characters (.) by their corresponding characters on the first sequence ReplaceMatchChars() Sample(nb int) (Alignment, error) // generate a sub sample of the sequences @@ -263,7 +269,7 @@ func (a *align) ShuffleSites(rate float64, roguerate float64, randroguefirst boo // // Returns the number of consecutive removed sites at start and end of alignment func (a *align) RemoveGapSites(cutoff float64, ends bool) (first, last int) { - return a.RemoveCharacterSites(GAP, cutoff, ends) + return a.RemoveCharacterSites(GAP, cutoff, ends, false) } // RemoveCharacterSites Removes positions constituted of [cutoff*100%,100%] of the given character @@ -272,13 +278,15 @@ func (a *align) RemoveGapSites(cutoff float64, ends bool) (first, last int) { // 0 means that positions with > 0 of the given character will be removed // other cutoffs : ]0,1] mean that positions with >= cutoff of this character will be removed // +// if ignoreCase then the search is case insensitive +// // If ends is true: then only removes consecutive positions that match the cutoff // from start or from end of alignment. 
// Example with a cutoff of 0.3 and ends and with the given proportion of this character: // 0.4 0.5 0.1 0.5 0.6 0.1 0.8 will remove positions 0,1 and 6 // // Returns the number of consecutive removed sites at start and end of alignment -func (a *align) RemoveCharacterSites(c rune, cutoff float64, ends bool) (first, last int) { +func (a *align) RemoveCharacterSites(c rune, cutoff float64, ends bool, ignoreCase bool) (first, last int) { var nbchars int if cutoff < 0 || cutoff > 1 { cutoff = 0 @@ -293,7 +301,7 @@ func (a *align) RemoveCharacterSites(c rune, cutoff float64, ends bool) (first, nbchars = 0 for seq := 0; seq < a.NbSequences(); seq++ { - if a.seqs[seq].sequence[site] == c { + if (a.seqs[seq].sequence[site] == c) || (ignoreCase && unicode.ToLower(a.seqs[seq].sequence[site]) == unicode.ToLower(c)) { nbchars++ } } @@ -498,25 +506,47 @@ func (a *align) RefSites(name string, sites []int) (refsites []int, err error) { // Cutoff must be between 0 and 1, otherwise set to 0. // 0 means that sequences with > 0 gaps will be removed // other cutoffs : ]0,1] mean that sequences with >= cutoff gaps will be removed -func (a *align) RemoveGapSeqs(cutoff float64) { - var nbgaps int +// +// Returns the number of removed sequences +func (a *align) RemoveGapSeqs(cutoff float64) int { + return a.RemoveCharacterSeqs(GAP, cutoff, false) +} + +// RemoveCharacterSeqs Removes sequences constituted of [cutoff*100%,100%] of the given character +// Exception for a cutoff of 0: does not remove sequences with 0% of this character +// Cutoff must be between 0 and 1, otherwise set to 0. 
+// 0 means that sequences with > 0 of the given character will be removed +// other cutoffs : ]0,1] mean that sequences with >= cutoff of this character will be removed +// +// if ignoreCase then the search is case insensitive +// +// Returns the number of removed sequences +func (a *align) RemoveCharacterSeqs(c rune, cutoff float64, ignoreCase bool) int { + var nbseqs int if cutoff < 0 || cutoff > 1 { cutoff = 0 } oldseqs := a.seqs length := a.Length() a.Clear() + nbremoved := 0 + for _, seq := range oldseqs { - nbgaps = 0 + nbseqs = 0 + for site := 0; site < length; site++ { - if seq.sequence[site] == GAP { - nbgaps++ + if (seq.sequence[site] == c) || (ignoreCase && unicode.ToLower(seq.sequence[site]) == unicode.ToLower(c)) { + nbseqs++ } } - if !((cutoff > 0.0 && float64(nbgaps) >= cutoff*float64(length)) || (cutoff == 0 && nbgaps > 0)) { + if (cutoff > 0.0 && float64(nbseqs) >= cutoff*float64(length)) || (cutoff == 0 && nbseqs > 0) { + nbremoved++ + } else { a.AddSequenceChar(seq.name, seq.sequence, seq.comment) } } + + return nbremoved } // Swaps a rate of the sequences together diff --git a/cmd/clean.go b/cmd/clean.go index 3244969..2a507be 100644 --- a/cmd/clean.go +++ b/cmd/clean.go @@ -7,6 +7,8 @@ import ( var cleanOutput string var cleanCutoff float64 var cleanQuiet bool +var cleanChar string +var cleanIgnoreCase bool // cleanCmd represents the clean command var cleanCmd = &cobra.Command{ @@ -32,5 +34,7 @@ func init() { RootCmd.AddCommand(cleanCmd) cleanCmd.PersistentFlags().StringVarP(&cleanOutput, "output", "o", "stdout", "Cleaned alignment output file") cleanCmd.PersistentFlags().Float64VarP(&cleanCutoff, "cutoff", "c", 0, "Cutoff for gap deletion : 0 remove sites/sequences with > 0 gap, 1 remove sites/sequences with 100% gaps)") + cleanCmd.PersistentFlags().StringVar(&cleanChar, "char", "GAP", "The character the cutoff is applied to. 
May be GAP, MAJ, or any other character") + cleanCmd.PersistentFlags().BoolVar(&cleanIgnoreCase, "ignore-case", false, "Ignore case of given character (--char) if non special character (GAP/-)") cleanCmd.PersistentFlags().BoolVarP(&cleanQuiet, "quiet", "q", false, "Do not print results on stderr") } diff --git a/cmd/cleanseqs.go b/cmd/cleanseqs.go index 7331ecb..d25d5f6 100644 --- a/cmd/cleanseqs.go +++ b/cmd/cleanseqs.go @@ -43,7 +43,18 @@ will be removed.`, i := 0 for al := range aligns.Achan { before := al.NbSequences() - al.RemoveGapSeqs(cleanCutoff) + if cleanChar == string(align.GAP) || cleanChar == "GAP" { + al.RemoveGapSeqs(cleanCutoff) + } else { + //single character + c := []rune(cleanChar) + if len(c) != 1 { + err = fmt.Errorf("--char should be a single character") + io.LogError(err) + return + } + al.RemoveCharacterSeqs(c[0], cleanCutoff, cleanIgnoreCase) + } after := al.NbSequences() writeAlign(al, f) if !cleanQuiet { diff --git a/cmd/cleansites.go b/cmd/cleansites.go index 46216f3..a90ad19 100644 --- a/cmd/cleansites.go +++ b/cmd/cleansites.go @@ -10,7 +10,6 @@ import ( ) var cleanEnds bool -var cleanChar string // cleansitesCmd represents the cleansites command var cleansitesCmd = &cobra.Command{ @@ -69,7 +68,7 @@ will be removed.`, return } char = string(c[0]) - nbstart, nbend = al.RemoveCharacterSites(c[0], cleanCutoff, cleanEnds) + nbstart, nbend = al.RemoveCharacterSites(c[0], cleanCutoff, cleanEnds, cleanIgnoreCase) } afterlength := al.Length() writeAlign(al, f) @@ -92,7 +91,5 @@ will be removed.`, func init() { cleansitesCmd.PersistentFlags().BoolVar(&cleanEnds, "ends", false, "If true, then only remove consecutive gap positions from alignment start and end") - cleansitesCmd.PersistentFlags().StringVar(&cleanChar, "char", "GAP", "The character the cutoff is applied to. 
May be GAP, MAJ, or any other character") - cleanCmd.AddCommand(cleansitesCmd) } diff --git a/test.sh b/test.sh index 1bdb7dd..3c69b77 100755 --- a/test.sh +++ b/test.sh @@ -65,6 +65,38 @@ rm -f expected result mapfile input echo "->goalign clean sites" +cat > input <Seq0000 +GATTAATTT-CCGTAGGCCAGAA-CTGAA---CGAA-ACTTTAAGTTTTCACTT-TAATGGAGAGGACTAGTTCATACTT +-TT-A-CACTTTTACATCGA +>Seq0001 +-GTCGGACCTAAG-ATTGAGTACAACGGTG-ATTCCAGCGG-GGAGAGGTCT-TTTTTCCGG-TGAAGGACTCTAG-GCT +GTAAA-G-TATGGC-ATGTG +>Seq0002 +CTAA-CGCGGGCGGATT-CTGT-GGA-CAAGGTTAAATAC-CGGCAATGC-CCATGATCCCCCA-GGAC-ATAAGAGCGA +AGTT--AACAAATGAACCCC +>Seq0003 +GAGT-GA-GCTTTATGGCA-AAGGTATTAG-GACTGA----CACCCCGGCATGGTAAGCAGGAGCCA-CGCGAAGGCTTC +AGGTATCTTCCTGT-TTACC +>Seq0004 +CATAGCCCC--ATG-CC-GACC-GTGTCGCGGCAACGTCTACA-TTCACGATAAA-ACTCCG-TGCTAGTCGGCTC-AGA +TGCTTTTCTTCCA-ATCTGG +>Seq0005 +-GTTTGA-TAT---CGCCGGCTTAGTGCTGACA-TGATGCTCCGTT-TAAGGGTCCTGATGTT-TTGTGCTC-CGCATAT +TA-AGCTGAGTTTCCCAAAG +>Seq0006 +TCGC-ACGG-GTGGAATGTACGTTAT-GCA-TAATCAGCG-CTTTC-CCGACATGCCCCCTCCGTGGCTCCTT-CG-CCA +-CGGCGGACCTGCGGT-TCG +>Seq0007 +CT-G-AAT-CCTGCGCTATTTCGTCAGTTCGTG-ACGGGTAAC-A--GCGGTTAATGCTT-TTCCGATCAGCTCA-ACCC +ATGAAGGTGGCTCTGG-GCC +>Seq0008 +TCGTTAACCCACTCTAACCACCTC-TG--GCGACATCGG-T-CTCGGCTTGGATACCTTC-TC-TATTGGACCCCAGG-C +TCAA-CTCGTGAGCTCTCT- +>Seq0009 +ACCT-CGGCTCTAGACAGCTGAA--CCGGTTCCGAGCACTGTA-GGAAACTTG-AAAGGCTCGACGGA-G-TTGTTCC-C +AGAGTGGGA-TATAACATA- +EOF cat > expected <Seq0000 ATATGGCGATCAAAGTTCCAATGAGATACTTCCTTTACG @@ -94,10 +126,22 @@ Alignment (0) number of gaps=61 Alignment (0) number of start gaps=1 Alignment (0) number of end gaps=1 EOF -${GOALIGN} random --seed 10 | ${GOALIGN} mutate gaps -n 1 -r 0.1 --seed 10 | ${GOALIGN} clean sites > result 2>log +cat > expectedlog2 < result 2>log diff -q -b result expected diff -q -b log expectedlog -rm -f expected result mapfile log expectedlog +rm -f result mapfile log expectedlog + +${GOALIGN} replace -s - -n N -i input | ${GOALIGN} clean sites --char n --ignore-case > result 2>log 
+diff -q -b result expected +diff -q -b log expectedlog2 +rm -f expected result mapfile log expectedlog2 echo "->goalign clean sites --ends" @@ -225,6 +269,38 @@ diff -q -b log expectedlog3 rm -f expected3 result log expectedlog3 echo "->goalign clean seqs" +cat > input <Seq0000 +GATTAATTTGCCGTAGGCCAGAATCTGAAGATCGAACACTTTAAGTTTTCACTTCTAATGGAGAGGACTAGTTCATACTT +TTTAAACACTTTTACATCGA +>Seq0001 +--T-------A------G---A-AA-G--------C---G------A-G--T-T---T-C-G--GA---AC----G---T +-T-A-------G---ATGTG +>Seq0002 +-T-----C-GG---A-T---G------CAAGG-T-A---------A--GC--C---A-C---C-----C-----G---G- +AG-------A--TG----C- +>Seq0003 +GAGTGGAGGCTTTATGGCACAAGGTATTAGAGACTGAGGGGCACCCCGGCATGGTAAGCAGGAGCCATCGCGAAGGCTTC +AGGTATCTTCCTGTGTTACC +>Seq0004 +--T--C-----AT--C-------G------G---A-G--TAC--T--AC-A-----AC---G--G--A--CG-CT----- +T---T---T------T-T-- +>Seq0005 +AGTTTGACTATGAGCGCCGGCTTAGTGCTGACAGTGATGCTCCGTTGTAAGGGTCCTGATGTTCTTGTGCTCGCGCATAT +TAGAGCTGAGTTTCCCAAAG +>Seq0006 +T----A----G----A-GT-CG-------A--AA-CAG-G-----C--C---A-------T-------T-CT--C--C-A +---G----C----G-T--C- +>Seq0007 +CTGGTAATACCTGCGCTATTTCGTCAGTTCGTGTACGGGTAACGATAGCGGTTAATGCTTATTCCGATCAGCTCACACCC +ATGAAGGTGGCTCTGGAGCC +>Seq0008 +T---T--CCCA--------A-CT--------G--A---G---C-C----TGG-----T---T-------G-C-C--G--- +-C------GT-A-CT-T--- +>Seq0009 +ACCTACGGCTCTAGACAGCTGAAGTCCGGTTCCGAGCACTGTACGGAAACTTGAAAAGGCTCGACGGAGGCTTGTTCCGC +AGAGTGGGACTATAACATAC +EOF cat > expected <Seq0000 GATTAATTTGCCGTAGGCCAGAATCTGAAGATCGAACACTTTAAGTTTTCACTTCTAATGGAGAGGACTAGTTCATACTT @@ -247,7 +323,15 @@ cat > expectedlog < result 2>log +${GOALIGN} clean seqs -i input > result 2>log +diff -q -b result expected +rm -f result mapfile log expectedlog + +${GOALIGN} clean seqs --char GAP -i input > result 2>log +diff -q -b result expected +rm -f result mapfile log expectedlog + +${GOALIGN} replace -s "-" -n "N" -i input | ${GOALIGN} clean seqs --char n --ignore-case > result 2>log diff -q -b result expected rm -f expected result mapfile log expectedlog