diff --git a/align/align_test.go b/align/align_test.go index 533cb2f..8179717 100644 --- a/align/align_test.go +++ b/align/align_test.go @@ -1336,6 +1336,45 @@ func TestDiffCount(t *testing.T) { } +func TestDiffReverse(t *testing.T) { + in := NewAlign(UNKNOWN) + in.AddSequence("Seq0000", "GATTAATTTGCCGTAGGCCAGAATCTGAAGATCGAACACTTTAAGTTTTCACTTCTAATGGAGAGGACTAGTTCATACTTTTTAAACACTTTTACATCGA", "") + in.AddSequence("Seq0001", "TG.CGGACCTAA...TTGAGT.CAAC.GT.TATTCCAG.GG.GGAGAGGTCTA.T.TTCC.GTT.A.GG.C.CT.G.GC.G.A..GGGTA.GGC...GTG", "") + in.AddSequence("Seq0002", "CTAAGCGCG.G..G.TTG.T.TTGGA.C.AGGTT..ATAC.CGGCAA.G.C.CATG.TCCCCC.A.GAC.A.AAGAG.GAAG.T.GA..AAA.GA.C.CC", "") + in.AddSequence("Seq0003", "..G.GGAGGCTTTAT...ACA.GGTATT...GACTGAGGGGC.CCCCGG..TGGTA.GCA.GAGCC.TCGCGAAGGCT.CAGGT.T.TTCC.GTGT.ACC", "") + in.AutoAlphabet() + + out, _ := in.Clone() + out.ReplaceMatchChars() + + i := 0 + var ref []rune + out.IterateChar(func(name string, seq []rune) { + orig, _ := in.GetSequenceCharById(i) + if i == 0 { + ref = seq + for site := 0; site < out.Length(); site++ { + if seq[site] != orig[site] { + t.Errorf("Original reference character has been changed (%c vs. %c)", orig[site], seq[site]) + } + } + } else { + for site := 0; site < out.Length(); site++ { + if orig[site] == POINT { + if seq[site] != ref[site] { + t.Errorf(". has not been replaced by the right character (%c vs. %c)", ref[site], seq[site]) + } + } else { + if seq[site] != orig[site] { + t.Errorf("Original character has been changed (%c vs. %c)", orig[site], seq[site]) + } + } + } + } + i++ + }) +} + func TestCompress(t *testing.T) { in := NewAlign(UNKNOWN) in.AddSequence("Seq0000", "GGTTTTTTTT", "") diff --git a/cmd/diff.go b/cmd/diff.go index fa65e6a..d1319d0 100644 --- a/cmd/diff.go +++ b/cmd/diff.go @@ -14,6 +14,7 @@ import ( var diffOutput string var diffCount bool var diffNoGaps bool +var reverse bool // statsCmd represents the stats command var diffCmd = &cobra.Command{ @@ -31,6 +32,8 @@ The format is tab separated, with following columns: 1. Sequence name (reference sequence is not included) 2,...,end: For each type of change, its number of occurence + +If option --reverse is given, then replaces . with the characters on the first sequence `, RunE: func(cmd *cobra.Command, args []string) (err error) { var aligns *align.AlignChannel @@ -52,7 +55,11 @@ The format is tab separated, with following columns: alldiffs, diffs := al.CountDifferences() writeDiffCounts(al, alldiffs, diffs, f) } else { - al.DiffWithFirst() + if reverse { + al.ReplaceMatchChars() + } else { + al.DiffWithFirst() + } writeAlign(al, f) } } @@ -90,5 +97,6 @@ func init() { diffCmd.PersistentFlags().StringVarP(&diffOutput, "output", "o", "stdout", "Diff output file") diffCmd.PersistentFlags().BoolVar(&diffCount, "counts", false, "Count differences instead of writting only identical characters") diffCmd.PersistentFlags().BoolVar(&diffNoGaps, "no-gaps", false, "Do not count gaps (only with --counts)") + diffCmd.PersistentFlags().BoolVar(&reverse, "reverse", false, "Restore identical characters (.) using first sequence positions") RootCmd.AddCommand(diffCmd) } diff --git a/test.sh b/test.sh index aa47a65..93ea4f5 100755 --- a/test.sh +++ b/test.sh @@ -3019,6 +3019,78 @@ ${GOALIGN} diff -i input -p --one-line -o result --counts diff -q -b expected result rm -f input expected result +echo "->goalign diff --reverse" + +cat > input <Seq0000 +GATTAATTTGCCGTAGGCCAGAATCTGAAGATCGAACACTTTAAGTTTTCACTTCTAATGGAGAGGACTAGTTCATACTT +TTTAAACACTTTTACATCGA +>Seq0001 +TG.CGGACCTAA...TTGAGT.CAAC.GT.TATTCCAG.GG.GGAGAGGTCTA.T.TTCC.GTT.A.GG.C.CT.G.GC. +G.A..GGGTA.GGC...GTG +>Seq0002 +CTAAGCGCG.G..G.TTG.T.TTGGA.C.AGGTT..ATAC.CGGCAA.G.C.CATG.TCCCCC.A.GAC.A.AAGAG.GA +AG.T.GA..AAA.GA.C.CC +>Seq0003 +..G.GGAGGCTTTAT...ACA.GGTATT...GACTGAGGGGC.CCCCGG..TGGTA.GCA.GAGCC.TCGCGAAGGCT.C +AGGT.T.TTCC.GTGT.ACC +>Seq0004 +C..AGCCCCTGATGCCCTG.CCCGTGTCGCGG.A.CGT..AC.TT.CACG.TAAA..C.CCGCT.CTAGTCGG.TCTAGA +.GCTTTTCT.CCAGATCT.G +>Seq0005 +AG..TGAC.ATGAGC.C.GGCTTAG..CT..CA.TGATGC.CCGT.G.AAGGG..CTGAT.TTCTTGTGCTCG.GC.TA. +.AG.GCTGAG...C.CAAAG +>Seq0006 +TCGCC.CGGTGT.G.ATGT.CGT.A..GCAG.AATCAG.GGCTTTCACCG..A.GCCCCCTCCGT.G..CC..GCG..CA +.CGGCGG..C.GCGGTGTCG +>Seq0007 +CTGGT.A.AC.T.CGCTATTTCG..A.TTCG.GT.CGGG.AACGA.AGCGGT.AA.GC.TATTCC..TC..C...C..CC +A.G..GGTGGC.CTGGAGCC +>Seq0008 +TCG.T.ACCCA.TCTAA...CCTC...T..CGAC.T.GGG.GCTCGGC.TGGA.ACCT.C.TC.TATTGGACC.CAGG.C +.CA.CCTCG.GAGCTC..TG +>Seq0009 +ACC..CGGCT.TAG.CAG.T...GTCCGGTTC...G....G..C.GAAA.TTGAAA.GGCTC..C.GAGGC..GT.C.GC +AGAGTGGGAC.A..ACATAC +EOF + +cat > expected <Seq0000 +GATTAATTTGCCGTAGGCCAGAATCTGAAGATCGAACACTTTAAGTTTTCACTTCTAATGGAGAGGACTAGTTCATACTT +TTTAAACACTTTTACATCGA +>Seq0001 +TGTCGGACCTAAGTATTGAGTACAACGGTGTATTCCAGCGGTGGAGAGGTCTATTTTTCCGGTTGAAGGACTCTAGAGCT +GTAAAGGGTATGGCCATGTG +>Seq0002 +CTAAGCGCGGGCGGATTGCTGTTGGAGCAAGGTTAAATACTCGGCAATGCCCCATGATCCCCCAAGGACAATAAGAGCGA +AGTTAGAACAAATGAACCCC +>Seq0003 +GAGTGGAGGCTTTATGGCACAAGGTATTAGAGACTGAGGGGCACCCCGGCATGGTAAGCAGGAGCCATCGCGAAGGCTTC +AGGTATCTTCCTGTGTTACC +>Seq0004 +CATAGCCCCTGATGCCCTGACCCGTGTCGCGGCAACGTCTACATTTCACGATAAATACTCCGCTGCTAGTCGGCTCTAGA +TGCTTTTCTTCCAGATCTGG +>Seq0005 +AGTTTGACTATGAGCGCCGGCTTAGTGCTGACAGTGATGCTCCGTTGTAAGGGTCCTGATGTTCTTGTGCTCGCGCATAT +TAGAGCTGAGTTTCCCAAAG +>Seq0006 +TCGCCACGGTGTGGAATGTACGTTATGGCAGTAATCAGCGGCTTTCACCGACATGCCCCCTCCGTGGCTCCTTGCGACCA +TCGGCGGACCTGCGGTGTCG +>Seq0007 +CTGGTAATACCTGCGCTATTTCGTCAGTTCGTGTACGGGTAACGATAGCGGTTAATGCTTATTCCGATCAGCTCACACCC +ATGAAGGTGGCTCTGGAGCC +>Seq0008 +TCGTTAACCCACTCTAACCACCTCCTGTAGCGACATCGGGTGCTCGGCTTGGATACCTTCGTCATATTGGACCCCAGGTC +TCAACCTCGTGAGCTCTCTG +>Seq0009 +ACCTACGGCTCTAGACAGCTGAAGTCCGGTTCCGAGCACTGTACGGAAACTTGAAAAGGCTCGACGGAGGCTTGTTCCGC +AGAGTGGGACTATAACATAC +EOF + +${GOALIGN} diff --reverse -i input -o result +diff -q -b expected result +rm -f input expected result + echo "->goalign stats auto from 1000 random phylip alignments" cat > expected.nb <