Skip to content

Commit

Permalink
replacing internal spaces and tabs by _ in phylip conversion
Browse files Browse the repository at this point in the history
  • Loading branch information
fredericlemoine committed Jan 11, 2018
1 parent b7e9d6e commit 6e6836c
Show file tree
Hide file tree
Showing 3 changed files with 39 additions and 11 deletions.
10 changes: 7 additions & 3 deletions align/align.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ import (
"log"
"math"
"math/rand"
"regexp"
"sort"
"strings"

Expand Down Expand Up @@ -69,7 +70,7 @@ type Alignment interface {
Rename(namemap map[string]string)
Pssm(log bool, pseudocount float64, normalization int) (pssm map[rune][]float64, err error) // Normalization: PSSM_NORM_NONE, PSSM_NORM_UNIF, PSSM_NORM_DATA
TrimNames(size int) (map[string]string, error)
CleanNames() // Removes spaces and tabs from sequence names
CleanNames() // Removes spaces and tabs at beginning and end of sequence names and replaces others by "_"
TrimSequences(trimsize int, fromStart bool) error
AppendSeqIdentifier(identifier string, right bool)
AvgAllelesPerSite() float64
Expand Down Expand Up @@ -678,9 +679,12 @@ func (a *align) AppendSeqIdentifier(identifier string, right bool) {

// CleanNames normalizes whitespace in the name of every sequence of the
// alignment: whitespace at the beginning and at the end of a name is removed,
// and each remaining (internal) run of whitespace is replaced by a single "_".
//
// "Whitespace" is what the regexp class \s matches: space, \t, \n, \f and \r.
func (a *align) CleanNames() {
	// \s already includes tabs, so a single class covers spaces and tabs.
	inside := regexp.MustCompile(`\s+`)

	for _, seq := range a.seqs {
		// Trim the edges first so that leading/trailing whitespace is dropped
		// instead of being turned into "_" by the replacement below.
		trimmed := strings.Trim(seq.name, " \t\n\f\r")
		seq.name = inside.ReplaceAllString(trimmed, "_")
	}
}

Expand Down
24 changes: 24 additions & 0 deletions align/align_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,30 @@ func TestAppendIdentifier(t *testing.T) {
})
}

// TestCleanNames appends a whitespace-only identifier ("\t \t") to every
// sequence name twice (once with right=false, once with right=true), calls
// CleanNames, and checks that each name is again equal to the name found at
// the same index in a clone taken before the modification.
func TestCleanNames(t *testing.T) {
	a, err := RandomAlignment(AMINOACIDS, 300, 50)
	if err != nil {
		// Without an alignment nothing below can run meaningfully.
		t.Fatal(err)
	}
	// Reference copy holding the untouched names.
	a2, err := a.Clone()
	if err != nil {
		t.Fatal(err)
	}
	a.AppendSeqIdentifier("\t \t", false)
	a.AppendSeqIdentifier("\t \t", true)

	a.CleanNames()
	i := 0
	a.IterateChar(func(name string, sequence []rune) {
		expected, found := a2.GetSequenceNameById(i)
		if !found {
			t.Error("Unknown sequence name after clean")
		}
		if name != expected {
			t.Error("Unexpected sequence name after clean")
		}
		i++
	})
}

func TestRemoveOneGapSite(t *testing.T) {
a, err := RandomAlignment(AMINOACIDS, 300, 300)
if err != nil {
Expand Down
16 changes: 8 additions & 8 deletions test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -166,20 +166,20 @@ rm -f expected result mapfile
echo "->goalign reformat phylip spaces tabs"
cat > expected <<EOF
5 5
Seq0000 GATTA
Seq0001 ATTTG
Seq0002 CCGTA
Seq0003 GGCCA
Seq0004 GAATC
S_e_q_0_0_00 GATTA
Se_q00_0_1 ATTTG
Se_q00_0_2 CCGTA
Seq00_03 GGCCA
Seq_0004 GAATC
EOF
cat > input.fa <<EOF
> S e q 0 0 00
GATTA
> Se q00 0 1
> Se q00 0 1
ATTTG
> Se q00 0 2
> Se q00 0 2
CCGTA
>Seq00 03
> Seq00 03
GGCCA
>Seq 0004
GAATC
Expand Down

0 comments on commit 6e6836c

Please sign in to comment.