Skip to content

Commit

Permalink
deal with ambiguous epithets (close #191)
Browse files Browse the repository at this point in the history
  • Loading branch information
dimus committed Nov 3, 2021
1 parent 467da55 commit 4ed1d66
Show file tree
Hide file tree
Showing 13 changed files with 577 additions and 126 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,10 @@

## Unreleased

## [v1.5.1]

- Add [#191]: support for ambiguous specific epithets

## [v1.5.0]

- Add [#194]: support for cultivars' graft-chimeras (courtesy of @tobymarsden)
Expand Down
113 changes: 80 additions & 33 deletions ent/internal/preprocess/preprocess.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,28 +33,46 @@ var VirusException = map[string]string{
"Tidabius": "vector",
}

var AnnotationException = map[string]string{
"Acrostichum": "nudum",
"Adiantum": "nudum",
"Africanthion": "nudum",
"Agathidium": "nudum",
"Aphaniosoma": "nudum",
"Aspidium": "nudum",
"Athyrium": "nudum",
"Blechnum": "nudum",
"Bottaria": "nudum",
"Gnathopleustes": "den",
"Lycopodium": "nudum",
"Nephrodium": "nudum",
"Paralvinella": "dela",
"Polypodium": "nudum",
"Polystichum": "nudum",
"Psilotum": "nudum",
"Ruteloryctes": "bis",
"Selenops": "ab",
"Tortolena": "dela",
"Trachyphloeosoma": "nudum",
"Zodarion": "van",
// AmbiguousException lists, per first word of a name, the epithets that get
// their first letter substituted before parsing (see Preprocessor.ambiguous).
// Keys are compared with the first whitespace-separated word of the input;
// values are the epithets searched for in the rest of the input.
var AmbiguousException = map[string][]string{
	"Acrostichum":      {"nudum"},
	"Adiantum":         {"nudum"},
	"Africanthion":     {"nudum"},
	"Agathidium":       {"nudum"},
	"Agnetina":         {"den"},
	"Antaplaga":        {"dela"},
	"Aphaniosoma":      {"nudum"},
	"Aspidium":         {"nudum"},
	"Athyrium":         {"nudum"},
	"Baeolidia":        {"dela"},
	"Blechnum":         {"nudum"},
	"Bolitoglossa":     {"la"},
	"Bottaria":         {"nudum"},
	"Campylosphaera":   {"dela"},
	"Desmoxytes":       {"des"},
	"Dicentria":        {"dela"},
	"Eulaira":          {"dela"},
	"Gnathopleustes":   {"den"},
	"Helophorus":       {"ser"},
	"Leptonetela":      {"la"},
	"Lycopodium":       {"nudum"},
	"Malamatidia":      {"zu"},
	"Meteorus":         {"dos"},
	"Nephrodium":       {"nudum"},
	"Nocaracris":       {"van"},
	"Paralvinella":     {"dela"},
	"Polypodium":       {"nudum"},
	"Polystichum":      {"nudum"},
	"Psilotum":         {"nudum"},
	"Ruteloryctes":     {"bis"},
	"Scoparia":         {"dela"},
	"Selenops":         {"ab"},
	"Semiothisa":       {"da"},
	"Serina":           {"ser", "subser"},
	"Stenoecia":        {"dos"},
	"Sympycnus":        {"du"},
	"Tortolena":        {"dela"},
	"Trachyphloeosoma": {"nudum"},
	"Zodarion":         {"van"},
}

var NoParseException = map[string]string{
Expand Down Expand Up @@ -104,33 +122,51 @@ type Preprocessor struct {
Annotation bool
Body []byte
Tail []byte
Ambiguous ambiguous
}

// ambiguous records one epithet substitution made during preprocessing.
//
// Orig is the epithet as listed in AmbiguousException; Subst is the same
// epithet with its first byte replaced by the placeholder letter.
type ambiguous struct {
	Orig, Subst string
}

// Preprocess runs a series of regular expressions over the input to determine
// features of the input before parsing.
func Preprocess(bs []byte) *Preprocessor {
pr := &Preprocessor{}

// check for empty string
if len(bs) == 0 {
pr.NoParse = true
return pr
}
i := len(bs)
name := string(bs)
if !IsException(name, VirusException) {
words := strings.Fields(string(bs))

// check for viruses, plasmids, RNA, DNA etc.
if !isException(words, VirusException) {
pr.Virus = IsVirus(bs[0:i])
}
if pr.Virus {
pr.NoParse = true
return pr
}

// check for unparseable names
pr.NoParse = NoParse(bs[0:i])
if IsException(name, NoParseException) {
if isException(words, NoParseException) {
pr.NoParse = false
}
if pr.NoParse {
return pr
}
j := Annotation(bs[0:i])

//
if len(words) > 1 {
pr.ambiguous(words[0], bs)
}

j := procAnnot(bs[0:i])
if j < i {
pr.Annotation = true
i = j
Expand All @@ -147,8 +183,7 @@ func Preprocess(bs []byte) *Preprocessor {
return pr
}

func IsException(name string, names map[string]string) bool {
words := strings.Fields(name)
func isException(words []string, names map[string]string) bool {
if len(words) < 2 {
return false
}
Expand All @@ -162,14 +197,26 @@ func IsException(name string, names map[string]string) bool {
return false
}

// Annotation returns index where unparsed part starts. In case if
// ambiguous looks firstWord up in AmbiguousException and, for every listed
// epithet that occurs in bs right after a space, overwrites the first byte of
// that occurrence with 'k' (bs is modified in place). The epithet and its
// substituted spelling are stored in p.Ambiguous.
//
// Only the first occurrence of each epithet is touched. The search is not
// anchored at the end of a word, so a longer word that begins with the epithet
// matches as well. When several epithets of the same entry match, p.Ambiguous
// keeps the last one.
func (p *Preprocessor) ambiguous(firstWord string, bs []byte) {
	candidates, listed := AmbiguousException[firstWord]
	if !listed {
		return
	}

	const placeholder = "k"
	for _, ep := range candidates {
		pos := bytes.Index(bs, []byte(" "+ep))
		if pos < 0 {
			continue
		}
		p.Ambiguous.Orig = ep
		p.Ambiguous.Subst = placeholder + ep[1:]
		// pos is the index of the space; the epithet starts one byte later.
		bs[pos+1] = placeholder[0]
	}
}

// procAnnot returns the index where the unparsed part starts. If the full
// string can be parsed, it returns the index of the end of the input.
func Annotation(bs []byte) int {
func procAnnot(bs []byte) int {
i := len(bs)
if IsException(string(bs), AnnotationException) {
return i
}
regexps := []*regexp.Regexp{
notesRe, taxonConceptsRe1, taxonConceptsRe2, taxonConceptsRe3,
nomenConceptsRe, lastWordJunkRe, stopWordsRe,
Expand Down
58 changes: 14 additions & 44 deletions ent/internal/preprocess/preprocess_test.go
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
package preprocess_test
package preprocess

import (
"strings"
"testing"

ppr "github.com/gnames/gnparser/ent/internal/preprocess"
"github.com/stretchr/testify/assert"
)

Expand All @@ -29,12 +29,12 @@ func TestCleanup(t *testing.T) {
{"entities", "Hello &amp; you", "Hello & you"},
}
for _, v := range data {
assert.Equal(t, ppr.StripTags(v.tags), v.notags, v.msg)
assert.Equal(t, StripTags(v.tags), v.notags, v.msg)
}
})
t.Run("does not return nil", func(t *testing.T) {
assert.NotNil(t, ppr.StripTags("<!--"))
assert.NotNil(t, ppr.StripTags("<!--\r\n"))
assert.NotNil(t, StripTags("<!--"))
assert.NotNil(t, StripTags("<!--\r\n"))
})
}

Expand All @@ -48,39 +48,8 @@ func TestPreprocess(t *testing.T) {
{"name", "Navicula bacterium", true},
}
for _, v := range data {
assert.Equal(t, ppr.IsException(v.name, ppr.NoParseException), v.likeAnnotation, v.msg)
}
})
t.Run("AnnotationLikeName", func(t *testing.T) {
data := []struct {
msg string
name string
likeAnnotation bool
}{
{"name", "Acrostichum nudum", true},
{"name", "Adiantum nudum", true},
{"name", "Africanthion nudum", true},
{"name", "Agathidium nudum", true},
{"name", "Aphaniosoma nudum", true},
{"name", "Aspidium nudum", true},
{"name", "Athyrium nudum", true},
{"name", "Blechnum nudum", true},
{"name", "Bottaria nudum", true},
{"name", "Gnathopleustes den", true},
{"name", "Lycopodium nudum", true},
{"name", "Nephrodium nudum", true},
{"name", "Paralvinella dela", true},
{"name", "Polypodium nudum", true},
{"name", "Polystichum nudum", true},
{"name", "Psilotum nudum", true},
{"name", "Ruteloryctes bis", true},
{"name", "Selenops ab", true},
{"name", "Tortolena dela", true},
{"name", "Trachyphloeosoma nudum", true},
{"name", "Zodarion van", true},
}
for _, v := range data {
assert.Equal(t, ppr.IsException(v.name, ppr.AnnotationException), v.likeAnnotation, v.msg)
words := strings.Split(v.name, " ")
assert.Equal(t, isException(words, NoParseException), v.likeAnnotation, v.msg)
}
})

Expand Down Expand Up @@ -109,7 +78,8 @@ func TestPreprocess(t *testing.T) {
{"name17", "Homo sapiens coronavirus", false},
}
for _, v := range data {
assert.Equal(t, ppr.IsException(v.name, ppr.VirusException), v.likeVirus, v.msg)
words := strings.Split(v.name, " ")
assert.Equal(t, isException(words, VirusException), v.likeVirus, v.msg)
}
})

Expand Down Expand Up @@ -139,7 +109,7 @@ func TestPreprocess(t *testing.T) {
{"Match word", "Bacteriophage PH75", true},
}
for _, v := range data {
res := ppr.IsVirus([]byte(v.name))
res := IsVirus([]byte(v.name))
assert.Equal(t, res, v.isVirus, v.msg)
}
})
Expand Down Expand Up @@ -173,7 +143,7 @@ func TestPreprocess(t *testing.T) {
{"RNA4", "E. coli mRNA", true},
}
for _, v := range data {
res := ppr.NoParse([]byte(v.name))
res := NoParse([]byte(v.name))
assert.Equal(t, res, v.parsed, v.msg)
}
})
Expand All @@ -194,7 +164,7 @@ func TestPreprocess(t *testing.T) {
}
for _, v := range data {
bs := []byte(v.in)
i := ppr.Annotation(bs)
i := procAnnot(bs)
assert.Equal(t, string(bs[0:i]), v.out, v.msg)
assert.Equal(t, string(bs[i:]), v.tail, v.msg)
}
Expand All @@ -214,15 +184,15 @@ func TestPreprocess(t *testing.T) {
}
for _, v := range data {
bs := []byte(v.in)
changed2, _ := ppr.UnderscoreToSpace(bs)
changed2, _ := UnderscoreToSpace(bs)
assert.Equal(t, string(bs), v.out, v.msg)
assert.Equal(t, changed2, v.changed)
}
})

t.Run("does not remove spaces", func(t *testing.T) {
name := " Asplenium × inexpectatum(E. L. Braun ex Friesner )Morton"
res := ppr.Preprocess([]byte(name))
res := Preprocess([]byte(name))
assert.Equal(t, string(res.Body), name)
})
}
4 changes: 3 additions & 1 deletion ent/parsed/parsed.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ type Parsed struct {
// The ParseQuality is equal to the quality of the most
// severe warning (see qualityWarnings). If no problems
// are encountered, and the parsing succeeded, the parseQuality
// is set to 1. If parsing failed, the parseQuality is 0.
// is set to 1. If parsing failed, the parseQuality is 0.
ParseQuality int `json:"quality"`
// QualityWarnings contains encountered parsing problems.
QualityWarnings []QualityWarning `json:"qualityWarnings,omitempty"`
Expand Down Expand Up @@ -78,12 +78,14 @@ type Parsed struct {
// - notho- hybrid
// - hybrid formula
Hybrid *Annotation `json:"hybrid,omitempty"`

// GraftChimera is not nil if a name is detected as one of the graft chimeras
//
// - a non-categorized graft chimera
// - named graft chimera
// - graft chimera formula
GraftChimera *Annotation `json:"graftchimera,omitempty"`

// Surrogate is a wide category of names that do not follow
// nomenclatural rules

Expand Down
48 changes: 48 additions & 0 deletions ent/parsed/restore_ambiguous.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
package parsed

import (
"strings"

"github.com/gnames/gnparser/ent/stemmer"
)

// RestoreAmbiguous is used for cases where a specific or infra-specific
// epithet had to be changed to be parsed successfully. Such a situation arises
// when an epithet is the same as some word that is also an annotation, a
// prefix/suffix of an author name etc.
//
// epithet is the original spelling and subst is the spelling that was parsed
// instead. The method rewrites subst back to epithet in Normalized, in the
// Full and Simple canonical forms, in the Verbatim and Normalized fields of
// every word, and in the species epithet when Details is a DetailsSpecies.
// The Stemmed canonical form is rewritten using the stems of both spellings.
//
// NOTE(review): Details of any type other than DetailsSpecies are left
// untouched here -- confirm that this is intended for infra-specific names.
func (p *Parsed) RestoreAmbiguous(epithet, subst string) {
	origStem, substStem := stemmer.Stem(epithet).Stem, stemmer.Stem(subst).Stem

	// Whole-string representations: only the first matching word is changed.
	p.Normalized = restoreString(p.Normalized, epithet, subst)
	p.Canonical.Full = restoreString(p.Canonical.Full, epithet, subst)
	p.Canonical.Simple = restoreString(p.Canonical.Simple, epithet, subst)
	p.Canonical.Stemmed = restoreString(p.Canonical.Stemmed, origStem, substStem)

	// Per-word representations.
	for idx := range p.Words {
		p.Words[idx].Verbatim = restoreWord(p.Words[idx].Verbatim, epithet, subst)
		p.Words[idx].Normalized = restoreWord(p.Words[idx].Normalized, epithet, subst)
	}

	species, isSpecies := p.Details.(DetailsSpecies)
	if !isSpecies {
		return
	}
	// The type assertion yields a copy, so it has to be stored back.
	species.Species.Species = restoreWord(species.Species.Species, epithet, subst)
	p.Details = species
}

// restoreString splits s on single spaces and, in the first word that starts
// with subst, replaces that prefix with epithet; whatever followed the prefix
// in the word is kept. The words are joined back with single spaces.
//
// s is returned unchanged when no word starts with subst, or when subst is
// empty (an empty placeholder means that nothing was substituted).
func restoreString(s, epithet, subst string) string {
	if subst == "" {
		return s
	}
	words := strings.Split(s, " ")
	for i, w := range words {
		if !strings.HasPrefix(w, subst) {
			continue
		}
		// Cut off exactly the matched prefix: its length is len(subst), which
		// need not be equal to len(epithet).
		words[i] = epithet + w[len(subst):]
		return strings.Join(words, " ")
	}
	return s
}

// restoreWord returns w with its leading subst replaced by epithet; whatever
// followed subst in w is kept. w is returned unchanged when it does not start
// with subst, or when subst is empty (an empty placeholder means that nothing
// was substituted).
func restoreWord(w, epithet, subst string) string {
	if subst == "" || !strings.HasPrefix(w, subst) {
		return w
	}
	// Cut off exactly the matched prefix: its length is len(subst), which need
	// not be equal to len(epithet).
	return epithet + w[len(subst):]
}
Loading

0 comments on commit 4ed1d66

Please sign in to comment.