deal with ambiguous epithets (close #191)

gnames · Nov 3, 2021 · 4ed1d66 · 4ed1d66
1 parent 467da55
commit 4ed1d66
Show file tree

Hide file tree

Showing 13 changed files with 577 additions and 126 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -2,6 +2,10 @@
 
 ## Unreleased
 
+## [v1.5.1]
+
+- Add [#191]: support for ambiguous specific epithets
+
 ## [v1.5.0]
 
 - Add [#194]: support for cultivars' graft-chymeras (courtesy of @tobymarsden)

diff --git a/ent/internal/preprocess/preprocess.go b/ent/internal/preprocess/preprocess.go
@@ -33,28 +33,46 @@ var VirusException = map[string]string{
 	"Tidabius":      "vector",
 }
 
-var AnnotationException = map[string]string{
-	"Acrostichum":      "nudum",
-	"Adiantum":         "nudum",
-	"Africanthion":     "nudum",
-	"Agathidium":       "nudum",
-	"Aphaniosoma":      "nudum",
-	"Aspidium":         "nudum",
-	"Athyrium":         "nudum",
-	"Blechnum":         "nudum",
-	"Bottaria":         "nudum",
-	"Gnathopleustes":   "den",
-	"Lycopodium":       "nudum",
-	"Nephrodium":       "nudum",
-	"Paralvinella":     "dela",
-	"Polypodium":       "nudum",
-	"Polystichum":      "nudum",
-	"Psilotum":         "nudum",
-	"Ruteloryctes":     "bis",
-	"Selenops":         "ab",
-	"Tortolena":        "dela",
-	"Trachyphloeosoma": "nudum",
-	"Zodarion":         "van",
+var AmbiguousException = map[string][]string{
+	"Acrostichum":      {"nudum"},
+	"Adiantum":         {"nudum"},
+	"Africanthion":     {"nudum"},
+	"Campylosphaera":   {"dela"},
+	"Agathidium":       {"nudum"},
+	"Agnetina":         {"den"},
+	"Antaplaga":        {"dela"},
+	"Aphaniosoma":      {"nudum"},
+	"Aspidium":         {"nudum"},
+	"Athyrium":         {"nudum"},
+	"Baeolidia":        {"dela"},
+	"Blechnum":         {"nudum"},
+	"Bolitoglossa":     {"la"},
+	"Bottaria":         {"nudum"},
+	"Desmoxytes":       {"des"},
+	"Dicentria":        {"dela"},
+	"Eulaira":          {"dela"},
+	"Gnathopleustes":   {"den"},
+	"Helophorus":       {"ser"},
+	"Leptonetela":      {"la"},
+	"Lycopodium":       {"nudum"},
+	"Malamatidia":      {"zu"},
+	"Meteorus":         {"dos"},
+	"Nephrodium":       {"nudum"},
+	"Nocaracris":       {"van"},
+	"Paralvinella":     {"dela"},
+	"Polypodium":       {"nudum"},
+	"Polystichum":      {"nudum"},
+	"Psilotum":         {"nudum"},
+	"Ruteloryctes":     {"bis"},
+	"Scoparia":         {"dela"},
+	"Selenops":         {"ab"},
+	"Semiothisa":       {"da"},
+	"Serina":           {"ser", "subser"},
+	"Stenoecia":        {"dos"},
+	"Sympycnus":        {"du"},
+	"Tortolena":        {"dela"},
+	"Trachyphloeosoma": {"nudum"},
+	"Zodarion":         {"van"},
 }
 
 var NoParseException = map[string]string{
@@ -104,33 +122,51 @@ type Preprocessor struct {
 	Annotation  bool
 	Body        []byte
 	Tail        []byte
+	Ambiguous   ambiguous
+}
+
+type ambiguous struct {
+	Orig  string
+	Subst string
 }
 
 // Preprocess runs a series of regular expressions over the input to determine
 // features of the input before parsing.
 func Preprocess(bs []byte) *Preprocessor {
 	pr := &Preprocessor{}
+
+	// check for empty string
 	if len(bs) == 0 {
 		pr.NoParse = true
 		return pr
 	}
 	i := len(bs)
-	name := string(bs)
-	if !IsException(name, VirusException) {
+	words := strings.Fields(string(bs))
+
+	// check for viruses, plasmids, RNA, DNA etc.
+	if !isException(words, VirusException) {
 		pr.Virus = IsVirus(bs[0:i])
 	}
 	if pr.Virus {
 		pr.NoParse = true
 		return pr
 	}
+
+	// check for unparseable names
 	pr.NoParse = NoParse(bs[0:i])
-	if IsException(name, NoParseException) {
+	if isException(words, NoParseException) {
 		pr.NoParse = false
 	}
 	if pr.NoParse {
 		return pr
 	}
-	j := Annotation(bs[0:i])
+
+	// mask known ambiguous epithets so they are not mistaken for annotations
+	if len(words) > 1 {
+		pr.ambiguous(words[0], bs)
+	}
+
+	j := procAnnot(bs[0:i])
 	if j < i {
 		pr.Annotation = true
 		i = j
@@ -147,8 +183,7 @@ func Preprocess(bs []byte) *Preprocessor {
 	return pr
 }
 
-func IsException(name string, names map[string]string) bool {
-	words := strings.Fields(name)
+func isException(words []string, names map[string]string) bool {
 	if len(words) < 2 {
 		return false
 	}
@@ -162,14 +197,26 @@ func IsException(name string, names map[string]string) bool {
 	return false
 }
 
-// Annotation returns index where unparsed part starts. In case if
+func (p *Preprocessor) ambiguous(firstWord string, bs []byte) {
+	if epithets, ok := AmbiguousException[firstWord]; ok {
+		var sub byte = 'k'
+		for _, epithet := range epithets {
+			idx := bytes.Index(bs, []byte(" "+epithet))
+			if idx == -1 {
+				continue
+			}
+			p.Ambiguous.Orig = epithet
+			p.Ambiguous.Subst = string(sub) + epithet[1:]
+			bs[idx+1] = sub
+		}
+	}
+}
+
+// procAnnot returns the index where the unparsed part starts. If
 // the full string can be parsed, returns returns the index of the end of the
 // input.
-func Annotation(bs []byte) int {
+func procAnnot(bs []byte) int {
 	i := len(bs)
-	if IsException(string(bs), AnnotationException) {
-		return i
-	}
 	regexps := []*regexp.Regexp{
 		notesRe, taxonConceptsRe1, taxonConceptsRe2, taxonConceptsRe3,
 		nomenConceptsRe, lastWordJunkRe, stopWordsRe,

diff --git a/ent/internal/preprocess/preprocess_test.go b/ent/internal/preprocess/preprocess_test.go
@@ -1,9 +1,9 @@
-package preprocess_test
+package preprocess
 
 import (
+	"strings"
 	"testing"
 
-	ppr "github.com/gnames/gnparser/ent/internal/preprocess"
 	"github.com/stretchr/testify/assert"
 )
 
@@ -29,12 +29,12 @@ func TestCleanup(t *testing.T) {
 			{"entities", "Hello &amp; you", "Hello & you"},
 		}
 		for _, v := range data {
-			assert.Equal(t, ppr.StripTags(v.tags), v.notags, v.msg)
+			assert.Equal(t, StripTags(v.tags), v.notags, v.msg)
 		}
 	})
 	t.Run("does not return nil", func(t *testing.T) {
-		assert.NotNil(t, ppr.StripTags("<!--"))
-		assert.NotNil(t, ppr.StripTags("<!--\r\n"))
+		assert.NotNil(t, StripTags("<!--"))
+		assert.NotNil(t, StripTags("<!--\r\n"))
 	})
 }
 
@@ -48,39 +48,8 @@ func TestPreprocess(t *testing.T) {
 			{"name", "Navicula bacterium", true},
 		}
 		for _, v := range data {
-			assert.Equal(t, ppr.IsException(v.name, ppr.NoParseException), v.likeAnnotation, v.msg)
-		}
-	})
-	t.Run("AnnotationLikeName", func(t *testing.T) {
-		data := []struct {
-			msg            string
-			name           string
-			likeAnnotation bool
-		}{
-			{"name", "Acrostichum nudum", true},
-			{"name", "Adiantum nudum", true},
-			{"name", "Africanthion nudum", true},
-			{"name", "Agathidium nudum", true},
-			{"name", "Aphaniosoma nudum", true},
-			{"name", "Aspidium nudum", true},
-			{"name", "Athyrium nudum", true},
-			{"name", "Blechnum nudum", true},
-			{"name", "Bottaria nudum", true},
-			{"name", "Gnathopleustes den", true},
-			{"name", "Lycopodium nudum", true},
-			{"name", "Nephrodium nudum", true},
-			{"name", "Paralvinella dela", true},
-			{"name", "Polypodium nudum", true},
-			{"name", "Polystichum nudum", true},
-			{"name", "Psilotum nudum", true},
-			{"name", "Ruteloryctes bis", true},
-			{"name", "Selenops ab", true},
-			{"name", "Tortolena dela", true},
-			{"name", "Trachyphloeosoma nudum", true},
-			{"name", "Zodarion van", true},
-		}
-		for _, v := range data {
-			assert.Equal(t, ppr.IsException(v.name, ppr.AnnotationException), v.likeAnnotation, v.msg)
+			words := strings.Split(v.name, " ")
+			assert.Equal(t, isException(words, NoParseException), v.likeAnnotation, v.msg)
 		}
 	})
 
@@ -109,7 +78,8 @@ func TestPreprocess(t *testing.T) {
 			{"name17", "Homo sapiens coronavirus", false},
 		}
 		for _, v := range data {
-			assert.Equal(t, ppr.IsException(v.name, ppr.VirusException), v.likeVirus, v.msg)
+			words := strings.Split(v.name, " ")
+			assert.Equal(t, isException(words, VirusException), v.likeVirus, v.msg)
 		}
 	})
 
@@ -139,7 +109,7 @@ func TestPreprocess(t *testing.T) {
 			{"Match word", "Bacteriophage PH75", true},
 		}
 		for _, v := range data {
-			res := ppr.IsVirus([]byte(v.name))
+			res := IsVirus([]byte(v.name))
 			assert.Equal(t, res, v.isVirus, v.msg)
 		}
 	})
@@ -173,7 +143,7 @@ func TestPreprocess(t *testing.T) {
 			{"RNA4", "E. coli mRNA", true},
 		}
 		for _, v := range data {
-			res := ppr.NoParse([]byte(v.name))
+			res := NoParse([]byte(v.name))
 			assert.Equal(t, res, v.parsed, v.msg)
 		}
 	})
@@ -194,7 +164,7 @@ func TestPreprocess(t *testing.T) {
 		}
 		for _, v := range data {
 			bs := []byte(v.in)
-			i := ppr.Annotation(bs)
+			i := procAnnot(bs)
 			assert.Equal(t, string(bs[0:i]), v.out, v.msg)
 			assert.Equal(t, string(bs[i:]), v.tail, v.msg)
 		}
@@ -214,15 +184,15 @@ func TestPreprocess(t *testing.T) {
 		}
 		for _, v := range data {
 			bs := []byte(v.in)
-			changed2, _ := ppr.UnderscoreToSpace(bs)
+			changed2, _ := UnderscoreToSpace(bs)
 			assert.Equal(t, string(bs), v.out, v.msg)
 			assert.Equal(t, changed2, v.changed)
 		}
 	})
 
 	t.Run("does not remove spaces", func(t *testing.T) {
 		name := "    Asplenium       × inexpectatum(E. L. Braun ex Friesner      )Morton"
-		res := ppr.Preprocess([]byte(name))
+		res := Preprocess([]byte(name))
 		assert.Equal(t, string(res.Body), name)
 	})
 }
diff --git a/ent/parsed/parsed.go b/ent/parsed/parsed.go
@@ -23,7 +23,7 @@ type Parsed struct {
 	// The ParseQuality is equal to the quality of the most
 	// severe warning (see qualityWarnings). If no problems
 	// are encountered, and the parsing succeeded, the parseQuality
-	// is set to 1. If parsing failed, the parseQuality is 0.
+	// is set to 1. If parsing failed, the parseQuality is 0.
 	ParseQuality int `json:"quality"`
 	// QualityWarnings contains encountered parsing problems.
 	QualityWarnings []QualityWarning `json:"qualityWarnings,omitempty"`
@@ -78,12 +78,14 @@ type Parsed struct {
 	// - notho- hybrid
 	// - hybrid formula
 	Hybrid *Annotation `json:"hybrid,omitempty"`
+
 	// GraftChimera is not nil if a name is detected as one of the graft chimeras
 	//
 	// - a non-categorized graft chimera
 	// - named graft chimera
 	// - graft chimera formula
 	GraftChimera *Annotation `json:"graftchimera,omitempty"`
+
 	// Surrogate is a wide category of names that do not follow
 	// nomenclatural rules
 

diff --git a/ent/parsed/restore_ambiguous.go b/ent/parsed/restore_ambiguous.go
@@ -0,0 +1,48 @@
+package parsed
+
+import (
+	"strings"
+
+	"github.com/gnames/gnparser/ent/stemmer"
+)
+
+// RestoreAmbiguous is used for cases where specific or infra-specific
+// epithets had to be changed to be parsed successfully. Such a situation
+// arises when an epithet is the same as some word that is also an
+// annotation, a prefix/suffix of an author name, etc.
+func (p *Parsed) RestoreAmbiguous(epithet, subst string) {
+	stem := stemmer.Stem(epithet).Stem
+	stemSubst := stemmer.Stem(subst).Stem
+	p.Normalized = restoreString(p.Normalized, epithet, subst)
+	p.Canonical.Full = restoreString(p.Canonical.Full, epithet, subst)
+	p.Canonical.Simple = restoreString(p.Canonical.Simple, epithet, subst)
+	p.Canonical.Stemmed = restoreString(p.Canonical.Stemmed, stem, stemSubst)
+
+	for i := range p.Words {
+		p.Words[i].Verbatim = restoreWord(p.Words[i].Verbatim, epithet, subst)
+		p.Words[i].Normalized = restoreWord(p.Words[i].Normalized, epithet, subst)
+	}
+
+	if sp, ok := p.Details.(DetailsSpecies); ok {
+		sp.Species.Species = restoreWord(sp.Species.Species, epithet, subst)
+		p.Details = sp
+	}
+}
+
+func restoreString(s, epithet, subst string) string {
+	words := strings.Split(s, " ")
+	for i := range words {
+		if strings.HasPrefix(words[i], subst) {
+			words[i] = epithet + words[i][len(epithet):]
+			return strings.Join(words, " ")
+		}
+	}
+	return s
+}
+
+func restoreWord(w, epithet, subst string) string {
+	if strings.HasPrefix(w, subst) {
+		return epithet + w[len(epithet):]
+	}
+	return w
+}