Skip to content

Commit

Permalink
genera starting with 2-letter segment (close #204) by @tobymarsden
Browse files Browse the repository at this point in the history
  • Loading branch information
dimus committed Nov 14, 2021
1 parent 32af558 commit dc67aaf
Show file tree
Hide file tree
Showing 5 changed files with 11,031 additions and 3,047 deletions.
334 changes: 334 additions & 0 deletions 2
Original file line number Diff line number Diff line change
@@ -0,0 +1,334 @@
package parser

type Engine Peg {
baseEngine
}

SciName <- _? Name Tail END

Tail <- ((_ / ';' / ',') .*)?

Name <- NamedGenusGraftChimera / GraftChimeraFormula / NamedHybrid / HybridFormula / CandidatusName / SingleName

HybridFormula <- SingleName (_ (HybridFormulaPart / HybridFormulaFull))+

HybridFormulaFull <- HybridChar (_ SingleName)?

HybridFormulaPart <- HybridChar _ SpeciesEpithet (_ InfraspGroup)?

NamedHybrid <- NamedGenusHybrid / NamedSpeciesHybrid

NamedSpeciesHybrid <- GenusWord (_ Subgenus)? (_ Comparison)? _ HybridChar _?
SpeciesEpithet (_ InfraspGroup)?

NamedGenusHybrid <- HybridChar _? SingleName

GraftChimeraFormula <- SingleName (_ (GraftChimeraFormulaPart / GraftChimeraFormulaFull))+

GraftChimeraFormulaFull <- GraftChimeraChar (_ SingleName)?

GraftChimeraFormulaPart <- GraftChimeraChar _ SpeciesEpithet (_ InfraspGroup)?

NamedGenusGraftChimera <- GraftChimeraChar _? SingleName

CandidatusName <- Candidatus _ SingleName

Candidatus <- 'Candidatus'

SingleName <- NameComp / NameApprox / NameSpecies / NameUninomial

NameUninomial <- (UninomialCombo / Uninomial) (_ CultivarWordGroup)?

NameApprox <- GenusWord (_ SpeciesEpithet)? _ Approximation ApproxNameIgnored

NameComp <- GenusWord _ Comparison (_ SpeciesEpithet)?

NameSpecies <- GenusWord (_? ( Subgenus / SubgenusOrSuperspecies))?
_ SpeciesEpithet (_ InfraspGroup)? (_ CultivarWordGroup)?

GenusWord <- (AbbrGenus / UninomialWord) !(_ AuthorWord)

InfraspGroup <- InfraspEpithet (_ InfraspEpithet)? (_ InfraspEpithet)?

InfraspEpithet <- (Rank _?)? !(AuthorEx) Word (_? Authorship)?

CultivarWordGroup <- ((RankCultivar _)? CultivarApostrophe CultivarRecursive CultivarApostrophe) / (RankCultivar _ Cultivar)

Cultivar <- NotHybridChar+

RankCultivar <- 'cv' '.'?

NotHybridChar <- (!(_ HybridChar) .)

# Recursive, one character at a time
CultivarRecursive <- NotHybridChar CultivarRecursive / &CultivarApostrophe

CultivarApostrophe <- '\'' / '‘' / '’' / '"' / '“' / '”'

SpeciesEpithet <- !(AuthorEx) Word (_? Authorship)?

Comparison <- 'cf' '.'? &(SpaceCharEOI)

Rank <- (RankForma / RankVar / RankSsp / RankOther / RankOtherUncommon /
RankAgamo / RankNotho) (_? LowerGreek ('.' / &(SpaceCharEOI)))?

RankNotho <- (('notho' ('var' / 'fo' / 'f' / 'subsp' / 'ssp' / 'sp' /
'morth' / 'supsp' / 'su' )) / 'nvar') ('.' / &(SpaceCharEOI))

RankOtherUncommon <- ('*' / 'natio' / 'nat.' / 'nat' / 'f.sp' /
'α' / 'ββ' / 'β' / 'γ' / 'δ' / 'ε' / 'φ' / 'θ' / 'μ' / 'a.' / 'b.' /
'c.' / 'd.' / 'e.' / 'g.' / 'k.' /
'mut.') &(SpaceCharEOI)

RankOther <- ('morph' / 'convar' / 'pseudovar' / 'sect' /
'ser' / 'subvar' / 'subf' / 'race' / 'pv' / 'pathovar' /
('ab.' (_? 'n.')?) / 'st') ('.' / &(SpaceCharEOI))

RankVar <- ('variety' / '[var.]' / 'var') ('.' / &(SpaceCharEOI))

RankForma <- ('forma' / 'fma' / 'fm' / 'form' / 'fo' / 'f') ('.' / &(SpaceCharEOI))

RankSsp <- ('ssp' / 'subspec' / 'subsp') ('.' / &(SpaceCharEOI))

RankAgamo <- ('agamosp' / 'agamossp' / 'agamovar') ('.' / &(SpaceCharEOI))

SubgenusOrSuperspecies <- '(' _? NameLowerChar+ _? ')'

Subgenus <- Subgenus2 / Subgenus1

Subgenus2 <- '(' _? AbbrSubgenus _? ')' !(_? NameUpperChar)

Subgenus1 <- '(' _? UninomialWord _? ')'

UninomialCombo <- UninomialCombo1 / UninomialCombo2

UninomialCombo1 <- UninomialWord _? Subgenus (_? Authorship)?

UninomialCombo2 <- Uninomial _ RankUninomial _ Uninomial

RankUninomial <- RankUninomialPlain / RankUninomialNotho

RankUninomialPlain <- ('sect' / 'subsect' / 'trib' / 'subtrib' / 'subser' /
'ser' / 'subgen' / 'subg' / 'fam' / 'subfam' / 'div' /
'supertrib') ('.' / &(SpaceCharEOI))

RankUninomialNotho <- ('notho' _? ('sect' / 'gen' / 'ser' / 'subgeen' /
'subgen' / 'subg' / 'subsect' / 'subtrib')) ('.' / &(SpaceCharEOI))

Uninomial <- UninomialWord (_ Authorship
!(_ LowerCharExtended LowerCharExtended LowerCharExtended))?

UninomialWord <- CapWord / TwoLetterGenus

AbbrSubgenus <- UpperChar LowerChar* '.'

AbbrGenus <- UpperChar LowerChar? '.'

CapWord <- CapWordWithDash / CapWord1

CapWord1 <- NameUpperChar NameLowerChar NameLowerChar+ '?'?

CapWordWithDash <- (CapWord1 / TwoLetterGenusDashedSegment) Dash WordAfterDash (Dash WordAfterDash)?

TwoLetterGenusDashedSegment <- ('De' / 'Eu' / 'Le' / 'Ne')

WordAfterDash <- (UpperAfterDash / LowerAfterDash)

UpperAfterDash <- CapWord1

LowerAfterDash <- Word1

TwoLetterGenus <- ('Ca' / 'Do' / 'Ea' / 'Ge' / 'Ia' / 'Io' / 'Ix' / 'Lo' /
'Oa' / 'Oo' / 'Nu' / 'Ra' / 'Ty' / 'Ua' / 'Aa' / 'Ja' / 'Zu' / 'La' / 'Qu' /
'As' / 'Ba')

Word <- !(('ex' / 'et' / 'and' / 'apud' / 'pro' / 'cv' / 'cultivar' /
AuthorPrefix / RankUninomial / Approximation / Word4) SpaceCharEOI)
(WordApostr / WordStartsWithDigit / MultiDashedWord / Word2 /
Word1) &(SpaceCharEOI / '(')

Word1 <- ((DotPrefix / LowerASCII) Dash)? NameLowerChar NameLowerChar+

WordStartsWithDigit <- [123456789] Nums? ('.' / Dash)? NameLowerChar NameLowerChar
NameLowerChar NameLowerChar+

Word2 <- NameLowerChar+ Dash? (WordApostr / NameLowerChar+)

WordApostr <- NameLowerChar NameLowerChar* Apostrophe Word1

Word4 <- NameLowerChar+ '.' NameLowerChar

DotPrefix <- 'st.'

MultiDashedWord <- NameLowerChar+ Dash NameLowerChar+ Dash NameLowerChar+ (Dash NameLowerChar+)?

HybridChar <- '×' / [xX] &_ / [xX] &UninomialWord / [xX] &END

GraftChimeraChar <- '+'

ApproxNameIgnored <- .*

Approximation <- ('sp.' _? 'nr.' / 'sp.' _? 'aff.' / 'monst.' /
'?' / (('spp' / 'nr' / 'sp' / 'aff' / 'species') (&(SpaceCharEOI) / '.')))

Authorship <- (AuthorshipCombo / OriginalAuthorship) &(SpaceCharEOI / ';' / ',')

AuthorshipCombo <- OriginalAuthorshipComb (_? CombinationAuthorship)?

OriginalAuthorship <- AuthorsGroup

OriginalAuthorshipComb <- BasionymAuthorshipYearMisformed /
BasionymAuthorship /
BasionymAuthorshipMissingParens

CombinationAuthorship <- AuthorsGroup

BasionymAuthorshipMissingParens <- MissingParensStart / MissingParensEnd

MissingParensStart <- '(' _? AuthorsGroup

MissingParensEnd <- AuthorsGroup _? ')'

BasionymAuthorshipYearMisformed <- '(' _? AuthorsGroup _? ')' (_? ',')? _? Year

BasionymAuthorship <- BasionymAuthorship1 / BasionymAuthorship2Parens

BasionymAuthorship1 <- '(' _? AuthorsGroup _? ')'

BasionymAuthorship2Parens <- '(' _? '(' _? AuthorsGroup _? ')' _? ')'

AuthorsGroup <- AuthorsTeam (_ (AuthorEmend / AuthorEx) AuthorsTeam)?

AuthorsTeam <- Author (AuthorSep Author)* (_? ','? _? Year)?

AuthorSep <- AuthorSep1 / AuthorSep2

AuthorSep1 <- _? (',' _)? ( '&' / AuthorSepSpanish / 'et' / 'and' / 'apud') _?

AuthorSep2 <- _? ',' _?

AuthorSepSpanish <- _? 'y' _?

AuthorEx <- ('ex' '.'? / 'ms' _ 'in' / 'in') _

AuthorEmend <- 'emend' '.'? _

Author <- (Author0 / Author1 / Author2 / UnknownAuthor) (_ AuthorEtAl)?

Author0 <- Author2 FiliusFNoSpace

Author1 <- Author2 _? (Filius / AuthorSuffix)

Author2 <- AuthorWord (_? AuthorWord)*

UnknownAuthor <- '?' / (('auct' / 'anon') (&(SpaceCharEOI) / '.'))

AuthorWord <- !( HybridChar / "bold:") (AuthorDashInitials / AuthorWord1 /
AuthorWord2 / AuthorWord3 / AuthorPrefix)

AuthorEtAl <- 'arg.' / 'et al.{?}' / ('et' / '&') ' al' '.'?

AuthorWord1 <- 'duPont'

AuthorWord2 <- AuthorWord3 Dash (AuthorWordSoft / AuthorInitial)

AuthorWord3 <- AuthorPrefixGlued? (AllCapsAuthorWord / CapAuthorWord) '.'?

AuthorDashInitials <- AuthorUpperChar '.'? Dash AuthorUpperChar '.'?

AuthorInitial <- AuthorUpperChar '.'?

AuthorWordSoft <- ((AuthorUpperChar (AuthorUpperChar+ / AuthorLowerChar+)) /
AuthorLowerChar+) '.'?

CapAuthorWord <- AuthorUpperChar AuthorLowerChar*

AllCapsAuthorWord <- AuthorUpperChar AuthorUpperChar+

Filius <- FiliusF / 'fil.' / 'filius'

FiliusF <- 'f.' !(_ Word)

FiliusFNoSpace <- 'f.'

AuthorSuffix <- 'bis' / 'ter'

AuthorPrefixGlued <- ('d' / 'O' / 'L' / 'Mc' / 'M') Apostrophe

AuthorPrefix <- AuthorPrefix1 / AuthorPrefix2

AuthorPrefix2 <- ('v.' (_? 'd.')?) / Apostrophe 't'

AuthorPrefix1 <- ('ab' / 'af' / 'bis' / 'da' / 'der' / 'des' / 'den' /
'della' / 'dela' / 'delle' / 'del' / 'de los' / 'de' / 'di' / 'dos' /
'du' / 'do' / 'el' / 'la' / 'le' / 'ten' / 'ter' / 'van' / 'ver' /
'd' Apostrophe / 'in' Apostrophe 't' / 'zur' / 'zu' /
('von' (_ ('d.'/ 'dem'))?) / ('v' (_'d')?)) &_

AuthorUpperChar <- UpperASCII / MiscodedChar /
[ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝĆČĎİĶĹ弾ŁłŅŌŐŒŘŚŜŞŠŸŹŻŽƒǾȘȚ]

AuthorLowerChar <- LowerASCII / MiscodedChar / Apostrophe /
[àáâãäåæçèéêëìíîïðñòóóôõöøùúûüýÿāăąćĉčďđēĕėęěğīĭİıĺľłńņňŏőœŕřśşšţťũūŭůűźżžſǎǔǧșțȳß]

Year <- YearRange / YearApprox / YearWithParens / YearWithPage / YearWithDot /
YearWithChar / YearNum

YearRange <- YearNum (Dash / Slash) (Nums+ [abcdefghijklmnopqrstuvwxyz?]*)

YearWithDot <- YearNum '.'

YearApprox <- '[' _? YearNum _? ']'

YearWithPage <- (YearWithChar / YearNum) _? ":" _? Nums+

YearWithParens <- '(' (YearWithChar / YearNum) ')'

YearWithChar <- YearNum LowerASCII

YearNum <- [12] [0789] Nums (Nums / '?') '?'*

NameUpperChar <- UpperChar / UpperCharExtended

UpperCharExtended <- [ÆŒÖ]

UpperChar <- UpperASCII

NameLowerChar <- LowerChar / LowerCharExtended / MiscodedChar

MiscodedChar <- '�'

LowerCharExtended <- [æœàâåãäáçčéèëíìïňññóòôøõöúûùüŕřŗſššşßž]

LowerChar <- LowerASCII

SpaceCharEOI <- _ / !.

Nums <- [0-9]

LowerGreek <- [α-ω]

LowerASCII <- [a-z]

UpperASCII <- [A-Z]

Apostrophe <- ApostrOther / ApostrASCII

ApostrASCII <- '\''

ApostrOther <- '‘' / '’' / '`' / '´'


Dash <- '-'

Slash <- '/'

_ <- MultipleSpace / SingleSpace

MultipleSpace <- SingleSpace SingleSpace+

SingleSpace <- ' ' / OtherSpace

OtherSpace <- [  \t\r\n\f\v]

END <- !.
4 changes: 3 additions & 1 deletion ent/parser/grammar.peg
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,9 @@ CapWord <- CapWordWithDash / CapWord1

CapWord1 <- NameUpperChar NameLowerChar NameLowerChar+ '?'?

CapWordWithDash <- CapWord1 Dash WordAfterDash (Dash WordAfterDash)?
CapWordWithDash <- (CapWord1 / TwoLetterGenusDashedSegment) Dash WordAfterDash (Dash WordAfterDash)?

TwoLetterGenusDashedSegment <- ('De' / 'Eu' / 'Le' / 'Ne')

WordAfterDash <- (UpperAfterDash / LowerAfterDash)

Expand Down
Loading

0 comments on commit dc67aaf

Please sign in to comment.