From 3cc897102fb6059698f9a619d619556f30b4ba70 Mon Sep 17 00:00:00 2001
From: jeannefukumaru
Date: Mon, 1 Apr 2019 00:00:08 +0800
Subject: [PATCH 1/8] added tag_map for indonesian

---
Reviewer note (ignored by git-am): single-letter tags are spelled with two
ASCII hyphens ("Z--"), not with an en dash, so every key is three ASCII
characters like "CC-", "CO-" and "CD-". Patch 6/8 uses the same spelling.
The blob ids on the "index" lines were not recomputed for this change.

 spacy/lang/id/tag_map.py | 31 +++++++++++++++++++++++++++++++
 1 file changed, 31 insertions(+)
 create mode 100644 spacy/lang/id/tag_map.py

diff --git a/spacy/lang/id/tag_map.py b/spacy/lang/id/tag_map.py
new file mode 100644
index 00000000000..93d6cadcc1e
--- /dev/null
+++ b/spacy/lang/id/tag_map.py
@@ -0,0 +1,31 @@
+# coding: utf8
+
+'''
+POS explanations for indonesian available from https://www.aclweb.org/anthology/Y12-1014
+'''
+
+TAG_MAP = {
+    "NSD" : {POS: NOUN},
+    "Z--" : {POS: PUNCT},
+    "VSA" : {POS: VERB},
+    "CC-" : {POS: NUM},
+    "R--" : {POS: ADP},
+    "D--" : {POS: ADV},
+    "ASP": {POS: ADJ},
+    "S--" : {POS: SCONJ},
+    "VSP" : {POS: VERB},
+    "H--" : {POS: CCONJ},
+    "F--" : {POS: X},
+    "B--" : {POS: DET},
+    "CO-" : {POS: NUM},
+    "G--" : {POS: ADV},
+    "PS3" : {POS: PRON},
+    "W--" : {POS: ADV},
+    "O--" : {POS: AUX},
+    "PP1" : {POS: PRON},
+    "ASS" : {POS: ADJ},
+    "PS1" : {POS: PRON},
+    "APP" : {POS: ADJ},
+    "CD-" : {POS: NUM},
+    "VPA" : {POS: VERB},
+    "VPP" : {POS: VERB}}

From 745cf0c9141627984e7672e81761be1f5ee990b0 Mon Sep 17 00:00:00 2001
From: jeannefukumaru
Date: Mon, 1 Apr 2019 07:04:50 +0800
Subject: [PATCH 2/8] changed tag map from .py to .txt to see if tests pass

---
 spacy/lang/id/{tag_map.py => tag_map.txt} | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename spacy/lang/id/{tag_map.py => tag_map.txt} (100%)

diff --git a/spacy/lang/id/tag_map.py b/spacy/lang/id/tag_map.txt
similarity index 100%
rename from spacy/lang/id/tag_map.py
rename to spacy/lang/id/tag_map.txt

From a741bed7a7f4993ff9544609c2147a1b1a23b9dd Mon Sep 17 00:00:00 2001
From: jeannefukumaru
Date: Mon, 1 Apr 2019 16:21:06 +0800
Subject: [PATCH 3/8] added symbols import

---
 spacy/lang/id/{tag_map.txt => tag_map.py} | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)
 rename spacy/lang/id/{tag_map.txt => tag_map.py} (69%)

diff --git a/spacy/lang/id/tag_map.txt b/spacy/lang/id/tag_map.py
similarity index 69%
rename from spacy/lang/id/tag_map.txt
rename to spacy/lang/id/tag_map.py
index 93d6cadcc1e..a729dfea86b 100644
--- a/spacy/lang/id/tag_map.txt
+++ b/spacy/lang/id/tag_map.py
@@ -1,8 +1,12 @@
 # coding: utf8
+from __future__ import unicode_literals
+
+from ...symbols import POS, PUNCT, SYM, ADJ, CCONJ, NUM, DET, ADV, ADP, X, VERB
+from ...symbols import NOUN, PROPN, PART, INTJ, SPACE, PRON, AUX
+
+
+# POS explanations for indonesian available from https://www.aclweb.org/anthology/Y12-1014
 
-'''
-POS explanations for indonesian available from https://www.aclweb.org/anthology/Y12-1014
-'''
 
 TAG_MAP = {
     "NSD" : {POS: NOUN},

From 082a0a223256675157b747901d1a577bdf22cd72 Mon Sep 17 00:00:00 2001
From: jeannefukumaru
Date: Mon, 1 Apr 2019 16:37:11 +0800
Subject: [PATCH 4/8] added utf8 encoding flag

---
 spacy/lang/id/tag_map.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/spacy/lang/id/tag_map.py b/spacy/lang/id/tag_map.py
index a729dfea86b..0d81b809d4a 100644
--- a/spacy/lang/id/tag_map.py
+++ b/spacy/lang/id/tag_map.py
@@ -1,4 +1,5 @@
 # coding: utf8
+
 from __future__ import unicode_literals
 
 from ...symbols import POS, PUNCT, SYM, ADJ, CCONJ, NUM, DET, ADV, ADP, X, VERB

From 6567f2784937413bc4235fc65f43695fee323fdb Mon Sep 17 00:00:00 2001
From: jeannefukumaru
Date: Mon, 1 Apr 2019 17:02:53 +0800
Subject: [PATCH 5/8] added missing SCONJ symbol

---
 spacy/lang/id/tag_map.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/spacy/lang/id/tag_map.py b/spacy/lang/id/tag_map.py
index 0d81b809d4a..e759550f68d 100644
--- a/spacy/lang/id/tag_map.py
+++ b/spacy/lang/id/tag_map.py
@@ -3,7 +3,7 @@
 from __future__ import unicode_literals
 
 from ...symbols import POS, PUNCT, SYM, ADJ, CCONJ, NUM, DET, ADV, ADP, X, VERB
-from ...symbols import NOUN, PROPN, PART, INTJ, SPACE, PRON, AUX
+from ...symbols import NOUN, PROPN, PART, INTJ, SPACE, PRON, AUX, SCONJ
 
 
 # POS explanations for indonesian available from https://www.aclweb.org/anthology/Y12-1014
@@ -33,4 +33,5 @@
     "APP" : {POS: ADJ},
     "CD-" : {POS: NUM},
     "VPA" : {POS: VERB},
-    "VPP" : {POS: VERB}}
+    "VPP" : {POS: VERB},
+}

From 8d6b5446328f4785d75ec99dedd2a0d238bfaeb5 Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Mon, 1 Apr 2019 11:45:43 +0200
Subject: [PATCH 6/8] Auto-format

---
 spacy/lang/id/tag_map.py | 55 +++++++++++++++++++---------------------
 1 file changed, 26 insertions(+), 29 deletions(-)

diff --git a/spacy/lang/id/tag_map.py b/spacy/lang/id/tag_map.py
index e759550f68d..f6996438461 100644
--- a/spacy/lang/id/tag_map.py
+++ b/spacy/lang/id/tag_map.py
@@ -1,37 +1,34 @@
 # coding: utf8
-
 from __future__ import unicode_literals
 
-from ...symbols import POS, PUNCT, SYM, ADJ, CCONJ, NUM, DET, ADV, ADP, X, VERB
-from ...symbols import NOUN, PROPN, PART, INTJ, SPACE, PRON, AUX, SCONJ
+from ...symbols import POS, PUNCT, SYM, ADJ, CCONJ, NUM, DET, ADV, ADP, X, VERB
+from ...symbols import NOUN, PROPN, PART, INTJ, SPACE, PRON, AUX, SCONJ
 
 
 # POS explanations for indonesian available from https://www.aclweb.org/anthology/Y12-1014
-
-
 TAG_MAP = {
-    "NSD" : {POS: NOUN},
-    "Z--" : {POS: PUNCT},
-    "VSA" : {POS: VERB},
-    "CC-" : {POS: NUM},
-    "R--" : {POS: ADP},
-    "D--" : {POS: ADV},
-    "ASP": {POS: ADJ},
-    "S--" : {POS: SCONJ},
-    "VSP" : {POS: VERB},
-    "H--" : {POS: CCONJ},
-    "F--" : {POS: X},
-    "B--" : {POS: DET},
-    "CO-" : {POS: NUM},
-    "G--" : {POS: ADV},
-    "PS3" : {POS: PRON},
-    "W--" : {POS: ADV},
-    "O--" : {POS: AUX},
-    "PP1" : {POS: PRON},
-    "ASS" : {POS: ADJ},
-    "PS1" : {POS: PRON},
-    "APP" : {POS: ADJ},
-    "CD-" : {POS: NUM},
-    "VPA" : {POS: VERB},
-    "VPP" : {POS: VERB},
+    "NSD": {POS: NOUN},
+    "Z--": {POS: PUNCT},
+    "VSA": {POS: VERB},
+    "CC-": {POS: NUM},
+    "R--": {POS: ADP},
+    "D--": {POS: ADV},
+    "ASP": {POS: ADJ},
+    "S--": {POS: SCONJ},
+    "VSP": {POS: VERB},
+    "H--": {POS: CCONJ},
+    "F--": {POS: X},
+    "B--": {POS: DET},
+    "CO-": {POS: NUM},
+    "G--": {POS: ADV},
+    "PS3": {POS: PRON},
+    "W--": {POS: ADV},
+    "O--": {POS: AUX},
+    "PP1": {POS: PRON},
+    "ASS": {POS: ADJ},
+    "PS1": {POS: PRON},
+    "APP": {POS: ADJ},
+    "CD-": {POS: NUM},
+    "VPA": {POS: VERB},
+    "VPP": {POS: VERB},
 }

From 5d9212c44cd659003c9de2c642f049c6da7f4ca8 Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Mon, 1 Apr 2019 11:46:25 +0200
Subject: [PATCH 7/8] Remove unused imports

---
 spacy/lang/id/tag_map.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/spacy/lang/id/tag_map.py b/spacy/lang/id/tag_map.py
index f6996438461..71d105bf470 100644
--- a/spacy/lang/id/tag_map.py
+++ b/spacy/lang/id/tag_map.py
@@ -1,8 +1,8 @@
 # coding: utf8
 from __future__ import unicode_literals
 
-from ...symbols import POS, PUNCT, SYM, ADJ, CCONJ, NUM, DET, ADV, ADP, X, VERB
-from ...symbols import NOUN, PROPN, PART, INTJ, SPACE, PRON, AUX, SCONJ
+from ...symbols import POS, PUNCT, ADJ, CCONJ, NUM, DET, ADV, ADP, X, VERB
+from ...symbols import NOUN, PRON, AUX, SCONJ
 
 
 # POS explanations for indonesian available from https://www.aclweb.org/anthology/Y12-1014

From 0a0b1087b0067259b774b91809a166d74c8c695c Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Mon, 1 Apr 2019 11:46:51 +0200
Subject: [PATCH 8/8] Make tag map available in Indonesian defaults

---
 spacy/lang/id/__init__.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/spacy/lang/id/__init__.py b/spacy/lang/id/__init__.py
index d3c47d4b4d4..08e2d8ec2b7 100644
--- a/spacy/lang/id/__init__.py
+++ b/spacy/lang/id/__init__.py
@@ -8,6 +8,7 @@
 from .lemmatizer import LOOKUP
 from .lex_attrs import LEX_ATTRS
 from .syntax_iterators import SYNTAX_ITERATORS
+from .tag_map import TAG_MAP
 
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ..norm_exceptions import BASE_NORMS
@@ -30,6 +31,7 @@ class IndonesianDefaults(Language.Defaults):
     infixes = TOKENIZER_INFIXES
     syntax_iterators = SYNTAX_ITERATORS
     lemma_lookup = LOOKUP
+    tag_map = TAG_MAP
 
 
 class Indonesian(Language):