From 36385a205bc2646a8713903cf86160b46030fb3e Mon Sep 17 00:00:00 2001 From: estr4ng7d Date: Tue, 21 May 2019 21:35:11 -0700 Subject: [PATCH 1/6] Adding Marathi language details and folder to it --- spacy/lang/mh/__init__.py | 23 ++++ spacy/lang/mh/stop_words.py | 205 ++++++++++++++++++++++++++++++++++++ 2 files changed, 228 insertions(+) create mode 100644 spacy/lang/mh/__init__.py create mode 100644 spacy/lang/mh/stop_words.py diff --git a/spacy/lang/mh/__init__.py b/spacy/lang/mh/__init__.py new file mode 100644 index 00000000000..2bfb0d90210 --- /dev/null +++ b/spacy/lang/mh/__init__.py @@ -0,0 +1,23 @@ +# coding: utf8 +from __future__ import unicode_literals + +from .stop_words import STOP_WORDS +#from .lex_attrs import LEX_ATTRS + +from ...language import Language +from ...attrs import LANG + + +class MarathiDefaults(Language.Defaults): + lex_attr_getters = dict(Language.Defaults.lex_attr_getters) + #lex_attr_getters.update(LEX_ATTRS) + lex_attr_getters[LANG] = lambda text: "mh" + stop_words = STOP_WORDS + + +class Marathi(Language): + lang = "mh" + Defaults = MarathiDefaults + + +__all__ = ["Marathi"] \ No newline at end of file diff --git a/spacy/lang/mh/stop_words.py b/spacy/lang/mh/stop_words.py new file mode 100644 index 00000000000..bbaf057dbd4 --- /dev/null +++ b/spacy/lang/mh/stop_words.py @@ -0,0 +1,205 @@ +# coding: utf8 +from __future__ import unicode_literals + + +# Source: https://github.com/stopwords-iso/stopwords-mr/blob/master/stopwords-mr.txt, https://github.com/6/stopwords-json/blob/master/dist/mr.json +STOP_WORDS = set( + """ +न +अतरी +तो +हें +तें +कां +आणि +जें +जे +मग +ते +मी +जो +परी +गा +हे +ऐसें +आतां +तैसें +परि +नाहीं +तेथ +हा +तया +असे +म्हणे +काय +म्हणौनि +कीं +जैसें +तंव +तूं +होय +जैसा +आहे +पैं +तैसा +जरी +म्हणोनि +एक +ऐसा +जी +ना +मज +एथ +या +जेथ +जया +तुज +तेणें +तैं +पां +असो +करी +ऐसी +येणें +जाहला +तेंचि +आघवें +होती +जैं +कांहीं +होऊनि +एकें +मातें +ठायीं +ये +अर्जुना +सकळ +केलें +जेणें +जाण +जैसी +होये +जेवीं +एऱ्हवीं 
+मीचि +किरीटी +दिसे +देवा +हो +तरि +कीजे +तैसे +आपण +तिये +कर्म +नोहे +इये +पडे +पार्था +माझें +तैसी +लागे +नाना +जंव +कीर +अधिक +अनेक +अशी +असलयाचे +असलेल्या +असा +असून +असे +आज +आणि +आता +आपल्या +आला +आली +आले +आहे +आहेत +एक +एका +कमी +करणयात +करून +का +काम +काय +काही +किवा +की +केला +केली +केले +कोटी +गेल्या +घेऊन +जात +झाला +झाली +झाले +झालेल्या +टा +डॉ +तर +तरी +तसेच +ता +ती +तीन +ते +तो +त्या +त्याचा +त्याची +त्याच्या +त्याना +त्यानी +त्यामुळे +त्री +दिली +दोन +न +नाही +निर्ण्य +पण +पम +परयतन +पाटील +म +मात्र +माहिती +मी +मुबी +म्हणजे +म्हणाले +म्हणून +या +याचा +याची +याच्या +याना +यानी +येणार +येत +येथील +येथे +लाख +व +व्यकत +सर्व +सागित्ले +सुरू +हजार +हा +ही +हे +होणार +होत +होता +होती +होते +""".split() +) \ No newline at end of file From 282668b8ef5d009a9d1fc0d3f21a22a7441f8832 Mon Sep 17 00:00:00 2001 From: estr4ng7d Date: Tue, 21 May 2019 22:23:08 -0700 Subject: [PATCH 2/6] Adding few changes and running tests --- .github/contributors/estr4ng7d.md | 106 ++++++++++++++++++++++++++++++ spacy/lang/mh/__init__.py | 7 +- spacy/lang/mh/stop_words.py | 2 +- 3 files changed, 109 insertions(+), 6 deletions(-) create mode 100644 .github/contributors/estr4ng7d.md diff --git a/.github/contributors/estr4ng7d.md b/.github/contributors/estr4ng7d.md new file mode 100644 index 00000000000..35c095c471c --- /dev/null +++ b/.github/contributors/estr4ng7d.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. 
+ +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. 
This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made) will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. 
You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statements below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. 
+ +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Amey Baviskar | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 21-May-2019 | +| GitHub username | estr4ng7d | +| Website (optional) | | diff --git a/spacy/lang/mh/__init__.py b/spacy/lang/mh/__init__.py index 2bfb0d90210..4c4f8fe77c2 100644 --- a/spacy/lang/mh/__init__.py +++ b/spacy/lang/mh/__init__.py @@ -1,16 +1,13 @@ -# coding: utf8 +#coding: utf8 from __future__ import unicode_literals from .stop_words import STOP_WORDS -#from .lex_attrs import LEX_ATTRS - from ...language import Language from ...attrs import LANG class MarathiDefaults(Language.Defaults): lex_attr_getters = dict(Language.Defaults.lex_attr_getters) - #lex_attr_getters.update(LEX_ATTRS) lex_attr_getters[LANG] = lambda text: "mh" stop_words = STOP_WORDS @@ -20,4 +17,4 @@ class Marathi(Language): Defaults = MarathiDefaults -__all__ = ["Marathi"] \ No newline at end of file +__all__ = ["Marathi"] diff --git a/spacy/lang/mh/stop_words.py b/spacy/lang/mh/stop_words.py index bbaf057dbd4..2f087b7f734 100644 --- a/spacy/lang/mh/stop_words.py +++ b/spacy/lang/mh/stop_words.py @@ -202,4 +202,4 @@ होती होते """.split() -) \ No newline at end of file +) From efbd190bc8a27381e8c190de7c0cbefa802c1e93 Mon Sep 17 00:00:00 2001 From: estr4ng7d Date: Tue, 21 May 2019 22:28:09 -0700 Subject: [PATCH 3/6] Adding few changes and running tests --- spacy/lang/mh/stop_words.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/spacy/lang/mh/stop_words.py b/spacy/lang/mh/stop_words.py index 2f087b7f734..0b0cd035d51 100644 --- a/spacy/lang/mh/stop_words.py +++ b/spacy/lang/mh/stop_words.py @@ -23,8 +23,6 @@ हे ऐसें आतां -तैसें -परि नाहीं तेथ हा @@ -32,7 +30,6 @@ असे म्हणे काय -म्हणौनि कीं जैसें तंव @@ -65,14 +62,12 @@ तेंचि आघवें होती -जैं कांहीं होऊनि एकें मातें ठायीं ये -अर्जुना सकळ केलें जेणें @@ -95,7 +90,6 @@ नोहे इये पडे -पार्था माझें तैसी लागे @@ -142,7 
+136,6 @@ झाले झालेल्या टा -डॉ तर तरी तसेच @@ -162,8 +155,6 @@ दिली दोन न -नाही -निर्ण्य पण पम परयतन From 50362d69d82a2f30b487c593bf4b378350d356c8 Mon Sep 17 00:00:00 2001 From: estr4ng7d Date: Wed, 22 May 2019 18:29:18 -0700 Subject: [PATCH 4/6] Update __init__.py mh -> mr --- spacy/lang/mh/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/lang/mh/__init__.py b/spacy/lang/mh/__init__.py index 4c4f8fe77c2..53854093535 100644 --- a/spacy/lang/mh/__init__.py +++ b/spacy/lang/mh/__init__.py @@ -8,12 +8,12 @@ class MarathiDefaults(Language.Defaults): lex_attr_getters = dict(Language.Defaults.lex_attr_getters) - lex_attr_getters[LANG] = lambda text: "mh" + lex_attr_getters[LANG] = lambda text: "mr" stop_words = STOP_WORDS class Marathi(Language): - lang = "mh" + lang = "mr" Defaults = MarathiDefaults From e625d5a772e9eb293ff92790f4e05836732c06a1 Mon Sep 17 00:00:00 2001 From: estr4ng7d Date: Wed, 22 May 2019 18:32:52 -0700 Subject: [PATCH 5/6] Rename spacy/lang/mh/__init__.py to spacy/lang/mr/__init__.py --- spacy/lang/{mh => mr}/__init__.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename spacy/lang/{mh => mr}/__init__.py (100%) diff --git a/spacy/lang/mh/__init__.py b/spacy/lang/mr/__init__.py similarity index 100% rename from spacy/lang/mh/__init__.py rename to spacy/lang/mr/__init__.py From d3e37624160de2b3072b3e85189ee4490766ff5a Mon Sep 17 00:00:00 2001 From: Amey Baviskar Date: Wed, 22 May 2019 18:46:07 -0700 Subject: [PATCH 6/6] mh -> mr --- spacy/lang/{mh => mr}/stop_words.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename spacy/lang/{mh => mr}/stop_words.py (100%) diff --git a/spacy/lang/mh/stop_words.py b/spacy/lang/mr/stop_words.py similarity index 100% rename from spacy/lang/mh/stop_words.py rename to spacy/lang/mr/stop_words.py