Skip to content

Commit

Permalink
python-kpext: Python package.
Browse files Browse the repository at this point in the history
Signed-off-by: Simon David HERNANDEZ <simondhp@git.totum.one>
  • Loading branch information
Simon David HERNANDEZ committed May 3, 2018
1 parent 72b23b9 commit 11769aa
Show file tree
Hide file tree
Showing 19 changed files with 133 additions and 81 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
src/kpext/kpext_data/corpus/*
*.pycrfsuite
# Byte-compiled / optimized / DLL files
__pycache__/
Expand Down
2 changes: 1 addition & 1 deletion Keyphrase_extraction.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -638,7 +638,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.5rc1"
"version": "3.6.5"
}
},
"nbformat": 4,
Expand Down
65 changes: 0 additions & 65 deletions config/config.py

This file was deleted.

2 changes: 0 additions & 2 deletions corpus/.gitignore

This file was deleted.

2 changes: 1 addition & 1 deletion main.py → keyphrase-extraction-example.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
Execute methods.
"""
import resources.dataset as rd
import kpext.resources.dataset as rd

def main():
"""Method to run package."""
Expand Down
2 changes: 1 addition & 1 deletion minimal-example.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
"metadata": {},
"outputs": [],
"source": [
"import resources.dataset as rd\n",
"import kpext.resources.dataset as rd\n",
"default_corpus = rd.load_corpus()"
]
},
Expand Down
Empty file removed models/.gitkeep
Empty file.
40 changes: 40 additions & 0 deletions setup.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
# -*- coding: utf-8 -*-
""" setup
Package setup for kpext
"""

from setuptools import setup, find_packages

# Read the long description explicitly as UTF-8 so the build does not depend
# on the locale's default encoding.
with open('README.md', encoding='utf-8') as f:
    README = f.read()

# Package metadata. Sources live under src/, so packages are discovered there
# and mapped back through package_dir.
setup(name='python-kpext',
      version='v0.1.0.dev3',
      description='Python package for keyphrase extraction.',
      long_description=README,
      long_description_content_type='text/markdown',
      url='https://github.com/snovd/keyphrase-extraction',
      author='Simon D. Hernandez',
      author_email='py.kpext@totum.one',
      license='MIT',
      classifiers=[
          'Development Status :: 3 - Alpha',
          'Intended Audience :: Developers',
          'Topic :: Software Development :: Build Tools',
          'License :: OSI Approved :: MIT License',
          'Programming Language :: Python :: 3.2',
          'Programming Language :: Python :: 3.3',
          'Programming Language :: Python :: 3.4',
          'Programming Language :: Python :: 3.5',
          'Programming Language :: Python :: 3.6'
      ],
      keywords='keyphrase extraction',
      packages=find_packages('src'),
      package_dir={'': 'src', 'kpext': 'src/kpext'},
      # Ship the files under kpext_data/models/ inside the kpext package.
      package_data={'kpext': ['kpext_data/models/*']},
      python_requires='>=3, <4',
      # The metadata keyword is "platforms" (plural); "platform" is not a
      # recognised option and is dropped with a warning.
      platforms='any',
      install_requires=['nltk', 'python-crfsuite'],
      zip_safe=False)
File renamed without changes.
File renamed without changes.
78 changes: 78 additions & 0 deletions src/kpext/config/config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
"""config/config
Default corpus configs.
"""
import inspect
import os
import sys
from pathlib import Path

from kpext import kpext_data

# Identifiers of the corpora known to this configuration module.
ACLRDTEC = "acl-rd-tec-2.0"
SEMEVAL2017 = "semeval2017-task10"

# Source file of the kpext_data module as reported by inspect; the directory
# two levels above that file is used as the base for packaged data lookups.
_KPEXT_DATA_FILE = Path(inspect.getfile(kpext_data))
KPEXTDATA_PATH = str(_KPEXT_DATA_FILE.parents[1])

# Check for default paths for corpus.
# Candidates are probed in this order: the current working directory, the
# user's home directory, then the directory derived from the kpext_data
# module location (KPEXTDATA_PATH). The first one that exists wins.
DEFAULT_CORPUS_PATH = "kpext_data/corpus/" + SEMEVAL2017 + "/"
# "~" is only shell syntax: neither pathlib nor the OS expands it, so it must
# be resolved explicitly. Without this the home-directory candidate could
# never match, and a literal "~/..." path would be unusable for file access.
_HOME_CORPUS_PATH = os.path.expanduser("~/" + DEFAULT_CORPUS_PATH)
if Path("./" + DEFAULT_CORPUS_PATH).exists():
    CORPUS_PATH = "./" + DEFAULT_CORPUS_PATH
elif Path(_HOME_CORPUS_PATH).exists():
    CORPUS_PATH = _HOME_CORPUS_PATH
elif Path(KPEXTDATA_PATH + "/" + DEFAULT_CORPUS_PATH).exists():
    CORPUS_PATH = KPEXTDATA_PATH + "/" + DEFAULT_CORPUS_PATH
else:
    # No corpus found: warn on stderr instead of failing, and fall back to
    # the bare relative default path.
    print("Warning: SemEval 2017 Task 10 corpus doesn't exists.", file=sys.stderr)
    print(" - Download from here https://scienceie.github.io/resources.html",
          file=sys.stderr)
    print(" - Use one of the following paths.", file=sys.stderr)
    print(" + %s" % (KPEXTDATA_PATH + "/" + DEFAULT_CORPUS_PATH), file=sys.stderr)
    print(" + ./%s" % DEFAULT_CORPUS_PATH, file=sys.stderr)
    print(" + ~/%s" % DEFAULT_CORPUS_PATH, file=sys.stderr)
    print(" - You can use pre-trained models.", file=sys.stderr)
    CORPUS_PATH = DEFAULT_CORPUS_PATH

# Directory of each SemEval 2017 Task 10 split, built on top of CORPUS_PATH.
# A value of None means no directory is configured for that split.
_SEMEVAL2017_DATASET = {
    "train-labeled": CORPUS_PATH + "/train2/",
    "train-unlabeled": None,
    "dev-labeled": CORPUS_PATH + "/dev/",
    "dev-unlabeled": None,
    "test-unlabeled": CORPUS_PATH + "/scienceie2017_test_unlabelled/",
    "test-labeled": CORPUS_PATH + "/semeval_articles_test/",
}

# Settings for the ACL RD-TEC 2.0 corpus: identifier and empty options only.
_ACLRDTEC_CONFIG = {
    "_id": "acl-rd-tec-2.0",
    "options": {},
}

# Settings for the SemEval 2017 Task 10 corpus (brat standoff annotations).
_SEMEVAL2017_CONFIG = {
    "_id": "semeval2017-task10",
    "format": "brat",
    "format-description": "brat standoff format, http://brat.nlplab.org/standoff.html",
    "dataset": _SEMEVAL2017_DATASET,
    "options": {},
}

# Registry of corpus configurations keyed by corpus identifier, plus a
# top-level "options" entry.
CORPUS = {
    ACLRDTEC: _ACLRDTEC_CONFIG,
    SEMEVAL2017: _SEMEVAL2017_CONFIG,
    "options": {},
}

# Convenience aliases; each one refers to the same dict object stored in
# CORPUS, not to a copy.
CORPUS_ACL_RD_TEC_2_0 = CORPUS[ACLRDTEC]
CORPUS_SEMEVAL2017_TASK10 = CORPUS[SEMEVAL2017]
CORPUS_DEFAULT = CORPUS[SEMEVAL2017]

# Check for default paths for models.
# Candidates are probed in this order: the current working directory, the
# user's home directory, then the directory derived from the kpext_data
# module location (KPEXTDATA_PATH). The first one that exists wins.
DEFAULT_MODELS_PATH = "kpext_data/models/"
# "~" is only shell syntax: neither pathlib nor the OS expands it, so it must
# be resolved explicitly. Without this the home-directory candidate could
# never match, and a literal "~/..." path would be unusable for file access.
_HOME_MODELS_PATH = os.path.expanduser("~/" + DEFAULT_MODELS_PATH)
if Path("./" + DEFAULT_MODELS_PATH).exists():
    MODELS_PATH = "./" + DEFAULT_MODELS_PATH
elif Path(_HOME_MODELS_PATH).exists():
    MODELS_PATH = _HOME_MODELS_PATH
elif Path(KPEXTDATA_PATH + "/" + DEFAULT_MODELS_PATH).exists():
    MODELS_PATH = KPEXTDATA_PATH + "/" + DEFAULT_MODELS_PATH
else:
    # No models directory found: warn on stderr instead of failing, and fall
    # back to the bare relative default path.
    print("Warning: Path to save models doesn't exists.", file=sys.stderr)
    print(" - Possible paths are:", file=sys.stderr)
    print(" + %s" % (KPEXTDATA_PATH + "/" + DEFAULT_MODELS_PATH), file=sys.stderr)
    print(" + %s" % ("./" + DEFAULT_MODELS_PATH), file=sys.stderr)
    print(" + %s" % ("~/" + DEFAULT_MODELS_PATH), file=sys.stderr)
    print(" - Default will be %s" % DEFAULT_MODELS_PATH, file=sys.stderr)
    MODELS_PATH = DEFAULT_MODELS_PATH

# Relative output directory name; how it is consumed is not visible in this
# module.
OUTPUT_PATH = "output/"
File renamed without changes.
File renamed without changes.
4 changes: 2 additions & 2 deletions methods/crf.py → src/kpext/methods/crf.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,8 @@
import os
import pycrfsuite

from config.config import MODELS_PATH
from resources import dataset as rd
from kpext.config.config import MODELS_PATH
from kpext.resources import dataset as rd

def crf_preprocess_candidates(candidates):
"""Receive annotated candidates and return features and labels list"""
Expand Down
File renamed without changes.
4 changes: 2 additions & 2 deletions resources/corpus.py → src/kpext/resources/corpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@
"""
import copy
import resources.dataset as rd
import methods.crf as mc
import kpext.resources.dataset as rd
import kpext.methods.crf as mc

class Corpus:
"""Corpus class"""
Expand Down
4 changes: 2 additions & 2 deletions resources/dataset.py → src/kpext/resources/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from nltk.tokenize.treebank import TreebankWordTokenizer
from nltk.tag.perceptron import PerceptronTagger

from config.config import CORPUS, CORPUS_DEFAULT, SEMEVAL2017
from kpext.config.config import CORPUS, CORPUS_DEFAULT, SEMEVAL2017

def get_files(path_to_files):
"""Walk in path"""
Expand Down Expand Up @@ -67,7 +67,7 @@ def load_corpus(name=None):
corpus = name if name else SEMEVAL2017
obj = None
if corpus == SEMEVAL2017:
from resources.semeval2017 import SemEval2017
from kpext.resources.semeval2017 import SemEval2017
obj = SemEval2017()
return obj

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,9 @@
Class SemEval2017.
"""
from config.config import SEMEVAL2017
from resources.corpus import Corpus
import resources.dataset as rd
from kpext.config.config import SEMEVAL2017
from kpext.resources.corpus import Corpus
import kpext.resources.dataset as rd

class SemEval2017(Corpus):
"""Class for SemEval 2017 Task 10 corpus"""
Expand Down
4 changes: 2 additions & 2 deletions tests/test_resources.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@
from nose2.tools import such
from nose2.tools.params import params

from resources import dataset as rd
from resources.semeval2017 import SemEval2017
from kpext.resources import dataset as rd
from kpext.resources.semeval2017 import SemEval2017

with such.A("module to load resources") as it:
@it.has_setup
Expand Down

0 comments on commit 11769aa

Please sign in to comment.