From a516b7da80c6919bb4ffda3c1d5e043b65fb3fe9 Mon Sep 17 00:00:00 2001 From: inishchith Date: Fri, 14 Jun 2019 10:56:01 +0530 Subject: [PATCH] [integration] Add support of Graal's CoLic Backend to ELK Signed-off-by: inishchith --- grimoire_elk/enriched/colic.py | 167 +++++++++++++++++++++++++++++ grimoire_elk/raw/colic.py | 70 ++++++++++++ grimoire_elk/utils.py | 5 + requirements.txt | 1 + tests/data/colic.json | 189 +++++++++++++++++++++++++++++++++ tests/test_colic.py | 89 ++++++++++++++++ 6 files changed, 521 insertions(+) create mode 100644 grimoire_elk/enriched/colic.py create mode 100644 grimoire_elk/raw/colic.py create mode 100644 tests/data/colic.json create mode 100644 tests/test_colic.py diff --git a/grimoire_elk/enriched/colic.py b/grimoire_elk/enriched/colic.py new file mode 100644 index 000000000..9f5069191 --- /dev/null +++ b/grimoire_elk/enriched/colic.py @@ -0,0 +1,167 @@ +# -*- coding: utf-8 -*- +# +# Copyright (C) 2015-2019 Bitergia +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
+#
+# Authors:
+#   Nishchith Shetty
+#
+
+import logging
+from .enrich import Enrich, metadata
+from grimoirelab_toolkit.datetime import str_to_datetime
+
+
+MAX_SIZE_BULK_ENRICHED_ITEMS = 200
+
+logger = logging.getLogger(__name__)
+
+
+class ColicEnrich(Enrich):
+    """Enrich the items produced by Graal's CoLic backend."""
+
+    def get_identities(self, item):
+        """Return the identities from an item (always an empty list)."""
+        return []
+
+    def has_identities(self):
+        """Return whether the enriched items contain identities."""
+        return False
+
+    def get_field_unique_id(self):
+        """Return the name of the field used as unique id of an item."""
+        return "id"
+
+    def extract_modules(self, file_path):
+        """Extract the module paths of a file, e.g. "a/b/c.py" -> ["a", "a/b"]."""
+        path_chunks = file_path.split('/')
+        sub_paths = ['/'.join(path_chunks[:idx]) for idx in range(1, len(path_chunks))]
+
+        # A leading '/' produces an empty first prefix, which is discarded
+        return [sub_path for sub_path in sub_paths if sub_path]
+
+    @metadata
+    def get_rich_item(self, file_analysis):
+        """Build the enriched item for the analysis of a single file.
+
+        :param file_analysis: one entry of the 'analysis' list of a raw item
+        :returns: dictionary with the license and copyright data of the file
+        """
+        # TODO: requires adjustments regarding category of backend used
+
+        licenses = file_analysis.get("licenses") or []
+        copyrights = file_analysis.get("copyrights") or []
+
+        eitem = {
+            "file_path": file_analysis["file_path"],
+            "modules": self.extract_modules(file_analysis["file_path"]),
+            "copyrights": [_copyright["value"] for _copyright in copyrights],
+            "licenses": [],
+            "license_name": [_license["name"] for _license in licenses],
+            "has_license": 1 if licenses else 0,
+            "has_copyright": 1 if copyrights else 0
+        }
+
+        # Every matched rule may report several license keys
+        for _license in licenses:
+            eitem["licenses"].extend(_license["matched_rule"]["licenses"])
+
+        return eitem
+
+    def get_rich_items(self, item):
+        """Return one enriched item per file analyzed in the raw item.
+
+        :param item: raw item; its 'data' holds the commit and the analysis
+        :returns: list of enriched items
+        """
+        entry = item['data']
+
+        # Attributes shared by all the files of the commit, computed once
+        common = {
+            'commit_sha': entry['commit'],
+            'author': entry['Author'],
+            'committer': entry['Commit'],
+            'commit': entry['commit'],
+            'message': entry['message'],
+            'author_date': self.__fix_field_date(entry['AuthorDate']),
+            'commit_date': self.__fix_field_date(entry['CommitDate'])
+        }
+
+        enriched_items = []
+
+        for file_analysis in entry["analysis"]:
+            eitem = self.get_rich_item(file_analysis)
+
+            for f in self.RAW_FIELDS_COPY:
+                eitem[f] = item.get(f)
+
+            eitem.update(common)
+
+            if self.prjs_map:
+                eitem.update(self.get_item_project(eitem))
+
+            # uuid
+            eitem['id'] = "{}_{}".format(eitem['commit_sha'], eitem['file_path'])
+
+            eitem.update(self.get_grimoire_fields(entry["AuthorDate"], "file"))
+
+            self.add_repository_labels(eitem)
+            self.add_metadata_filter_raw(eitem)
+
+            enriched_items.append(eitem)
+
+        return enriched_items
+
+    def enrich_items(self, ocean_backend, events=False):
+        """Enrich the raw items and upload them in bulks (`events` is not used).
+
+        :returns: number of enriched items passed to the bulk uploads
+        """
+        items_to_enrich = []
+        num_items = ins_items = 0
+
+        for item in ocean_backend.fetch():
+            items_to_enrich.extend(self.get_rich_items(item))
+
+            if len(items_to_enrich) < MAX_SIZE_BULK_ENRICHED_ITEMS:
+                continue
+
+            num_items += len(items_to_enrich)
+            ins_items += self.elastic.bulk_upload(items_to_enrich, self.get_field_unique_id())
+            items_to_enrich = []
+
+        if items_to_enrich:
+            num_items += len(items_to_enrich)
+            ins_items += self.elastic.bulk_upload(items_to_enrich, self.get_field_unique_id())
+
+        if num_items != ins_items:
+            missing = num_items - ins_items
+            logger.error("%s/%s missing items for CoLic", missing, num_items)
+        else:
+            logger.info("%s items inserted for CoLic", num_items)
+
+        return num_items
+
+    def __fix_field_date(self, date_value):
+        """Return the date in ISO 8601 format, dropping an unparsable timezone."""
+        field_date = str_to_datetime(date_value)
+
+        try:
+            int(field_date.strftime("%z")[0:3])
+        except ValueError:
+            field_date = field_date.replace(tzinfo=None)
+
+        return field_date.isoformat()
diff --git a/grimoire_elk/raw/colic.py b/grimoire_elk/raw/colic.py new file mode 100644 index 000000000..75c494939 --- /dev/null +++ b/grimoire_elk/raw/colic.py @@ -0,0 +1,70
@@
+# -*- coding: utf-8 -*-
+#
+# Copyright (C) 2015-2019 Bitergia
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+#
+# Authors:
+#   Nishchith Shetty
+#
+
+from .elastic import ElasticOcean
+from ..elastic_mapping import Mapping as BaseMapping
+
+
+class Mapping(BaseMapping):
+    """Mapping of the CoLic raw index."""
+
+    @staticmethod
+    def get_elastic_mappings(es_major):
+        """Get the Elasticsearch mapping of the raw items.
+
+        `data.message` is mapped as text, since it can be very large.
+
+        :param es_major: major version of Elasticsearch, as string
+        :returns: dictionary with a key, 'items', with the mapping
+        """
+        mapping = '''
+        {
+            "dynamic":true,
+            "properties": {
+                "data": {
+                    "properties": {
+                        "message": {
+                            "type": "text",
+                            "index": true
+                        }
+                    }
+                }
+            }
+        }
+        '''
+
+        return {"items": mapping}
+
+
+class ColicOcean(ElasticOcean):
+    """CoLic Ocean feeder"""
+
+    mapping = Mapping
+
+    @classmethod
+    def get_perceval_params_from_url(cls, url):
+        """Get the Perceval params: the URL without the filter after it."""
+
+        # Just split the URL not the filter
+        repository = url.split(' ', 1)[0]
+
+        return [repository]
diff --git a/grimoire_elk/utils.py b/grimoire_elk/utils.py index d4fa7dd16..21db29213 100755 --- a/grimoire_elk/utils.py +++ b/grimoire_elk/utils.py @@ -29,6 +29,8 @@ from grimoire_elk.elastic import ElasticConnectException from grimoire_elk.elastic import ElasticSearch +# Connectors for Graal +from graal.backends.core.colic import CoLic, CoLicCommand # Connectors for Perceval from grimoire_elk.raw.hyperkitty import HyperKittyOcean from perceval.backends.core.askbot import Askbot, AskbotCommand @@ -68,6 +70,7 @@ from perceval.backends.mozilla.remo import ReMo, ReMoCommand from perceval.backends.opnfv.functest import Functest, FunctestCommand # Connectors for EnrichOcean +from .enriched.colic import ColicEnrich from .enriched.askbot import AskbotEnrich from .enriched.bugzilla import BugzillaEnrich from .enriched.bugzillarest import BugzillaRESTEnrich @@ -105,6 +108,7 @@ from .enriched.telegram import TelegramEnrich from .enriched.twitter import TwitterEnrich # Connectors for Ocean +from .raw.colic import ColicOcean from .raw.askbot import AskbotOcean from .raw.bugzilla import BugzillaOcean from .raw.bugzillarest import BugzillaRESTOcean @@ -200,6 +204,7 @@ def get_connectors(): return {"askbot": [Askbot, AskbotOcean, AskbotEnrich, AskbotCommand], "bugzilla": [Bugzilla, BugzillaOcean, BugzillaEnrich,
BugzillaCommand], "bugzillarest": [BugzillaREST, BugzillaRESTOcean, BugzillaRESTEnrich, BugzillaRESTCommand], + "colic": [CoLic, ColicOcean, ColicEnrich, CoLicCommand], "confluence": [Confluence, ConfluenceOcean, ConfluenceEnrich, ConfluenceCommand], "crates": [Crates, CratesOcean, CratesEnrich, CratesCommand], "discourse": [Discourse, DiscourseOcean, DiscourseEnrich, DiscourseCommand], diff --git a/requirements.txt b/requirements.txt index 05c5b3b15..3a340b687 100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,6 +8,7 @@ urllib3==1.24.3 -e git+https://github.com/chaoss/grimoirelab-cereslib/#egg=grimoirelab-cereslib -e git+https://github.com/chaoss/grimoirelab-kingarthur/#egg=grimoirelab-kingarthur -e git+https://github.com/chaoss/grimoirelab-perceval/#egg=grimoirelab-perceval +-e git+https://github.com/chaoss/grimoirelab-graal/#egg=grimoirelab-graal -e git+https://github.com/chaoss/grimoirelab-perceval-mozilla/#egg=grimoirelab-perceval-mozilla -e git+https://github.com/chaoss/grimoirelab-perceval-opnfv/#egg=grimoirelab-perceval-opnfv -e git+https://github.com/chaoss/grimoirelab-perceval-puppet/#egg=grimoirelab-perceval-puppet diff --git a/tests/data/colic.json b/tests/data/colic.json new file mode 100644 index 000000000..803dbfe23 --- /dev/null +++ b/tests/data/colic.json @@ -0,0 +1,189 @@ +[{ + "backend_name": "CoLic", + "backend_version": "0.5.0", + "category": "code_license_scancode_cli", + "data": { + "Author": "Valerio Cosentino ", + "AuthorDate": "Sun May 6 13:11:43 2018 +0200", + "Commit": "Valerio Cosentino ", + "CommitDate": "Sun May 6 13:11:43 2018 +0200", + "analysis": [{ + "authors": [], + "base_name": "LICENSE", + "copyrights": [{ + "end_line": 6, + "start_line": 4, + "value": "Copyright (c) 2007 Free Software Foundation, Inc. 
" + }], + "date": "2019-07-08", + "dirs_count": 0, + "extension": "", + "file_path": "LICENSE", + "file_type": "ASCII text", + "files_count": 0, + "holders": [{ + "end_line": 6, + "start_line": 4, + "value": "Free Software Foundation, Inc." + }], + "is_archive": false, + "is_binary": false, + "is_media": false, + "is_script": false, + "is_source": false, + "is_text": true, + "license_expressions": [ + "gpl-3.0" + ], + "licenses": [{ + "category": "Copyleft", + "end_line": 674, + "homepage_url": "http://www.gnu.org/licenses/gpl-3.0.html", + "is_exception": false, + "key": "gpl-3.0", + "matched_rule": { + "identifier": "gpl-3.0.LICENSE", + "is_license_notice": false, + "is_license_reference": false, + "is_license_tag": false, + "is_license_text": true, + "license_expression": "gpl-3.0", + "licenses": [ + "gpl-3.0" + ], + "match_coverage": 100.0, + "matched_length": 5700, + "matcher": "1-hash", + "rule_length": 5700, + "rule_relevance": 100 + }, + "matched_text": "GNU GENERAL PUBLIC LICENSE\n Version 3, 29 June 2007\n\n Copyright (C) 2007 Free Software Foundation, Inc. 
\n Everyone is permitted to copy and distribute verbatim copies\n of this license document, but changing it is not allowed.\n\n", + "name": "GNU General Public License 3.0", + "owner": "Free Software Foundation (FSF)", + "reference_url": "https://enterprise.dejacode.com/urn/urn:dje:license:gpl-3.0", + "score": 100.0, + "short_name": "GPL 3.0", + "spdx_license_key": "GPL-3.0-only", + "spdx_url": "https://spdx.org/licenses/GPL-3.0-only", + "start_line": 1, + "text_url": "http://www.gnu.org/licenses/gpl-3.0-standalone.html" + }], + "md5": "d32239bcb673463ab874e80d47fae504", + "mime_type": "text/plain", + "name": "LICENSE", + "path": "LICENSE", + "programming_language": null, + "scan_errors": [], + "sha1": "8624bcdae55baeef00cd11d5dfcfa60f68710a02", + "size": 35147, + "size_count": 0, + "type": "file" + }], + "analyzer": "scancode_cli", + "commit": "2fb9a49363021922eb0fcc9874baabfc252a827c", + "message": "[graal] Initial commit" + }, + "graal_version": "0.2.1", + "origin": "https://github.com/chaoss/grimoirelab-graal", + "tag": "https://github.com/chaoss/grimoirelab-graal", + "timestamp": 1562563540.615095, + "updated_on": 1525605103.0, + "uuid": "29d7a294d2316825de824f1084a783f8479073e0" + }, + { + "backend_name": "CoLic", + "backend_version": "0.5.0", + "category": "code_license_scancode_cli", + "data": { + "Author": "Valerio Cosentino ", + "AuthorDate": "Sun May 6 13:56:51 2018 +0200", + "Commit": "Valerio Cosentino ", + "CommitDate": "Sun May 6 13:56:51 2018 +0200", + "analysis": [{ + "authors": [{ + "end_line": 20, + "start_line": 19, + "value": "Valerio Cosentino " + }], + "base_name": "codecomplexity", + "copyrights": [{ + "end_line": 3, + "start_line": 3, + "value": "Copyright (c) 2015-2018 Bitergia" + }], + "date": "2019-07-08", + "dirs_count": 0, + "extension": ".py", + "file_path": "graal/codecomplexity.py", + "file_type": "Python script, ASCII text executable", + "files_count": 0, + "holders": [{ + "end_line": 3, + "start_line": 3, + "value": "Bitergia" + 
}], + "is_archive": false, + "is_binary": false, + "is_media": false, + "is_script": true, + "is_source": true, + "is_text": true, + "license_expressions": [ + "gpl-3.0-plus" + ], + "licenses": [{ + "category": "Copyleft", + "end_line": 17, + "homepage_url": "http://www.gnu.org/licenses/gpl-3.0-standalone.html", + "is_exception": false, + "key": "gpl-3.0-plus", + "matched_rule": { + "identifier": "gpl-3.0-plus_12.RULE", + "is_license_notice": true, + "is_license_reference": false, + "is_license_tag": false, + "is_license_text": false, + "license_expression": "gpl-3.0-plus", + "licenses": [ + "gpl-3.0-plus" + ], + "match_coverage": 98.2, + "matched_length": 109, + "matcher": "3-seq", + "rule_length": 111, + "rule_relevance": 100 + }, + "matched_text": "This program is free software; you can redistribute it and/or modify\n# it under the terms of the GNU General Public License as published by\n# the Free Software Foundation; either version 3 [of] [the] [License], or\n# (at your option) any later version.\n#\n# This program is distributed in the hope that it will be useful,\n# but WITHOUT ANY WARRANTY; without even the implied warranty of\n# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the\n# GNU General Public License for more details.\n#\n# You should have received a copy of the GNU General Public License\n# along with this program; if not, write to the Free Software\n# Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-", + "name": "GNU General Public License 3.0 or later", + "owner": "Free Software Foundation (FSF)", + "reference_url": "https://enterprise.dejacode.com/urn/urn:dje:license:gpl-3.0-plus", + "score": 98.2, + "short_name": "GPL 3.0 or later", + "spdx_license_key": "GPL-3.0-or-later", + "spdx_url": "https://spdx.org/licenses/GPL-3.0-or-later", + "start_line": 5, + "text_url": "http://www.gnu.org/licenses/gpl-3.0-standalone.html" + }], + "md5": "aa66e700b06ead2a28c2dc29633ebc00", + "mime_type": "text/x-python", + "name": "codecomplexity.py", + "path": "codecomplexity.py", + "programming_language": "Python", + "scan_errors": [], + "sha1": "124e07ae6c850eb232aaf07f43cdb2b2ad2a1db1", + "size": 7817, + "size_count": 0, + "type": "file" + }], + "analyzer": "scancode_cli", + "commit": "a957488c9bd95e3b72a30611edc61496ee152430", + "message": "[codecomplexity] Enable analysis with no file filtering\n\nThis patch allows to handle analysis without file filtering." + }, + "graal_version": "0.2.1", + "origin": "https://github.com/chaoss/grimoirelab-graal", + "tag": "https://github.com/chaoss/grimoirelab-graal", + "timestamp": 1562563562.34835, + "updated_on": 1525607811.0, + "uuid": "ce7c47568fd87100aff497dd7677b0736d85db1e" + } +] \ No newline at end of file diff --git a/tests/test_colic.py b/tests/test_colic.py new file mode 100644 index 000000000..a376ddbea --- /dev/null +++ b/tests/test_colic.py @@ -0,0 +1,89 @@ +# -*- coding: utf-8 -*- +# +# Copyright (C) 2015-2019 Bitergia +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3 of the License, or +# (at your option) any later version. 
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+#
+# Authors:
+#   Nishchith Shetty
+#
+import logging
+import unittest
+
+from base import TestBaseBackend
+
+
+HEADER_JSON = {"Content-Type": "application/json"}
+
+
+class TestCoLic(TestBaseBackend):
+    """Test CoLic backend"""
+
+    connector = "colic"
+    ocean_index = "test_" + connector
+    enrich_index = "test_" + connector + "_enrich"
+
+    def test_has_identities(self):
+        """Test value of has_identities method"""
+
+        enrich_backend = self.connectors[self.connector][2]()
+        self.assertFalse(enrich_backend.has_identities())
+
+    def test_items_to_raw(self):
+        """Test whether JSON items are properly inserted into ES"""
+
+        result = self._test_items_to_raw()
+
+        self.assertGreater(result['items'], 0)
+        self.assertGreater(result['raw'], 0)
+        self.assertGreaterEqual(result['items'], result['raw'])
+
+    def test_raw_to_enrich(self):
+        """Test whether the raw index is properly enriched"""
+
+        result = self._test_raw_to_enrich()
+
+        self.assertGreater(result['raw'], 0)
+        self.assertGreater(result['enrich'], 0)
+        self.assertGreaterEqual(result['enrich'], result['raw'])
+
+        enrich_backend = self.connectors[self.connector][2]()
+
+        # File at the root of the repository: it belongs to no module
+        eitem = enrich_backend.get_rich_items(self.items[0])[0]
+        self.assertEqual(eitem['licenses'], ["gpl-3.0"])
+        self.assertEqual(eitem['has_license'], 1)
+        self.assertEqual(eitem['license_name'], ["GNU General Public License 3.0"])
+        self.assertEqual(eitem['copyrights'], ["Copyright (c) 2007 Free Software Foundation, Inc. "])
+        self.assertEqual(eitem['has_copyright'], 1)
+        self.assertEqual(eitem['modules'], [])
+        self.assertEqual(eitem['file_path'], "LICENSE")
+
+        # File inside a directory: the directory is its only module
+        eitem = enrich_backend.get_rich_items(self.items[1])[0]
+        self.assertEqual(eitem['licenses'], ["gpl-3.0-plus"])
+        self.assertEqual(eitem['has_license'], 1)
+        self.assertEqual(eitem['license_name'], ["GNU General Public License 3.0 or later"])
+        self.assertEqual(eitem['copyrights'], ["Copyright (c) 2015-2018 Bitergia"])
+        self.assertEqual(eitem['has_copyright'], 1)
+        self.assertEqual(eitem['modules'], ["graal"])
+        self.assertEqual(eitem['file_path'], "graal/codecomplexity.py")
+
+
+if __name__ == "__main__":
+    logging.basicConfig(level=logging.INFO, format='%(asctime)s %(message)s')
+    logging.getLogger("urllib3").setLevel(logging.WARNING)
+    logging.getLogger("requests").setLevel(logging.WARNING)
+    unittest.main(warnings='ignore')