# -*- coding: utf-8 -*-
#
# Copyright (C) 2015-2019 Bitergia
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
#
# Authors:
#   Nishchith Shetty <inishchith@gmail.com>
#

import logging

from grimoirelab_toolkit.datetime import str_to_datetime

from .enrich import Enrich, metadata


MAX_SIZE_BULK_ENRICHED_ITEMS = 200

logger = logging.getLogger(__name__)


class ColicEnrich(Enrich):
    """Enricher for CoLic items (per-file license and copyright analysis)."""

    def get_identities(self, item):
        """ Return the identities from an item """
        identities = []

        return identities

    def has_identities(self):
        """ Return whether the enriched items contain identities """

        return False

    def get_field_unique_id(self):
        return "id"

    def extract_modules(self, file_path):
        """ Extract the module (directory) paths from the given file path """
        path_chunks = file_path.split('/')

        # Collect the cumulative directory prefixes of the path; the
        # file name itself is never included as a module
        modules = []
        for idx in range(len(path_chunks)):
            sub_path = '/'.join(path_chunks[:idx])

            if sub_path:
                modules.append(sub_path)

        return modules

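    # Illustration only (the path below is a made-up example, not from the
    # source): for a file path like "perceval/backends/core/git.py",
    # extract_modules returns the cumulative directory prefixes:
    #   ['perceval', 'perceval/backends', 'perceval/backends/core']
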
    @metadata
    def get_rich_item(self, file_analysis):
        # TODO: requires adjustments regarding category of backend used

        eitem = {}

        # entry["holders"] = file_analysis["holders"][0]["value"]
        eitem["file_path"] = file_analysis["file_path"]
        eitem["modules"] = self.extract_modules(eitem["file_path"])
        eitem["copyrights"] = []
        eitem["licenses"] = []
        eitem["license_name"] = []
        eitem["has_license"] = 0
        eitem["has_copyright"] = 0

        if file_analysis.get("licenses", False):
            eitem["has_license"] = 1
            for _license in file_analysis["licenses"]:
                eitem["licenses"].extend(_license["matched_rule"]["licenses"])
                eitem["license_name"].append(_license["name"])

        if file_analysis.get("copyrights", False):
            eitem["has_copyright"] = 1
            for _copyright in file_analysis["copyrights"]:
                eitem["copyrights"].append(_copyright["value"])

        return eitem

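    # Sketch of the shape of a `file_analysis` entry as inferred from the
    # accesses in get_rich_item above; the field values are illustrative
    # assumptions, not real data:
    #   {
    #       "file_path": "src/module/file.py",
    #       "licenses": [{"name": "...", "matched_rule": {"licenses": ["..."]}}],
    #       "copyrights": [{"value": "Copyright (C) ..."}]
    #   }
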
    def get_rich_items(self, item):
        # The real data
        entry = item['data']

        enriched_items = []

        for file_analysis in entry["analysis"]:
            eitem = self.get_rich_item(file_analysis)

            for f in self.RAW_FIELDS_COPY:
                if f in item:
                    eitem[f] = item[f]
                else:
                    eitem[f] = None

            # common attributes
            eitem['commit_sha'] = entry['commit']
            eitem['author'] = entry['Author']
            eitem['committer'] = entry['Commit']
            eitem['commit'] = entry['commit']
            eitem['message'] = entry['message']
            eitem['author_date'] = self.__fix_field_date(entry['AuthorDate'])
            eitem['commit_date'] = self.__fix_field_date(entry['CommitDate'])

            if self.prjs_map:
                eitem.update(self.get_item_project(eitem))

            # uuid
            eitem['id'] = "{}_{}".format(eitem['commit_sha'], eitem['file_path'])

            eitem.update(self.get_grimoire_fields(entry["AuthorDate"], "file"))

            self.add_repository_labels(eitem)
            self.add_metadata_filter_raw(eitem)

            enriched_items.append(eitem)

        return enriched_items

    def enrich_items(self, ocean_backend, events=False):
        items_to_enrich = []
        num_items = 0
        ins_items = 0

        for item in ocean_backend.fetch():
            rich_items = self.get_rich_items(item)

            items_to_enrich.extend(rich_items)
            if len(items_to_enrich) < MAX_SIZE_BULK_ENRICHED_ITEMS:
                continue

            num_items += len(items_to_enrich)
            ins_items += self.elastic.bulk_upload(items_to_enrich, self.get_field_unique_id())
            items_to_enrich = []

        if len(items_to_enrich) > 0:
            num_items += len(items_to_enrich)
            ins_items += self.elastic.bulk_upload(items_to_enrich, self.get_field_unique_id())

        if num_items != ins_items:
            missing = num_items - ins_items
            logger.error("%s/%s missing items for CoLic", str(missing), str(num_items))
        else:
            logger.info("%s items inserted for CoLic", str(num_items))

        return num_items

    def __fix_field_date(self, date_value):
        """Fix possible errors in the field date"""

        field_date = str_to_datetime(date_value)

        try:
            # Check that the UTC offset is parseable; if not, drop the
            # timezone info and keep the naive datetime
            _ = int(field_date.strftime("%z")[0:3])
        except ValueError:
            field_date = field_date.replace(tzinfo=None)

        return field_date.isoformat()