Fix lint
Signed-off-by: Haiko Schol <hs@haikoschol.com>
haikoschol committed May 23, 2020
1 parent 42c99fb commit b74bbe4
Showing 19 changed files with 290 additions and 191 deletions.
2 changes: 1 addition & 1 deletion .travis.yml
@@ -11,7 +11,7 @@ env:
- SECRET_KEY="i1bn=oly)w*2yl-5yc&f!vvgt)p)fh3_2$r#spa!*sw36f5ov7"

before_script:
- pycodestyle --exclude=migrations,settings.py,venv,lib_oval.py,test_ubuntu.py,test_suse.py --max-line-length=100 .
- pycodestyle --exclude=migrations,settings.py,venv,lib_oval.py,test_ubuntu.py,test_suse.py,test_data_source.py --max-line-length=100 .
- psql -c "CREATE DATABASE vulnerablecode;" -U postgres
- ./manage.py migrate

104 changes: 62 additions & 42 deletions vulnerabilities/data_source.py
@@ -44,11 +44,12 @@
class Advisory:
"""
This data class expresses the contract between data sources and the import runner.
Data sources are expected to be usable as context managers and generators, yielding batches of Advisory sequences.
Data sources are expected to be usable as context managers and generators, yielding batches of
Advisory sequences.
NB: There are two representations for package URLs that are commonly used by code consuming this data class;
PackageURL objects and strings. As a convention, the former is referred to in variable names, etc. as
"package_urls" and the latter as "purls".
NB: There are two representations for package URLs that are commonly used by code consuming this
data class; PackageURL objects and strings. As a convention, the former is referred to in
variable names, etc. as "package_urls" and the latter as "purls".
"""
summary: str
impacted_package_urls: Iterable[PackageURL]
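To make the naming convention above concrete, here is a minimal sketch using the packageurl-python library (the example package is made up):

from packageurl import PackageURL

# Object representation; by the convention above this goes in a variable
# named "package_url" (plural: "package_urls").
package_url = PackageURL(type='pypi', name='django', version='1.11.1')

# String representation; by the same convention this goes in a variable
# named "purl" (plural: "purls").
purl = package_url.to_string()
assert purl == 'pkg:pypi/django@1.11.1'

# Parsing the string yields an equal PackageURL object.
assert PackageURL.from_string(purl) == package_url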
@@ -81,8 +82,8 @@ class DataSource(ContextManager):
"""
This class defines how importers consume advisories from a data source.
It makes a distinction between newly added records since the last run and modified records. This allows the import
logic to pick appropriate database operations.
It makes a distinction between newly added records since the last run and modified records. This
allows the import logic to pick appropriate database operations.
"""

CONFIG_CLASS = DataSourceConfiguration
@@ -97,16 +98,18 @@ def __init__(
"""
Create a DataSource instance.
:param batch_size: Maximum number of records to return from added_advisories() and updated_advisories()
:param batch_size: Maximum number of records to return from added_advisories() and
updated_advisories()
:param last_run_date: Optional timestamp when this data source was last inspected
:param cutoff_date: Optional timestamp, records older than this will be ignored
:param config: Optional dictionary with subclass-specific configuration
"""
config = config or {}
try:
self.config = self.__class__.CONFIG_CLASS(batch_size, **config)
# These really should be declared in DataSourceConfiguration above but that would prevent DataSource
# subclasses from declaring mandatory parameters (i.e. positional arguments)
# These really should be declared in DataSourceConfiguration above but that would
# prevent DataSource subclasses from declaring mandatory parameters (i.e. positional
# arguments)
setattr(self.config, 'last_run_date', last_run_date)
setattr(self.config, 'cutoff_date', cutoff_date)
except Exception as e:
@@ -123,40 +126,47 @@ def __exit__(self, exc_type, exc_val, exc_tb):
@property
def cutoff_timestamp(self) -> int:
"""
:return: An integer Unix timestamp of the last time this data source was queried or the cutoff date passed in
the constructor, whichever is more recent.
:return: An integer Unix timestamp of the last time this data source was queried or the
cutoff date passed in the constructor, whichever is more recent.
"""
if not hasattr(self, '_cutoff_timestamp'):
last_run = 0 if self.config.last_run_date is None else int(self.config.last_run_date.timestamp())
cutoff = 0 if self.config.cutoff_date is None else int(self.config.cutoff_date.timestamp())
last_run = 0
if self.config.last_run_date is not None:
last_run = int(self.config.last_run_date.timestamp())

cutoff = 0
if self.config.cutoff_date is not None:
cutoff = int(self.config.cutoff_date.timestamp())

setattr(self, '_cutoff_timestamp', max(last_run, cutoff))

return self._cutoff_timestamp

def validate_configuration(self) -> None:
"""
Subclasses can perform more complex validation than what is handled by data classes and their type annotations.
Subclasses can perform more complex validation than what is handled by data classes and
their type annotations.
This method is called in the constructor. It should raise InvalidConfigurationError with a human-readable
message.
This method is called in the constructor. It should raise InvalidConfigurationError with a
human-readable message.
"""
pass

def added_advisories(self) -> Set[Advisory]:
"""
Subclasses yield batch_size sized batches of Advisory objects that have been added to the data source since the
last run or self.cutoff_date.
Subclasses yield batch_size sized batches of Advisory objects that have been added to the
data source since the last run or self.cutoff_date.
"""
return set()

def updated_advisories(self) -> Set[Advisory]:
"""
Subclasses yield batch_size sized batches of Advisory objects that have been modified since the last run or
self.cutoff_date.
Subclasses yield batch_size sized batches of Advisory objects that have been modified since
the last run or self.cutoff_date.
NOTE: Data sources that do not enable detection of changes to existing records vs added records must only
implement this method, not added_advisories(). The ImportRunner relies on this contract to decide between
insert and update operations.
NOTE: Data sources that do not enable detection of changes to existing records vs added
records must only implement this method, not added_advisories(). The ImportRunner
relies on this contract to decide between insert and update operations.
"""
return set()
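To illustrate the NOTE above, here is a hedged sketch of a data source that cannot tell new records from changed ones and therefore implements only updated_advisories(); the class, feed semantics and advisory contents are made up:

class ExampleDataSource(DataSource):
    # Hypothetical: the upstream feed only exposes the current state of
    # each advisory, so added vs. modified cannot be distinguished and
    # added_advisories() is left unimplemented.
    def updated_advisories(self):
        advisories = [
            Advisory(
                summary='example flaw',
                impacted_package_urls=[],
                resolved_package_urls=[],
                cve_id='CVE-0000-0000',  # placeholder ID
            ),
        ]
        # batch_advisories() turns the list into batch_size sized sets.
        return self.batch_advisories(advisories)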

@@ -170,11 +180,11 @@ def batch_advisories(self, advisories: List[Advisory]) -> Set[Advisory]:
"""
Yield batches of the passed in list of advisories.
"""
advisories = advisories[:] # copy the passed in list as we are mutating it in the loop below
advisories = advisories[:] # copy the list as we are mutating it in the loop below

while advisories:
batch, advisories = advisories[:self.config.batch_size], advisories[self.config.batch_size:]
yield set(batch)
b, advisories = advisories[:self.config.batch_size], advisories[self.config.batch_size:]
yield set(b)
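A quick worked example of the batching behaviour, reusing the hypothetical ExampleDataSource sketched earlier (the keyword argument assumes the defaults declared in DataSourceConfiguration):

ds = ExampleDataSource(batch_size=3)
# Seven hashable items yield batches of sizes 3, 3 and 1.
batches = list(ds.batch_advisories(list('abcdefg')))
assert [len(b) for b in batches] == [3, 3, 1]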


@dataclasses.dataclass
@@ -192,16 +202,17 @@ class GitDataSource(DataSource):
def validate_configuration(self) -> None:

if not self.config.create_working_directory and self.config.working_directory is None:
self.error('"create_working_directory" is not set but "working_directory" is set to the default, which '
'calls tempfile.mkdtemp()')
self.error('"create_working_directory" is not set but "working_directory" is set to '
'the default, which calls tempfile.mkdtemp()')

if not self.config.create_working_directory and not os.path.exists(self.config.working_directory):
self.error('"working_directory" does not contain an existing directory and "create_working_directory" is '
'not set')
if not self.config.create_working_directory and \
not os.path.exists(self.config.working_directory):
self.error('"working_directory" does not contain an existing directory and'
'"create_working_directory" is not set')

if not self.config.remove_working_directory and self.config.working_directory is None:
self.error('"remove_working_directory" is not set and "working_directory" is set to the default, which '
'calls tempfile.mkdtemp()')
self.error('"remove_working_directory" is not set and "working_directory" is set to '
'the default, which calls tempfile.mkdtemp()')

def __enter__(self):
self._ensure_working_directory()
@@ -218,12 +229,14 @@ def file_changes(
file_ext: Optional[str] = None,
) -> Tuple[Set[str], Set[str]]:
"""
Returns all added and modified files since last_run_date or cutoff_date (whichever is more recent).
Returns all added and modified files since last_run_date or cutoff_date (whichever is more
recent).
:param subdir: filter by files in this directory
:param recursive: whether to include files in subdirectories
:param file_ext: filter files by this extension
:return: The first set contains (absolute paths to) added files, the second one modified files
:return: The first set contains (absolute paths to) added files, the second one modified
files
"""
if subdir is None:
working_dir = self.config.working_directory
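For context, the Alpine importer further down in this diff consumes this method as follows; other Git-backed importers would look similar:

added_files, updated_files = self.file_changes(recursive=True, file_ext='yaml')
# Paths in added_files feed insert operations, paths in updated_files
# feed update operations.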
@@ -271,25 +284,27 @@ def _collect_file_changes(

abspath = os.path.join(self.config.working_directory, d.new_file.path)
# TODO
# Just filtering on the two status values for "added" and "modified" is too simplistic.
# This does not cover file renames, copies & deletions.
# Just filtering on the two status values for "added" and "modified" is too
# simplistic. This does not cover file renames, copies & deletions.
if d.status == pygit2.GIT_DELTA_ADDED:
added_files.add(abspath)
elif d.status == pygit2.GIT_DELTA_MODIFIED:
updated_files.add(abspath)

previous_commit = commit

# Any file that has been added and then updated inside the window of the git history we looked at, should be
# considered "added", not "updated", since it does not exist in the database yet.
# Any file that has been added and then updated inside the window of the git history we
# looked at should be considered "added", not "updated", since it does not exist in the
# database yet.
updated_files = updated_files - added_files

return added_files, updated_files

def _ensure_working_directory(self) -> None:
if self.config.working_directory is None:
self.config.working_directory = tempfile.mkdtemp()
elif self.config.create_working_directory and not os.path.exists(self.config.working_directory):
elif self.config.create_working_directory and \
not os.path.exists(self.config.working_directory):
os.mkdir(self.config.working_directory)

def _ensure_repository(self) -> None:
@@ -315,7 +330,11 @@ def _clone_repository(self) -> None:
if getattr(self, 'branch', False):
kwargs['checkout_branch'] = self.config.branch

self._repo = pygit2.clone_repository(self.config.repository_url, self.config.working_directory, **kwargs)
self._repo = pygit2.clone_repository(
self.config.repository_url,
self.config.working_directory,
**kwargs
)

def _find_or_add_remote(self):
remote = None
Expand All @@ -325,7 +344,8 @@ def _find_or_add_remote(self):
break

if remote is None:
remote = self._repo.remotes.create('added_by_vulnerablecode', self.config.repository_url)
remote = self._repo.remotes.create(
'added_by_vulnerablecode', self.config.repository_url)

return remote

39 changes: 23 additions & 16 deletions vulnerabilities/import_runner.py
@@ -41,8 +41,8 @@
class ImportRunner:
"""
The ImportRunner is responsible for inserting and updating data about vulnerabilities and
affected/unaffected/fixed packages in the database. The two main goals for the implementation are correctness and
efficiency.
affected/unaffected/fixed packages in the database. The two main goals for the implementation
are correctness and efficiency.
Correctness:
- There must be no duplicates in the database (should be enforced by the schema).
@@ -64,10 +64,10 @@ def run(self, cutoff_date: datetime.datetime = None) -> None:
cutoff_date - optional timestamp of the oldest data to include in the import
NB: Data sources provide two kinds of records; vulnerabilities and packages. Vulnerabilities are potentially
shared across many packages, from the same data source and from different data sources. For example, a
vulnerability in the Linux kernel is mentioned by advisories from all Linux distributions that package this
kernel version.
NB: Data sources provide two kinds of records; vulnerabilities and packages. Vulnerabilities
are potentially shared across many packages, from the same data source and from different
data sources. For example, a vulnerability in the Linux kernel is mentioned by advisories
from all Linux distributions that package this kernel version.
"""
logger.debug(f'Starting import for {self.importer.name}.')
data_source = self.importer.make_data_source(self.batch_size, cutoff_date=cutoff_date)
@@ -92,9 +92,9 @@ def _process_added_advisories(data_source: DataSource) -> None:

_bulk_insert_impacted_and_resolved_packages(batch, vulnerabilities, impacted, resolved)
except (DataError, RuntimeError) as e:
# FIXME This exception might happen when the max. length of a VARCHAR column is exceeded.
# Skipping an entire batch because one version number might be too long is obviously a terrible way to
# handle this case.
# FIXME This exception might happen when the max. length of a DB column is exceeded.
# Skipping an entire batch because one version number might be too long is obviously a
# terrible way to handle this case.
logger.exception(e)


@@ -107,7 +107,8 @@ def _process_updated_advisories(data_source: DataSource) -> None:
vuln, _ = _get_or_create_vulnerability(advisory)

for id_ in advisory.reference_ids:
models.VulnerabilityReference.objects.get_or_create(vulnerability=vuln, reference_id=id_)
models.VulnerabilityReference.objects.get_or_create(
vulnerability=vuln, reference_id=id_)

for url in advisory.reference_urls:
models.VulnerabilityReference.objects.get_or_create(vulnerability=vuln, url=url)
Expand All @@ -117,22 +118,26 @@ def _process_updated_advisories(data_source: DataSource) -> None:

# FIXME Does not work yet due to cascading deletes.
# if not created:
# qs = models.ResolvedPackage.objects.filter(vulnerability_id=vuln.id, package_id=pkg.id)
# qs = models.ResolvedPackage.objects.filter(
# vulnerability_id=vuln.id, package_id=pkg.id)
# if qs:
# qs[0].delete()

models.ImpactedPackage.objects.get_or_create(vulnerability_id=vuln.id, package_id=pkg.id)
models.ImpactedPackage.objects.get_or_create(
vulnerability_id=vuln.id, package_id=pkg.id)

for rpkg_url in advisory.resolved_package_urls:
pkg, created = _get_or_create_package(rpkg_url)

# FIXME Does not work yet due to cascading deletes.
# if not created:
# qs = models.ImpactedPackage.objects.filter(vulnerability_id=vuln.id, package_id=pkg.id)
# qs = models.ImpactedPackage.objects.filter(
# vulnerability_id=vuln.id, package_id=pkg.id)
# if qs:
# qs[0].delete()

models.ResolvedPackage.objects.get_or_create(vulnerability_id=vuln.id, package_id=pkg.id)
models.ResolvedPackage.objects.get_or_create(
vulnerability_id=vuln.id, package_id=pkg.id)


def _get_or_create_vulnerability(advisory: Advisory) -> Tuple[models.Vulnerability, bool]:
@@ -257,13 +262,15 @@ def _insert_vulnerabilities_and_references(batch: Set[Advisory]) -> Set[models.V
vuln.save()
else:
# FIXME
# There is no way to check whether a vulnerability without a CVE ID already exists in the database.
# There is no way to check whether a vulnerability without a CVE ID already exists in
# the database.
vuln = models.Vulnerability.objects.create(summary=advisory.summary)

vulnerabilities.add(vuln)

for id_ in advisory.reference_ids:
models.VulnerabilityReference.objects.get_or_create(vulnerability=vuln, reference_id=id_)
models.VulnerabilityReference.objects.get_or_create(
vulnerability=vuln, reference_id=id_)

for url in advisory.reference_urls:
models.VulnerabilityReference.objects.get_or_create(vulnerability=vuln, url=url)
3 changes: 2 additions & 1 deletion vulnerabilities/importers/alpine_linux.py
@@ -72,7 +72,8 @@ def __enter__(self):
super(AlpineDataSource, self).__enter__()

if not getattr(self, '_added_files', None):
self._added_files, self._updated_files = self.file_changes(recursive=True, file_ext='yaml')
self._added_files, self._updated_files = self.file_changes(
recursive=True, file_ext='yaml')

def updated_advisories(self) -> Set[Advisory]:
files = self._updated_files.union(self._added_files)
8 changes: 4 additions & 4 deletions vulnerabilities/importers/archlinux.py
@@ -74,9 +74,7 @@ def updated_advisories(self) -> Set[Advisory]:
for record in self._api_response:
advisories.extend(self._parse(record))

while advisories:
batch, advisories = advisories[:self.config.batch_size], advisories[self.config.batch_size:]
yield set(batch)
return self.batch_advisories(advisories)

def _fetch(self) -> Iterable[Mapping]:
with urlopen(self.config.archlinux_tracker_url) as response:
Expand All @@ -103,12 +101,14 @@ def _parse(self, record) -> List[Advisory]:
version=record['fixed'],
))

reference_urls = [f'https://security.archlinux.org/{a}' for a in record['advisories']]

advisories.append(Advisory(
cve_id=cve_id,
summary='',
impacted_package_urls=impacted_purls,
resolved_package_urls=resolved_purls,
reference_urls=[f'https://security.archlinux.org/{a}' for a in record['advisories']],
reference_urls=reference_urls,
))

return advisories