Fix lint
Signed-off-by: Haiko Schol <hs@haikoschol.com>
haikoschol committed May 23, 2020
1 parent 42c99fb commit b74bbe4
Showing 19 changed files with 290 additions and 191 deletions.
2 changes: 1 addition & 1 deletion .travis.yml
@@ -11,7 +11,7 @@ env:
- SECRET_KEY="i1bn=oly)w*2yl-5yc&f!vvgt)p)fh3_2$r#spa!*sw36f5ov7"

before_script:
- pycodestyle --exclude=migrations,settings.py,venv,lib_oval.py,test_ubuntu.py,test_suse.py --max-line-length=100 .
- pycodestyle --exclude=migrations,settings.py,venv,lib_oval.py,test_ubuntu.py,test_suse.py,test_data_source.py --max-line-length=100 .
- psql -c "CREATE DATABASE vulnerablecode;" -U postgres
- ./manage.py migrate

104 changes: 62 additions & 42 deletions vulnerabilities/data_source.py
@@ -44,11 +44,12 @@
class Advisory:
"""
This data class expresses the contract between data sources and the import runner.
Data sources are expected to be usable as context managers and generators, yielding batches of Advisory sequences.
Data sources are expected to be usable as context managers and generators, yielding batches of
Advisory sequences.
NB: There are two representations for package URLs that are commonly used by code consuming this data class;
PackageURL objects and strings. As a convention, the former is referred to in variable names, etc. as
"package_urls" and the latter as "purls".
NB: There are two representations for package URLs that are commonly used by code consuming this
data class; PackageURL objects and strings. As a convention, the former is referred to in
variable names, etc. as "package_urls" and the latter as "purls".
"""
summary: str
impacted_package_urls: Iterable[PackageURL]
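To make the naming convention above concrete, here is a minimal sketch using the packageurl-python library (the example package is made up):

from packageurl import PackageURL

# Object representation; by the convention above this goes in a variable
# named "package_url" (plural: "package_urls").
package_url = PackageURL(type='pypi', name='django', version='1.11.1')

# String representation; by the same convention this goes in a variable
# named "purl" (plural: "purls").
purl = package_url.to_string()
assert purl == 'pkg:pypi/django@1.11.1'

# Parsing the string yields an equal PackageURL object.
assert PackageURL.from_string(purl) == package_url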
@@ -81,8 +82,8 @@ class DataSource(ContextManager):
"""
This class defines how importers consume advisories from a data source.
It makes a distinction between newly added records since the last run and modified records. This allows the import
logic to pick appropriate database operations.
It makes a distinction between newly added records since the last run and modified records. This
allows the import logic to pick appropriate database operations.
"""

CONFIG_CLASS = DataSourceConfiguration
@@ -97,16 +98,18 @@ def __init__(
"""
Create a DataSource instance.
:param batch_size: Maximum number of records to return from added_advisories() and updated_advisories()
:param batch_size: Maximum number of records to return from added_advisories() and
updated_advisories()
:param last_run_date: Optional timestamp when this data source was last inspected
:param cutoff_date: Optional timestamp, records older than this will be ignored
:param config: Optional dictionary with subclass-specific configuration
"""
config = config or {}
try:
self.config = self.__class__.CONFIG_CLASS(batch_size, **config)
# These really should be declared in DataSourceConfiguration above but that would prevent DataSource
# subclasses from declaring mandatory parameters (i.e. positional arguments)
# These really should be declared in DataSourceConfiguration above but that would
# prevent DataSource subclasses from declaring mandatory parameters (i.e. positional
# arguments)
setattr(self.config, 'last_run_date', last_run_date)
setattr(self.config, 'cutoff_date', cutoff_date)
except Exception as e:
@@ -123,40 +126,47 @@ def __exit__(self, exc_type, exc_val, exc_tb):
@property
def cutoff_timestamp(self) -> int:
"""
:return: An integer Unix timestamp of the last time this data source was queried or the cutoff date passed in
the constructor, whichever is more recent.
:return: An integer Unix timestamp of the last time this data source was queried or the
cutoff date passed in the constructor, whichever is more recent.
"""
if not hasattr(self, '_cutoff_timestamp'):
last_run = 0 if self.config.last_run_date is None else int(self.config.last_run_date.timestamp())
cutoff = 0 if self.config.cutoff_date is None else int(self.config.cutoff_date.timestamp())
last_run = 0
if self.config.last_run_date is not None:
last_run = int(self.config.last_run_date.timestamp())

cutoff = 0
if self.config.cutoff_date is not None:
cutoff = int(self.config.cutoff_date.timestamp())

setattr(self, '_cutoff_timestamp', max(last_run, cutoff))

return self._cutoff_timestamp

def validate_configuration(self) -> None:
"""
Subclasses can perform more complex validation than what is handled by data classes and their type annotations.
Subclasses can perform more complex validation than what is handled by data classes and
their type annotations.
This method is called in the constructor. It should raise InvalidConfigurationError with a human-readable
message.
This method is called in the constructor. It should raise InvalidConfigurationError with a
human-readable message.
"""
pass

def added_advisories(self) -> Set[Advisory]:
"""
Subclasses yield batch_size sized batches of Advisory objects that have been added to the data source since the
last run or self.cutoff_date.
Subclasses yield batch_size sized batches of Advisory objects that have been added to the
data source since the last run or self.cutoff_date.
"""
return set()

def updated_advisories(self) -> Set[Advisory]:
"""
Subclasses yield batch_size sized batches of Advisory objects that have been modified since the last run or
self.cutoff_date.
Subclasses yield batch_size sized batches of Advisory objects that have been modified since
the last run or self.cutoff_date.
NOTE: Data sources that do not enable detection of changes to existing records vs added records must only
implement this method, not added_advisories(). The ImportRunner relies on this contract to decide between
insert and update operations.
NOTE: Data sources that do not enable detection of changes to existing records vs added
records must only implement this method, not added_advisories(). The ImportRunner
relies on this contract to decide between insert and update operations.
"""
return set()
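To illustrate the NOTE above, here is a hedged sketch of a data source that cannot tell new records from changed ones and therefore implements only updated_advisories(); the class, feed semantics and advisory contents are made up:

class ExampleDataSource(DataSource):
    # Hypothetical: the upstream feed only exposes the current state of
    # each advisory, so added vs. modified cannot be distinguished and
    # added_advisories() is left unimplemented.
    def updated_advisories(self):
        advisories = [
            Advisory(
                summary='example flaw',
                impacted_package_urls=[],
                resolved_package_urls=[],
                cve_id='CVE-0000-0000',  # placeholder ID
            ),
        ]
        # batch_advisories() turns the list into batch_size sized sets.
        return self.batch_advisories(advisories)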

@@ -170,11 +180,11 @@ def batch_advisories(self, advisories: List[Advisory]) -> Set[Advisory]:
"""
Yield batches of the passed in list of advisories.
"""
advisories = advisories[:] # copy the passed in list as we are mutating it in the loop below
advisories = advisories[:] # copy the list as we are mutating it in the loop below

while advisories:
batch, advisories = advisories[:self.config.batch_size], advisories[self.config.batch_size:]
yield set(batch)
b, advisories = advisories[:self.config.batch_size], advisories[self.config.batch_size:]
yield set(b)
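A quick worked example of the batching behaviour, reusing the hypothetical ExampleDataSource sketched earlier (the keyword argument assumes the defaults declared in DataSourceConfiguration):

ds = ExampleDataSource(batch_size=3)
# Seven hashable items yield batches of sizes 3, 3 and 1.
batches = list(ds.batch_advisories(list('abcdefg')))
assert [len(b) for b in batches] == [3, 3, 1]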


@dataclasses.dataclass
@@ -192,16 +202,17 @@ class GitDataSource(DataSource):
def validate_configuration(self) -> None:

if not self.config.create_working_directory and self.config.working_directory is None:
self.error('"create_working_directory" is not set but "working_directory" is set to the default, which '
'calls tempfile.mkdtemp()')
self.error('"create_working_directory" is not set but "working_directory" is set to '
'the default, which calls tempfile.mkdtemp()')

if not self.config.create_working_directory and not os.path.exists(self.config.working_directory):
self.error('"working_directory" does not contain an existing directory and "create_working_directory" is '
'not set')
if not self.config.create_working_directory and \
not os.path.exists(self.config.working_directory):
self.error('"working_directory" does not contain an existing directory and'
'"create_working_directory" is not set')

if not self.config.remove_working_directory and self.config.working_directory is None:
self.error('"remove_working_directory" is not set and "working_directory" is set to the default, which '
'calls tempfile.mkdtemp()')
self.error('"remove_working_directory" is not set and "working_directory" is set to '
'the default, which calls tempfile.mkdtemp()')

def __enter__(self):
self._ensure_working_directory()
@@ -218,12 +229,14 @@ def file_changes(
file_ext: Optional[str] = None,
) -> Tuple[Set[str], Set[str]]:
"""
Returns all added and modified files since last_run_date or cutoff_date (whichever is more recent).
Returns all added and modified files since last_run_date or cutoff_date (whichever is more
recent).
:param subdir: filter by files in this directory
:param recursive: whether to include files in subdirectories
:param file_ext: filter files by this extension
:return: The first set contains (absolute paths to) added files, the second one modified files
:return: The first set contains (absolute paths to) added files, the second one modified
files
"""
if subdir is None:
working_dir = self.config.working_directory
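For context, the Alpine importer further down in this diff consumes this method as follows; other Git-backed importers would look similar:

added_files, updated_files = self.file_changes(recursive=True, file_ext='yaml')
# Paths in added_files feed insert operations, paths in updated_files
# feed update operations.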
@@ -271,25 +284,27 @@ def _collect_file_changes(

abspath = os.path.join(self.config.working_directory, d.new_file.path)
# TODO
# Just filtering on the two status values for "added" and "modified" is too simplistic.
# This does not cover file renames, copies & deletions.
# Just filtering on the two status values for "added" and "modified" is too
# simplistic. This does not cover file renames, copies & deletions.
if d.status == pygit2.GIT_DELTA_ADDED:
added_files.add(abspath)
elif d.status == pygit2.GIT_DELTA_MODIFIED:
updated_files.add(abspath)

previous_commit = commit

# Any file that has been added and then updated inside the window of the git history we looked at, should be
# considered "added", not "updated", since it does not exist in the database yet.
# Any file that has been added and then updated inside the window of the git history we
# looked at should be considered "added", not "updated", since it does not exist in the
# database yet.
updated_files = updated_files - added_files

return added_files, updated_files

def _ensure_working_directory(self) -> None:
if self.config.working_directory is None:
self.config.working_directory = tempfile.mkdtemp()
elif self.config.create_working_directory and not os.path.exists(self.config.working_directory):
elif self.config.create_working_directory and \
not os.path.exists(self.config.working_directory):
os.mkdir(self.config.working_directory)

def _ensure_repository(self) -> None:
@@ -315,7 +330,11 @@ def _clone_repository(self) -> None:
if getattr(self, 'branch', False):
kwargs['checkout_branch'] = self.config.branch

self._repo = pygit2.clone_repository(self.config.repository_url, self.config.working_directory, **kwargs)
self._repo = pygit2.clone_repository(
self.config.repository_url,
self.config.working_directory,
**kwargs
)

def _find_or_add_remote(self):
remote = None
Expand All @@ -325,7 +344,8 @@ def _find_or_add_remote(self):
break

if remote is None:
remote = self._repo.remotes.create('added_by_vulnerablecode', self.config.repository_url)
remote = self._repo.remotes.create(
'added_by_vulnerablecode', self.config.repository_url)

return remote

39 changes: 23 additions & 16 deletions vulnerabilities/import_runner.py
@@ -41,8 +41,8 @@
class ImportRunner:
"""
The ImportRunner is responsible for inserting and updating data about vulnerabilities and
affected/unaffected/fixed packages in the database. The two main goals for the implementation are correctness and
efficiency.
affected/unaffected/fixed packages in the database. The two main goals for the implementation
are correctness and efficiency.
Correctness:
- There must be no duplicates in the database (should be enforced by the schema).
@@ -64,10 +64,10 @@ def run(self, cutoff_date: datetime.datetime = None) -> None:
cutoff_date - optional timestamp of the oldest data to include in the import
NB: Data sources provide two kinds of records; vulnerabilities and packages. Vulnerabilities are potentially
shared across many packages, from the same data source and from different data sources. For example, a
vulnerability in the Linux kernel is mentioned by advisories from all Linux distributions that package this
kernel version.
NB: Data sources provide two kinds of records; vulnerabilities and packages. Vulnerabilities
are potentially shared across many packages, from the same data source and from different
data sources. For example, a vulnerability in the Linux kernel is mentioned by advisories
from all Linux distributions that package this kernel version.
"""
logger.debug(f'Starting import for {self.importer.name}.')
data_source = self.importer.make_data_source(self.batch_size, cutoff_date=cutoff_date)
@@ -92,9 +92,9 @@ def _process_added_advisories(data_source: DataSource) -> None:

_bulk_insert_impacted_and_resolved_packages(batch, vulnerabilities, impacted, resolved)
except (DataError, RuntimeError) as e:
# FIXME This exception might happen when the max. length of a VARCHAR column is exceeded.
# Skipping an entire batch because one version number might be too long is obviously a terrible way to
# handle this case.
# FIXME This exception might happen when the max. length of a DB column is exceeded.
# Skipping an entire batch because one version number might be too long is obviously a
# terrible way to handle this case.
logger.exception(e)


@@ -107,7 +107,8 @@ def _process_updated_advisories(data_source: DataSource) -> None:
vuln, _ = _get_or_create_vulnerability(advisory)

for id_ in advisory.reference_ids:
models.VulnerabilityReference.objects.get_or_create(vulnerability=vuln, reference_id=id_)
models.VulnerabilityReference.objects.get_or_create(
vulnerability=vuln, reference_id=id_)

for url in advisory.reference_urls:
models.VulnerabilityReference.objects.get_or_create(vulnerability=vuln, url=url)
Expand All @@ -117,22 +118,26 @@ def _process_updated_advisories(data_source: DataSource) -> None:

# FIXME Does not work yet due to cascading deletes.
# if not created:
# qs = models.ResolvedPackage.objects.filter(vulnerability_id=vuln.id, package_id=pkg.id)
# qs = models.ResolvedPackage.objects.filter(
# vulnerability_id=vuln.id, package_id=pkg.id)
# if qs:
# qs[0].delete()

models.ImpactedPackage.objects.get_or_create(vulnerability_id=vuln.id, package_id=pkg.id)
models.ImpactedPackage.objects.get_or_create(
vulnerability_id=vuln.id, package_id=pkg.id)

for rpkg_url in advisory.resolved_package_urls:
pkg, created = _get_or_create_package(rpkg_url)

# FIXME Does not work yet due to cascading deletes.
# if not created:
# qs = models.ImpactedPackage.objects.filter(vulnerability_id=vuln.id, package_id=pkg.id)
# qs = models.ImpactedPackage.objects.filter(
# vulnerability_id=vuln.id, package_id=pkg.id)
# if qs:
# qs[0].delete()

models.ResolvedPackage.objects.get_or_create(vulnerability_id=vuln.id, package_id=pkg.id)
models.ResolvedPackage.objects.get_or_create(
vulnerability_id=vuln.id, package_id=pkg.id)


def _get_or_create_vulnerability(advisory: Advisory) -> Tuple[models.Vulnerability, bool]:
@@ -257,13 +262,15 @@ def _insert_vulnerabilities_and_references(batch: Set[Advisory]) -> Set[models.V
vuln.save()
else:
# FIXME
# There is no way to check whether a vulnerability without a CVE ID already exists in the database.
# There is no way to check whether a vulnerability without a CVE ID already exists in
# the database.
vuln = models.Vulnerability.objects.create(summary=advisory.summary)

vulnerabilities.add(vuln)

for id_ in advisory.reference_ids:
models.VulnerabilityReference.objects.get_or_create(vulnerability=vuln, reference_id=id_)
models.VulnerabilityReference.objects.get_or_create(
vulnerability=vuln, reference_id=id_)

for url in advisory.reference_urls:
models.VulnerabilityReference.objects.get_or_create(vulnerability=vuln, url=url)
3 changes: 2 additions & 1 deletion vulnerabilities/importers/alpine_linux.py
@@ -72,7 +72,8 @@ def __enter__(self):
super(AlpineDataSource, self).__enter__()

if not getattr(self, '_added_files', None):
self._added_files, self._updated_files = self.file_changes(recursive=True, file_ext='yaml')
self._added_files, self._updated_files = self.file_changes(
recursive=True, file_ext='yaml')

def updated_advisories(self) -> Set[Advisory]:
files = self._updated_files.union(self._added_files)
8 changes: 4 additions & 4 deletions vulnerabilities/importers/archlinux.py
@@ -74,9 +74,7 @@ def updated_advisories(self) -> Set[Advisory]:
for record in self._api_response:
advisories.extend(self._parse(record))

while advisories:
batch, advisories = advisories[:self.config.batch_size], advisories[self.config.batch_size:]
yield set(batch)
return self.batch_advisories(advisories)

def _fetch(self) -> Iterable[Mapping]:
with urlopen(self.config.archlinux_tracker_url) as response:
Expand All @@ -103,12 +101,14 @@ def _parse(self, record) -> List[Advisory]:
version=record['fixed'],
))

reference_urls = [f'https://security.archlinux.org/{a}' for a in record['advisories']]

advisories.append(Advisory(
cve_id=cve_id,
summary='',
impacted_package_urls=impacted_purls,
resolved_package_urls=resolved_purls,
reference_urls=[f'https://security.archlinux.org/{a}' for a in record['advisories']],
reference_urls=reference_urls,
))

return advisories