Commit

WIP
Signed-off-by: Haiko Schol <hs@haikoschol.com>
haikoschol committed Apr 21, 2020
1 parent 7f1e8ab commit 6c4c5a1
Showing 86 changed files with 2,908 additions and 30 deletions.
9 changes: 7 additions & 2 deletions requirements.txt
@@ -1,5 +1,8 @@
asgiref==3.2.7
attrs==19.3.0
beautifulsoup4==4.7.1
cached-property==1.5.1
cffi==1.14.0
dephell-specifier==0.2.1
dj-database-url==0.4.2
Django==3.0.3
@@ -15,20 +18,22 @@ pluggy==0.13.1
psycopg2==2.8.4
py==1.8.0
pycodestyle==2.5.0
pycparser==2.20
pygit2==1.2.0
pyparsing==2.4.5
pytest==5.3.2
pytest-dependency==0.4.0
pytest-django==3.7.0
pytest-mock==1.13.0
pytoml==0.1.21
pytz==2019.3
PyYAML==5.3
saneyaml==0.4
schema==0.7.1
six==1.13.0
soupsieve==1.9.5
sqlparse==0.3.0
tqdm==4.41.1
wcwidth==0.1.7
whitenoise==5.0.1
zipp==0.6.0
297 changes: 270 additions & 27 deletions vulnerabilities/data_source.py
@@ -20,80 +20,323 @@
# VulnerableCode is a free software code scanning tool from nexB Inc. and others.
# Visit https://github.com/nexB/vulnerablecode/ for support and download.

import dataclasses
import os
import shutil
import tempfile
from datetime import datetime
from pathlib import Path
from typing import Any
from typing import ContextManager
from typing import List
from typing import Mapping
from typing import Optional
from typing import Sequence
from typing import Set

import pygit2
from packageurl import PackageURL


@dataclasses.dataclass
class Advisory:
    """
    This data class expresses the contract between data sources and the import runner.

    Data sources are expected to be usable as context managers and generators, yielding batches of Advisory sequences.

    NB: There are two representations for package URLs that are commonly used by code consuming this data class;
    PackageURL objects and strings. As a convention, the former is referred to in variable names, etc. as
    "package_urls" and the latter as "purls".
    """
    summary: str
    impacted_package_urls: Sequence[PackageURL]
    resolved_package_urls: Sequence[PackageURL] = dataclasses.field(default_factory=list)
    references: Sequence[str] = dataclasses.field(default_factory=list)
    cve_id: Optional[str] = None

    @property
    def impacted_purls(self) -> Set[str]:
        return {str(p) for p in self.impacted_package_urls}

    @property
    def resolved_purls(self) -> Set[str]:
        return {str(p) for p in self.resolved_package_urls}
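

# --- Illustrative sketch, not part of data_source.py: building an Advisory. ---
# The package, reference and CVE id below are made-up example values. impacted_purls and
# resolved_purls render the same PackageURL objects as strings, per the "package_urls" vs
# "purls" convention described in the docstring above.
example_advisory = Advisory(
    summary='Example: denial of service via crafted archive',
    impacted_package_urls=[PackageURL(type='pypi', name='examplepkg', version='1.0.0')],
    resolved_package_urls=[PackageURL(type='pypi', name='examplepkg', version='1.0.1')],
    references=['https://example.org/advisories/EXAMPLE-2020-0001'],
    cve_id='CVE-2020-0000',
)
assert example_advisory.impacted_purls == {'pkg:pypi/examplepkg@1.0.0'}
assert example_advisory.resolved_purls == {'pkg:pypi/examplepkg@1.0.1'}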


class InvalidConfigurationError(Exception):
    pass


@dataclasses.dataclass
class DataSourceConfiguration:
    batch_size: int


class DataSource(ContextManager):
    """
    This class defines how importers consume advisories from a data source.

    It makes a distinction between records newly added since the last run and modified records, which allows the
    import logic to pick appropriate database operations.
    """

    CONFIG_CLASS = DataSourceConfiguration

    def __init__(
        self,
        batch_size: int,
        last_run_date: Optional[datetime] = None,
        cutoff_date: Optional[datetime] = None,
        config: Optional[Mapping[str, Any]] = None,
    ):
        """
        Create a DataSource instance.

        :param batch_size: Maximum number of records to return from added_advisories() and updated_advisories()
        :param last_run_date: Optional timestamp when this data source was last inspected
        :param cutoff_date: Optional timestamp; records older than this will be ignored
        :param config: Optional dictionary with subclass-specific configuration
        """
        config = config or {}
        try:
            self.config = self.__class__.CONFIG_CLASS(batch_size, **config)
            # These really should be declared in DataSourceConfiguration above, but that would prevent DataSource
            # subclasses from declaring mandatory parameters (i.e. positional arguments).
            setattr(self.config, 'last_run_date', last_run_date)
            setattr(self.config, 'cutoff_date', cutoff_date)
        except Exception as e:
            raise InvalidConfigurationError(str(e))

        self.validate_configuration()

    def __enter__(self):
        """
        Subclasses acquire per-run resources, such as network connections, file downloads, etc. here.
        """
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """
        Subclasses release per-run resources acquired in __enter__() here.
        """
        pass

    def validate_configuration(self) -> None:
        """
        Subclasses can perform more complex validation than what is handled by data classes and their type
        annotations.

        This method is called in the constructor. It should raise InvalidConfigurationError with a human-readable
        message.
        """
        pass

    def added_advisories(self) -> List[Advisory]:
        """
        Subclasses yield batch_size sized batches of Advisory objects that have been added to the data source
        since self.cutoff_date.
        """
        raise StopIteration

    def updated_advisories(self) -> List[Advisory]:
        """
        Subclasses yield batch_size sized batches of Advisory objects that have been modified since
        self.cutoff_date.

        NOTE: Data sources that do not enable detection of changes to existing records vs added records must only
        implement this method, not added_advisories(). The ImportRunner relies on this contract to decide between
        insert and update operations.
        """
        raise StopIteration

    def error(self, msg: str) -> None:
        """
        Helper method for raising InvalidConfigurationError with the class name in the message.
        """
        raise InvalidConfigurationError(f'{type(self).__name__}: {msg}')
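

# --- Illustrative sketch, not part of data_source.py: a minimal DataSource subclass. ---
# The names ExampleApiConfiguration, ExampleApiDataSource and api_url are hypothetical; they only
# demonstrate the contract described above: CONFIG_CLASS turns the "config" dict into typed
# attributes, validate_configuration() reports bad settings via self.error(), and
# updated_advisories() yields batch_size sized lists of Advisory objects.
@dataclasses.dataclass
class ExampleApiConfiguration(DataSourceConfiguration):
    api_url: str = 'https://example.org/advisories.json'


class ExampleApiDataSource(DataSource):
    CONFIG_CLASS = ExampleApiConfiguration

    def validate_configuration(self) -> None:
        if not self.config.api_url.startswith('https://'):
            self.error('"api_url" must be an https:// URL')

    def updated_advisories(self) -> List[Advisory]:
        advisories = [Advisory(summary='example advisory', impacted_package_urls=[], cve_id='CVE-2020-0000')]
        for start in range(0, len(advisories), self.config.batch_size):
            yield advisories[start:start + self.config.batch_size]


# Usage: ExampleApiDataSource(batch_size=10, config={'api_url': 'https://example.org/advisories.json'})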


@dataclasses.dataclass
class GitDataSourceConfiguration(DataSourceConfiguration):
    repository_url: str
    branch: Optional[str] = None
    create_working_directory: bool = True
    remove_working_directory: bool = True
    working_directory: Optional[str] = None


class GitDataSource(DataSource):
    CONFIG_CLASS = GitDataSourceConfiguration

    def validate_configuration(self) -> None:

        if not self.config.create_working_directory and self.config.working_directory is None:
            self.error('"create_working_directory" is not set but "working_directory" is set to the default, which '
                       'calls tempfile.mkdtemp()')

        if not self.config.create_working_directory and not os.path.exists(self.config.working_directory):
            self.error('"working_directory" does not contain an existing directory and "create_working_directory" is '
                       'not set')

        if not self.config.remove_working_directory and self.config.working_directory is None:
            self.error('"remove_working_directory" is not set and "working_directory" is set to the default, which '
                       'calls tempfile.mkdtemp()')

    def __enter__(self):
        self._ensure_working_directory()
        self._ensure_repository()

    def __exit__(self, exc_type, exc_val, exc_tb):
        if self.config.remove_working_directory:
            shutil.rmtree(self.config.working_directory)

    def added_advisories(self) -> List[Advisory]:
        raise NotImplementedError

    def updated_advisories(self) -> List[Advisory]:
        raise NotImplementedError

    # TODO Sort out cutoff_date vs last_run_date. The former is "no entries older than one year",
    # TODO not "the importer was last run on".
    def added_files(
        self,
        subdir: str = None,
        recursive: bool = False,
        file_ext: Optional[str] = None
    ) -> List[str]:

        if subdir is None:
            working_dir = self.config.working_directory
        else:
            working_dir = os.path.join(self.config.working_directory, subdir)

        path = Path(working_dir)

        if self.config.cutoff_date is None:
            if recursive:
                glob = '**/*'
            else:
                glob = '*'

            if file_ext:
                glob = f'{glob}.{file_ext}'

            return [str(p.relative_to(working_dir)) for p in path.glob(glob) if p.is_file()]

        return self._collect_files(pygit2.GIT_DELTA_ADDED, subdir, recursive, file_ext)

    def updated_files(
        self,
        subdir: str = None,
        recursive: bool = False,
        file_ext: str = None
    ) -> List[str]:

        if self.config.cutoff_date is None:
            return []

        return self._collect_files(pygit2.GIT_DELTA_MODIFIED, subdir, recursive, file_ext)

    # TODO Just filtering on the two status values for "added" and "modified" is too simplistic.
    # TODO This does not cover file renames, copies & deletions.
    def _collect_files(
        self,
        delta_status: int,
        subdir: Optional[str],
        recursive: bool,
        file_ext: Optional[str],
    ) -> List[str]:

        cutoff = 0 if self.config.cutoff_date is None else int(self.config.cutoff_date.timestamp())
        previous_commit = None
        files = []

        for commit in self._repo.walk(self._repo.head.target, pygit2.GIT_SORT_TIME):
            if previous_commit is None:
                previous_commit = commit
                continue

            deltas = commit.tree.diff_to_tree(previous_commit.tree).deltas
            for d in deltas:
                path = d.new_file.path

                if d.status == delta_status and not d.is_binary and _include_file(path, subdir, recursive, file_ext):
                    files.append(path)

            if commit.commit_time < cutoff:
                break

            previous_commit = commit

        return files

    def _ensure_working_directory(self) -> None:
        if self.config.working_directory is None:
            self.config.working_directory = tempfile.mkdtemp()
        elif self.config.create_working_directory and not os.path.exists(self.config.working_directory):
            os.mkdir(self.config.working_directory)

    def _ensure_repository(self) -> None:
        repodir = pygit2.discover_repository(self.config.working_directory)
        if repodir is None:
            self._clone_repository()
            return

        self._repo = pygit2.Repository(repodir)

        if self.config.branch is None:
            self.config.branch = self._repo.head.shorthand
        branch = self._repo.branches[self.config.branch]

        if not branch.is_checked_out():
            self._repo.checkout(branch)

        remote = self._find_or_add_remote()
        progress = remote.fetch()
        if progress.received_objects == 0:
            return

        remote_branch = self._repo.branches[f'{remote.name}/{self.config.branch}']
        branch.set_target(remote_branch.target)
        self._repo.checkout(branch, strategy=pygit2.GIT_CHECKOUT_FORCE)

    def _clone_repository(self):
        kwargs = {}
        if getattr(self.config, 'branch', False):
            kwargs['checkout_branch'] = self.config.branch

        self._repo = pygit2.clone_repository(self.config.repository_url, self.config.working_directory, **kwargs)

    def _find_or_add_remote(self):
        remote = None
        for r in self._repo.remotes:
            if r.url == self.config.repository_url:
                remote = r
                break

        if remote is None:
            remote = self._repo.remotes.create('added_by_vulnerablecode', self.config.repository_url)

        return remote


def _include_file(
    path: str,
    subdir: Optional[str] = None,
    recursive: bool = False,
    file_ext: Optional[str] = None,
) -> bool:
    match = True

    if subdir:
        if not subdir.endswith(os.path.sep):
            subdir = f'{subdir}{os.path.sep}'

        match = match and path.startswith(subdir)

    if not recursive:
        match = match and (os.path.sep not in path[len(subdir or ''):])

    if file_ext:
        match = match and path.endswith(f'.{file_ext}')

    return match
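

# --- Illustrative sketch, not part of data_source.py: driving GitDataSource directly. ---
# The repository URL, subdirectory and file extension are assumed example values. Entering the
# context manager clones (or updates) the repository into working_directory, which defaults to a
# tempfile.mkdtemp() directory, and added_files()/updated_files() then return matching paths
# relative to it.
example_config = {
    'repository_url': 'https://github.com/example/security-advisories.git',
    # 'working_directory': '/tmp/advisories',  # optional; deleted on exit unless remove_working_directory is False
}
data_source = GitDataSource(batch_size=50, config=example_config)
with data_source:
    yaml_files = data_source.added_files(subdir='advisories', recursive=True, file_ext='yaml')
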
4 changes: 3 additions & 1 deletion vulnerabilities/models.py
@@ -124,4 +124,6 @@ def make_data_source(self, cutoff_date=None, batch_size=None) -> DataSource:
        cd = cutoff_date or self.last_run
        importers_module = importlib.import_module('vulnerabilities.importers')
        klass = getattr(importers_module, self.data_source)
        ds = klass(cutoff_date=cd, batch_size=batch_size, config=self.data_source_cfg)
        ds.apply_config()
        return ds