Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Transform and load dependencies from setup.cfg #718

Merged
merged 8 commits into from
Nov 11, 2021
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
165 changes: 119 additions & 46 deletions cartography/intel/github/repos.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
import logging
from configparser import ConfigParser
from configparser import Error
from string import Template
from typing import Any
from typing import Dict
Expand All @@ -14,7 +16,6 @@
from cartography.util import run_cleanup_job
from cartography.util import timeit


logger = logging.getLogger(__name__)

GITHUB_ORG_REPOS_PAGINATED_GRAPHQL = """
Expand Down Expand Up @@ -76,6 +77,11 @@
text
}
}
setupCfg:object(expression: "HEAD:setup.cfg") {
... on Blob {
text
}
}
}
}
}
Expand Down Expand Up @@ -121,7 +127,8 @@ def transform(repos_json: List[Dict]) -> Dict:
_transform_repo_objects(repo_object, transformed_repo_list)
_transform_repo_owners(repo_object['owner']['url'], repo_object, transformed_repo_owners)
_transform_collaborators(repo_object['collaborators'], repo_object['url'], transformed_collaborators)
_transform_python_requirements(repo_object['requirements'], repo_object['url'], transformed_requirements_files)
_transform_requirements_txt(repo_object['requirements'], repo_object['url'], transformed_requirements_files)
_transform_setup_cfg_requirements(repo_object['setupCfg'], repo_object['url'], transformed_requirements_files)
Comment on lines +129 to +130
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What happens if a dep is listed in both places (e.g. no bounds in setup.cfg but pinned in requirements.txt)?
What do we want to have happen (e.g. what would be most useful for our query patterns/the https://github.com/lyft/cartography/blob/master/docs/usage/samplequeries.md sample queries)?
This is likely worth a test case.

Copy link
Contributor Author

@olivia-hong olivia-hong Nov 10, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If it's listed in both places and has different specifiers for each usage, cartography will create two separate nodes, which I think makes sense rather than any sort of "merging" logic. This allows users to query what version(s) are being used or perhaps find out that they are specifying a dependency in multiple files when it's not needed. Added a test case for this.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That SGTM, thanks! Just wanted to check that we didn't have one "overwrite" the other

results = {
'repos': transformed_repo_list,
'repo_languages': transformed_repo_languages,
Expand Down Expand Up @@ -235,58 +242,124 @@ def _transform_collaborators(collaborators: Dict, repo_url: str, transformed_col
transformed_collaborators[user_permission].append(user)


def _transform_python_requirements(req_file_contents: Dict, repo_url: str, out_requirements_files: List[Dict]) -> None:
def _transform_requirements_txt(
req_file_contents: Optional[Dict],
repo_url: str,
out_requirements_files: List[Dict],
) -> None:
"""
Performs data transformations for the requirements.txt files in a GitHub repo, if available.
:param req_file_contents: str: The text contents of the requirements file.
Performs data transformations for the requirements.txt file in a GitHub repo, if available.
:param req_file_contents: Dict: The contents of the requirements.txt file.
Comment on lines +250 to +251
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

gotta love the random cleanup 🙂 , very "leave it better than you found it"

:param repo_url: str: The URL of the GitHub repo.
:param out_requirements_files: Output array to append transformed results to.
:return: Nothing.
"""
if req_file_contents and req_file_contents.get('text'):
text_contents = req_file_contents['text']
reqs_list = text_contents.split("\n")

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would prefer this named the same as the param in the function you call below

_transform_python_requirements(reqs_list, repo_url, out_requirements_files)

parsed_list = []
for line in text_contents.split("\n"):
# Remove trailing comments and extra whitespace
stripped_line = line.partition('#')[0].strip()
if stripped_line == '':
continue
try:
req = Requirement(stripped_line)
except InvalidRequirement:
# INFO and not WARN/ERROR as we intentionally don't support all ways to specify Python requirements
logger.info(
f"Failed to parse line \"{line}\" in repo {repo_url}'s requirements.txt; skipping line.",
exc_info=True,
)
continue
parsed_list.append(req)

for req in parsed_list:
pinned_version = None
if len(req.specifier) == 1:
specifier = next(iter(req.specifier))
if specifier.operator == '==':
pinned_version = specifier.version

# Set `spec` to a default value. Example values for str(req.specifier): "<4.0,>=3.0" or "==1.0.0".
spec: Optional[str] = str(req.specifier)
# Set spec to `None` instead of empty string so that the Neo4j driver will leave the library.specifier field
# undefined. As convention, we prefer undefined values over empty strings in the graph.
if spec == '':
spec = None

canon_name = canonicalize_name(req.name)
requirement_id = f"{canon_name}|{pinned_version}" if pinned_version else canon_name

out_requirements_files.append({
"id": requirement_id,
"name": canon_name,
"specifier": spec,
"version": pinned_version,
"repo_url": repo_url,
})

def _transform_setup_cfg_requirements(
    setup_cfg_contents: Optional[Dict],
    repo_url: str,
    out_requirements_files: List[Dict],
) -> None:
    """
    Performs data transformations for the setup.cfg file in a GitHub repo, if available.
    Does nothing when the file is absent or empty, or when it cannot be parsed (the failure is logged at INFO).
    :param setup_cfg_contents: Dict: Contains contents of a repo's setup.cfg file, under the 'text' key.
    :param repo_url: str: The URL of the GitHub repo.
    :param out_requirements_files: Output array to append transformed results to.
    :return: Nothing.
    """
    if not setup_cfg_contents or not setup_cfg_contents.get('text'):
        return

    text_contents = setup_cfg_contents['text']
    setup_cfg = ConfigParser()
    try:
        setup_cfg.read_string(text_contents)
        # ConfigParser interpolates values lazily, when they are read rather than when the file is loaded.
        # A stray '%' in a requirement (e.g. a percent-encoded URL) therefore only raises once the options
        # are extracted, so the extraction has to live inside the same `try` as the read.
        reqs_list = parse_setup_cfg(setup_cfg)
    except Error:
        logger.info(
            f"Failed to parse {repo_url}'s setup.cfg; skipping.",
            exc_info=True,
        )
        return
    _transform_python_requirements(reqs_list, repo_url, out_requirements_files)


def _transform_python_requirements(
    requirements_list: List[str],
    repo_url: str,
    out_requirements_files: List[Dict],
) -> None:
    """
    Helper function to perform data transformations on an arbitrary list of requirements.
    Each entry is parsed as a single requirement string; entries that are blank, comment-only, or not parseable
    are skipped. One dict with the keys `id`, `name`, `specifier`, `version` and `repo_url` is appended to
    `out_requirements_files` for every requirement that parses.
    :param requirements_list: List[str]: List of requirements
    :param repo_url: str: The URL of the GitHub repo.
    :param out_requirements_files: Output array to append transformed results to.
    :return: Nothing.
    """
    parsed_list = []
    for line in requirements_list:
        # Remove trailing comments and extra whitespace
        stripped_line = line.partition('#')[0].strip()
        if not stripped_line:
            continue
        try:
            req = Requirement(stripped_line)
        except InvalidRequirement:
            # INFO and not WARN/ERROR as we intentionally don't support all ways to specify Python requirements.
            # The message is deliberately source-neutral: this helper is shared by every requirements source.
            logger.info(
                f"Failed to parse line \"{line}\" in repo {repo_url}'s Python requirements; skipping line.",
                exc_info=True,
            )
            continue
        parsed_list.append(req)

    for req in parsed_list:
        # A requirement only counts as pinned when its sole specifier is an exact `==` match.
        pinned_version = None
        if len(req.specifier) == 1:
            specifier = next(iter(req.specifier))
            if specifier.operator == '==':
                pinned_version = specifier.version

        # Set `spec` to a default value. Example values for str(req.specifier): "<4.0,>=3.0" or "==1.0.0".
        spec: Optional[str] = str(req.specifier)
        # Set spec to `None` instead of empty string so that the Neo4j driver will leave the library.specifier field
        # undefined. As convention, we prefer undefined values over empty strings in the graph.
        if spec == '':
            spec = None

        canon_name = canonicalize_name(req.name)
        # Pinned requirements get a version-qualified id; everything else is keyed by name only.
        requirement_id = f"{canon_name}|{pinned_version}" if pinned_version else canon_name

        out_requirements_files.append({
            "id": requirement_id,
            "name": canon_name,
            "specifier": spec,
            "version": pinned_version,
            "repo_url": repo_url,
        })


def parse_setup_cfg(config: ConfigParser) -> List[str]:
    """
    Collect the requirement strings declared in a parsed setup.cfg.
    Reads `install_requires` and `setup_requires` from the `options` section (a missing option counts as empty),
    followed by every value in the `options.extras_require` section when that section exists.
    :param config: ConfigParser: The parsed setup.cfg.
    :return: List[str]: The requirement strings, in the order they were read.
    """
    collected: List[str] = []
    for option_name in ("install_requires", "setup_requires"):
        raw_value = config.get("options", option_name, fallback="")
        collected += _parse_setup_cfg_requirements(raw_value)

    extras_section = "options.extras_require"
    if config.has_section(extras_section):
        for _extra_name, raw_value in config.items(extras_section):
            collected += _parse_setup_cfg_requirements(raw_value)
    return collected


# logic taken from setuptools:
# https://github.com/pypa/setuptools/blob/f359b8a7608c7f118710af02cb5edab4e6abb942/setuptools/config.py#L241-L258
def _parse_setup_cfg_requirements(reqs: str, separator: str = ";") -> List[str]:
if "\n" in reqs:
reqs_list = reqs.splitlines()
else:
reqs_list = reqs.split(separator)

return [req.strip() for req in reqs_list if req.strip()]


@timeit
Expand Down
3 changes: 2 additions & 1 deletion docs/schema/github.md
Original file line number Diff line number Diff line change
Expand Up @@ -174,7 +174,8 @@ Representation of a single Programming Language [language object](https://develo

## Dependency::PythonLibrary

Representation of a Python library as listed in a [requirements.txt](https://pip.pypa.io/en/stable/user_guide/#requirements-files) file.
Representation of a Python library as listed in a [requirements.txt](https://pip.pypa.io/en/stable/user_guide/#requirements-files)
or [setup.cfg](https://setuptools.pypa.io/en/latest/userguide/declarative_config.html) file.
olivia-hong marked this conversation as resolved.
Show resolved Hide resolved

| Field | Description |
|-------|-------------|
Expand Down
17 changes: 17 additions & 0 deletions tests/data/github/repos.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,14 @@
},
'collaborators': {'edges': [], 'nodes': []},
'requirements': {'text': 'cartography\nhttplib2<0.7.0\njinja2\nlxml\n-e git+https://example.com#egg=foobar\nhttps://example.com/foobar.tar.gz\npip @ https://github.com/pypa/pip/archive/1.3.1.zip#sha1=da9234ee9982d4bbb3c72346a6de940a148ea686\n'}, # noqa
'setupCfg': {
'text': '''
olivia-hong marked this conversation as resolved.
Show resolved Hide resolved
[options]
install_requires =
neo4j
scipy!=1.20.0 # comment
''',
},
}, {
'name': 'SampleRepo2',
'nameWithOwner': 'example_org/SampleRepo2',
Expand Down Expand Up @@ -64,6 +72,7 @@
},
'collaborators': None,
'requirements': None,
'setupCfg': None,
},
{
'name': 'cartography',
Expand Down Expand Up @@ -141,5 +150,13 @@
'requirements': {
'text': 'cartography==0.1.0\nhttplib2>=0.7.0\njinja2\nlxml\n# This is a comment line to be ignored\n',
},
'setupCfg': {
'text': '''
[options]
install_requires =
neo4j>=1.0.0
numpy!=1.20.0 # comment
''',
},
},
]
22 changes: 20 additions & 2 deletions tests/integration/cartography/intel/github/test_repos.py
Original file line number Diff line number Diff line change
Expand Up @@ -190,7 +190,7 @@ def test_repository_to_collaborators(neo4j_session):

def test_pinned_python_library_to_repo(neo4j_session):
"""
Ensure that repositories are connected to pinned Python libraries.
Ensure that repositories are connected to pinned Python libraries stated as dependencies in requirements.txt.
Create the path (:RepoA)-[:REQUIRES{specifier:"0.1.0"}]->(:PythonLibrary{'Cartography'})<-[:REQUIRES]-(:RepoB),
and verify that exactly 1 repo is connected to the PythonLibrary with a specifier (RepoA).
"""
Expand All @@ -210,7 +210,7 @@ def test_pinned_python_library_to_repo(neo4j_session):

def test_upinned_python_library_to_repo(neo4j_session):
"""
Ensure that repositories are connected to un-pinned Python libraries.
Ensure that repositories are connected to un-pinned Python libraries stated as dependencies in requirements.txt.
That is, create the path
(:RepoA)-[r:REQUIRES{specifier:"0.1.0"}]->(:PythonLibrary{'Cartography'})<-[:REQUIRES]-(:RepoB),
and verify that exactly 1 repo is connected to the PythonLibrary without using a pinned specifier (RepoB).
Expand All @@ -227,3 +227,21 @@ def test_upinned_python_library_to_repo(neo4j_session):
actual_nodes = {n['repo_count'] for n in nodes}
expected_nodes = {1}
assert actual_nodes == expected_nodes


def test_setup_cfg_library_to_repo(neo4j_session):
    """
    Ensure that repositories are connected to Python libraries stated as dependencies in setup.cfg,
    and verify that exactly 2 repos are connected to the `neo4j` PythonLibrary.
    """
    _ensure_local_neo4j_has_test_data(neo4j_session)

    # Note: don't query for relationship attributes in code that needs to be fast.
    query = """
    MATCH (repo:GitHubRepository)-[r:REQUIRES]->(lib:PythonLibrary{id:'neo4j'})
    RETURN count(repo) as repo_count
    """
    observed_counts = set()
    for record in neo4j_session.run(query):
        observed_counts.add(record['repo_count'])
    assert observed_counts == {2}