Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Rewrite egg_info_matches with canonicalize_name #5875

Merged
merged 4 commits into from
Oct 30, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions news/5870.bugfix
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Canonicalize sdist file names so they can be matched to a canonicalized package name passed to ``pip install``.
59 changes: 36 additions & 23 deletions src/pip/_internal/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -815,8 +815,8 @@ def _link_package_versions(self, link, search):
return

if not version:
version = egg_info_matches(egg_info, search.supplied, link)
if version is None:
version = _egg_info_matches(egg_info, search.canonical)
if not version:
self._log_skipped_link(
link, 'Missing project version for %s' % search.supplied)
return
Expand Down Expand Up @@ -846,33 +846,46 @@ def _link_package_versions(self, link, search):
return InstallationCandidate(search.supplied, version, link)


def egg_info_matches(
egg_info, search_name, link,
_egg_info_re=re.compile(r'([a-z0-9_.]+)-([a-z0-9_.!+-]+)', re.I)):
def _find_name_version_sep(egg_info, canonical_name):
"""Find the separator's index based on the package's canonical name.

`egg_info` must be an egg info string for the given package, and
`canonical_name` must be the package's canonical name.

This function is needed since the canonicalized name does not necessarily
have the same length as the egg info's name part. An example::

>>> egg_info = 'foo__bar-1.0'
>>> canonical_name = 'foo-bar'
>>> _find_name_version_sep(egg_info, canonical_name)
8
"""
# Project name and version must be separated by one single dash. Find all
# occurrences of dashes; if the string in front of it matches the canonical
# name, this is the one separating the name and version parts.
for i, c in enumerate(egg_info):
if c != "-":
continue
if canonicalize_name(egg_info[:i]) == canonical_name:
return i
raise ValueError("{} does not match {}".format(egg_info, canonical_name))


def _egg_info_matches(egg_info, canonical_name):
"""Pull the version part out of a string.

:param egg_info: The string to parse. E.g. foo-2.1
:param search_name: The name of the package this belongs to. None to
infer the name. Note that this cannot unambiguously parse strings
like foo-2-2 which might be foo, 2-2 or foo-2, 2.
:param link: The link the string came from, for logging on failure.
:param canonical_name: The canonicalized name of the package this
belongs to.
"""
match = _egg_info_re.search(egg_info)
if not match:
logger.debug('Could not parse version from link: %s', link)
try:
version_start = _find_name_version_sep(egg_info, canonical_name) + 1
except ValueError:
return None
if search_name is None:
full_match = match.group(0)
return full_match.split('-', 1)[-1]
name = match.group(0).lower()
# To match the "safe" name that pkg_resources creates:
name = name.replace('_', '-')
# project name and version must be separated by a dash
look_for = search_name.lower() + "-"
if name.startswith(look_for):
return match.group(0)[len(look_for):]
else:
version = egg_info[version_start:]
if not version:
return None
return version


def _determine_base_url(document, page_url):
Expand Down
12 changes: 10 additions & 2 deletions src/pip/_internal/wheel.py
Original file line number Diff line number Diff line change
Expand Up @@ -620,6 +620,15 @@ def supported(self, tags=None):
return bool(set(tags).intersection(self.file_tags))


def _contains_egg_info(
s, _egg_info_re=re.compile(r'([a-z0-9_.]+)-([a-z0-9_.!+-]+)', re.I)):
"""Determine whether the string looks like an egg_info.

:param s: The string to parse. E.g. foo-2.1
"""
return bool(_egg_info_re.search(s))


class WheelBuilder(object):
"""Build wheels from a RequirementSet."""

Expand Down Expand Up @@ -712,7 +721,6 @@ def build(self, requirements, session, autobuilding=False):
newly built wheel, in preparation for installation.
:return: True if all the wheels built correctly.
"""
from pip._internal import index
from pip._internal.models.link import Link

building_is_possible = self._wheel_dir or (
Expand Down Expand Up @@ -742,7 +750,7 @@ def build(self, requirements, session, autobuilding=False):
if autobuilding:
link = req.link
base, ext = link.splitext()
if index.egg_info_matches(base, None, link) is None:
if not _contains_egg_info(base):
# E.g. local directory. Build wheel just for this run.
ephem_cache = True
if "binary" not in format_control.get_allowed_formats(
Expand Down
91 changes: 76 additions & 15 deletions tests/unit/test_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,8 @@

from pip._internal.download import PipSession
from pip._internal.index import (
Link, PackageFinder, _determine_base_url, _get_html_page, egg_info_matches,
Link, PackageFinder, _determine_base_url, _egg_info_matches,
_find_name_version_sep, _get_html_page,
)


Expand Down Expand Up @@ -165,32 +166,92 @@ def test_get_formatted_locations_basic_auth():


@pytest.mark.parametrize(
("egg_info", "search_name", "expected"),
("egg_info", "canonical_name", "expected"),
[
# Trivial.
("pip-18.0", "pip", 3),
("zope-interface-4.5.0", "zope-interface", 14),

# Canonicalized name match non-canonicalized egg info. (pypa/pip#5870)
("Jinja2-2.10", "jinja2", 6),
("zope.interface-4.5.0", "zope-interface", 14),
("zope_interface-4.5.0", "zope-interface", 14),

# Should be smart enough to parse ambiguous names from the provided
# package name.
("foo-2-2", "foo", 3),
("foo-2-2", "foo-2", 5),

# Should be able to detect collapsed characters in the egg info.
("foo--bar-1.0", "foo-bar", 8),
("foo-_bar-1.0", "foo-bar", 8),

# The package name must not ends with a dash (PEP 508), so the first
# dash would be the separator, not the second.
("zope.interface--4.5.0", "zope-interface", 14),
("zope.interface--", "zope-interface", 14),

# The version part is missing, but the split function does not care.
("zope.interface-", "zope-interface", 14),
],
)
def test_find_name_version_sep(egg_info, canonical_name, expected):
Copy link
Member

@cjerdonek cjerdonek Oct 20, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would also add a test of how this function behaves if the version specifier is missing (e.g. "zope-interface") -- as well as for _egg_info_matches() below if not already present.

Another edge case I think would be worth adding is something like "zope.interface.-4.5.0" (where the prefix you check winds up ending with a non-hyphen separator character). Two dashes in the separator is another case: "zope.interface--4.5.0".

Copy link
Member Author

@uranusjr uranusjr Oct 22, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

According to PEP 508, the package name must match ^([A-Z0-9]|[A-Z0-9][A-Z0-9._-]*[A-Z0-9])$, so zope.interface.-4.5.0 is invalid. I’ll add zope.interface--4.5.0 (the first dash would be the separator).

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we know in advance that all of the egg_info strings passed to the function will be valid? Or is there the possibility of mistaken input by the user? If the latter, I think it would be worth having some test cases to make sure e.g. the function doesn't treat invalid input as valid. (More generally, what guarantees do we have about what will be passed to the function?)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The egg info strings come from an index or find links page. They can be invalid only if the page is formatted incorrectly. I think it is reasonable to either expect invalid inputs or not, depending on how you think pip should handle non-standard-conforming inputs.

Copy link
Member

@cjerdonek cjerdonek Oct 25, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Okay, what I was getting at is that I think it would be worth adding a few more test cases that came to mind, even if invalid -- not because the additional cases should be considered valid, but because it provides more visibility into how the two functions behave on various edge cases. If it's possible for these inputs to happen in real life, then that's even more reason.

For example, I noticed that none of the _find_name_version_sep() test cases hit the ValueError line (though some of the _egg_info_matches() test cases hit that code path by virtue of calling _find_name_version_sep()).

Here are additional test cases that come to mind (for both functions):

("zope.interface4.5.0", "zope-interface"),
("zope.interface.4.5.0", "zope-interface"),
("zope.interface.-4.5.0", "zope-interface"),
("zope.interface--4.5.0", "zope-interface"),
("zope.interface--", "zope-interface"),
("zope.interface-", "zope-interface"),
("zope.interface", "zope-interface"),

(You may want to put the test cases for _find_name_version_sep() that cause a ValueError into its own test function.)

index = _find_name_version_sep(egg_info, canonical_name)
assert index == expected


@pytest.mark.parametrize(
("egg_info", "canonical_name"),
[
# A dash must follow the package name.
("zope.interface4.5.0", "zope-interface"),
("zope.interface.4.5.0", "zope-interface"),
("zope.interface.-4.5.0", "zope-interface"),
("zope.interface", "zope-interface"),
],
)
def test_find_name_version_sep_failure(egg_info, canonical_name):
with pytest.raises(ValueError) as ctx:
_find_name_version_sep(egg_info, canonical_name)
message = "{} does not match {}".format(egg_info, canonical_name)
assert str(ctx.value) == message


@pytest.mark.parametrize(
("egg_info", "canonical_name", "expected"),
[
# Trivial.
("pip-18.0", "pip", "18.0"),
("pip-18.0", None, "18.0"),
("zope-interface-4.5.0", "zope-interface", "4.5.0"),

# Non-canonical names.
# Canonicalized name match non-canonicalized egg info. (pypa/pip#5870)
("Jinja2-2.10", "jinja2", "2.10"),
("jinja2-2.10", "Jinja2", "2.10"),
("zope.interface-4.5.0", "zope-interface", "4.5.0"),
("zope_interface-4.5.0", "zope-interface", "4.5.0"),

# Ambiguous names. Should be smart enough if the package name is
# provided, otherwise make a guess.
# Should be smart enough to parse ambiguous names from the provided
# package name.
("foo-2-2", "foo", "2-2"),
("foo-2-2", "foo-2", "2"),
("foo-2-2", None, "2-2"),
("im-valid", None, "valid"),
("zope.interface--4.5.0", "zope-interface", "-4.5.0"),
("zope.interface--", "zope-interface", "-"),

# Should be able to detect collapsed characters in the egg info.
("foo--bar-1.0", "foo-bar", "1.0"),
("foo-_bar-1.0", "foo-bar", "1.0"),

# Invalid names.
("invalid", None, None),
("im_invalid", None, None),
# Invalid.
("the-package-name-8.19", "does-not-match", None),
("zope.interface.-4.5.0", "zope.interface", None),
("zope.interface-", "zope-interface", None),
("zope.interface4.5.0", "zope-interface", None),
("zope.interface.4.5.0", "zope-interface", None),
("zope.interface.-4.5.0", "zope-interface", None),
("zope.interface", "zope-interface", None),
],
)
def test_egg_info_matches(egg_info, search_name, expected):
link = None # Only used for reporting.
version = egg_info_matches(egg_info, search_name, link)
def test_egg_info_matches(egg_info, canonical_name, expected):
version = _egg_info_matches(egg_info, canonical_name)
assert version == expected


Expand Down
20 changes: 20 additions & 0 deletions tests/unit/test_wheel.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,26 @@
from tests.lib import DATA_DIR


@pytest.mark.parametrize(
"s, expected",
[
# Trivial.
("pip-18.0", True),

# Ambiguous.
("foo-2-2", True),
("im-valid", True),

# Invalid.
("invalid", False),
("im_invalid", False),
],
)
def test_contains_egg_info(s, expected):
result = wheel._contains_egg_info(s)
assert result == expected


@pytest.mark.parametrize("console_scripts",
["pip = pip._internal.main:pip",
"pip:pip = pip._internal.main:pip"])
Expand Down