From 2ceccfc232ef27e548c9337b8a1aa2f7e4005c52 Mon Sep 17 00:00:00 2001 From: Pradyun Gedam Date: Mon, 5 Dec 2022 22:54:56 +0000 Subject: [PATCH 01/23] Convert `Token` into a dataclass This makes it a fully-fleshed-out class for holding data. --- packaging/_tokenizer.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/packaging/_tokenizer.py b/packaging/_tokenizer.py index ecae9e34..2583af23 100644 --- a/packaging/_tokenizer.py +++ b/packaging/_tokenizer.py @@ -1,14 +1,15 @@ import re +from dataclasses import dataclass from typing import Dict, Generator, NoReturn, Optional from .specifiers import Specifier +@dataclass class Token: - def __init__(self, name: str, text: str, position: int) -> None: - self.name = name - self.text = text - self.position = position + name: str + text: str + position: int def matches(self, name: str = "") -> bool: if name and self.name != name: From a25e85fb77f7d97edaf9b4e94967578daafbd08b Mon Sep 17 00:00:00 2001 From: Pradyun Gedam Date: Mon, 5 Dec 2022 22:58:02 +0000 Subject: [PATCH 02/23] Convert parser exception into a rich exception class This also pulls out the error message formatting logic into the error itself. --- packaging/_tokenizer.py | 49 ++++++++++++++++++++++++++------------- packaging/markers.py | 9 +++---- packaging/requirements.py | 4 ++-- 3 files changed, 38 insertions(+), 24 deletions(-) diff --git a/packaging/_tokenizer.py b/packaging/_tokenizer.py index 2583af23..5754d95c 100644 --- a/packaging/_tokenizer.py +++ b/packaging/_tokenizer.py @@ -16,15 +16,25 @@ def matches(self, name: str = "") -> bool: return False return True +class ParserSyntaxError(Exception): + """The provided source text could not be parsed correctly.""" + + def __init__( + self, + message: str, + *, + source: str, + span: Tuple[int, int], + ) -> None: + self.span = span + self.message = message + self.source = source -class ParseExceptionError(Exception): - """ - Parsing failed. - """ + super().__init__() - def __init__(self, message: str, position: int) -> None: - super().__init__(message) - self.position = position + def __str__(self) -> str: + marker = " " * self.span[0] + "^" * (self.span[1] - self.span[0] + 1) + return "\n ".join([self.message, self.source, marker]) DEFAULT_RULES = { @@ -130,15 +140,22 @@ def try_read(self, *name: str) -> Optional[Token]: return self.read() return None - def raise_syntax_error(self, *, message: str) -> NoReturn: - """ - Raise SyntaxError at the given position in the marker. - """ - at = f"at position {self.position}:" - marker = " " * self.position + "^" - raise ParseExceptionError( - f"{message}\n{at}\n {self.source}\n {marker}", - self.position, + def raise_syntax_error( + self, + message: str, + *, + span_start: Optional[int] = None, + span_end: Optional[int] = None, + ) -> NoReturn: + """Raise ParserSyntaxError at the given position.""" + span = ( + self.position if span_start is None else span_start, + self.position if span_end is None else span_end, + ) + raise ParserSyntaxError( + message, + source=self.source, + span=span, ) def _make_token(self, name: str, text: str) -> Token: diff --git a/packaging/markers.py b/packaging/markers.py index ddb0ac17..5c2b1dfe 100644 --- a/packaging/markers.py +++ b/packaging/markers.py @@ -9,7 +9,7 @@ from typing import Any, Callable, Dict, List, Optional, Tuple, Union from ._parser import MarkerAtom, MarkerList, Op, Value, Variable, parse_marker_expr -from ._tokenizer import ParseExceptionError, Tokenizer +from ._tokenizer import ParserSyntaxError, Tokenizer from .specifiers import InvalidSpecifier, Specifier from .utils import canonicalize_name @@ -205,11 +205,8 @@ def __init__(self, marker: str) -> None: # (, , ) # ] # ] - except ParseExceptionError as e: - raise InvalidMarker( - f"Invalid marker: {marker!r}, parse error at " - f"{marker[e.position : e.position + 8]!r}" - ) + except ParserSyntaxError as e: + raise InvalidMarker(str(e)) def __str__(self) -> str: return _format_marker(self._markers) diff --git a/packaging/requirements.py b/packaging/requirements.py index 7a14a58f..82204b5b 100644 --- a/packaging/requirements.py +++ b/packaging/requirements.py @@ -6,7 +6,7 @@ from typing import Any, List, Optional, Set from ._parser import parse_named_requirement -from ._tokenizer import ParseExceptionError +from ._tokenizer import ParserSyntaxError from .markers import InvalidMarker, Marker from .specifiers import SpecifierSet @@ -33,7 +33,7 @@ class Requirement: def __init__(self, requirement_string: str) -> None: try: req = parse_named_requirement(requirement_string) - except ParseExceptionError as e: + except ParserSyntaxError as e: raise InvalidRequirement(str(e)) self.name: str = req.name From 09f31ff84544b0f79c01ec9892537252b8bfa3ad Mon Sep 17 00:00:00 2001 From: Pradyun Gedam Date: Mon, 5 Dec 2022 23:04:09 +0000 Subject: [PATCH 03/23] Use a richer type for `Tokenizer.rules` This helps pyright better understand what's happening. --- packaging/_tokenizer.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/packaging/_tokenizer.py b/packaging/_tokenizer.py index 5754d95c..5b759e71 100644 --- a/packaging/_tokenizer.py +++ b/packaging/_tokenizer.py @@ -1,6 +1,6 @@ import re from dataclasses import dataclass -from typing import Dict, Generator, NoReturn, Optional +from typing import Dict, Generator, NoReturn, Optional, Union from .specifiers import Specifier @@ -37,7 +37,7 @@ def __str__(self) -> str: return "\n ".join([self.message, self.source, marker]) -DEFAULT_RULES = { +DEFAULT_RULES: "Dict[str, Union[str, re.Pattern[str]]]" = { "LPAREN": r"\s*\(", "RPAREN": r"\s*\)", "LBRACKET": r"\s*\[", @@ -89,9 +89,12 @@ class Tokenizer: (advance to the next token). """ - next_token: Optional[Token] - - def __init__(self, source: str, rules: Dict[str, object] = DEFAULT_RULES) -> None: + def __init__( + self, + source: str, + *, + rules: "Dict[str, Union[str, re.Pattern[str]]]" = DEFAULT_RULES, + ) -> None: self.source = source self.rules = {name: re.compile(pattern) for name, pattern in rules.items()} self.next_token = None From 1c930f10ed968fd8ef28673e6a5d28503fdd6021 Mon Sep 17 00:00:00 2001 From: Pradyun Gedam Date: Mon, 5 Dec 2022 23:15:25 +0000 Subject: [PATCH 04/23] Provide dedicated `parse_{requirement,marker}(str)` functions These provide a consistent call signatures into the parser. This also decouples the tokenizer from the `Marker` class. --- packaging/_parser.py | 15 +++++++++++---- packaging/markers.py | 8 +++----- packaging/requirements.py | 4 ++-- 3 files changed, 16 insertions(+), 11 deletions(-) diff --git a/packaging/_parser.py b/packaging/_parser.py index ec02e72c..17495aa5 100644 --- a/packaging/_parser.py +++ b/packaging/_parser.py @@ -12,7 +12,7 @@ from ast import literal_eval from typing import Any, List, NamedTuple, Tuple, Union -from ._tokenizer import Tokenizer +from ._tokenizer import DEFAULT_RULES, Tokenizer class Node: @@ -62,12 +62,15 @@ class Requirement(NamedTuple): marker: str -def parse_named_requirement(requirement: str) -> Requirement: +def parse_requirement(source: str) -> Requirement: + return _parse_requirement(Tokenizer(source, rules=DEFAULT_RULES)) + + +def _parse_requirement(tokens: Tokenizer) -> Requirement: """ named_requirement: IDENTIFIER extras (URL_SPEC | specifier) (SEMICOLON marker_expr)? END """ - tokens = Tokenizer(requirement) tokens.expect("IDENTIFIER", error_message="Expression must begin with package name") name = tokens.read("IDENTIFIER").text extras = parse_extras(tokens) @@ -140,7 +143,11 @@ def parse_version_many(tokens: Tokenizer) -> str: return parsed_specifiers -def parse_marker_expr(tokens: Tokenizer) -> MarkerList: +def parse_marker(source: str) -> MarkerList: + return _parse_marker_expr(Tokenizer(source, rules=DEFAULT_RULES)) + + +def _parse_marker_expr(tokens: Tokenizer) -> MarkerList: """ marker_expr: MARKER_ATOM (BOOLOP + MARKER_ATOM)+ """ diff --git a/packaging/markers.py b/packaging/markers.py index 5c2b1dfe..bcffec24 100644 --- a/packaging/markers.py +++ b/packaging/markers.py @@ -8,8 +8,8 @@ import sys from typing import Any, Callable, Dict, List, Optional, Tuple, Union -from ._parser import MarkerAtom, MarkerList, Op, Value, Variable, parse_marker_expr -from ._tokenizer import ParserSyntaxError, Tokenizer +from ._parser import MarkerAtom, MarkerList, Op, Value, Variable, parse_marker +from ._tokenizer import ParserSyntaxError from .specifiers import InvalidSpecifier, Specifier from .utils import canonicalize_name @@ -186,9 +186,7 @@ def default_environment() -> Dict[str, str]: class Marker: def __init__(self, marker: str) -> None: try: - self._markers = _normalize_extra_values( - parse_marker_expr(Tokenizer(marker)) - ) + self._markers = _normalize_extra_values(parse_marker(marker)) # The attribute `_markers` can be described in terms of a recursive type: # MarkerList = List[Union[Tuple[Node, ...], str, MarkerList]] # diff --git a/packaging/requirements.py b/packaging/requirements.py index 82204b5b..5bca45ba 100644 --- a/packaging/requirements.py +++ b/packaging/requirements.py @@ -5,7 +5,7 @@ import urllib.parse from typing import Any, List, Optional, Set -from ._parser import parse_named_requirement +from ._parser import parse_requirement from ._tokenizer import ParserSyntaxError from .markers import InvalidMarker, Marker from .specifiers import SpecifierSet @@ -32,7 +32,7 @@ class Requirement: def __init__(self, requirement_string: str) -> None: try: - req = parse_named_requirement(requirement_string) + req = parse_requirement(requirement_string) except ParserSyntaxError as e: raise InvalidRequirement(str(e)) From 650c7c6a5fa2c589788a2bf9798ac44c1a2dac9c Mon Sep 17 00:00:00 2001 From: Pradyun Gedam Date: Mon, 5 Dec 2022 23:18:19 +0000 Subject: [PATCH 05/23] Rename `req` to `parsed` in `Requirement.__init__` This makes it easier to read through the function, with a clearer name. --- packaging/requirements.py | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/packaging/requirements.py b/packaging/requirements.py index 5bca45ba..3c30dbba 100644 --- a/packaging/requirements.py +++ b/packaging/requirements.py @@ -32,27 +32,29 @@ class Requirement: def __init__(self, requirement_string: str) -> None: try: - req = parse_requirement(requirement_string) + parsed = parse_requirement(requirement_string) except ParserSyntaxError as e: raise InvalidRequirement(str(e)) - self.name: str = req.name - if req.url: - parsed_url = urllib.parse.urlparse(req.url) + self.name: str = parsed.name + if parsed.url: + parsed_url = urllib.parse.urlparse(parsed.url) if parsed_url.scheme == "file": - if urllib.parse.urlunparse(parsed_url) != req.url: + if urllib.parse.urlunparse(parsed_url) != parsed.url: raise InvalidRequirement("Invalid URL given") elif not (parsed_url.scheme and parsed_url.netloc) or ( not parsed_url.scheme and not parsed_url.netloc ): - raise InvalidRequirement(f"Invalid URL: {req.url}") - self.url: Optional[str] = req.url + raise InvalidRequirement(f"Invalid URL: {parsed.url}") + self.url: Optional[str] = parsed.url else: self.url = None - self.extras: Set[str] = set(req.extras if req.extras else []) - self.specifier: SpecifierSet = SpecifierSet(req.specifier) + self.extras: Set[str] = set(parsed.extras if parsed.extras else []) + self.specifier: SpecifierSet = SpecifierSet(parsed.specifier) try: - self.marker: Optional[Marker] = Marker(req.marker) if req.marker else None + self.marker: Optional[Marker] = ( + Marker(parsed.marker) if parsed.marker else None + ) except InvalidMarker as e: raise InvalidRequirement(str(e)) From 282b4e1f713633af4f57ea9c90abf712baa9fd2b Mon Sep 17 00:00:00 2001 From: Pradyun Gedam Date: Mon, 5 Dec 2022 23:25:55 +0000 Subject: [PATCH 06/23] Rename parser's `Requirement` to `ParsedRequirement` This draws a clear distinction between this and the user-visible `Requirement` object. --- packaging/_parser.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/packaging/_parser.py b/packaging/_parser.py index 17495aa5..456965dd 100644 --- a/packaging/_parser.py +++ b/packaging/_parser.py @@ -54,7 +54,7 @@ def serialize(self) -> str: MarkerList = List[Any] -class Requirement(NamedTuple): +class ParsedRequirement(NamedTuple): name: str url: str extras: List[str] @@ -62,7 +62,7 @@ class Requirement(NamedTuple): marker: str -def parse_requirement(source: str) -> Requirement: +def parse_requirement(source: str) -> ParsedRequirement: return _parse_requirement(Tokenizer(source, rules=DEFAULT_RULES)) From 07bf6f4f423b121cdea71890134742d6314909ca Mon Sep 17 00:00:00 2001 From: Pradyun Gedam Date: Mon, 5 Dec 2022 23:29:49 +0000 Subject: [PATCH 07/23] Rework the parser with context-sensitive tokenisation This reduces how many regex patterns would be matched against the input while also enabling the parser to resolve ambiguity in-place. --- packaging/_parser.py | 295 +++++++++++++++++++++++-------------- packaging/_tokenizer.py | 168 +++++++++++---------- tests/test_requirements.py | 18 +-- 3 files changed, 280 insertions(+), 201 deletions(-) diff --git a/packaging/_parser.py b/packaging/_parser.py index 456965dd..fb5bd5fc 100644 --- a/packaging/_parser.py +++ b/packaging/_parser.py @@ -1,15 +1,10 @@ -# The docstring for each parse function contains the grammar for the rule. -# The grammar uses a simple EBNF-inspired syntax: -# -# - Uppercase names are tokens -# - Lowercase names are rules (parsed with a parse_* function) -# - Parentheses are used for grouping -# - A | means either-or -# - A * means 0 or more -# - A + means 1 or more -# - A ? means 0 or 1 - -from ast import literal_eval +"""Handwritten parser of dependency specifiers. + +The docstring for each __parse_* function contains ENBF-inspired grammar representing +the implementation. +""" + +import ast from typing import Any, List, NamedTuple, Tuple, Union from ._tokenizer import DEFAULT_RULES, Tokenizer @@ -62,140 +57,218 @@ class ParsedRequirement(NamedTuple): marker: str +# -------------------------------------------------------------------------------------- +# Recursive descent parser for dependency specifier +# -------------------------------------------------------------------------------------- def parse_requirement(source: str) -> ParsedRequirement: return _parse_requirement(Tokenizer(source, rules=DEFAULT_RULES)) -def _parse_requirement(tokens: Tokenizer) -> Requirement: +def _parse_requirement(tokenizer: Tokenizer) -> ParsedRequirement: + """ + requirement = WS? IDENTIFIER WS? extras WS? requirement_details + """ + tokenizer.consume("WS") + + name_token = tokenizer.expect( + "IDENTIFIER", expected="package name at the start of dependency specifier" + ) + name = name_token.text + tokenizer.consume("WS") + + extras = _parse_extras(tokenizer) + tokenizer.consume("WS") + + url, specifier, marker = _parse_requirement_details(tokenizer) + tokenizer.expect("END", expected="end of dependency specifier") + + return ParsedRequirement(name, url, extras, specifier, marker) + + +def _parse_requirement_details( + tokenizer: Tokenizer, +) -> Tuple[str, str, str]: """ - named_requirement: - IDENTIFIER extras (URL_SPEC | specifier) (SEMICOLON marker_expr)? END + requirement_details = AT URL (WS SEMICOLON marker WS?)? + | specifier WS? (SEMICOLON marker WS?)? """ - tokens.expect("IDENTIFIER", error_message="Expression must begin with package name") - name = tokens.read("IDENTIFIER").text - extras = parse_extras(tokens) + specifier = "" url = "" - if tokens.match("URL_SPEC"): - url = tokens.read().text[1:].strip() - elif not tokens.match("END"): - specifier = parse_specifier(tokens) - if tokens.try_read("SEMICOLON"): - marker = "" - while not tokens.match("END"): - # we don't validate markers here, it's done later as part of - # packaging/requirements.py - marker += tokens.read().text + marker = "" + + if tokenizer.check("AT"): + tokenizer.read() + tokenizer.consume("WS") + url_start = tokenizer.position + url = tokenizer.expect("URL", expected="URL after @").text + if tokenizer.check("END", peek=True): + return (url, specifier, marker) + + tokenizer.expect("WS", expected="whitespace after URL") + if not tokenizer.check("SEMICOLON"): + tokenizer.raise_syntax_error( + "Expected semicolon after URL (followed by environment markers)", + span_start=url_start, + ) + else: + tokenizer.read() else: - marker = "" - tokens.expect( - "END", - error_message="Expected semicolon (followed by markers) or end of string", + specifier = _parse_specifier(tokenizer) + tokenizer.consume("WS") + + if tokenizer.check("END", peek=True): + return (url, specifier, marker) + + tokenizer.expect( + "SEMICOLON", expected="semicolon (followed by environment markers)" ) - return Requirement(name, url, extras, specifier, marker) + marker = tokenizer.read_remaining_text() + + return (url, specifier, marker) + + +def _parse_extras(tokenizer: Tokenizer) -> List[str]: + """ + extras = (LEFT_BRACKET wsp* extras_list? wsp* RIGHT_BRACKET)? + """ + if not tokenizer.check("LEFT_BRACKET", peek=True): + return [] + + with tokenizer.enclosing_tokens("LEFT_BRACKET", "RIGHT_BRACKET"): + tokenizer.consume("WS") + extras = _parse_extras_list(tokenizer) + tokenizer.consume("WS") + + return extras -def parse_extras(tokens: Tokenizer) -> List[str]: +def _parse_extras_list(tokenizer: Tokenizer) -> List[str]: """ - extras: LBRACKET (IDENTIFIER (COMMA IDENTIFIER)*)? RBRACKET + extras_list = identifier (wsp* ',' wsp* identifier)* """ - extras = [] - if tokens.try_read("LBRACKET"): - while tokens.match("IDENTIFIER"): - extras.append(tokens.read("IDENTIFIER").text) - if not tokens.match("RBRACKET"): - tokens.read("COMMA", error_message="Missing comma after extra") - if not tokens.match("COMMA") and tokens.match("RBRACKET"): - break - tokens.read("RBRACKET", error_message="Closing square bracket is missing") + extras: List[str] = [] + + if not tokenizer.check("IDENTIFIER"): + return extras + + extras.append(tokenizer.read().text) + + while True: + tokenizer.consume("WS") + if not tokenizer.check("COMMA"): + if tokenizer.check("IDENTIFIER", peek=True): + tokenizer.raise_syntax_error("Expected comma between extra names") + break + + tokenizer.read() + tokenizer.consume("WS") + extra_token = tokenizer.expect("IDENTIFIER", expected="extra name after comma") + extras.append(extra_token.text) + return extras -def parse_specifier(tokens: Tokenizer) -> str: +def _parse_specifier(tokenizer: Tokenizer) -> str: """ - specifier: - LPAREN version_many? RPAREN | version_many + specifier = LEFT_PARENTHESIS version_many? RIGHT_PARENTHESIS + | version_many """ - lparen = False - if tokens.try_read("LPAREN"): - lparen = True - parsed_specifiers = parse_version_many(tokens) - if lparen and not tokens.try_read("RPAREN"): - tokens.raise_syntax_error(message="Closing right parenthesis is missing") + with tokenizer.enclosing_tokens("LEFT_PARENTHESIS", "RIGHT_PARENTHESIS"): + parsed_specifiers = _parse_version_many(tokenizer) + return parsed_specifiers -def parse_version_many(tokens: Tokenizer) -> str: +def _parse_version_many(tokenizer: Tokenizer) -> str: """ - version_many: OP VERSION (COMMA OP VERSION)* + version_many = OP VERSION (COMMA OP VERSION)* """ parsed_specifiers = "" - while tokens.match("OP"): - parsed_specifiers += tokens.read("OP").text - if tokens.match("VERSION"): - parsed_specifiers += tokens.read("VERSION").text - else: - tokens.raise_syntax_error(message="Missing version") - if not tokens.match("COMMA"): + while tokenizer.check("OP"): + parsed_specifiers += tokenizer.read().text + + # We intentionally do not consume whitespace here, since the regular expression + # for `VERSION` uses a lookback for the operator, to determine what + # corresponding syntax is permitted. + + version_token = tokenizer.expect("VERSION", expected="version after operator") + parsed_specifiers += version_token.text + tokenizer.consume("WS") + + if not tokenizer.check("COMMA"): break - tokens.expect("COMMA", error_message="Missing comma after version") - parsed_specifiers += tokens.read("COMMA").text + parsed_specifiers += tokenizer.read().text + tokenizer.consume("WS") + return parsed_specifiers +# -------------------------------------------------------------------------------------- +# Recursive descent parser for marker expression +# -------------------------------------------------------------------------------------- def parse_marker(source: str) -> MarkerList: return _parse_marker_expr(Tokenizer(source, rules=DEFAULT_RULES)) -def _parse_marker_expr(tokens: Tokenizer) -> MarkerList: +def _parse_marker_expr(tokenizer: Tokenizer) -> MarkerList: """ - marker_expr: MARKER_ATOM (BOOLOP + MARKER_ATOM)+ + marker_expr = marker_atom (BOOLOP WS? marker_atom)+ """ - expression = [parse_marker_atom(tokens)] - while tokens.match("BOOLOP"): - tok = tokens.read("BOOLOP") - expr_right = parse_marker_atom(tokens) - expression.extend((tok.text, expr_right)) + expression = [_parse_marker_atom(tokenizer)] + while tokenizer.check("BOOLOP"): + token = tokenizer.read() + tokenizer.consume("WS") + expr_right = _parse_marker_atom(tokenizer) + expression.extend((token.text, expr_right)) return expression -def parse_marker_atom(tokens: Tokenizer) -> MarkerAtom: +def _parse_marker_atom(tokenizer: Tokenizer) -> MarkerAtom: """ - marker_atom: LPAREN marker_expr RPAREN | marker_item + marker_atom = LEFT_PARENTHESIS WS? marker_expr WS? RIGHT_PARENTHESIS + | marker_item """ - if tokens.try_read("LPAREN"): - marker = parse_marker_expr(tokens) - tokens.read("RPAREN", error_message="Closing right parenthesis is missing") - return marker + + if tokenizer.check("LEFT_PARENTHESIS", peek=True): + with tokenizer.enclosing_tokens("LEFT_PARENTHESIS", "RIGHT_PARENTHESIS"): + tokenizer.consume("WS") + marker: MarkerAtom = _parse_marker_expr(tokenizer) + tokenizer.consume("WS") else: - return parse_marker_item(tokens) + marker = _parse_marker_item(tokenizer) + return marker -def parse_marker_item(tokens: Tokenizer) -> MarkerItem: +def _parse_marker_item(tokenizer: Tokenizer) -> MarkerItem: """ - marker_item: marker_var marker_op marker_var + marker_item = WS? marker_var WS? marker_op WS? marker_var WS? """ - marker_var_left = parse_marker_var(tokens) - marker_op = parse_marker_op(tokens) - marker_var_right = parse_marker_var(tokens) + tokenizer.consume("WS") + marker_var_left = _parse_marker_var(tokenizer) + tokenizer.consume("WS") + marker_op = _parse_marker_op(tokenizer) + tokenizer.consume("WS") + marker_var_right = _parse_marker_var(tokenizer) + tokenizer.consume("WS") return (marker_var_left, marker_op, marker_var_right) -def parse_marker_var(tokens: Tokenizer) -> MarkerVar: +def _parse_marker_var(tokenizer: Tokenizer) -> MarkerVar: """ - marker_var: env_var | python_str + marker_var = VARIABLE | QUOTED_STRING """ - if tokens.match("VARIABLE"): - return parse_env_var(tokens) + if tokenizer.check("VARIABLE"): + return process_env_var(tokenizer.read().text.replace(".", "_")) + elif tokenizer.check("QUOTED_STRING"): + return process_python_str(tokenizer.read().text) else: - return parse_python_str(tokens) + tokenizer.raise_syntax_error( + message="Expected a marker variable or quoted string" + ) -def parse_env_var(tokens: Tokenizer) -> Variable: - """ - env_var: VARIABLE - """ - env_var = tokens.read("VARIABLE").text.replace(".", "_") +def process_env_var(env_var: str) -> Variable: if ( env_var == "platform_python_implementation" or env_var == "python_implementation" @@ -205,31 +278,27 @@ def parse_env_var(tokens: Tokenizer) -> Variable: return Variable(env_var) -def parse_python_str(tokens: Tokenizer) -> Value: - """ - python_str: QUOTED_STRING - """ - token = tokens.read( - "QUOTED_STRING", - error_message="String with single or double quote at the beginning is expected", - ).text - python_str = literal_eval(token) - return Value(str(python_str)) +def process_python_str(python_str: str) -> Value: + value = ast.literal_eval(python_str) + return Value(str(value)) -def parse_marker_op(tokens: Tokenizer) -> Op: +def _parse_marker_op(tokenizer: Tokenizer) -> Op: """ - marker_op: IN | NOT IN | OP + marker_op = IN | NOT IN | OP """ - if tokens.try_read("IN"): + if tokenizer.check("IN"): + tokenizer.read() return Op("in") - elif tokens.try_read("NOT"): - tokens.read("IN", error_message="NOT token must be follewed by IN token") + elif tokenizer.check("NOT"): + tokenizer.read() + tokenizer.expect("WS", expected="whitespace after 'not'") + tokenizer.expect("IN", expected="'in' after 'not'") return Op("not in") - elif tokens.match("OP"): - return Op(tokens.read().text) + elif tokenizer.check("OP"): + return Op(tokenizer.read().text) else: - return tokens.raise_syntax_error( - message='Couldn\'t parse marker operator. Expecting one of \ - "<=, <, !=, ==, >=, >, ~=, ===, not, not in"' + return tokenizer.raise_syntax_error( + "Expected marker operator, one of " + "<=, <, !=, ==, >=, >, ~=, ===, not, not in" ) diff --git a/packaging/_tokenizer.py b/packaging/_tokenizer.py index 5b759e71..ed1bd7d9 100644 --- a/packaging/_tokenizer.py +++ b/packaging/_tokenizer.py @@ -1,6 +1,7 @@ +import contextlib import re from dataclasses import dataclass -from typing import Dict, Generator, NoReturn, Optional, Union +from typing import Dict, Iterator, NoReturn, Optional, Tuple, Union from .specifiers import Specifier @@ -11,10 +12,6 @@ class Token: text: str position: int - def matches(self, name: str = "") -> bool: - if name and self.name != name: - return False - return True class ParserSyntaxError(Exception): """The provided source text could not be parsed correctly.""" @@ -39,14 +36,14 @@ def __str__(self) -> str: DEFAULT_RULES: "Dict[str, Union[str, re.Pattern[str]]]" = { "LPAREN": r"\s*\(", - "RPAREN": r"\s*\)", - "LBRACKET": r"\s*\[", - "RBRACKET": r"\s*\]", - "SEMICOLON": r"\s*;", - "COMMA": r"\s*,", + "LEFT_PARENTHESIS": r"\(", + "RIGHT_PARENTHESIS": r"\)", + "LEFT_BRACKET": r"\[", + "RIGHT_BRACKET": r"\]", + "SEMICOLON": r";", + "COMMA": r",", "QUOTED_STRING": re.compile( r""" - \s* ( ('[^']*') | @@ -55,13 +52,12 @@ def __str__(self) -> str: """, re.VERBOSE, ), - "OP": r"\s*(===|==|~=|!=|<=|>=|<|>)", - "BOOLOP": r"\s*(or|and)", - "IN": r"\s*in", - "NOT": r"\s*not", + "OP": r"(===|==|~=|!=|<=|>=|<|>)", + "BOOLOP": r"(or|and)", + "IN": r"in", + "NOT": r"not", "VARIABLE": re.compile( r""" - \s* ( python_version |python_full_version @@ -77,71 +73,78 @@ def __str__(self) -> str: re.VERBOSE, ), "VERSION": re.compile(Specifier._version_regex_str, re.VERBOSE | re.IGNORECASE), - "URL_SPEC": r"\s*@ *[^ ]+", - "IDENTIFIER": r"\s*[a-zA-Z0-9._-]+", + "AT": r"\@", + "URL": r"[^ ]+", + "IDENTIFIER": r"[a-zA-Z0-9._-]+", + "WS": r"[ \t]+", + "END": r"$", } class Tokenizer: - """Stream of tokens for a LL(1) parser. + """Context-sensitive token parsing. - Provides methods to examine the next token to be read, and to read it - (advance to the next token). + Provides methods to examine the input stream to check whether the next token + matches. """ def __init__( self, source: str, *, - rules: "Dict[str, Union[str, re.Pattern[str]]]" = DEFAULT_RULES, + rules: "Dict[str, Union[str, re.Pattern[str]]]", ) -> None: self.source = source - self.rules = {name: re.compile(pattern) for name, pattern in rules.items()} - self.next_token = None - self.generator = self._tokenize() + self.rules: Dict[str, re.Pattern[str]] = { + name: re.compile(pattern) for name, pattern in rules.items() + } + self.next_token: Optional[Token] = None self.position = 0 - def peek(self) -> Token: - """ - Return the next token to be read. - """ - if not self.next_token: - self.next_token = next(self.generator) - return self.next_token + def consume(self, name: str) -> None: + """Move beyond provided token name, if at current position.""" + if self.check(name): + self.read() - def match(self, *name: str) -> bool: - """ - Return True if the next token matches the given arguments. - """ - token = self.peek() - return token.matches(*name) + def check(self, name: str, *, peek: bool = False) -> bool: + """Check whether the next token has the provided name. - def expect(self, *name: str, error_message: str) -> Token: - """ - Raise SyntaxError if the next token doesn't match given arguments. + By default, if the check succeeds, the token *must* be read before + another check. If `peek` is set to `True`, the token is not loaded and + would need to be checked again. """ - token = self.peek() - if not token.matches(*name): - raise self.raise_syntax_error(message=error_message) - return token + assert ( + self.next_token is None + ), f"Cannot check for {name!r}, already have {self.next_token!r}" + assert name in self.rules, f"Unknown token name: {name!r}" - def read(self, *name: str, error_message: str = "") -> Token: - """Return the next token and advance to the next token. + expression = self.rules[name] - Raise SyntaxError if the token doesn't match. - """ - result = self.expect(*name, error_message=error_message) - self.next_token = None - return result + match = expression.match(self.source, self.position) + if match is None: + return False + if not peek: + self.next_token = Token(name, match[0], self.position) + return True - def try_read(self, *name: str) -> Optional[Token]: - """read() if the next token matches the given arguments. + def expect(self, name: str, *, expected: str) -> Token: + """Expect a certain token name next, failing with a syntax error otherwise. - Do nothing if it does not match. + The token is *not* read. """ - if self.match(*name): - return self.read() - return None + if not self.check(name): + raise self.raise_syntax_error(f"Expected {expected}") + return self.read() + + def read(self) -> Token: + """Consume the next token and return it.""" + token = self.next_token + assert token is not None + + self.position += len(token.text) + self.next_token = None + + return token def raise_syntax_error( self, @@ -161,25 +164,32 @@ def raise_syntax_error( span=span, ) - def _make_token(self, name: str, text: str) -> Token: - """ - Make a token with the current position. - """ - return Token(name, text, self.position) + @contextlib.contextmanager + def enclosing_tokens(self, open_token: str, close_token: str) -> Iterator[bool]: + if self.check(open_token): + open_position = self.position + self.read() + else: + open_position = None - def _tokenize(self) -> Generator[Token, Token, None]: - """ - The main generator of tokens. - """ - while self.position < len(self.source): - for name, expression in self.rules.items(): - match = expression.match(self.source, self.position) - if match: - token_text = match[0] - - yield self._make_token(name, token_text.strip()) - self.position += len(token_text) - break - else: - raise self.raise_syntax_error(message="Unrecognized token") - yield self._make_token("END", "") + yield open_position is not None + + if open_position is None: + return + + if not self.check(close_token): + self.raise_syntax_error( + f"Expected closing {close_token}", + span_start=open_position, + ) + + self.read() + + def read_remaining_text(self) -> str: + index = self.position + assert index <= len(self.source) + + # move forward to the end, preventing future reads. + self.position = len(self.source) + + return self.source[index:] diff --git a/tests/test_requirements.py b/tests/test_requirements.py index 3360b5ed..759df6d9 100644 --- a/tests/test_requirements.py +++ b/tests/test_requirements.py @@ -62,22 +62,22 @@ def test_name_with_version(self): def test_with_legacy_version(self): with pytest.raises(InvalidRequirement) as e: Requirement("name==1.0.org1") - assert "Expected semicolon (followed by markers) or end of string" in str(e) + assert "Expected semicolon" in str(e) def test_with_legacy_version_and_marker(self): with pytest.raises(InvalidRequirement) as e: Requirement("name>=1.x.y;python_version=='2.6'") - assert "Expected semicolon (followed by markers) or end of string" in str(e) + assert "Expected semicolon" in str(e) def test_missing_name(self): with pytest.raises(InvalidRequirement) as e: Requirement("@ http://example.com") - assert "Expression must begin with package name" in str(e) + assert "package name" in str(e) def test_name_with_missing_version(self): with pytest.raises(InvalidRequirement) as e: Requirement("name>=") - assert "Missing version" in str(e) + assert "Expected version after operator" in str(e) def test_version_with_parens_and_whitespace(self): req = Requirement("name (==4)") @@ -86,7 +86,7 @@ def test_version_with_parens_and_whitespace(self): def test_version_with_missing_closing_paren(self): with pytest.raises(InvalidRequirement) as e: Requirement("name(==4") - assert "Closing right parenthesis is missing" in str(e) + assert "Expected closing RIGHT_PARENTHESIS" in str(e) def test_name_with_multiple_versions(self): req = Requirement("name>=3,<2") @@ -123,12 +123,12 @@ def test_empty_extras(self): def test_unclosed_extras(self): with pytest.raises(InvalidRequirement) as e: Requirement("foo[") - assert "Closing square bracket is missing" in str(e) + assert "Expected closing RIGHT_BRACKET" in str(e) def test_extras_without_comma(self): with pytest.raises(InvalidRequirement) as e: Requirement("foobar[quux bar]") - assert "Missing comma after extra" in str(e) + assert "Expected comma between extra names" in str(e) def test_url(self): url_section = "test @ http://example.com" @@ -194,7 +194,7 @@ def test_invalid_marker(self): def test_marker_with_missing_semicolon(self): with pytest.raises(InvalidRequirement) as e: Requirement('name[bar]>=3 python_version == "2.7"') - assert "Expected semicolon (followed by markers) or end of string" in str(e) + assert "Expected semicolon" in str(e) def test_types(self): req = Requirement("foobar[quux]<2,>=3; os_name=='a'") @@ -239,7 +239,7 @@ def test_sys_platform_linux_in(self): def test_parseexception_error_msg(self): with pytest.raises(InvalidRequirement) as e: Requirement("toto 42") - assert "Expected semicolon (followed by markers) or end of string" in str(e) + assert "Expected semicolon" in str(e) EQUAL_DEPENDENCIES = [ ("packaging>20.1", "packaging>20.1"), From c6baf525e620152d934d4a9eccd6a7eeedb101e0 Mon Sep 17 00:00:00 2001 From: Pradyun Gedam Date: Mon, 5 Dec 2022 23:32:31 +0000 Subject: [PATCH 08/23] Parse markers inline when parsing requirements This allows for nicer error messages, which show the entire requirement string and highlight the marker in particular. --- packaging/_parser.py | 13 ++++++++----- packaging/_tokenizer.py | 9 --------- packaging/markers.py | 3 +++ packaging/requirements.py | 12 +++++------- 4 files changed, 16 insertions(+), 21 deletions(-) diff --git a/packaging/_parser.py b/packaging/_parser.py index fb5bd5fc..9db35971 100644 --- a/packaging/_parser.py +++ b/packaging/_parser.py @@ -5,7 +5,7 @@ """ import ast -from typing import Any, List, NamedTuple, Tuple, Union +from typing import Any, List, NamedTuple, Optional, Tuple, Union from ._tokenizer import DEFAULT_RULES, Tokenizer @@ -54,7 +54,7 @@ class ParsedRequirement(NamedTuple): url: str extras: List[str] specifier: str - marker: str + marker: Optional[MarkerList] # -------------------------------------------------------------------------------------- @@ -87,7 +87,7 @@ def _parse_requirement(tokenizer: Tokenizer) -> ParsedRequirement: def _parse_requirement_details( tokenizer: Tokenizer, -) -> Tuple[str, str, str]: +) -> Tuple[str, str, Optional[MarkerList]]: """ requirement_details = AT URL (WS SEMICOLON marker WS?)? | specifier WS? (SEMICOLON marker WS?)? @@ -95,7 +95,7 @@ def _parse_requirement_details( specifier = "" url = "" - marker = "" + marker = None if tokenizer.check("AT"): tokenizer.read() @@ -113,6 +113,8 @@ def _parse_requirement_details( ) else: tokenizer.read() + marker = _parse_marker_expr(tokenizer) + tokenizer.consume("WS") else: specifier = _parse_specifier(tokenizer) tokenizer.consume("WS") @@ -123,7 +125,8 @@ def _parse_requirement_details( tokenizer.expect( "SEMICOLON", expected="semicolon (followed by environment markers)" ) - marker = tokenizer.read_remaining_text() + marker = _parse_marker_expr(tokenizer) + tokenizer.consume("WS") return (url, specifier, marker) diff --git a/packaging/_tokenizer.py b/packaging/_tokenizer.py index ed1bd7d9..06cdfe28 100644 --- a/packaging/_tokenizer.py +++ b/packaging/_tokenizer.py @@ -184,12 +184,3 @@ def enclosing_tokens(self, open_token: str, close_token: str) -> Iterator[bool]: ) self.read() - - def read_remaining_text(self) -> str: - index = self.position - assert index <= len(self.source) - - # move forward to the end, preventing future reads. - self.position = len(self.source) - - return self.source[index:] diff --git a/packaging/markers.py b/packaging/markers.py index bcffec24..bb2d2a5a 100644 --- a/packaging/markers.py +++ b/packaging/markers.py @@ -185,6 +185,9 @@ def default_environment() -> Dict[str, str]: class Marker: def __init__(self, marker: str) -> None: + # Note: We create a Marker object without calling this constructor in + # packaging.requirements.Requirement. If any additional logic is + # added here, make sure to mirror/adapt Requirement. try: self._markers = _normalize_extra_values(parse_marker(marker)) # The attribute `_markers` can be described in terms of a recursive type: diff --git a/packaging/requirements.py b/packaging/requirements.py index 3c30dbba..2140a7f7 100644 --- a/packaging/requirements.py +++ b/packaging/requirements.py @@ -7,7 +7,7 @@ from ._parser import parse_requirement from ._tokenizer import ParserSyntaxError -from .markers import InvalidMarker, Marker +from .markers import Marker, _normalize_extra_values from .specifiers import SpecifierSet @@ -51,12 +51,10 @@ def __init__(self, requirement_string: str) -> None: self.url = None self.extras: Set[str] = set(parsed.extras if parsed.extras else []) self.specifier: SpecifierSet = SpecifierSet(parsed.specifier) - try: - self.marker: Optional[Marker] = ( - Marker(parsed.marker) if parsed.marker else None - ) - except InvalidMarker as e: - raise InvalidRequirement(str(e)) + self.marker: Optional[Marker] = None + if parsed.marker is not None: + self.marker = Marker.__new__(Marker) + self.marker._markers = _normalize_extra_values(parsed.marker) def __str__(self) -> str: parts: List[str] = [self.name] From 6b2f3dec7a6911e360e6b79f7d2876d6be2c27aa Mon Sep 17 00:00:00 2001 From: Pradyun Gedam Date: Mon, 5 Dec 2022 21:14:01 +0000 Subject: [PATCH 09/23] Factor out parsing semicolon-marker for requirements This eliminates a point of duplication and ensures that the error messaging is consistent. --- packaging/_parser.py | 46 +++++++++++++++++++++++++++++--------------- 1 file changed, 31 insertions(+), 15 deletions(-) diff --git a/packaging/_parser.py b/packaging/_parser.py index 9db35971..fc8ce827 100644 --- a/packaging/_parser.py +++ b/packaging/_parser.py @@ -89,8 +89,8 @@ def _parse_requirement_details( tokenizer: Tokenizer, ) -> Tuple[str, str, Optional[MarkerList]]: """ - requirement_details = AT URL (WS SEMICOLON marker WS?)? - | specifier WS? (SEMICOLON marker WS?)? + requirement_details = AT URL (WS requirement_marker)? + | specifier WS? (requirement_marker)? """ specifier = "" @@ -100,37 +100,53 @@ def _parse_requirement_details( if tokenizer.check("AT"): tokenizer.read() tokenizer.consume("WS") + url_start = tokenizer.position url = tokenizer.expect("URL", expected="URL after @").text if tokenizer.check("END", peek=True): return (url, specifier, marker) tokenizer.expect("WS", expected="whitespace after URL") - if not tokenizer.check("SEMICOLON"): - tokenizer.raise_syntax_error( - "Expected semicolon after URL (followed by environment markers)", - span_start=url_start, - ) - else: - tokenizer.read() - marker = _parse_marker_expr(tokenizer) - tokenizer.consume("WS") + + marker = _parse_requirement_marker( + tokenizer, span_start=url_start, after="URL and whitespace" + ) else: + specifier_start = tokenizer.position specifier = _parse_specifier(tokenizer) tokenizer.consume("WS") if tokenizer.check("END", peek=True): return (url, specifier, marker) - tokenizer.expect( - "SEMICOLON", expected="semicolon (followed by environment markers)" + marker = _parse_requirement_marker( + tokenizer, span_start=specifier_start, after="version specifier" ) - marker = _parse_marker_expr(tokenizer) - tokenizer.consume("WS") return (url, specifier, marker) +def _parse_requirement_marker( + tokenizer: Tokenizer, *, span_start: int, after: str +) -> MarkerList: + """ + requirement_marker = SEMICOLON marker WS? + """ + + if not tokenizer.check("SEMICOLON"): + tokenizer.raise_syntax_error( + f"Expected end or semicolon (after {after})", + span_start=span_start, + ) + else: + tokenizer.read() + + marker = _parse_marker_expr(tokenizer) + tokenizer.consume("WS") + + return marker + + def _parse_extras(tokenizer: Tokenizer) -> List[str]: """ extras = (LEFT_BRACKET wsp* extras_list? wsp* RIGHT_BRACKET)? From 177e9fff017cf081b90158b5d54553259c7708a8 Mon Sep 17 00:00:00 2001 From: Pradyun Gedam Date: Mon, 5 Dec 2022 22:02:45 +0000 Subject: [PATCH 10/23] Tweak the presentation of ParserSyntaxError spans This makes it easier to identify what position the parser was checking, compared to relevant context to the reader. --- packaging/_tokenizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packaging/_tokenizer.py b/packaging/_tokenizer.py index 06cdfe28..7cf2c98d 100644 --- a/packaging/_tokenizer.py +++ b/packaging/_tokenizer.py @@ -30,7 +30,7 @@ def __init__( super().__init__() def __str__(self) -> str: - marker = " " * self.span[0] + "^" * (self.span[1] - self.span[0] + 1) + marker = " " * self.span[0] + "~" * (self.span[1] - self.span[0]) + "^" return "\n ".join([self.message, self.source, marker]) From 1c3f90020fe58a2cb3431843a286649abdfb41a2 Mon Sep 17 00:00:00 2001 From: Pradyun Gedam Date: Mon, 5 Dec 2022 22:07:05 +0000 Subject: [PATCH 11/23] Make URLs match "not whitespace" This is more permitting and better handles tabs used as whitespace. --- packaging/_tokenizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packaging/_tokenizer.py b/packaging/_tokenizer.py index 7cf2c98d..09fc9a06 100644 --- a/packaging/_tokenizer.py +++ b/packaging/_tokenizer.py @@ -74,7 +74,7 @@ def __str__(self) -> str: ), "VERSION": re.compile(Specifier._version_regex_str, re.VERBOSE | re.IGNORECASE), "AT": r"\@", - "URL": r"[^ ]+", + "URL": r"[^ \t]+", "IDENTIFIER": r"[a-zA-Z0-9._-]+", "WS": r"[ \t]+", "END": r"$", From 4a4d835cee3b1bf8e640d9b06ad2af8085438b9d Mon Sep 17 00:00:00 2001 From: Pradyun Gedam Date: Mon, 5 Dec 2022 22:25:11 +0000 Subject: [PATCH 12/23] Update `IDENTIFIER` to match PEP 508's stipulated syntax This follows what PEP 508's grammar says is a valid identifier. --- packaging/_tokenizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packaging/_tokenizer.py b/packaging/_tokenizer.py index 09fc9a06..da6e7b06 100644 --- a/packaging/_tokenizer.py +++ b/packaging/_tokenizer.py @@ -75,7 +75,7 @@ def __str__(self) -> str: "VERSION": re.compile(Specifier._version_regex_str, re.VERBOSE | re.IGNORECASE), "AT": r"\@", "URL": r"[^ \t]+", - "IDENTIFIER": r"[a-zA-Z0-9._-]+", + "IDENTIFIER": r"[a-zA-Z0-9][a-zA-Z0-9._-]*", "WS": r"[ \t]+", "END": r"$", } From 39ae524488a6e7e436494d13dcef021d835a2a78 Mon Sep 17 00:00:00 2001 From: Pradyun Gedam Date: Mon, 5 Dec 2022 22:42:58 +0000 Subject: [PATCH 13/23] Make arbitrary version matching accept what `LegacySpecifier` did This makes it possible for the arbitrary matches to be used within requirement specifiers without special constraints. --- packaging/specifiers.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/packaging/specifiers.py b/packaging/specifiers.py index 7543c0a8..645214ae 100644 --- a/packaging/specifiers.py +++ b/packaging/specifiers.py @@ -116,8 +116,10 @@ class Specifier(BaseSpecifier): # but included entirely as an escape hatch. (?<====) # Only match for the identity operator \s* - [^\s]* # We just match everything, except for whitespace - # since we are only testing for strict identity. + [^\s;)]* # The arbitrary version can be just about anything, + # we match everything except for whitespace, a + # semi-colon for marker support, and a closing paren + # since versions can be enclosed in them. ) | (?: From 97e764967446abf82ecae886117906619c6aef75 Mon Sep 17 00:00:00 2001 From: Pradyun Gedam Date: Mon, 5 Dec 2022 22:46:30 +0000 Subject: [PATCH 14/23] Better reflect what is optional within `specifier`/`version_many` This makes it clearer in the docstring grammars that a name without any specifier is valid. --- packaging/_parser.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/packaging/_parser.py b/packaging/_parser.py index fc8ce827..9d69bd76 100644 --- a/packaging/_parser.py +++ b/packaging/_parser.py @@ -190,7 +190,7 @@ def _parse_extras_list(tokenizer: Tokenizer) -> List[str]: def _parse_specifier(tokenizer: Tokenizer) -> str: """ - specifier = LEFT_PARENTHESIS version_many? RIGHT_PARENTHESIS + specifier = LEFT_PARENTHESIS version_many RIGHT_PARENTHESIS | version_many """ with tokenizer.enclosing_tokens("LEFT_PARENTHESIS", "RIGHT_PARENTHESIS"): @@ -201,7 +201,7 @@ def _parse_specifier(tokenizer: Tokenizer) -> str: def _parse_version_many(tokenizer: Tokenizer) -> str: """ - version_many = OP VERSION (COMMA OP VERSION)* + version_many = (OP VERSION (COMMA OP VERSION)*)? """ parsed_specifiers = "" while tokenizer.check("OP"): From 92b954536e338d4d797a0f1fe30a10a2f816f6c8 Mon Sep 17 00:00:00 2001 From: Pradyun Gedam Date: Mon, 5 Dec 2022 22:47:03 +0000 Subject: [PATCH 15/23] Flatten nested ifs into if-elif This makes the control flow slightly easier to understand. --- packaging/_parser.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/packaging/_parser.py b/packaging/_parser.py index 9d69bd76..378cef88 100644 --- a/packaging/_parser.py +++ b/packaging/_parser.py @@ -175,13 +175,14 @@ def _parse_extras_list(tokenizer: Tokenizer) -> List[str]: while True: tokenizer.consume("WS") - if not tokenizer.check("COMMA"): - if tokenizer.check("IDENTIFIER", peek=True): - tokenizer.raise_syntax_error("Expected comma between extra names") + if tokenizer.check("IDENTIFIER", peek=True): + tokenizer.raise_syntax_error("Expected comma between extra names") + elif not tokenizer.check("COMMA"): break tokenizer.read() tokenizer.consume("WS") + extra_token = tokenizer.expect("IDENTIFIER", expected="extra name after comma") extras.append(extra_token.text) From 3a7cdb6b0673d58a0cf0dd909e6d6507714ee4a6 Mon Sep 17 00:00:00 2001 From: Pradyun Gedam Date: Mon, 5 Dec 2022 22:48:32 +0000 Subject: [PATCH 16/23] Rewrite test suite for requirements parsing This now exercises more edge cases and validates that the error messages are well-formed. --- tests/test_requirements.py | 765 +++++++++++++++++++++++++------------ 1 file changed, 523 insertions(+), 242 deletions(-) diff --git a/tests/test_requirements.py b/tests/test_requirements.py index 759df6d9..09c963d0 100644 --- a/tests/test_requirements.py +++ b/tests/test_requirements.py @@ -2,217 +2,529 @@ # 2.0, and the BSD License. See the LICENSE file in the root of this repository # for complete details. +from typing import Optional, Set + import pytest from packaging.markers import Marker from packaging.requirements import InvalidRequirement, Requirement from packaging.specifiers import SpecifierSet +EQUAL_DEPENDENCIES = [ + ("packaging>20.1", "packaging>20.1"), + ( + 'requests[security, tests]>=2.8.1,==2.8.*;python_version<"2.7"', + 'requests [security,tests] >= 2.8.1, == 2.8.* ; python_version < "2.7"', + ), + ( + 'importlib-metadata; python_version<"3.8"', + "importlib-metadata; python_version<'3.8'", + ), + ( + 'appdirs>=1.4.4,<2; os_name=="posix" and extra=="testing"', + "appdirs>=1.4.4,<2; os_name == 'posix' and extra == 'testing'", + ), +] + +DIFFERENT_DEPENDENCIES = [ + ("package_one", "package_two"), + ("packaging>20.1", "packaging>=20.1"), + ("packaging>20.1", "packaging>21.1"), + ("packaging>20.1", "package>20.1"), + ( + 'requests[security,tests]>=2.8.1,==2.8.*;python_version<"2.7"', + 'requests [security,tests] >= 2.8.1 ; python_version < "2.7"', + ), + ( + 'importlib-metadata; python_version<"3.8"', + "importlib-metadata; python_version<'3.7'", + ), + ( + 'appdirs>=1.4.4,<2; os_name=="posix" and extra=="testing"', + "appdirs>=1.4.4,<2; os_name == 'posix' and extra == 'docs'", + ), +] + + +@pytest.mark.parametrize( + "name", + [ + "package", + "pAcKaGe", + "Package", + "foo-bar.quux_bAz", + "installer", + "android12", + ], +) +@pytest.mark.parametrize( + "extras", + [ + set(), + {"a"}, + {"a", "b"}, + {"a", "B", "CDEF123"}, + ], +) +@pytest.mark.parametrize( + ("url", "specifier"), + [ + (None, ""), + ("https://example.com/packagename.zip", ""), + ("ssh://user:pass%20word@example.com/packagename.zip", ""), + ("https://example.com/name;v=1.1/?query=foo&bar=baz#blah", ""), + ("git+ssh://git.example.com/MyProject", ""), + ("git+ssh://git@github.com:pypa/packaging.git", ""), + ("git+https://git.example.com/MyProject.git@master", ""), + ("git+https://git.example.com/MyProject.git@v1.0", ""), + ("git+https://git.example.com/MyProject.git@refs/pull/123/head", ""), + (None, "==={ws}arbitrarystring"), + (None, "(==={ws}arbitrarystring)"), + (None, "=={ws}1.0"), + (None, "(=={ws}1.0)"), + (None, "<={ws}1!3.0.0.rc2"), + (None, ">{ws}2.2{ws},{ws}<{ws}3"), + (None, "(>{ws}2.2{ws},{ws}<{ws}3)"), + ], +) +@pytest.mark.parametrize( + "marker", + [ + None, + "python_version{ws}>={ws}'3.3'", + ( + "sys_platform{ws}!={ws}'linux' and(os_name{ws}=={ws}'linux' or " + "python_version{ws}>={ws}'3.3'{ws}){ws}" + ), + ], +) +@pytest.mark.parametrize("whitespace", ["", " ", "\t"]) +def test_basic_valid_requirement_parsing( + name: str, + extras: Set[str], + specifier: str, + url: Optional[str], + marker: str, + whitespace: str, +) -> None: + # GIVEN + parts = [name] + if extras: + parts.append("[") + parts.append("{ws},{ws}".format(ws=whitespace).join(sorted(extras))) + parts.append("]") + if specifier: + parts.append(specifier.format(ws=whitespace)) + if url is not None: + parts.append("@") + parts.append(url.format(ws=whitespace)) + if marker is not None: + if url is not None: + parts.append(" ;") + else: + parts.append(";") + parts.append(marker.format(ws=whitespace)) + + to_parse = whitespace.join(parts) + + # WHEN + req = Requirement(to_parse) + + # THEN + assert req.name == name + assert req.extras == extras + assert req.url == url + assert req.specifier == specifier.format(ws="").strip("()") + assert req.marker == (Marker(marker.format(ws="")) if marker else None) + + +class TestRequirementParsing: + @pytest.mark.parametrize( + "marker", + [ + "python_implementation == ''", + "platform_python_implementation == ''", + "os.name == 'linux'", + "os_name == 'linux'", + "'8' in platform.version", + "'8' not in platform.version", + ], + ) + def test_valid_marker(self, marker: str) -> None: + # GIVEN + to_parse = f"name; {marker}" + + # WHEN + Requirement(to_parse) + + @pytest.mark.parametrize( + "url", + [ + "file:///absolute/path", + "file://.", + ], + ) + def test_file_url(self, url: str) -> None: + # GIVEN + to_parse = f"name @ {url}" + + # WHEN + req = Requirement(to_parse) + + # THEN + assert req.url == url -class TestRequirements: - def test_string_specifier_marker(self): - requirement = 'name[bar]>=3; python_version == "2.7"' - req = Requirement(requirement) - assert str(req) == requirement + def test_empty_extras(self) -> None: + # GIVEN + to_parse = "name[]" - def test_string_url(self): - requirement = "name@ http://foo.com" - req = Requirement(requirement) - assert str(req) == requirement + # WHEN + req = Requirement(to_parse) - def test_string_url_with_marker(self): - requirement = 'name@ http://foo.com ; extra == "feature"' - req = Requirement(requirement) - assert str(req) == requirement + # THEN + assert req.name == "name" + assert req.extras == set() - def test_repr(self): - req = Requirement("name") - assert repr(req) == "" + def test_empty_specifier(self) -> None: + # GIVEN + to_parse = "name()" - def _assert_requirement( - self, req, name, url=None, extras=[], specifier="", marker=None - ): - assert req.name == name - assert req.url == url - assert sorted(req.extras) == sorted(extras) - assert str(req.specifier) == specifier - if marker: - assert str(req.marker) == marker - else: - assert req.marker is None - - def test_simple_names(self): - for name in ("A", "aa", "name"): - req = Requirement(name) - self._assert_requirement(req, name) - - def test_name_with_other_characters(self): - name = "foo-bar.quux_baz" - req = Requirement(name) - self._assert_requirement(req, name) - - def test_invalid_name(self): - with pytest.raises(InvalidRequirement): - Requirement("foo!") - - def test_name_with_version(self): - req = Requirement("name>=3") - self._assert_requirement(req, "name", specifier=">=3") - - def test_with_legacy_version(self): - with pytest.raises(InvalidRequirement) as e: - Requirement("name==1.0.org1") - assert "Expected semicolon" in str(e) - - def test_with_legacy_version_and_marker(self): - with pytest.raises(InvalidRequirement) as e: - Requirement("name>=1.x.y;python_version=='2.6'") - assert "Expected semicolon" in str(e) - - def test_missing_name(self): - with pytest.raises(InvalidRequirement) as e: - Requirement("@ http://example.com") - assert "package name" in str(e) - - def test_name_with_missing_version(self): - with pytest.raises(InvalidRequirement) as e: - Requirement("name>=") - assert "Expected version after operator" in str(e) - - def test_version_with_parens_and_whitespace(self): - req = Requirement("name (==4)") - self._assert_requirement(req, "name", specifier="==4") - - def test_version_with_missing_closing_paren(self): - with pytest.raises(InvalidRequirement) as e: - Requirement("name(==4") - assert "Expected closing RIGHT_PARENTHESIS" in str(e) - - def test_name_with_multiple_versions(self): - req = Requirement("name>=3,<2") - self._assert_requirement(req, "name", specifier="<2,>=3") - - def test_name_with_multiple_versions_and_whitespace(self): - req = Requirement("name >=2, <3") - self._assert_requirement(req, "name", specifier="<3,>=2") - - def test_name_with_multiple_versions_in_parenthesis(self): - req = Requirement("name (>=2,<3)") - self._assert_requirement(req, "name", specifier="<3,>=2") - - def test_name_with_no_extras_no_versions_in_parenthesis(self): - req = Requirement("name []()") - self._assert_requirement(req, "name", specifier="", extras=[]) - - def test_name_with_extra_and_multiple_versions_in_parenthesis(self): - req = Requirement("name [foo, bar](>=2,<3)") - self._assert_requirement(req, "name", specifier="<3,>=2", extras=["foo", "bar"]) - - def test_name_with_no_versions_in_parenthesis(self): - req = Requirement("name ()") - self._assert_requirement(req, "name", specifier="") - - def test_extras(self): - req = Requirement("foobar [quux,bar]") - self._assert_requirement(req, "foobar", extras=["bar", "quux"]) - - def test_empty_extras(self): - req = Requirement("foo[]") - self._assert_requirement(req, "foo") - - def test_unclosed_extras(self): - with pytest.raises(InvalidRequirement) as e: - Requirement("foo[") - assert "Expected closing RIGHT_BRACKET" in str(e) - - def test_extras_without_comma(self): - with pytest.raises(InvalidRequirement) as e: - Requirement("foobar[quux bar]") - assert "Expected comma between extra names" in str(e) - - def test_url(self): - url_section = "test @ http://example.com" - req = Requirement(url_section) - self._assert_requirement(req, "test", "http://example.com", extras=[]) - - def test_url_and_marker(self): - instring = "test @ http://example.com ; os_name=='a'" - req = Requirement(instring) - self._assert_requirement( - req, "test", "http://example.com", extras=[], marker='os_name == "a"' + # WHEN + req = Requirement(to_parse) + + # THEN + assert req.name == "name" + assert req.specifier == "" + + # ---------------------------------------------------------------------------------- + # Everything below this (in this class) should be parsing failure modes + # ---------------------------------------------------------------------------------- + # Start all method names with with `test_error_` + # to make it easier to run these tests with `-k error` + + def test_error_when_empty_string(self) -> None: + # GIVEN + to_parse = "" + + # WHEN + with pytest.raises(InvalidRequirement) as ctx: + Requirement(to_parse) + + # THEN + assert ctx.exconly() == ( + "packaging.requirements.InvalidRequirement: " + "Expected package name at the start of dependency specifier\n" + " \n" + " ^" ) - def test_invalid_url(self): - with pytest.raises(InvalidRequirement) as e: - Requirement("name @ gopher:/foo/com") - assert "Invalid URL: " in str(e.value) - assert "gopher:/foo/com" in str(e.value) - - def test_file_url(self): - req = Requirement("name @ file:///absolute/path") - self._assert_requirement(req, "name", "file:///absolute/path") - req = Requirement("name @ file://.") - self._assert_requirement(req, "name", "file://.") - - def test_invalid_file_urls(self): - with pytest.raises(InvalidRequirement): - Requirement("name @ file:.") - with pytest.raises(InvalidRequirement): - Requirement("name @ file:/.") - - def test_extras_and_url_and_marker(self): - req = Requirement("name [fred,bar] @ http://foo.com ; python_version=='2.7'") - self._assert_requirement( - req, - "name", - extras=["bar", "fred"], - url="http://foo.com", - marker='python_version == "2.7"', + def test_error_no_name(self) -> None: + # GIVEN + to_parse = "==0.0" + + # WHEN + with pytest.raises(InvalidRequirement) as ctx: + Requirement(to_parse) + + # THEN + assert ctx.exconly() == ( + "packaging.requirements.InvalidRequirement: " + "Expected package name at the start of dependency specifier\n" + " ==0.0\n" + " ^" + ) + + def test_error_when_missing_comma_in_extras(self) -> None: + # GIVEN + to_parse = "name[bar baz]" + + # WHEN + with pytest.raises(InvalidRequirement) as ctx: + Requirement(to_parse) + + # THEN + assert ctx.exconly() == ( + "packaging.requirements.InvalidRequirement: " + "Expected comma between extra names\n" + " name[bar baz]\n" + " ^" ) - def test_complex_url_and_marker(self): - url = "https://example.com/name;v=1.1/?query=foo&bar=baz#blah" - req = Requirement("foo @ %s ; python_version=='3.4'" % url) - self._assert_requirement(req, "foo", url=url, marker='python_version == "3.4"') + def test_error_when_trailing_comma_in_extras(self) -> None: + # GIVEN + to_parse = "name[bar, baz,]" - def test_multiple_markers(self): - req = Requirement( - "name[quux, strange];python_version<'2.7' and " "platform_version=='2'" + # WHEN + with pytest.raises(InvalidRequirement) as ctx: + Requirement(to_parse) + + # THEN + assert ctx.exconly() == ( + "packaging.requirements.InvalidRequirement: " + "Expected extra name after comma\n" + " name[bar, baz,]\n" + " ^" + ) + + def test_error_when_parens_not_closed_correctly(self) -> None: + # GIVEN + to_parse = "name (>= 1.0" + + # WHEN + with pytest.raises(InvalidRequirement) as ctx: + Requirement(to_parse) + + # THEN + assert ctx.exconly() == ( + "packaging.requirements.InvalidRequirement: " + "Expected closing RIGHT_PARENTHESIS\n" + " name (>= 1.0\n" + " ~~~~~~~^" ) - marker = 'python_version < "2.7" and platform_version == "2"' - self._assert_requirement(req, "name", extras=["strange", "quux"], marker=marker) - def test_multiple_comparison_markers(self): - req = Requirement("name; os_name=='a' and os_name=='b' or os_name=='c'") - marker = 'os_name == "a" and os_name == "b" or os_name == "c"' - self._assert_requirement(req, "name", marker=marker) + def test_error_when_bracket_not_closed_correctly(self) -> None: + # GIVEN + to_parse = "name[bar, baz >= 1.0" - def test_invalid_marker(self): - with pytest.raises(InvalidRequirement): - Requirement("name; foobar=='x'") + # WHEN + with pytest.raises(InvalidRequirement) as ctx: + Requirement(to_parse) - def test_marker_with_missing_semicolon(self): - with pytest.raises(InvalidRequirement) as e: - Requirement('name[bar]>=3 python_version == "2.7"') - assert "Expected semicolon" in str(e) + # THEN + assert ctx.exconly() == ( + "packaging.requirements.InvalidRequirement: " + "Expected closing RIGHT_BRACKET\n" + " name[bar, baz >= 1.0\n" + " ~~~~~~~~~~^" + ) - def test_types(self): - req = Requirement("foobar[quux]<2,>=3; os_name=='a'") + def test_error_when_extras_bracket_left_unclosed(self) -> None: + # GIVEN + to_parse = "name[bar, baz" + + # WHEN + with pytest.raises(InvalidRequirement) as ctx: + Requirement(to_parse) + + # THEN + assert ctx.exconly() == ( + "packaging.requirements.InvalidRequirement: " + "Expected closing RIGHT_BRACKET\n" + " name[bar, baz\n" + " ~~~~~~~~~^" + ) + + def test_error_no_space_after_url(self) -> None: + # GIVEN + to_parse = "name @ https://example.com/; extra == 'example'" + + # WHEN + with pytest.raises(InvalidRequirement) as ctx: + Requirement(to_parse) + + # THEN + assert ctx.exconly() == ( + "packaging.requirements.InvalidRequirement: " + "Expected end or semicolon (after URL and whitespace)\n" + " name @ https://example.com/; extra == 'example'\n" + " ~~~~~~~~~~~~~~~~~~~~~~^" + ) + + def test_error_no_url_after_at(self) -> None: + # GIVEN + to_parse = "name @ " + + # WHEN + with pytest.raises(InvalidRequirement) as ctx: + Requirement(to_parse) + + # THEN + assert ctx.exconly() == ( + "packaging.requirements.InvalidRequirement: " + "Expected URL after @\n" + " name @ \n" + " ^" + ) + + def test_error_invalid_marker_lvalue(self) -> None: + # GIVEN + to_parse = "name; invalid_name" + + # WHEN + with pytest.raises(InvalidRequirement) as ctx: + Requirement(to_parse) + + # THEN + assert ctx.exconly() == ( + "packaging.requirements.InvalidRequirement: " + "Expected a marker variable or quoted string\n" + " name; invalid_name\n" + " ^" + ) + + def test_error_invalid_marker_rvalue(self) -> None: + # GIVEN + to_parse = "name; '3.7' <= invalid_name" + + # WHEN + with pytest.raises(InvalidRequirement) as ctx: + Requirement(to_parse) + + # THEN + assert ctx.exconly() == ( + "packaging.requirements.InvalidRequirement: " + "Expected a marker variable or quoted string\n" + " name; '3.7' <= invalid_name\n" + " ^" + ) + + def test_error_invalid_marker_notin_without_whitespace(self) -> None: + # GIVEN + to_parse = "name; '3.7' notin python_version" + + # WHEN + with pytest.raises(InvalidRequirement) as ctx: + Requirement(to_parse) + + # THEN + assert ctx.exconly() == ( + "packaging.requirements.InvalidRequirement: " + "Expected whitespace after 'not'\n" + " name; '3.7' notin python_version\n" + " ^" + ) + + def test_error_invalid_marker_not_without_in(self) -> None: + # GIVEN + to_parse = "name; '3.7' not python_version" + + # WHEN + with pytest.raises(InvalidRequirement) as ctx: + Requirement(to_parse) + + # THEN + assert ctx.exconly() == ( + "packaging.requirements.InvalidRequirement: " + "Expected 'in' after 'not'\n" + " name; '3.7' not python_version\n" + " ^" + ) + + def test_error_invalid_marker_with_invalid_op(self) -> None: + # GIVEN + to_parse = "name; '3.7' ~ python_version" + + # WHEN + with pytest.raises(InvalidRequirement) as ctx: + Requirement(to_parse) + + # THEN + assert ctx.exconly() == ( + "packaging.requirements.InvalidRequirement: " + "Expected marker operator, one of <=, <, !=, ==, >=, >, ~=, ===, " + "not, not in\n" + " name; '3.7' ~ python_version\n" + " ^" + ) + + @pytest.mark.parametrize( + "url", + [ + "gopher:/foo/com", + "file:.", + "file:/.", + ], + ) + def test_error_on_invalid_url(self, url: str) -> None: + # GIVEN + to_parse = f"name @ {url}" + + # WHEN + with pytest.raises(InvalidRequirement) as ctx: + Requirement(to_parse) + + # THEN + assert "Invalid URL" in ctx.exconly() + + def test_error_on_legacy_version_outside_triple_equals(self) -> None: + # GIVEN + to_parse = "name==1.0.org1" + + # WHEN + with pytest.raises(InvalidRequirement) as ctx: + Requirement(to_parse) + + # THEN + assert ctx.exconly() == ( + "packaging.requirements.InvalidRequirement: " + "Expected end or semicolon (after version specifier)\n" + " name==1.0.org1\n" + " ~~~~~^" + ) + + def test_error_on_missing_version_after_op(self) -> None: + # GIVEN + to_parse = "name==" + + # WHEN + with pytest.raises(InvalidRequirement) as ctx: + Requirement(to_parse) + + # THEN + assert ctx.exconly() == ( + "packaging.requirements.InvalidRequirement: " + "Expected version after operator\n" + " name==\n" + " ^" + ) + + def test_error_on_missing_op_after_name(self) -> None: + # GIVEN + to_parse = "name 1.0" + + # WHEN + with pytest.raises(InvalidRequirement) as ctx: + Requirement(to_parse) + + # THEN + assert ctx.exconly() == ( + "packaging.requirements.InvalidRequirement: " + "Expected end or semicolon (after version specifier)\n" + " name 1.0\n" + " ^" + ) + + +class TestRequirementBehaviour: + def test_types_with_nothing(self) -> None: + # GIVEN + to_parse = "foobar" + + # WHEN + req = Requirement(to_parse) + + # THEN assert isinstance(req.name, str) assert isinstance(req.extras, set) assert req.url is None assert isinstance(req.specifier, SpecifierSet) - assert isinstance(req.marker, Marker) + assert req.marker is None + + def test_types_with_specifier_and_marker(self) -> None: + # GIVEN + to_parse = "foobar[quux]<2,>=3; os_name=='a'" - def test_types_with_nothing(self): - req = Requirement("foobar") + # WHEN + req = Requirement(to_parse) + + # THEN assert isinstance(req.name, str) assert isinstance(req.extras, set) assert req.url is None assert isinstance(req.specifier, SpecifierSet) - assert req.marker is None + assert isinstance(req.marker, Marker) - def test_types_with_url(self): + def test_types_with_url(self) -> None: req = Requirement("foobar @ http://foo.com") assert isinstance(req.name, str) assert isinstance(req.extras, set) @@ -220,77 +532,46 @@ def test_types_with_url(self): assert isinstance(req.specifier, SpecifierSet) assert req.marker is None - def test_sys_platform_linux_equal(self): - req = Requirement('something>=1.2.3; sys_platform == "foo"') - - assert req.name == "something" - assert req.marker is not None - assert req.marker.evaluate(dict(sys_platform="foo")) is True - assert req.marker.evaluate(dict(sys_platform="bar")) is False - - def test_sys_platform_linux_in(self): - req = Requirement("aviato>=1.2.3; 'f' in sys_platform") - - assert req.name == "aviato" - assert req.marker is not None - assert req.marker.evaluate(dict(sys_platform="foo")) is True - assert req.marker.evaluate(dict(sys_platform="bar")) is False - - def test_parseexception_error_msg(self): - with pytest.raises(InvalidRequirement) as e: - Requirement("toto 42") - assert "Expected semicolon" in str(e) - - EQUAL_DEPENDENCIES = [ - ("packaging>20.1", "packaging>20.1"), - ( - 'requests[security, tests]>=2.8.1,==2.8.*;python_version<"2.7"', - 'requests [security,tests] >= 2.8.1, == 2.8.* ; python_version < "2.7"', - ), - ( - 'importlib-metadata; python_version<"3.8"', - "importlib-metadata; python_version<'3.8'", - ), - ( - 'appdirs>=1.4.4,<2; os_name=="posix" and extra=="testing"', - "appdirs>=1.4.4,<2; os_name == 'posix' and extra == 'testing'", - ), - ] - - DIFFERENT_DEPENDENCIES = [ - ("packaging>20.1", "packaging>=20.1"), - ("packaging>20.1", "packaging>21.1"), - ("packaging>20.1", "package>20.1"), - ( - 'requests[security,tests]>=2.8.1,==2.8.*;python_version<"2.7"', - 'requests [security,tests] >= 2.8.1 ; python_version < "2.7"', - ), - ( - 'importlib-metadata; python_version<"3.8"', - "importlib-metadata; python_version<'3.7'", - ), - ( - 'appdirs>=1.4.4,<2; os_name=="posix" and extra=="testing"', - "appdirs>=1.4.4,<2; os_name == 'posix' and extra == 'docs'", - ), - ] + @pytest.mark.parametrize( + "url_or_specifier", + ["", "@ https://url ", "!=2.0", "==2.*"], + ) + @pytest.mark.parametrize("extras", ["", "[a]", "[a,b]", "[a1,b1,b2]"]) + @pytest.mark.parametrize( + "marker", + ["", '; python_version == "3.11"', '; "3." not in python_version'], + ) + def test_str_and_repr( + self, extras: str, url_or_specifier: str, marker: str + ) -> None: + # GIVEN + to_parse = f"name{extras}{url_or_specifier}{marker}".strip() + + # WHEN + req = Requirement(to_parse) + + # THEN + assert str(req) == to_parse + assert repr(req) == f"" @pytest.mark.parametrize("dep1, dep2", EQUAL_DEPENDENCIES) - def test_equal_reqs_equal_hashes(self, dep1, dep2): - # Requirement objects created from equivalent strings should be equal. + def test_equal_reqs_equal_hashes(self, dep1: str, dep2: str) -> None: + """Requirement objects created from equivalent strings should be equal.""" + # GIVEN / WHEN req1, req2 = Requirement(dep1), Requirement(dep2) + assert req1 == req2 - # Equal Requirement objects should have the same hash. assert hash(req1) == hash(req2) @pytest.mark.parametrize("dep1, dep2", DIFFERENT_DEPENDENCIES) - def test_different_reqs_different_hashes(self, dep1, dep2): - # Requirement objects created from non-equivalent strings should differ. + def test_different_reqs_different_hashes(self, dep1: str, dep2: str) -> None: + """Requirement objects created from non-equivalent strings should differ.""" + # GIVEN / WHEN req1, req2 = Requirement(dep1), Requirement(dep2) + + # THEN assert req1 != req2 - # Different Requirement objects should have different hashes. assert hash(req1) != hash(req2) - def test_compare_reqs_to_other_objects(self): - # Requirement objects should not be comparable to other kinds of objects. + def test_compare_with_string(self) -> None: assert Requirement("packaging>=21.3") != "packaging>=21.3" From 0399eaf42a4bc9e587ff6762e7572427e7d8c849 Mon Sep 17 00:00:00 2001 From: Pradyun Gedam Date: Tue, 6 Dec 2022 00:24:11 +0000 Subject: [PATCH 17/23] Improve error message for bad version specifiers in `Requirement` This makes it easier to understand what the state of the parser is and what is expected at that point. --- packaging/_parser.py | 8 +++++++- tests/test_requirements.py | 18 +++++++++++++++++- 2 files changed, 24 insertions(+), 2 deletions(-) diff --git a/packaging/_parser.py b/packaging/_parser.py index 378cef88..29cb27d3 100644 --- a/packaging/_parser.py +++ b/packaging/_parser.py @@ -120,7 +120,13 @@ def _parse_requirement_details( return (url, specifier, marker) marker = _parse_requirement_marker( - tokenizer, span_start=specifier_start, after="version specifier" + tokenizer, + span_start=specifier_start, + after=( + "version specifier" + if specifier + else "name and no valid version specifier" + ), ) return (url, specifier, marker) diff --git a/tests/test_requirements.py b/tests/test_requirements.py index 09c963d0..5305fbdf 100644 --- a/tests/test_requirements.py +++ b/tests/test_requirements.py @@ -489,11 +489,27 @@ def test_error_on_missing_op_after_name(self) -> None: # THEN assert ctx.exconly() == ( "packaging.requirements.InvalidRequirement: " - "Expected end or semicolon (after version specifier)\n" + "Expected end or semicolon (after name and no valid version specifier)\n" " name 1.0\n" " ^" ) + def test_error_on_random_char_after_specifier(self) -> None: + # GIVEN + to_parse = "name >= 1.0 #" + + # WHEN + with pytest.raises(InvalidRequirement) as ctx: + Requirement(to_parse) + + # THEN + assert ctx.exconly() == ( + "packaging.requirements.InvalidRequirement: " + "Expected end or semicolon (after version specifier)\n" + " name >= 1.0 #\n" + " ~~~~~~~^" + ) + class TestRequirementBehaviour: def test_types_with_nothing(self) -> None: From 83aae66567665339accf2fb71c31228dd4548ceb Mon Sep 17 00:00:00 2001 From: Pradyun Gedam Date: Tue, 6 Dec 2022 23:48:32 +0000 Subject: [PATCH 18/23] Add `ParserSyntaxError` as the cause of `Invalid{Requirement/Marker}` This ensures that these error tracebacks correctly describe the causality between the two errors. --- packaging/markers.py | 2 +- packaging/requirements.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/packaging/markers.py b/packaging/markers.py index bb2d2a5a..a5bf35c6 100644 --- a/packaging/markers.py +++ b/packaging/markers.py @@ -207,7 +207,7 @@ def __init__(self, marker: str) -> None: # ] # ] except ParserSyntaxError as e: - raise InvalidMarker(str(e)) + raise InvalidMarker(str(e)) from e def __str__(self) -> str: return _format_marker(self._markers) diff --git a/packaging/requirements.py b/packaging/requirements.py index 2140a7f7..a9f9b9c7 100644 --- a/packaging/requirements.py +++ b/packaging/requirements.py @@ -34,7 +34,7 @@ def __init__(self, requirement_string: str) -> None: try: parsed = parse_requirement(requirement_string) except ParserSyntaxError as e: - raise InvalidRequirement(str(e)) + raise InvalidRequirement(str(e)) from e self.name: str = parsed.name if parsed.url: From 163993a619b2bf735815dac9dd1932b845191de1 Mon Sep 17 00:00:00 2001 From: Pradyun Gedam Date: Wed, 7 Dec 2022 00:09:46 +0000 Subject: [PATCH 19/23] Permit whitespace around `marker_atom` This ensures that a marker with whitespace around it is parsed correctly. --- packaging/_parser.py | 9 +++++---- tests/test_requirements.py | 1 + 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/packaging/_parser.py b/packaging/_parser.py index 29cb27d3..d13965d9 100644 --- a/packaging/_parser.py +++ b/packaging/_parser.py @@ -239,12 +239,11 @@ def parse_marker(source: str) -> MarkerList: def _parse_marker_expr(tokenizer: Tokenizer) -> MarkerList: """ - marker_expr = marker_atom (BOOLOP WS? marker_atom)+ + marker_expr = marker_atom (BOOLOP marker_atom)+ """ expression = [_parse_marker_atom(tokenizer)] while tokenizer.check("BOOLOP"): token = tokenizer.read() - tokenizer.consume("WS") expr_right = _parse_marker_atom(tokenizer) expression.extend((token.text, expr_right)) return expression @@ -252,10 +251,11 @@ def _parse_marker_expr(tokenizer: Tokenizer) -> MarkerList: def _parse_marker_atom(tokenizer: Tokenizer) -> MarkerAtom: """ - marker_atom = LEFT_PARENTHESIS WS? marker_expr WS? RIGHT_PARENTHESIS - | marker_item + marker_atom = WS? LEFT_PARENTHESIS WS? marker_expr WS? RIGHT_PARENTHESIS WS? + | WS? marker_item WS? """ + tokenizer.consume("WS") if tokenizer.check("LEFT_PARENTHESIS", peek=True): with tokenizer.enclosing_tokens("LEFT_PARENTHESIS", "RIGHT_PARENTHESIS"): tokenizer.consume("WS") @@ -263,6 +263,7 @@ def _parse_marker_atom(tokenizer: Tokenizer) -> MarkerAtom: tokenizer.consume("WS") else: marker = _parse_marker_item(tokenizer) + tokenizer.consume("WS") return marker diff --git a/tests/test_requirements.py b/tests/test_requirements.py index 5305fbdf..86516fd7 100644 --- a/tests/test_requirements.py +++ b/tests/test_requirements.py @@ -92,6 +92,7 @@ [ None, "python_version{ws}>={ws}'3.3'", + '({ws}python_version{ws}>={ws}"3.4"{ws}){ws}and extra{ws}=={ws}"oursql"', ( "sys_platform{ws}!={ws}'linux' and(os_name{ws}=={ws}'linux' or " "python_version{ws}>={ws}'3.3'{ws}){ws}" From fa4b69d22083147dd7e473aa16e7475294cc5cbf Mon Sep 17 00:00:00 2001 From: Pradyun Gedam Date: Wed, 7 Dec 2022 00:10:45 +0000 Subject: [PATCH 20/23] Rename `marker_expr` to `marker` This is better aligned with the naming from PEP 508. --- packaging/_parser.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/packaging/_parser.py b/packaging/_parser.py index d13965d9..103f679a 100644 --- a/packaging/_parser.py +++ b/packaging/_parser.py @@ -147,7 +147,7 @@ def _parse_requirement_marker( else: tokenizer.read() - marker = _parse_marker_expr(tokenizer) + marker = _parse_marker(tokenizer) tokenizer.consume("WS") return marker @@ -234,12 +234,12 @@ def _parse_version_many(tokenizer: Tokenizer) -> str: # Recursive descent parser for marker expression # -------------------------------------------------------------------------------------- def parse_marker(source: str) -> MarkerList: - return _parse_marker_expr(Tokenizer(source, rules=DEFAULT_RULES)) + return _parse_marker(Tokenizer(source, rules=DEFAULT_RULES)) -def _parse_marker_expr(tokenizer: Tokenizer) -> MarkerList: +def _parse_marker(tokenizer: Tokenizer) -> MarkerList: """ - marker_expr = marker_atom (BOOLOP marker_atom)+ + marker = marker_atom (BOOLOP marker_atom)+ """ expression = [_parse_marker_atom(tokenizer)] while tokenizer.check("BOOLOP"): @@ -251,7 +251,7 @@ def _parse_marker_expr(tokenizer: Tokenizer) -> MarkerList: def _parse_marker_atom(tokenizer: Tokenizer) -> MarkerAtom: """ - marker_atom = WS? LEFT_PARENTHESIS WS? marker_expr WS? RIGHT_PARENTHESIS WS? + marker_atom = WS? LEFT_PARENTHESIS WS? marker WS? RIGHT_PARENTHESIS WS? | WS? marker_item WS? """ @@ -259,7 +259,7 @@ def _parse_marker_atom(tokenizer: Tokenizer) -> MarkerAtom: if tokenizer.check("LEFT_PARENTHESIS", peek=True): with tokenizer.enclosing_tokens("LEFT_PARENTHESIS", "RIGHT_PARENTHESIS"): tokenizer.consume("WS") - marker: MarkerAtom = _parse_marker_expr(tokenizer) + marker: MarkerAtom = _parse_marker(tokenizer) tokenizer.consume("WS") else: marker = _parse_marker_item(tokenizer) From ff75da700a4f29302a1f4217774cf5bde8259906 Mon Sep 17 00:00:00 2001 From: Pradyun Gedam Date: Wed, 7 Dec 2022 00:44:17 +0000 Subject: [PATCH 21/23] Enforce word boundaries in operators and names This ensures that these are only parsed when they're independent words. --- packaging/_tokenizer.py | 12 ++++++------ tests/test_requirements.py | 22 ++++++++++++++++++++-- 2 files changed, 26 insertions(+), 8 deletions(-) diff --git a/packaging/_tokenizer.py b/packaging/_tokenizer.py index da6e7b06..de389cb3 100644 --- a/packaging/_tokenizer.py +++ b/packaging/_tokenizer.py @@ -53,12 +53,12 @@ def __str__(self) -> str: re.VERBOSE, ), "OP": r"(===|==|~=|!=|<=|>=|<|>)", - "BOOLOP": r"(or|and)", - "IN": r"in", - "NOT": r"not", + "BOOLOP": r"\b(or|and)\b", + "IN": r"\bin\b", + "NOT": r"\bnot\b", "VARIABLE": re.compile( r""" - ( + \b( python_version |python_full_version |os[._]name @@ -68,14 +68,14 @@ def __str__(self) -> str: |python_implementation |implementation_(name|version) |extra - ) + )\b """, re.VERBOSE, ), "VERSION": re.compile(Specifier._version_regex_str, re.VERBOSE | re.IGNORECASE), "AT": r"\@", "URL": r"[^ \t]+", - "IDENTIFIER": r"[a-zA-Z0-9][a-zA-Z0-9._-]*", + "IDENTIFIER": r"\b[a-zA-Z0-9][a-zA-Z0-9._-]*\b", "WS": r"[ \t]+", "END": r"$", } diff --git a/tests/test_requirements.py b/tests/test_requirements.py index 86516fd7..93839d69 100644 --- a/tests/test_requirements.py +++ b/tests/test_requirements.py @@ -390,9 +390,27 @@ def test_error_invalid_marker_notin_without_whitespace(self) -> None: # THEN assert ctx.exconly() == ( "packaging.requirements.InvalidRequirement: " - "Expected whitespace after 'not'\n" + "Expected marker operator, one of <=, <, !=, ==, >=, >, ~=, ===, " + "not, not in\n" " name; '3.7' notin python_version\n" - " ^" + " ^" + ) + + def test_error_when_no_word_boundary(self) -> None: + # GIVEN + to_parse = "name; '3.6'inpython_version" + + # WHEN + with pytest.raises(InvalidRequirement) as ctx: + Requirement(to_parse) + + # THEN + assert ctx.exconly() == ( + "packaging.requirements.InvalidRequirement: " + "Expected marker operator, one of <=, <, !=, ==, >=, >, ~=, ===, " + "not, not in\n" + " name; '3.6'inpython_version\n" + " ^" ) def test_error_invalid_marker_not_without_in(self) -> None: From 494585628da5e6eed2dfa10150070dc73887fe76 Mon Sep 17 00:00:00 2001 From: Pradyun Gedam Date: Wed, 7 Dec 2022 00:45:02 +0000 Subject: [PATCH 22/23] Fix a typo in an error message The listed operators were incorrect. --- packaging/_parser.py | 2 +- tests/test_requirements.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/packaging/_parser.py b/packaging/_parser.py index 103f679a..d03119a1 100644 --- a/packaging/_parser.py +++ b/packaging/_parser.py @@ -327,5 +327,5 @@ def _parse_marker_op(tokenizer: Tokenizer) -> Op: else: return tokenizer.raise_syntax_error( "Expected marker operator, one of " - "<=, <, !=, ==, >=, >, ~=, ===, not, not in" + "<=, <, !=, ==, >=, >, ~=, ===, in, not in" ) diff --git a/tests/test_requirements.py b/tests/test_requirements.py index 93839d69..92cf9ce6 100644 --- a/tests/test_requirements.py +++ b/tests/test_requirements.py @@ -391,7 +391,7 @@ def test_error_invalid_marker_notin_without_whitespace(self) -> None: assert ctx.exconly() == ( "packaging.requirements.InvalidRequirement: " "Expected marker operator, one of <=, <, !=, ==, >=, >, ~=, ===, " - "not, not in\n" + "in, not in\n" " name; '3.7' notin python_version\n" " ^" ) @@ -408,7 +408,7 @@ def test_error_when_no_word_boundary(self) -> None: assert ctx.exconly() == ( "packaging.requirements.InvalidRequirement: " "Expected marker operator, one of <=, <, !=, ==, >=, >, ~=, ===, " - "not, not in\n" + "in, not in\n" " name; '3.6'inpython_version\n" " ^" ) @@ -441,7 +441,7 @@ def test_error_invalid_marker_with_invalid_op(self) -> None: assert ctx.exconly() == ( "packaging.requirements.InvalidRequirement: " "Expected marker operator, one of <=, <, !=, ==, >=, >, ~=, ===, " - "not, not in\n" + "in, not in\n" " name; '3.7' ~ python_version\n" " ^" ) From 7869a1aff208364044b3761dad206013ff9df179 Mon Sep 17 00:00:00 2001 From: Pradyun Gedam Date: Wed, 7 Dec 2022 01:10:40 +0000 Subject: [PATCH 23/23] Permit arbitrary whitespace around versions specifier in parenthesis This is more consistent with the rest of the format which is largely whitespace agnostic. --- packaging/_parser.py | 6 ++++-- tests/test_requirements.py | 4 ++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/packaging/_parser.py b/packaging/_parser.py index d03119a1..4dcf03d1 100644 --- a/packaging/_parser.py +++ b/packaging/_parser.py @@ -197,11 +197,13 @@ def _parse_extras_list(tokenizer: Tokenizer) -> List[str]: def _parse_specifier(tokenizer: Tokenizer) -> str: """ - specifier = LEFT_PARENTHESIS version_many RIGHT_PARENTHESIS - | version_many + specifier = LEFT_PARENTHESIS WS? version_many WS? RIGHT_PARENTHESIS + | WS? version_many WS? """ with tokenizer.enclosing_tokens("LEFT_PARENTHESIS", "RIGHT_PARENTHESIS"): + tokenizer.consume("WS") parsed_specifiers = _parse_version_many(tokenizer) + tokenizer.consume("WS") return parsed_specifiers diff --git a/tests/test_requirements.py b/tests/test_requirements.py index 92cf9ce6..36ea474b 100644 --- a/tests/test_requirements.py +++ b/tests/test_requirements.py @@ -79,9 +79,9 @@ ("git+https://git.example.com/MyProject.git@v1.0", ""), ("git+https://git.example.com/MyProject.git@refs/pull/123/head", ""), (None, "==={ws}arbitrarystring"), - (None, "(==={ws}arbitrarystring)"), + (None, "({ws}==={ws}arbitrarystring{ws})"), (None, "=={ws}1.0"), - (None, "(=={ws}1.0)"), + (None, "({ws}=={ws}1.0{ws})"), (None, "<={ws}1!3.0.0.rc2"), (None, ">{ws}2.2{ws},{ws}<{ws}3"), (None, "(>{ws}2.2{ws},{ws}<{ws}3)"),