From 081e82a794aee0cbec850f598af4e3c3d1b5eff9 Mon Sep 17 00:00:00 2001 From: Khoi Pham Date: Tue, 29 Oct 2024 11:42:26 -0700 Subject: [PATCH 1/2] fix: Another approach to parse tag string with spaces to avoid long running time on regex matching. --- samcli/cli/types.py | 131 +++++++++++++++++++++++++++++++---- tests/unit/cli/test_types.py | 8 +++ 2 files changed, 124 insertions(+), 15 deletions(-) diff --git a/samcli/cli/types.py b/samcli/cli/types.py index dc37420047..695a6d9172 100644 --- a/samcli/cli/types.py +++ b/samcli/cli/types.py @@ -16,8 +16,7 @@ LOG = logging.getLogger(__name__) - -def _generate_match_regex(match_pattern, delim): +def _generate_match_regex(match_pattern, delim = None): """ Creates a regex string based on a match pattern (also a regex) that is to be run on a string (which may contain escaped quotes) that is separated by delimiters. @@ -32,13 +31,14 @@ def _generate_match_regex(match_pattern, delim): str: regex expression """ + result = (f"""(\\"(?:\\\\{match_pattern}|[^\\"\\\\]+)*\\"|""" + + f"""\'(?:\\\\{match_pattern}|[^\'\\\\]+)*\'""") - # Non capturing groups reduces duplicates in groups, but does not reduce matches. - return ( - f"""(\\"(?:\\\\{match_pattern}|[^\\"\\\\]+)*\\"|""" - + f"""\'(?:\\\\{match_pattern}|[^\'\\\\]+)*\'|""" - + f"""(?:\\\\{match_pattern}|[^{delim}\\"\\\\]+)+)""" - ) + if delim is not None: + # Non capturing groups reduces duplicates in groups, but does not reduce matches. + return result + f"""|(?:\\\\{match_pattern}|[^{delim}\\"\\\\]+)+)""" + else: + return result + ")" def _unquote_wrapped_quotes(value): @@ -194,6 +194,7 @@ def __init__(self, multiple_values_per_key=False): TAG_REGEX = '[A-Za-z0-9\\"_:\\.\\/\\+-\\@=]' _pattern = r"{tag}={tag}".format(tag=_generate_match_regex(match_pattern=TAG_REGEX, delim=" ")) + _quoted_pattern = _generate_match_regex(match_pattern=TAG_REGEX) name = "string,list" @@ -222,13 +223,7 @@ def convert(self, value, param, ctx): for k in tags: self._add_value(result, _unquote_wrapped_quotes(k), _unquote_wrapped_quotes(tags[k])) else: - groups = re.findall(self._pattern, val) - - if not groups: - fail = True - for group in groups: - key, v = group - self._add_value(result, _unquote_wrapped_quotes(key), _unquote_wrapped_quotes(v)) + fail = not self._parse_key_value_pair(result, val) if fail: return self.fail( @@ -239,6 +234,66 @@ def convert(self, value, param, ctx): return result + def _parse_key_value_pair(self, result: dict, key_value_string: str): + """ + This method processes a string in the format "'key1'='value1','key2'='value2'", + where spaces may exist within keys or values. + + To optimize performance, the parsing is divided into two stages: + + Stage 1: Optimized Parsing + 1. Identify quoted strings containing spaces within values. + 2. Temporarily replace spaces in these strings with a placeholder (e.g., "_"). + 3. Use a fast, standard parser to extract key-value pairs, as no spaces are expected. + 4. Restore original spaces in the extracted key-value pairs. + + Stage 2: Fallback Parsing + If Stage 1 fails to parse the string correctly,run against a comprehensive regex pattern + {tag}={tag}) to parse the entire string. + + Parameters + ---------- + result: result dict + key_value_string: string to parse + + Returns + ------- + boolean - parse result + """ + parse_result = True + + # Unquote an entire string + modified_val = _unquote_wrapped_quotes(key_value_string) + + # Looking for a quote strings that contain spaces and proceed to replace them + quoted_strings_with_spaces = re.findall(self._quoted_pattern, modified_val) + quoted_strings_with_spaces_objects = [ + TextWithSpaces(str_with_spaces) for str_with_spaces in quoted_strings_with_spaces + ] + for s, replacement in zip(quoted_strings_with_spaces, quoted_strings_with_spaces_objects): + modified_val = modified_val.replace(s, replacement.replace_spaces()) + + # Use default parser to parse key=value + tags = self._multiple_space_separated_key_value_parser(modified_val) + if tags is not None: + for key, value in tags.items(): + new_value = value + text_objects = [obj for obj in quoted_strings_with_spaces_objects if obj.modified_text == value] + if len(text_objects) > 0: + new_value = text_objects[0].restore_spaces() + self._add_value(result, _unquote_wrapped_quotes(key), _unquote_wrapped_quotes(new_value)) + else: + # Otherwise, fall back to the original mechanism. + groups = re.findall(self._pattern, key_value_string) + + if not groups: + parse_result = False + for group in groups: + key, v = group + self._add_value(result, _unquote_wrapped_quotes(key), _unquote_wrapped_quotes(v)) + + return parse_result + def _add_value(self, result: dict, key: str, new_value: str): """ Add a given value to a given key in the result map. @@ -286,6 +341,21 @@ def _space_separated_key_value_parser(tag_value): tags_dict = {**tags_dict, **parsed_tag} return True, tags_dict + @staticmethod + def _multiple_space_separated_key_value_parser(tag_value): + """ + Method to parse space separated `Key1=Value1 Key2=Value2` type tags without using regex. + Parameters + ---------- + tag_value + """ + tags_dict = {} + for value in tag_value.split(): + parsed, parsed_tag = CfnTags._standard_key_value_parser(value) + if not parsed: + return None + tags_dict.update(parsed_tag) + return tags_dict class SigningProfilesOptionType(click.ParamType): """ @@ -560,3 +630,34 @@ def convert( ) return {resource_id: [excluded_path]} + + +class TextWithSpaces: + def __init__(self, text) -> None: + self.text = text + self.modified_text = text + self.space_positions = [] # type: List[int] + + def replace_spaces(self, replacement="_"): + """ + Replace spaces in a text with a replacement together with its original locations. + Input: "test 1" + Output: "test_1" [4] + """ + self.space_positions = [i for i, char in enumerate(self.text) if char == " "] + self.modified_text = self.text.replace(" ", replacement) + + return self.modified_text + + def restore_spaces(self): + """ + Restore spaces in a text from a original space locations. + Input: "test_1" [4] + Output: "test 1" + """ + text_list = list(self.modified_text) + + for pos in self.space_positions: + text_list[pos] = " " + + return "".join(text_list) diff --git a/tests/unit/cli/test_types.py b/tests/unit/cli/test_types.py index 8edab7cb9b..f6943107f6 100644 --- a/tests/unit/cli/test_types.py +++ b/tests/unit/cli/test_types.py @@ -238,6 +238,14 @@ def test_must_fail_on_invalid_format(self, input): ["stage=int", "company:application=awesome-service", "company:department=engineering"], {"stage": "int", "company:application": "awesome-service", "company:department": "engineering"}, ), + # input as string with multiple key-values including spaces + (('tag1="son of anton" tag2="company abc"',), {"tag1": "son of anton", "tag2": "company abc"}), + (('tag1="son of anton" tag2="company abc"',), {"tag1": "son of anton", "tag2": "company abc"}), + (('\'tag1="son of anton" tag2="company abc"\'',), {"tag1": "son of anton", "tag2": "company abc"}), + ( + ('tag1="son of anton" tag2="company abc" tag:3="dummy tag"',), + {"tag1": "son of anton", "tag2": "company abc", "tag:3": "dummy tag"}, + ), ] ) def test_successful_parsing(self, input, expected): From afcd99675808428946c730e1a39b372beb67a45d Mon Sep 17 00:00:00 2001 From: Khoi Pham Date: Tue, 26 Nov 2024 10:39:46 -0800 Subject: [PATCH 2/2] Format --- samcli/cli/types.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/samcli/cli/types.py b/samcli/cli/types.py index 695a6d9172..a3743128ad 100644 --- a/samcli/cli/types.py +++ b/samcli/cli/types.py @@ -16,7 +16,8 @@ LOG = logging.getLogger(__name__) -def _generate_match_regex(match_pattern, delim = None): + +def _generate_match_regex(match_pattern, delim=None): """ Creates a regex string based on a match pattern (also a regex) that is to be run on a string (which may contain escaped quotes) that is separated by delimiters. @@ -31,8 +32,7 @@ def _generate_match_regex(match_pattern, delim = None): str: regex expression """ - result = (f"""(\\"(?:\\\\{match_pattern}|[^\\"\\\\]+)*\\"|""" - + f"""\'(?:\\\\{match_pattern}|[^\'\\\\]+)*\'""") + result = f"""(\\"(?:\\\\{match_pattern}|[^\\"\\\\]+)*\\"|""" + f"""\'(?:\\\\{match_pattern}|[^\'\\\\]+)*\'""" if delim is not None: # Non capturing groups reduces duplicates in groups, but does not reduce matches. @@ -357,6 +357,7 @@ def _multiple_space_separated_key_value_parser(tag_value): tags_dict.update(parsed_tag) return tags_dict + class SigningProfilesOptionType(click.ParamType): """ Custom parameter type to parse Signing Profile options