Skip to content

Commit

Permalink
fix: Another approach to parse tag string with spaces to avoid long r…
Browse files Browse the repository at this point in the history
…unning time on regex matching. (#7625)

* fix: Another approach to parse tag string with spaces to avoid long running time on regex matching.

* Format
  • Loading branch information
dkphm authored Nov 27, 2024
1 parent 6f750a8 commit 575253a
Show file tree
Hide file tree
Showing 2 changed files with 124 additions and 14 deletions.
130 changes: 116 additions & 14 deletions samcli/cli/types.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
LOG = logging.getLogger(__name__)


def _generate_match_regex(match_pattern, delim):
def _generate_match_regex(match_pattern, delim=None):
"""
Creates a regex string based on a match pattern (also a regex) that is to be
run on a string (which may contain escaped quotes) that is separated by delimiters.
Expand All @@ -32,13 +32,13 @@ def _generate_match_regex(match_pattern, delim):
str: regex expression
"""
result = f"""(\\"(?:\\\\{match_pattern}|[^\\"\\\\]+)*\\"|""" + f"""\'(?:\\\\{match_pattern}|[^\'\\\\]+)*\'"""

# Non capturing groups reduces duplicates in groups, but does not reduce matches.
return (
f"""(\\"(?:\\\\{match_pattern}|[^\\"\\\\]+)*\\"|"""
+ f"""\'(?:\\\\{match_pattern}|[^\'\\\\]+)*\'|"""
+ f"""(?:\\\\{match_pattern}|[^{delim}\\"\\\\]+)+)"""
)
if delim is not None:
# Non capturing groups reduces duplicates in groups, but does not reduce matches.
return result + f"""|(?:\\\\{match_pattern}|[^{delim}\\"\\\\]+)+)"""
else:
return result + ")"


def _unquote_wrapped_quotes(value):
Expand Down Expand Up @@ -194,6 +194,7 @@ def __init__(self, multiple_values_per_key=False):
TAG_REGEX = '[A-Za-z0-9\\"_:\\.\\/\\+-\\@=]'

_pattern = r"{tag}={tag}".format(tag=_generate_match_regex(match_pattern=TAG_REGEX, delim=" "))
_quoted_pattern = _generate_match_regex(match_pattern=TAG_REGEX)

name = "string,list"

Expand Down Expand Up @@ -222,13 +223,7 @@ def convert(self, value, param, ctx):
for k in tags:
self._add_value(result, _unquote_wrapped_quotes(k), _unquote_wrapped_quotes(tags[k]))
else:
groups = re.findall(self._pattern, val)

if not groups:
fail = True
for group in groups:
key, v = group
self._add_value(result, _unquote_wrapped_quotes(key), _unquote_wrapped_quotes(v))
fail = not self._parse_key_value_pair(result, val)

if fail:
return self.fail(
Expand All @@ -239,6 +234,66 @@ def convert(self, value, param, ctx):

return result

def _parse_key_value_pair(self, result: dict, key_value_string: str):
"""
This method processes a string in the format "'key1'='value1','key2'='value2'",
where spaces may exist within keys or values.
To optimize performance, the parsing is divided into two stages:
Stage 1: Optimized Parsing
1. Identify quoted strings containing spaces within values.
2. Temporarily replace spaces in these strings with a placeholder (e.g., "_").
3. Use a fast, standard parser to extract key-value pairs, as no spaces are expected.
4. Restore original spaces in the extracted key-value pairs.
Stage 2: Fallback Parsing
If Stage 1 fails to parse the string correctly,run against a comprehensive regex pattern
{tag}={tag}) to parse the entire string.
Parameters
----------
result: result dict
key_value_string: string to parse
Returns
-------
boolean - parse result
"""
parse_result = True

# Unquote an entire string
modified_val = _unquote_wrapped_quotes(key_value_string)

# Looking for a quote strings that contain spaces and proceed to replace them
quoted_strings_with_spaces = re.findall(self._quoted_pattern, modified_val)
quoted_strings_with_spaces_objects = [
TextWithSpaces(str_with_spaces) for str_with_spaces in quoted_strings_with_spaces
]
for s, replacement in zip(quoted_strings_with_spaces, quoted_strings_with_spaces_objects):
modified_val = modified_val.replace(s, replacement.replace_spaces())

# Use default parser to parse key=value
tags = self._multiple_space_separated_key_value_parser(modified_val)
if tags is not None:
for key, value in tags.items():
new_value = value
text_objects = [obj for obj in quoted_strings_with_spaces_objects if obj.modified_text == value]
if len(text_objects) > 0:
new_value = text_objects[0].restore_spaces()
self._add_value(result, _unquote_wrapped_quotes(key), _unquote_wrapped_quotes(new_value))
else:
# Otherwise, fall back to the original mechanism.
groups = re.findall(self._pattern, key_value_string)

if not groups:
parse_result = False
for group in groups:
key, v = group
self._add_value(result, _unquote_wrapped_quotes(key), _unquote_wrapped_quotes(v))

return parse_result

def _add_value(self, result: dict, key: str, new_value: str):
"""
Add a given value to a given key in the result map.
Expand Down Expand Up @@ -286,6 +341,22 @@ def _space_separated_key_value_parser(tag_value):
tags_dict = {**tags_dict, **parsed_tag}
return True, tags_dict

@staticmethod
def _multiple_space_separated_key_value_parser(tag_value):
"""
Method to parse space separated `Key1=Value1 Key2=Value2` type tags without using regex.
Parameters
----------
tag_value
"""
tags_dict = {}
for value in tag_value.split():
parsed, parsed_tag = CfnTags._standard_key_value_parser(value)
if not parsed:
return None
tags_dict.update(parsed_tag)
return tags_dict


class SigningProfilesOptionType(click.ParamType):
"""
Expand Down Expand Up @@ -560,3 +631,34 @@ def convert(
)

return {resource_id: [excluded_path]}


class TextWithSpaces:
def __init__(self, text) -> None:
self.text = text
self.modified_text = text
self.space_positions = [] # type: List[int]

def replace_spaces(self, replacement="_"):
"""
Replace spaces in a text with a replacement together with its original locations.
Input: "test 1"
Output: "test_1" [4]
"""
self.space_positions = [i for i, char in enumerate(self.text) if char == " "]
self.modified_text = self.text.replace(" ", replacement)

return self.modified_text

def restore_spaces(self):
"""
Restore spaces in a text from a original space locations.
Input: "test_1" [4]
Output: "test 1"
"""
text_list = list(self.modified_text)

for pos in self.space_positions:
text_list[pos] = " "

return "".join(text_list)
8 changes: 8 additions & 0 deletions tests/unit/cli/test_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -238,6 +238,14 @@ def test_must_fail_on_invalid_format(self, input):
["stage=int", "company:application=awesome-service", "company:department=engineering"],
{"stage": "int", "company:application": "awesome-service", "company:department": "engineering"},
),
# input as string with multiple key-values including spaces
(('tag1="son of anton" tag2="company abc"',), {"tag1": "son of anton", "tag2": "company abc"}),
(('tag1="son of anton" tag2="company abc"',), {"tag1": "son of anton", "tag2": "company abc"}),
(('\'tag1="son of anton" tag2="company abc"\'',), {"tag1": "son of anton", "tag2": "company abc"}),
(
('tag1="son of anton" tag2="company abc" tag:3="dummy tag"',),
{"tag1": "son of anton", "tag2": "company abc", "tag:3": "dummy tag"},
),
]
)
def test_successful_parsing(self, input, expected):
Expand Down

0 comments on commit 575253a

Please sign in to comment.