From 35793aa21c843394341104aff0ccdb393e4f7c68 Mon Sep 17 00:00:00 2001 From: h1alexbel Date: Tue, 8 Oct 2024 10:12:52 +0300 Subject: [PATCH] feat(#130): failed to parse --- sr-data/src/sr_data/steps/maven.py | 88 ++++++++++++++++-------------- 1 file changed, 46 insertions(+), 42 deletions(-) diff --git a/sr-data/src/sr_data/steps/maven.py b/sr-data/src/sr_data/steps/maven.py index a37f615a..81b3308f 100644 --- a/sr-data/src/sr_data/steps/maven.py +++ b/sr-data/src/sr_data/steps/maven.py @@ -1,7 +1,6 @@ """ Collect maven information for each repo. """ -import xml.dom.minidom # The MIT License (MIT) # # Copyright (c) 2024 Aliaksei Bialiauski @@ -24,6 +23,8 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. import xml.etree.ElementTree as ET +from xml.etree.ElementTree import ParseError +import xml.dom.minidom import pandas as pd import requests @@ -90,48 +91,51 @@ def merge(build, repo): for project in build: path = project["path"] logger.debug(f"Checking {repo}: {path}") - root = ET.fromstring(project["content"]) - pretty = "\n".join( - [ - line for line - in xml.dom.minidom.parseString(ET.tostring(root)).toprettyxml(indent=" ").splitlines() - if line.strip() - ] - ) - logger.debug(f"{path}:\n{pretty}") - if len( - root.findall( - ".//pom:dependency[pom:groupId='@project.groupId@']", - namespaces - ) - ) > 0: - logger.info(f"Skipping {path}, since it contains @project dependency") - else: - profile = {} - packaging = root.find(".//pom:packaging", namespaces) - if packaging is not None: - packgs.append(packaging.text) + try: + root = ET.fromstring(project["content"]) + pretty = "\n".join( + [ + line for line + in xml.dom.minidom.parseString(ET.tostring(root)).toprettyxml(indent=" ").splitlines() + if line.strip() + ] + ) + logger.debug(f"{path}:\n{pretty}") + if len( + root.findall( + ".//pom:dependency[pom:groupId='@project.groupId@']", + namespaces + ) + ) > 0: + logger.info(f"Skipping {path}, since it contains @project dependency") else: - packgs.append("jar") - for plugin in root.findall(".//pom:plugin", namespaces): - group = plugin.find("./pom:groupId", namespaces) - artifact = plugin.find("./pom:artifactId", namespaces) - if group is not None: - plugins.append(f"{group.text}:{artifact.text}") - elif artifact is not None: - plugins.append(artifact.text) - good.append(profile) - used = len(good) - logger.info(f"Found {used} good Maven projects in {repo}") - return { - "projects": used, - "plugins": sorted(list(set(plugins))), - "packages": { - "wars": len(list(filter(lambda p: p == "war", packgs))), - "jars": len(list(filter(lambda p: p == "jar", packgs))), - "poms": len(list(filter(lambda p: p == "pom", packgs))) - } - } + profile = {} + packaging = root.find(".//pom:packaging", namespaces) + if packaging is not None: + packgs.append(packaging.text) + else: + packgs.append("jar") + for plugin in root.findall(".//pom:plugin", namespaces): + group = plugin.find("./pom:groupId", namespaces) + artifact = plugin.find("./pom:artifactId", namespaces) + if group is not None: + plugins.append(f"{group.text}:{artifact.text}") + elif artifact is not None: + plugins.append(artifact.text) + good.append(profile) + used = len(good) + logger.info(f"Found {used} good Maven projects in {repo}") + return { + "projects": used, + "plugins": sorted(list(set(plugins))), + "packages": { + "wars": len(list(filter(lambda p: p == "war", packgs))), + "jars": len(list(filter(lambda p: p == "jar", packgs))), + "poms": len(list(filter(lambda p: p == "pom", packgs))) + } + } + except ParseError: + logger.warning(f"Failed to parse {repo}: {path}. Probably XML is broken") def request(token, repo) -> Response: