From fe87fd2877f62407d390929f9cde8e7dd5706bc2 Mon Sep 17 00:00:00 2001 From: Matt Willis <88482857+mew-nsc@users.noreply.github.com> Date: Sat, 11 Dec 2021 08:45:35 +0000 Subject: [PATCH 1/2] Initial cut of the more generic jchat importer -> Failure tests not passing yet --- importers/jchat_importer.py | 19 ++++++++----------- .../jchat_files/combined_format.html | 6 ++++++ tests/test_load_jchat.py | 8 +++++--- 3 files changed, 19 insertions(+), 14 deletions(-) diff --git a/importers/jchat_importer.py b/importers/jchat_importer.py index c08d7952c..5933c804c 100644 --- a/importers/jchat_importer.py +++ b/importers/jchat_importer.py @@ -122,21 +122,24 @@ def _read_message_div(self, div, data_store, datafile, change_id): # Older format uses id= # Newer format uses msgid= msg_id = div.attrib["msgid"] # Grabbing ID to help with error reporting - version = JCHAT_MODERN except KeyError: try: msg_id = div.attrib["id"] - version = JCHAT_LEGACY except KeyError: # Ignore any non-comment messages (e.g. connect/disconnect) return - time_element = div.find("{*}tt/font") - # Sample data included some "Marker" messages with the id="marker" if str.upper(msg_id) == "MARKER": return # Ignore these messages + text_blocks = [] + text_blocks.append([item for item in div.findall(".//*") if item.text]) + if text_blocks[0]: + time_element = text_blocks[0][0] + platform_element = text_blocks[0][1] + msg_content_element = text_blocks[0][2:] + if time_element is None: self.errors.append( {self.error_type: f"Unable to read message {msg_id}. No timestamp provided"} @@ -147,11 +150,6 @@ def _read_message_div(self, div, data_store, datafile, change_id): timestamp = self.parse_timestamp(time_string, msg_id) time_element.record(self.name, "timestamp", timestamp) - if version == JCHAT_LEGACY: - platform_element = div.find("{*}b/a/font") - else: # version == JCHAT_MODERN - platform_element = div.find("{*}b/font/a") - if platform_element is None: self.errors.append( {self.error_type: f"Unable to read message {msg_id}. No platform provided"} @@ -162,14 +160,13 @@ def _read_message_div(self, div, data_store, datafile, change_id): # Match on quadgraphs platform = self.get_cached_platform_from_quad(data_store, platform_quad, change_id) - msg_content_element = [element for element in div.iterfind("font")] - if not msg_content_element: self.errors.append( {self.error_type: f"Unable to read message {msg_id}. No message provided"} ) return msg_content = self.parse_message_content(msg_content_element) + print(msg_content) if not msg_content: self.errors.append({self.error_type: f"Unable to parse JChat message {msg_id}."}) return diff --git a/tests/sample_data/jchat_files/combined_format.html b/tests/sample_data/jchat_files/combined_format.html index 505db8e57..dd1e2e1df 100644 --- a/tests/sample_data/jchat_files/combined_format.html +++ b/tests/sample_data/jchat_files/combined_format.html @@ -22,5 +22,11 @@
[08010809A]ABCD_CMDLegacy - font a swap - has i tag - no breaks
+
+ [23112654A]SPLA_ABModern 2 -
no i tag
but has multiple
breaks
+
+
+ [06010709A]SPLB_XOModern 2 - no i tag - no breaks +
\ No newline at end of file diff --git a/tests/test_load_jchat.py b/tests/test_load_jchat.py index 22690902e..d95def3ff 100644 --- a/tests/test_load_jchat.py +++ b/tests/test_load_jchat.py @@ -495,11 +495,11 @@ def test_combined_format(self): with self.store.session_scope(): # there must be states after the import comments = self.store.session.query(self.store.db_classes.Comment).all() - assert len(comments) == 5 + assert len(comments) == 7 # there must be platforms after the import platforms = self.store.session.query(self.store.db_classes.Platform).all() - assert len(platforms) == 3 + assert len(platforms) == 5 # there must be one datafile afterwards datafiles = self.store.session.query(self.store.db_classes.Datafile).all() @@ -510,12 +510,14 @@ def test_combined_format(self): .order_by(self.store.db_classes.Comment.time) .all() ) - assert len(results) == 5 + assert len(results) == 7 assert results[0].content == "Modern - has i tag" assert results[1].content == "Modern - no i tag but has multiple breaks" assert results[2].content == "Modern - no i tag - no breaks" assert results[3].content == "Legacy - font a swap - no i tag - no breaks" assert results[4].content == "Legacy - font a swap - has i tag - no breaks" + assert results[5].content == "Modern 2 - no i tag but has multiple breaks" + assert results[6].content == "Modern 2 - no i tag - no breaks" def test_invalid_missing_timestamp(self): html_string = """ From 672f1d0c9a6f042c8fe743565712da4a8088d364 Mon Sep 17 00:00:00 2001 From: Matt Willis <88482857+mew-nsc@users.noreply.github.com> Date: Tue, 14 Dec 2021 07:51:34 +0000 Subject: [PATCH 2/2] Reworking error messages because we can no longer tell which element is missing --- importers/jchat_importer.py | 18 +++++------------- tests/test_load_jchat.py | 11 +++++++---- 2 files changed, 12 insertions(+), 17 deletions(-) diff --git a/importers/jchat_importer.py b/importers/jchat_importer.py index 5933c804c..1ca0272a1 100644 --- a/importers/jchat_importer.py +++ b/importers/jchat_importer.py @@ -140,9 +140,11 @@ def _read_message_div(self, div, data_store, datafile, change_id): platform_element = text_blocks[0][1] msg_content_element = text_blocks[0][2:] - if time_element is None: + if not text_blocks[0] or len(text_blocks[0]) < 3: self.errors.append( - {self.error_type: f"Unable to read message {msg_id}. No timestamp provided"} + { + self.error_type: f"Unable to read message {msg_id}. Not enough parts (expecting timestamp, platform, message)" + } ) return @@ -150,23 +152,13 @@ def _read_message_div(self, div, data_store, datafile, change_id): timestamp = self.parse_timestamp(time_string, msg_id) time_element.record(self.name, "timestamp", timestamp) - if platform_element is None: - self.errors.append( - {self.error_type: f"Unable to read message {msg_id}. No platform provided"} - ) - return platform_quad = platform_element.text[0:4] platform_element.record(self.name, "platform", platform_quad) # Match on quadgraphs platform = self.get_cached_platform_from_quad(data_store, platform_quad, change_id) - if not msg_content_element: - self.errors.append( - {self.error_type: f"Unable to read message {msg_id}. No message provided"} - ) - return msg_content = self.parse_message_content(msg_content_element) - print(msg_content) + if not msg_content: self.errors.append({self.error_type: f"Unable to parse JChat message {msg_id}."}) return diff --git a/tests/test_load_jchat.py b/tests/test_load_jchat.py index d95def3ff..97832efe7 100644 --- a/tests/test_load_jchat.py +++ b/tests/test_load_jchat.py @@ -539,7 +539,7 @@ def test_invalid_missing_timestamp(self): check_errors_for_file_contents( html_string, - "Unable to read message 34544=34534. No timestamp provided", + "Unable to read message 34544=34534. Not enough parts (expecting timestamp, platform, message)", importer, "no_timestamp.html", ) @@ -590,7 +590,7 @@ def test_invalid_missing_platform(self): check_errors_for_file_contents( html_string, - "Unable to read message 34544=34534. No platform provided", + "Unable to read message 34544=34534. Not enough parts (expecting timestamp, platform, message)", importer, "no_platform.html", ) @@ -615,7 +615,7 @@ def test_invalid_missing_message(self): check_errors_for_file_contents( html_string, - "Unable to read message 34544=34534. No message provided", + "Unable to read message 34544=34534. Not enough parts (expecting timestamp, platform, message)", importer, "no_message", ) @@ -640,7 +640,10 @@ def test_empty_message(self): importer = JChatImporter() check_errors_for_file_contents( - html_string, "Unable to parse JChat message 34544=34534.", importer, "no_message" + html_string, + "Unable to read message 34544=34534. Not enough parts (expecting timestamp, platform, message)", + importer, + "no_message", ) @staticmethod