Skip to content

Commit

Permalink
Merge pull request #200 from obsidianforensics/update-twitter
Browse files Browse the repository at this point in the history
Add the `x.com` domains for the Twitter parser. Extend the "reasonable" Snowflake timestamp ranges.
  • Loading branch information
obsidianforensics authored Jul 1, 2024
2 parents 727f46e + a739204 commit 0b0fbb1
Show file tree
Hide file tree
Showing 6 changed files with 15 additions and 11 deletions.
1 change: 0 additions & 1 deletion requirements-lookups.txt
Original file line number Diff line number Diff line change
@@ -1,2 +1 @@
Requests
maclookup
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,5 +4,6 @@ protobuf==4.*
publicsuffix2
pycountry
pymispwarninglists>=1.5
requests
torf
ulid-py
3 changes: 3 additions & 0 deletions unfurl/parsers/parse_google.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
import struct
import re
from unfurl.parsers.proto.google_search_pb2 import Ved
from unfurl import utils
from google.protobuf import json_format

import logging
Expand Down Expand Up @@ -289,6 +290,8 @@ def run(unfurl, node):
parent_id=node.node_id, incoming_edge_config=google_edge)

elif node.key == 'ei':
if not re.fullmatch(utils.urlsafe_b64_re, node.value):
return
padded_value = unfurl.add_b64_padding(node.value)
if not padded_value:
return
Expand Down
2 changes: 1 addition & 1 deletion unfurl/parsers/parse_magnet.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,7 @@ def run(unfurl, node):
for field, value in parsed_magnet.__dict__.items():
if field in [f'_{known}' for known in known_field_names.keys()]:
continue
# 'infohash' is extracted from xt by torf and added as it's own thing (for convenience); skip it
# 'infohash' is extracted from xt by torf and added as its own thing (for convenience); skip it
elif field == '_infohash':
continue
elif value:
Expand Down
4 changes: 2 additions & 2 deletions unfurl/parsers/parse_tiktok.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,11 +53,11 @@ def run(unfurl, node):
if node.key == 1:
if node.value.startswith('@'):
unfurl.add_to_queue(
data_type='descriptor', key=None, value=f'Username who posted on TikTok',
data_type='descriptor', key=None, value='Username who posted on TikTok',
parent_id=node.node_id, incoming_edge_config=tiktok_edge)
elif node.value == 'embed':
unfurl.add_to_queue(
data_type='descriptor', key=None, value=f'TikTok video was embedded on another site',
data_type='descriptor', key=None, value='TikTok video was embedded on another site',
parent_id=node.node_id, incoming_edge_config=tiktok_edge)

# Check if TikTok ID timestamp would be between 2017-12 and 2025-05
Expand Down
15 changes: 8 additions & 7 deletions unfurl/parsers/parse_twitter.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,8 +70,8 @@ def parse_twitter_snowflake(unfurl, node, encoding_type='integer', on_twitter=Tr
f'Sequence number should be between 0 and 4096; got {sequence}'

# Since we are trying to parse things that might not be valid, make sure the decoded
# timestamp is "reasonable" (between 2010-11 (the Snowflake epoch) and 2024-01
if not 1288834974657 < timestamp < 1704070800000:
# timestamp is "reasonable" (between 2010-11 (the Snowflake epoch) and 2025-03)
if not 1288834974657 < timestamp < 1741000800000:
return

except Exception as e:
Expand Down Expand Up @@ -110,14 +110,15 @@ def parse_twitter_snowflake(unfurl, node, encoding_type='integer', on_twitter=Tr

def run(unfurl, node):
preceding_domain = unfurl.find_preceding_domain(node)
if preceding_domain in ['twitter.com', 'mobile.twitter.com']:
# Make sure potential snowflake is reasonable: between 2015-02-01 & 2024-06-10
if preceding_domain in ['twitter.com', 'mobile.twitter.com', 'x.com', 'mobile.x.com']:
# Make sure potential snowflake is reasonable: between 2015-02-01 & 2025-03-13
if node.data_type == 'url.path.segment' and \
unfurl.check_if_int_between(node.value, 261675293291446272, 1800000000000000001):
unfurl.check_if_int_between(node.value, 261675293291446272, 1900000000000000001):
parse_twitter_snowflake(unfurl, node)

# Based on information found in a Javascript file on Twitter's website. Thanks 2*yo (https://github.com/2xyo)!
# ref: https://web.archive.org/web/20220924170519/https://abs.twimg.com/responsive-web/client-web/main.ea5f3cf9.js
# ref:
# https://web.archive.org/web/20220924170519/https://abs.twimg.com/responsive-web/client-web/main.ea5f3cf9.js
elif node.data_type == 'url.query.pair' and node.key == 's':
sharing_codes = {
'01': ' from an Android using SMS',
Expand Down Expand Up @@ -209,7 +210,7 @@ def run(unfurl, node):
parse_twitter_snowflake(unfurl, node, encoding_type='base64')

# Images from Twitter can be viewed in other ways than the above (including being saved/downloaded and then
# uploaded somewhere else. The file name pattern appears fairly unique, so if we see a file name that matches it
# uploaded somewhere else). The file name pattern appears fairly unique, so if we see a file name that matches it
# and decodes to a "reasonable" timestamp, show it in the graph.
if node.data_type == 'file.name' and len(node.value) == 15:
on_twitter = True if '.twimg.com' in preceding_domain else False
Expand Down

0 comments on commit 0b0fbb1

Please sign in to comment.