Skip to content

Commit

Permalink
Merge pull request #200 from obsidianforensics/update-twitter
Browse files Browse the repository at this point in the history
Add the `x.com` domains for the Twitter parser. Extend the "reasonable" Snowflake timestamp ranges.
  • Loading branch information
obsidianforensics authored Jul 1, 2024
2 parents 727f46e + a739204 commit 0b0fbb1
Show file tree
Hide file tree
Showing 6 changed files with 15 additions and 11 deletions.
1 change: 0 additions & 1 deletion requirements-lookups.txt
Original file line number Diff line number Diff line change
@@ -1,2 +1 @@
Requests
maclookup
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,5 +4,6 @@ protobuf==4.*
publicsuffix2
pycountry
pymispwarninglists>=1.5
requests
torf
ulid-py
3 changes: 3 additions & 0 deletions unfurl/parsers/parse_google.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
import struct
import re
from unfurl.parsers.proto.google_search_pb2 import Ved
from unfurl import utils
from google.protobuf import json_format

import logging
Expand Down Expand Up @@ -289,6 +290,8 @@ def run(unfurl, node):
parent_id=node.node_id, incoming_edge_config=google_edge)

elif node.key == 'ei':
if not re.fullmatch(utils.urlsafe_b64_re, node.value):
return
padded_value = unfurl.add_b64_padding(node.value)
if not padded_value:
return
Expand Down
2 changes: 1 addition & 1 deletion unfurl/parsers/parse_magnet.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,7 @@ def run(unfurl, node):
for field, value in parsed_magnet.__dict__.items():
if field in [f'_{known}' for known in known_field_names.keys()]:
continue
# 'infohash' is extracted from xt by torf and added as it's own thing (for convenience); skip it
# 'infohash' is extracted from xt by torf and added as its own thing (for convenience); skip it
elif field == '_infohash':
continue
elif value:
Expand Down
4 changes: 2 additions & 2 deletions unfurl/parsers/parse_tiktok.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,11 +53,11 @@ def run(unfurl, node):
if node.key == 1:
if node.value.startswith('@'):
unfurl.add_to_queue(
data_type='descriptor', key=None, value=f'Username who posted on TikTok',
data_type='descriptor', key=None, value='Username who posted on TikTok',
parent_id=node.node_id, incoming_edge_config=tiktok_edge)
elif node.value == 'embed':
unfurl.add_to_queue(
data_type='descriptor', key=None, value=f'TikTok video was embedded on another site',
data_type='descriptor', key=None, value='TikTok video was embedded on another site',
parent_id=node.node_id, incoming_edge_config=tiktok_edge)

# Check if TikTok ID timestamp would be between 2017-12 and 2025-05
Expand Down
15 changes: 8 additions & 7 deletions unfurl/parsers/parse_twitter.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,8 +70,8 @@ def parse_twitter_snowflake(unfurl, node, encoding_type='integer', on_twitter=Tr
f'Sequence number should be between 0 and 4096; got {sequence}'

# Since we are trying to parse things that might not be valid, make sure the decoded
# timestamp is "reasonable" (between 2010-11 (the Snowflake epoch) and 2024-01
if not 1288834974657 < timestamp < 1704070800000:
# timestamp is "reasonable" (between 2010-11 (the Snowflake epoch) and 2025-03)
if not 1288834974657 < timestamp < 1741000800000:
return

except Exception as e:
Expand Down Expand Up @@ -110,14 +110,15 @@ def parse_twitter_snowflake(unfurl, node, encoding_type='integer', on_twitter=Tr

def run(unfurl, node):
preceding_domain = unfurl.find_preceding_domain(node)
if preceding_domain in ['twitter.com', 'mobile.twitter.com']:
# Make sure potential snowflake is reasonable: between 2015-02-01 & 2024-06-10
if preceding_domain in ['twitter.com', 'mobile.twitter.com', 'x.com', 'mobile.x.com']:
# Make sure potential snowflake is reasonable: between 2015-02-01 & 2025-03-13
if node.data_type == 'url.path.segment' and \
unfurl.check_if_int_between(node.value, 261675293291446272, 1800000000000000001):
unfurl.check_if_int_between(node.value, 261675293291446272, 1900000000000000001):
parse_twitter_snowflake(unfurl, node)

# Based on information found in a Javascript file on Twitter's website. Thanks 2*yo (https://github.com/2xyo)!
# ref: https://web.archive.org/web/20220924170519/https://abs.twimg.com/responsive-web/client-web/main.ea5f3cf9.js
# ref:
# https://web.archive.org/web/20220924170519/https://abs.twimg.com/responsive-web/client-web/main.ea5f3cf9.js
elif node.data_type == 'url.query.pair' and node.key == 's':
sharing_codes = {
'01': ' from an Android using SMS',
Expand Down Expand Up @@ -209,7 +210,7 @@ def run(unfurl, node):
parse_twitter_snowflake(unfurl, node, encoding_type='base64')

# Images from Twitter can be viewed in other ways than the above (including being saved/downloaded and then
# uploaded somewhere else. The file name pattern appears fairly unique, so if we see a file name that matches it
# uploaded somewhere else). The file name pattern appears fairly unique, so if we see a file name that matches it
# and decodes to a "reasonable" timestamp, show it in the graph.
if node.data_type == 'file.name' and len(node.value) == 15:
on_twitter = True if '.twimg.com' in preceding_domain else False
Expand Down

0 comments on commit 0b0fbb1

Please sign in to comment.