Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update parsing of Google's RLZ parameter to handle 1-4 digit cohort weeks #150

Merged
merged 2 commits into from
Apr 6, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
120 changes: 52 additions & 68 deletions unfurl/parsers/parse_google.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,9 @@

import base64
import datetime
import struct
import pycountry
import struct
import re
from unfurl.parsers.proto.google_search_pb2 import Ved
from google.protobuf import json_format

Expand Down Expand Up @@ -99,31 +100,36 @@ def parse_ei(ei):

def parse_rlz(rlz_string):
# Reference: https://github.com/rogerta/rlz/blob/wiki/HowToReadAnRlzString.md

rlz = {}
assert rlz_string[0] == '1', 'Only v1 of RLZ is known'

rlz['version'] = {
rlz_re = re.compile(r'1(?P<ap>[A-z0-9]{2})(?P<brand_code>[A-z0-9]{4})(?P<cannibal>[c_])(?P<language_code>[A-z]{2}'
r'(-[A-z]{2})?)(?P<install_cohort>[A-z]{2}\d{1,4})?(?P<search_cohort>[A-z]{2}\d{1,4})?')
rlz_m = rlz_re.fullmatch(rlz_string)
if not rlz_m:
return

rlz = {
'version': {
'data_type': 'google.rlz.version',
'key': 'RLZ version',
'value': rlz_string[0],
'hover': 'Only v1 of RLZ is known'}

rlz['ap'] = {
'data_type': 'google.rlz.ap',
'key': 'Application',
'value': rlz_string[1:3],
'hover': 'Application (or "access point") used to do the Google Search'}

rlz['brand_code'] = {
'data_type': 'google.rlz.brand_code',
'key': 'Brand Code',
'value': rlz_string[3:7],
'hover': 'The brand code identifies the distribution channel (it may be a partner or internal marketing). <br>'
'This correlates to how the user got the software (ie. they downloaded it by itself vs. it came <br>'
'pre-installed on their new computer vs. it came with a partner\'s software.'}

cannibal_string = rlz_string[7]
'value': '1',
'hover': 'Only v1 of RLZ is known'
},
'ap': {
'data_type': 'google.rlz.ap',
'key': 'Application',
'value': rlz_m.group('ap'),
'hover': 'Application (or "access point") used to do the Google Search'
},
'brand_code': {
'data_type': 'google.rlz.brand_code',
'key': 'Brand Code',
'value': rlz_m.group('brand_code'),
'hover': 'The brand code identifies the distribution channel (it may be a partner or internal marketing). '
'This correlates to how the user got the software (ie. they downloaded it by itself vs. it came '
'pre-installed on their new computer vs. it came with a partner\'s software.'
}
}

cannibal_string = rlz_m.group('cannibal')
if cannibal_string == 'c':
cannibal_value = 'Yes'
elif cannibal_string == '_':
Expand All @@ -138,39 +144,17 @@ def parse_rlz(rlz_string):
'hover': '"Cannibal" tells if the library has evidence that the user was a user prior <br>'
'to installing the software, and thus "cannibalized" a previous installation.'}

rlz_pointer = 8

# Examples of longer RLZ values (2- and 5-char lang codes):
# 1C1CHBF_en-GBGB901GB901
# 1C1GCEU_enUS820US820
if len(rlz_string) >= 11:
# If a dash is in this position, it means it is a 5 char lang code, not the 2 char code
if rlz_string[10] == '-':
language_code = rlz_string[rlz_pointer:rlz_pointer+5]
rlz_pointer += 5
# langcodes was having install issues on macOS; not using it for now in
# order to not complicate Unfurl's install. Pycountry's languages isn't
# as good (only alpha_2 and alpha_3) but better than nothing for now.
# Old implementation:
# language_name = langcodes.Language.get(language_code).language_name()
language_name = pycountry.languages.get(alpha_2=language_code[:2]).name

else:
language_code = rlz_string[rlz_pointer:rlz_pointer+2]
# language_name = langcodes.Language.get(language_code).language_name()
language_name = pycountry.languages.get(alpha_2=language_code).name
rlz_pointer += 2

# Example of RLZ value without cohorts
# 1C1GCEV_en
elif len(rlz_string) == 10:
language_code = rlz_string[rlz_pointer:rlz_pointer + 2]
# language_name = langcodes.Language.get(language_code).language_name()
language_name = pycountry.languages.get(alpha_2=language_code).name
rlz_pointer += 2

else:
raise ValueError('Couldn\'t parse RLZ language code')
language_code = rlz_m.group('language_code')
# langcodes was having install issues on macOS; not using it for now in
# order to not complicate Unfurl's install. Pycountry's languages isn't
# as good (only alpha_2 and alpha_3) but better than nothing for now.
# Old implementation:
# language_name = langcodes.Language.get(language_code).language_name()
language_name = pycountry.languages.get(alpha_2=language_code[:2]).name

rlz['language'] = {
'data_type': 'google.rlz.language',
Expand All @@ -179,13 +163,11 @@ def parse_rlz(rlz_string):
'hover': 'The two- (en) or five-character (zh-CN) language code of the application. <br>'
'Valid values depend on the specific app.'}

# The cohorts are optional, so if this is the end of the RLZ string, return what we've parsed
if not len(rlz_string) >= rlz_pointer+5:
# The cohorts are optional; example of RLZ value without cohorts: 1C1GCEV_en
if not rlz_m.group('install_cohort'):
return rlz

install_cohort = rlz_string[rlz_pointer:rlz_pointer+5]
rlz_pointer += 5

install_cohort = rlz_m.group('install_cohort')
install_country_code = install_cohort[:2]
install_country = pycountry.countries.get(alpha_2=install_country_code).name
install_date = datetime.timedelta(weeks=int(install_cohort[2:])) + datetime.date(year=2003, month=2, day=3)
Expand All @@ -200,13 +182,15 @@ def parse_rlz(rlz_string):

# Both cohorts do not need to be present, so if the 2nd one (search) is missing, return what we have
# Example: 1CAGZLV_enGB898
if not len(rlz_string) >= rlz_pointer + 5:
if not rlz_m.group('search_cohort'):
return rlz

search_cohort = rlz_string[rlz_pointer:rlz_pointer + 5]

search_cohort = rlz_m.group('search_cohort')
search_country_code = search_cohort[:2]
search_country = pycountry.countries.get(alpha_2=search_country_code).name
pycountry_object = pycountry.countries.get(alpha_2=search_country_code)
search_country = search_country_code
if pycountry_object:
search_country = pycountry_object.name
search_date = datetime.timedelta(weeks=int(search_cohort[2:])) + datetime.date(year=2003, month=2, day=3)

rlz['search_cohort'] = {
Expand Down Expand Up @@ -447,12 +431,12 @@ def run(unfurl, node):

try:
rlz = parse_rlz(node.value)

for component in rlz.values():
unfurl.add_to_queue(
data_type=component['data_type'], key=component['key'],
value=component['value'], hover=component['hover'],
parent_id=node.node_id, incoming_edge_config=google_edge)
if rlz:
for component in rlz.values():
unfurl.add_to_queue(
data_type=component['data_type'], key=component['key'],
value=component['value'], hover=component['hover'],
parent_id=node.node_id, incoming_edge_config=google_edge)

except ValueError as e:
print(f'Exception parsing RLZ: {e}')
Expand Down
41 changes: 35 additions & 6 deletions unfurl/tests/unit/test_google.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,21 +10,50 @@ def test_google_search_with_rlz(self):
test = Unfurl()
test.add_to_queue(
data_type='url', key=None,
value='https://www.google.com/search?rlz=1C1GCAB_enUS907US907q=dfir+data')
value='https://www.google.com/search?rlz=1C1GCAB_enUS907US907&q=dfir+data')
test.parse_queue()

# Check the number of nodes
self.assertEqual(len(test.nodes.keys()), 20)
self.assertEqual(test.total_nodes, 20)
self.assertEqual(len(test.nodes.keys()), 22)
self.assertEqual(test.total_nodes, 22)

# Confirm that RLZ AP parsed
self.assertEqual('Application: C1', test.nodes[14].label)
self.assertEqual('Application: C1', test.nodes[15].label)

# Language parses
self.assertEqual('Language: English (en)', test.nodes[17].label)
self.assertEqual('Language: English (en)', test.nodes[18].label)

# Search cohort parses
self.assertIn('United States the week of 2020-06-22', test.nodes[19].label)
self.assertIn('United States the week of 2020-06-22', test.nodes[20].label)

# make sure the queue finished empty
self.assertTrue(test.queue.empty())
self.assertEqual(len(test.edges), 0)

def test_google_search_with_rlz_different_weeks(self):
""" Test a Google search URL with a RLZ param with different cohort weeks """

test = Unfurl()
test.add_to_queue(
data_type='url', key=None,
value='https://www.google.com/search?rlz=1C1GCAB_esUS97US1007&q=dfir+data')
test.parse_queue()

# Check the number of nodes
self.assertEqual(len(test.nodes.keys()), 22)
self.assertEqual(test.total_nodes, 22)

# Confirm that RLZ AP parsed
self.assertEqual('Application: C1', test.nodes[15].label)

# Language parses
self.assertEqual('Language: Spanish (es)', test.nodes[18].label)

# Install cohort parses (2 digit week)
self.assertIn('United States the week of 2004-12-13', test.nodes[19].label)

# Search cohort parses (4 digit week)
self.assertIn('United States the week of 2022-05-23', test.nodes[20].label)

# make sure the queue finished empty
self.assertTrue(test.queue.empty())
Expand Down