fix update of GA4 pageview data for questions #6420

Merged · 1 commit · Jan 8, 2025
81 changes: 48 additions & 33 deletions kitsune/questions/models.py
@@ -758,29 +758,15 @@ def reload_from_analytics(cls, verbose=False):
         from kitsune.sumo import googleanalytics
 
-        with transaction.atomic():
-            # First, let's gather some data we need. We're sacrificing memory
-            # here in order to reduce the number of database queries later on.
-            if verbose:
-                log.info("Gathering pageviews per question from GA4 data API...")
+        if verbose:
+            log.info("Gathering pageviews per question from GA4 data API...")
 
-            instance_by_question_id = {}
-            for question_id, visits in googleanalytics.pageviews_by_question(verbose=verbose):
-                instance_by_question_id[question_id] = cls(
-                    question_id=Subquery(Question.objects.filter(id=question_id).values("id")),
-                    visits=visits,
-                )
+        pageviews_by_question_id = googleanalytics.pageviews_by_question(verbose=verbose)
 
-            question_ids = list(instance_by_question_id)
+        total_count = len(pageviews_by_question_id)
 
-            # Next, let's clear out the stale instances that have new results.
-            if verbose:
-                log.info(f"Deleting all stale instances of {cls.__name__}...")
-
-            cls.objects.filter(question_id__in=question_ids).delete()
-
-            # Then we can create fresh instances for the questions that have results.
-            if verbose:
-                log.info(f"Creating {len(question_ids)} fresh instances of {cls.__name__}...")
+        if verbose:
+            log.info(f"Gathered pageviews for {total_count} questions.")
 
         def create_batch(batch_of_question_ids):
             """
@@ -798,26 +784,55 @@ def create_batch(batch_of_question_ids):
                 ]
             )
 
-            # Let's create the fresh instances in batches, so we avoid exposing ourselves to
-            # the possibility of transgressing some query size limit.
-            batch_size = 1000
-            for batch_of_question_ids in chunked(question_ids, batch_size):
-                if verbose:
-                    log.info(f"Creating a batch of {len(batch_of_question_ids)} instances...")
-
-                try:
-                    with transaction.atomic():
-                        create_batch(batch_of_question_ids)
-                except IntegrityError:
-                    # There is a very slim chance that one or more Questions have been
-                    # deleted in the moment of time between the formation of the list
-                    # of valid instances and actually creating them, so let's give it
-                    # one more try, assuming there's an even slimmer chance that
-                    # lightning will strike twice. If this one fails, we'll roll-back
-                    # everything and give up on the entire effort.
-                    create_batch(batch_of_question_ids)
+        instance_by_question_id = {}
+
+        for i, (question_id, visits) in enumerate(pageviews_by_question_id.items(), start=1):
+            instance_by_question_id[question_id] = cls(
+                question_id=Subquery(Question.objects.filter(id=question_id).values("id")),
+                visits=visits,
+            )
+
+            # Update the question visits in batches of 30K to avoid memory issues.
+            if ((i % 30000) != 0) and (i != total_count):
+                continue
+
+            # We've got a batch, so let's update them.
+
+            question_ids = list(instance_by_question_id)
+
+            # Next, let's clear out the stale instances that have new results.
+            if verbose:
+                log.info(f"Deleting {len(question_ids)} stale instances of {cls.__name__}...")
+
+            cls.objects.filter(question_id__in=question_ids).delete()
+
+            try:
+                with transaction.atomic():
+                    # Then we can create fresh instances for the questions that have results.
+                    if verbose:
+                        log.info(f"Creating {len(question_ids)} fresh instances of {cls.__name__}...")
+
+                    # Let's create the fresh instances in batches of 1K, so we avoid exposing
+                    # ourselves to the possibility of transgressing some query size limit.
+                    for batch_of_question_ids in chunked(question_ids, 1000):
+                        if verbose:
+                            log.info(f"Creating a batch of {len(batch_of_question_ids)} instances...")
+
+                        create_batch(batch_of_question_ids)
+            except IntegrityError:
+                # There is a very slim chance that one or more Questions have been deleted in
+                # the moment of time between the formation of the list of valid instances and
+                # actually creating them, so let's give it one more try, assuming there's an
+                # even slimmer chance that lightning will strike twice. If this one fails,
+                # we'll roll-back everything and give up on the entire effort.
+                create_batch(batch_of_question_ids)
+
+            # We're done with this batch, so let's clear the memory for the next one.
+            instance_by_question_id.clear()
 
-            if verbose:
-                log.info("Done.")
+        if verbose:
+            log.info("Done.")
 
 
 class QuestionLocale(ModelBase):
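Editor's note: the heart of this change is an accumulate-and-flush batching pattern — build up to 30,000 instances in memory, then delete the stale rows and bulk-create the fresh ones for just that slice before moving on. A minimal, self-contained sketch of that control flow (the names and the batch size of 3 are illustrative, not from the PR):

```python
def flush(batch):
    # Stand-in for the PR's delete-then-bulk_create step.
    print(f"flushing {len(batch)} items: {sorted(batch)}")
    batch.clear()  # free the memory before accumulating the next slice


def process_in_batches(items, batch_size=3):
    """Accumulate items and flush whenever the batch fills up, plus once
    more at the end so the final partial batch is not dropped."""
    batch = {}
    total = len(items)
    for i, (key, value) in enumerate(items.items(), start=1):
        batch[key] = value
        # Same guard as the PR's `((i % 30000) != 0) and (i != total_count)`.
        if (i % batch_size != 0) and (i != total):
            continue
        flush(batch)


process_in_batches({n: n * 10 for n in range(7)})  # flushes 3, 3, then 1 item
```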
4 changes: 2 additions & 2 deletions kitsune/questions/tests/test_models.py
@@ -572,7 +572,7 @@ def test_visit_count_from_analytics(self, pageviews_by_question):
         q2 = QuestionFactory()
         q3 = QuestionFactory()
 
-        pageviews_by_question.return_value = (
+        pageviews_by_question.return_value = dict(
             row
             for row in (
                 (q1.id, 42),
@@ -589,7 +589,7 @@ def test_visit_count_from_analytics(self, pageviews_by_question):
         self.assertEqual(1337, QuestionVisits.objects.get(question_id=q3.id).visits)
 
         # Change the data and run again to cover the update case.
-        pageviews_by_question.return_value = (
+        pageviews_by_question.return_value = dict(
             row
             for row in (
                 (q1.id, 100),
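Since `pageviews_by_question` now returns a dict rather than yielding `(question_id, visits)` tuples, the mocked return values are wrapped in `dict(...)`. The generator-expression form kept by the test builds the same mapping as passing the pairs directly; a quick illustrative check:

```python
pairs = ((1, 42), (2, 11), (3, 1337))
assert dict(row for row in pairs) == dict(pairs) == {1: 42, 2: 11, 3: 1337}
```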
10 changes: 9 additions & 1 deletion kitsune/sumo/googleanalytics.py
@@ -332,6 +332,8 @@ def pageviews_by_question(period=LAST_YEAR, verbose=False):
     """
     date_range = DateRange(start_date=PERIOD_TO_DAYS_AGO[period], end_date="today")
 
+    pageviews_by_id = {}
+
     for row in run_report(date_range, create_question_report_request, verbose=verbose):
         path = row.dimension_values[0].value
         # The path should be a question path without any query parameters, but in reality
@@ -345,7 +347,13 @@
             question_id = int(question_id)
         except ValueError:
             continue
-        yield (question_id, num_page_views)
+
+        # The "run_report" will return one row for each unique question path. Since the
+        # path includes the locale, there can be multiple rows for each question, so we
+        # need to accumulate the sum of all of those rows.
+        pageviews_by_id[question_id] = pageviews_by_id.get(question_id, 0) + num_page_views
+
+    return pageviews_by_id
 
 
 def search_clicks_and_impressions(start_date, end_date, verbose=False):
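The accumulation works because paths that differ only in their locale prefix still carry the same question ID. A self-contained sketch of the idea (the regex and helper name are illustrative; the real function extracts the ID from the GA4 path dimension and skips anything that doesn't parse):

```python
import re
from collections import defaultdict

# Illustrative pattern for "/<locale>/questions/<id>" paths.
QUESTION_PATH = re.compile(r"^/[\w-]+/questions/(\d+)")


def sum_pageviews(rows):
    """Sum pageviews per question ID across locale-prefixed paths."""
    totals = defaultdict(int)
    for path, views in rows:
        match = QUESTION_PATH.match(path)
        if match is None:
            continue  # skip paths that don't look like question URLs
        totals[int(match.group(1))] += views
    return dict(totals)


print(sum_pageviews([("/en-US/questions/123456", 1000), ("/it/questions/123456", 17)]))
# -> {123456: 1017}
```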
18 changes: 13 additions & 5 deletions kitsune/sumo/tests/test_googleanalytics.py
@@ -134,20 +134,28 @@ def test_pageviews_by_question(self, run_report):
             ),
             Row(
                 dimension_values=[DimensionValue(value="/es/questions/782348")],
-                metric_values=[MetricValue(value="2000")],
+                metric_values=[MetricValue(value="187")],
             ),
             Row(
                 dimension_values=[DimensionValue(value="/de/questions/987235")],
                 metric_values=[MetricValue(value="3000")],
             ),
+            Row(
+                dimension_values=[DimensionValue(value="/it/questions/123456")],
+                metric_values=[MetricValue(value="17")],
+            ),
+            Row(
+                dimension_values=[DimensionValue(value="/en-US/questions/782348")],
+                metric_values=[MetricValue(value="2000")],
+            ),
         )
 
-        result = list(googleanalytics.pageviews_by_question(LAST_7_DAYS))
+        result = googleanalytics.pageviews_by_question(LAST_7_DAYS)
 
         self.assertEqual(3, len(result))
-        self.assertEqual(result[0], (123456, 1000))
-        self.assertEqual(result[1], (782348, 2000))
-        self.assertEqual(result[2], (987235, 3000))
+        self.assertEqual(result[123456], 1017)
+        self.assertEqual(result[782348], 2187)
+        self.assertEqual(result[987235], 3000)
 
     @patch.object(googleanalytics, "run_report")
     def test_search_clicks_and_impressions(self, run_report):
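The updated expectations follow directly from summing the per-locale rows by question ID: question 123456 collects 1000 (/en-US) + 17 (/it), question 782348 collects 187 (/es) + 2000 (/en-US), and question 987235 appears under a single locale, so it stays at 3000. A trivial check of the arithmetic:

```python
# New expected totals = sum of the mocked per-locale rows.
assert 1000 + 17 == 1017   # question 123456: /en-US + /it
assert 187 + 2000 == 2187  # question 782348: /es + /en-US
```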