
Commit

Merge pull request #6420 from mozilla/fix-update-of-question-pageview-data

fix update of GA4 pageview data for questions
akatsoulas authored Jan 8, 2025
2 parents 3f535d4 + 2ccd982 commit 131fba6
Showing 4 changed files with 72 additions and 41 deletions.
81 changes: 48 additions & 33 deletions kitsune/questions/models.py
@@ -758,29 +758,15 @@ def reload_from_analytics(cls, verbose=False):
         from kitsune.sumo import googleanalytics
 
-        with transaction.atomic():
-            # First, let's gather some data we need. We're sacrificing memory
-            # here in order to reduce the number of database queries later on.
-            if verbose:
-                log.info("Gathering pageviews per question from GA4 data API...")
+        if verbose:
+            log.info("Gathering pageviews per question from GA4 data API...")
 
-            instance_by_question_id = {}
-            for question_id, visits in googleanalytics.pageviews_by_question(verbose=verbose):
-                instance_by_question_id[question_id] = cls(
-                    question_id=Subquery(Question.objects.filter(id=question_id).values("id")),
-                    visits=visits,
-                )
+        pageviews_by_question_id = googleanalytics.pageviews_by_question(verbose=verbose)
 
-            question_ids = list(instance_by_question_id)
+        total_count = len(pageviews_by_question_id)
 
-            # Next, let's clear out the stale instances that have new results.
-            if verbose:
-                log.info(f"Deleting all stale instances of {cls.__name__}...")
-
-            cls.objects.filter(question_id__in=question_ids).delete()
-
-            # Then we can create fresh instances for the questions that have results.
-            if verbose:
-                log.info(f"Creating {len(question_ids)} fresh instances of {cls.__name__}...")
+        if verbose:
+            log.info(f"Gathered pageviews for {total_count} questions.")
 
         def create_batch(batch_of_question_ids):
             """
@@ -798,26 +784,55 @@ def create_batch(batch_of_question_ids):
                 ]
             )
 
-        # Let's create the fresh instances in batches, so we avoid exposing ourselves to
-        # the possibility of transgressing some query size limit.
-        batch_size = 1000
-        for batch_of_question_ids in chunked(question_ids, batch_size):
-            if verbose:
-                log.info(f"Creating a batch of {len(batch_of_question_ids)} instances...")
-
-            try:
-                with transaction.atomic():
-                    create_batch(batch_of_question_ids)
-            except IntegrityError:
-                # There is a very slim chance that one or more Questions have been
-                # deleted in the moment of time between the formation of the list
-                # of valid instances and actually creating them, so let's give it
-                # one more try, assuming there's an even slimmer chance that
-                # lightning will strike twice. If this one fails, we'll roll-back
-                # everything and give up on the entire effort.
-                create_batch(batch_of_question_ids)
-
-        if verbose:
-            log.info("Done.")
+        instance_by_question_id = {}
+
+        for i, (question_id, visits) in enumerate(pageviews_by_question_id.items(), start=1):
+            instance_by_question_id[question_id] = cls(
+                question_id=Subquery(Question.objects.filter(id=question_id).values("id")),
+                visits=visits,
+            )
+
+            # Update the question visits in batches of 30K to avoid memory issues.
+            if ((i % 30000) != 0) and (i != total_count):
+                continue
+
+            # We've got a batch, so let's update them.
+
+            question_ids = list(instance_by_question_id)
+
+            # Next, let's clear out the stale instances that have new results.
+            if verbose:
+                log.info(f"Deleting {len(question_ids)} stale instances of {cls.__name__}...")
+
+            cls.objects.filter(question_id__in=question_ids).delete()
+
+            try:
+                with transaction.atomic():
+                    # Then we can create fresh instances for the questions that have results.
+                    if verbose:
+                        log.info(f"Creating {len(question_ids)} fresh instances of {cls.__name__}...")
+
+                    # Let's create the fresh instances in batches of 1K, so we avoid exposing
+                    # ourselves to the possibility of transgressing some query size limit.
+                    for batch_of_question_ids in chunked(question_ids, 1000):
+                        if verbose:
+                            log.info(f"Creating a batch of {len(batch_of_question_ids)} instances...")
+
+                        create_batch(batch_of_question_ids)
+            except IntegrityError:
+                # There is a very slim chance that one or more Questions have been deleted in
+                # the moment of time between the formation of the list of valid instances and
+                # actually creating them, so let's give it one more try, assuming there's an
+                # even slimmer chance that lightning will strike twice. If this one fails,
+                # we'll roll-back everything and give up on the entire effort.
+                create_batch(batch_of_question_ids)
+
+            # We're done with this batch, so let's clear the memory for the next one.
+            instance_by_question_id.clear()
+
+        if verbose:
+            log.info("Done.")
 
 
 class QuestionLocale(ModelBase):
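A minimal, self-contained sketch of the accumulate-then-flush pattern introduced above: instances are buffered in a dict and flushed every 30,000 items, and the guard ((i % flush_every) != 0) and (i != total_count) also flushes the final partial batch. The names flush_in_batches and flush are hypothetical illustrations, not part of the kitsune codebase:

def flush_in_batches(pageviews_by_id, flush, flush_every=30000):
    """Buffer (question_id, visits) pairs and flush every flush_every items.

    The final partial batch is flushed because i == total_count on the
    last iteration, mirroring the guard in the diff above.
    """
    total_count = len(pageviews_by_id)
    buffer = {}
    for i, (question_id, visits) in enumerate(pageviews_by_id.items(), start=1):
        buffer[question_id] = visits
        if ((i % flush_every) != 0) and (i != total_count):
            continue  # keep accumulating until the buffer is full
        flush(dict(buffer))  # e.g. delete stale rows, then bulk_create fresh ones
        buffer.clear()  # release the memory before building the next batch


if __name__ == "__main__":
    data = {qid: qid % 7 for qid in range(70_000)}
    flush_in_batches(data, lambda batch: print(f"flushing {len(batch)} rows"))
    # prints: flushing 30000 rows, flushing 30000 rows, flushing 10000 rows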
4 changes: 2 additions & 2 deletions kitsune/questions/tests/test_models.py
@@ -572,7 +572,7 @@ def test_visit_count_from_analytics(self, pageviews_by_question):
         q2 = QuestionFactory()
         q3 = QuestionFactory()
 
-        pageviews_by_question.return_value = (
+        pageviews_by_question.return_value = dict(
             row
             for row in (
                 (q1.id, 42),
@@ -589,7 +589,7 @@ def test_visit_count_from_analytics(self, pageviews_by_question):
         self.assertEqual(1337, QuestionVisits.objects.get(question_id=q3.id).visits)
 
         # Change the data and run again to cover the update case.
-        pageviews_by_question.return_value = (
+        pageviews_by_question.return_value = dict(
             row
             for row in (
                 (q1.id, 100),
10 changes: 9 additions & 1 deletion kitsune/sumo/googleanalytics.py
@@ -332,6 +332,8 @@ def pageviews_by_question(period=LAST_YEAR, verbose=False):
     """
     date_range = DateRange(start_date=PERIOD_TO_DAYS_AGO[period], end_date="today")
 
+    pageviews_by_id = {}
+
     for row in run_report(date_range, create_question_report_request, verbose=verbose):
         path = row.dimension_values[0].value
         # The path should be a question path without any query parameters, but in reality
@@ -345,7 +347,13 @@ def pageviews_by_question(period=LAST_YEAR, verbose=False):
             question_id = int(question_id)
         except ValueError:
             continue
-        yield (question_id, num_page_views)
+
+        # The "run_report" will return one row for each unique question path. Since the
+        # path includes the locale, there can be multiple rows for each question, so we
+        # need to accumulate the sum of all of those rows.
+        pageviews_by_id[question_id] = pageviews_by_id.get(question_id, 0) + num_page_views
+
+    return pageviews_by_id
 
 
 def search_clicks_and_impressions(start_date, end_date, verbose=False):
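The accumulation logic above can be exercised in isolation. In this sketch the rows data is hypothetical, standing in for GA4 report rows; it shows why summing per question id merges the per-locale paths into a single count:

# Hypothetical (path, pageviews) rows standing in for the GA4 report output.
rows = [
    ("/en-US/questions/123456", 1000),
    ("/it/questions/123456", 17),
    ("/es/questions/782348", 187),
    ("/en-US/questions/782348", 2000),
]

pageviews_by_id = {}
for path, num_page_views in rows:
    # The question id is the last path segment; skip anything non-numeric.
    try:
        question_id = int(path.rstrip("/").rsplit("/", 1)[-1])
    except ValueError:
        continue
    # Multiple locale paths can map to the same question, so accumulate the sum.
    pageviews_by_id[question_id] = pageviews_by_id.get(question_id, 0) + num_page_views

assert pageviews_by_id == {123456: 1017, 782348: 2187}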
18 changes: 13 additions & 5 deletions kitsune/sumo/tests/test_googleanalytics.py
@@ -134,20 +134,28 @@ def test_pageviews_by_question(self, run_report):
             ),
             Row(
                 dimension_values=[DimensionValue(value="/es/questions/782348")],
-                metric_values=[MetricValue(value="2000")],
+                metric_values=[MetricValue(value="187")],
             ),
             Row(
                 dimension_values=[DimensionValue(value="/de/questions/987235")],
                 metric_values=[MetricValue(value="3000")],
             ),
+            Row(
+                dimension_values=[DimensionValue(value="/it/questions/123456")],
+                metric_values=[MetricValue(value="17")],
+            ),
+            Row(
+                dimension_values=[DimensionValue(value="/en-US/questions/782348")],
+                metric_values=[MetricValue(value="2000")],
+            ),
         )
 
-        result = list(googleanalytics.pageviews_by_question(LAST_7_DAYS))
+        result = googleanalytics.pageviews_by_question(LAST_7_DAYS)
 
         self.assertEqual(3, len(result))
-        self.assertEqual(result[0], (123456, 1000))
-        self.assertEqual(result[1], (782348, 2000))
-        self.assertEqual(result[2], (987235, 3000))
+        self.assertEqual(result[123456], 1017)
+        self.assertEqual(result[782348], 2187)
+        self.assertEqual(result[987235], 3000)
 
     @patch.object(googleanalytics, "run_report")
     def test_search_clicks_and_impressions(self, run_report):
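The fixture change from a bare generator to dict(...) works because dict() accepts any iterable of 2-tuples, so the existing (question_id, visits) test data needed only that wrapper to match the new return type. A standalone demonstration (illustrative values, not kitsune code):

rows = ((123456, 42), (782348, 1337))
fixture = dict(row for row in rows)
assert fixture == {123456: 42, 782348: 1337}
assert fixture[123456] == 42  # lookups are now keyed by question id, not position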
