From 2ccd98208a59553eaddde96fe8e119c9b416d1d3 Mon Sep 17 00:00:00 2001 From: Ryan Johnson Date: Thu, 19 Dec 2024 11:57:26 -0800 Subject: [PATCH] fix update of GA4 pageviews data for questions --- kitsune/questions/models.py | 81 +++++++++++++--------- kitsune/questions/tests/test_models.py | 4 +- kitsune/sumo/googleanalytics.py | 10 ++- kitsune/sumo/tests/test_googleanalytics.py | 18 +++-- 4 files changed, 72 insertions(+), 41 deletions(-) diff --git a/kitsune/questions/models.py b/kitsune/questions/models.py index ba61eabded6..a735536d078 100755 --- a/kitsune/questions/models.py +++ b/kitsune/questions/models.py @@ -758,29 +758,15 @@ def reload_from_analytics(cls, verbose=False): from kitsune.sumo import googleanalytics with transaction.atomic(): - # First, let's gather some data we need. We're sacrificing memory - # here in order to reduce the number of database queries later on. if verbose: log.info("Gathering pageviews per question from GA4 data API...") - instance_by_question_id = {} - for question_id, visits in googleanalytics.pageviews_by_question(verbose=verbose): - instance_by_question_id[question_id] = cls( - question_id=Subquery(Question.objects.filter(id=question_id).values("id")), - visits=visits, - ) + pageviews_by_question_id = googleanalytics.pageviews_by_question(verbose=verbose) - question_ids = list(instance_by_question_id) + total_count = len(pageviews_by_question_id) - # Next, let's clear out the stale instances that have new results. if verbose: - log.info(f"Deleting all stale instances of {cls.__name__}...") - - cls.objects.filter(question_id__in=question_ids).delete() - - # Then we can create fresh instances for the questions that have results. - if verbose: - log.info(f"Creating {len(question_ids)} fresh instances of {cls.__name__}...") + log.info(f"Gathered pageviews for {total_count} questions.") def create_batch(batch_of_question_ids): """ @@ -798,26 +784,55 @@ def create_batch(batch_of_question_ids): ] ) - # Let's create the fresh instances in batches, so we avoid exposing ourselves to - # the possibility of transgressing some query size limit. - batch_size = 1000 - for batch_of_question_ids in chunked(question_ids, batch_size): + instance_by_question_id = {} + + for i, (question_id, visits) in enumerate(pageviews_by_question_id.items(), start=1): + instance_by_question_id[question_id] = cls( + question_id=Subquery(Question.objects.filter(id=question_id).values("id")), + visits=visits, + ) + + # Update the question visits in batches of 30K to avoid memory issues. + if ((i % 30000) != 0) and (i != total_count): + continue + + # We've got a batch, so let's update them. + + question_ids = list(instance_by_question_id) + + # Next, let's clear out the stale instances that have new results. if verbose: - log.info(f"Creating a batch of {len(batch_of_question_ids)} instances...") + log.info(f"Deleting {len(question_ids)} stale instances of {cls.__name__}...") + + cls.objects.filter(question_id__in=question_ids).delete() - try: - with transaction.atomic(): + # Then we can create fresh instances for the questions that have results. + if verbose: + log.info(f"Creating {len(question_ids)} fresh instances of {cls.__name__}...") + + # Let's create the fresh instances in batches of 1K, so we avoid exposing + # ourselves to the possibility of transgressing some query size limit. + for batch_of_question_ids in chunked(question_ids, 1000): + if verbose: + log.info(f"Creating a batch of {len(batch_of_question_ids)} instances...") + + try: + with transaction.atomic(): + create_batch(batch_of_question_ids) + except IntegrityError: + # There is a very slim chance that one or more Questions have been + # deleted in the moment of time between the formation of the list + # of valid instances and actually creating them, so let's give it + # one more try, assuming there's an even slimmer chance that + # lightning will strike twice. If this one fails, we'll roll-back + # everything and give up on the entire effort. create_batch(batch_of_question_ids) - except IntegrityError: - # There is a very slim chance that one or more Questions have been deleted in - # the moment of time between the formation of the list of valid instances and - # actually creating them, so let's give it one more try, assuming there's an - # even slimmer chance that lightning will strike twice. If this one fails, - # we'll roll-back everything and give up on the entire effort. - create_batch(batch_of_question_ids) - if verbose: - log.info("Done.") + # We're done with this batch, so let's clear the memory for the next one. + instance_by_question_id.clear() + + if verbose: + log.info("Done.") class QuestionLocale(ModelBase): diff --git a/kitsune/questions/tests/test_models.py b/kitsune/questions/tests/test_models.py index e39dba045bb..f72dade407d 100644 --- a/kitsune/questions/tests/test_models.py +++ b/kitsune/questions/tests/test_models.py @@ -572,7 +572,7 @@ def test_visit_count_from_analytics(self, pageviews_by_question): q2 = QuestionFactory() q3 = QuestionFactory() - pageviews_by_question.return_value = ( + pageviews_by_question.return_value = dict( row for row in ( (q1.id, 42), @@ -589,7 +589,7 @@ def test_visit_count_from_analytics(self, pageviews_by_question): self.assertEqual(1337, QuestionVisits.objects.get(question_id=q3.id).visits) # Change the data and run again to cover the update case. - pageviews_by_question.return_value = ( + pageviews_by_question.return_value = dict( row for row in ( (q1.id, 100), diff --git a/kitsune/sumo/googleanalytics.py b/kitsune/sumo/googleanalytics.py index 4a7ab2226e4..9f70ac9665a 100644 --- a/kitsune/sumo/googleanalytics.py +++ b/kitsune/sumo/googleanalytics.py @@ -332,6 +332,8 @@ def pageviews_by_question(period=LAST_YEAR, verbose=False): """ date_range = DateRange(start_date=PERIOD_TO_DAYS_AGO[period], end_date="today") + pageviews_by_id = {} + for row in run_report(date_range, create_question_report_request, verbose=verbose): path = row.dimension_values[0].value # The path should be a question path without any query parameters, but in reality @@ -345,7 +347,13 @@ def pageviews_by_question(period=LAST_YEAR, verbose=False): question_id = int(question_id) except ValueError: continue - yield (question_id, num_page_views) + + # The "run_report" will return one row for each unique question path. Since the + # path includes the locale, there can be multiple rows for each question, so we + # need to accumulate the sum of all of those rows. + pageviews_by_id[question_id] = pageviews_by_id.get(question_id, 0) + num_page_views + + return pageviews_by_id def search_clicks_and_impressions(start_date, end_date, verbose=False): diff --git a/kitsune/sumo/tests/test_googleanalytics.py b/kitsune/sumo/tests/test_googleanalytics.py index e04c4000469..4a41bc9576c 100644 --- a/kitsune/sumo/tests/test_googleanalytics.py +++ b/kitsune/sumo/tests/test_googleanalytics.py @@ -134,20 +134,28 @@ def test_pageviews_by_question(self, run_report): ), Row( dimension_values=[DimensionValue(value="/es/questions/782348")], - metric_values=[MetricValue(value="2000")], + metric_values=[MetricValue(value="187")], ), Row( dimension_values=[DimensionValue(value="/de/questions/987235")], metric_values=[MetricValue(value="3000")], ), + Row( + dimension_values=[DimensionValue(value="/it/questions/123456")], + metric_values=[MetricValue(value="17")], + ), + Row( + dimension_values=[DimensionValue(value="/en-US/questions/782348")], + metric_values=[MetricValue(value="2000")], + ), ) - result = list(googleanalytics.pageviews_by_question(LAST_7_DAYS)) + result = googleanalytics.pageviews_by_question(LAST_7_DAYS) self.assertEqual(3, len(result)) - self.assertEqual(result[0], (123456, 1000)) - self.assertEqual(result[1], (782348, 2000)) - self.assertEqual(result[2], (987235, 3000)) + self.assertEqual(result[123456], 1017) + self.assertEqual(result[782348], 2187) + self.assertEqual(result[987235], 3000) @patch.object(googleanalytics, "run_report") def test_search_clicks_and_impressions(self, run_report):