From 21d88df13428658b3d66f95260448a3e7af95126 Mon Sep 17 00:00:00 2001 From: John Baldwin Date: Sun, 12 Jul 2020 16:19:33 -0400 Subject: [PATCH] Added 'mau' collection to Figures pipeline The purpose of this is to improve API performance. See the previous commit with SHA 92ea8c460d3ab3ae7cc42945f94f27c218dde39f for an explanation. This commit adds the pipeline code to populate 'SiteDailyMetrics.mau' with the latest numbers for the month as of yesterday We also clean up the old live MAU collection from StudentModule, removing dead coe and the tests that call the dead code --- figures/mau.py | 34 +++++++++ figures/metrics.py | 21 ++---- figures/pipeline/site_daily_metrics.py | 5 ++ tests/metrics/test_site_monthly_metrics.py | 57 ++-------------- tests/pipeline/test_site_daily_metrics.py | 24 ++++++- tests/test_mau.py | 76 +++++++++++++++++++++ tests/views/test_site_daily_metrics_view.py | 4 +- 7 files changed, 151 insertions(+), 70 deletions(-) diff --git a/figures/mau.py b/figures/mau.py index 29017d0d..adb7373d 100644 --- a/figures/mau.py +++ b/figures/mau.py @@ -73,6 +73,40 @@ def retrieve_live_course_mau_data(site, course_id): ) +def mau_1g_for_month_as_of_day(sm_queryset, date_for): + """Get the MAU from the sm, as of the "date_for" for "date_for" month + + sm_queryset is the StudentModule queryset for our source + + This is a MAU 1G function that calculates the monthly active users as of the + day in the given month. + + This function queries `courseware.models.StudentModule` to identify users + who are active in the site + + Returns a queryset of distinct user ids + """ + month_sm = sm_queryset.filter(modified__year=date_for.year, + modified__month=date_for.month, + modified__day__lte=date_for.day) + return month_sm.values('student__id').distinct() + + +def site_mau_1g_for_month_as_of_day(site, date_for): + """Get the MAU for the given site, as of the "date_for" in the month + + This is a conenvience function. It gets the student modules for the site, + then calls + + `figures.mau.mau_for_month_as_of_day(...)` + + Returns a queryset with distinct user ids + """ + site_sm = get_student_modules_for_site(site) + return mau_1g_for_month_as_of_day(sm_queryset=site_sm, + date_for=date_for) + + def store_mau_metrics(site, overwrite=False): """ Save "snapshot" of MAU metrics diff --git a/figures/metrics.py b/figures/metrics.py index 9668c22b..bc191662 100644 --- a/figures/metrics.py +++ b/figures/metrics.py @@ -253,8 +253,10 @@ def get_site_mau_history_metrics(site, months_back): month=str(rec.month_for.month).zfill(2)) history.append(dict(period=period, value=rec.active_user_count)) - # Hack to set current month data in the history list - current_month_active = get_site_mau_current_month(site) + # Get our latest stored site MAU count + sdm = SiteDailyMetrics.latest_previous_record(site=site) + current_month_active = sdm.mau if sdm else 0 + if history: # reverse the list because it is currently in reverser chronological order history.reverse() @@ -266,21 +268,6 @@ def get_site_mau_history_metrics(site, months_back): return dict(current_month=current_month_active, history=history) -def get_site_mau_current_month(site): - """Specific function to get the live active users for the current month - - Developers note: We're starting with the simple aproach for MAU 1G, first - generation. When we update for MAU 2G, we will be able to make the query - more efficient by pulling unique users from a single table used for live - capture. - """ - month_for = datetime.datetime.utcnow() - site_sm = figures.sites.get_student_modules_for_site(site) - curr_sm = site_sm.filter(modified__year=month_for.year, - modified__month=month_for.month) - return curr_sm.values('student__id').distinct().count() - - def get_active_users_for_time_period(site, start_date, end_date, course_ids=None): """ Returns the number of users active in the time period. diff --git a/figures/pipeline/site_daily_metrics.py b/figures/pipeline/site_daily_metrics.py index a90055ac..1c8a8903 100644 --- a/figures/pipeline/site_daily_metrics.py +++ b/figures/pipeline/site_daily_metrics.py @@ -12,6 +12,7 @@ from django.db.models import Sum from figures.helpers import as_course_key, as_datetime, next_day, prev_day +from figures.mau import site_mau_1g_for_month_as_of_day from figures.models import CourseDailyMetrics, SiteDailyMetrics from figures.sites import ( get_courses_for_site, @@ -133,12 +134,15 @@ def extract(self, site, date_for=None, **kwargs): # pylint: disable=unused-argu todays_active_users = get_site_active_users_for_date(site, date_for) todays_active_user_count = todays_active_users.count() + mau = site_mau_1g_for_month_as_of_day(site, date_for) + data['todays_active_user_count'] = todays_active_user_count data['cumulative_active_user_count'] = get_previous_cumulative_active_user_count( site, date_for) + todays_active_user_count data['total_user_count'] = user_count data['course_count'] = course_count data['total_enrollment_count'] = get_total_enrollment_count(site, date_for) + data['mau'] = mau.count() return data @@ -186,6 +190,7 @@ def load(self, site, date_for=None, force_update=False, **_kwargs): total_user_count=data['total_user_count'], course_count=data['course_count'], total_enrollment_count=data['total_enrollment_count'], + mau=data['mau'], ) ) return site_metrics, created diff --git a/tests/metrics/test_site_monthly_metrics.py b/tests/metrics/test_site_monthly_metrics.py index d5b301df..81635bba 100644 --- a/tests/metrics/test_site_monthly_metrics.py +++ b/tests/metrics/test_site_monthly_metrics.py @@ -1,8 +1,7 @@ """ """ -from datetime import date, datetime -from factory import fuzzy +from datetime import date from freezegun import freeze_time from dateutil.rrule import rrule, MONTHLY @@ -10,25 +9,14 @@ import pytest -from figures.metrics import ( - get_site_mau_history_metrics, - get_site_mau_current_month -) +from figures.metrics import get_site_mau_history_metrics from figures.models import SiteMonthlyMetrics from tests.factories import ( - CourseOverviewFactory, - OrganizationFactory, - OrganizationCourseFactory, + SiteDailyMetricsFactory, SiteMonthlyMetricsFactory, SiteFactory, - StudentModuleFactory, - UserFactory, ) -from tests.helpers import organizations_support_sites - -if organizations_support_sites(): - from tests.factories import UserOrganizationMappingFactory @pytest.mark.django_db @@ -45,6 +33,7 @@ def test_get_site_mau_history_metrics_basic(db, monkeypatch): {'period': '2020/05', 'value': 11}, {'period': '2020/06', 'value': 12}]} + TODO: We want to revisit both this test and the function under test """ all_months_back = 12 months_back = 6 @@ -68,8 +57,9 @@ def test_get_site_mau_history_metrics_basic(db, monkeypatch): active_user_count=counter)) current_month_active = 42 - monkeypatch.setattr('figures.metrics.get_site_mau_current_month', - lambda n: current_month_active) + SiteDailyMetricsFactory(site=our_site, + date_for=mock_today - relativedelta(day=2), + mau=current_month_active) data = get_site_mau_history_metrics(site=our_site, months_back=months_back) @@ -82,36 +72,3 @@ def test_get_site_mau_history_metrics_basic(db, monkeypatch): obj = SiteMonthlyMetrics.objects.get(site=our_site, month_for=month_for) assert obj.active_user_count == rec['value'] assert obj.site == our_site - - -@pytest.mark.django_db -def test_get_site_mau_current_month(db): - - mock_today = date(year=2020, month=3, day=1) - freezer = freeze_time(mock_today) - freezer.start() - - start_dt = datetime(mock_today.year, mock_today.month, 1, tzinfo=fuzzy.compat.UTC) - end_dt = datetime(mock_today.year, mock_today.month, 31, tzinfo=fuzzy.compat.UTC) - date_gen = fuzzy.FuzzyDateTime(start_dt=start_dt, end_dt=end_dt) - site = SiteFactory() - course_overviews = [CourseOverviewFactory() for i in range(2)] - users = [UserFactory() for i in range(2)] - sm = [] - for user in users: - for co in course_overviews: - sm.append(StudentModuleFactory(course_id=co.id, - student=user, - modified=date_gen.evaluate(2, None, False))) - - if organizations_support_sites(): - org = OrganizationFactory(sites=[site]) - for co in course_overviews: - OrganizationCourseFactory(organization=org, course_id=str(co.id)) - for user in users: - UserOrganizationMappingFactory(user=user, - organization=org) - - active_user_count = get_site_mau_current_month(site) - freezer.stop() - assert active_user_count == len(users) diff --git a/tests/pipeline/test_site_daily_metrics.py b/tests/pipeline/test_site_daily_metrics.py index 81751d43..9d84e807 100644 --- a/tests/pipeline/test_site_daily_metrics.py +++ b/tests/pipeline/test_site_daily_metrics.py @@ -1,4 +1,7 @@ -'''Tests figures.pipeline. +'''Tests figures.pipeline.site_daily_metrics module + +IMPORTANT: We need to refactor the test data in this test module as this was +early work and we've learned a lot since then. TODO: @@ -71,6 +74,7 @@ total_user_count=200, course_count=len(CDM_INPUT_TEST_DATA), total_enrollment_count=100, + mau=55, ) ] @@ -81,6 +85,7 @@ total_user_count=200, course_count=len(CDM_INPUT_TEST_DATA), total_enrollment_count=150, + mau=56, ) @@ -232,12 +237,15 @@ def setup(self, db): organization=self.organization) def test_extract(self, monkeypatch): + previous_cumulative_active_user_count = 50 + expected_results = dict( cumulative_active_user_count=52, # previous cumulative is 50 todays_active_user_count=2, total_user_count=len(self.users), course_count=len(CDM_INPUT_TEST_DATA), total_enrollment_count=150, + mau=len(self.users), # expect 3 ) assert not StudentModule.objects.count() @@ -253,6 +261,19 @@ def mock_student_modules_for_site(site): monkeypatch.setattr(pipeline_sdm, 'get_student_modules_for_site', mock_student_modules_for_site) + def mock_site_mau_1g_for_month_as_of_day(site, date_for): + return get_user_model().objects.filter( + id__in=[user.id for user in self.users]).values('id') + + monkeypatch.setattr(pipeline_sdm, 'site_mau_1g_for_month_as_of_day', + mock_site_mau_1g_for_month_as_of_day) + + def mock_get_previous_cumulative_active_user_count(site, date_for): + return previous_cumulative_active_user_count + + monkeypatch.setattr(pipeline_sdm, 'get_previous_cumulative_active_user_count', + mock_get_previous_cumulative_active_user_count) + for course in figures.sites.get_courses_for_site(self.site): assert course.created.date() < self.date_for for user in figures.sites.get_users_for_site(self.site): @@ -278,6 +299,7 @@ class TestSiteDailyMetricsLoader(object): total_user_count=3, course_count=4, total_enrollment_count=5, + mau=6, ) class MockExtractor(object): diff --git a/tests/test_mau.py b/tests/test_mau.py index 56e7fcdf..a2c12de3 100644 --- a/tests/test_mau.py +++ b/tests/test_mau.py @@ -1,6 +1,10 @@ from datetime import datetime from freezegun import freeze_time +from dateutil.relativedelta import relativedelta +import pytest + +from django.utils.timezone import utc from courseware.models import StudentModule @@ -11,9 +15,13 @@ from figures.mau import ( get_mau_from_student_modules, get_mau_from_site_course, + mau_1g_for_month_as_of_day, + site_mau_1g_for_month_as_of_day, store_mau_metrics, ) +from tests.factories import StudentModuleFactory + def test_get_mau_from_site_course(sm_test_data): """Basic test for coverage with simple check @@ -47,6 +55,74 @@ def test_get_mau_from_sm_for_site(sm_test_data): assert set(users) == set(sm_check) +@pytest.mark.django_db +def test_mau_1g_for_month_as_of_day_first_day_next_month(db): + """ + Test getting live MAU 1G values from StudentModule for the given day + + Quick-n-dirty data setup: + + We want to make sure we get the right records when the query happens on the + first day of the next month. So we do the following + + * Add a StudentModule record for two months before + * Add at least one StudentModule record for the month we want + * Add at least one StudentModule record for after the month we want + + This sets up the scenario that we run the daily pipeline to capture MAU + "as of" yesterday (the last day of the previous month) to capture MAU for + the previous month + """ + mock_today = datetime(year=2020, month=4, day=1).replace(tzinfo=utc) + month_before = datetime(year=2020, month=2, day=2).replace(tzinfo=utc) + in_dates = [datetime(year=2020, month=3, day=1).replace(tzinfo=utc), + datetime(year=2020, month=3, day=15).replace(tzinfo=utc), + datetime(year=2020, month=3, day=31).replace(tzinfo=utc)] + date_for = mock_today.date() - relativedelta(days=1) + + # Create a student module in the month before, and in month after + StudentModuleFactory(created=month_before, modified=month_before) + StudentModuleFactory(created=mock_today, modified=mock_today) + sm_in = [StudentModuleFactory(created=rec, + modified=rec) for rec in in_dates] + expected_user_ids = [obj.student_id for obj in sm_in] + + sm_queryset = StudentModule.objects.all() + user_ids = mau_1g_for_month_as_of_day(sm_queryset=sm_queryset, + date_for=date_for) + assert set([rec['student__id'] for rec in user_ids]) == set(expected_user_ids) + + +def test_site_mau_1g_for_month_as_of_day(monkeypatch): + """Test our wrapper function, site_mau_1g_for_month_as_of_day + + All we really care about is the call stack is what we expect with the args + we expect + """ + expected_site = 'this is our site' + expected_date_for = 'this is my date' + expected_sm_queryset = 'this is my expected student module queryset' + expected_user_id_qs = 'this is my expected user id queryset' + + def mock_get_student_modules_for_site(site): + assert site == expected_site + return expected_sm_queryset + + def mock_mau_1g_for_month_as_of_day(sm_queryset, date_for): + assert date_for == expected_date_for + assert sm_queryset == expected_sm_queryset + return expected_user_id_qs + + monkeypatch.setattr('figures.mau.mau_1g_for_month_as_of_day', + mock_mau_1g_for_month_as_of_day) + monkeypatch.setattr('figures.mau.get_student_modules_for_site', + mock_get_student_modules_for_site) + + qs = site_mau_1g_for_month_as_of_day(site=expected_site, + date_for=expected_date_for) + assert qs == expected_user_id_qs + + def test_store_mau_metrics(monkeypatch, sm_test_data): """ Basic minimal test diff --git a/tests/views/test_site_daily_metrics_view.py b/tests/views/test_site_daily_metrics_view.py index 2d4aceb7..cb78536f 100644 --- a/tests/views/test_site_daily_metrics_view.py +++ b/tests/views/test_site_daily_metrics_view.py @@ -123,13 +123,13 @@ def test_create(self): Note: We don't need write functionality with this view as of version 0.2.0 """ data = dict( - # site=SiteSerializer(self.site).data, date_for='2020-01-01', cumulative_active_user_count=1, todays_active_user_count=2, total_user_count=3, course_count=4, - total_enrollment_count=5 + total_enrollment_count=5, + mau=6, ) # Might not need to set format='json' request = APIRequestFactory().post(