Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update SDM historic course count calculation #458

Merged
merged 1 commit into from
Apr 27, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 32 additions & 3 deletions figures/pipeline/site_daily_metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,12 @@

from django.db.models import Sum

from figures.course import Course
from figures.helpers import as_course_key, as_datetime, next_day
from figures.mau import site_mau_1g_for_month_as_of_day
from figures.models import CourseDailyMetrics, SiteDailyMetrics
from figures.sites import (
site_course_ids,
get_courses_for_site,
get_users_for_site,
get_student_modules_for_site,
Expand Down Expand Up @@ -54,6 +56,33 @@ def missing_course_daily_metrics(site, date_for):
# Standalone methods to extract data/aggregate data for use in SiteDailyMetrics
#


def get_course_ids_enrolled_on_or_before(site, date_for):
"""Best guess to get site courses created on or before the specified date

CourseOverview does not have a reliable 'created' field
So we need to get the first enrollment for the courses
Unfortunately, we need to iterate over the courses OR create a complex and
slow query, so our implementation is the safe brute force to iterate over
courses, get the first enrollment and see if that enrollment is on or before
the date.

We will first implement this using `figures.course.Course` as it can
provide the first enrollment date for a course. There is a slight overhead
of an extra query in that `Course`'s contructor retrieves the site for the
course. If this turns out to be expensive, then we can furter optimize. But
first, let's see if we can do this withoutg duplicating code.
"""
found_course_ids = []
compare_date = next_day(as_datetime(date_for))
for course_id in site_course_ids(site):
course = Course(course_id)
course_fe_ts = course.first_enrollment_timestamp()
if course_fe_ts and course_fe_ts < compare_date:
found_course_ids.append(course_id)
return found_course_ids


def get_site_active_users_for_date(site, date_for):
'''
Get the active users ids for the given site and date
Expand Down Expand Up @@ -123,9 +152,9 @@ def extract(self, site, date_for, **kwargs): # pylint: disable=unused-argument
site_users = get_users_for_site(site)
user_count = site_users.filter(
date_joined__lt=as_datetime(next_day(date_for))).count()
site_courses = get_courses_for_site(site)
course_count = site_courses.filter(
created__lt=as_datetime(next_day(date_for))).count()

course_ids = get_course_ids_enrolled_on_or_before(site, date_for)
course_count = len(course_ids)

todays_active_users = get_site_active_users_for_date(site, date_for)
todays_active_user_count = todays_active_users.count()
Expand Down
4 changes: 4 additions & 0 deletions tests/pipeline/test_site_daily_metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -277,6 +277,10 @@ def mock_get_previous_cumulative_active_user_count(site, date_for):
monkeypatch.setattr(pipeline_sdm, 'get_previous_cumulative_active_user_count',
mock_get_previous_cumulative_active_user_count)

course_ids = [str(co.id) for co in self.course_overviews]
monkeypatch.setattr(pipeline_sdm, 'get_course_ids_enrolled_on_or_before',
lambda site, date_for: course_ids)

for course in figures.sites.get_courses_for_site(self.site):
assert course.created.date() < self.date_for
for user in figures.sites.get_users_for_site(self.site):
Expand Down
89 changes: 89 additions & 0 deletions tests/pipeline/test_site_daily_metrics_functions.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
"""Test the module level functions in figures.pipeline.site_daily_metrics

First
"""
from __future__ import absolute_import
import pytest

from django.utils.timezone import utc
from faker import Faker

from figures.helpers import days_from
from figures.pipeline.site_daily_metrics import (
get_course_ids_enrolled_on_or_before
)

from tests.factories import (
CourseEnrollmentFactory,
CourseOverviewFactory,
SiteFactory,
)


fake = Faker()


@pytest.mark.django_db
class TestGetCoursesEnrolledOnOrBefore(object):
"""Tests the `get_course_ids_enrolled_on_or_before` function
"""
@pytest.fixture(autouse=True)
def setup(self, db, settings):
# self.course_overview = CourseOverviewFactory()
self.site = SiteFactory()
# Try for any datetime with `fake.date_time()` as the code we test here
# should be agnostic to "any when". If it turns up to fail, then we
# need to inspect the code as to why
self.date_for = fake.date_time(tzinfo=utc)
self.site_course_ids = 'figures.pipeline.site_daily_metrics.site_course_ids',

def test_with_no_courses(self, monkeypatch):
monkeypatch.setattr('figures.pipeline.site_daily_metrics.site_course_ids',
lambda site: [])
found_cids = get_course_ids_enrolled_on_or_before(self.site,
self.date_for)
assert found_cids == []

def test_with_no_enrollments(self, monkeypatch):
course_overview = CourseOverviewFactory()
monkeypatch.setattr('figures.pipeline.site_daily_metrics.site_course_ids',
lambda site: [str(course_overview.id)])
found_cids = get_course_ids_enrolled_on_or_before(self.site,
self.date_for)
assert found_cids == []

def test_mix(self, monkeypatch):
"""
This should cover our cases. We shouldn't need
`test_all_before` or `test_all_after`
"""

# Construct enrollments on days before date_for
ce_before = [
CourseEnrollmentFactory(created=days_from(self.date_for, days))
for days in range(-2, 0)
]
ce_before_cids = [str(ce.course_id) for ce in ce_before]

# Construct enrollments on date for
ce_date_for = [CourseEnrollmentFactory(created=self.date_for)]
ce_date_for_cids = [str(ce.course_id) for ce in ce_date_for]

# Construct enrollments on days after date_for
ce_after = [CourseEnrollmentFactory(created=days_from(self.date_for, days))
for days in range(1, 3)]
ce_after_cids = [str(ce.course_id) for ce in ce_after]

# adaptable checks to make sure we don't have duplicate course ids
assert not set.intersection(set(ce_before_cids), set(ce_after_cids))
assert not set.intersection(set(ce_before_cids), set(ce_date_for_cids))
assert not set.intersection(set(ce_date_for_cids), set(ce_after_cids))

all_cids = ce_before_cids + ce_date_for_cids + ce_after_cids
monkeypatch.setattr('figures.pipeline.site_daily_metrics.site_course_ids',
lambda site: all_cids)
monkeypatch.setattr('figures.course.get_site_for_course',
lambda course_id: self.site)
found_cids = get_course_ids_enrolled_on_or_before(self.site,
self.date_for)
assert set(found_cids) == set(ce_before_cids + ce_date_for_cids)