Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Enrollment data handling fixes #449

Merged
merged 7 commits into from
Mar 16, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions devsite/requirements/ginkgo.txt
Original file line number Diff line number Diff line change
Expand Up @@ -83,3 +83,6 @@ freezegun==0.3.12

# Added to address: TypeError: attrib() got an unexpected keyword argument 'convert'
attrs==19.1.0

# Added to enable mock.mock_open using 'builtins.open'
future==0.18.2
3 changes: 3 additions & 0 deletions devsite/requirements/hawthorn_base.txt
Original file line number Diff line number Diff line change
Expand Up @@ -95,3 +95,6 @@ edx-lint==0.5.5

# To address: edx-lint 0.5.5 requires astroid==1.5.2, but you'll have astroid 1.6.6 which is incompatible.
astroid==1.5.2

# Added to enable mock.mock_open using 'builtins.open'
future==0.18.2
8 changes: 8 additions & 0 deletions figures/course.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,3 +113,11 @@ def enrollments_active_on_date(self, date_for):
user_ids = sm.values('student_id').distinct()
return CourseEnrollment.objects.filter(course_id=self.course_key,
user_id__in=user_ids)

def enrollments_with_student_modules(self):
"""Return CourseEnrollment objects that have StudentModule records
"""
sm = StudentModule.objects.filter(course_id=self.course_key)
user_ids = sm.values('student_id').distinct()
return CourseEnrollment.objects.filter(course_id=self.course_key,
user_id__in=user_ids).distinct()
57 changes: 57 additions & 0 deletions figures/enrollment.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
"""This module handles enrollments

Enrollment specific functionality should go here unless the functionality would
only be run in the pipeline.

At some point we might make an `Enrollment` class. But for now, this module is
here to build functionality to shed more light into what such a class would
look like.
"""
from figures.compat import StudentModule
from figures.models import EnrollmentData


def student_modules_for_enrollment(enrollment):
"""Return an enrollment's StudentModule records, if they exist

This function exists because edx-platform does not have a model relationship between
`CourseEnrollment` and `StudentModule`.
"""
return StudentModule.objects.filter(student_id=enrollment.user_id,
course_id=enrollment.course_id)


def student_modules_for_enrollment_after_date(enrollment, date_for):
"""Return StudentModule records modified for the enrollment after a date

TODO: Test if we need to do `modified__gte=next_day(date_for)` or if
`modified__gt=date_for` will trigger for times after midnight but before
the next day

We can ignore `StudentModule.created` because a new record also sets the
`modified` field.
"""
return student_modules_for_enrollment(enrollment).filter(modified__gte=date_for)


def is_enrollment_data_out_of_date(enrollment):
"""
Assumes being called with an enrollment with student modules
"""
# has EnrollmentData records? if not, then out of date
edrec = EnrollmentData.objects.get_for_enrollment(enrollment)
if edrec:
sm = student_modules_for_enrollment_after_date(enrollment, edrec.date_for)
# All we care about is if there are student modules modified
# after the EnrollmentData record was `date_for`
# out_of_date.append(enrollment)
out_of_date = True if sm.exists() else False
else:
# Just check if there are student modules for the enrollment, ever
# If there are student modules (and we've already check there is no
# enrollment), then we need to update
# If there are no student modules, then the enrollment had no activity
# then return whether there are StudentModule records or not
out_of_date = student_modules_for_enrollment(enrollment).exists()

return out_of_date
52 changes: 45 additions & 7 deletions figures/management/commands/backfill_figures_enrollment_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
from __future__ import print_function
from __future__ import absolute_import

import argparse
from textwrap import dedent
from django.contrib.sites.models import Site
from django.core.management.base import BaseCommand, CommandError
Expand Down Expand Up @@ -82,7 +83,7 @@ def get_sites(self, options):
sites.append(Site.objects.get(domain=site_identifier))
return set(sites)

def get_course_ids(self, options):
def get_course_ids_from_courses_param(self, options):
"""Return a validated list of course id strings.
If no courses are specified, returns an empty list
Expand All @@ -101,23 +102,47 @@ def get_course_ids(self, options):
course_ids.append(str(course_overview.id))
return course_ids

def update_enrollments(self, course_ids, use_celery=True):
def get_course_ids_from_file(self, options):
""""Gets a list of verified course ids from a flat file
Initially, we start with a flat file of course ids
The '#' sign is a comment
"""
course_ids = []
lines = options['courses_file'].read().splitlines()
for line in lines:
line = line.strip()
if line and not line.startswith('#'):
course_overview = CourseOverview.objects.get(id=as_course_key(line))
course_ids.append(str(course_overview.id))
return course_ids

def update_enrollments(self, course_ids, use_celery):
"""This method calls the Celery task in delay or immediate mode
TODO: If running as Celery tasks, improve the function, collect the
task ids, return them to the caller (Currently it is the `handle`
function) and report on processing state. Allow the user to quit and
the tasks keep running.
"""
for course_id in course_ids:
print('Updating enrollment data for course "{}"'.format(str(course_id)))
if use_celery:
# Call the Celery task with delay
backfill_enrollment_data_for_course.delay(str(course_id))

else:
# Call the Celery task immediately
backfill_enrollment_data_for_course(str(course_id))

def add_arguments(self, parser):
"""
If site domain or id or set of ides are provided, but no course or
courses provided, runs for all courses in the site
If a course id or list of course ids provided, processes those courses
If a site
If a site domain or list of site domains, process those, unless course_ids
are provided. If course ids are provided, those are processed and sites
ignored.
"""
parser.add_argument('--sites',
nargs='+',
Expand All @@ -127,15 +152,28 @@ def add_arguments(self, parser):
nargs='+',
default=[],
help='backfill a specific course. provide course id string')
parser.add_argument('--courses-file', type=argparse.FileType('r'),
help='Course IDs file. A flat file with course ids')
# TODO: document this
# we want to allow explicit --use-celery false
parser.add_argument('--use-celery',
action='store_true',
default=True,
help='Run with Celery worker. Set to false to run in app space ')
default=False,
help='Run with Celery worker. Default is to run in immediate mode')

def handle(self, *args, **options):

print('BEGIN: Backfill Figures EnrollmentData')
sites = self.get_sites(options)
course_ids = self.get_course_ids(options)

if options['courses_file']:
# Implmented two different methods just to keep them separate for
# this release and make it clear where the course ids are from
# Later on, we might abstract this to just `seof.get_course_ids(options)`
course_ids = self.get_course_ids_from_file(options)
else:
course_ids = self.get_course_ids_from_courses_param(options)

use_celery = options['use_celery']

# Would it benefit to implement a generator method to yield the course id?
Expand Down
14 changes: 14 additions & 0 deletions figures/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
from figures.progress import EnrollmentProgress


# Remove this. Import from figures.sites or will we get dependency issues?
def default_site():
"""
Wrapper aroung `django.conf.settings.SITE_ID` so we do not have to create a
Expand Down Expand Up @@ -204,6 +205,19 @@ class EnrollmentDataManager(models.Manager):
EnrollmentData instances.

"""

def get_for_enrollment(self, course_enrollment):
"""Returns EnrollmentData object or None for the given CourseEnrollment

This is a context specific `get_or_none` function that uses the `user_id`
and `course_id` from the enrollment argument.
"""
try:
return self.get(user_id=course_enrollment.user_id,
course_id=str(course_enrollment.course_id))
except EnrollmentData.DoesNotExist:
return None

def set_enrollment_data(self, site, user, course_id, course_enrollment=None):
"""
This is an expensive call as it needs to call CourseGradeFactory if
Expand Down
36 changes: 36 additions & 0 deletions figures/pipeline/enrollment_metrics_next.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
"""
from django.db.models import Avg
from figures.course import Course
from figures.enrollment import is_enrollment_data_out_of_date
from figures.helpers import utc_yesterday
from figures.models import EnrollmentData
from figures.sites import UnlinkedCourseError
Expand All @@ -42,6 +43,8 @@ def update_enrollment_data_for_course(course_id):
`EnrollmentData` records

Return results are a list of the results returned by `update_enrollment_data`

TODO: Add check if the course data actually exists in Mongo
"""
date_for = utc_yesterday()
the_course = Course(course_id)
Expand All @@ -55,6 +58,39 @@ def update_enrollment_data_for_course(course_id):
for ce in active_enrollments]


def stale_course_enrollments(course_id):
"""Find missing/out of date EnrollmentData records for the course

The `EnrollmentData` model holds the most recent data about the enrollment.
This model is not created until after two events happen in sequence
1. The enrollment has activity that creates one or more`StudentModule` records
2. Figures daily metrics ran after the `StudentModule` records were created
OR
Figures `EnrollmentData` backfill was run

Maybe... We might want to add query methods to EnrollmentDataManager to
get out of date `EnrollmentData` records for a course.

NOTE: Naming this function was a bit of a challenge. What I used before:

"update_enrollment_data_for_course"
"enrollments_with_out_of_date_enrollment_data"
"ce_needs_enrollment_data_update"

Since we are in the context of Figures and figures doesn't modifity the platform,
we should be save with saying "stale_course_enrollments" for brevity
"""
course = Course(course_id)

# If we have a course with no activity, nothing can be out of date.
# As such if the course has no student modules, the loop won't execute

# We are only interested in enrollments with activity
for enrollment in course.enrollments_with_student_modules():
if is_enrollment_data_out_of_date(enrollment):
yield enrollment


def calculate_course_progress(course_id):
"""Return average progress percentage for all enrollments in the course
"""
Expand Down
45 changes: 38 additions & 7 deletions figures/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,10 @@
from celery.utils.log import get_task_logger

from figures.compat import CourseEnrollment, CourseOverview
from figures.course import Course
from figures.helpers import as_course_key, as_date, is_past_date
from figures.log import log_exec_time
from figures.models import EnrollmentData
from figures.sites import get_sites, get_sites_by_id, site_course_ids

from figures.pipeline.backfill import backfill_enrollment_data_for_site
Expand All @@ -29,7 +31,10 @@
from figures.pipeline.mau_pipeline import collect_course_mau
from figures.pipeline.helpers import DateForCannotBeFutureError
from figures.pipeline.site_monthly_metrics import fill_last_month as fill_last_smm_month
from figures.pipeline.enrollment_metrics_next import update_enrollment_data_for_course
from figures.pipeline.enrollment_metrics_next import (
update_enrollment_data_for_course,
stale_course_enrollments,
)


logger = get_task_logger(__name__)
Expand Down Expand Up @@ -324,15 +329,41 @@ def populate_daily_metrics_next(site_id=None, force_update=False):

@shared_task
def backfill_enrollment_data_for_course(course_id):
"""Create or update EnrollmentData records for the course
"""Update EnrollmentData records for activity before "yesterday"

Simple wrapper to run the enrollment update as a Celery task
This task function is to get `EnrollmentData` records up to date. This is
needed under at least the following conditions

We usually run this task through the Figures Django management command,
`backfill_figures_enrollment_data`
A. Figures is being installed/enabled on an existing Open edX deployment
B. The daily pipeline was stopped or failed to run for longer than a day

Why this function is needed is because it costs too much time to query
an enrollment's `StudentModule` record to find the latest date it was
modified. For the regular day to day pipeline, Figures can perform a **much**
fasater query to find if `StudentModule` records exist for the specific day
we gather our daily metrics. This is always the previous calendar day given
UTC time. The following function is what updates `EnrollmentData` on the
daily job:

```
figures.pipeline.enrollment_metrics_next.update_enrollment_data_for_course
```

There is a Figures Django management command to run this task:

```
backfill_figures_enrollment_data
```
"""
ed_objects = update_enrollment_data_for_course(course_id)
return [obj.id for obj in ed_objects]
course = Course(course_id)
updated = []
for enrollment in stale_course_enrollments(course_id):
# `update_metrics` results are a (object, created_flag) tuple
updated.append(EnrollmentData.objects.update_metrics(course.site, enrollment))

msg = ('figures.tasks.backfill_enrollment_data_for_course "{course_id}".'
' Updated {edrec_count} enrollment data records.')
logger.info(msg.format(course_id=course_id, edrec_count=len(updated)))


#
Expand Down
Loading