appsembler · johnbaldwin · Mar 16, 2022 · Mar 11, 2022 · Mar 14, 2022 · Mar 14, 2022
@@ -83,3 +83,6 @@ freezegun==0.3.12
 
 # Added to address: TypeError: attrib() got an unexpected keyword argument 'convert'
 attrs==19.1.0
+
+# Added to enable mock.mock_open using 'builtins.open'
+future==0.18.2
@@ -95,3 +95,6 @@ edx-lint==0.5.5
 
 # To address: edx-lint 0.5.5 requires astroid==1.5.2, but you'll have astroid 1.6.6 which is incompatible.
 astroid==1.5.2
+
+# Added to enable mock.mock_open using 'builtins.open'
+future==0.18.2
@@ -113,3 +113,11 @@ def enrollments_active_on_date(self, date_for):
         user_ids = sm.values('student_id').distinct()
         return CourseEnrollment.objects.filter(course_id=self.course_key,
                                                user_id__in=user_ids)
+
+    def enrollments_with_student_modules(self):
+        """Return CourseEnrollment objects that have StudentModule records
+        """
+        sm = StudentModule.objects.filter(course_id=self.course_key)
+        user_ids = sm.values('student_id').distinct()
+        return CourseEnrollment.objects.filter(course_id=self.course_key,
+                                               user_id__in=user_ids).distinct()
@@ -0,0 +1,57 @@
+"""This module handles enrollments
+
+Enrollment specific functionality should go here unless the functionality would
+only be run in the pipeline.
+
+At some point we might make an `Enrollment` class. But for now, this module is
+here to build functionality to shed more light into what such a class would
+look like.
+"""
+from figures.compat import StudentModule
+from figures.models import EnrollmentData
+
+
+def student_modules_for_enrollment(enrollment):
+    """Return an enrollment's StudentModule records, if they exist
+
+    This function exists because edx-platform does not have a model relationship between
+    `CourseEnrollment` and `StudentModule`.
+    """
+    return StudentModule.objects.filter(student_id=enrollment.user_id,
+                                        course_id=enrollment.course_id)
+
+
+def student_modules_for_enrollment_after_date(enrollment, date_for):
+    """Return StudentModule records modified for the enrollment after a date
+
+    TODO: Test if we need to do `modified__gte=next_day(date_for)` or if
+    `modified__gt=date_for` will trigger for times after midnight but before
+    the next day
+
+    We can ignore `StudentModule.created` because a new record also sets the
+    `modified` field.
+    """
+    return student_modules_for_enrollment(enrollment).filter(modified__gte=date_for)
+
+
+def is_enrollment_data_out_of_date(enrollment):
+    """
+    Assumes being called with an enrollment with student modules
+    """
+    # has EnrollmentData records? if not, then out of date
+    edrec = EnrollmentData.objects.get_for_enrollment(enrollment)
+    if edrec:
+        sm = student_modules_for_enrollment_after_date(enrollment, edrec.date_for)
+        # All we care about is if there are student modules modified
+        # after the EnrollmentData record was `date_for`
+        # out_of_date.append(enrollment)
+        out_of_date = True if sm.exists() else False
+    else:
+        # Just check if there are student modules for the enrollment, ever
+        # If there are student modules (and we've already check there is no
+        # enrollment), then we need to update
+        # If there are no student modules, then the enrollment had no activity
+        # then return whether there are StudentModule records or not
+        out_of_date = student_modules_for_enrollment(enrollment).exists()
+
+    return out_of_date
@@ -37,6 +37,7 @@
 from __future__ import print_function
 from __future__ import absolute_import
 
+import argparse
 from textwrap import dedent
 from django.contrib.sites.models import Site
 from django.core.management.base import BaseCommand, CommandError
@@ -82,7 +83,7 @@ def get_sites(self, options):
             sites.append(Site.objects.get(domain=site_identifier))
         return set(sites)
 
-    def get_course_ids(self, options):
+    def get_course_ids_from_courses_param(self, options):
         """Return a validated list of course id strings.
 
         If no courses are specified, returns an empty list
@@ -101,23 +102,47 @@ def get_course_ids(self, options):
             course_ids.append(str(course_overview.id))
         return course_ids
 
-    def update_enrollments(self, course_ids, use_celery=True):
+    def get_course_ids_from_file(self, options):
+        """"Gets a list of verified course ids from a flat file
+        Initially, we start with a flat file of course ids
+        The '#' sign is a comment
+        """
+        course_ids = []
+        lines = options['courses_file'].read().splitlines()
+        for line in lines:
+            line = line.strip()
+            if line and not line.startswith('#'):
+                course_overview = CourseOverview.objects.get(id=as_course_key(line))
+                course_ids.append(str(course_overview.id))
+        return course_ids
+
+    def update_enrollments(self, course_ids, use_celery):
+        """This method calls the Celery task in delay or immediate mode
+
+        TODO: If running as Celery tasks, improve the function, collect the
+        task ids, return them to the caller (Currently it is the `handle`
+        function) and report on processing state. Allow the user to quit and
+        the tasks keep running.
+        """
         for course_id in course_ids:
+            print('Updating enrollment data for course "{}"'.format(str(course_id)))
             if use_celery:
                 # Call the Celery task with delay
                 backfill_enrollment_data_for_course.delay(str(course_id))
+
             else:
                 # Call the Celery task immediately
                 backfill_enrollment_data_for_course(str(course_id))
 
     def add_arguments(self, parser):
         """
-
         If site domain or id or set of ides are provided, but no course or
         courses provided, runs for all courses in the site
 
         If a course id or list of course ids provided, processes those courses
-        If a site
+        If a site domain or list of site domains, process those, unless course_ids
+        are provided. If course ids are provided, those are processed and sites
+        ignored.
         """
         parser.add_argument('--sites',
                             nargs='+',
@@ -127,15 +152,28 @@ def add_arguments(self, parser):
                             nargs='+',
                             default=[],
                             help='backfill a specific course. provide course id string')
+        parser.add_argument('--courses-file', type=argparse.FileType('r'),
+                            help='Course IDs file. A flat file with course ids')
+        # TODO: document this
+        # we want to allow explicit --use-celery false
         parser.add_argument('--use-celery',
                             action='store_true',
-                            default=True,
-                            help='Run with Celery worker. Set to false to run in app space ')
+                            default=False,
+                            help='Run with Celery worker. Default is to run in immediate mode')
 
     def handle(self, *args, **options):
+
         print('BEGIN: Backfill Figures EnrollmentData')
         sites = self.get_sites(options)
-        course_ids = self.get_course_ids(options)
+
+        if options['courses_file']:
+            # Implmented two different methods just to keep them separate for
+            # this release and make it clear where the course ids are from
+            # Later on, we might abstract this to just `seof.get_course_ids(options)`
+            course_ids = self.get_course_ids_from_file(options)
+        else:
+            course_ids = self.get_course_ids_from_courses_param(options)
+
         use_celery = options['use_celery']
 
         # Would it benefit to implement a generator method to yield the course id?

@@ -22,6 +22,7 @@
 from figures.progress import EnrollmentProgress
 
 
+# Remove this. Import from figures.sites or will we get dependency issues?
 def default_site():
     """
     Wrapper aroung `django.conf.settings.SITE_ID` so we do not have to create a
@@ -204,6 +205,19 @@ class EnrollmentDataManager(models.Manager):
     EnrollmentData instances.
 
     """
+
+    def get_for_enrollment(self, course_enrollment):
+        """Returns EnrollmentData object or None for the given CourseEnrollment
+
+        This is a context specific `get_or_none` function that uses the `user_id`
+        and `course_id` from the enrollment argument.
+        """
+        try:
+            return self.get(user_id=course_enrollment.user_id,
+                            course_id=str(course_enrollment.course_id))
+        except EnrollmentData.DoesNotExist:
+            return None
+
     def set_enrollment_data(self, site, user, course_id, course_enrollment=None):
         """
         This is an expensive call as it needs to call CourseGradeFactory if

@@ -31,6 +31,7 @@
 """
 from django.db.models import Avg
 from figures.course import Course
+from figures.enrollment import is_enrollment_data_out_of_date
 from figures.helpers import utc_yesterday
 from figures.models import EnrollmentData
 from figures.sites import UnlinkedCourseError
@@ -42,6 +43,8 @@ def update_enrollment_data_for_course(course_id):
     `EnrollmentData` records
 
     Return results are a list of the results returned by `update_enrollment_data`
+
+    TODO: Add check if the course data actually exists in Mongo
     """
     date_for = utc_yesterday()
     the_course = Course(course_id)
@@ -55,6 +58,39 @@ def update_enrollment_data_for_course(course_id):
             for ce in active_enrollments]
 
 
+def stale_course_enrollments(course_id):
+    """Find missing/out of date EnrollmentData records for the course
+
+    The `EnrollmentData` model holds the most recent data about the enrollment.
+    This model is not created until after two events happen in sequence
+    1. The enrollment has activity that creates one or more`StudentModule` records
+    2. Figures daily metrics ran after the `StudentModule` records were created
+       OR
+       Figures `EnrollmentData` backfill was run
+
+    Maybe... We might want to add query methods to EnrollmentDataManager to
+    get out of date `EnrollmentData` records for a course.
+
+    NOTE: Naming this function was a bit of a challenge. What I used before:
+
+    "update_enrollment_data_for_course"
+    "enrollments_with_out_of_date_enrollment_data"
+    "ce_needs_enrollment_data_update"
+
+    Since we are in the context of Figures and figures doesn't modifity the platform,
+    we should be save with saying "stale_course_enrollments" for brevity
+    """
+    course = Course(course_id)
+
+    # If we have a course with no activity, nothing can be out of date.
+    # As such if the course has no student modules, the loop won't execute
+
+    # We are only interested in enrollments with activity
+    for enrollment in course.enrollments_with_student_modules():
+        if is_enrollment_data_out_of_date(enrollment):
+            yield enrollment
+
+
 def calculate_course_progress(course_id):
     """Return average progress percentage for all enrollments in the course
     """

@@ -19,8 +19,10 @@
 from celery.utils.log import get_task_logger
 
 from figures.compat import CourseEnrollment, CourseOverview
+from figures.course import Course
 from figures.helpers import as_course_key, as_date, is_past_date
 from figures.log import log_exec_time
+from figures.models import EnrollmentData
 from figures.sites import get_sites, get_sites_by_id, site_course_ids
 
 from figures.pipeline.backfill import backfill_enrollment_data_for_site
@@ -29,7 +31,10 @@
 from figures.pipeline.mau_pipeline import collect_course_mau
 from figures.pipeline.helpers import DateForCannotBeFutureError
 from figures.pipeline.site_monthly_metrics import fill_last_month as fill_last_smm_month
-from figures.pipeline.enrollment_metrics_next import update_enrollment_data_for_course
+from figures.pipeline.enrollment_metrics_next import (
+    update_enrollment_data_for_course,
+    stale_course_enrollments,
+)
 
 
 logger = get_task_logger(__name__)
@@ -324,15 +329,41 @@ def populate_daily_metrics_next(site_id=None, force_update=False):
 
 @shared_task
 def backfill_enrollment_data_for_course(course_id):
-    """Create or update EnrollmentData records for the course
+    """Update EnrollmentData records for activity before "yesterday"
 
-    Simple wrapper to run the enrollment update as a Celery task
+    This task function is to get `EnrollmentData` records up to date. This is
+    needed under at least the following conditions
 
-    We usually run this task through the Figures Django management command,
-    `backfill_figures_enrollment_data`
+    A. Figures is being installed/enabled on an existing Open edX deployment
+    B. The daily pipeline was stopped or failed to run for longer than a day
+
+    Why this function is needed is because it costs too much time to query
+    an enrollment's `StudentModule` record to find the latest date it was
+    modified. For the regular day to day pipeline, Figures can perform a **much**
+    fasater query to find if `StudentModule` records exist for the specific day
+    we gather our daily metrics. This is always the previous calendar day given
+    UTC time. The following function is what updates `EnrollmentData` on the
+    daily job:
+
+    ```
+    figures.pipeline.enrollment_metrics_next.update_enrollment_data_for_course
+    ```
+
+    There is a Figures Django management command to run this task:
+
+    ```
+    backfill_figures_enrollment_data
+    ```
     """
-    ed_objects = update_enrollment_data_for_course(course_id)
-    return [obj.id for obj in ed_objects]
+    course = Course(course_id)
+    updated = []
+    for enrollment in stale_course_enrollments(course_id):
+        # `update_metrics` results are a (object, created_flag) tuple
+        updated.append(EnrollmentData.objects.update_metrics(course.site, enrollment))
+
+    msg = ('figures.tasks.backfill_enrollment_data_for_course "{course_id}".'
+           ' Updated {edrec_count} enrollment data records.')
+    logger.info(msg.format(course_id=course_id, edrec_count=len(updated)))
 
 
 #