From 546b6dd78b31fa169dfd8c2163b8be52133166a7 Mon Sep 17 00:00:00 2001 From: Vladimir Jimenez Date: Sun, 12 Mar 2017 14:47:10 -0700 Subject: [PATCH] Huge refactor/update for the aggregate script - Rewrite and optimize the entire script - Add basic unit tests for the core functions of the script that handle the actual data manipulation --- App_Data/jobs/triggered/aggregate/job.py | 556 +++++++++++----------- App_Data/jobs/triggered/aggregate/test.py | 131 +++++ 2 files changed, 403 insertions(+), 284 deletions(-) create mode 100644 App_Data/jobs/triggered/aggregate/test.py diff --git a/App_Data/jobs/triggered/aggregate/job.py b/App_Data/jobs/triggered/aggregate/job.py index 6d8f9b0..04188f7 100644 --- a/App_Data/jobs/triggered/aggregate/job.py +++ b/App_Data/jobs/triggered/aggregate/job.py @@ -5,359 +5,347 @@ import os import shutil import sys -from collections import Counter +from types import * from random import shuffle +from collections import Counter -# The location where agencies individual data is stored; e.g. each agency has its own folder -if len(sys.argv) > 1: - report_folder = sys.argv[1] -else: - report_folder = os.path.join( - os.environ['HOME'], - "site", - "wwwroot", - os.environ["ANALYTICS_DATA_PATH"] - ) - -# Where the aggregated data will go -target_folder = report_folder + "_aggregation" - -# Make a temporary folder for data aggregation -if os.path.exists(target_folder): - shutil.rmtree(target_folder) - -os.mkdir(target_folder) - -# Reports that will not be aggregated by this script -ignored_reports = [ -] +target_path = '' -def merge_dict_into(objOne, objTwo): +def report_path(file_name): """ - Add keys from objTwo that do not exist in objOne to objOne + Return the path to the location where aggregated data will be written to before it's deployed """ - missingKeys = [ key for key in objTwo if key not in objOne ] + return os.path.join(target_folder, file_name) - for key in missingKeys: - objOne[key] = objTwo[key] - -def merge_dict_addition(objOne, objTwo): +def csv_report_path(file_name): """ - Merge two objects and add the respective values to get a total of both + Build a path to the target location for a CSV equivalent version from a JSON file name """ - if not objOne: - return objTwo - - if not objTwo: - return objOne - - newObj = {} + csv_name = os.path.splitext(os.path.basename(file_name))[0] + '.csv' + return report_path(csv_name) - for key in objOne: - try: - if isinstance(objOne[key], (int, list, tuple)): - newObj[key] = objOne[key] + objTwo[key] - elif isinstance(objOne[key], dict): - newObj[key] = merge_dict_addition(objOne[key], objTwo[key]) - except KeyError: - pass - - return newObj - -def write_json_file(file_name, json_data): +def read_json_file(file_name): """ - Open `file_name` and dump JSON into the file + Open `file_name` and parse it as JSON and return the respective content as a dictionary """ - with open(os.path.join(target_folder, file_name), 'w', encoding='utf8') as data_file: - json.dump(json_data, data_file, indent=4) + with open(file_name, 'r', encoding='utf8') as data_file: + return json.load(data_file) -def json_file_writer(fileName, function): +def write_json_file(file_name, json_data): """ - Open `fileName` and load it as JSON. 
Call `function` and write the mutated `data` variable into the original file + Open `file_name` and dump JSON into the file; this function will overwrite everything that's in the file already """ - with open(os.path.join(target_folder, fileName), 'r+', encoding='utf8') as data_file: - data = json.load(data_file) + with open(file_name, 'w', encoding='utf8') as data_file: + json.dump(json_data, data_file, indent=4) - function(data) +def write_csv_file(file_name, data, headers): + with open(file_name, 'w', encoding='utf8') as csv_file: + csvwriter = csv.DictWriter(csv_file, dialect='unix', fieldnames=headers) + csvwriter.writeheader() - data_file.seek(0) - json.dump(data, data_file, indent=4) - data_file.truncate() + [ csvwriter.writerow(item) for item in data ] -def aggregate_list_sum(data, groupKey, sumKey, ignoreKeys = None): +def sum_shared_dict_keys(obj_one, obj_two): """ - Loop through a list and sum up the `sumKey` values while treating `groupKey` as a unique ID. The `ignoreKeys` allows - for a list of keys to ignore and not return - """ - output = {} - - for item in data: - key = item[groupKey] - - if key not in output: - output[key] = item - output[key][sumKey] = int(output[key][sumKey]) - else: - output[key][sumKey] += int(item[sumKey]) - - if ignoreKeys is not None: - for k in ignoreKeys: - output[key].pop(k, None) - - return [ output[item] for item in output ] + Loop through all of the keys in a dictionary and find the sum for the respective values in both dictionaries. This + function supports ints, lists, and dictionaries; anything else will be removed from the final result. -def aggregate_list_sum_file(fileName, groupKey, sumKey, ignoreKeys = None, sort = None): - - def action(data): - finalData = aggregate_list_sum(data['data'], groupKey, sumKey, ignoreKeys) + See 'test_sum_shared_dict_keys' in test.py for sample usage and expected results + """ + if not obj_one: + return obj_two - if sort is not None: - finalData = sorted(finalData, key = sort) + if not obj_two: + return obj_one - data['data'] = finalData[0:min(len(finalData), data['query']['max-results'])] + newObj = {} - json_file_writer(fileName, action) + for key in obj_one: + if isinstance(obj_one[key], (int, list)): + newObj[key] = obj_one[key] + obj_two[key] + elif isinstance(obj_one[key], dict): + newObj[key] = sum_shared_dict_keys(obj_one[key], obj_two[key]) -def aggregate_json_data(jsonFile, primaryKey, uniqueKey, sumKey, fieldnames, sort = None): + return newObj - def action(data): - primaryKeys = list({ item[primaryKey] for item in data['data'] }) - totals = [] +def sum_data_by_key(data, group_by, sum_key, keysToStrip = [], sort_by = None): + """ + Sum up a specific key in an array of dictionaries. Each dictionary can be uniquely identified by the `group_by`, + which can be a lambda or a string; use lambdas to concat two fields together to get a unique identifier for each + dictionary. 
- for pKey in primaryKeys: - items = [ item for item in data['data'] if item[primaryKey] == pKey ] - totals += aggregate_list_sum(items, uniqueKey, sumKey) + See 'test_sum_data_by_key' in test.py for sample usage and expected results + """ + result = {} - data['data'] = sorted(totals, key = sort) + for entry in data: + key = group_by(entry) if isinstance(group_by, LambdaType) else entry[group_by] - json_file_writer(jsonFile, action) + if key not in result: + result[key] = entry + result[key][sum_key] = int(result[key][sum_key]) + else: + result[key][sum_key] += int(entry[sum_key]) -def csv_file_writer(fileName, data, fieldnames, sort = None): - csvFile = os.path.join(target_folder, os.path.splitext(os.path.basename(fileName))[0] + '.csv') + for k in keysToStrip: + result[key].pop(k, None) - with open(csvFile, 'w+', encoding='utf8') as csv_file: - csvwriter = csv.DictWriter(csv_file, dialect='unix', fieldnames=fieldnames) - csvwriter.writeheader() + result = [ result[key] for key in result ] - [ csvwriter.writerow(item) for item in sorted(data, key=sort) ] + if sort_by is not None: + result = sorted(result, key = sort_by) -def aggregate_csv_data(jsonFile, fieldnames, sort = None): + return result - with open(os.path.join(target_folder, jsonFile), encoding='utf8') as data_file: +def sum_data_by_key_file(file_name, group_by, sum_key, keys_to_strip, sort_by): + with open(file_name, 'r+', encoding='utf8') as data_file: data = json.load(data_file) + final_data = sum_data_by_key(data['data'], group_by, sum_key, keys_to_strip, sort_by) + data['data'] = final_data[0:min(len(final_data), data['query']['max-results'])] - csv_file_writer(jsonFile, data['data'], fieldnames, sort) - - -# Get all of our agencies and deleted the first item in the list. The first item is a collection -# of everything in the folder and is safe to skip -agencies = [ agency for agency in os.walk(report_folder) ] -del agencies[0] - -# Get all of the reports in the first agency's folder. Since all agencies have the same reports generated, -# we'll be fine -reports = agencies[0] - -# With the aggregation, the sorting is lost, so sort these reports' `data` array by the respective key -sortBy = { - 'top-pages-7-days.json': 'visits', - 'top-pages-30-days.json': 'visits', - 'top-pages-realtime.json': 'active_visitors' -} - -# These keys need to be stripped from the respective reports -stripKeys = { - 'top-countries-realtime.json': ['domain'], - 'top-cities-realtime.json': ['domain'] -} - -# For certain reports, we'll have to borrow values from other reports in order to fix inconsistencies. This will method -# will make some not so smart assumptions and hopes it works. -borrowKeys = { - "top-pages-7-days.json": ["domain"], - "top-pages-30-days.json": ["domain"] -} - -global_variables = {} -with open(os.path.join(os.environ['HOME'], "site", "wwwroot", "reports", "variables.json")) as data_file: - global_variables = json.load(data_file) - - -# Aggregate all of the reports -# ----- + data_file.seek(0) + data_file.truncate() + json.dump(data, data_file, indent=4) -for report in reports[2]: - if not report.endswith('.json') or report in ignored_reports: - continue + return data - jsonData = [] - for agency in agencies: - reportFile = os.path.join(agency[0], report) +if __name__ == "__main__": - try: - with open(reportFile, encoding='utf8') as file_content: - data = json.load(file_content) + # Set some variables based on the environment we're in; i.e. 
production or development + if len(sys.argv) > 1: + cwd = '_site' + report_folder = sys.argv[1] + else: + cwd = os.path.join(os.environ['HOME'], "site", "wwwroot") + report_folder = os.path.join(cwd, os.environ["ANALYTICS_DATA_PATH"]) - if not jsonData: - jsonData = data - continue + # Where the aggregated data will go. We don't modify the data in place since it'll affect/break the website during + # the process + target_folder = report_folder + "_aggregation" - merge_dict_into(jsonData, data) + # Make a temporary folder for data aggregation + if os.path.exists(target_folder): + shutil.rmtree(target_folder) - try: - jsonData['data'] += data['data'] - except KeyError: - pass + os.mkdir(target_folder) - try: - jsonData['totals'] = merge_dict_addition(jsonData['totals'], data['totals']) - except KeyError: - pass + # Reports that will not be aggregated by this script + ignored_reports = [] - if report in borrowKeys: - c_agency = os.path.basename(agency[0]) + # Get all of our agencies and deleted the first item in the list. The first item is a collection of everything in + # the folder and is safe to skip + agencies = [ agency for agency in os.walk(report_folder) ] + del agencies[0] - for item in jsonData['data']: - if 'replace_done' not in item: - item['replace_done'] = False + # Get all of the reports for the smgov website. We will go on the assumption that the 'smgov' website will have all + # of the reports + reports = next(filter(lambda x: x[0] == "data/smgov", agencies)) - for key_to_replace in borrowKeys[report]: - if not item['replace_done']: - item[key_to_replace] = global_variables[c_agency][key_to_replace] + # With the aggregation, the sorting is lost, so sort these reports' `data` array by the respective key + sortBy = { + 'top-pages-7-days.json': 'visits', + 'top-pages-30-days.json': 'visits', + 'top-pages-realtime.json': 'active_visitors' + } - item['replace_done'] = True + # These keys need to be stripped from the respective reports + stripKeys = { + 'top-countries-realtime.json': ['domain'], + 'top-cities-realtime.json': ['domain'] + } - except IOError: - pass + # Specific keys or fields that will be replaced during the aggregation based on the values retrieved from env.json + findEnvReplace = { + 'all-pages-realtime.json': ['domain'], + 'top-pages-7-days.json': ['domain'], + 'top-pages-30-days.json': ['domain'] + } + # Environment variables set during analytics fetching try: - sortKey = sortBy[report] - sortedData = sorted(jsonData['data'], key=lambda x: -int(x[sortKey])) + environment_vars = read_json_file(os.path.join(cwd, 'reports', 'env.json')) + except FileNotFoundError: + print("No environment variables have been defined. 
If you're in a dev environment, be sure to build the website first") + exit() + + # + # Aggregate all of the reports + # - moreThanOneViewer = [item for item in sortedData if int(item[sortKey]) > 1] - onlyOneViewer = [item for item in sortedData if int(item[sortKey]) == 1] + # reports[2] is where all of the report file names are stored + for report in reports[2]: + if not report.endswith('.json') or report in ignored_reports: + continue - shuffle(onlyOneViewer) - sortedData = moreThanOneViewer + onlyOneViewer + # ...short for 'aggregated' + agg_data = [] - jsonData['data'] = sortedData[0:min(len(sortedData), jsonData['query']['max-results'])] - except KeyError: - pass + for agency in agencies: + # agency[0] is the path to the agency + report_file = os.path.join(agency[0], report) + agency_name = os.path.basename(agency[0]) - if report in stripKeys or report in borrowKeys: - for item in jsonData['data']: try: - del item['replace_done'] - for key in stripKeys[report]: - del item[key] - except KeyError: + with open(report_file, 'r+', encoding='utf8') as file_content: + data = json.load(file_content) + + # Fields that need to be replaced based on environment variables due to Google Analytics returning + # data in a different format + try: + for key in findEnvReplace[report]: + for item in data['data']: + for env in environment_vars[agency_name]: + item[key] = environment_vars[agency_name][env] + + # Since we use data for individual websites now, we'll update individual website data with our + # replacements too + file_content.seek(0) + file_content.truncate() + json.dump(data, file_content, indent=4) + except KeyError: + pass + + if not agg_data: + agg_data = data + continue + + if 'data' in agg_data: + agg_data['data'] += data['data'] + + if 'totals' in agg_data: + agg_data['totals'] = sum_shared_dict_keys(agg_data['totals'], data['totals']) + + # This'll happen if the file was not found, meaning this agency isn't configured to have this report. For + # example, not all websites will have the `realtime.json` report. + except IOError: pass + # If the report data requires to be sorted, sort the data for it based on the key specified. 
Any results that + # match the value 1 will be shuffled so they are not listed alphabetically (there are a lot of these) + if report in sortBy: + sortKey = sortBy[report] + sortedData = sorted(agg_data['data'], key=lambda x: -int(x[sortKey])) - write_json_file(report, jsonData) - - -# Reports that need further, special, aggregation -# ----- - - -# Let's count unique cities & countries and total up our active visitors and create the respective files -with open(os.path.join(target_folder, 'all-pages-realtime.json'), 'r+', encoding='utf8') as data_file: - data = json.load(data_file) + moreThanOneViewer = [item for item in sortedData if int(item[sortKey]) > 1] + onlyOneViewer = [item for item in sortedData if int(item[sortKey]) == 1] - # City or country codes that should be ignored - ignoreKeys = [ 'zz' ] + shuffle(onlyOneViewer) + sortedData = moreThanOneViewer + onlyOneViewer - # First tally up the number of entries for things, respectively - countries = Counter([ k['country'] for k in data['data'] ]) - cities = Counter([ k['city'] for k in data['data'] ]) - total = sum([ int(k['active_visitors']) for k in data['data'] ]) - - # Convert the tallies into dictionaries and sort them by visitors so our dashboard knows how to handle them - countriesData = [ {'country': k[0], 'active_visitors': k[1]} for k in list(dict(countries).items()) if k[0] not in ignoreKeys ] - countriesData = sorted(countriesData, key = lambda x: -x['active_visitors']) - - citiesData = [ {'city': k[0], 'active_visitors': k[1]} for k in list(dict(cities).items()) if k[0] not in ignoreKeys ] - citiesData = sorted(citiesData, key = lambda x: -x['active_visitors']) - - # Write the data into the expected files so we don't have to break/change the dashboard - write_json_file('top-countries-realtime.json', { 'data': countriesData }) - write_json_file('top-cities-realtime.json', { 'data': citiesData }) - write_json_file('realtime.json', { 'data': [{ 'active_visitors': total }] }) - - -# Clean-up 'all-pages-realtime.json' from duplicate URLs and get rid of 'country' & 'city' keys while we're at it -sortCountKey = lambda x: -int(x[sumKey]) -groupByKey = 'page' -ignoreKeys = [ 'country', 'city' ] -sumKey = 'active_visitors' -aggregate_list_sum_file('all-pages-realtime.json', groupByKey, sumKey, ignoreKeys, sortCountKey) - - -# Today.json aggregation -sortCountKey = lambda x: int(x[groupByKey]) -groupByKey = 'hour' -ignoreKeys = None -sumKey = 'visits' -aggregate_list_sum_file('today.json', groupByKey, sumKey, ignoreKeys, sortCountKey) -aggregate_list_sum_file('last-48-hours.json', groupByKey, sumKey, ignoreKeys, sortCountKey) - - -# Aggregate `users.json` -aggregate_list_sum_file('users.json', 'date', 'visits', None, lambda x: x['date']) - - -# CSV aggregation -# ----- + agg_data['data'] = sortedData[0:min(len(sortedData), agg_data['query']['max-results'])] + if report in stripKeys: + for item in agg_data['data']: + for key in stripKeys[report]: + del item[key] -# All of these reports have similar data structures -aggregationDefinitions = { - 'browsers.json': 'browser', - 'devices.json': 'device', - 'ie.json': 'browser_version', - 'os.json': 'os', - 'windows.json': 'os_version' -} + write_json_file(report_path(report), agg_data) -for k in aggregationDefinitions: - v = aggregationDefinitions[k] - sorting = lambda x: (x['date'], -int(x['visits'])) - aggregate_json_data(k, 'date', v, 'visits', ['date', v, 'visits'], sorting) - aggregate_csv_data(k, ['date', v, 'visits'], sorting) + # Reports that need further, special, aggregation + # 
----- -# Aggregate the "top pages" reports -aggregateTopPages = { - 'all-pages-realtime.json': 'active_visitors', - 'top-pages-7-days.json': 'visits', - 'top-pages-30-days.json': 'visits' -} + # The 'all-pages-realtime.json' report contains data regarding active users based on cities and countries. In order + # to avoid making a separate GA call, we'll tally up the number of users per city and country from this existing + # report and create our own report that the dashboard will understand and accept + with open(report_path('all-pages-realtime.json'), 'r+', encoding='utf8') as data_file: + data = json.load(data_file) -for report in aggregateTopPages: - with open(os.path.join(target_folder, report), encoding='utf8') as json_file: + # City or country codes that should be ignored + ignoreKeys = [ 'zz' ] + + # Tally up the number of entries for things, respectively. We'll receive a dictionary in the following format: + # {'United States': 50, 'Canada': 2} + countries = Counter([ k['country'] for k in data['data'] ]) + cities = Counter([ k['city'] for k in data['data'] ]) + total = sum([ int(k['active_visitors']) for k in data['data'] ]) + + # Convert the tallies into a list of dictionaries and sort them by visitors. By doing this, we'll be giving the + # dashboard the syntax it expects + countriesData = [ {'country': k[0], 'active_visitors': k[1]} for k in list(countries.items()) if k[0] not in ignoreKeys ] + countriesData = sorted(countriesData, key = lambda x: -x['active_visitors']) + citiesData = [ {'city': k[0], 'active_visitors': k[1]} for k in list(cities.items()) if k[0] not in ignoreKeys ] + citiesData = sorted(citiesData, key = lambda x: -x['active_visitors']) + + # Write the data into the expected files so we don't have to break/change the dashboard + write_json_file(report_path('top-countries-realtime.json'), { 'data': countriesData }) + write_json_file(report_path('top-cities-realtime.json'), { 'data': citiesData }) + write_json_file(report_path('realtime.json'), { 'data': [{ 'active_visitors': total }] }) + + # Clean-up 'all-pages-realtime.json' from duplicate URLs and get rid of 'country' & 'city' keys while we're at it + sortCountKey = lambda x: -int(x[sumKey]) + groupByKey = lambda x: x['domain'] + x['page'] + ignoreKeys = [ 'country', 'city' ] + sumKey = 'active_visitors' + sum_data_by_key_file(report_path('all-pages-realtime.json'), groupByKey, sumKey, ignoreKeys, sortCountKey) + + # Today.json aggregation + sortCountKey = lambda x: int(x[groupByKey]) + groupByKey = 'hour' + ignoreKeys = [] + sumKey = 'visits' + sum_data_by_key_file(report_path('today.json'), groupByKey, sumKey, ignoreKeys, sortCountKey) + sum_data_by_key_file(report_path('last-48-hours.json'), groupByKey, sumKey, ignoreKeys, sortCountKey) + + # Aggregate `users.json` + sum_data_by_key_file(report_path('users.json'), 'date', 'visits', [], lambda x: x['date']) + + + # + # CSV Generation + # + + + # All of these reports have similar data structures + aggregationDefinitions = { + 'browsers.json': 'browser', + 'devices.json': 'device', + 'ie.json': 'browser_version', + 'os.json': 'os', + 'windows.json': 'os_version' + } + + for k in aggregationDefinitions: + v = aggregationDefinitions[k] + sorting = lambda x: (x['date'], -int(x['visits'])) + file_name = report_path(k) + + data = sum_data_by_key_file(file_name, lambda x: x['date'] + x[v], 'visits', [], sorting) + write_csv_file(csv_report_path(file_name), data['data'], ['date', v, 'visits']) + + # Aggregate the "top pages" reports + aggregateTopPages = { + 
'all-pages-realtime.json': 'active_visitors', + 'top-pages-7-days.json': 'visits', + 'top-pages-30-days.json': 'visits' + } + + for report in aggregateTopPages: + with open(report_path(report), encoding='utf8') as json_file: + data = json.load(json_file) + value = aggregateTopPages[report] + + write_csv_file(csv_report_path(report), data['data'], ['domain', 'page', 'page_title', value]) + + # Aggregate `users.csv` + with open(report_path('users.json'), encoding='utf8') as json_file: data = json.load(json_file) - value = aggregateTopPages[report] + write_csv_file(csv_report_path('users.json'), data['data'], ['date', 'visits']) - csv_file_writer(report, data['data'], ['domain', 'page', 'page_title', value], lambda x: -int(x[value])) + # + # File moving and cleanup + # + # Copy all of the aggregated files into the final directory + src_files = os.listdir(target_folder) -# Aggregate `users.csv` -with open(os.path.join(target_folder, 'users.json'), encoding='utf8') as json_file: - data = json.load(json_file) - csv_file_writer('users.json', data['data'], ['date', 'visits'], lambda x: x['date']) + for file_name in src_files: + full_file_name = os.path.join(target_folder, file_name) + if (os.path.isfile(full_file_name)): + shutil.copy(full_file_name, report_folder) -# Copy all of the aggregated files into the final directory -src_files = os.listdir(target_folder) - -for file_name in src_files: - full_file_name = os.path.join(target_folder, file_name) - - if (os.path.isfile(full_file_name)): - shutil.copy(full_file_name, report_folder) - -# Delete the temporary folder -shutil.rmtree(target_folder) + # Delete the temporary folder + shutil.rmtree(target_folder) diff --git a/App_Data/jobs/triggered/aggregate/test.py b/App_Data/jobs/triggered/aggregate/test.py new file mode 100644 index 0000000..36160b5 --- /dev/null +++ b/App_Data/jobs/triggered/aggregate/test.py @@ -0,0 +1,131 @@ +import job +import unittest + +class JobTest(unittest.TestCase): + # This function is the first layer of aggregation that's used for all the reports to merge up the 'totals' attributes + # from each of the reports + def test_sum_shared_dict_keys(self): + objectOne = { + "int": 10, + "list": [1, 2, 3], + "tuple": (10, 20), + "dict": { + "int": 2, + "list": [1, 2] + } + } + objectTwo = { + "int": 5, + "list": [4, 5, 6], + "tuple": (20, 30), + "dict": { + "int": 8, + "list": [9, 8] + } + } + + result = job.sum_shared_dict_keys(objectOne, objectTwo) + self.assertEqual(result, { + "int": 15, + "list": [1, 2, 3, 4, 5, 6], + "dict": { + "int": 10, + "list": [1, 2, 9, 8] + } + }) + + # This function is the core for aggregating information from the following reports: + # + # - 'all-pages-realtime.json' + # - 'today.json' + # - 'last-48-hours.json' + # - 'users.json' + def test_sum_data_by_key_lambda(self): + data = [ + { "country": "United States", "city": "Los Angeles", "page": "/rider-info/real-time-info.aspx", + "page_title": "Real-Time Info - Big Blue Bus", "active_visitors": "2", "domain": "www.bigbluebus.com" + }, { + "country": "United States", "city": "Los Angeles", "page": "/routes-and-schedules/route-8.aspx", + "page_title": "Route 8 - Ocean Park Blvd - Big Blue Bus", "active_visitors": "2", "domain": "www.bigbluebus.com" + }, { + "country": "United States", "city": "Indianapolis", "page": "/default.aspx", "page_title": "Big Blue Bus", + "active_visitors": "1", "domain": "www.bigbluebus.com" + }, { + "country": "United States", "city": "Santa Monica", "page": "/default.aspx", "page_title": "Big Blue Bus", + 
"active_visitors": "6", "domain": "www.bigbluebus.com" + }, { + "country": "United States", "city": "Los Angeles", "page": "/fares/fare-information.aspx", + "page_title": "Fare Information - Big Blue Bus", "active_visitors": "1", "domain": "www.bigbluebus.com" + }, { + "country": "United States", "city": "Santa Monica", "page": "/fares/fare-information.aspx", + "page_title": "Fare Information - Big Blue Bus", "active_visitors": "4", "domain": "www.bigbluebus.com" + }, { + "country": "United States", "city": "Los Angeles", "page": "/departments/pcd/transportation/motorists-parking/", + "page_title": "Motorists Parking - Planning & Community Development - City of Santa Monica", + "active_visitors": "2", "domain": "www.smgov.net" + }, { + "country": "United States", "city": "Los Angeles", "page": "/departments/pcd/transportation/motorists-parking/where-to-park/", + "page_title": "Where to Park - Planning & Community Development - City of Santa Monica", "active_visitors": "2", + "domain": "www.smgov.net" + }, { + "country": "United States", "city": "Los Angeles", "page": "/default.aspx", "page_title": "City of Santa Monica", + "active_visitors": "2", "domain": "www.smgov.net" + }, { + "country": "United States", "city": "Santa Monica", "page": "/default.aspx", "page_title": "City of Santa Monica", + "active_visitors": "6", "domain": "www.smgov.net" + } + ] + + # We're summing up the 'active_visitors' key + result = job.sum_data_by_key( + data, + lambda x: x['domain'] + x['page'], + 'active_visitors', + ['country', 'city'] + ) + + # These should be the totals for the sample dataset + expected_count = { + "Real-Time Info - Big Blue Bus": 2, + "Route 8 - Ocean Park Blvd - Big Blue Bus": 2, + "Big Blue Bus": 7, + "Fare Information - Big Blue Bus": 5, + "Motorists Parking - Planning & Community Development - City of Santa Monica": 2, + "Where to Park - Planning & Community Development - City of Santa Monica": 2, + "City of Santa Monica": 8 + } + + for count in expected_count.items(): + for item in result: + if item['page_title'] == count: + self.assertEqual(item['active_visitors'], expected_count[count]) + + def test_sum_data_by_key_str(self): + data = [ + { "visits": "5", "date": "2017-03-12", "hour": "01" }, + { "visits": "10", "date": "2017-03-12", "hour": "02" }, + { "visits": "10", "date": "2017-03-12", "hour": "03" }, + { "visits": "5", "date": "2017-03-12", "hour": "04" }, + + { "visits": "50", "date": "2017-03-12", "hour": "01" }, + { "visits": "20", "date": "2017-03-12", "hour": "02" }, + { "visits": "20", "date": "2017-03-12", "hour": "03" }, + { "visits": "10", "date": "2017-03-12", "hour": "04" }, + ] + + result = job.sum_data_by_key(data, 'hour', 'visits') + + expected_count = { + '01': 55, + '02': 30, + '03': 30, + '04': 15, + } + + for count in expected_count.items(): + for item in result: + if item['hour'] == count: + self.assertEqual(item['visits'], expected_count[count]) + +if __name__ == '__main__': + unittest.main() \ No newline at end of file