From 546b6dd78b31fa169dfd8c2163b8be52133166a7 Mon Sep 17 00:00:00 2001 From: Vladimir Jimenez Date: Sun, 12 Mar 2017 14:47:10 -0700 Subject: [PATCH] Huge refactor/update for the aggregate script - Rewrite and optimize the entire script - Add basic unit tests for the core functions of the script that handle the actual data manipulation --- App_Data/jobs/triggered/aggregate/job.py | 556 +++++++++++----------- App_Data/jobs/triggered/aggregate/test.py | 131 +++++ 2 files changed, 403 insertions(+), 284 deletions(-) create mode 100644 App_Data/jobs/triggered/aggregate/test.py diff --git a/App_Data/jobs/triggered/aggregate/job.py b/App_Data/jobs/triggered/aggregate/job.py index 6d8f9b0..04188f7 100644 --- a/App_Data/jobs/triggered/aggregate/job.py +++ b/App_Data/jobs/triggered/aggregate/job.py @@ -5,359 +5,347 @@ import os import shutil import sys -from collections import Counter +from types import * from random import shuffle +from collections import Counter -# The location where agencies individual data is stored; e.g. each agency has its own folder -if len(sys.argv) > 1: - report_folder = sys.argv[1] -else: - report_folder = os.path.join( - os.environ['HOME'], - "site", - "wwwroot", - os.environ["ANALYTICS_DATA_PATH"] - ) - -# Where the aggregated data will go -target_folder = report_folder + "_aggregation" - -# Make a temporary folder for data aggregation -if os.path.exists(target_folder): - shutil.rmtree(target_folder) - -os.mkdir(target_folder) - -# Reports that will not be aggregated by this script -ignored_reports = [ -] +target_path = '' -def merge_dict_into(objOne, objTwo): +def report_path(file_name): """ - Add keys from objTwo that do not exist in objOne to objOne + Return the path to the location where aggregated data will be written to before it's deployed """ - missingKeys = [ key for key in objTwo if key not in objOne ] + return os.path.join(target_folder, file_name) - for key in missingKeys: - objOne[key] = objTwo[key] - -def merge_dict_addition(objOne, objTwo): +def csv_report_path(file_name): """ - Merge two objects and add the respective values to get a total of both + Build a path to the target location for a CSV equivalent version from a JSON file name """ - if not objOne: - return objTwo - - if not objTwo: - return objOne - - newObj = {} + csv_name = os.path.splitext(os.path.basename(file_name))[0] + '.csv' + return report_path(csv_name) - for key in objOne: - try: - if isinstance(objOne[key], (int, list, tuple)): - newObj[key] = objOne[key] + objTwo[key] - elif isinstance(objOne[key], dict): - newObj[key] = merge_dict_addition(objOne[key], objTwo[key]) - except KeyError: - pass - - return newObj - -def write_json_file(file_name, json_data): +def read_json_file(file_name): """ - Open `file_name` and dump JSON into the file + Open `file_name` and parse it as JSON and return the respective content as a dictionary """ - with open(os.path.join(target_folder, file_name), 'w', encoding='utf8') as data_file: - json.dump(json_data, data_file, indent=4) + with open(file_name, 'r', encoding='utf8') as data_file: + return json.load(data_file) -def json_file_writer(fileName, function): +def write_json_file(file_name, json_data): """ - Open `fileName` and load it as JSON. 
Call `function` and write the mutated `data` variable into the original file + Open `file_name` and dump JSON into the file; this function will overwrite everything that's in the file already """ - with open(os.path.join(target_folder, fileName), 'r+', encoding='utf8') as data_file: - data = json.load(data_file) + with open(file_name, 'w', encoding='utf8') as data_file: + json.dump(json_data, data_file, indent=4) - function(data) +def write_csv_file(file_name, data, headers): + with open(file_name, 'w', encoding='utf8') as csv_file: + csvwriter = csv.DictWriter(csv_file, dialect='unix', fieldnames=headers) + csvwriter.writeheader() - data_file.seek(0) - json.dump(data, data_file, indent=4) - data_file.truncate() + [ csvwriter.writerow(item) for item in data ] -def aggregate_list_sum(data, groupKey, sumKey, ignoreKeys = None): +def sum_shared_dict_keys(obj_one, obj_two): """ - Loop through a list and sum up the `sumKey` values while treating `groupKey` as a unique ID. The `ignoreKeys` allows - for a list of keys to ignore and not return - """ - output = {} - - for item in data: - key = item[groupKey] - - if key not in output: - output[key] = item - output[key][sumKey] = int(output[key][sumKey]) - else: - output[key][sumKey] += int(item[sumKey]) - - if ignoreKeys is not None: - for k in ignoreKeys: - output[key].pop(k, None) - - return [ output[item] for item in output ] + Loop through all of the keys in a dictionary and find the sum for the respective values in both dictionaries. This + function supports ints, lists, and dictionaries; anything else will be removed from the final result. -def aggregate_list_sum_file(fileName, groupKey, sumKey, ignoreKeys = None, sort = None): - - def action(data): - finalData = aggregate_list_sum(data['data'], groupKey, sumKey, ignoreKeys) + See 'test_sum_shared_dict_keys' in test.py for sample usage and expected results + """ + if not obj_one: + return obj_two - if sort is not None: - finalData = sorted(finalData, key = sort) + if not obj_two: + return obj_one - data['data'] = finalData[0:min(len(finalData), data['query']['max-results'])] + newObj = {} - json_file_writer(fileName, action) + for key in obj_one: + if isinstance(obj_one[key], (int, list)): + newObj[key] = obj_one[key] + obj_two[key] + elif isinstance(obj_one[key], dict): + newObj[key] = sum_shared_dict_keys(obj_one[key], obj_two[key]) -def aggregate_json_data(jsonFile, primaryKey, uniqueKey, sumKey, fieldnames, sort = None): + return newObj - def action(data): - primaryKeys = list({ item[primaryKey] for item in data['data'] }) - totals = [] +def sum_data_by_key(data, group_by, sum_key, keysToStrip = [], sort_by = None): + """ + Sum up a specific key in an array of dictionaries. Each dictionary can be uniquely identified by the `group_by`, + which can be a lambda or a string; use lambdas to concat two fields together to get a unique identifier for each + dictionary. 
- for pKey in primaryKeys: - items = [ item for item in data['data'] if item[primaryKey] == pKey ] - totals += aggregate_list_sum(items, uniqueKey, sumKey) + See 'test_sum_data_by_key' in test.py for sample usage and expected results + """ + result = {} - data['data'] = sorted(totals, key = sort) + for entry in data: + key = group_by(entry) if isinstance(group_by, LambdaType) else entry[group_by] - json_file_writer(jsonFile, action) + if key not in result: + result[key] = entry + result[key][sum_key] = int(result[key][sum_key]) + else: + result[key][sum_key] += int(entry[sum_key]) -def csv_file_writer(fileName, data, fieldnames, sort = None): - csvFile = os.path.join(target_folder, os.path.splitext(os.path.basename(fileName))[0] + '.csv') + for k in keysToStrip: + result[key].pop(k, None) - with open(csvFile, 'w+', encoding='utf8') as csv_file: - csvwriter = csv.DictWriter(csv_file, dialect='unix', fieldnames=fieldnames) - csvwriter.writeheader() + result = [ result[key] for key in result ] - [ csvwriter.writerow(item) for item in sorted(data, key=sort) ] + if sort_by is not None: + result = sorted(result, key = sort_by) -def aggregate_csv_data(jsonFile, fieldnames, sort = None): + return result - with open(os.path.join(target_folder, jsonFile), encoding='utf8') as data_file: +def sum_data_by_key_file(file_name, group_by, sum_key, keys_to_strip, sort_by): + with open(file_name, 'r+', encoding='utf8') as data_file: data = json.load(data_file) + final_data = sum_data_by_key(data['data'], group_by, sum_key, keys_to_strip, sort_by) + data['data'] = final_data[0:min(len(final_data), data['query']['max-results'])] - csv_file_writer(jsonFile, data['data'], fieldnames, sort) - - -# Get all of our agencies and deleted the first item in the list. The first item is a collection -# of everything in the folder and is safe to skip -agencies = [ agency for agency in os.walk(report_folder) ] -del agencies[0] - -# Get all of the reports in the first agency's folder. Since all agencies have the same reports generated, -# we'll be fine -reports = agencies[0] - -# With the aggregation, the sorting is lost, so sort these reports' `data` array by the respective key -sortBy = { - 'top-pages-7-days.json': 'visits', - 'top-pages-30-days.json': 'visits', - 'top-pages-realtime.json': 'active_visitors' -} - -# These keys need to be stripped from the respective reports -stripKeys = { - 'top-countries-realtime.json': ['domain'], - 'top-cities-realtime.json': ['domain'] -} - -# For certain reports, we'll have to borrow values from other reports in order to fix inconsistencies. This will method -# will make some not so smart assumptions and hopes it works. -borrowKeys = { - "top-pages-7-days.json": ["domain"], - "top-pages-30-days.json": ["domain"] -} - -global_variables = {} -with open(os.path.join(os.environ['HOME'], "site", "wwwroot", "reports", "variables.json")) as data_file: - global_variables = json.load(data_file) - - -# Aggregate all of the reports -# ----- + data_file.seek(0) + data_file.truncate() + json.dump(data, data_file, indent=4) -for report in reports[2]: - if not report.endswith('.json') or report in ignored_reports: - continue + return data - jsonData = [] - for agency in agencies: - reportFile = os.path.join(agency[0], report) +if __name__ == "__main__": - try: - with open(reportFile, encoding='utf8') as file_content: - data = json.load(file_content) + # Set some variables based on the environment we're in; i.e. 
production or development + if len(sys.argv) > 1: + cwd = '_site' + report_folder = sys.argv[1] + else: + cwd = os.path.join(os.environ['HOME'], "site", "wwwroot") + report_folder = os.path.join(cwd, os.environ["ANALYTICS_DATA_PATH"]) - if not jsonData: - jsonData = data - continue + # Where the aggregated data will go. We don't modify the data in place since it'll affect/break the website during + # the process + target_folder = report_folder + "_aggregation" - merge_dict_into(jsonData, data) + # Make a temporary folder for data aggregation + if os.path.exists(target_folder): + shutil.rmtree(target_folder) - try: - jsonData['data'] += data['data'] - except KeyError: - pass + os.mkdir(target_folder) - try: - jsonData['totals'] = merge_dict_addition(jsonData['totals'], data['totals']) - except KeyError: - pass + # Reports that will not be aggregated by this script + ignored_reports = [] - if report in borrowKeys: - c_agency = os.path.basename(agency[0]) + # Get all of our agencies and deleted the first item in the list. The first item is a collection of everything in + # the folder and is safe to skip + agencies = [ agency for agency in os.walk(report_folder) ] + del agencies[0] - for item in jsonData['data']: - if 'replace_done' not in item: - item['replace_done'] = False + # Get all of the reports for the smgov website. We will go on the assumption that the 'smgov' website will have all + # of the reports + reports = next(filter(lambda x: x[0] == "data/smgov", agencies)) - for key_to_replace in borrowKeys[report]: - if not item['replace_done']: - item[key_to_replace] = global_variables[c_agency][key_to_replace] + # With the aggregation, the sorting is lost, so sort these reports' `data` array by the respective key + sortBy = { + 'top-pages-7-days.json': 'visits', + 'top-pages-30-days.json': 'visits', + 'top-pages-realtime.json': 'active_visitors' + } - item['replace_done'] = True + # These keys need to be stripped from the respective reports + stripKeys = { + 'top-countries-realtime.json': ['domain'], + 'top-cities-realtime.json': ['domain'] + } - except IOError: - pass + # Specific keys or fields that will be replaced during the aggregation based on the values retrieved from env.json + findEnvReplace = { + 'all-pages-realtime.json': ['domain'], + 'top-pages-7-days.json': ['domain'], + 'top-pages-30-days.json': ['domain'] + } + # Environment variables set during analytics fetching try: - sortKey = sortBy[report] - sortedData = sorted(jsonData['data'], key=lambda x: -int(x[sortKey])) + environment_vars = read_json_file(os.path.join(cwd, 'reports', 'env.json')) + except FileNotFoundError: + print("No environment variables have been defined. 
If you're in a dev environment, be sure to build the website first") + exit() + + # + # Aggregate all of the reports + # - moreThanOneViewer = [item for item in sortedData if int(item[sortKey]) > 1] - onlyOneViewer = [item for item in sortedData if int(item[sortKey]) == 1] + # reports[2] is where all of the report file names are stored + for report in reports[2]: + if not report.endswith('.json') or report in ignored_reports: + continue - shuffle(onlyOneViewer) - sortedData = moreThanOneViewer + onlyOneViewer + # ...short for 'aggregated' + agg_data = [] - jsonData['data'] = sortedData[0:min(len(sortedData), jsonData['query']['max-results'])] - except KeyError: - pass + for agency in agencies: + # agency[0] is the path to the agency + report_file = os.path.join(agency[0], report) + agency_name = os.path.basename(agency[0]) - if report in stripKeys or report in borrowKeys: - for item in jsonData['data']: try: - del item['replace_done'] - for key in stripKeys[report]: - del item[key] - except KeyError: + with open(report_file, 'r+', encoding='utf8') as file_content: + data = json.load(file_content) + + # Fields that need to be replaced based on environment variables due to Google Analytics returning + # data in a different format + try: + for key in findEnvReplace[report]: + for item in data['data']: + for env in environment_vars[agency_name]: + item[key] = environment_vars[agency_name][env] + + # Since we use data for individual websites now, we'll update individual website data with our + # replacements too + file_content.seek(0) + file_content.truncate() + json.dump(data, file_content, indent=4) + except KeyError: + pass + + if not agg_data: + agg_data = data + continue + + if 'data' in agg_data: + agg_data['data'] += data['data'] + + if 'totals' in agg_data: + agg_data['totals'] = sum_shared_dict_keys(agg_data['totals'], data['totals']) + + # This'll happen if the file was not found, meaning this agency isn't configured to have this report. For + # example, not all websites will have the `realtime.json` report. + except IOError: pass + # If the report data requires to be sorted, sort the data for it based on the key specified. 
Any results that + # match the value 1 will be shuffled so they are not listed alphabetically (there are a lot of these) + if report in sortBy: + sortKey = sortBy[report] + sortedData = sorted(agg_data['data'], key=lambda x: -int(x[sortKey])) - write_json_file(report, jsonData) - - -# Reports that need further, special, aggregation -# ----- - - -# Let's count unique cities & countries and total up our active visitors and create the respective files -with open(os.path.join(target_folder, 'all-pages-realtime.json'), 'r+', encoding='utf8') as data_file: - data = json.load(data_file) + moreThanOneViewer = [item for item in sortedData if int(item[sortKey]) > 1] + onlyOneViewer = [item for item in sortedData if int(item[sortKey]) == 1] - # City or country codes that should be ignored - ignoreKeys = [ 'zz' ] + shuffle(onlyOneViewer) + sortedData = moreThanOneViewer + onlyOneViewer - # First tally up the number of entries for things, respectively - countries = Counter([ k['country'] for k in data['data'] ]) - cities = Counter([ k['city'] for k in data['data'] ]) - total = sum([ int(k['active_visitors']) for k in data['data'] ]) - - # Convert the tallies into dictionaries and sort them by visitors so our dashboard knows how to handle them - countriesData = [ {'country': k[0], 'active_visitors': k[1]} for k in list(dict(countries).items()) if k[0] not in ignoreKeys ] - countriesData = sorted(countriesData, key = lambda x: -x['active_visitors']) - - citiesData = [ {'city': k[0], 'active_visitors': k[1]} for k in list(dict(cities).items()) if k[0] not in ignoreKeys ] - citiesData = sorted(citiesData, key = lambda x: -x['active_visitors']) - - # Write the data into the expected files so we don't have to break/change the dashboard - write_json_file('top-countries-realtime.json', { 'data': countriesData }) - write_json_file('top-cities-realtime.json', { 'data': citiesData }) - write_json_file('realtime.json', { 'data': [{ 'active_visitors': total }] }) - - -# Clean-up 'all-pages-realtime.json' from duplicate URLs and get rid of 'country' & 'city' keys while we're at it -sortCountKey = lambda x: -int(x[sumKey]) -groupByKey = 'page' -ignoreKeys = [ 'country', 'city' ] -sumKey = 'active_visitors' -aggregate_list_sum_file('all-pages-realtime.json', groupByKey, sumKey, ignoreKeys, sortCountKey) - - -# Today.json aggregation -sortCountKey = lambda x: int(x[groupByKey]) -groupByKey = 'hour' -ignoreKeys = None -sumKey = 'visits' -aggregate_list_sum_file('today.json', groupByKey, sumKey, ignoreKeys, sortCountKey) -aggregate_list_sum_file('last-48-hours.json', groupByKey, sumKey, ignoreKeys, sortCountKey) - - -# Aggregate `users.json` -aggregate_list_sum_file('users.json', 'date', 'visits', None, lambda x: x['date']) - - -# CSV aggregation -# ----- + agg_data['data'] = sortedData[0:min(len(sortedData), agg_data['query']['max-results'])] + if report in stripKeys: + for item in agg_data['data']: + for key in stripKeys[report]: + del item[key] -# All of these reports have similar data structures -aggregationDefinitions = { - 'browsers.json': 'browser', - 'devices.json': 'device', - 'ie.json': 'browser_version', - 'os.json': 'os', - 'windows.json': 'os_version' -} + write_json_file(report_path(report), agg_data) -for k in aggregationDefinitions: - v = aggregationDefinitions[k] - sorting = lambda x: (x['date'], -int(x['visits'])) - aggregate_json_data(k, 'date', v, 'visits', ['date', v, 'visits'], sorting) - aggregate_csv_data(k, ['date', v, 'visits'], sorting) + # Reports that need further, special, aggregation + # 
----- -# Aggregate the "top pages" reports -aggregateTopPages = { - 'all-pages-realtime.json': 'active_visitors', - 'top-pages-7-days.json': 'visits', - 'top-pages-30-days.json': 'visits' -} + # The 'all-pages-realtime.json' report contains data regarding active users based on cities and countries. In order + # to avoid making a separate GA call, we'll tally up the number of users per city and country from this existing + # report and create our own report that the dashboard will understand and accept + with open(report_path('all-pages-realtime.json'), 'r+', encoding='utf8') as data_file: + data = json.load(data_file) -for report in aggregateTopPages: - with open(os.path.join(target_folder, report), encoding='utf8') as json_file: + # City or country codes that should be ignored + ignoreKeys = [ 'zz' ] + + # Tally up the number of entries for things, respectively. We'll receive a dictionary in the following format: + # {'United States': 50, 'Canada': 2} + countries = Counter([ k['country'] for k in data['data'] ]) + cities = Counter([ k['city'] for k in data['data'] ]) + total = sum([ int(k['active_visitors']) for k in data['data'] ]) + + # Convert the tallies into a list of dictionaries and sort them by visitors. By doing this, we'll be giving the + # dashboard the syntax it expects + countriesData = [ {'country': k[0], 'active_visitors': k[1]} for k in list(countries.items()) if k[0] not in ignoreKeys ] + countriesData = sorted(countriesData, key = lambda x: -x['active_visitors']) + citiesData = [ {'city': k[0], 'active_visitors': k[1]} for k in list(cities.items()) if k[0] not in ignoreKeys ] + citiesData = sorted(citiesData, key = lambda x: -x['active_visitors']) + + # Write the data into the expected files so we don't have to break/change the dashboard + write_json_file(report_path('top-countries-realtime.json'), { 'data': countriesData }) + write_json_file(report_path('top-cities-realtime.json'), { 'data': citiesData }) + write_json_file(report_path('realtime.json'), { 'data': [{ 'active_visitors': total }] }) + + # Clean-up 'all-pages-realtime.json' from duplicate URLs and get rid of 'country' & 'city' keys while we're at it + sortCountKey = lambda x: -int(x[sumKey]) + groupByKey = lambda x: x['domain'] + x['page'] + ignoreKeys = [ 'country', 'city' ] + sumKey = 'active_visitors' + sum_data_by_key_file(report_path('all-pages-realtime.json'), groupByKey, sumKey, ignoreKeys, sortCountKey) + + # Today.json aggregation + sortCountKey = lambda x: int(x[groupByKey]) + groupByKey = 'hour' + ignoreKeys = [] + sumKey = 'visits' + sum_data_by_key_file(report_path('today.json'), groupByKey, sumKey, ignoreKeys, sortCountKey) + sum_data_by_key_file(report_path('last-48-hours.json'), groupByKey, sumKey, ignoreKeys, sortCountKey) + + # Aggregate `users.json` + sum_data_by_key_file(report_path('users.json'), 'date', 'visits', [], lambda x: x['date']) + + + # + # CSV Generation + # + + + # All of these reports have similar data structures + aggregationDefinitions = { + 'browsers.json': 'browser', + 'devices.json': 'device', + 'ie.json': 'browser_version', + 'os.json': 'os', + 'windows.json': 'os_version' + } + + for k in aggregationDefinitions: + v = aggregationDefinitions[k] + sorting = lambda x: (x['date'], -int(x['visits'])) + file_name = report_path(k) + + data = sum_data_by_key_file(file_name, lambda x: x['date'] + x[v], 'visits', [], sorting) + write_csv_file(csv_report_path(file_name), data['data'], ['date', v, 'visits']) + + # Aggregate the "top pages" reports + aggregateTopPages = { + 
'all-pages-realtime.json': 'active_visitors', + 'top-pages-7-days.json': 'visits', + 'top-pages-30-days.json': 'visits' + } + + for report in aggregateTopPages: + with open(report_path(report), encoding='utf8') as json_file: + data = json.load(json_file) + value = aggregateTopPages[report] + + write_csv_file(csv_report_path(report), data['data'], ['domain', 'page', 'page_title', value]) + + # Aggregate `users.csv` + with open(report_path('users.json'), encoding='utf8') as json_file: data = json.load(json_file) - value = aggregateTopPages[report] + write_csv_file(csv_report_path('users.json'), data['data'], ['date', 'visits']) - csv_file_writer(report, data['data'], ['domain', 'page', 'page_title', value], lambda x: -int(x[value])) + # + # File moving and cleanup + # + # Copy all of the aggregated files into the final directory + src_files = os.listdir(target_folder) -# Aggregate `users.csv` -with open(os.path.join(target_folder, 'users.json'), encoding='utf8') as json_file: - data = json.load(json_file) - csv_file_writer('users.json', data['data'], ['date', 'visits'], lambda x: x['date']) + for file_name in src_files: + full_file_name = os.path.join(target_folder, file_name) + if (os.path.isfile(full_file_name)): + shutil.copy(full_file_name, report_folder) -# Copy all of the aggregated files into the final directory -src_files = os.listdir(target_folder) - -for file_name in src_files: - full_file_name = os.path.join(target_folder, file_name) - - if (os.path.isfile(full_file_name)): - shutil.copy(full_file_name, report_folder) - -# Delete the temporary folder -shutil.rmtree(target_folder) + # Delete the temporary folder + shutil.rmtree(target_folder) diff --git a/App_Data/jobs/triggered/aggregate/test.py b/App_Data/jobs/triggered/aggregate/test.py new file mode 100644 index 0000000..36160b5 --- /dev/null +++ b/App_Data/jobs/triggered/aggregate/test.py @@ -0,0 +1,131 @@ +import job +import unittest + +class JobTest(unittest.TestCase): + # This function is the first layer of aggregation that's used for all the reports to merge up the 'totals' attributes + # from each of the reports + def test_sum_shared_dict_keys(self): + objectOne = { + "int": 10, + "list": [1, 2, 3], + "tuple": (10, 20), + "dict": { + "int": 2, + "list": [1, 2] + } + } + objectTwo = { + "int": 5, + "list": [4, 5, 6], + "tuple": (20, 30), + "dict": { + "int": 8, + "list": [9, 8] + } + } + + result = job.sum_shared_dict_keys(objectOne, objectTwo) + self.assertEqual(result, { + "int": 15, + "list": [1, 2, 3, 4, 5, 6], + "dict": { + "int": 10, + "list": [1, 2, 9, 8] + } + }) + + # This function is the core for aggregating information from the following reports: + # + # - 'all-pages-realtime.json' + # - 'today.json' + # - 'last-48-hours.json' + # - 'users.json' + def test_sum_data_by_key_lambda(self): + data = [ + { "country": "United States", "city": "Los Angeles", "page": "/rider-info/real-time-info.aspx", + "page_title": "Real-Time Info - Big Blue Bus", "active_visitors": "2", "domain": "www.bigbluebus.com" + }, { + "country": "United States", "city": "Los Angeles", "page": "/routes-and-schedules/route-8.aspx", + "page_title": "Route 8 - Ocean Park Blvd - Big Blue Bus", "active_visitors": "2", "domain": "www.bigbluebus.com" + }, { + "country": "United States", "city": "Indianapolis", "page": "/default.aspx", "page_title": "Big Blue Bus", + "active_visitors": "1", "domain": "www.bigbluebus.com" + }, { + "country": "United States", "city": "Santa Monica", "page": "/default.aspx", "page_title": "Big Blue Bus", + 
"active_visitors": "6", "domain": "www.bigbluebus.com" + }, { + "country": "United States", "city": "Los Angeles", "page": "/fares/fare-information.aspx", + "page_title": "Fare Information - Big Blue Bus", "active_visitors": "1", "domain": "www.bigbluebus.com" + }, { + "country": "United States", "city": "Santa Monica", "page": "/fares/fare-information.aspx", + "page_title": "Fare Information - Big Blue Bus", "active_visitors": "4", "domain": "www.bigbluebus.com" + }, { + "country": "United States", "city": "Los Angeles", "page": "/departments/pcd/transportation/motorists-parking/", + "page_title": "Motorists Parking - Planning & Community Development - City of Santa Monica", + "active_visitors": "2", "domain": "www.smgov.net" + }, { + "country": "United States", "city": "Los Angeles", "page": "/departments/pcd/transportation/motorists-parking/where-to-park/", + "page_title": "Where to Park - Planning & Community Development - City of Santa Monica", "active_visitors": "2", + "domain": "www.smgov.net" + }, { + "country": "United States", "city": "Los Angeles", "page": "/default.aspx", "page_title": "City of Santa Monica", + "active_visitors": "2", "domain": "www.smgov.net" + }, { + "country": "United States", "city": "Santa Monica", "page": "/default.aspx", "page_title": "City of Santa Monica", + "active_visitors": "6", "domain": "www.smgov.net" + } + ] + + # We're summing up the 'active_visitors' key + result = job.sum_data_by_key( + data, + lambda x: x['domain'] + x['page'], + 'active_visitors', + ['country', 'city'] + ) + + # These should be the totals for the sample dataset + expected_count = { + "Real-Time Info - Big Blue Bus": 2, + "Route 8 - Ocean Park Blvd - Big Blue Bus": 2, + "Big Blue Bus": 7, + "Fare Information - Big Blue Bus": 5, + "Motorists Parking - Planning & Community Development - City of Santa Monica": 2, + "Where to Park - Planning & Community Development - City of Santa Monica": 2, + "City of Santa Monica": 8 + } + + for count in expected_count.items(): + for item in result: + if item['page_title'] == count: + self.assertEqual(item['active_visitors'], expected_count[count]) + + def test_sum_data_by_key_str(self): + data = [ + { "visits": "5", "date": "2017-03-12", "hour": "01" }, + { "visits": "10", "date": "2017-03-12", "hour": "02" }, + { "visits": "10", "date": "2017-03-12", "hour": "03" }, + { "visits": "5", "date": "2017-03-12", "hour": "04" }, + + { "visits": "50", "date": "2017-03-12", "hour": "01" }, + { "visits": "20", "date": "2017-03-12", "hour": "02" }, + { "visits": "20", "date": "2017-03-12", "hour": "03" }, + { "visits": "10", "date": "2017-03-12", "hour": "04" }, + ] + + result = job.sum_data_by_key(data, 'hour', 'visits') + + expected_count = { + '01': 55, + '02': 30, + '03': 30, + '04': 15, + } + + for count in expected_count.items(): + for item in result: + if item['hour'] == count: + self.assertEqual(item['visits'], expected_count[count]) + +if __name__ == '__main__': + unittest.main() \ No newline at end of file