adding a version that doesn't use the command line
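Instead of passing arguments on the command line, this version is configured by editing the source, target, and missing_code variables at the bottom of the file and then running the script directly with python data_profile.py.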
Showing 1 changed file with 232 additions and 0 deletions.
data_profile.py
@@ -0,0 +1,232 @@
from __future__ import division
# This version does not use the command line.
# Set the source, target, and missing-code variables at the bottom of the
# file, then run: python data_profile.py

import os
from os.path import isfile, join
import csv
import datetime
import glob
# import markdown  # removed html output for now
import sys
import json
from pathlib import Path


# this wasn't being used!
# def getFiles(path):
#     """Return a list of all non-hidden files within a folder."""
#     files = [f for f in os.listdir(path) if isfile(join(path, f)) and f[0] != '.']
#     return files


def basic_stats(file):
    """Return file-system metadata (size, access, and modification times) for a file."""
    stats = os.stat(file)
    size = stats.st_size
    last_modified = datetime.datetime.fromtimestamp(stats.st_mtime).strftime('%Y-%m-%d %H:%M:%S')
    last_access = datetime.datetime.fromtimestamp(stats.st_atime).strftime('%Y-%m-%d %H:%M:%S')
    return {'filename': str(file.absolute()), 'size': size, 'last_access': last_access, 'last_modified': last_modified}

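# A minimal sketch of what basic_stats returns; the path, size, and
# timestamps below are hypothetical, shown only to illustrate the shape:
#   basic_stats(Path('example.csv'))
#   -> {'filename': '/abs/path/example.csv', 'size': 1024,
#       'last_access': '2020-01-01 12:00:00', 'last_modified': '2020-01-01 12:00:00'}
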
def review_csv(file, mode='rt', headers=True, index_row=True, missing=''):
    """Profile each column of a csv file: unique values, missing values, and numeric range."""
    with open(file, mode, newline='') as fin:
        reader = csv.reader(fin)
        if headers:
            col_names = next(reader)
            data = [r for r in reader]
        else:
            data = [r for r in reader]
            # no header row, so generate placeholder column names
            col_names = ['col_' + str(i) for i in range(len(data[0]))] if data else []

    if index_row:
        ids = [r[0] for r in data]  # collected but not currently reported
    else:
        ids = "None declared"

    num_rows = len(data)
    # transpose so that data[i] is the full column for col_names[i]
    data = list(map(list, zip(*data)))

    num_columns = len(col_names)
    col_info = {'csv_basic': {'num_rows': num_rows, 'num_columns': num_columns, 'missing': missing}, 'cols': {}}
    for i, col in enumerate(col_names):
        info = {}
        num_uniques = len(set(data[i]))
        info['unique_values'] = str(num_uniques) + " (this includes missing values)"
        if num_uniques <= 10:
            uvals = set(data[i])
            uval_print = []
            for x in uvals:
                if x == missing:
                    uval_print.append("[missing code]")
                else:
                    uval_print.append(x)
            uval_print.sort()  # sorting unique values for pretty printing
            info['unique_value_content'] = "The values are:\n\t* " + "\n\t* ".join(uval_print)
        else:
            info['unique_value_content'] = "Not reported (More than 10 unique values)"
        info['missing'] = data[i].count(missing)
        info['percent_missing'] = "{:.0%}".format(info['missing'] / len(data[i]))
        passed_digits = []
        for d in data[i]:
            try:
                passed_digits.append(float(d))
            except ValueError:
                # this is fine because we are only testing whether the value
                # can be converted to a number at all
                pass
        digits = len(passed_digits)
        totalvalues = len([d for d in data[i] if len(d) > 0])

        if totalvalues == 0:
            info['percent_digit'] = "no digits"
        else:
            info['percent_digit'] = "{:.0%}".format(digits / totalvalues)

        if digits > 0:
            info['min_digit'] = min(passed_digits)
            info['max_digit'] = max(passed_digits)
        else:
            info['min_digit'] = "no digits"
            info['max_digit'] = "no digits"
        col_info['cols'][col] = info
    return col_info

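# A sketch of the structure review_csv returns; the column name and values
# are hypothetical, shown only to illustrate the shape that make_md consumes:
#   {'csv_basic': {'num_rows': 100, 'num_columns': 2, 'missing': ''},
#    'cols': {'age': {'unique_values': '42 (this includes missing values)',
#                     'unique_value_content': '...',
#                     'missing': 3, 'percent_missing': '3%',
#                     'percent_digit': '100%',
#                     'min_digit': 18.0, 'max_digit': 90.0}}}
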
def make_md(file_name, file_data, headers, target):
    """Write a markdown data profile for one csv file into the target folder."""
    dt = '{:%Y-%b-%d %H:%M:%S}'.format(datetime.datetime.now())
    md = ""
    md += "Data Profile for " + file_name.name + "\n\n"
    md += "Generated on: " + dt + "\n"
    md += "\n\n"
    basic = file_data['csv_basic']
    md += "* Number of columns: " + str(basic['num_columns']) + "\n"
    md += "* Number of rows: " + str(basic['num_rows']) + "\n"
    if basic['missing'] == '':
        missing_print = "(empty string)"
    else:
        missing_print = basic['missing']
    md += "* Using missing value of: " + missing_print + "\n"
    md += "\n"
    info = file_data['columns']
    for key in headers:
        data = info[key]
        md += "**" + key + "**" + "\n"
        md += "-" * (len(key) + 2) + "\n"
        md += "* Description of column: (you fill in)\n"
        md += "* Collection methods: (you fill in)\n"
        md += "* Description of data values and units: (you fill in)\n"
        md += "* Reason for missing values: (you fill in)\n"
        for column, val in data.items():  # go through all the collected column info
            md += "* " + column.replace('_', ' ').capitalize() + ": " + str(val) + "\n"
        md += "\n"
    write_name = file_name.stem + '_DataProfile.md'
    with open(target.absolute() / write_name, 'wt') as fout:
        fout.write(md)

    # html output removed for now; the rendering didn't look good
    # with open(target + write_name + '.html', 'wt') as fout:
    #     fout.write(markdown.markdown(md))

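# A sketch of the markdown each *_DataProfile.md file contains; the column
# name and numbers are hypothetical:
#   Data Profile for example.csv
#
#   Generated on: 2020-Jan-01 12:00:00
#
#   * Number of columns: 2
#   * Number of rows: 100
#   * Using missing value of: (empty string)
#
#   **age**
#   -----
#   * Description of column: (you fill in)
#   ...
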
def get_headers(file):
    """Return the header row of a csv file."""
    with open(file, 'rt', newline='') as fin:
        reader = csv.reader(fin)
        headers = next(reader)
    return headers

def main(source, target, missingcode):
    do_not_write = False
    target = Path(target)
    source = Path(source)
    if source.is_dir():
        files = [p.absolute() for p in source.glob('*')]
        num_files = len(files)
    elif source.is_file():
        files = [source.absolute()]  # forcing this into a list of 1 so the for loop works
        num_files = 1
    else:
        raise FileNotFoundError("Source " + str(source) + " is not an existing file or folder")

    # only report individual file names if there are fewer than 10 to do
    if num_files < 10:  # change this number if you care
        print("Generating profile for: " + ", ".join([str(p) for p in files]))
    else:
        print("Generating profiles for " + str(num_files) + " files")

    if target.is_dir():
        confirm_needed = True
        tstr = str(target.absolute())
        while confirm_needed:
            confirm_overwrite = input("\n" + tstr + " already exists. Do you want to overwrite? (Y/N)\n").upper()
            if confirm_overwrite == "Y":
                confirm_needed = False
                print("Profiles written into " + tstr + "\n")
            elif confirm_overwrite == "N":
                do_not_write = True
                print("Profiles not written.\n")
                break
            else:
                print("Input not understood. Please try again.")
    else:
        target.mkdir()
        print("\n" + str(target.absolute()) + " created")
        print("\nProfiles written into " + str(target.absolute()) + "\n")
    all_file_data = {}

    if not do_not_write:
        for f in files:
            f = Path(f)
            if f.suffix == '.csv':
                finfo = basic_stats(f)
                headers = get_headers(f)
                csvinfo = review_csv(f, missing=missingcode)
                all_file_data[str(f.name)] = {'file_metadata': finfo,
                                              'csv_basic': csvinfo['csv_basic'],
                                              'columns': csvinfo['cols']}
                make_md(f, all_file_data[str(f.name)], headers, target)
        write_name = str(target.stem + '_DataProfiles.json')
        with open(target.absolute() / write_name, 'wt') as jsonout:
            json.dump(all_file_data, jsonout, indent=4)

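# A sketch of the <target>_DataProfiles.json file that main writes; the file
# name and values are hypothetical:
#   {
#       "example.csv": {
#           "file_metadata": {"filename": "...", "size": 1024, ...},
#           "csv_basic": {"num_rows": 100, "num_columns": 2, "missing": ""},
#           "columns": {"age": {"unique_values": "...", ...}}
#       }
#   }
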
if __name__ == "__main__":
    # usage: set the three variables below, then run
    #     python data_profile.py
    # source may be a file or a folder of csv files
    # target must be a folder (it will be created if it doesn't exist)
    # missing_code is the value used to mark missing data; use "" for empty string
    source = "gendercounts.csv"  # existing folder or file here
    target = "gender_docs"  # name the new folder where the results should go
    missing_code = "missing"  # provide the missing data code

    main(source, target, missing_code)