# generate_working_data.py

# Python 3.x
#

import numpy as np
import pandas as pd

import okctools


def function1(df):
    """DataFrame -> DataFrame
    Placeholder for a step that takes a DataFrame generated by the OKCScraper
    and reshapes part of it so it is better suited for analysis.

    Not implemented yet: the input is ignored and None is returned.
    """

    return None


def remove_empty_profiles(df):
    """DataFrame -> None (Operates inplace on DataFrame object)
    Remove empty profiles from OKC profiles DataFrame. A profile is treated
    as empty when its 'name' value is null. After removal the index is reset
    to a fresh 0..n-1 range.

    The rows are renumbered before dropping so that only the flagged rows are
    removed, even if the incoming index contains repeated labels.
    """

    # drop() works on labels, not positions: give every row a unique label
    # first so a repeated label cannot take non-empty rows down with it.
    df.reset_index(drop=True, inplace=True)

    empty_rows = df.index[df['name'].isnull().values]
    df.drop(index=empty_rows, inplace=True)

    # Close the gaps left by the removed rows.
    df.reset_index(drop=True, inplace=True)


def create_wordcount(df):
    """DataFrame -> DataFrame
    Add a 'wordcount' column computed by applying okctools.count_words to
    each value of the 'essays' column of an OKC profiles DataFrame.

    A new DataFrame is returned. If the input has no 'essays' column a
    message is printed and the input is returned untouched.
    """

    if 'essays' in df.columns:
        counts = [okctools.count_words(text) for text in df.loc[:, 'essays']]
        # assign() builds a new frame;
        # "df.loc[:, wordcount] = stuff" gives a warning
        return df.assign(wordcount=counts)

    print('Input DataFrame does not contain column "essays".')
    return df


def lists_to_dummies(df, drop=False):
    """DataFrame, bool -> DataFrame
    Turn list values into dummy variables. For example, df.orientation[0] might
    be the list ['pansexual','queer'], which would be turned into the dummy
    variables orientation_pansexual and orientation_queer

    Every column other than 'essays' whose first non-null value is a list is
    expanded into one integer 0/1 column per distinct list element, named
    "<column>_<element>". Rows whose value is null get 0 in every dummy
    column. Columns with no valid values are reported and skipped.

    The input DataFrame is not modified; the expanded DataFrame is returned.
    If drop is True, the original list columns are left out of the result.

    TODO:
    add arg to specify exactly which columns to dummify
    """

    # Fix the candidate list up front: df is rebound below as dummy columns
    # are appended, and those new columns must not be revisited. Filtering by
    # name (rather than dropping 'essays') also works when 'essays' is absent.
    candidates = [col for col in df.columns if col != 'essays']

    for col in candidates:
        valid = df[col].dropna()
        if valid.empty:
            print('Column "{}" has no valid values. You can remove these with '
                  'the "remove_empty_columns" function'
                  .format(col))
            continue
        if not isinstance(valid.iloc[0], list):
            continue

        # Distinct list elements, in order of first appearance.
        labels = list(dict.fromkeys(item for lst in valid for item in lst))

        df_dummies = pd.DataFrame(
            {'{}_{}'.format(col, label): [int(label in lst) for lst in valid]
             for label in labels},
            index=valid.index)
        # reindex returns a new frame, so keep the result: rows that had a
        # null in this column receive 0 for every dummy variable.
        df_dummies = df_dummies.reindex(index=df.index, fill_value=0)

        df = pd.concat([df, df_dummies], axis=1, sort=False)
        if drop:
            df = df.drop(columns=col)

    return df


def remove_empty_columns(df):
    """DataFrame -> None (Operates inplace on DataFrame object)
    Drop every column of the DataFrame whose values are all null (NaN/None).
    The DataFrame passed in is itself modified; nothing is returned.
    """

    all_null_columns = [label for label in df.columns
                        if all(df[label].isnull())]

    # Dropping in place is intentional: the caller's DataFrame is changed.
    df.drop(columns=all_null_columns, inplace=True)


def repair_lang_features(df, lang_file='okc_lang_options.txt'):
    """DataFrame, str -> None (Operates inplace on DataFrame object)
    Fix lang_primary and lang_secondary fields from data collected using older
    versions of fetchusers.py that resulted in stray words being collected into
    the data such as 'very' and 'it'

    Each list in the two columns is rebuilt so that it only contains entries
    found in lang_file (one language per line, compared in lower case), with
    duplicates removed. A list containing 'c' gains 'c++' and a list
    containing 'sign' gains 'sign language'; these repaired names come first.
    Missing values (None or NaN) are written back as NaN.

    lang_file: path to the text file of valid language names. Defaults to
    'okc_lang_options.txt' in the current working directory.

    Raises OSError if lang_file cannot be opened.
    """

    # List of languages directly from OKC website. A set gives constant-time
    # membership tests and the with-block closes the file promptly.
    with open(lang_file, 'r') as infile:
        langs = set(infile.read().lower().splitlines())

    # Stray fragment -> language name it stands for.
    # NOTE(review): presumably 'c++' and 'sign language' were broken into
    # these fragments by the old scraper -- confirm against fetchusers.py.
    fragment_repairs = (('c', 'c++'), ('sign', 'sign language'))

    for column in ('lang_primary', 'lang_secondary'):
        lang_col_new = []
        for langlist in df[column]:
            # Detect missing values directly instead of relying on identity
            # with np.nan, which misses None and other NaN objects.
            if langlist is None or (isinstance(langlist, float)
                                    and np.isnan(langlist)):
                lang_col_new.append(np.nan)
                continue

            langlist_new = []
            langset = set()  # To check and avoid duplicates
            for fragment, fullname in fragment_repairs:
                if fragment in langlist:
                    langlist_new.append(fullname)
                    # Remember it so a later exact match is not added twice.
                    langset.add(fullname)
            for lang in langlist:
                if lang in langs and lang not in langset:
                    langlist_new.append(lang)
                    langset.add(lang)
            lang_col_new.append(langlist_new)
        df[column] = lang_col_new


# Script entry point: load the scraped profiles, clean them in place with the
# helpers above, then apply a few manual corrections.
if __name__ == "__main__":

    # NOTE(review): infolder/outfolder are not referenced in the lines visible
    # here -- presumably used further down or left over; confirm.
    infolder = 'data/input/'
    outfolder = 'data/working/'

    df = okctools.load_profiles_df(version='py3')
    # These three helpers modify df in place and return None.
    remove_empty_profiles(df)
    remove_empty_columns(df)
    repair_lang_features(df)
    # Dummy-variable expansion is currently disabled.
    #df = lists_to_dummies(df)

    # See the notebook for details on the "why" of what I'm doing here.
    # Rows are addressed by index label, i.e. the labels in effect after the
    # in-place cleaning calls above.
    df.loc[502, 'status'] = 'single' # Manual inference
    df.loc[203, 'gender'] = ['woman'] # Manual inference, simplistic assumption

    # Column-name groupings; not used in the lines visible here.
    # NOTE(review): both lists end with an empty string and 'build' appears
    # twice in leave_alone -- confirm whether that is intended.
    replace_empty_list = ['lang_primary', 'lang_secondary', 'ethnicity', '']
    leave_alone = ['dogs', 'cats', 'height', 'smokes', 'build', 'drinks',
                   'build', 'education', 'ed_prefix', '']