"""Exploratory visualizations of the Georgia voter file.

Recovered from the diff that introduced ``data-visualizations-code.py``.

The script reads the Georgia voter-file parquet extract with Spark, derives a
few small aggregates, and draws them with pandas / matplotlib / seaborn.
Heavy work (grouping, counting, averaging) is done in Spark so that only
small result tables are converted to pandas.

Data subsets used:
    * Plots 1-5 and the interest table: voters aged 18-30 in Murray County.
    * Plots 6-7.5: a 1% random sample of the whole state.
    * Plots 8-13: the whole state.
"""
# The star import is kept so that any code relying on it keeps working, but it
# shadows builtins such as ``sum``, ``round``, ``min`` and ``max``.  Everything
# below uses the explicit ``fn.`` namespace instead.  It is listed first so
# that the explicit imports after it always win on a name clash.
from pyspark.sql.functions import *  # noqa: F401,F403

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from pyspark import SparkConf
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.feature import Bucketizer
from pyspark.ml.feature import OneHotEncoder
from pyspark.ml.feature import RFormula
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit
from pyspark.sql import SparkSession
from pyspark.sql import functions as fn
from pyspark.sql.functions import col
from pyspark.sql.functions import when
from statsmodels.graphics.factorplots import interaction_plot

ETHNICITY_COLUMN = "EthnicGroups_EthnicGroup1Desc"
INCOME_BRACKET_COLUMN = "CommercialData_EstimatedHHIncome"
BAR_COLOR = "#2A7DBD"
# Pattern that strips the currency formatting from income strings.
CURRENCY_CHARS_PATTERN = "[\\$,]"


def voted_flag(column_name):
    """Return a 0/1 integer Column: 1 where *column_name* equals 'Y', else 0.

    Nulls count as 0 ("did not vote").
    NOTE(review): assumes the election columns only hold 'Y' or null - confirm.
    """
    return fn.when(fn.col(column_name) == "Y", 1).otherwise(0)


def interest_flag(column_name):
    """Return a 0/1 integer Column: 1 where *column_name* equals 'Yes', else 0."""
    return fn.when(fn.col(column_name) == "Yes", 1).otherwise(0)


def income_bracket_floor(column_name):
    """Return an integer Column holding the leading number of an income bracket.

    Used only as a sort key so brackets are ordered numerically rather than
    as strings.  Values without a leading number yield null.
    NOTE(review): presumes brackets look like "$15000-24999" - confirm.
    """
    without_currency = fn.regexp_replace(
        fn.col(column_name), CURRENCY_CHARS_PATTERN, ""
    )
    return fn.regexp_extract(without_currency, "^(\\d+)", 1).cast("int")


spark = SparkSession.builder.appName("Read Voter File Data").getOrCreate()

# pull Georgia data from GCP bucket storage
ga_data = (
    spark.read.format("parquet")
    .option("nullValue", "null")
    .load("gs://voter-files-16/VM2Uniform--GA--2021-04-16/*.parquet")
)

# random sample to downsize data
ga_samp = ga_data.sample(withReplacement=False, fraction=0.01, seed=42)


##########################################################
## PLOT 1 - ethnicity count v. num of registered voters ##
##########################################################
# NOTE(review): plots 1-5 are built from young (18-30) Murray County voters
# only, although several titles say "Georgia" / "Dataset" - confirm wording.
murray_county = ga_data.filter(fn.col("County") == "MURRAY").cache()
murray_county.count()  # action: materializes the cache before it is reused
young_voters = (
    murray_county.withColumn("Voters_Age", fn.col("Voters_Age").cast("int"))
    .filter(fn.col("Voters_Age").between(18, 30))
    .cache()
)

df_GA_ethnicgroup = (
    young_voters.groupBy(ETHNICITY_COLUMN)
    .agg(fn.count(ETHNICITY_COLUMN).alias("ethnicgroup_count"))
    .dropna()  # removes the group whose ethnicity is null
    .orderBy(fn.col("ethnicgroup_count").desc())
)

df_GA_ethnicgroup_plt = df_GA_ethnicgroup.toPandas()
plt.rcParams.update({"font.size": 10})
df_GA_ethnicgroup_plt.plot.bar(
    x=ETHNICITY_COLUMN,
    y="ethnicgroup_count",
    xlabel="Ethnicity",
    ylabel="# of Registered Voters",
    title="Count of Ethnicities in Dataset",
    rot=15,
    legend=False,
    color=BAR_COLOR,
)
plt.show()


##############################################
## PLOT 2 - ethnicity v. 2020 voter turnout ##
##############################################
df_GA_ethnicgroup_turnout = (
    young_voters.groupBy(ETHNICITY_COLUMN)
    .agg(fn.avg(voted_flag("General_2020")).alias("general_2020_turnout"))
    .dropna()
    .orderBy(fn.col("general_2020_turnout").desc())
    .toPandas()
)

df_GA_ethnicgroup_turnout.plot.bar(
    x=ETHNICITY_COLUMN,
    y="general_2020_turnout",
    xlabel="Ethnicity",
    ylabel="General 2020 Election Turnout",
    title="General 2020 Election Turnout by Ethnicity",
    rot=15,
    legend=False,
    color=BAR_COLOR,
)
plt.show()


#################################################
## PLOT 3 - ethnicity distr. of Georgia voters ##
#################################################
# Voters without an ethnicity are dropped to match the plot title.
ga_group = young_voters.select(ETHNICITY_COLUMN).dropna().groupBy(ETHNICITY_COLUMN).count()

# Collect once so labels and counts are guaranteed to come from the same rows.
ethnicity_rows = ga_group.collect()
ethga = [row[ETHNICITY_COLUMN] for row in ethnicity_rows]
numga = [row["count"] for row in ethnicity_rows]

fig, ax = plt.subplots()
patches, texts, autotexts = ax.pie(
    numga,
    labels=ethga,
    autopct="%.2f%%",
    textprops={"size": "smaller"},
)
plt.setp(autotexts, size="x-small")
# Style every percentage label, however many groups there are.
for autotext in autotexts:
    autotext.set_color("white")
plt.title("Ethnic Distribution of Georgia Voters who Specified Ethnicity")
plt.show()


##############################################################################
## Shared prep for PLOT 4, PLOT 5 and the interest TABLE                    ##
##############################################################################
# Short display names for the verbose commercial-interest columns.
rename_map = {
    'CommercialDataLL_Interest_in_Camping_Hiking_In_Household': 'Camping_Hiking',
    'CommercialDataLL_Interest_in_Cooking_General_In_Household': 'Cooking_General',
    'CommercialDataLL_Interest_in_Cooking_Gourmet_In_Household': 'Cooking_Gourmet',
    'CommercialDataLL_Interest_in_Crafts_In_Household': 'Crafts',
    'CommercialDataLL_Interest_in_Current_Affairs_Politics_In_Household': 'Current_Affairs_Politics',
    'CommercialDataLL_Interest_in_Education_Online_In_Household': 'Education_Online',
    'CommercialDataLL_Interest_in_Electronic_Gaming_In_Household': 'Electronic_Gaming',
    'CommercialDataLL_Interest_in_Exercise_Health_In_Household': 'Exercise_Health',
}

# Apply the column renaming to the dataframe
renamed_df = young_voters
for old_name, new_name in rename_map.items():
    renamed_df = renamed_df.withColumnRenamed(old_name, new_name)

# List of interest columns (renamed)
interest_columns = list(rename_map.values())

# One Spark job yields the row total and the number of 'Yes' values for every
# interest column; anything that is not 'Yes' (including null) counts as 0.
interest_totals = renamed_df.agg(
    fn.count(fn.lit(1)).alias("total_rows"),
    *[fn.sum(interest_flag(name)).alias(name) for name in interest_columns],
).first()
total_rows = interest_totals["total_rows"]

interest_rows = []
for name in interest_columns:
    count_yes = interest_totals[name] or 0  # sum over zero rows is null
    count_null = total_rows - count_yes
    interest_rows.append(
        {
            "Interest_Column": name,
            "Count_Yes": count_yes,
            "Count_Null": count_null,
            "Ratio_Yes": count_yes / total_rows if total_rows else 0,
        }
    )

result_df = pd.DataFrame(
    interest_rows,
    columns=["Interest_Column", "Count_Yes", "Count_Null", "Ratio_Yes"],
)


##############################################################################
## PLOT 4 - counts of voters with recreational/personal interests specified ##
##############################################################################
fig, axs = plt.subplots(1, len(interest_columns), figsize=(15, 6))

for ax, summary in zip(axs, result_df.itertuples(index=False)):
    # Two bars per interest: 0 = not specified, 1 = 'Yes'.
    bars = pd.DataFrame(
        {
            summary.Interest_Column: [0, 1],
            "count": [summary.Count_Null, summary.Count_Yes],
        }
    )
    sns.barplot(x=summary.Interest_Column, y="count", data=bars, ax=ax)
    ax.tick_params(axis="x", rotation=45)  # Rotate x-axis labels

plt.tight_layout()
plt.show()


#############################################################
## PLOT 5 - plot 4 information but w/ proportion not count ##
#############################################################
fig, axs = plt.subplots(1, len(interest_columns), figsize=(15, 6))

for ax, summary in zip(axs, result_df.itertuples(index=False)):
    ratio_no = summary.Count_Null / total_rows if total_rows else 0
    bars = pd.DataFrame(
        {
            summary.Interest_Column: [0, 1],
            "Ratio": [ratio_no, summary.Ratio_Yes],
        }
    )
    sns.barplot(x=summary.Interest_Column, y="Ratio", data=bars, ax=ax)
    ax.set_title(f"{summary.Interest_Column}")
    ax.tick_params(axis="x", rotation=45)  # Rotate x-axis labels

plt.tight_layout()
plt.show()


###############################################
## TABLE - count and ratio for each interest ##
###############################################
print("Count and Ratio for each Interest Column:")
print(result_df)


###########################################################
## PLOT 6 - 2020 voter turnout v. Georgia regional areas ##
###########################################################
area_columns = [
    'General_2020',
    'County',
    'Residence_Addresses_Latitude',
    'Residence_Addresses_Longitude',
    'CommercialData_EstimatedHHIncomeAmount',
    'CommercialData_EstimatedAreaMedianHHIncome',
]
ga_area = (
    ga_samp.select(area_columns)
    .withColumnRenamed('General_2020', 'General 2020')
    .withColumnRenamed('Residence_Addresses_Latitude', "Latitude")
    .withColumnRenamed('Residence_Addresses_Longitude', "Longitude")
    .withColumnRenamed("CommercialData_EstimatedHHIncomeAmount", "Household Income")
    .withColumnRenamed("CommercialData_EstimatedAreaMedianHHIncome", "Median Household Income")
)

# cast proper column types, fix format ("$" and "," are stripped, as in plot 13)
for income_column in ("Household Income", "Median Household Income"):
    ga_area = ga_area.withColumn(
        income_column,
        fn.regexp_replace(fn.col(income_column), CURRENCY_CHARS_PATTERN, "").cast("int"),
    )
ga_area = ga_area.withColumn('Latitude', fn.col('Latitude').cast('float'))
ga_area = ga_area.withColumn('Longitude', fn.col('Longitude').cast('float'))

# ratio of null values in each column; the row total is computed only once
area_row_count = ga_area.count()
null_counts = ga_area.select(
    [
        fn.round(
            fn.count(fn.when(fn.col(c).isNull(), 1)) / fn.lit(area_row_count), 3
        ).alias(c)
        for c in ga_area.columns
    ]
)

print("Ratio of Null Values for each Area and Income Column:\n")
null_counts.show()  # show() prints by itself and returns None

# Only the columns the income maps need must be present.  'General 2020' is
# null for people who did not vote, so it must not take part in the filter.
ga_area_clean = ga_area.dropna(
    subset=["Latitude", "Longitude", "Household Income", "Median Household Income"]
)
print("Rows with coordinates and income:", ga_area_clean.count())

ga_area_vote = ga_area.withColumn('General 2020', voted_flag('General 2020'))

# coarsen gridsize for plotting: snap each voter to the centre of a
# step x step degree cell and average turnout per cell inside Spark
step = .05
ga_area_turnout = (
    ga_area_vote.dropna(subset=["Latitude", "Longitude"])
    .withColumn("long_bucket", (fn.floor(fn.col("Longitude") / step) + 0.5) * step)
    .withColumn("lat_bucket", (fn.floor(fn.col("Latitude") / step) + 0.5) * step)
    .groupBy("long_bucket", "lat_bucket")
    .agg(fn.avg("General 2020").alias("General 2020"))
    .toPandas()
)

plt.figure(figsize=(10, 6))
plt.scatter(data=ga_area_turnout, x='long_bucket', y='lat_bucket', c='General 2020', alpha=0.8)
plt.colorbar(label='General 2020 Turnout')
plt.title('General 2020 Voter Turnout Across Regions within Georgia')
plt.xlabel('Longitude')
plt.ylabel('Latitude')
plt.grid(True)
plt.show()


#########################################################
## PLOT 7 - household income v. Georgia regional areas ##
#########################################################
ga_area_pandas = ga_area_clean.toPandas()

# Plot longitude and latitude with income data
plt.figure(figsize=(10, 6))
plt.scatter(data=ga_area_pandas, x='Longitude', y='Latitude', c='Household Income', alpha=0.8)
plt.colorbar(label='Household Income')
plt.title('Household Income Across Regions within Georgia')
plt.xlabel('Longitude')
plt.ylabel('Latitude')
plt.grid(True)
plt.show()

#############################################################################################################
## PLOT 7.5 - median household income v. Georgia regional areas (smoother, more interpretable than Plot 7) ##
#############################################################################################################
plt.figure(figsize=(10, 6))
plt.scatter(data=ga_area_pandas, x='Longitude', y='Latitude', c='Median Household Income', alpha=0.8)
plt.colorbar(label='Median Household Income')
plt.title('Median Household Income Across Regions within Georgia')
plt.xlabel('Longitude')
plt.ylabel('Latitude')
plt.grid(True)
plt.show()


############################################
## PLOT 8 - avg voter turnout rate v. age ##
############################################
# Age is cast to int (as for plot 1) so ordering by age is numeric.
relevant_data = ga_data.select(
    fn.col("Voters_Age").cast("int").alias("Voters_Age"),
    "Voters_Gender",
    "Ethnic_Description",
    INCOME_BRACKET_COLUMN,
    "General_2020", "General_2016", "General_2012",  # Add or remove years as needed
)

converted_data = relevant_data.withColumn("Voted_2020", voted_flag("General_2020"))

# NOTE(review): age 100 is excluded, presumably as a placeholder value - confirm.
# The comparison also drops rows whose age is null.
filtered_data = converted_data.filter(fn.col("Voters_Age") != 100)

turnout_by_age = (
    filtered_data.groupBy("Voters_Age")
    .agg(fn.avg("Voted_2020").alias("Average_Turnout_2020"))
    .orderBy("Voters_Age")
)

turnout_by_age_pd = turnout_by_age.toPandas()

plt.figure(figsize=(14, 8))
sns.barplot(x="Voters_Age", y="Average_Turnout_2020", data=turnout_by_age_pd, palette="viridis")
plt.title("Average Voter Turnout by Age for General 2020")
plt.xlabel("Age")
plt.ylabel("Average Turnout Rate")
plt.xticks(rotation=90)
plt.tight_layout()
plt.show()


###################################################################
## PLOT 9 - 2020 voter turnout v. total registered voters v. age ##
###################################################################
total_and_voted_by_age = (
    filtered_data.groupBy("Voters_Age")
    .agg(
        fn.count("*").alias("Total_Voters"),
        fn.sum("Voted_2020").alias("Voters_Who_Voted"),
    )
    .orderBy("Voters_Age")
)

total_and_voted_by_age_pd = total_and_voted_by_age.toPandas()

plt.figure(figsize=(14, 8))
# The second bar series is drawn over the first on the same axes.
sns.barplot(x="Voters_Age", y="Total_Voters", data=total_and_voted_by_age_pd, color='lightgrey', label='Total Voters')
sns.barplot(x="Voters_Age", y="Voters_Who_Voted", data=total_and_voted_by_age_pd, color='blue', label='Voters Who Voted')
plt.legend()
plt.title("Total Voters and Voters Who Voted by Age for General 2020")
plt.xlabel("Age")
plt.ylabel("Count")
plt.xticks(rotation=90)
plt.tight_layout()
plt.show()


##########################################################
## PLOT 10 - 2020 voter turnout v. est household income ##
##########################################################
# Brackets are ordered by their leading number; the bracket text is only a
# tie-breaker, so the order stays deterministic if no number can be extracted.
income_turnout_relationship = (
    converted_data.groupBy(INCOME_BRACKET_COLUMN)
    .agg(fn.avg("Voted_2020").alias("Average_Turnout_By_Income"))
    .orderBy(income_bracket_floor(INCOME_BRACKET_COLUMN), INCOME_BRACKET_COLUMN)
)

income_turnout_relationship_pd = income_turnout_relationship.toPandas()

plt.figure(figsize=(14, 8))
# sort=False keeps the Spark ordering; the default would re-sort the bracket
# labels as strings and connect the points in the wrong order.
sns.lineplot(
    x=INCOME_BRACKET_COLUMN,
    y="Average_Turnout_By_Income",
    data=income_turnout_relationship_pd,
    sort=False,
)
plt.title("Voter Turnout by Estimated Household Income for General 2020")
plt.xlabel("Estimated Household Income")
plt.ylabel("Average Turnout Rate")
plt.xticks(rotation=90)
plt.tight_layout()
plt.show()


####################################################################
## PLOT 11 - 2020 avg voter turnout v. household income v. gender ##
####################################################################
turnout_by_income_gender = (
    converted_data.groupBy(INCOME_BRACKET_COLUMN, "Voters_Gender")
    .agg(fn.avg("Voted_2020").alias("Average_Turnout_By_Income_Gender"))
    .orderBy(
        income_bracket_floor(INCOME_BRACKET_COLUMN),
        INCOME_BRACKET_COLUMN,
        "Voters_Gender",
    )
)

turnout_by_income_gender_pd = turnout_by_income_gender.toPandas()

# Plotting the relationship between income, gender, and voter turnout
plt.figure(figsize=(12, 6))
sns.barplot(
    x=INCOME_BRACKET_COLUMN,
    y="Average_Turnout_By_Income_Gender",
    hue="Voters_Gender",
    data=turnout_by_income_gender_pd,
    palette="muted",
)
plt.title("Average Voter Turnout by Income and Gender for General 2020")
plt.xlabel("Estimated Household Income")
plt.ylabel("Average Turnout Rate")
plt.xticks(rotation=90)
plt.tight_layout()  # Adjust layout to make room for the x-axis labels
plt.legend(title='Gender')
plt.show()


############################################################
## PLOT 12 - avg age v. ethnicity among registered voters ##
############################################################
avg_age_by_ethnicity = (
    ga_data.groupBy("Ethnic_Description")
    .agg(fn.avg("Voters_Age").alias("Average_Age"))
    .orderBy("Average_Age")
)

avg_age_by_ethnicity_pd = avg_age_by_ethnicity.toPandas()

plt.figure(figsize=(12, 6))
sns.barplot(x="Ethnic_Description", y="Average_Age", data=avg_age_by_ethnicity_pd, palette="coolwarm")
plt.title("Average Age by Ethnic Group")
plt.xlabel("Ethnic Group")
plt.ylabel("Average Age")
plt.xticks(rotation=90)
plt.tight_layout()
plt.show()


#########################################################################
## PLOT 13 - avg household income v. ethnicity among registered voters ##
#########################################################################
ga_data = ga_data.withColumn(
    "Cleaned_Income",
    fn.regexp_replace(
        fn.col("CommercialData_EstimatedHHIncomeAmount"), CURRENCY_CHARS_PATTERN, ""
    ).cast("integer"),
)
income_by_ethnicity = (
    ga_data.groupBy("Ethnic_Description")
    .agg(fn.avg("Cleaned_Income").alias("Average_Income"))
    .orderBy("Ethnic_Description")
)
income_by_ethnicity_pd = income_by_ethnicity.toPandas()
# Groups with no parsable income have a null average and are left out.
income_by_ethnicity_pd = income_by_ethnicity_pd.dropna(subset=["Average_Income"])
income_by_ethnicity_pd_sorted = income_by_ethnicity_pd.sort_values(by="Average_Income")

plt.figure(figsize=(14, 8))
barplot = sns.barplot(
    x="Ethnic_Description",
    y="Average_Income",
    data=income_by_ethnicity_pd_sorted,
    palette='Spectral',
)
plt.xticks(rotation=90)
plt.xlabel("Ethnic Group")
plt.ylabel("Average Household Income")
plt.title("Average Household Income by Ethnic Group")
plt.tight_layout()
plt.show()