-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathvisualizingPay.py
719 lines (573 loc) · 30.8 KB
/
visualizingPay.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
from pandas import Series, DataFrame
from pylab import *
import statsmodels.api as sm
from matplotlib.ticker import MaxNLocator
from matplotlib.font_manager import FontProperties
from scipy import stats
from scipy.stats import norm
plt.style.use('fivethirtyeight')
# Load In the Data
df = pd.read_csv("/Users/justinnunez/Downloads/Citywide_Payroll_Data__Fiscal_Year_.csv")
# Delete columns that aren't needed for analysis.
# . Names not needed
del df["Mid Init"]
del df["First Name"]
del df["Last Name"]
# Check columns were deleted.
df.head(n=15)
df.tail(n=15)
# See all of our data columns
df.columns
# Year - Agency - Start Date - Work Location - Title Descr. - Leave Status (June 30) - Base Salary - Pay Basis - Regular Hours
# - Regular Gross Pay - OT Hours - Total OT paid - Total Other Pay.
print(df.shape) # 2 million rows of data. 13 Column Variables.
df.info()
# This shows us that our numerical data is in format of object not integer, we need to cast them into integers to run analysis.
# We can confirm this by running .describe() function which prints summary statistics of only numerical data
# We can see that it only outputted data for year and hours which are int/float variables.
# this just states that our numbers aren't being read as numbers but as objects, plain variables.
df.describe()
# Before we do this let us check if all our variables are filled correctly and don't have missing values...
df.isnull().sum() # A lot of missing values in our variable "Work Location Borough", 23% of it is missing exactly.
#
# And 21 missing values for Title Description. That's not bad.
## Data Cleaning -
df["Title Description"].value_counts() # Some titles have non-alpha characters, we would need to clean this up as well.
# this shows us how many different types of occupations there are and how many times they occur.
df["Base Salary"] = df["Base Salary"].str.replace('$','')
df["Regular Gross Paid"] = df["Regular Gross Paid"].str.replace('$','')
df["Total OT Paid"] = df["Total OT Paid"].str.replace('$','')
df["Total Other Pay"] = df["Total Other Pay"].str.replace('$','')
# Get rid of whitespaces
df["Regular Gross Paid"] = df["Regular Gross Paid"].str.strip()
df["Pay Basis"] = df["Pay Basis"].str.strip()
# Convert "Base Salary" to Integer and rest to floats.
df["Base Salary"] = df["Base Salary"].astype(float)
df["Base Salary"] = df["Base Salary"].astype(int)
df["Regular Gross Paid"] = df["Regular Gross Paid"].astype(float)
df["Total OT Paid"] = df["Total OT Paid"].astype(float)
df["Total Other Pay"] = df["Total Other Pay"].astype(float)
df["Title Description"] = df["Title Description"].astype(str)
df["Title Description"] = df["Title Description"].str.strip()
# Gets rid of Non-Alpha Characters in this column
df["Title Description"]= df["Title Description"].str.replace('[^A-Za-z\s]+', '')
df["Title Description"] = df["Title Description"].str.lstrip()
df["Title Description"] = df["Title Description"].str.lower()
'
df["job_title"] = df["Title Description"].str.extract('([a-zA-Z ]+)', expand=False).str.strip().str.lstrip()
df["Start_Year"] = df["Agency Start Date"].str[6:10]
df["Start_Year"] = df["Start_Year"].astype(int)
df.head()
# BOXPLOT FOR GENERAL BASE PAY
df.columns
ax8=plt.axes()
# Subset taken of years 2000 to 2017, paid annually, with base and gross above 10,000.
nof = df[(df["Start_Year"] >= 2000) & (df["Start_Year"]< 2018) & (df["Pay Basis"] == "per Annum") &(df["Regular Gross Paid"] > 10000) & (df["Base Salary"] > 10000)]
nof["Start_Year"].value_counts()
nof["Base Salary"]
nof.head()
sb.boxplot(x=nof["Start_Year"],
y=nof["Base Salary"], data=nof, palette='hls',ax=ax8)
plt.yticks(fontsize=11.0, weight='bold',color='black')
plt.ylabel('$ ', fontsize=23, rotation=0, weight='bold')
plt.xlabel('\n Year Started Working',fontsize=18, weight='bold')
#plt.yticks(range(10000,200000,10000), fontsize=13, weight= 'bold')
plt.xticks(fontsize=16, weight= 'bold')
ax8.set_title("Boxplot Distribution of Base Salary (USD) for Year Started Working Across All Occupations", weight='bold')
#
# TOP 8 Average Paid Jobs Plot
rcParams['figure.figsize'] = 12,8
nof.groupby("job_title")["Base Salary"].mean().sort_values(ascending=False)[:8].plot(kind='barh')
plt.ylabel('', fontsize=23, rotation=0, weight='bold')
plt.xlabel(' \n $ (USD) ',fontsize=18, weight='bold')
plt.yticks(fontsize=13, weight= 'bold')
plt.xticks(fontsize=16, weight= 'bold')
plt.text(x=0,y=8.1, s="Top 8 Average Paid Job Titles",weight='bold',fontsize=25)
##
###### --- (3/13/18) New - Job Analysis Boxplot for Occupations with the Most Employees!! -- ########
df["job_title"].value_counts()
sub1 = df[(df["Pay Basis"] == "per Annum") & (df["Fiscal Year"] >= 2000) & (df["Fiscal Year"] <= 2017)]
sub1["job_title"].value_counts().index
sub1.head()
# Merge Teacher job titles into 1 category!
diction = {'teacher special education':'teacher'}
sub1['job_title'] = sub1.job_title.replace(diction)
# get all values that are not NULL !
sub1 = sub1[~sub1.job_title.isnull()]
# to check if nulls went away
sub1.isnull().sum()
# Top 20 job titles with most employees registered on annual pay from 2000 to 2017!
sub_dict = set(sub1["job_title"].value_counts().index[:20]) # store the top 20
sub_dict_containing = sub1[sub1['job_title'].isin(sub_dict)].copy() # find them in DF and Copy
sub_dict_containing.head()
# data for boxplots.
# Get index and values. .. Set them to variables
x0 = sub_dict_containing["job_title"]
y0 = sub_dict_containing[sub_dict_containing["Regular Gross Paid"] > 5000]["Regular Gross Paid"]
rcParams['figure.figsize'] = 26,20
axx = plt.axes()
sb.boxplot(x=x0,y=y0, data=sub_dict_containing, palette='husl',ax=axx)
axx.set_xticklabels(axx.get_xticklabels(),rotation =40, ha="right") #fontname='Calibri'
#sb.set_context("poster")
plt.ylabel('Gross Pay ', fontsize=20, rotation=0, weight='bold')
plt.xlabel('Occupation',fontsize=25, weight='bold')
plt.yticks(range(0,450000,30000),fontsize=19.0, weight='bold',color='black')
plt.xticks(fontsize=18.3, weight='bold',color='black')
#plt.show()
axx.set_title("Gross Pay Distribution Based on Occupations with The Most Employees",fontsize=37, weight='bold')
plt.text(x=1.78, y=500000, s="Annual Paid Employees From Years 2000 to 2017", fontsize=14, weight='bold',alpha=.74)
# ------ # ------ # - ----- # ----- # ------ # ----- #
# === # == # == # == # == # == # == # == # == #
# We are going to see relationship between years of experience and Gross Pay!
# Create new column yearsOfExp
newdata = df[(df["Pay Basis"] == "per Annum") & (df["Fiscal Year"] == 2017) & (df["Start_Year"] >= 2000) & (df["Start_Year"] <= 2017) & (df["Regular Gross Paid"] > 20000)]
len(newdata)
newdata.head()
# newData. Has data from years 2000 to 2017, registered in 2017.
newdata['yearsOfexp'] = np.nan
newdata.info()
newdata.head()
# function to create new column with years of experience!
def year_approx(cols):
yearsOfexp = cols[0]
Start_Year =cols[1]
if pd.isnull(yearsOfexp):
if Start_Year == 2000:
return 18
elif Start_Year==2001:
return 17
elif Start_Year==2002:
return 16
elif Start_Year==2003:
return 15
elif Start_Year==2004:
return 14
elif Start_Year==2005:
return 13
elif Start_Year==2006:
return 12
elif Start_Year==2007:
return 11
elif Start_Year==2008:
return 10
elif Start_Year==2009:
return 9
elif Start_Year==2010:
return 8
elif Start_Year==2011:
return 7
elif Start_Year==2012:
return 6
elif Start_Year==2013:
return 5
elif Start_Year==2014:
return 4
elif Start_Year==2015:
return 3
elif Start_Year==2016:
return 2
elif Start_Year==2017:
return 1
else:
return np.nan
newdata['yearsOfexp'] = newdata[['yearsOfexp', 'Start_Year']].apply(year_approx, axis=1)
newdata['yearsOfexp'].corr(newdata['Regular Gross Paid']) # check correlation
newdata['yearsOfexp'].corr(newdata['Base Salary']) # check correlation
from scipy.stats import spearmanr
sb.jointplot("yearsOfexp", "Regular Gross Paid", data=newdata,stat_func=spearmanr,color="m")
rcParams['figure.figsize'] = 12,8
sb.heatmap(newdata.corr()) # heatmap plot
# Pairplot!
sb.pairplot(newdata)
# - - PLOT: Gross PAY vs. YEAR OF EXP
#plt.figure.gca().xaxis.set_major_locator(MaxNLocator(integer=True))
newdata.groupby(['yearsOfexp'])['Regular Gross Paid'].median().plot(color='teal')
newdata.groupby(['yearsOfexp'])['Regular Gross Paid'].mean().plot(color='orangered')
plt.axhline(y = 20000, color = 'black', linewidth = 1.3, alpha = .7)
plt.yticks(range(20000,100000,10000), weight='bold',color='black')
plt.xticks(range(1,19,3),fontsize=10, weight='bold',color='black')
plt.ylabel('$ (USD) ', fontsize=12, rotation=0, weight='bold')
plt.xlabel('\n Years of Experience',fontsize=13.1, weight='bold')
plt.text(x=0, y = 100000, s= 'Gross Salary based on Experience',fontsize = 27.5, weight = 'bold', alpha = .75)
plt.text(x=0, y= 96000, s='Across All Occupations for employees on annual pay above $20,000',fontsize=13.2, alpha=.65)
plt.text(x=13.1, y =87000, s='Median',color='teal',weight='bold',fontsize = 14,alpha = .85)
plt.text(x=13.1, y= 72500, s='Mean',color='orangered',weight='bold',fontsize=14, alpha=.85)
sb.despine()
# # # --- --- BOXPLOT: YEAR OF EXP vs. Gross PAY!
ax8=plt.axes()
rcParams['figure.figsize'] = 25,17
sb.boxplot(x=newdata["yearsOfexp"],
y=newdata[newdata["Regular Gross Paid"] < 200000]['Regular Gross Paid'], data=newdata, palette='hls',ax=ax8)
plt.yticks(range(0,200000,15000),fontsize=18.0, weight='bold',color='black')
plt.ylabel('$ (USD) ', fontsize=23, rotation=0, weight='bold')
plt.xlabel('\n Years of Experience',fontsize=28, weight='bold')
#plt.yticks(range(10000,200000,10000), fontsize=13, weight= 'bold')
plt.xticks(fontsize=21, weight= 'bold')
plt.axhline(y=1000, color = 'black', linewidth = 1.3, alpha = .7)
ax8.set_title("Boxplot Distribution of Gross Salary (USD) based on Experience Across All Occupations",fontsize= 30, weight='bold')
## PLOT: Gross Pay - Base Pay
newdata["GrossMinusBase"] = newdata["Regular Gross Paid"].subtract(newdata["Base Salary"], fill_value=0)
newdata["GrossMinusBase"].plot()
len(newdata["Regular Gross Paid"])
len(newdata["Base Salary"])
len(newdata["GrossMinusBase"])
newdata["GrossMinusBase"].count()
newdata["GrossMinusBase"].mean()
newdata["GrossMinusBase"].median()
newdata["GrossMinusBase"].std()
newdata["GrossMinusBase"].max()
newdata["GrossMinusBase"].min()
newdata[newdata["GrossMinusBase"] > 0]["GrossMinusBase"].min()
from pandas.tools.plotting import scatter_matrix
scatter_matrix(newdata, alpha=.2, figsize=(8,8), diagonal='kde')
scatter_matrix(df, alpha=.2, figsize=(8,8), diagonal='kde')
newdata["GrossMinusBase"].hist(alpha=.4,bins='auto')
plt.hist(newdata["GrossMinusBase"],bins='auto')
g=plt.figure()
g = sb.distplot(newdata["GrossMinusBase"],kde=False)
g.set_xlim([-80000,50000])
sm.qqplot(newdata["GrossMinusBase"], line='q')
# --- # --- # --- # --- #
# - # - # - # - # - #
# Most Pay Based On Work Title - - - - - - - - -
x = df[(df["Pay Basis"] == "per Annum") & (df["Regular Gross Paid"] > 10000) & (df["Base Salary"] > 10000)]
# Who Made the least ? plot
x.groupby(["Fiscal Year","Title Description"])["Regular Gross Paid"].mean().sort_values(ascending=True)[:15].plot(kind='barh',color='lightslategray')
plt.xticks(fontsize=11, weight= 'bold', color='black')
plt.yticks(fontsize=9.0, weight='bold',color='black')
plt.ylabel('')
plt.xlabel('$',fontsize=9)
plt.text(x=-3000, y=16, s="Lowest Paid Employees", fontsize=28, weight='bold',alpha=.93)
plt.text(x=-3000, y=15.5, s="Employees On Annual Pay above $10,000",weight='bold',fontsize=10,alpha=.45)
# Who Made the most ? plot
x.groupby(["Fiscal Year","Title Description"])["Regular Gross Paid"].mean().sort_values(ascending=False)[:15].plot(kind='barh',color='cadetblue')
plt.xticks(fontsize=11, weight= 'bold',rotation=0, color='black')
plt.yticks(fontsize=9.0, weight='bold',color='black')
plt.ylabel('')
plt.xlabel('$',fontsize=9)
plt.text(x=-180000, y=16, s="Highest Paid Employees", fontsize=28, weight='bold',alpha=.93)
plt.text(x=-180000, y=15.1, s="Employees On Annual Pay above $10,000",weight='bold',fontsize=10,alpha=.45)
plt.axvline(x=(stats.trim_mean(x['Regular Gross Paid'].values, 0.1)), color='black', linewidth=1.3, alpha=.75)
plt.annotate('Trimmed Mean',fontsize=9,xy=(65000,7.5), xytext=(68000,7.4), arrowprops=dict(arrowstyle='<-',facecolor='black',connectionstyle="arc3"))
df.info() # Success
# Now change Start Date variable to real datetime object in Pandas.
# df["Agency Start Date"] = pd.to_datetime(df["Agency Start Date"])
## 2014 AGENCY EMPLOYEES
x1 = df[df["Fiscal Year"] == 2014]["Agency Name"].value_counts().sort_values(ascending=False)[0:10].values
y1 = df[df["Fiscal Year"] == 2014]["Agency Name"].value_counts().sort_values(ascending=False)[0:10].index
plt.rcParams['figure.figsize']= 9,7
ax1 = sb.barplot(x=x1, y=y1, palette='BuGn_d', saturation=1)
ax1.figure.gca().xaxis.set_major_locator(MaxNLocator(integer=True))
sb.set_context("talk", font_scale=1.0)
sb.despine()
fontz = {'fontsize':22, 'fontweight': 'bold'}
# ax1.set(xlabel='', ylabel='')
## TOP AGENCIES WITH MOST EMPLOYEES in 2014
ax1.set_title('Top 8 Agencies With Most Employees in 2014',loc='left',fontdict=fontz, alpha=.90)
plt.text(x = 50000,y = 10.88, s="# of Employees")
plt.text(x=-45000, y=10.87, s='Justin Nunez Source: Kaggle', fontsize=11, color='#f0f0f0',backgroundcolor='grey')
########## BEGINNING OF NESTED PLOTS
# Could use a for loop or create a function to make this data... as well as for plotting
### Data 2014 plot
x1 = df[df["Fiscal Year"] == 2014]["Agency Name"].value_counts().sort_values(ascending=False)[0:8].values
y1 = df[df["Fiscal Year"] == 2014]["Agency Name"].value_counts().sort_values(ascending=False)[0:8].index
### Data 2015 plot
x2 = df[df["Fiscal Year"] == 2015]["Agency Name"].value_counts().sort_values(ascending=False)[0:8].values
y2 = df[df["Fiscal Year"] == 2015]["Agency Name"].value_counts().sort_values(ascending=False)[0:8].index
### Data 2016 Plot
x3 = df[df["Fiscal Year"] == 2016]["Agency Name"].value_counts().sort_values(ascending=False)[0:8].values
y3 = df[df["Fiscal Year"] == 2016]["Agency Name"].value_counts().sort_values(ascending=False)[0:8].index
### Data 2017 plot
x4 = df[df["Fiscal Year"] == 2017]["Agency Name"].value_counts().sort_values(ascending=False)[0:8].values
y4 = df[df["Fiscal Year"] == 2017]["Agency Name"].value_counts().sort_values(ascending=False)[0:8].index
# Plot
fig = plt.figure(figsize=(12,9))
ax0 = fig.add_subplot(221)
ax2 = fig.add_subplot(222)
ax3 = fig.add_subplot(223)
ax4 = fig.add_subplot(224)
# titles
fig.suptitle("Top 8 Agencies with most employees in...",fontweight='bold', fontsize=22)
ax0.title.set_text('2014')
ax2.title.set_text('2015')
ax3.title.set_text('2016')
ax4.title.set_text('2017')
sb.barplot(x=x1, y=y1, palette='BuGn_d', saturation=1, ax=ax0)
sb.barplot(x=x2, y=y2, palette='Blues_d', saturation=1, ax=ax2)
sb.barplot(x=x3, y=y3, palette='GnBu_d', saturation=1, ax=ax3)
sb.barplot(x=x4, y=y4, palette='RdBu_d', saturation=1, ax=ax4)
#plt.tight_layout(w_pad=1.1)
plt.subplots_adjust(left=.05, right=.95,hspace=.5, wspace=1.4,top=.85)
plt.text(x=80000, y=10.87, s='Justin Nunez Source: Kaggle', fontsize=11, color='#f0f0f0',backgroundcolor='grey')
plt.show()
################################################################## END OF PLOT ABOVE
# too many values that are negative or neglible
df[df["Regular Gross Paid"] < 20]
# We need to remove these
# New DataFrame With only values greater then 20,000 on per Annum Basis.
x = df[(df["Pay Basis"] == "per Annum") & (df["Regular Gross Paid"] > 20000) & (df["Base Salary"] > 20000)]
# Regular Gross Pay Plot
x.groupby(x["Fiscal Year"])['Regular Gross Paid'].median().plot(linewidth=4,c='g')
x.groupby(x["Fiscal Year"])['Regular Gross Paid'].mean().plot(linewidth=4, c='b')
plt.axhline(y=50000, color='black', linewidth=1.3, alpha=.75)
plt.xlim(left=2014, right=2016)
plt.text(x=2013.9, y=74000, s="Regular Gross Pay", fontsize=28, weight='bold',alpha=.93)
plt.text(x=2013.9, y=72600, s="Employees On Annual Pay above $20,000",fontsize=12,alpha=.75)
plt.text(x=2013.9, y=42000, s='Justin Nunez Source: Kaggle', fontsize=11, color='#f0f0f0',backgroundcolor='grey')
plt.text(x=2016.2, y=65400, s="Median", color='g', weight='bold', rotation=7)
plt.text(x=2015.5, y=67500, s="Mean", color='b', weight='bold', rotation=7)
plt.xticks(range(2014,2018,1), fontsize=14, weight= 'bold')
# Base Salary Plot
x.groupby(x["Fiscal Year"])['Base Salary'].median().plot(linewidth=4,c='g')
x.groupby(x["Fiscal Year"])['Base Salary'].mean().plot(linewidth=4,c='b')
plt.axhline(y=50000, color='black', linewidth=1.3, alpha=.75)
plt.xlim(left=2014, right=2016)
plt.text(x=2013.9, y=75000, s="Base Salary", fontsize=26, weight='bold',alpha=.90)
plt.text(x=2013.9, y=73800, s="Employees On Annual Pay above $20,000",fontsize=12,alpha=.75)
plt.text(x=2013.9, y=42000, s='Justin Nunez Source: Kaggle', fontsize=11, color='#f0f0f0',backgroundcolor='grey')
plt.text(x=2014, y=70600, s="Median", color='g', weight='bold', rotation=7)
plt.text(x=2014, y=67600, s="Mean", color='b', weight='bold', rotation=7)
plt.xticks(range(2014,2018,1), fontsize=14, weight= 'bold')
# Top 4 Boroughs with Highest Average Gross Pay 2014 Plot !
b = x[x["Fiscal Year"]==2014]
b.groupby(["Work Location Borough"])["Regular Gross Paid"].mean().sort_values(ascending=False)[:8].plot(kind='bar',color='darkcyan')
plt.xticks(fontsize=14, weight= 'bold',rotation=0, color='black')
plt.text(x=-.8, y=84000, s="Top 4 Boroughs with Highest Average Gross Pay in 2014", fontsize=26, weight='bold',alpha=.90)
plt.text(x=-.8, y=79600, s="Employees On Annual Pay above $20,000",fontsize=12,alpha=.75)
plt.xlabel('')
plt.axhline(y=b["Regular Gross Paid"].mean(), color='black', linewidth=1.3, alpha=.75)
plt.text(x = 2.5, y = (b["Regular Gross Paid"].mean() + 240), s="Average Gross Pay")
# Top 8 Boroughs with Highest Average Gross Pay 2015 Plot !
l = x[x["Fiscal Year"]==2015]
l.groupby(["Work Location Borough"])["Regular Gross Paid"].mean().sort_values(ascending=False)[:8].plot(kind='bar',color='darkcyan')
plt.xticks(fontsize=7.4, weight= 'bold',rotation=0, color='black')
plt.text(x=-.8, y=95000, s="Top 8 Boroughs with Highest Average Gross Pay in 2015", fontsize=26, weight='bold',alpha=.90)
plt.text(x=-.8, y=89900, s="Employees On Annual Pay above $20,000",fontsize=12,alpha=.75)
plt.xlabel('')
plt.axhline(y=l["Regular Gross Paid"].mean(), color='black', linewidth=1.3, alpha=.75)
plt.text(x = 2.5, y = (l["Regular Gross Paid"].mean() + 240), s="Average Gross Pay", fontsize=8.6)
# # Top 8 Boroughs with Highest Average Gross Pay 2016 Plot !
lg= x[x["Fiscal Year"]==2016]
lg.groupby(["Work Location Borough"])["Regular Gross Paid"].mean().sort_values(ascending=False)[:8].plot(kind='bar',color='darkcyan')
plt.xticks(fontsize=7.4, weight= 'bold',rotation=0, color='black')
plt.text(x=-.8, y=109000, s="Top 8 Boroughs with Highest Average Gross Pay in 2016", fontsize=26, weight='bold',alpha=.90)
plt.text(x=-.8, y=103000, s="Employees On Annual Pay above $20,000",fontsize=12,alpha=.75)
plt.xlabel('')
plt.axhline(y=lg["Regular Gross Paid"].mean(), color='black', linewidth=1.3, alpha=.75)
plt.text(x = 2.5, y = (lg["Regular Gross Paid"].mean() + 240), s="Average Gross Pay", fontsize=8.6)
# # Top 8 Boroughs with Highest Average Gross Pay 2017 Plot !
mg= x[x["Fiscal Year"]==2017]
mg.groupby(["Work Location Borough"])["Regular Gross Paid"].mean().sort_values(ascending=False)[:8].plot(kind='bar',color='darkcyan')
plt.xticks(fontsize=7.4, weight= 'bold',rotation=0, color='black')
plt.text(x=-.8, y=117000, s="Top 8 Boroughs with Highest Average Gross Pay in 2017", fontsize=26, weight='bold',alpha=.90)
plt.text(x=-.8, y=110000, s="Employees On Annual Pay above $20,000",fontsize=12,alpha=.75)
plt.xlabel('')
plt.axhline(y=mg["Regular Gross Paid"].mean(), color='black', linewidth=1.3, alpha=.75)
plt.text(x = 2.3, y = (mg["Regular Gross Paid"].mean() + 240), s="Average Gross Pay", fontsize=9.5)
### --- Distributions ---- ###
new = df[(df["Pay Basis"] == "per Annum") & (df["Regular Gross Paid"] > 10000) & (df["Base Salary"] > 10000)]
cc = new[new["Fiscal Year"]==2014]
dd = new[new["Fiscal Year"]==2015]
ee = new[new["Fiscal Year"]==2016]
ff = new[new["Fiscal Year"]==2017]
# Distribution - GROSS PAY GRAPH 2017
sb.set_context("paper")
sb.distplot(ff["Regular Gross Paid"],fit=norm,kde=False)
#sb.set(xticks=np.arange(0,250000,50000))
plt.xlim(left=0, right=200000)
plt.title('Distribution of Gross Pay 2017')
plt.text(x=100000, y=.000013, s="Sub-set of employees with salary above 10000", fontsize=8,alpha=.65)
# Distribution - BASE SALARY GRAPH
sb.set_context("paper")
sb.distplot(ff["Base Salary"],fit=norm,kde=False)
#sb.set(xticks=np.arange(0,250000,50000))
plt.xlim(left=0, right=200000)
plt.title('Distribution of Base Salary 2017')
plt.text(x=100000, y=.000017, s="Sub-set of employees with salary above 10000", fontsize=8,alpha=.65)
# Distribution of Base Salary for Annual Employees making over 10,000
fig1 = plt.figure(figsize=(12,9))
ax6 = fig1.add_subplot(221)
ax7 = fig1.add_subplot(222)
ax8 = fig1.add_subplot(223)
ax9 = fig1.add_subplot(224)
# titles
fig1.suptitle("Base Salary Distribution (Salary > $10,000) in...",fontweight='bold', fontsize=22)
ax6.title.set_text('2014')
ax7.title.set_text('2015')
ax8.title.set_text('2016')
ax9.title.set_text('2017')
sb.distplot(cc["Base Salary"], ax=ax6)
sb.distplot(dd["Base Salary"], ax=ax7)
sb.distplot(ee["Base Salary"], ax=ax8)
sb.distplot(ff["Base Salary"], kde=False, ax=ax9)
plt.subplots_adjust(left=.05, right=.95,hspace=.5, wspace=.2,top=.85)
plt.text(x=2000, y=-12000, s='Justin Nunez Source: Kaggle', fontsize=9, color='#f0f0f0',backgroundcolor='grey')
#plt.show()
# ========================================================
# Distributions with Normal Distribution Fit Lines!!!
# Distribution of Base Salary for Annual Employees making over 10,000
fig1 = plt.figure(figsize=(12,9))
ax6 = fig1.add_subplot(221)
ax7 = fig1.add_subplot(222)
ax8 = fig1.add_subplot(223)
ax9 = fig1.add_subplot(224)
# titles
fig1.suptitle("Base Salary Distribution (Salary > $10,000) in...",fontweight='bold', fontsize=22)
ax6.title.set_text('2014')
ax7.title.set_text('2015')
ax8.title.set_text('2016')
ax9.title.set_text('2017')
sb.distplot(cc["Base Salary"],fit=norm,kde=False, ax=ax6)
ax6.set_xlim(0,185000)
ax7.set_xlim(0,185000)
ax8.set_xlim(0,185000)
ax9.set_xlim(0,185000)
sb.distplot(dd["Base Salary"],fit=norm,kde=False, ax=ax7)
plt.xlim(left=0, right=200000)
sb.distplot(ee["Base Salary"],fit=norm,kde=False, ax=ax8)
plt.xlim(left=0, right=200000)
sb.distplot(ff["Base Salary"], kde=False, ax=ax9)
plt.subplots_adjust(left=.05, right=.95,hspace=.5, wspace=.35,top=.85)
#plt.xlim(left=0, right=175000)
plt.text(x=-5000, y=-15000, s='Justin Nunez Source: Kaggle', fontsize=9, color='#f0f0f0',backgroundcolor='grey')
# WONT RUN in the 4th distplot, so thats why its not on the 2017 graph.
# ==================================================================================
## EXTRA --- ** GRAPHS NOT SHOWN IN README ** ---
# distribution of Gross Pay in 2014. (Graph Not Shown in README) (Did not add it)
sb.kdeplot(cc["Regular Gross Paid"], shade=True)
new.describe() # Shows that 541,544.90 is Max Salary.
# Histogram of 2017 Gross Pay
ax5 = plt.subplot(111)
ax5.spines['top'].set_visible(False)
ax5.spines['right'].set_visible(False)
ax5.get_xaxis().tick_bottom()
ax5.get_yaxis().tick_left()
plt.xticks(fontsize=14)
plt.text(x=1000, y= 21000, s="Distribution of Gross Pay in 2017", fontsize=27, fontweight='bold')
plt.hist(list(ff["Regular Gross Paid"].values), color="#3F5D7D", bins=100)
## QQ PLOTS
x = df[(df["Pay Basis"] == "per Annum") & (df["Regular Gross Paid"] > 20000) & (df["Base Salary"] > 20000)]
cc = x[x["Fiscal Year"]==2014]
dd = x[x["Fiscal Year"]==2015]
ee = x[x["Fiscal Year"]==2016]
ff = x[x["Fiscal Year"]==2017]
sm.qqplot(cc["Regular Gross Paid"], line='q')
sm.qqplot(dd["Regular Gross Paid"], line='q')
sm.qqplot(ee["Regular Gross Paid"], line='q')
# Added flare to the plot aesthetics
fig = sm.qqplot(ff["Regular Gross Paid"], line='q');
# Grab the lines with blue dots
dots = fig.findobj(lambda x: hasattr(x, 'get_color') and x.get_color() == 'b')
[d.set_alpha(0.3) for d in dots]
# - - - - - - - #
### - ** - CODE FOR UPDATE 1.0. MARCH 10, 2018 - ** - ###
## """ TEACHER ANALYSIS """ ##
# CLEAN VARIABLE FIRST !
df["Title Description"] = df["Title Description"].astype(str)
# Testing how to deal with this variable.... none of these needed.
#df["Title Description"] = df["Title Description"].astype('str')
#df["Title Description"] = df["Title Description"].str.strip('')
#df["Title Description"] = df["Title Description"].astype('|S80')
#df["Title Description"] = df["Title Description"].astype('|S80')
#Get Rid of Non-Alpha Characters.
df["Title Description"]= df["Title Description"].str.replace('[^A-Za-z\s]+', '')
df["Title Description"] = df["Title Description"].apply(str)
df["Title Description"] = df["Title Description"].str.lstrip()
df["Title Description"] = df["Title Description"].str.lower()
'teacher' in df["Title Description"] # Doesn't seem to work.
df["Agency Start Date"] = df["Agency Start Date"].str.lstrip()
df["job_title"] = df["Title Description"].str.extract('([a-zA-Z ]+)', expand=False).str.strip().str.lstrip()
df["job_title"].str.contains('teacher').any() # TRUE
is_teacher = df.job_title.str.contains("teacher")
# SUBSET OF TEACHERS PAID ANNUALLY!
teacher = df[(df.job_title == 'teacher') & (df["Pay Basis"] == "per Annum")]
# Data of teachers in 2017.
teacher4 = teacher[teacher["Fiscal Year"] == 2017]
#CLEAN OTHER VARIABLE
teacher4["Agency Start Date"] = teacher4["Agency Start Date"].str.lstrip()
#Extract the year from Start Date
teacher4["Start_Year"] = teacher4["Agency Start Date"].str[6:10].sort_values(ascending=True)
# Cast to integer
teacher4["Start_Year"] = teacher4["Start_Year"].astype(int)
# Get rid of year 1999 and only look at employees with starting dates of 2000 to 2017!
above2000 = teacher4[(teacher4["Start_Year"] >= 2000) & (teacher4["Start_Year"]< 9000)]
# PLOTS ---
from pylab import rcParams
rcParams['figure.figsize'] = 20,15
# SEABORN BOXPLOT PAY DISTRIBUTIONS FOR TEACHERS!
ax=plt.axes()
sb.boxplot(x=above2000["Start_Year"],
y=above2000["Base Salary"], data=above2000, palette='hls',ax=ax)
plt.yticks(fontsize=11.0, weight='bold',color='black')
plt.ylabel('$ ', fontsize=23, rotation=0, weight='bold')
plt.xlabel('\n Year Started Working',fontsize=18, weight='bold')
plt.yticks(range(35000,140000,10000), fontsize=13, weight= 'bold')
plt.xticks(fontsize=16, weight= 'bold')
ax.set_title("Boxplot Distribution of Base Salary (USD) for Year Started Working", weight='bold')
# SEABORN BOXPLOT TEACHER! GROSS
ax1=plt.axes()
sb.boxplot(x=above2000["Start_Year"],
y=above2000["Regular Gross Paid"], data=above2000, palette='hls',ax=ax1)
plt.yticks(fontsize=11.0, weight='bold',color='black')
plt.ylabel('$ ', fontsize=23, rotation=0, weight='bold')
plt.xlabel('\n Year Started Working',fontsize=18, weight='bold')
plt.yticks(range(5000,200000,25000), fontsize=13, weight= 'bold')
plt.xticks(fontsize=16, weight= 'bold')
ax1.set_title("Boxplot Distribution of Gross Salary (USD) for Year Started Working", weight='bold')
## GROSS PAY Mean/Median PAY FOR TEACHERS YEAR 2000 - 2017
above2000.groupby(["Start_Year"])["Regular Gross Paid"].mean().plot(linewidth=2.25)
above2000.groupby(["Start_Year"])["Regular Gross Paid"].median().plot(linewidth=2.25)
plt.text(x=1999, y=98500, s="Average Gross Pay For Teachers in 2017", fontsize=28, weight='bold',alpha=.93)
plt.text(x=1999, y=94400, s="Based On Year They Started Working", fontsize=14, weight='bold',alpha=.78)
plt.text(x=2005, y=84000, s="Median", fontsize=14, weight='bold',alpha=.93, color='red')
plt.text(x=2005, y=69700, s="Mean", fontsize=14, weight='bold',alpha=.93, color='steelblue')
plt.axhline(y=20000, color='black', linewidth=1.3, alpha=.75)
plt.ylabel('$ ', fontsize=15, rotation=0, weight='bold')
plt.xlabel('Year Started Working',fontsize=13, weight='bold')
## BASE PAY Mean/Median FOR TEACHERS YEAR 2000 - 2017
above2000.groupby(["Start_Year"])["Base Salary"].mean().plot(linewidth=2.25)
above2000.groupby(["Start_Year"])["Base Salary"].median().plot(linewidth=2.25)
plt.text(x=1999, y=101100, s="Average Base Salary For Teachers in 2017", fontsize=28, weight='bold',alpha=.93)
plt.text(x=1999, y=97000, s="Based On Year They Started Working", fontsize=14, weight='bold',alpha=.78)
plt.text(x=2005, y=91000, s="Median", fontsize=14, weight='bold',alpha=.93, color='red')
plt.text(x=2005, y=81700, s="Mean", fontsize=14, weight='bold',alpha=.93, color='steelblue')
plt.axhline(y=20000, color='black', linewidth=1.3, alpha=.75)
plt.ylabel('$ ', fontsize=15, rotation=0, weight='bold')
plt.xlabel('Year Started Working',fontsize=13, weight='bold')
# Number of Workers Added Per Year PLOT
a1 = above2000["Start_Year"].value_counts().sort_values(ascending=False).index
a2 = above2000["Start_Year"].value_counts().sort_values(ascending=False).value
ax=plt.axes()
sb.barplot(x=a1, y=a2, palette='BuGn_d', saturation=1,ax=ax)
ax.set_title("Number Of Teachers Added Per Year", fontsize=24, weight='bold',alpha=.93)
plt.show()
# 2017 Teacher pay distribution (1 year of experience)
sb.distplot(above2000[above2000["Start_Year"] == 2017]["Regular Gross Paid"],fit=norm, kde=False)
## 17 year experience Distribution plot
sb.distplot(above2000[above2000["Start_Year"] == 2000]["Regular Gross Paid"],fit=norm,kde=False)
## Relationships between variables plots!
# Does TOTAL OTHER PAY relate to Salary in 2016?? Let's check.
b = above2000[above2000["Start_Year"] == 2016]
sb.jointplot(x="Regular Gross Paid",y="Total Other Pay",data=b)
sb.jointplot(x="Base Salary",y="Total Other Pay",data=b)
# 2016 - Base and Gross
sb.jointplot(x="Base Salary",y="Regular Gross Paid",data=b)
# HOW ABOUT 2017
rcParams['figure.figsize'] = 12,9
cc = above2000[above2000["Start_Year"] == 2017]
sb.distplot(cc["Base Salary"],fit=norm,kde=False) # BASE PAY HISTOGRAM
sb.jointplot(x="Regular Gross Paid",y="Total Other Pay",data=cc)
sb.jointplot(x="Base Salary",y="Total Other Pay",data=cc)
sb.jointplot(x="Base Salary",y="Regular Gross Paid",data=cc)
# 1995 (22 Years of experience) Pay Distribution Plot.
nn = teacher4[(teacher4["Start_Year"] >= 1994) & (teacher4["Start_Year"]< 9000)]
nn.head()
year1995 = nn[nn["Start_Year"] == 1995]
year1995.head()
sb.distplot(year1995["Regular Gross Paid"],fit=norm, kde=False)
year1995["Regular Gross Paid"].mean()
""" END OF TEACHER ANALYSIS """