99.2% Accuracy on Employee Resignations

Classifying whether or not employees are likely to resign

First import all the libraries we are going to use

In [1]:
import pandas as pd
import numpy as np

from sklearn import  neighbors, svm
from sklearn.linear_model import LogisticRegressionCV
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier,\
    AdaBoostClassifier
from sklearn.naive_bayes import MultinomialNB 

from sklearn import preprocessing
from sklearn.metrics import accuracy_score, mean_squared_error, r2_score,\
    mean_absolute_error, confusion_matrix, classification_report
from sklearn.model_selection import cross_val_score, train_test_split,\
    learning_curve
from sklearn.feature_selection import RFE

%matplotlib inline
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

import itertools
import pickle

# Directory the input CSV is read from (joined with the file name at load time).
DIR_DATA = "data"
# NOTE(review): not referenced in the visible cells -- presumably the output
# directory for processed artifacts; confirm against later cells.
DIR_PROCESSED = "processed"
# NOTE(review): presumably the number of cross-validation folds; confirm usage.
CV_FOLDS = 5

HR_comma_sep.csv

All our data is stored in this file. Here is a slightly altered description of the features provided with the competition.

Features

  • Employee satisfaction level
  • Last evaluation
  • Number of projects
  • Average monthly hours
  • Time spent at the company
  • Whether they have had a work accident
  • Whether they have had a promotion in the last 5 years
  • Department
  • Salary

Target

  • Whether the employee has left

Now let's load and look at the actual data

In [2]:
# Build the path to the HR dataset inside the data directory, load it,
# and preview the first rows.
hr_csv_path = DIR_DATA + '/HR_comma_sep.csv'
hr = pd.read_csv(hr_csv_path)
hr.head()
Out[2]:
satisfaction_level last_evaluation number_project average_montly_hours time_spend_company Work_accident left promotion_last_5years sales salary
0 0.38 0.53 2 157 3 0 1 0 sales low
1 0.80 0.86 5 262 6 0 1 0 sales medium
2 0.11 0.88 7 272 4 0 1 0 sales medium
3 0.72 0.87 5 223 5 0 1 0 sales low
4 0.37 0.52 2 159 3 0 1 0 sales low
In [3]:
# Summary statistics (count, mean, std, min, quartiles, max) for hr's numeric columns.
hr.describe()
Out[3]:
satisfaction_level last_evaluation number_project average_montly_hours time_spend_company Work_accident left promotion_last_5years
count 14999.000000 14999.000000 14999.000000 14999.000000 14999.000000 14999.000000 14999.000000 14999.000000
mean 0.612834 0.716102 3.803054 201.050337 3.498233 0.144610 0.238083 0.021268
std 0.248631 0.171169 1.232592 49.943099 1.460136 0.351719 0.425924 0.144281
min 0.090000 0.360000 2.000000 96.000000 2.000000 0.000000 0.000000 0.000000
25% 0.440000 0.560000 3.000000 156.000000 3.000000 0.000000 0.000000 0.000000
50% 0.640000 0.720000 4.000000 200.000000 3.000000 0.000000 0.000000 0.000000
75% 0.820000 0.870000 5.000000 245.000000 4.000000 0.000000 0.000000 0.000000
max 1.000000 1.000000 7.000000 310.000000 10.000000 1.000000 1.000000 1.000000
In [4]:
# `left` is a 0/1 flag, so its sum is the number of employees who left.
n_left = np.sum(hr.left)
percent_left = n_left / len(hr.left) * 100
print('Percent who left: {:.2f}'.format(percent_left))
Percent who left: 23.81

First, let's encode the department (stored in the `sales` column) and the salary (high/medium/low) as integer category codes so that scikit-learn's algorithms can handle them

In [5]:
# Replace the two text columns with integer category codes.
# NOTE(review): codes follow the sorted category labels, so salary does not
# come out in low < medium < high order (the preview shows low -> 1,
# medium -> 2). Harmless for tree models; confirm it is acceptable for the
# distance-based and linear models imported above.
hr["sales"] = hr["sales"].astype('category').cat.codes
hr["salary"] = hr["salary"].astype('category').cat.codes

# Preview only the two re-encoded columns.
hr.loc[:, ["sales", "salary"]].head()
Out[5]:
sales salary
0 7 1
1 7 2
2 7 2
3 7 1
4 7 1

Missed Medical Appointments Exploration (Under Construction)

In [3]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn import preprocessing, neighbors, svm
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, VotingClassifier, GradientBoostingClassifier, ExtraTreesClassifier, \
  RandomForestRegressor, AdaBoostRegressor, RandomForestRegressor, GradientBoostingRegressor, AdaBoostClassifier
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score, mean_squared_error, mean_absolute_error, r2_score,  confusion_matrix, classification_report
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.linear_model import LogisticRegression, LinearRegression, LogisticRegressionCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.decomposition import PCA

%matplotlib inline
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from math import sqrt

import itertools
from itertools import cycle

import pickle
# Directory the input CSV is read from (joined with the file name at load time).
DIR_DATA = "data"
# NOTE(review): not referenced in the visible cells -- presumably the output
# directory for processed artifacts; confirm against later cells.
DIR_PROCESSED = "processed"
# Name of the outcome column in the appointments data ('Show-Up' vs. otherwise).
LABEL = "Status"
# NOTE(review): presumably the columns to exclude from the feature matrix;
# currently just the label. Confirm against the modelling cells.
NON_PREDICTORS = [LABEL]
# NOTE(review): presumably the number of cross-validation folds; confirm usage.
CV_FOLDS = 5
In [4]:
# No-show-Issue-Comma-300k.csv
In [46]:
# Build the path to the appointments dataset inside the data directory,
# load it, and preview the first rows.
ma_csv_path = DIR_DATA + '/No-show-Issue-Comma-300k.csv'
ma = pd.read_csv(ma_csv_path)
ma.head()
Out[46]:
Age Gender AppointmentRegistration ApointmentData DayOfTheWeek Status Diabetes Alcoolism HiperTension Handcap Smokes Scholarship Tuberculosis Sms_Reminder AwaitingTime
0 19 M 2014-12-16T14:46:25Z 2015-01-14T00:00:00Z Wednesday Show-Up 0 0 0 0 0 0 0 0 -29
1 24 F 2015-08-18T07:01:26Z 2015-08-19T00:00:00Z Wednesday Show-Up 0 0 0 0 0 0 0 0 -1
2 4 F 2014-02-17T12:53:46Z 2014-02-18T00:00:00Z Tuesday Show-Up 0 0 0 0 0 0 0 0 -1
3 5 M 2014-07-23T17:02:11Z 2014-08-07T00:00:00Z Thursday Show-Up 0 0 0 0 0 0 0 1 -15
4 38 M 2015-10-21T15:20:09Z 2015-10-27T00:00:00Z Tuesday Show-Up 0 0 0 0 0 0 0 1 -6
In [47]:
# Percentage of appointments whose Status is anything other than 'Show-Up'.
# NOTE(review): the printed label says "left", which looks carried over from
# the employee-resignation notebook; this figure is the no-show rate. The
# text is kept as-is so it matches the recorded output -- rename on next run.
show_up_fraction = np.sum(ma.Status == 'Show-Up') / len(ma.Status)
print('Percent who left: {:.2f}%'.format((1 - show_up_fraction) * 100))
Percent who left: 30.24%
In [48]:
def getTime(appointment):
    """Return the time of day, in minutes after midnight, of a timestamp string.

    The string is expected to look like ``'2014-12-16T14:46:25Z'``: any 'Z'
    characters are stripped from both ends, the remainder is split on ':',
    and exactly three pieces are required (otherwise unpacking raises
    ValueError). The hour is read from the last two characters of the first
    piece and the minute from the second piece; seconds are ignored.
    """
    hour_part, minute_part, _ = appointment.strip('Z').split(':')
    hours = int(hour_part[-2:])
    minutes = int(minute_part)
    return hours * 60 + minutes

# Derive the time of day (minutes after midnight) at which each appointment
# was registered; the column name keeps its original spelling.
ma["AppointmenTime"] = ma.AppointmentRegistration.apply(getTime)
In [49]:
# Summary statistics for the numeric columns before any encoding or scaling.
# NOTE(review): the recorded output shows Age with a minimum of -2 and
# AwaitingTime entirely negative (max -1) -- verify these are expected.
ma.describe()
Out[49]:
Age Diabetes Alcoolism HiperTension Handcap Smokes Scholarship Tuberculosis Sms_Reminder AwaitingTime AppointmenTime
count 300000.000000 300000.000000 300000.000000 300000.000000 300000.000000 300000.000000 300000.000000 300000.000000 300000.000000 300000.000000 300000.000000
mean 37.808017 0.077967 0.025010 0.215890 0.020523 0.052370 0.096897 0.000450 0.574173 -13.841813 705.811503
std 22.809014 0.268120 0.156156 0.411439 0.155934 0.222772 0.295818 0.021208 0.499826 15.687697 193.880593
min -2.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 -398.000000 354.000000
25% 19.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 -20.000000 537.000000
50% 38.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 1.000000 -8.000000 667.000000
75% 56.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 1.000000 -4.000000 872.000000
max 113.000000 1.000000 1.000000 1.000000 4.000000 1.000000 1.000000 1.000000 2.000000 -1.000000 1362.000000
In [50]:
def _encode_as_category_codes(frame, columns):
    """Replace each listed column of `frame` with its integer category codes."""
    for name in columns:
        frame[name] = frame[name].astype('category').cat.codes


def _min_max_scale_column(frame, column):
    """Cast `column` to float64, min-max scale it in place, and return the fitted scaler."""
    scaler = preprocessing.MinMaxScaler()
    as_float = frame[column].astype(np.float64)
    frame[column] = scaler.fit_transform(as_float.values.reshape(-1, 1))
    return scaler


# Label-encode the text columns as integer category codes.
_encode_as_category_codes(ma, ["Status", "Gender", "DayOfTheWeek"])

# Rescale the continuous features with MinMaxScaler (default feature range,
# i.e. [0, 1]). Each fitted scaler is kept under its own name so it remains
# available to later cells.
wait_time_scaler = _min_max_scale_column(ma, "AwaitingTime")
age_scaler = _min_max_scale_column(ma, "Age")
time_scaler = _min_max_scale_column(ma, "AppointmenTime")
In [51]:
# Preview the first rows after encoding and scaling.
ma.head()
Out[51]:
Age Gender AppointmentRegistration ApointmentData DayOfTheWeek Status Diabetes Alcoolism HiperTension Handcap Smokes Scholarship Tuberculosis Sms_Reminder AwaitingTime AppointmenTime
0 0.182609 1 2014-12-16T14:46:25Z 2015-01-14T00:00:00Z 6 1 0 0 0 0 0 0 0 0 0.929471 0.527778
1 0.226087 0 2015-08-18T07:01:26Z 2015-08-19T00:00:00Z 6 1 0 0 0 0 0 0 0 0 1.000000 0.066468
2 0.052174 0 2014-02-17T12:53:46Z 2014-02-18T00:00:00Z 5 1 0 0 0 0 0 0 0 0 1.000000 0.415675
3 0.060870 1 2014-07-23T17:02:11Z 2014-08-07T00:00:00Z 4 1 0 0 0 0 0 0 0 1 0.964736 0.662698
4 0.347826 1 2015-10-21T15:20:09Z 2015-10-27T00:00:00Z 5 1 0 0 0 0 0 0 0 1 0.987406 0.561508
In [52]:
# Summary statistics after encoding and scaling; Age, AwaitingTime and
# AppointmenTime should now span 0 to 1.
ma.describe()
Out[52]:
Age Gender DayOfTheWeek Status Diabetes Alcoolism HiperTension Handcap Smokes Scholarship Tuberculosis Sms_Reminder AwaitingTime AppointmenTime
count 300000.000000 300000.000000 300000.000000 300000.000000 300000.000000 300000.000000 300000.000000 300000.000000 300000.000000 300000.000000 300000.000000 300000.000000 300000.000000 300000.000000
mean 0.346157 0.331650 3.326650 0.697563 0.077967 0.025010 0.215890 0.020523 0.052370 0.096897 0.000450 0.574173 0.967653 0.349019
std 0.198339 0.470807 2.283518 0.459314 0.268120 0.156156 0.411439 0.155934 0.222772 0.295818 0.021208 0.499826 0.039516 0.192342
min 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
25% 0.182609 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.952141 0.181548
50% 0.347826 0.000000 4.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.982368 0.310516
75% 0.504348 1.000000 5.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.992443 0.513889
max 1.000000 1.000000 6.000000 1.000000 1.000000 1.000000 1.000000 4.000000 1.000000 1.000000 1.000000 2.000000 1.000000 1.000000