Classifying whether or not employees are likely to resign

First, import all the libraries we are going to use.

import pandas as pd
import numpy as np

from sklearn import  neighbors, svm
from sklearn.linear_model import LogisticRegressionCV
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier,\
    AdaBoostClassifier
from sklearn.naive_bayes import MultinomialNB 

from sklearn import preprocessing
from sklearn.metrics import accuracy_score, mean_squared_error, r2_score,\
    mean_absolute_error, confusion_matrix, classification_report
from sklearn.model_selection import cross_val_score, train_test_split,\
    learning_curve
from sklearn.feature_selection import RFE

%matplotlib inline
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

import itertools
import pickle

# Directory that holds the raw input CSV files (prefixed onto the file name when loading).
DIR_DATA = "data"
# Directory name for processed artifacts; not referenced in the visible cells.
DIR_PROCESSED = "processed"
# Presumably the number of cross-validation folds; not used in the visible cells — confirm.
CV_FOLDS = 5

HR_comma_sep.csv

All our data is stored in this file. Below is a slightly altered version of the feature description provided with the competition.

Features

Employee satisfaction level
Last evaluation
Number of projects
Average monthly hours
Time spent at the company
Whether they have had a work accident
Whether they have had a promotion in the last 5 years
Department
Salary

Target

Whether the employee has left

Now let's load and look at the actual data

# Read the HR dataset from the data directory and preview it.
hr_csv_path = DIR_DATA + '/HR_comma_sep.csv'
hr = pd.read_csv(hr_csv_path)
hr.head()

hr.describe()

# `left` is a 0/1 flag, so its sum divided by the row count is the
# fraction of employees who resigned.
share_left = np.sum(hr.left) / len(hr.left)
print('Percent who left: {:.2f}'.format(share_left * 100))

Percent who left: 23.81

First, let's convert the department column (`sales`) and the salary column (high/medium/low) into integer category codes so that scikit-learn's algorithms can handle them.

# Swap the two text columns for their integer category codes.
# NOTE(review): the codes follow pandas' default category order (sorted
# labels), so `salary` is not encoded as low < medium < high — confirm
# that this is acceptable for the models used later.
for text_column in ("sales", "salary"):
    hr[text_column] = hr[text_column].astype('category').cat.codes
hr[["sales", "salary"]].head()

import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn import preprocessing, neighbors, svm
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, VotingClassifier, GradientBoostingClassifier, ExtraTreesClassifier, \
  RandomForestRegressor, AdaBoostRegressor, RandomForestRegressor, GradientBoostingRegressor, AdaBoostClassifier
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score, mean_squared_error, mean_absolute_error, r2_score,  confusion_matrix, classification_report
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.linear_model import LogisticRegression, LinearRegression, LogisticRegressionCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.decomposition import PCA

%matplotlib inline
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from math import sqrt

import itertools
from itertools import cycle

import pickle
# Directory that holds the raw input CSV files (prefixed onto the file name when loading).
DIR_DATA = "data"
# Directory name for processed artifacts; not referenced in the visible cells.
DIR_PROCESSED = "processed"
# Name of the appointment-outcome column; presumably the prediction target — confirm.
LABEL = "Status"
# Presumably the columns to exclude from the model inputs (only the label for now) — confirm.
NON_PREDICTORS = [LABEL]
# Presumably the number of cross-validation folds; not used in the visible cells — confirm.
CV_FOLDS = 5

# No-show-Issue-Comma-300k.csv

# Load the medical-appointment records and preview them.
ma = pd.read_csv(DIR_DATA + '/No-show-Issue-Comma-300k.csv')
ma.head()

# Fraction of appointments whose Status is anything other than 'Show-Up'.
# The label used to read "Percent who left", a leftover from the HR
# notebook that misdescribed this number.
no_show_rate = (ma.Status != 'Show-Up').mean()
print('Percent who missed their appointment: {:.2f}%'.format(no_show_rate * 100))

Percent who left: 30.24%

def getTime(appointment):
    """Return the time of day of a timestamp as minutes after midnight.

    ``appointment`` is a string such as ``'2014-12-16T14:46:25Z'``. Any
    leading/trailing ``Z`` characters are stripped and the remainder must
    split into exactly three ``:``-separated parts; the hour is taken from
    the last two characters of the first part. Seconds are ignored.
    """
    date_and_hour, minute_text, _seconds = appointment.strip('Z').split(':')
    hours = int(date_and_hour[-2:])
    return hours * 60 + int(minute_text)

# Derive the registration time of day (minutes after midnight) from the
# registration timestamp. The column name, including its spelling, is
# kept because later cells refer to it.
registration_minutes = ma.AppointmentRegistration.apply(getTime)
ma["AppointmenTime"] = registration_minutes

ma.describe()

# Replace the text-valued columns with integer category codes
# (in the recorded preview, 'Show-Up' becomes 1 and 'M' becomes 1).
for text_column in ("Status", "Gender", "DayOfTheWeek"):
    ma[text_column] = ma[text_column].astype('category').cat.codes

def _rescale_to_unit_interval(frame, column):
    """Min-max scale ``frame[column]`` in place and return the fitted scaler.

    The column is cast to float64 first; the scaler is returned so the
    transformation can be reused or inverted later.
    """
    as_float = frame[column].astype(np.float64)
    scaler = preprocessing.MinMaxScaler()
    frame[column] = scaler.fit_transform(as_float.values.reshape(-1, 1))
    return scaler


# Each numeric feature gets its own min-max scaler (default range [0, 1]).
wait_time_scaler = _rescale_to_unit_interval(ma, "AwaitingTime")
age_scaler = _rescale_to_unit_interval(ma, "Age")
time_scaler = _rescale_to_unit_interval(ma, "AppointmenTime")

ma.head()

ma.describe()

	satisfaction_level	last_evaluation	number_project	average_montly_hours	time_spend_company	left	sales	salary
0	0.38	0.53	2	157	3	1	sales	low
1	0.80	0.86	5	262	6	1	sales	medium
2	0.11	0.88	7	272	4	1	sales	medium
3	0.72	0.87	5	223	5	1	sales	low
4	0.37	0.52	2	159	3	1	sales	low

	satisfaction_level	last_evaluation	number_project	average_montly_hours	time_spend_company	Work_accident	left	promotion_last_5years
count	14999.000000	14999.000000	14999.000000	14999.000000	14999.000000	14999.000000	14999.000000	14999.000000
mean	0.612834	0.716102	3.803054	201.050337	3.498233	0.144610	0.238083	0.021268
std	0.248631	0.171169	1.232592	49.943099	1.460136	0.351719	0.425924	0.144281
min	0.090000	0.360000	2.000000	96.000000	2.000000	0.000000	0.000000	0.000000
25%	0.440000	0.560000	3.000000	156.000000	3.000000	0.000000	0.000000	0.000000
50%	0.640000	0.720000	4.000000	200.000000	3.000000	0.000000	0.000000	0.000000
75%	0.820000	0.870000	5.000000	245.000000	4.000000	0.000000	0.000000	0.000000
max	1.000000	1.000000	7.000000	310.000000	10.000000	1.000000	1.000000	1.000000

	Age	Gender	AppointmentRegistration	ApointmentData	DayOfTheWeek	Status	Sms_Reminder	AwaitingTime
0	19	M	2014-12-16T14:46:25Z	2015-01-14T00:00:00Z	Wednesday	Show-Up	0	-29
1	24	F	2015-08-18T07:01:26Z	2015-08-19T00:00:00Z	Wednesday	Show-Up	0	-1
2	4	F	2014-02-17T12:53:46Z	2014-02-18T00:00:00Z	Tuesday	Show-Up	0	-1
3	5	M	2014-07-23T17:02:11Z	2014-08-07T00:00:00Z	Thursday	Show-Up	1	-15
4	38	M	2015-10-21T15:20:09Z	2015-10-27T00:00:00Z	Tuesday	Show-Up	1	-6

	Age	Diabetes	Alcoolism	HiperTension	Handcap	Smokes	Scholarship	Tuberculosis	Sms_Reminder	AwaitingTime	AppointmenTime
count	300000.000000	300000.000000	300000.000000	300000.000000	300000.000000	300000.000000	300000.000000	300000.000000	300000.000000	300000.000000	300000.000000
mean	37.808017	0.077967	0.025010	0.215890	0.020523	0.052370	0.096897	0.000450	0.574173	-13.841813	705.811503
std	22.809014	0.268120	0.156156	0.411439	0.155934	0.222772	0.295818	0.021208	0.499826	15.687697	193.880593
min	-2.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	-398.000000	354.000000
25%	19.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	-20.000000	537.000000
50%	38.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	1.000000	-8.000000	667.000000
75%	56.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	1.000000	-4.000000	872.000000
max	113.000000	1.000000	1.000000	1.000000	4.000000	1.000000	1.000000	1.000000	2.000000	-1.000000	1362.000000

	Age	Gender	AppointmentRegistration	ApointmentData	DayOfTheWeek	Status	Sms_Reminder	AwaitingTime	AppointmenTime
0	0.182609	1	2014-12-16T14:46:25Z	2015-01-14T00:00:00Z	6	1	0	0.929471	0.527778
1	0.226087	0	2015-08-18T07:01:26Z	2015-08-19T00:00:00Z	6	1	0	1.000000	0.066468
2	0.052174	0	2014-02-17T12:53:46Z	2014-02-18T00:00:00Z	5	1	0	1.000000	0.415675
3	0.060870	1	2014-07-23T17:02:11Z	2014-08-07T00:00:00Z	4	1	1	0.964736	0.662698
4	0.347826	1	2015-10-21T15:20:09Z	2015-10-27T00:00:00Z	5	1	1	0.987406	0.561508

99.2% Accuracy on Employee Resignations

Classifying whether or not employees are likely to resign

HR_comma_sep.csv

Features

Target

Missed Medical Appointments Exploration (Under Construction)