99.2% Accuracy on Employee Resignations
Classifying whether or not employees are likely to resign
First, import all the libraries we are going to use.
In [1]:
import pandas as pd
import numpy as np
from sklearn import neighbors, svm
from sklearn.linear_model import LogisticRegressionCV
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier,\
AdaBoostClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn import preprocessing
from sklearn.metrics import accuracy_score, mean_squared_error, r2_score,\
mean_absolute_error, confusion_matrix, classification_report
from sklearn.model_selection import cross_val_score, train_test_split,\
learning_curve
from sklearn.feature_selection import RFE
%matplotlib inline
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import itertools
import pickle
# Directory holding the raw input CSV files; joined with the file name when loading.
DIR_DATA = "data"
# Presumably where processed/derived artifacts are written -- not used in the cells visible here.
DIR_PROCESSED = "processed"
# Presumably the number of cross-validation folds -- TODO confirm against the cells that use it.
CV_FOLDS = 5
HR_comma_sep.csv
All our data is stored in this file. Here is a slightly altered description of the features provided with the competition.
Features
- Employee satisfaction level
- Last evaluation
- Number of projects
- Average monthly hours
- Time spent at the company
- Whether they have had a work accident
- Whether they have had a promotion in the last 5 years
- Department
- Salary
Target
- Whether the employee has left
Now let's load and look at the actual data
In [2]:
# Read the HR dataset from the data directory and preview its first rows.
hr = pd.read_csv('{}/HR_comma_sep.csv'.format(DIR_DATA))
hr.head()
Out[2]:
In [3]:
# Summary statistics (count, mean, std, quartiles, ...) for the numeric columns.
hr.describe()
Out[3]:
In [4]:
# Sum of the `left` column over the row count, as a percentage
# (assumes `left` is a 0/1 flag -- TODO confirm).
percent_left = hr.left.sum() / len(hr.left) * 100
print('Percent who left: {:.2f}'.format(percent_left))
First, let's turn the department (stored in the `sales` column) and salary (high/medium/low) into integer category codes so that scikit-learn's algorithms can handle them
In [5]:
# Department (stored in the `sales` column) is nominal, so its integer codes are
# arbitrary labels and carry no ranking.
hr.sales = hr.sales.astype('category').cat.codes

# Salary is ordinal. Letting pandas infer the categories sorts them alphabetically
# (high=0, low=1, medium=2), which scrambles the ranking for distance- and
# coefficient-based models. Fix the order explicitly: low=0, medium=1, high=2.
SALARY_LEVELS = ['low', 'medium', 'high']
if not pd.api.types.is_integer_dtype(hr.salary):  # already encoded when the cell is re-run
    salary_codes = hr.salary.astype(
        pd.CategoricalDtype(categories=SALARY_LEVELS, ordered=True)).cat.codes
    # A code of -1 marks a value outside SALARY_LEVELS (or a missing value);
    # fail loudly rather than feed a bogus level to the models.
    assert (salary_codes >= 0).all(), 'unexpected salary level in hr.salary'
    hr.salary = salary_codes

hr[["sales","salary"]].head()
Out[5]:
Missed Medical Appointments Exploration (Under Construction)
In [3]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn import preprocessing, neighbors, svm
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, VotingClassifier, GradientBoostingClassifier, ExtraTreesClassifier, \
RandomForestRegressor, AdaBoostRegressor, RandomForestRegressor, GradientBoostingRegressor, AdaBoostClassifier
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score, mean_squared_error, mean_absolute_error, r2_score, confusion_matrix, classification_report
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.linear_model import LogisticRegression, LinearRegression, LogisticRegressionCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.decomposition import PCA
%matplotlib inline
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from math import sqrt
import itertools
from itertools import cycle
import pickle
# Directory holding the raw input CSV files; joined with the file name when loading.
DIR_DATA = "data"
# Presumably where processed/derived artifacts are written -- not used in the cells visible here.
DIR_PROCESSED = "processed"
# Presumably the name of the prediction-target column -- TODO confirm against the modelling cells.
LABEL = "Status"
# Presumably the columns excluded from the predictors; currently only the label itself.
NON_PREDICTORS = [LABEL]
# Presumably the number of cross-validation folds -- TODO confirm against the cells that use it.
CV_FOLDS = 5
In [4]:
# No-show-Issue-Comma-300k.csv
In [46]:
# Read the appointments dataset from the data directory and preview its first rows.
ma = pd.read_csv('{}/No-show-Issue-Comma-300k.csv'.format(DIR_DATA))
ma.head()
Out[46]:
In [47]:
# Share of appointments whose Status is anything other than 'Show-Up'.
# The label describes what is actually measured here (missed appointments).
no_show_rate = (ma.Status != 'Show-Up').mean()
print('Percent who did not show up: {:.2f}%'.format(no_show_rate * 100))
In [48]:
def getTime(appointment):
    """Return the time of day, in minutes after midnight, of a timestamp string.

    The string is expected to look like ``<date>T<HH>:<MM>[:<SS>][Z]``
    (e.g. ``"2014-12-16T14:46:25Z"``). Only the hour and minute fields are
    used; seconds and anything after them (such as a ``+hh:mm`` offset) are
    ignored, so timestamps without seconds are accepted too.

    Parameters
    ----------
    appointment : str
        Timestamp text containing at least ``HH:MM``.

    Returns
    -------
    int
        ``60 * hour + minute``.

    Raises
    ------
    ValueError
        If the text has no ``:`` separated hour/minute pair, or if the hour or
        minute field is not an integer.
    """
    fields = appointment.strip('Z').split(':')
    if len(fields) < 2:
        raise ValueError('no HH:MM component in timestamp: {!r}'.format(appointment))
    # The first field still carries the date prefix ("2014-12-16T14"), so the
    # hour is its last two characters.
    hour = int(fields[0][-2:])
    minute = int(fields[1])
    return 60 * hour + minute
# Minutes after midnight, derived from the AppointmentRegistration timestamp.
# NOTE(review): despite the column name, the source is the registration time -- confirm intent.
ma["AppointmenTime"] = ma.AppointmentRegistration.apply(getTime)
In [49]:
# Summary statistics for the numeric columns, including the derived AppointmenTime.
ma.describe()
Out[49]:
In [50]:
# Integer-encode the string-valued columns: each distinct value gets one code.
for encoded_col in ("Status", "Gender", "DayOfTheWeek"):
    ma[encoded_col] = ma[encoded_col].astype('category').cat.codes

# Min-max scale the numeric columns to [0, 1] (this is not standardisation to
# zero mean / unit variance). Each column keeps its own named scaler so the
# fitted transform remains available afterwards.
# NOTE(review): the scalers are fit on the full frame; if a train/test split
# follows, fitting on the training rows only would avoid leaking test statistics.
wait_time_scaler = preprocessing.MinMaxScaler()
age_scaler = preprocessing.MinMaxScaler()
time_scaler = preprocessing.MinMaxScaler()
for scaled_col, col_scaler in (("AwaitingTime", wait_time_scaler),
                               ("Age", age_scaler),
                               ("AppointmenTime", time_scaler)):
    # The scaler expects a 2-D float array, hence the cast and the (-1, 1) reshape.
    col_values = ma[scaled_col].astype(np.float64).values.reshape(-1, 1)
    ma[scaled_col] = col_scaler.fit_transform(col_values)
In [51]:
# Preview the first rows after the encoding and scaling cell.
ma.head()
Out[51]:
In [52]:
# Summary statistics after preprocessing; the min-max scaled columns should span 0 to 1.
ma.describe()
Out[52]: