99.2% Accuracy on Employee Resignations
Classifying whether or not employees are likely to resign¶
First import all the libraries we are going to use
import pandas as pd
import numpy as np
from sklearn import neighbors, svm
from sklearn.linear_model import LogisticRegressionCV
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier,\
AdaBoostClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn import preprocessing
from sklearn.metrics import accuracy_score, mean_squared_error, r2_score,\
mean_absolute_error, confusion_matrix, classification_report
from sklearn.model_selection import cross_val_score, train_test_split,\
learning_curve
from sklearn.feature_selection import RFE
%matplotlib inline
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import itertools
import pickle
# Directory that holds the raw input CSV; joined with the file name when the data is loaded.
DIR_DATA = "data"
# Directory name for processed artefacts; not referenced in the visible part of this notebook.
DIR_PROCESSED = "processed"
# Default number of cross-validation folds used by the evaluation cells.
CV_FOLDS = 5
HR_comma_sep.csv¶
All our data is stored in this file, here's a slightly altered description of the features provided with the competition.
Features¶
- Employee satisfaction level
- Last evaluation
- Number of projects
- Average monthly hours
- Time spent at the company
- Whether they have had a work accident
- Whether they have had a promotion in the last 5 years
- Department
- Salary
Target¶
- Whether the employee has left
Now let's load and look at the actual data
# Read the HR data set from the data directory.
hr = pd.read_csv(DIR_DATA + '/HR_comma_sep.csv')

# Quick look at the first rows and the summary statistics.
hr.head()
hr.describe()

# Share of employees whose `left` flag is set, as a percentage.
print('Percent who left: {:.2f}'.format(hr.left.sum() / len(hr) * 100))
First, let's turn sales departments and salary (high/medium/low) into category types so that scikit-learn's algorithms can handle them
# Replace the two text columns with integer category codes.
# NOTE(review): codes follow the sorted category labels, so salary is presumably
# encoded alphabetically (high/low/medium) rather than by rank — confirm this is
# acceptable for the linear models used later.
for column in ("sales", "salary"):
    hr[column] = hr[column].astype('category').cat.codes
hr[["sales", "salary"]].head()
Off-the-shelf Classification¶
Let's create a quick test of basic classification algorithms to see how well each performs with mostly unaltered data
def predict_left(df, clf, test_size=0.2):
    """Fit ``clf`` on a random train/test split of ``df`` and print both accuracies.

    Args:
        df: DataFrame holding the feature columns plus the ``left`` target column.
        clf: Estimator exposing ``fit`` and ``score``.
        test_size: Forwarded to ``train_test_split`` as the held-out share.

    Returns:
        None. The training and testing scores are printed.
    """
    # Use the keyword form: pandas 2.0 no longer accepts ``axis`` positionally.
    X = df.drop(columns=['left'])
    y = df.left
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)
    clf.fit(X_train, y_train)
    # ``score`` predicts internally, so no separate ``predict`` calls are needed.
    print('Training Score: {:.3f}'.format(clf.score(X_train, y_train)))
    print('Testing Score: %.3f' % (clf.score(X_test, y_test)))
    print()
# Baseline models, mostly with default hyper-parameters.
classifiers = [
    RandomForestClassifier(n_jobs=-1),
    RandomForestClassifier(criterion='entropy', n_jobs=-1),
    svm.SVC(),
    LogisticRegressionCV(n_jobs=-1),
    AdaBoostClassifier(),
    GradientBoostingClassifier(),
    neighbors.KNeighborsClassifier(n_jobs=-1),
    # Class priors are probabilities, so give them as fractions summing to 1
    # (stayed, left) instead of percentages.
    MultinomialNB(class_prior=[0.7619, 0.2381]),
]

# Seed NumPy's global RNG before the runs so repeated executions start from the same state.
np.random.seed(0)
for i, clf in enumerate(classifiers):
    print('Classifier ', i)
    predict_left(hr, clf, test_size=0.4)
They all seem to do well, besides svm, with Random Forest doing best regardless of its criterion at ~98.7% predictive accuracy.
K-Fold Cross Validation¶
Let's make sure they consistently produce that score using K-Fold Cross-Validation.
def cross_val_left(hr, clf, cv_folds=CV_FOLDS, drop=None):
    """Cross-validate ``clf`` on ``hr`` and print the mean and per-fold scores.

    Args:
        hr: DataFrame holding the feature columns plus the ``left`` target column.
        clf: Estimator passed to ``cross_val_score``.
        cv_folds: Forwarded to ``cross_val_score`` as ``cv``.
        drop: Column labels to remove before fitting; must include ``'left'``
            so the target is not used as a feature. Defaults to ``['left']``.

    Returns:
        None. The scores are printed.
    """
    # Avoid a shared mutable default argument.
    if drop is None:
        drop = ['left']
    # Use the keyword form: pandas 2.0 no longer accepts ``axis`` positionally.
    X = hr.drop(columns=drop)
    y = hr.left
    scores = cross_val_score(clf, X, y, cv=cv_folds, n_jobs=-1)
    # Average over the scores actually returned, so this also works when
    # ``cv_folds`` is not a plain integer.
    print('Cross val score: ', scores.mean())
    print(scores)
    print()
# Two 500-tree random forests that differ only in the split criterion.
classifiers = [
    RandomForestClassifier(n_estimators=500, n_jobs=-1),
    RandomForestClassifier(n_estimators=500, criterion='entropy', n_jobs=-1),
]

for idx, clf in enumerate(classifiers):
    print('Classifier ', idx)
    cross_val_left(hr, clf)
Great, so they tend to do even better on average, with a score of 99.2% each over 5 cross-validation folds.
Learning Curve¶
Now let's see how the classifier improves with more and more training data by plotting a learning curve.
Plot code based on example on this page: https://goo.gl/pd1A8K
# Learning curve of a 500-tree random forest over growing shares of the training data.
clf = RandomForestClassifier(n_estimators=500, n_jobs=-1)
# Use the keyword form: pandas 2.0 no longer accepts ``axis`` positionally.
X = hr.drop(columns=['left'])
y = hr.left

# Shares of the available training data to evaluate (10% ... 100%).
train_fractions = np.linspace(.1, 1.0, 10)
# Keep the sizes returned by learning_curve in train_sizes rather than
# overwriting them with the fractions.
train_sizes, train_scores, valid_scores = learning_curve(
    clf, X, y, train_sizes=train_fractions, cv=CV_FOLDS, n_jobs=-1)

# Mean and spread across the cross-validation folds for every training size.
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
test_scores_mean = np.mean(valid_scores, axis=1)
test_scores_std = np.std(valid_scores, axis=1)

# Shaded bands: one standard deviation around each mean.
plt.fill_between(train_fractions, train_scores_mean - train_scores_std,
                 train_scores_mean + train_scores_std, alpha=0.1,
                 color="r")
plt.fill_between(train_fractions, test_scores_mean - test_scores_std,
                 test_scores_mean + test_scores_std, alpha=0.1, color="g")
plt.plot(train_fractions, train_scores_mean, 'o-', color="r",
         label="Training score")
plt.plot(train_fractions, test_scores_mean, 'o-', color="g",
         label="Cross-validation score")
# The x values are fractions of the training data, so label them as such.
plt.xlabel("Fraction of training data")
plt.ylabel("Score")
plt.grid()
plt.legend(loc="best")
plt.show()
So there seems to be a huge improvement between getting 10% and 20% of the data, with mild improvements after that.
Let's zoom in on the improvements after 40% of the training data is being trained on to get a better picture.
The shaded green band around the cross-validation score is the standard deviation of the cross-validation scores.
# Zoomed learning curve: skip the first three sizes (0.1-0.3) so the plot
# starts at 40% of the training data.
train_scores_mean = np.mean(train_scores, axis=1)[3:]
train_scores_std = np.std(train_scores, axis=1)[3:]
test_scores_mean = np.mean(valid_scores, axis=1)[3:]
test_scores_std = np.std(valid_scores, axis=1)[3:]
train_sizes = np.linspace(.1, 1.0, 10)[3:]

# Shaded bands: one standard deviation around each mean.
plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                 train_scores_mean + train_scores_std, alpha=0.1,
                 color="r")
plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                 test_scores_mean + test_scores_std, alpha=0.1, color="g")
plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
         label="Training score")
plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
         label="Cross-validation score")
# The x values are fractions of the training data, so label them as such.
plt.xlabel("Fraction of training data")
plt.ylabel("Score")
plt.grid()
plt.legend(loc="best")
plt.show()
Feature Selection¶
Now let's do some feature selection by removing the features that least matter
# Fit a 500-tree random forest on an 80/20 split and inspect its feature importances.
np.random.seed(0)
clf = RandomForestClassifier(n_estimators=500, n_jobs=-1)
# Use the keyword form: pandas 2.0 no longer accepts ``axis`` positionally.
X = hr.drop(columns=['left'])
y = hr.left
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
clf.fit(X_train, y_train)
# The importances are listed in the same order as the printed column names.
print(X.columns)
clf.feature_importances_
Let's start by dropping all features with lower than 0.01 relevance.
It's possible that a feature with a low "relevance" can still matter if the magnitude is great enough, however most of these values are in the same range near zero, so that shouldn't matter.
# Re-run cross-validation without the four lowest-importance features
# (the target column is always dropped as well).
drop = [
    'left',
    'promotion_last_5years',
    'Work_accident',
    'sales',
    'salary',
]
for idx, clf in enumerate(classifiers):
    print('Classifier ', idx)
    cross_val_left(hr, clf, drop=drop)
So it seems that dropping all features with < 0.01 relevance does not affect the score much:
99.20% -> 99.16%
Let's investigate further.
# Correlation of every column with the `left` target.
hr.corr().loc[:, "left"]
Will we get the same results by looking at other metrics?
Let's see what happens when we drop everything with abs(correlation) < 0.005
# Cross-validate again with only the two least-correlated features removed.
drop = ['left', 'sales', 'salary']
classifiers = [
    RandomForestClassifier(n_estimators=500, n_jobs=-1),
    RandomForestClassifier(n_estimators=500, criterion='entropy', n_jobs=-1),
]
for idx, clf in enumerate(classifiers):
    print('Classifier ', idx)
    cross_val_left(hr, clf, drop=drop)
The decrease in accuracy is the same as when we pulled out four features before, even though we only took out half as many.
What happens if we only take out the other two features and leave in the "least correlated"?
# Now keep sales/salary and remove the other two low-importance features instead.
drop = ['left', 'promotion_last_5years', 'Work_accident']
for idx, clf in enumerate(classifiers):
    print('Classifier ', idx)
    cross_val_left(hr, clf, drop=drop)
Hmm. So it seems that removing Salary and Sales is what caused the 0.04 percentage point decrease, and that they still matter somewhat even though they're the least correlated.
The other two features, though, seem worthless and possibly counterproductive, since removing them managed to increase the accuracy of the RandomForestClassifier (with entropy).
Recursive Feature Selection¶
Recursive Feature Elimination, or RFE, is a more predictable method of ranking features.
Let's see if we get the same results as before by looking at what the worst 4 features are
# Rank the features with recursive feature elimination around a random forest,
# keeping the best five.
model = RandomForestClassifier(n_estimators=500, n_jobs=-1)
# n_features_to_select must be passed by keyword in current scikit-learn.
clf = RFE(model, n_features_to_select=5)
# Use the keyword form: pandas 2.0 no longer accepts ``axis`` positionally.
X = hr.drop(columns=['left'])
y = hr.left
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
clf.fit(X_train, y_train)
# support_ flags the selected features; ranking_ gives each feature's rank.
print(clf.support_)
print(clf.ranking_)
It's the same as before, with promotion_last_5years and Work_accident being the worst features, with Salary and Sales right behind.
That worked out nicely.
But is that true if we use other algorithms with similar accuracy (that RFE is willing to work with)?
# Repeat the RFE ranking with several other estimators.
classifiers = [
    AdaBoostClassifier(),
    GradientBoostingClassifier(),
    LogisticRegressionCV(),
    MultinomialNB(),
]

# Features and target do not change between models, so build them once.
# Use the keyword form: pandas 2.0 no longer accepts ``axis`` positionally.
X = hr.drop(columns=['left'])
y = hr.left
for i, model in enumerate(classifiers):
    # n_features_to_select must be passed by keyword in current scikit-learn.
    clf = RFE(model, n_features_to_select=5)
    # A fresh random split is drawn for every model, as before.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    print("Classifier %d" % i)
    clf.fit(X_train, y_train)
    print(clf.ranking_)
    print()
Both the high accuracy classifiers (AdaBoost and GradientBoost) view those same 4 as being the least useful, in similar if not identical orders. Logistic Regression and Multinomial Naive Bayes, on the other hand, with their ~77% accuracy, only agreed that sales was bad.
Given that all the high accuracy classifiers agree, I think it's safe to say that promotion_last_5years and Work_accident are the least useful features
# Project the remaining features onto two principal components.
drop = ['left', 'promotion_last_5years', 'Work_accident']
y = np.array(hr.left)
# Use the keyword form: pandas 2.0 no longer accepts ``axis`` positionally.
X = np.array(hr.drop(columns=drop))
# NOTE(review): the features are not standardised before PCA, so columns with
# the largest numeric range will presumably dominate the components — confirm
# this is intended.
pca = PCA(n_components=2).fit(X)
X_pca = pca.transform(X)
Now that we've done the PCA transformation, let's see how much of the variance is explained per new feature, and overall
# Variance share captured by each component, followed by the total.
print(pca.explained_variance_ratio_, pca.explained_variance_ratio_.sum(), sep='\n')
It looks like pretty much all of the variance is being handled by the first dimension, and that we should still get high accuracy since most of the variance in the data is still there.
# Cross-validate a 500-tree random forest on the two principal components.
y = np.array(hr.left)
clf = RandomForestClassifier(n_jobs=-1, n_estimators=500)
scores = cross_val_score(clf, X_pca, y, cv=CV_FOLDS, n_jobs=-1)
print('Cross val score: ', scores.mean())
print(scores)
Despite catching most of the variance, the accuracy of the classifier goes down a lot (3.5 percentage points) and there is a lot more variance in the various cross folds (92.07% <-> 98.33%)
Now let's plot our 2-Dimensional data¶
# Scatter the two principal components, one colour per class (0 = stay, 1 = left).
plt.figure()
for class_id, (color, name) in enumerate([('r', "stay"), ('b', "left")]):
    members = y == class_id
    plt.scatter(X_pca[members, 0], X_pca[members, 1], c=color, label=name)
plt.legend()
plt.show()
It seems to have plotted them into stratified lines of predictions where the same general chunks of each line are stay or left, with a few outliers. Someone might get pretty good accuracy by just looking at the x-values on the bottom, which makes sense since most of the variance is covered by the first pca-feature that we made
3-D¶
# Project onto three principal components and draw them in a 3-D scatter plot.
y = np.array(hr.left)
# Use the keyword form: pandas 2.0 no longer accepts ``axis`` positionally.
X = np.array(hr.drop(columns=drop))
pca = PCA(n_components=3).fit(X)
X_pca = pca.transform(X)

colors = itertools.cycle('rb')
target_ids = range(2)
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
for i, c, label in zip(target_ids, colors, ["stay", "left"]):
    members = y == i
    # Call scatter on the 3-D axes: plt.scatter treats its third positional
    # argument as the marker size, which left every point at z = 0.
    ax.scatter(X_pca[members, 0], X_pca[members, 1], X_pca[members, 2],
               c=c, label=label)
ax.set_xlabel('X Label')
ax.set_ylabel('Y Label')
ax.set_zlabel('Z Label')
ax.legend()
plt.show()
Since almost all of the variance is in the first dimension, it makes sense that the 3d graph is essentially the same 2-D plane we got before.
Now let's check in on the accuracy of the 3-D version
# Cross-validate a 500-tree random forest on the three principal components.
y = hr.left
clf = RandomForestClassifier(n_jobs=-1, n_estimators=500)
scores = cross_val_score(clf, X_pca, y, cv=CV_FOLDS, n_jobs=-1)
print('Cross val score: ', scores.mean())
print(scores)
Accuracy seems to be halfway between the 2-D accuracy and the original accuracy at 97.1%, which makes sense.
Confusion Matrix and Classification Report¶
Now, since the data is semi-unbalanced (only 23.81% left), let's just do a quick classification report to make sure that our accuracy metric isn't giving us a false impression of the quality of this classifier.
# Fit a 500-tree random forest on an 80/20 split and keep the held-out
# predictions for the confusion matrix and classification report.
clf = RandomForestClassifier(n_estimators=500, n_jobs=-1)
# Use the keyword form: pandas 2.0 no longer accepts ``axis`` positionally.
X = hr.drop(columns=drop)
y = hr.left
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
Confusion Matrix¶
Plot code based on example on this page: https://goo.gl/3Mz7vL
def plot_confusion_matrix(cm, classes,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """Draw ``cm`` as an annotated heat map on the current matplotlib figure.

    Args:
        cm: Two-dimensional confusion matrix; indexed as ``cm[row, col]``.
        classes: Tick labels, used for both axes.
        title: Title shown above the plot.
        cmap: Colour map passed to ``plt.imshow``.
    """
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()

    # One tick per class on both axes.
    positions = np.arange(len(classes))
    plt.xticks(positions, classes, rotation=45)
    plt.yticks(positions, classes)

    # Cells above half of the largest count get white text, the rest black.
    cutoff = cm.max() / 2.
    for row in range(cm.shape[0]):
        for col in range(cm.shape[1]):
            value = cm[row, col]
            text_color = "white" if value > cutoff else "black"
            plt.text(col, row, value,
                     horizontalalignment="center",
                     color=text_color)

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
# Build the confusion matrix for the held-out predictions and plot it.
class_names = ["Stay", "Left"]
cnf_matrix = confusion_matrix(y_test, y_pred)
plot_confusion_matrix(cnf_matrix, title='Confusion matrix', classes=class_names)
plt.show()
As you can see, almost everything called a stay is a stay, and almost everything called left is a left, so recall and precision are both extremely high. So we definitely weren't getting a false impression.
Let's get the exact numbers below:
Classification Report¶
# Text report of the per-class metrics for the held-out predictions.
report = classification_report(y_test, y_pred, target_names=class_names)
print(report)
Conclusion¶
99.2% is a pretty good accuracy, and it's not as if that's a bad metric since recall and precision are just as high even though the data is unbalanced. So I'm just going to leave it as is. However:
Other things to do (Feature Engineering)¶
- Department level metrics. However, my preliminary exploration in that area didn't seem to improve anything.
- Feature Scaling on some of the metrics that are in completely different ranges than the rest of the features, like average_monthly_hours. Also didn't seem to improve anything on Random Forest at least.