Missed Medical Appointments Exploration (Under Construction)
In [3]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn import preprocessing, neighbors, svm
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, VotingClassifier, GradientBoostingClassifier, ExtraTreesClassifier, \
RandomForestRegressor, AdaBoostRegressor, RandomForestRegressor, GradientBoostingRegressor, AdaBoostClassifier
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score, mean_squared_error, mean_absolute_error, r2_score, confusion_matrix, classification_report
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.linear_model import LogisticRegression, LinearRegression, LogisticRegressionCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.decomposition import PCA
%matplotlib inline
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from math import sqrt
import itertools
from itertools import cycle
import pickle
# Directory the raw CSV is read from.
DIR_DATA = "data"
# NOTE(review): not referenced in the visible cells; presumably an output directory.
DIR_PROCESSED = "processed"
# Name of the target column.
LABEL = "Status"
# NOTE(review): not referenced in the visible cells; presumably columns to exclude from the features.
NON_PREDICTORS = [LABEL]
# Number of folds passed to cross_val_score.
CV_FOLDS = 5
In [4]:
# No-show-Issue-Comma-300k.csv
In [46]:
# Read the raw appointments table and preview its first rows.
csv_path = DIR_DATA + '/No-show-Issue-Comma-300k.csv'
ma = pd.read_csv(csv_path)
ma.head()
Out[46]:
In [47]:
# Share of rows whose Status is anything other than 'Show-Up', i.e. missed
# appointments. Must run before Status is replaced by integer category codes.
missed_pct = (ma.Status != 'Show-Up').mean() * 100
print('Percent who missed their appointment: {:.2f}%'.format(missed_pct))
In [48]:
def getTime(appointment):
    """Convert a timestamp string into minutes since midnight.

    The string has any leading/trailing 'Z' characters stripped and must then
    split on ':' into exactly three parts (otherwise ValueError is raised).
    The hour is read from the last two characters of the first part and the
    minute from the second part; the third part (seconds) is ignored.
    """
    date_and_hour, minute_text, _seconds = appointment.strip('Z').split(':')
    return int(date_and_hour[-2:]) * 60 + int(minute_text)
# Minutes since midnight, parsed from the AppointmentRegistration timestamp.
# NOTE(review): the column is named "AppointmenTime" but is derived from the
# registration timestamp, not from the appointment date.
ma["AppointmenTime"] = ma.AppointmentRegistration.apply(getTime)
In [49]:
# Summary statistics of the frame before encoding and scaling.
ma.describe()
Out[49]:
In [50]:
# Replace the string-valued columns with integer category codes.
for col in ('Status', 'Gender', 'DayOfTheWeek'):
    ma[col] = ma[col].astype('category').cat.codes

# One MinMaxScaler per numeric feature, kept under its own name so each
# fitted scaler stays available after this cell.
wait_time_scaler = preprocessing.MinMaxScaler() #preprocessing.StandardScaler()
age_scaler = preprocessing.MinMaxScaler()
time_scaler = preprocessing.MinMaxScaler()

# Cast each feature to float64 and min-max scale it in place.
for col, scaler in (('AwaitingTime', wait_time_scaler),
                    ('Age', age_scaler),
                    ('AppointmenTime', time_scaler)):
    as_float = ma[col].astype(np.float64)
    ma[col] = scaler.fit_transform(as_float.values.reshape(-1,1))
In [51]:
# Preview the frame after encoding and scaling.
ma.head()
Out[51]:
In [52]:
# Summary statistics after encoding and scaling.
ma.describe()
Out[52]:
In [53]:
# Columns removed from the feature matrix: the target plus two raw columns
# that are not used as model inputs.
drop = ['Status','AppointmentRegistration','ApointmentData']

def predict_left(df, clf, test_size=0.2):
    """Fit `clf` on a random train/test split of `df` and print both scores.

    Parameters
    ----------
    df : DataFrame containing a `Status` column and every column in the
        module-level `drop` list.
    clf : estimator exposing `fit` and `score`.
    test_size : fraction of rows held out for testing.

    The split is not seeded, so results vary between runs. Nothing is
    returned; the scores are printed.
    """
    # Keyword form: the positional axis argument (`df.drop(drop, 1)`) is
    # rejected by pandas >= 2.0.
    X = df.drop(columns=drop)
    y = df.Status
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)
    clf.fit(X_train, y_train)
    # clf.score predicts internally, so no separate predict calls are needed.
    print('Training Score: {:.3f}'.format(clf.score(X_train, y_train)))
    print('Testing Score: %.3f' % (clf.score(X_test, y_test)))
    print()
In [ ]:
In [ ]:
# Random Forest is bad at this, and K-NN is good but slow
classifiers = [
    neighbors.KNeighborsClassifier(n_jobs=-1),
    RandomForestClassifier(class_weight='balanced',n_jobs=-1),
    RandomForestClassifier(class_weight='balanced', criterion='entropy', n_jobs=-1),
    LogisticRegressionCV(class_weight='balanced', n_jobs=-1),
    AdaBoostClassifier(),
    xgb.XGBClassifier(nthread=-1),
    MultinomialNB(),
]
# classifiers = [RandomForestClassifier(n_estimators=500 ,n_jobs=-1), RandomForestClassifier(n_estimators=500, criterion='entropy', n_jobs=-1), xgb.XGBClassifier(n_estimators=500, nthread=-1)]# svm.SVC()]

# Score every candidate on its own random hold-out split.
for i, clf in enumerate(classifiers):
    print('Classifier ', i)
    predict_left(ma, clf)
In [ ]:
def cross_val_left(df, clf, cv_folds=CV_FOLDS, drop=drop):
    """Cross-validate `clf` on `df` and print the mean and per-fold scores.

    Parameters
    ----------
    df : DataFrame containing a `Status` column and every column in `drop`.
    clf : estimator accepted by `cross_val_score`.
    cv_folds : forwarded as `cv` to `cross_val_score`.
    drop : columns removed from `df` to build the feature matrix. The default
        is the value the module-level `drop` had when this function was
        defined.

    Nothing is returned; the scores are printed.
    """
    # Keyword form: the positional axis argument is rejected by pandas >= 2.0.
    X = df.drop(columns=drop)
    y = df.Status
    scores = cross_val_score(clf, X, y, cv=cv_folds, n_jobs=-1)
    # Average over the scores actually returned, so the mean stays correct
    # even when `cv_folds` is not a plain integer.
    print('Cross val score: ', np.mean(scores))
    print(scores)
    print()
In [ ]:
# Candidates for cross-validation; the boosted models use 500 estimators.
classifiers = [
    xgb.XGBClassifier(n_estimators=500, nthread=-1),
    LogisticRegressionCV(class_weight='balanced', n_jobs=-1),
    AdaBoostClassifier(n_estimators=500),
    MultinomialNB(),
]
# Not currently run: RandomForestClassifier(n_estimators=500 ,n_jobs=-1), RandomForestClassifier(n_estimators=500, criterion='entropy', n_jobs=-1), svm.SVC()

for i, clf in enumerate(classifiers):
    print('Classifier ', i)
    cross_val_left(ma, clf)
In [21]:
# Fit XGBoost on a random 80/20 split and show the per-feature importances
# next to the column order they refer to.
clf = xgb.XGBClassifier(n_estimators=500, nthread=-1)
# Keyword form: the positional axis argument is rejected by pandas >= 2.0.
X = ma.drop(columns=drop)
y = ma.Status
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
clf.fit(X_train, y_train)
print(X.columns)
clf.feature_importances_
# Drop all lower than 0.01 relevance
Out[21]:
In [ ]:
In [23]:
# Re-run cross-validation without the features whose XGBoost importance is
# below 0.01. Uses the fitted `clf` and `X` from the feature-importance cell.
# NOTE(review): this cell used to cross-validate an undefined `hr` frame and
# overwrite the global `drop` with column names that appear nowhere else in
# this notebook. The figures quoted in the old comment (.9920->.9916)
# presumably came from a different dataset - re-measure.
importances = pd.Series(clf.feature_importances_, index=X.columns)
low_importance = importances[importances < 0.01].index.tolist()
# Separate name so the global `drop` used by later cells stays intact.
drop_low_importance = drop + low_importance
for i, clf in enumerate(classifiers):
    print('Classifier ', i)
    cross_val_left(ma, clf, drop=drop_low_importance)
In [56]:
# Correlation of every numeric column with Status. The raw string columns
# are excluded explicitly because DataFrame.corr() raises on non-numeric
# data in pandas >= 2.0.
ma.select_dtypes(include='number').corr()["Status"]
Out[56]:
In [20]:
# Drop every feature whose absolute correlation with Status is below 0.005,
# then cross-validate two random forests on what remains.
# NOTE(review): this cell used to cross-validate an undefined `hr` frame and
# overwrite the global `drop` with column names that appear nowhere else in
# this notebook; the earlier "makes it worse" observation should be
# re-checked on this data.
status_corr = ma.select_dtypes(include='number').corr()['Status'].drop('Status')
weak_corr = status_corr[status_corr.abs() < 0.005].index.tolist()
# Built from an explicit base list so the global `drop` is neither needed nor modified.
drop_weak_corr = ['Status', 'AppointmentRegistration', 'ApointmentData'] + weak_corr
classifiers = [RandomForestClassifier(n_estimators=500 ,n_jobs=-1), RandomForestClassifier(n_estimators=500, criterion='entropy', n_jobs=-1)]
for i, clf in enumerate(classifiers):
    print('Classifier ', i)
    cross_val_left(ma, clf, drop=drop_weak_corr)
In [35]:
# List the column names currently in the frame.
ma.columns
Out[35]:
In [57]:
# Recursive feature elimination down to 5 features, using XGBoost as the
# ranking estimator.
from sklearn.feature_selection import RFE
model = xgb.XGBClassifier(n_estimators=500, nthread=-1)#RandomForestClassifier(n_estimators=500, n_jobs=-1)
# n_features_to_select is keyword-only in current scikit-learn releases.
clf = RFE(model, n_features_to_select=5)
# Keyword form: the positional axis argument is rejected by pandas >= 2.0.
X = ma.drop(columns=drop)
y = ma.Status
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
clf.fit(X_train, y_train)
print(clf.support_)
print(clf.ranking_)
# Gets same result as feature_importance, which makes sense
In [58]:
# Same recursive feature elimination, ranked by a logistic regression.
# Relies on RFE having been imported by the previous cell.
model = LogisticRegression()
# n_features_to_select is keyword-only in current scikit-learn releases.
clf = RFE(model, n_features_to_select=5)
# Keyword form: the positional axis argument is rejected by pandas >= 2.0.
X = ma.drop(columns=drop)
y = ma.Status
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
clf.fit(X_train, y_train)
print(clf.support_)
print(clf.ranking_)
# Gets same result as feature_importance
In [18]:
# PCA on relevant features: project the feature matrix onto its first two
# principal components.
y = np.array(ma.Status)
# Keyword form: the positional axis argument is rejected by pandas >= 2.0.
X = np.array(ma.drop(columns=drop))
pca = PCA(n_components=2).fit(X)
X_pca = pca.transform(X)
In [19]:
# Loadings of each principal component on the input features.
print(pca.components_)
In [20]:
# Variance ratio explained by each kept component, then their total.
print(pca.explained_variance_ratio_)
print(sum(pca.explained_variance_ratio_))
In [11]:
# Cross-validate a class-balanced logistic regression on the PCA projection.
# Still get about the same accuracy, ~69.7%
y = np.array(ma.Status)
clf = LogisticRegressionCV(class_weight='balanced', n_jobs=-1)
scores = cross_val_score(clf, X_pca, y, cv=CV_FOLDS, n_jobs=-1)
mean_score = sum(scores) / CV_FOLDS
print('Cross val score: ', mean_score)
print(scores)
In [22]:
# Scatter the first two principal components, one colour per Status code
# (0 is labelled "Missed", 1 is labelled "Kept").
colors = cycle('rb')
target_ids = range(2)
plt.figure()
for class_id, color, name in zip(target_ids, colors, ["Missed","Kept"]):
    in_class = y == class_id
    plt.scatter(X_pca[in_class, 0], X_pca[in_class, 1], c=color, label=name)
plt.legend()
plt.show()
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [23]:
# 3D PCA: project onto three principal components and scatter them by class.
# NOTE(review): the earlier "clear plane difference" remark was made while
# the third component was being passed as marker size, not as z - re-check.
y = np.array(ma.Status)
# Keyword form: the positional axis argument is rejected by pandas >= 2.0.
X = np.array(ma.drop(columns=drop))
pca = PCA(n_components=3).fit(X)
X_pca = pca.transform(X)
colors = cycle('rb')
target_ids = range(2)
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
for i, c, label in zip(target_ids, colors, ["Missed", "Kept"]):
    in_class = y == i
    # Call scatter on the 3D axes: plt.scatter(x, y, z) would treat the third
    # positional argument as the marker size `s`, leaving every point at z=0.
    ax.scatter(X_pca[in_class, 0], X_pca[in_class, 1], X_pca[in_class, 2],
               c=c, label=label)
ax.set_xlabel('X Label')
ax.set_ylabel('Y Label')
ax.set_zlabel('Z Label')
ax.legend()
plt.show()
In [45]:
# Cross-validate a class-balanced random forest on the current PCA projection.
# 3D accuracy is the same as 2D
y = ma.Status
clf = RandomForestClassifier(class_weight='balanced',n_jobs=-1)#LogisticRegressionCV(class_weight='balanced', n_jobs=-1)
scores = cross_val_score(clf, X_pca, y, cv=CV_FOLDS, n_jobs=-1)
mean_score = sum(scores) / CV_FOLDS
print('Cross val score: ', mean_score)
print(scores)
In [ ]:
# Train a class-balanced SVC on a random 80/20 split and report hold-out
# accuracy. `y_test` / `y_pred` are reused by the report and confusion-matrix
# cells below.
clf = svm.SVC(class_weight='balanced')#LogisticRegression(class_weight='balanced', n_jobs=-1)
# Keyword form: the positional axis argument is rejected by pandas >= 2.0.
X = ma.drop(columns=drop)
y = ma.Status
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
# accuracy_score takes (y_true, y_pred); same value, conventional order.
print(accuracy_score(y_test, y_pred))
In [14]:
# Display names for the two Status classes, passed as target_names below.
# Uses y_test / y_pred from the SVC cell.
class_names = ["No-Show","Show-Up"]
print(classification_report(y_test,y_pred, target_names=class_names))
In [15]:
def plot_confusion_matrix(cm, classes,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """Draw `cm` as an annotated heat map on the current matplotlib figure.

    cm      : 2-D array of counts, indexed as cm[true, predicted].
    classes : tick labels, used for both axes.
    title   : figure title.
    cmap    : colormap passed to imshow.

    Each cell is annotated with its value, in white when the value exceeds
    half of the matrix maximum and in black otherwise. The caller is
    responsible for calling plt.show().
    """
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()

    positions = np.arange(len(classes))
    plt.xticks(positions, classes, rotation=45)
    plt.yticks(positions, classes)

    half_max = cm.max() / 2.
    for row in range(cm.shape[0]):
        for col in range(cm.shape[1]):
            count = cm[row, col]
            text_color = "white" if count > half_max else "black"
            # x is the column (predicted), y is the row (true).
            plt.text(col, row, count,
                     horizontalalignment="center",
                     color=text_color)

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
# Build the confusion matrix for the SVC hold-out predictions and plot it.
np.set_printoptions(precision=10)
cnf_matrix = confusion_matrix(y_test, y_pred)
plot_confusion_matrix(cnf_matrix, classes=class_names, title='Confusion matrix')
plt.show()
In [ ]:
# Per-class precision / recall / F1 for the current y_test and y_pred.
print(classification_report(y_test,y_pred, target_names=class_names))
In [23]:
# List the column names available for the plots below.
ma.columns
Out[23]:
In [35]:
# Percentage of missed appointments (Status code 0) per day-of-week code.
# Grouping by the codes actually present avoids a 0/0 division for any code
# in a hard-coded 0..6 range that never occurs in the data.
missed_by_day = ma.Status.eq(0).groupby(ma.DayOfTheWeek).mean() * 100
week = list(missed_by_day.index)
percentageLeft = list(missed_by_day.values)
plt.bar(week, percentageLeft)
# DayOfTheWeek was replaced by category codes, so the x axis shows codes, not names.
plt.xlabel('DayOfTheWeek (category code)')
plt.ylabel('Missed appointments (%)')
plt.show()
In [44]:
# One bar chart per feature: percentage of missed appointments (Status code 0)
# for each distinct value of that feature.
for c in ['Gender', 'Diabetes', 'Alcoolism', 'HiperTension',
          'Handcap', 'Smokes', 'Scholarship', 'Tuberculosis', 'Sms_Reminder', "AwaitingTime", "Age"]:
    missed_pct = ma.Status.eq(0).groupby(ma[c]).mean() * 100
    levels = missed_pct.index.to_numpy(dtype=float)
    # Scale the bar width to the smallest gap between values: the default
    # width of 0.8 makes bars overlap for features scaled into [0, 1].
    width = 0.8 * np.diff(levels).min() if len(levels) > 1 else 0.8
    plt.bar(levels, missed_pct.values, width=width)
    # Title each chart so the plots can be told apart.
    plt.title(c)
    plt.xlabel(c)
    plt.ylabel('Missed appointments (%)')
    plt.show()
In [ ]: