#For ease of moving data from R to Python using rpy2
%load_ext rpy2.ipython
%R ata <- read.csv("~/Downloads/pima.csv")
%R utils::str(ata)
'data.frame': 768 obs. of 9 variables: $ Pregnancies : int 6 1 8 1 0 5 3 10 2 8 ... $ Glucose : int 148 85 183 89 137 116 78 115 197 125 ... $ BloodPressure: int 72 66 64 66 40 74 50 0 70 96 ... $ SkinThickness: int 35 29 0 23 35 0 32 0 45 0 ... $ Insulin : int 0 0 0 94 168 0 88 0 543 0 ... $ BMI : num 33.6 26.6 23.3 28.1 43.1 25.6 31 35.3 30.5 0 ... $ Pedigree : num 0.627 0.351 0.672 0.167 2.288 ... $ Age : int 50 31 32 21 33 30 26 29 53 54 ... $ Outcome : int 1 0 1 0 1 0 1 0 1 1 ...
%R library(Hmisc)
%R library(reshape)
%R library(ggplot2)
%R library(dplyr)
%R library(corrplot)
R[write to console]: Loading required package: lattice R[write to console]: Loading required package: survival R[write to console]: Loading required package: Formula R[write to console]: Loading required package: ggplot2 R[write to console]: Attaching package: ‘Hmisc’ R[write to console]: The following objects are masked from ‘package:base’: format.pval, units R[write to console]: Attaching package: ‘dplyr’ R[write to console]: The following object is masked from ‘package:reshape’: rename R[write to console]: The following objects are masked from ‘package:Hmisc’: src, summarize R[write to console]: The following objects are masked from ‘package:stats’: filter, lag R[write to console]: The following objects are masked from ‘package:base’: intersect, setdiff, setequal, union R[write to console]: corrplot 0.92 loaded
'corrplot' | 'dplyr' | 'reshape' | ... | 'datasets' | 'methods' | 'base' |
%R hist.data.frame(ata)
array([1.])
ata <- read.csv("~/Downloads/pima.csv")
library(corrplot)
atacor=cor(ata)
corrplot.mixed(atacor, number.cex = 1.3, tl.cex = 0.8)
install.packages('reshape')
library(reshape)
library(ggplot2)
meltata<-melt(ata)
box <- ggplot(meltata, aes(factor(variable), value))
box + geom_boxplot() + facet_wrap(~variable, scale="free")
The downloaded binary packages are in /var/folders/zz/wfk650fx7lx3mmyd6_y3yxgr0000gr/T//RtmpDejiPV/downloaded_packages
Using as id variables
import pandas
import sklearn
import pwlf
import numpy as np
import matplotlib.pyplot as plt
from tabulate import tabulate
import statsmodels.api as sm
#cross-validation for traditional model
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import KFold
#moves data from R to Python
ata = %R ata
#5fold cross validation
crossvalidation = KFold(n_splits=5, random_state=1, shuffle=True)
ata.describe()
Pregnancies | Glucose | BloodPressure | SkinThickness | Insulin | BMI | Pedigree | Age | Outcome | |
---|---|---|---|---|---|---|---|---|---|
count | 768.000000 | 768.000000 | 768.000000 | 768.000000 | 768.000000 | 768.000000 | 768.000000 | 768.000000 | 768.000000 |
mean | 3.845052 | 120.894531 | 69.105469 | 20.536458 | 79.799479 | 31.992578 | 0.471876 | 33.240885 | 0.348958 |
std | 3.369578 | 31.972618 | 19.355807 | 15.952218 | 115.244002 | 7.884160 | 0.331329 | 11.760232 | 0.476951 |
min | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.078000 | 21.000000 | 0.000000 |
25% | 1.000000 | 99.000000 | 62.000000 | 0.000000 | 0.000000 | 27.300000 | 0.243750 | 24.000000 | 0.000000 |
50% | 3.000000 | 117.000000 | 72.000000 | 23.000000 | 30.500000 | 32.000000 | 0.372500 | 29.000000 | 0.000000 |
75% | 6.000000 | 140.250000 | 80.000000 | 32.000000 | 127.250000 | 36.600000 | 0.626250 | 41.000000 | 1.000000 |
max | 17.000000 | 199.000000 | 122.000000 | 99.000000 | 846.000000 | 67.100000 | 2.420000 | 81.000000 | 1.000000 |
#rough idea of importance of each feature using correlation scores, seen in R earlier too
CHD_corr = ata.corr()['Outcome']
corr_CHD = CHD_corr.abs().sort_values(ascending=False)[1:]; round(corr_CHD,2)
Glucose 0.47 BMI 0.29 Age 0.24 Pregnancies 0.22 Pedigree 0.17 Insulin 0.13 SkinThickness 0.07 BloodPressure 0.07 Name: Outcome, dtype: float64
#binarize each of the variables using the cut function in pandas
te1=pandas.cut(ata['BloodPressure'], 2, labels=['0','1'])
te2=pandas.cut(ata['Pregnancies'], 2, labels=['0','1'])
te3=pandas.cut(ata['Glucose'], 2, labels=['0','1'])
te4=pandas.cut(ata['SkinThickness'], 2, labels=['0','1'])
te5=pandas.cut(ata['Insulin'], 2, labels=['0','1'])
te6=pandas.cut(ata['BMI'], 2, labels=['0','1'])
te7=pandas.cut(ata['Pedigree'], 2, labels=['0','1'])
te8=pandas.cut(ata['Age'], 2, labels=['0','1'])
te=[te1,te2,te3,te4,te5,te6,te7,te8]
cX=pandas.DataFrame(data=te)
cX=cX.T
cata=pandas.concat([cX,ata.Outcome], axis=1)
import scipy.stats as stats
#generates the two way contingency tables for each variable
conglu=pandas.crosstab(cata['Glucose'], cata['Outcome'])
conbmi=pandas.crosstab(cata['BMI'], cata['Outcome'])
conage=pandas.crosstab(cata['Age'], cata['Outcome'])
conpreg=pandas.crosstab(cata['Pregnancies'], cata['Outcome'])
conped=pandas.crosstab(cata['Pedigree'], cata['Outcome'])
conins=pandas.crosstab(cata['Insulin'], cata['Outcome'])
conskin=pandas.crosstab(cata['SkinThickness'], cata['Outcome'])
conblood=pandas.crosstab(cata['BloodPressure'], cata['Outcome'])
#calculates the odds ratios from the tables
conoddglu = stats.fisher_exact(conglu)
conoddbmi = stats.fisher_exact(conbmi)
conoddage = stats.fisher_exact(conage)
conoddpreg = stats.fisher_exact(conpreg)
conoddped = stats.fisher_exact(conped)
conoddins = stats.fisher_exact(conins)
conoddskin = stats.fisher_exact(conskin)
conoddblood = stats.fisher_exact(conblood)
print("Glucose OddsR: ", conoddglu[0])
print("BMI OddsR: ", conoddbmi[0])
print("Age OddsR: ", conoddage[0])
print("Pregnancies OddsR: ", conoddpreg[0])
print("Pedigree OddsR: ", conoddped[0])
print("Insulin OddsR: ", conoddins[0])
print("SkinThickness OddsR: ", conoddskin[0])
print("BloodPressure OddsR: ", conoddblood[0])
Glucose OddsR: 8.93652037617555 BMI OddsR: 2.5734684477199448 Age OddsR: 1.6148936170212767 Pregnancies OddsR: 2.6526315789473682 Pedigree OddsR: 2.073202614379085 Insulin OddsR: 3.859375 SkinThickness OddsR: 0.9318181818181818 BloodPressure OddsR: 1.854251012145749
#Output variable
y=ata.Outcome
#Multiple input variables or features
X=ata.drop(columns='Outcome')
hlgX=X.drop(columns='Glucose')
#X with binarized Glucose variable
hlgX=pandas.concat([hlgX,cata.Glucose], axis=1)
Xt=X.to_numpy() #explanatory variables
hlgXt=hlgX.to_numpy()
yt=y.to_numpy() #bodyfat or output variable
#Binarized high/low glucose model
cmodel = LogisticRegression(solver='saga',max_iter=10000).fit(hlgXt, yt)
cscoresr2 = cross_validate(cmodel, cXt, yt, scoring="roc_auc", cv=crossvalidation, n_jobs=10, return_train_score=True)
print("High/Low Glucose Training "+"Folds: " + str(len(cscoresr2['train_score'])) + ", AUC: " + str(np.mean(np.abs(cscoresr2['train_score']))) + ", STD: " + str(np.std(cscoresr2['train_score'])))
print("High/Low Glucose Test "+"Folds: " + str(len(cscoresr2['test_score'])) + ", AUC: " + str(np.mean(np.abs(cscoresr2['test_score']))) + ", STD: " + str(np.std(cscoresr2['test_score'])))
High/Low Glucose Training Folds: 5, AUC: 0.7432069711424759, STD: 0.0053663171102469 High/Low Glucose Test Folds: 5, AUC: 0.7301923578099571, STD: 0.019305856016506587
cmodelc=cmodel.coef_
#model coefficients for binarized Glucose logistic regression model
pandas.DataFrame(index=hlgX.columns, data=np.transpose(cmodelc), columns=['High/Low Glucose'])
High/Low Glucose | |
---|---|
Pregnancies | 0.113083 |
BloodPressure | -0.024462 |
SkinThickness | -0.002657 |
Insulin | 0.001627 |
BMI | 0.021940 |
Pedigree | 0.143040 |
Age | 0.000134 |
Glucose | 0.478957 |
#odds ratio for Glucose variable
np.exp(0.478957)
1.6143897153083402
hlgX.describe()
Pregnancies | BloodPressure | SkinThickness | Insulin | BMI | Pedigree | Age | |
---|---|---|---|---|---|---|---|
count | 768.000000 | 768.000000 | 768.000000 | 768.000000 | 768.000000 | 768.000000 | 768.000000 |
mean | 3.845052 | 69.105469 | 20.536458 | 79.799479 | 31.992578 | 0.471876 | 33.240885 |
std | 3.369578 | 19.355807 | 15.952218 | 115.244002 | 7.884160 | 0.331329 | 11.760232 |
min | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.078000 | 21.000000 |
25% | 1.000000 | 62.000000 | 0.000000 | 0.000000 | 27.300000 | 0.243750 | 24.000000 |
50% | 3.000000 | 72.000000 | 23.000000 | 30.500000 | 32.000000 | 0.372500 | 29.000000 |
75% | 6.000000 | 80.000000 | 32.000000 | 127.250000 | 36.600000 | 0.626250 | 41.000000 |
max | 17.000000 | 122.000000 | 99.000000 | 846.000000 | 67.100000 | 2.420000 | 81.000000 |
#average feature scores
lXtest=np.array([4,69,21,80, 32,0.5,20,0])
hXtest=np.array([4,69,21,80, 32,0.5,20,1])
lXtest = np.tile(lXtest, (41,1))
hXtest = np.tile(hXtest, (41,1))
lXtest[:,6]=lXtest[:,6]+np.indices((41,))
hXtest[:,6]=hXtest[:,6]+np.indices((41,))
#uses models to find probabilities based on fixed Xtest
hprobs=cmodel.predict_proba(hXtest)[:,1]
lprobs=cmodel.predict_proba(lXtest)[:,1]
plt.plot(hXtest[:,6],hprobs)
plt.plot(lXtest[:,6],lprobs)
plt.legend(['high glucose','low glucose'])
plt.xlabel('age (years)')
plt.ylabel('probability of Diabetes')
Text(0, 0.5, 'probability of Diabetes')
hslope=(max(hprobs)-min(hprobs))/40
lslope=(max(lprobs)-min(lprobs))/40
(hslope-lslope)/lslope*100
15.490061466034538
from sklearn import tree
treemodel = tree.DecisionTreeClassifier(min_samples_leaf=10, max_depth=4).fit(X, y)
plt.figure(figsize=(50,40))
tree.plot_tree(treemodel, fontsize=20, feature_names=X.columns,filled=True)
[Text(0.5, 0.9, 'Glucose <= 127.5\ngini = 0.454\nsamples = 768\nvalue = [500, 268]'), Text(0.25, 0.7, 'Age <= 28.5\ngini = 0.313\nsamples = 485\nvalue = [391, 94]'), Text(0.125, 0.5, 'BMI <= 30.95\ngini = 0.155\nsamples = 271\nvalue = [248, 23]'), Text(0.0625, 0.3, 'BloodPressure <= 85.0\ngini = 0.026\nsamples = 151\nvalue = [149, 2]'), Text(0.03125, 0.1, 'gini = 0.014\nsamples = 141\nvalue = [140, 1]'), Text(0.09375, 0.1, 'gini = 0.18\nsamples = 10\nvalue = [9, 1]'), Text(0.1875, 0.3, 'BloodPressure <= 53.0\ngini = 0.289\nsamples = 120\nvalue = [99, 21]'), Text(0.15625, 0.1, 'gini = 0.496\nsamples = 11\nvalue = [6, 5]'), Text(0.21875, 0.1, 'gini = 0.25\nsamples = 109\nvalue = [93, 16]'), Text(0.375, 0.5, 'BMI <= 26.35\ngini = 0.443\nsamples = 214\nvalue = [143, 71]'), Text(0.3125, 0.3, 'BMI <= 22.85\ngini = 0.093\nsamples = 41\nvalue = [39, 2]'), Text(0.28125, 0.1, 'gini = 0.32\nsamples = 10\nvalue = [8, 2]'), Text(0.34375, 0.1, 'gini = 0.0\nsamples = 31\nvalue = [31, 0]'), Text(0.4375, 0.3, 'Glucose <= 99.5\ngini = 0.48\nsamples = 173\nvalue = [104, 69]'), Text(0.40625, 0.1, 'gini = 0.298\nsamples = 55\nvalue = [45, 10]'), Text(0.46875, 0.1, 'gini = 0.5\nsamples = 118\nvalue = [59, 59]'), Text(0.75, 0.7, 'BMI <= 29.95\ngini = 0.474\nsamples = 283\nvalue = [109, 174]'), Text(0.625, 0.5, 'Glucose <= 145.5\ngini = 0.432\nsamples = 76\nvalue = [52, 24]'), Text(0.5625, 0.3, 'Insulin <= 132.5\ngini = 0.25\nsamples = 41\nvalue = [35, 6]'), Text(0.53125, 0.1, 'gini = 0.337\nsamples = 28\nvalue = [22, 6]'), Text(0.59375, 0.1, 'gini = 0.0\nsamples = 13\nvalue = [13, 0]'), Text(0.6875, 0.3, 'Insulin <= 14.5\ngini = 0.5\nsamples = 35\nvalue = [17, 18]'), Text(0.65625, 0.1, 'gini = 0.472\nsamples = 21\nvalue = [13, 8]'), Text(0.71875, 0.1, 'gini = 0.408\nsamples = 14\nvalue = [4, 10]'), Text(0.875, 0.5, 'Glucose <= 157.5\ngini = 0.399\nsamples = 207\nvalue = [57, 150]'), Text(0.8125, 0.3, 'Age <= 30.5\ngini = 0.476\nsamples = 115\nvalue = [45, 70]'), Text(0.78125, 0.1, 'gini = 0.497\nsamples = 50\nvalue = [27, 23]'), Text(0.84375, 0.1, 'gini = 0.4\nsamples = 65\nvalue = [18, 47]'), Text(0.9375, 0.3, 'Pedigree <= 0.3\ngini = 0.227\nsamples = 92\nvalue = [12, 80]'), Text(0.90625, 0.1, 'gini = 0.365\nsamples = 25\nvalue = [6, 19]'), Text(0.96875, 0.1, 'gini = 0.163\nsamples = 67\nvalue = [6, 61]')]
from sklearn.ensemble import RandomForestClassifier
rmodel = RandomForestClassifier(min_samples_leaf=10).fit(Xt, yt)
cscoresr2 = cross_validate(rmodel, Xt, yt, scoring="roc_auc", cv=crossvalidation, n_jobs=10, return_train_score=True)
print("Random Forest Training "+"Folds: " + str(len(cscoresr2['train_score'])) + ", AUC: " + str(np.mean(np.abs(cscoresr2['train_score']))) + ", STD: " + str(np.std(cscoresr2['train_score'])))
print("Random Forest Test "+"Folds: " + str(len(cscoresr2['test_score'])) + ", AUC: " + str(np.mean(np.abs(cscoresr2['test_score']))) + ", STD: " + str(np.std(cscoresr2['test_score'])))
Random Forest Training Folds: 5, AUC: 0.9272431182529269, STD: 0.004339926953184873 Random Forest Test Folds: 5, AUC: 0.8357513084142825, STD: 0.029839078370195133
#relative importances for each of the explanatory variables
pandas.DataFrame(index=[X.columns], data=rmodel.feature_importances_, columns=['Random Forest'])
Random Forest | |
---|---|
Pregnancies | 0.083568 |
Glucose | 0.351063 |
BloodPressure | 0.035996 |
SkinThickness | 0.041866 |
Insulin | 0.065490 |
BMI | 0.183520 |
Pedigree | 0.090326 |
Age | 0.148171 |
iX=X
iX['Glucose*Age']=iX['Glucose']*iX['Age']
iXt=iX.to_numpy() #explanatory variables
#Traditional model
lmodel = LogisticRegression(solver='saga',max_iter=10000).fit(Xt, yt)
lscoresr2 = cross_validate(lmodel, Xt, yt, scoring="roc_auc", cv=crossvalidation, n_jobs=10, return_train_score=True)
print("Logistic Training "+"Folds: " + str(len(lscoresr2['train_score'])) + ", AUC: " + str(np.mean(np.abs(lscoresr2['train_score']))) + ", STD: " + str(np.std(lscoresr2['train_score'])))
print("Logistic Test "+"Folds: " + str(len(lscoresr2['test_score'])) + ", AUC: " + str(np.mean(np.abs(lscoresr2['test_score']))) + ", STD: " + str(np.std(lscoresr2['test_score'])))
#Traditional model with interaction for glucose and age
X['Glucose*Age']=X['Glucose']*X['Age']
ilmodel = LogisticRegression(solver='saga',max_iter=10000).fit(iXt, yt)
ilscoresr2 = cross_validate(ilmodel, iXt, yt, scoring="roc_auc", cv=crossvalidation, n_jobs=10, return_train_score=True)
print("Interactions Logistic Training "+"Folds: " + str(len(ilscoresr2['train_score'])) + ", AUC: " + str(np.mean(np.abs(ilscoresr2['train_score']))) + ", STD: " + str(np.std(ilscoresr2['train_score'])))
print("Interactions Logistic Test "+"Folds: " + str(len(ilscoresr2['test_score'])) + ", AUC: " + str(np.mean(np.abs(ilscoresr2['test_score']))) + ", STD: " + str(np.std(ilscoresr2['test_score'])))
Logistic Training Folds: 5, AUC: 0.6936373481770454, STD: 0.019878502108080856 Logistic Test Folds: 5, AUC: 0.6714270564037197, STD: 0.06656893602036591 Interactions Logistic Training Folds: 5, AUC: 0.7635738791515649, STD: 0.007032650900725648 Interactions Logistic Test Folds: 5, AUC: 0.7494879674224354, STD: 0.03799010243657058