#For ease of moving data from R to Python using rpy2
%load_ext rpy2.ipython


%R ata <- read.csv("~/Downloads/pima.csv")
%R utils::str(ata)

'data.frame':	768 obs. of  9 variables:
 $ Pregnancies  : int  6 1 8 1 0 5 3 10 2 8 ...
 $ Glucose      : int  148 85 183 89 137 116 78 115 197 125 ...
 $ BloodPressure: int  72 66 64 66 40 74 50 0 70 96 ...
 $ SkinThickness: int  35 29 0 23 35 0 32 0 45 0 ...
 $ Insulin      : int  0 0 0 94 168 0 88 0 543 0 ...
 $ BMI          : num  33.6 26.6 23.3 28.1 43.1 25.6 31 35.3 30.5 0 ...
 $ Pedigree     : num  0.627 0.351 0.672 0.167 2.288 ...
 $ Age          : int  50 31 32 21 33 30 26 29 53 54 ...
 $ Outcome      : int  1 0 1 0 1 0 1 0 1 1 ...


%R library(Hmisc)
%R library(reshape)
%R library(ggplot2)
%R library(dplyr)
%R library(corrplot)

R[write to console]: Loading required package: lattice

R[write to console]: Loading required package: survival

R[write to console]: Loading required package: Formula

R[write to console]: Loading required package: ggplot2

R[write to console]: 
Attaching package: ‘Hmisc’


R[write to console]: The following objects are masked from ‘package:base’:

    format.pval, units


R[write to console]: 
Attaching package: ‘dplyr’


R[write to console]: The following object is masked from ‘package:reshape’:

    rename


R[write to console]: The following objects are masked from ‘package:Hmisc’:

    src, summarize


R[write to console]: The following objects are masked from ‘package:stats’:

    filter, lag


R[write to console]: The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union


R[write to console]: corrplot 0.92 loaded


%R hist.data.frame(ata)

array([1.])


ata <- read.csv("~/Downloads/pima.csv")
library(corrplot)
atacor=cor(ata)
corrplot.mixed(atacor, number.cex = 1.3, tl.cex = 0.8)


install.packages('reshape')
library(reshape)
library(ggplot2)
meltata<-melt(ata)
box <- ggplot(meltata, aes(factor(variable), value))
box + geom_boxplot() + facet_wrap(~variable, scale="free")

The downloaded binary packages are in
	/var/folders/zz/wfk650fx7lx3mmyd6_y3yxgr0000gr/T//RtmpDejiPV/downloaded_packages

Using  as id variables


import pandas
import sklearn
import pwlf
import numpy as np
import matplotlib.pyplot as plt
from tabulate import tabulate
import statsmodels.api as sm

#cross-validation for traditional model
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import KFold

#moves data from R to Python
ata = %R ata

#5fold cross validation
crossvalidation = KFold(n_splits=5, random_state=1, shuffle=True)


ata.describe()


#rough idea of importance of each feature using correlation scores, seen in R earlier too
CHD_corr = ata.corr()['Outcome']
corr_CHD = CHD_corr.abs().sort_values(ascending=False)[1:]; round(corr_CHD,2)

Glucose          0.47
BMI              0.29
Age              0.24
Pregnancies      0.22
Pedigree         0.17
Insulin          0.13
SkinThickness    0.07
BloodPressure    0.07
Name: Outcome, dtype: float64


#binarize each of the variables using the cut function in pandas

te1=pandas.cut(ata['BloodPressure'], 2, labels=['0','1'])
te2=pandas.cut(ata['Pregnancies'], 2, labels=['0','1'])
te3=pandas.cut(ata['Glucose'], 2, labels=['0','1'])
te4=pandas.cut(ata['SkinThickness'], 2, labels=['0','1'])
te5=pandas.cut(ata['Insulin'], 2, labels=['0','1'])
te6=pandas.cut(ata['BMI'], 2, labels=['0','1'])
te7=pandas.cut(ata['Pedigree'], 2, labels=['0','1'])
te8=pandas.cut(ata['Age'], 2, labels=['0','1'])
te=[te1,te2,te3,te4,te5,te6,te7,te8]
cX=pandas.DataFrame(data=te)
cX=cX.T
cata=pandas.concat([cX,ata.Outcome], axis=1)


import scipy.stats as stats

#generates the two way contingency tables for each variable
conglu=pandas.crosstab(cata['Glucose'], cata['Outcome'])
conbmi=pandas.crosstab(cata['BMI'], cata['Outcome'])
conage=pandas.crosstab(cata['Age'], cata['Outcome'])
conpreg=pandas.crosstab(cata['Pregnancies'], cata['Outcome'])
conped=pandas.crosstab(cata['Pedigree'], cata['Outcome'])
conins=pandas.crosstab(cata['Insulin'], cata['Outcome'])
conskin=pandas.crosstab(cata['SkinThickness'], cata['Outcome'])
conblood=pandas.crosstab(cata['BloodPressure'], cata['Outcome'])

#calculates the odds ratios from the tables
conoddglu = stats.fisher_exact(conglu)
conoddbmi = stats.fisher_exact(conbmi)
conoddage = stats.fisher_exact(conage)
conoddpreg = stats.fisher_exact(conpreg)
conoddped = stats.fisher_exact(conped)
conoddins = stats.fisher_exact(conins)
conoddskin = stats.fisher_exact(conskin)
conoddblood = stats.fisher_exact(conblood)

print("Glucose OddsR: ", conoddglu[0])
print("BMI OddsR: ", conoddbmi[0])
print("Age OddsR: ", conoddage[0])
print("Pregnancies OddsR: ", conoddpreg[0])
print("Pedigree OddsR: ", conoddped[0])
print("Insulin OddsR: ", conoddins[0])
print("SkinThickness OddsR: ", conoddskin[0])
print("BloodPressure OddsR: ", conoddblood[0])

Glucose OddsR:  8.93652037617555
BMI OddsR:  2.5734684477199448
Age OddsR:  1.6148936170212767
Pregnancies OddsR:  2.6526315789473682
Pedigree OddsR:  2.073202614379085
Insulin OddsR:  3.859375
SkinThickness OddsR:  0.9318181818181818
BloodPressure OddsR:  1.854251012145749


#Output variable
y=ata.Outcome

#Multiple input variables or features
X=ata.drop(columns='Outcome')
hlgX=X.drop(columns='Glucose')

#X with binarized Glucose variable
hlgX=pandas.concat([hlgX,cata.Glucose], axis=1)

Xt=X.to_numpy() #explanatory variables
hlgXt=hlgX.to_numpy()
yt=y.to_numpy() #bodyfat or output variable

#Binarized high/low glucose model
cmodel = LogisticRegression(solver='saga',max_iter=10000).fit(hlgXt, yt)
cscoresr2 = cross_validate(cmodel, cXt, yt, scoring="roc_auc", cv=crossvalidation, n_jobs=10, return_train_score=True)
print("High/Low Glucose Training "+"Folds: " + str(len(cscoresr2['train_score'])) + ", AUC: " + str(np.mean(np.abs(cscoresr2['train_score']))) + ", STD: " + str(np.std(cscoresr2['train_score'])))
print("High/Low Glucose Test "+"Folds: " + str(len(cscoresr2['test_score'])) + ", AUC: " + str(np.mean(np.abs(cscoresr2['test_score']))) + ", STD: " + str(np.std(cscoresr2['test_score'])))

High/Low Glucose Training Folds: 5, AUC: 0.7432069711424759, STD: 0.0053663171102469
High/Low Glucose Test Folds: 5, AUC: 0.7301923578099571, STD: 0.019305856016506587


cmodelc=cmodel.coef_


#model coefficients for binarized Glucose logistic regression model
pandas.DataFrame(index=hlgX.columns, data=np.transpose(cmodelc), columns=['High/Low Glucose'])


#odds ratio for Glucose variable
np.exp(0.478957)

1.6143897153083402


hlgX.describe()


#average feature scores
lXtest=np.array([4,69,21,80, 32,0.5,20,0])
hXtest=np.array([4,69,21,80, 32,0.5,20,1])


lXtest = np.tile(lXtest, (41,1))
hXtest = np.tile(hXtest, (41,1))


lXtest[:,6]=lXtest[:,6]+np.indices((41,))
hXtest[:,6]=hXtest[:,6]+np.indices((41,))


#uses models to find probabilities based on fixed Xtest
hprobs=cmodel.predict_proba(hXtest)[:,1]
lprobs=cmodel.predict_proba(lXtest)[:,1]


plt.plot(hXtest[:,6],hprobs)
plt.plot(lXtest[:,6],lprobs)
plt.legend(['high glucose','low glucose'])
plt.xlabel('age (years)')
plt.ylabel('probability of Diabetes')

Text(0, 0.5, 'probability of Diabetes')


hslope=(max(hprobs)-min(hprobs))/40


lslope=(max(lprobs)-min(lprobs))/40


(hslope-lslope)/lslope*100

15.490061466034538


from sklearn import tree
treemodel = tree.DecisionTreeClassifier(min_samples_leaf=10, max_depth=4).fit(X, y)


plt.figure(figsize=(50,40))
tree.plot_tree(treemodel, fontsize=20, feature_names=X.columns,filled=True)

[Text(0.5, 0.9, 'Glucose <= 127.5\ngini = 0.454\nsamples = 768\nvalue = [500, 268]'),
 Text(0.25, 0.7, 'Age <= 28.5\ngini = 0.313\nsamples = 485\nvalue = [391, 94]'),
 Text(0.125, 0.5, 'BMI <= 30.95\ngini = 0.155\nsamples = 271\nvalue = [248, 23]'),
 Text(0.0625, 0.3, 'BloodPressure <= 85.0\ngini = 0.026\nsamples = 151\nvalue = [149, 2]'),
 Text(0.03125, 0.1, 'gini = 0.014\nsamples = 141\nvalue = [140, 1]'),
 Text(0.09375, 0.1, 'gini = 0.18\nsamples = 10\nvalue = [9, 1]'),
 Text(0.1875, 0.3, 'BloodPressure <= 53.0\ngini = 0.289\nsamples = 120\nvalue = [99, 21]'),
 Text(0.15625, 0.1, 'gini = 0.496\nsamples = 11\nvalue = [6, 5]'),
 Text(0.21875, 0.1, 'gini = 0.25\nsamples = 109\nvalue = [93, 16]'),
 Text(0.375, 0.5, 'BMI <= 26.35\ngini = 0.443\nsamples = 214\nvalue = [143, 71]'),
 Text(0.3125, 0.3, 'BMI <= 22.85\ngini = 0.093\nsamples = 41\nvalue = [39, 2]'),
 Text(0.28125, 0.1, 'gini = 0.32\nsamples = 10\nvalue = [8, 2]'),
 Text(0.34375, 0.1, 'gini = 0.0\nsamples = 31\nvalue = [31, 0]'),
 Text(0.4375, 0.3, 'Glucose <= 99.5\ngini = 0.48\nsamples = 173\nvalue = [104, 69]'),
 Text(0.40625, 0.1, 'gini = 0.298\nsamples = 55\nvalue = [45, 10]'),
 Text(0.46875, 0.1, 'gini = 0.5\nsamples = 118\nvalue = [59, 59]'),
 Text(0.75, 0.7, 'BMI <= 29.95\ngini = 0.474\nsamples = 283\nvalue = [109, 174]'),
 Text(0.625, 0.5, 'Glucose <= 145.5\ngini = 0.432\nsamples = 76\nvalue = [52, 24]'),
 Text(0.5625, 0.3, 'Insulin <= 132.5\ngini = 0.25\nsamples = 41\nvalue = [35, 6]'),
 Text(0.53125, 0.1, 'gini = 0.337\nsamples = 28\nvalue = [22, 6]'),
 Text(0.59375, 0.1, 'gini = 0.0\nsamples = 13\nvalue = [13, 0]'),
 Text(0.6875, 0.3, 'Insulin <= 14.5\ngini = 0.5\nsamples = 35\nvalue = [17, 18]'),
 Text(0.65625, 0.1, 'gini = 0.472\nsamples = 21\nvalue = [13, 8]'),
 Text(0.71875, 0.1, 'gini = 0.408\nsamples = 14\nvalue = [4, 10]'),
 Text(0.875, 0.5, 'Glucose <= 157.5\ngini = 0.399\nsamples = 207\nvalue = [57, 150]'),
 Text(0.8125, 0.3, 'Age <= 30.5\ngini = 0.476\nsamples = 115\nvalue = [45, 70]'),
 Text(0.78125, 0.1, 'gini = 0.497\nsamples = 50\nvalue = [27, 23]'),
 Text(0.84375, 0.1, 'gini = 0.4\nsamples = 65\nvalue = [18, 47]'),
 Text(0.9375, 0.3, 'Pedigree <= 0.3\ngini = 0.227\nsamples = 92\nvalue = [12, 80]'),
 Text(0.90625, 0.1, 'gini = 0.365\nsamples = 25\nvalue = [6, 19]'),
 Text(0.96875, 0.1, 'gini = 0.163\nsamples = 67\nvalue = [6, 61]')]


from sklearn.ensemble import RandomForestClassifier
rmodel = RandomForestClassifier(min_samples_leaf=10).fit(Xt, yt)
cscoresr2 = cross_validate(rmodel, Xt, yt, scoring="roc_auc", cv=crossvalidation, n_jobs=10, return_train_score=True)
print("Random Forest Training "+"Folds: " + str(len(cscoresr2['train_score'])) + ", AUC: " + str(np.mean(np.abs(cscoresr2['train_score']))) + ", STD: " + str(np.std(cscoresr2['train_score'])))
print("Random Forest Test "+"Folds: " + str(len(cscoresr2['test_score'])) + ", AUC: " + str(np.mean(np.abs(cscoresr2['test_score']))) + ", STD: " + str(np.std(cscoresr2['test_score'])))

Random Forest Training Folds: 5, AUC: 0.9272431182529269, STD: 0.004339926953184873
Random Forest Test Folds: 5, AUC: 0.8357513084142825, STD: 0.029839078370195133


#relative importances for each of the explanatory variables
pandas.DataFrame(index=[X.columns], data=rmodel.feature_importances_, columns=['Random Forest'])


iX=X
iX['Glucose*Age']=iX['Glucose']*iX['Age']


iXt=iX.to_numpy() #explanatory variables

#Traditional model
lmodel = LogisticRegression(solver='saga',max_iter=10000).fit(Xt, yt)
lscoresr2 = cross_validate(lmodel, Xt, yt, scoring="roc_auc", cv=crossvalidation, n_jobs=10, return_train_score=True)
print("Logistic Training "+"Folds: " + str(len(lscoresr2['train_score'])) + ", AUC: " + str(np.mean(np.abs(lscoresr2['train_score']))) + ", STD: " + str(np.std(lscoresr2['train_score'])))
print("Logistic Test "+"Folds: " + str(len(lscoresr2['test_score'])) + ", AUC: " + str(np.mean(np.abs(lscoresr2['test_score']))) + ", STD: " + str(np.std(lscoresr2['test_score'])))

#Traditional model with interaction for glucose and age

X['Glucose*Age']=X['Glucose']*X['Age']

ilmodel = LogisticRegression(solver='saga',max_iter=10000).fit(iXt, yt)
ilscoresr2 = cross_validate(ilmodel, iXt, yt, scoring="roc_auc", cv=crossvalidation, n_jobs=10, return_train_score=True)
print("Interactions Logistic Training "+"Folds: " + str(len(ilscoresr2['train_score'])) + ", AUC: " + str(np.mean(np.abs(ilscoresr2['train_score']))) + ", STD: " + str(np.std(ilscoresr2['train_score'])))
print("Interactions Logistic Test "+"Folds: " + str(len(ilscoresr2['test_score'])) + ", AUC: " + str(np.mean(np.abs(ilscoresr2['test_score']))) + ", STD: " + str(np.std(ilscoresr2['test_score'])))

Logistic Training Folds: 5, AUC: 0.6936373481770454, STD: 0.019878502108080856
Logistic Test Folds: 5, AUC: 0.6714270564037197, STD: 0.06656893602036591
Interactions Logistic Training Folds: 5, AUC: 0.7635738791515649, STD: 0.007032650900725648
Interactions Logistic Test Folds: 5, AUC: 0.7494879674224354, STD: 0.03799010243657058

	Pregnancies	Glucose	BloodPressure	SkinThickness	Insulin	BMI	Pedigree	Age	Outcome
count	768.000000	768.000000	768.000000	768.000000	768.000000	768.000000	768.000000	768.000000	768.000000
mean	3.845052	120.894531	69.105469	20.536458	79.799479	31.992578	0.471876	33.240885	0.348958
std	3.369578	31.972618	19.355807	15.952218	115.244002	7.884160	0.331329	11.760232	0.476951
min	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.078000	21.000000	0.000000
25%	1.000000	99.000000	62.000000	0.000000	0.000000	27.300000	0.243750	24.000000	0.000000
50%	3.000000	117.000000	72.000000	23.000000	30.500000	32.000000	0.372500	29.000000	0.000000
75%	6.000000	140.250000	80.000000	32.000000	127.250000	36.600000	0.626250	41.000000	1.000000
max	17.000000	199.000000	122.000000	99.000000	846.000000	67.100000	2.420000	81.000000	1.000000

	Pregnancies	BloodPressure	SkinThickness	Insulin	BMI	Pedigree	Age
count	768.000000	768.000000	768.000000	768.000000	768.000000	768.000000	768.000000
mean	3.845052	69.105469	20.536458	79.799479	31.992578	0.471876	33.240885
std	3.369578	19.355807	15.952218	115.244002	7.884160	0.331329	11.760232
min	0.000000	0.000000	0.000000	0.000000	0.000000	0.078000	21.000000
25%	1.000000	62.000000	0.000000	0.000000	27.300000	0.243750	24.000000
50%	3.000000	72.000000	23.000000	30.500000	32.000000	0.372500	29.000000
75%	6.000000	80.000000	32.000000	127.250000	36.600000	0.626250	41.000000
max	17.000000	122.000000	99.000000	846.000000	67.100000	2.420000	81.000000

MSDS Assignment 3 Practical Machine Learning¶

Data Visualization in R for Pima Indians data¶

All of the variables look normally distributed - glucose, blood pressure, skin thickness and BMI. Pregnancies, pedigree, insulin and age look skewed to the left.¶

Glucose, BMI, Age and Pregnancies look like important features correlated with the Diabetes outcome.¶

We see many outliers in this dataset in the boxplots,for insulin, Age, Pregnancies and Pedigree - likely due to the left skew seen in the histograms.¶

Analysis in Python¶

Two-way contingency tables¶

Odds ratio from the model is 1.6 compared to 9.0 from the contingency table for Glucose.¶

Slope of high glucose probability is 15.5% higher than the low glucose probability slope.¶

	High/Low Glucose
Pregnancies	0.113083
BloodPressure	-0.024462
SkinThickness	-0.002657
Insulin	0.001627
BMI	0.021940
Pedigree	0.143040
Age	0.000134
Glucose	0.478957

	Random Forest
Pregnancies	0.083568
Glucose	0.351063
BloodPressure	0.035996
SkinThickness	0.041866
Insulin	0.065490
BMI	0.183520
Pedigree	0.090326
Age	0.148171