#For ease of moving data from R to Python using rpy2
%load_ext rpy2.ipython

The rpy2.ipython extension is already loaded. To reload it, use:
  %reload_ext rpy2.ipython


%R ata <- read.csv("~/Downloads/framingham.csv")
%R utils::str(ata)

'data.frame':	3658 obs. of  16 variables:
 $ male           : int  1 0 1 0 0 0 0 0 1 1 ...
 $ age            : int  39 46 48 61 46 43 63 45 52 43 ...
 $ education      : int  4 2 1 3 3 2 1 2 1 1 ...
 $ currentSmoker  : int  0 0 1 1 1 0 0 1 0 1 ...
 $ cigsPerDay     : int  0 0 20 30 23 0 0 20 0 30 ...
 $ BPMeds         : int  0 0 0 0 0 0 0 0 0 0 ...
 $ prevalentStroke: int  0 0 0 0 0 0 0 0 0 0 ...
 $ prevalentHyp   : int  0 0 0 1 0 1 0 0 1 1 ...
 $ diabetes       : int  0 0 0 0 0 0 0 0 0 0 ...
 $ totChol        : int  195 250 245 225 285 228 205 313 260 225 ...
 $ sysBP          : num  106 121 128 150 130 ...
 $ diaBP          : num  70 81 80 95 84 110 71 71 89 107 ...
 $ BMI            : num  27 28.7 25.3 28.6 23.1 ...
 $ heartRate      : int  80 95 75 65 85 77 60 79 76 93 ...
 $ glucose        : int  77 76 70 103 85 99 85 78 79 88 ...
 $ TenYearCHD     : int  0 0 0 1 0 0 1 0 0 0 ...


%R library(Hmisc)
%R library(reshape)
%R library(ggplot2)
%R library(dplyr)
%R library(corrplot)


%R hist.data.frame(ata)

array([1.])


%R males=ata[ata$male==1,]
%R females=ata[ata$male==0,]
%R males$male <- NULL
%R females$male <- NULL


%R hist.data.frame(males)
%R hist.data.frame(females)

array([1.])


ata <- read.csv("~/Downloads/framingham.csv")
library(corrplot)
atacor=cor(ata)
corrplot.mixed(atacor, number.cex = 0.7, tl.cex = 0.7)

corrplot 0.92 loaded


install.packages('reshape')
library(reshape)
library(ggplot2)
meltata<-melt(ata)
box <- ggplot(meltata, aes(factor(variable), value))
box + geom_boxplot() + facet_wrap(~variable, scale="free")

The downloaded binary packages are in
	/var/folders/zz/wfk650fx7lx3mmyd6_y3yxgr0000gr/T//RtmpIKYb7R/downloaded_packages

Using  as id variables


import pandas
import sklearn
import pwlf
import numpy as np
import matplotlib.pyplot as plt
from tabulate import tabulate
import statsmodels.api as sm

#cross-validation for traditional model
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import KFold

#moves data from R to Python
ata = %R ata
males= %R males
females= %R females
#10fold cross validation
crossvalidation = KFold(n_splits=10, random_state=1, shuffle=True)
#adds interaction features, which is each feature multiplied by each other
interaction = PolynomialFeatures(degree=2, include_bias=False, interaction_only=True)


ata.describe()


#rough idea of importance of each feature using correlation scores, seen in R earlier too
CHD_corr = ata.corr()['TenYearCHD']
corr_CHD = CHD_corr.abs().sort_values(ascending=False)[1:]; round(corr_CHD,2)

age                0.23
sysBP              0.22
prevalentHyp       0.18
diaBP              0.15
glucose            0.12
diabetes           0.09
male               0.09
totChol            0.09
BPMeds             0.09
BMI                0.08
education          0.06
cigsPerDay         0.05
prevalentStroke    0.05
heartRate          0.02
currentSmoker      0.02
Name: TenYearCHD, dtype: float64


#makes multicategory variables into 4 binary columns
males_education = pandas.get_dummies(males['education'], prefix='Education_')
females_education = pandas.get_dummies(females['education'], prefix='Education_')

males=males.drop(columns='education')
females=females.drop(columns='education')

males=males.join(males_education)
females=females.join(females_education)


#Output variable
my=males.TenYearCHD

#Multiple input variables or features
mX=males.drop(columns='TenYearCHD')
mXinter = interaction.fit_transform(mX)

mXt=mX.to_numpy() #explanatory variables
myt=my.to_numpy() #bodyfat or output variable

#Traditional model
mmodel = LogisticRegression(solver='saga',max_iter=10000).fit(mXt, myt)
mscoresr2 = cross_validate(mmodel, mXt, myt, scoring="roc_auc", cv=crossvalidation, n_jobs=10, return_train_score=True)
print("Main effects Training "+"Folds: " + str(len(mscoresr2['train_score'])) + ", AUC: " + str(np.mean(np.abs(mscoresr2['train_score']))) + ", STD: " + str(np.std(mscoresr2['train_score'])))
print("Main effects Test "+"Folds: " + str(len(mscoresr2['test_score'])) + ", AUC: " + str(np.mean(np.abs(mscoresr2['test_score']))) + ", STD: " + str(np.std(mscoresr2['test_score'])))

#Interaction
mintermodel = LogisticRegression(solver='saga',max_iter=10000).fit(mXinter, myt)
minterscoresr2 = cross_validate(mintermodel, mXinter, myt, scoring="roc_auc", cv=crossvalidation, n_jobs=10, return_train_score=True)
print("Interaction Training "+"Folds: " + str(len(minterscoresr2['train_score'])) + ", AUC: " + str(np.mean(np.abs(minterscoresr2['train_score']))) + ", STD: " + str(np.std(minterscoresr2['train_score'])))
print("Interaction Test "+"Folds: " + str(len(minterscoresr2['test_score'])) + ", AUC: " + str(np.mean(np.abs(minterscoresr2['test_score']))) + ", STD: " + str(np.std(minterscoresr2['test_score'])))

Main effects Training Folds: 10, AUC: 0.6678472937733854, STD: 0.005957283263222948
Main effects Test Folds: 10, AUC: 0.653808532449952, STD: 0.04694871300142252
Interaction Training Folds: 10, AUC: 0.6705588953754003, STD: 0.00867409281720517
Interaction Test Folds: 10, AUC: 0.6262449094226785, STD: 0.059652835872936634


#Output variable
fy=females.TenYearCHD

#Multiple input variables or features
fX=females.drop(columns='TenYearCHD')
fXinter = interaction.fit_transform(fX)


fXt=fX.to_numpy() #explanatory variables
fyt=fy.to_numpy() #bodyfat or output variable

#Traditional model
fmodel = LogisticRegression(solver='saga', max_iter=10000).fit(fXt, fyt)
fscoresr2 = cross_validate(fmodel, fXt, fyt, scoring="roc_auc", cv=crossvalidation, n_jobs=10, return_train_score=True)
print("Main effects Training "+"Folds: " + str(len(fscoresr2['train_score'])) + ", AUC: " + str(np.mean(np.abs(fscoresr2['train_score']))) + ", STD: " + str(np.std(fscoresr2['train_score'])))
print("Main effects Test "+"Folds: " + str(len(fscoresr2['test_score'])) + ", AUC: " + str(np.mean(np.abs(fscoresr2['test_score']))) + ", STD: " + str(np.std(fscoresr2['test_score'])))

#Interactions model
fintermodel = LogisticRegression(solver='saga', max_iter=10000).fit(fXinter, fyt)
finterscoresr2 = cross_validate(fintermodel, fXinter, fyt, scoring="roc_auc", cv=crossvalidation, n_jobs=10, return_train_score=True)
print("Interaction Training "+"Folds: " + str(len(finterscoresr2['train_score'])) + ", AUC: " + str(np.mean(np.abs(finterscoresr2['train_score']))) + ", STD: " + str(np.std(finterscoresr2['train_score'])))
print("Interaction Test "+"Folds: " + str(len(finterscoresr2['test_score'])) + ", AUC: " + str(np.mean(np.abs(finterscoresr2['test_score']))) + ", STD: " + str(np.std(finterscoresr2['test_score'])))

Main effects Training Folds: 10, AUC: 0.6872968637679028, STD: 0.005287517767760473
Main effects Test Folds: 10, AUC: 0.6697437383574523, STD: 0.0510057491474261
Interaction Training Folds: 10, AUC: 0.68164514633492, STD: 0.0056144071166999415
Interaction Test Folds: 10, AUC: 0.6393565714779978, STD: 0.038424747104617615


pandas.crosstab(males['currentSmoker'], males['TenYearCHD'])


#odds ratio for males
(532*197)/(110*784)

1.2152597402597403


pandas.crosstab(females['currentSmoker'], females['TenYearCHD'])


#odds ratio for females
(1065*88)/(162*720)

0.8034979423868313


modds=np.exp(mmodel.coef_)


fodds=np.exp(fmodel.coef_)


#odds ratios for binary variables
compares=pandas.DataFrame(index=mX.columns, data=np.transpose(np.vstack((modds,fodds))), columns=['male','female'])
binarycompares=compares.drop(['age','cigsPerDay', 'totChol','sysBP', 'diaBP', 'BMI', 'heartRate', 'glucose'])
binarycompares


#odds ratios for continuous variables
continuouscompares=compares.drop(['currentSmoker','BPMeds','prevalentStroke','prevalentHyp','diabetes','Education__1','Education__2','Education__3','Education__4'])
continuouscompares


mX.describe()


mcoeff=mmodel.coef_
fcoeff=fmodel.coef_
comparesm=pandas.DataFrame(index=mX.columns, data=np.transpose(np.vstack((mcoeff,fcoeff))), columns=['male','female'])
comparesm


#average feature scores
Xtest=np.array([35, 0, 0, 0, 0, 0, 0, 235, 130, 83, 26, 74, 81, 0, 1, 0, 0])


Xtest = np.tile(Xtest, (31,1))


Xtest[:,0]=Xtest[:,0]+np.indices((31,))


#uses models to find probabilities based on fixed Xtest
mprobs=mmodel.predict_proba(Xtest)[:,1]
fprobs=fmodel.predict_proba(Xtest)[:,1]


plt.plot(Xtest[:,0],mprobs)
plt.plot(Xtest[:,0],fprobs)
plt.legend(['male','female'])
plt.xlabel('age (years)')
plt.ylabel('probability of TenYearCHD')

Text(0, 0.5, 'probability of TenYearCHD')

	male	age	education	currentSmoker	cigsPerDay	BPMeds	prevalentStroke	prevalentHyp	diabetes	totChol	sysBP	diaBP	BMI	heartRate	glucose	TenYearCHD
count	3658.000000	3658.000000	3658.000000	3658.000000	3658.000000	3658.000000	3658.000000	3658.000000	3658.000000	3658.000000	3658.000000	3658.000000	3658.000000	3658.000000	3658.000000	3658.000000
mean	0.443685	49.551941	1.980317	0.489065	9.025424	0.030344	0.005741	0.311646	0.027064	236.847731	132.370558	82.917031	25.782802	75.730727	81.852925	0.152269
std	0.496886	8.562029	1.022656	0.499949	11.921590	0.171557	0.075561	0.463229	0.162292	44.097681	22.086866	11.974258	4.065601	11.981525	23.904164	0.359331
min	0.000000	32.000000	1.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	113.000000	83.500000	48.000000	15.540000	44.000000	40.000000	0.000000
25%	0.000000	42.000000	1.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	206.000000	117.000000	75.000000	23.080000	68.000000	71.000000	0.000000
50%	0.000000	49.000000	2.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	234.000000	128.000000	82.000000	25.380000	75.000000	78.000000	0.000000
75%	1.000000	56.000000	3.000000	1.000000	20.000000	0.000000	0.000000	1.000000	0.000000	263.000000	143.875000	90.000000	28.037500	82.000000	87.000000	0.000000
max	1.000000	70.000000	4.000000	1.000000	70.000000	1.000000	1.000000	1.000000	1.000000	600.000000	295.000000	142.500000	56.800000	143.000000	394.000000	1.000000

	male	female
currentSmoker	0.936733	0.817650
BPMeds	1.091402	1.172306
prevalentStroke	1.055553	1.046149
prevalentHyp	1.681934	1.964095
diabetes	1.189329	1.102967
Education__1	1.025964	1.187471
Education__2	0.809069	0.817745
Education__3	0.852386	0.885750
Education__4	0.997989	0.836805

	male	female
age	1.027416	1.026947
cigsPerDay	1.012160	1.021668
totChol	0.999897	0.997723
sysBP	1.019178	1.018848
diaBP	0.977011	0.967797
BMI	0.914913	0.975518
heartRate	0.982814	0.971011
glucose	1.004282	1.004130

	age	currentSmoker	cigsPerDay	BPMeds	prevalentStroke	prevalentHyp	diabetes	totChol	sysBP	diaBP	BMI	heartRate	glucose	Education__1	Education__2	Education__3	Education__4
count	1623.00000	1623.000000	1623.000000	1623.000000	1623.000000	1623.000000	1623.000000	1623.000000	1623.000000	1623.000000	1623.000000	1623.000000	1623.000000	1623.000000	1623.000000	1623.000000	1623.000000
mean	49.31793	0.604436	13.434381	0.020333	0.005545	0.312384	0.029575	233.375847	131.248922	83.612446	26.115595	74.184227	81.931608	0.438078	0.277880	0.129390	0.154652
std	8.54328	0.489122	13.761326	0.141179	0.074283	0.463609	0.169464	41.108145	19.402312	11.510911	3.389635	11.625313	24.310991	0.496304	0.448092	0.335735	0.361684
min	33.00000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	113.000000	83.500000	48.000000	15.540000	44.000000	40.000000	0.000000	0.000000	0.000000	0.000000
25%	42.00000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	206.000000	118.000000	76.000000	23.945000	66.000000	70.000000	0.000000	0.000000	0.000000	0.000000
50%	48.00000	1.000000	15.000000	0.000000	0.000000	0.000000	0.000000	231.000000	128.000000	82.000000	26.000000	75.000000	78.000000	0.000000	0.000000	0.000000	0.000000
75%	56.00000	1.000000	20.000000	0.000000	0.000000	1.000000	0.000000	259.000000	141.000000	90.000000	28.300000	80.000000	87.000000	1.000000	1.000000	0.000000	0.000000
max	69.00000	1.000000	70.000000	1.000000	1.000000	1.000000	1.000000	453.000000	232.000000	136.000000	40.380000	125.000000	394.000000	1.000000	1.000000	1.000000	1.000000

	male	female
age	0.027048	0.026590
currentSmoker	-0.065373	-0.201254
cigsPerDay	0.012087	0.021433
BPMeds	0.087462	0.158924
prevalentStroke	0.054060	0.045091
prevalentHyp	0.519962	0.675055
diabetes	0.173386	0.097954
totChol	-0.000103	-0.002279
sysBP	0.018996	0.018673
diaBP	-0.023257	-0.032734
BMI	-0.088924	-0.024786
heartRate	-0.017335	-0.029418
glucose	0.004273	0.004121
Education__1	0.025601	0.171819
Education__2	-0.211923	-0.201186
Education__3	-0.159736	-0.121320
Education__4	-0.001998	-0.178153

MSDS Assignment 2 Practical Machine Learning¶

Data Visualization in R for FHS data¶

All of the variables look normally distributed. Age does look skewed.¶

We see many outliers in this dataset in the boxplots, likely due to the binary nature of the outcomes.¶

Logisitic Regression Model in Python¶

Males dataset¶

Females dataset¶

Two-way contingency tables¶

TenYearCHD	0	1
currentSmoker
0	532	110
1	784	197

TenYearCHD	0	1
currentSmoker
0	1065	162
1	720	88