MSDS Assignment 4 Practical Machine Learning¶

Data Analysis in Python for DNA Microarray data¶

Assignment 4 Apoorv Saraogee Spring 2022 MSDS 499 Practical Machine Learning

Visualization in R¶

In [2]:
#For ease of moving data from R to Python using rpy2
%load_ext rpy2.ipython
In [3]:
#Reading in data for DNA microarrays

%R ata <- read.csv("~/Downloads/dna.csv")
%R utils::str(ata)
'data.frame':	62 obs. of  93 variables:
 $ T95018  : num  0.2036 -1.2943 0.7084 0.378 0.0201 ...
 $ T61609  : num  -0.222 -1.9 1.244 0.656 0.458 ...
 $ T71025  : num  0.01447 -0.90494 0.00973 -0.33517 -1.12808 ...
 $ X55715  : num  1.463 0.365 -0.462 1.722 0.565 ...
 $ T52185  : num  0.263 -1.735 0.243 0.701 -1.304 ...
 $ T57633  : num  0.96982 -0.24869 -1.66495 1.07952 0.00331 ...
 $ R78934  : num  0.193 0.589 0.282 -0.206 -0.459 ...
 $ T51529  : num  -0.343 -1.3 1.018 0.534 0.717 ...
 $ M26697  : num  -1.244 -1.045 0.492 0.883 1.266 ...
 $ T51023  : num  -0.241 -1.169 2.832 -0.466 0.562 ...
 $ D63874  : num  -0.341 -2.685 1.701 -0.109 -0.481 ...
 $ T57630  : num  0.929 -0.488 -1.386 1.29 -0.595 ...
 $ M36981  : num  -1.012 -1.412 0.449 0.158 1.871 ...
 $ M76378  : num  0.0495 -0.4712 0.2547 -0.8791 -0.3263 ...
 $ M63391  : num  -0.0866 -1.0607 0.1625 -0.9418 -0.7491 ...
 $ M76378.2: num  -0.00248 -0.5894 0.13031 -1.02753 -0.33742 ...
 $ T79152  : num  1.9037 0.0691 3.073 0.4315 0.5167 ...
 $ H64489  : num  -0.3955 -2.1448 0.0658 0.3195 -1.3522 ...
 $ X15183  : num  0.8533 -0.1398 0.0576 1.3885 0.9029 ...
 $ X14958  : num  0.6959 -0.8503 0.3135 0.0466 0.9907 ...
 $ Z50753  : num  0.382 0.678 -1.186 -0.204 -1.073 ...
 $ D31885  : num  0.7306 -2.0175 0.0769 0.6365 0.6402 ...
 $ U30825  : num  -0.668 -1.146 1.021 0.292 2.199 ...
 $ T60155  : num  0.406 1.028 -0.178 -1.375 -0.758 ...
 $ H40560  : num  -0.137 -2.815 1.418 0.939 -0.942 ...
 $ R87126  : num  1.022 0.685 -0.63 -0.788 -0.947 ...
 $ M22382  : num  0.333 -2.616 1.481 0.526 1.213 ...
 $ R42501  : num  0.0058 -0.7084 0.6827 0.1972 0.8076 ...
 $ T51571  : num  0.147 0.643 -0.103 -0.627 1.523 ...
 $ X12671  : num  0.329 -0.272 -0.433 0.742 0.73 ...
 $ X70944  : num  0.986 -1.2751 -0.5193 0.2931 0.0458 ...
 $ M76378.3: num  0.3407 -0.08615 0.45389 -0.53595 0.00365 ...
 $ H40095  : num  0.237 -0.573 0.735 0.224 1.33 ...
 $ X70326  : num  1.6067 -0.0869 -1.2943 0.6041 0.137 ...
 $ Z49269  : num  0.3443 -0.1487 0.0849 -1.3263 -0.5519 ...
 $ T92451  : num  0.0943 0.1826 -0.2065 -1.1536 -0.4986 ...
 $ Z49269.2: num  0.2549 -0.103 -0.0903 -1.454 -0.3518 ...
 $ R75843  : num  1.8133 -0.7328 -0.5033 1.2344 -0.0176 ...
 $ U29092  : num  -0.309 -2.676 -0.874 0.273 0.264 ...
 $ H43887  : num  -0.748 -0.112 -1.359 -0.573 -1.459 ...
 $ H11719  : num  -0.9742 2.1191 0.0135 -0.2436 -1.1444 ...
 $ T86473  : num  0.27442 -0.00375 -0.22985 1.14283 2.03499 ...
 $ X12466  : num  0.253 -1.004 0.077 -0.193 1.659 ...
 $ R08183  : num  0.594 -2.092 1.247 0.114 1.794 ...
 $ R36977  : num  -0.206 -0.869 1.446 -0.181 2.479 ...
 $ M80815  : num  -1.0806 -1.5086 0.0945 -0.2324 -0.2714 ...
 $ U09564  : num  -0.257 -1.408 0.169 -0.713 1.856 ...
 $ L08069  : num  0.115 -1.933 1.094 -0.164 1.31 ...
 $ R84411  : num  1.059 -0.605 0.407 -0.723 0.656 ...
 $ T96873  : num  -0.944 -1.849 0.464 -0.366 0.322 ...
 $ X74295  : num  -0.902 -0.351 0.465 -1.083 -1.884 ...
 $ T40454  : num  -1.8135 -2.3993 0.1183 0.0117 1.336 ...
 $ X12496  : num  -0.238 1.604 -0.974 -1.111 0.173 ...
 $ T47377  : num  0.642 -0.915 0.24 1.292 0.879 ...
 $ T62947  : num  -0.328 -0.588 0.79 0.07 1.497 ...
 $ L05144  : num  0.841 -0.921 -0.628 -0.359 -2.34 ...
 $ U26312  : num  0.923 -0.654 -0.687 0.495 1.218 ...
 $ H77597  : num  -0.431 -0.647 -0.609 -0.428 -1.472 ...
 $ R64115  : num  1.903 -1.079 0.438 0.609 0.672 ...
 $ J02854  : num  0.348 0.74 -0.293 -0.396 -0.293 ...
 $ L41559  : num  0.6646 0.0567 -0.8675 -0.3398 0.2183 ...
 $ L25941  : num  0.194 -1.009 -0.345 0.859 1.232 ...
 $ X86693  : num  -0.646 -0.2447 -0.0663 -1.5543 -1.2357 ...
 $ X74262  : num  0.102 -0.847 -1.715 0.766 1.661 ...
 $ X63629  : num  0.813 1.029 0.351 0.413 0.989 ...
 $ X13482  : num  0.657 -0.73 -0.122 0.75 0.196 ...
 $ T83368  : num  0.584 -0.796 -0.674 0.12 1.866 ...
 $ M36634  : num  -0.775 -0.553 -0.419 -1.01 -1.63 ...
 $ R52081  : num  -1.817 -1.041 0.672 0.262 0.326 ...
 $ T86749  : num  -0.755 -1.009 0.439 -0.433 0.985 ...
 $ H87135  : num  -0.361 -0.905 0.443 -0.383 0.405 ...
 $ M26383  : num  -0.517 1.032 1.085 2.467 0.965 ...
 $ D42047  : num  0.8651 0.1016 -1.0595 0.3304 -0.0011 ...
 $ T67077  : num  -0.0602 2.118 -0.1452 -0.4449 0.1809 ...
 $ D00596  : num  -0.102 0.336 -1.71 1.041 0.357 ...
 $ X53586  : num  0.548 -0.844 -1.964 0.756 0.34 ...
 $ X54942  : num  -0.151 -0.73 0.335 -0.466 -0.787 ...
 $ H20819  : num  0.0442 -0.3943 -1.4682 0.2592 1.2198 ...
 $ U17899  : num  -0.1595 -1.4907 0.5314 -0.0498 -0.2464 ...
 $ J05032  : num  -0.6798 -0.5081 1.0447 0.0255 1.9914 ...
 $ H08393  : num  0.232 -1.113 1.289 0.17 1.016 ...
 $ H06524  : num  -1.403 -0.196 -0.058 -1.184 -2.166 ...
 $ U32519  : num  0.689 -1.996 -0.648 -0.608 1.038 ...
 $ H55916  : num  0.409 0.582 0.92 0.719 0.805 ...
 $ U25138  : num  -0.3714 -0.724 0.0934 -0.6086 -1.2074 ...
 $ U19969  : num  -1.1798 -1.2213 0.0945 -1.1351 -0.5385 ...
 $ X56597  : num  -0.623 -0.564 0.135 0.679 -1.076 ...
 $ M91463  : num  0.521 0.952 -0.406 0.472 -1.105 ...
 $ X62048  : num  -1.089 -0.63 0.435 -0.79 1.454 ...
 $ D29808  : num  -0.2945 -0.6022 -0.5562 0.0134 -0.2264 ...
 $ T60778  : num  0.0441 0.9073 -0.4312 -2.4008 -0.8236 ...
 $ M64110  : num  0.418 0.499 -0.609 -0.849 -1.494 ...
 $ caseid  : chr  "T1" "T2" "T3" "T4" ...
In [6]:
%R library(Hmisc)
%R library(reshape)
%R library(ggplot2)
%R library(dplyr)
%R library(corrplot)
%R library(limma)
%R library(anocva)
Out[6]:
StrVector with 18 elements.
'anocva' 'limma' 'corrplot' ... 'datasets' 'methods' 'base'
In [7]:
#Visualization of data with Rs imageplot with tissues as rows and genes as columns

%R imageplot(as.matrix(ata[0:92]),layout=list(ngrid.r=1,ngrid.c=1,nspot.r=62,nspot.c=92), ylab=ata[93])
In [1]:
#Kernel in R to create R value correlation plots between desired genes across all tissue types

ata <- read.csv("~/Downloads/dna.csv")
cata=ata[0:92]
catacanc=ata[41:62,][0:92]
catanorm=ata[1:40,][0:92]
fit <- lm( as.matrix(cata['X55715']) ~ as.matrix(cata['T57630']))
rvalue=sqrt(summary(fit)$r.squared)
plot(t(catacanc['T57630']), t(catacanc['X55715']),pch=2)
points(t(catanorm['T57630']), t(catanorm['X55715']),pch=0)
legend(-1,legend=c("Line 1", "Line 2"))
abline(fit)
title(paste('Rval=',signif(rvalue)))

Finding number of clusters¶

In [26]:
#### Defines a k-means function that returns cluster labels directly
%R myKmeans = function(dist, k){return(kmeans(dist, k, iter.max = 50, nstart = 5)$cluster)}

# Calculate distances and perform {0,1} normalization
%R distMatrix = as.matrix(dist(ata[0:92]))
%R distMatrix = checkRange01(distMatrix)

# Estimate the optimal number of clusters using silhouette
%R r = nClust(meanDist = distMatrix, p = 1, maxClust = 10,clusteringFunction = myKmeans, criterion = "silhouette")
%R sprintf("The optimal number of clusters found was %d.", r)
Out[26]:
StrVector with 1 elements.
'The optimal number of clusters found was 2.'
In [9]:
# Estimate the optimal number of clusters using slope
%R r = nClust(meanDist = distMatrix, p = 1, maxClust = 10,clusteringFunction = myKmeans, criterion = "slope")
%R sprintf("The optimal number of clusters found was %d.", r)
Out[9]:
StrVector with 1 elements.
'The optimal number of clusters found was 2.'
In [11]:
import pandas
import sklearn
import numpy as np
import matplotlib.pyplot as plt

from sklearn.metrics.cluster import rand_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN
from sklearn.cluster import AgglomerativeClustering
from sklearn.decomposition import PCA
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.manifold import TSNE
In [15]:
ata=%R ata
true_labels=np.zeros(62)
true_labels[0:40] =1


#preserving only 50% genetic information
pca = PCA(n_components=0.5)
pca.fit(ata.iloc[:,:-1])
print(pca.n_components_)
2
In [16]:
# preserving 90% genetic information

ata=%R ata
true_labels=np.zeros(62)
true_labels[0:40] =1

pca = PCA(n_components=0.9)
pca.fit(ata.iloc[:,:-1])
print(pca.n_components_)
18
In [22]:
#DBSCAN clustering method

dbscan = DBSCAN(eps=9.5, min_samples=10)

dbscan.fit(ata.iloc[:,:-1])
dblabels = dbscan.labels_

# Number of clusters in labels, ignoring noise if present.
n_clusters_ = len(set(dblabels)) - (1 if -1 in dblabels else 0)
n_noise_ = list(dblabels).count(-1)

print("Estimated number of clusters: %d" % n_clusters_)
print("Estimated number of noise points: %d" % n_noise_)
print("Rand index: %0.3f" % sklearn.metrics.rand_score(true_labels,dblabels))
print("Silhouette Coefficient: %0.3f" % sklearn.metrics.silhouette_score(ata.iloc[:,:-1], dblabels))


#representing the DBSCAN representation using TSNE transformation
iata=ata.iloc[:,:-1]
iata['label']=dblabels
catacanc=iata.loc[iata['label'] == 1]
catanorm=iata.loc[iata['label'] == 0]

tsne = TSNE(n_components=2, random_state=0)
controls_tsne = tsne.fit_transform(catanorm)
cancer_tsne = tsne.fit_transform(catacanc)

#Plotting the TSNE transformation

plt.figure(figsize=(50, 50))
fig = plt.figure(figsize = (10, 10))
ax = fig.add_subplot()
ax.scatter(cancer_tsne[:,0],cancer_tsne[:,1], color='red')
ax.scatter(controls_tsne[:,1], controls_tsne[:,0],color='blue')
plt.legend(["Cancer","Control"], loc='right')
plt.show()
Estimated number of clusters: 2
Estimated number of noise points: 5
Rand index: 0.779
Silhouette Coefficient: 0.229
/usr/local/lib/python3.9/site-packages/sklearn/manifold/_t_sne.py:780: FutureWarning: The default initialization in TSNE will change from 'random' to 'pca' in 1.2.
  warnings.warn(
/usr/local/lib/python3.9/site-packages/sklearn/manifold/_t_sne.py:790: FutureWarning: The default learning rate in TSNE will change from 200.0 to 'auto' in 1.2.
  warnings.warn(
/usr/local/lib/python3.9/site-packages/sklearn/manifold/_t_sne.py:780: FutureWarning: The default initialization in TSNE will change from 'random' to 'pca' in 1.2.
  warnings.warn(
/usr/local/lib/python3.9/site-packages/sklearn/manifold/_t_sne.py:790: FutureWarning: The default learning rate in TSNE will change from 200.0 to 'auto' in 1.2.
  warnings.warn(
<Figure size 3600x3600 with 0 Axes>

Clustering with number of clusters paramater¶

In [20]:
#Kmeans clustering method

kmlabel = KMeans(n_clusters=2).fit_predict(ata.iloc[:,:-1])
print("Silhouette Coefficient: %0.3f" % sklearn.metrics.silhouette_score(ata.iloc[:,:-1], kmlabel))
print("Rand index: %0.3f" % sklearn.metrics.rand_score(true_labels,kmlabel))


#representing the DBSCAN representation using TSNE transformation
iata=ata.iloc[:,:-1]
iata['label']=kmlabel
catacanc=iata.loc[iata['label'] == 1]
catanorm=iata.loc[iata['label'] == 0]

tsne = TSNE(n_components=2, random_state=0)
controls_tsne = tsne.fit_transform(catanorm)
cancer_tsne = tsne.fit_transform(catacanc)

#Plotting the TSNE transformation

plt.figure(figsize=(50, 50))
fig = plt.figure(figsize = (10, 10))
ax = fig.add_subplot()
ax.scatter(cancer_tsne[:,0],cancer_tsne[:,1], color='red')
ax.scatter(controls_tsne[:,1], controls_tsne[:,0],color='blue')
plt.legend(["Cancer","Control"], loc='right')
plt.show()
Silhouette Coefficient: 0.334
Rand index: 0.796
/usr/local/lib/python3.9/site-packages/sklearn/manifold/_t_sne.py:780: FutureWarning: The default initialization in TSNE will change from 'random' to 'pca' in 1.2.
  warnings.warn(
/usr/local/lib/python3.9/site-packages/sklearn/manifold/_t_sne.py:790: FutureWarning: The default learning rate in TSNE will change from 200.0 to 'auto' in 1.2.
  warnings.warn(
/usr/local/lib/python3.9/site-packages/sklearn/manifold/_t_sne.py:780: FutureWarning: The default initialization in TSNE will change from 'random' to 'pca' in 1.2.
  warnings.warn(
/usr/local/lib/python3.9/site-packages/sklearn/manifold/_t_sne.py:790: FutureWarning: The default learning rate in TSNE will change from 200.0 to 'auto' in 1.2.
  warnings.warn(
<Figure size 3600x3600 with 0 Axes>
In [24]:
#chosen ward linkage hierarchal model

model = AgglomerativeClustering(n_clusters=2).fit_predict(ata.iloc[:,:-1])
pred_labels= AgglomerativeClustering(n_clusters=2, linkage='ward').fit(ata.iloc[:,:-1]).labels_
print("Silhouette Coefficient: %0.3f" % sklearn.metrics.silhouette_score(ata.iloc[:,:-1], model))
print("Rand index: %0.3f" % sklearn.metrics.rand_score(true_labels,pred_labels))

#generating dendrogram
Z = linkage(ata.iloc[:,:-1], 'ward')
fig = plt.figure(figsize=(25, 10))
dn = dendrogram(Z)
plt.show()


#representing the DBSCAN representation using TSNE transformation
iata=ata.iloc[:,:-1]
iata['label']=pred_labels
catacanc=iata.loc[iata['label'] == 1]
catanorm=iata.loc[iata['label'] == 0]

tsne = TSNE(n_components=2, random_state=0)
controls_tsne = tsne.fit_transform(catanorm)
cancer_tsne = tsne.fit_transform(catacanc)

#Plotting the TSNE transformation

plt.figure(figsize=(50, 50))
fig = plt.figure(figsize = (10, 10))
ax = fig.add_subplot()
ax.scatter(cancer_tsne[:,0],cancer_tsne[:,1], color='red')
ax.scatter(controls_tsne[:,1], controls_tsne[:,0],color='blue')
plt.legend(["Cancer","Control"], loc='right')
plt.show()
Silhouette Coefficient: 0.332
Rand index: 0.772
/usr/local/lib/python3.9/site-packages/sklearn/manifold/_t_sne.py:780: FutureWarning: The default initialization in TSNE will change from 'random' to 'pca' in 1.2.
  warnings.warn(
/usr/local/lib/python3.9/site-packages/sklearn/manifold/_t_sne.py:790: FutureWarning: The default learning rate in TSNE will change from 200.0 to 'auto' in 1.2.
  warnings.warn(
/usr/local/lib/python3.9/site-packages/sklearn/manifold/_t_sne.py:780: FutureWarning: The default initialization in TSNE will change from 'random' to 'pca' in 1.2.
  warnings.warn(
/usr/local/lib/python3.9/site-packages/sklearn/manifold/_t_sne.py:790: FutureWarning: The default learning rate in TSNE will change from 200.0 to 'auto' in 1.2.
  warnings.warn(
<Figure size 3600x3600 with 0 Axes>
In [66]:
#single linkage hierarchal model

model = AgglomerativeClustering(n_clusters=2, linkage='single').fit_predict(ata.iloc[:,:-1])
pred_labels= AgglomerativeClustering(n_clusters=2, linkage='single').fit(ata.iloc[:,:-1]).labels_
print("Rand index: %0.3f" % sklearn.metrics.rand_score(true_labels,pred_labels))
print("Silhouette Coefficient: %0.3f" % sklearn.metrics.silhouette_score(ata.iloc[:,:-1], model))

#generating dendrogram
Z = linkage(ata.iloc[:,:-1], 'single')
fig = plt.figure(figsize=(25, 10))
dn = dendrogram(Z)
plt.show()
Rand index: 0.526
Silhouette Coefficient: 0.048
In [25]:
#comparing Rand index for Table 1

print("Rand index: %0.3f" % sklearn.metrics.rand_score(dblabels,kmlabel))
print("Rand index: %0.3f" % sklearn.metrics.rand_score(dblabels,pred_labels))
print("Rand index: %0.3f" % sklearn.metrics.rand_score(kmlabel,pred_labels))
Rand index: 0.895
Rand index: 0.925
Rand index: 0.968

Tissue clustering¶

In [17]:
model = AgglomerativeClustering(n_clusters=2).fit_predict(np.transpose(ata.iloc[:,:-1]))
pred_labels= AgglomerativeClustering(n_clusters=2, linkage='single').fit(np.transpose(ata.iloc[:,:-1])).labels_
print("Silhouette Coefficient: %0.3f" % sklearn.metrics.silhouette_score(np.transpose(ata.iloc[:,:-1]), model))


Z = linkage(np.transpose(ata.iloc[:,:-1]), 'ward')
fig = plt.figure(figsize=(25, 10))
dn = dendrogram(Z)
plt.show()
Silhouette Coefficient: 0.393
In [ ]:
model = AgglomerativeClustering(n_clusters=2, linkage='single').fit_predict(np.transpose(ata.iloc[:,:-1]))
pred_labels= AgglomerativeClustering(n_clusters=2, linkage='single').fit(np.transpose(ata.iloc[:,:-1])).labels_
print("Silhouette Coefficient: %0.3f" % sklearn.metrics.silhouette_score(np.transpose(ata.iloc[:,:-1]), model))

Z = linkage(np.transpose(ata.iloc[:,:-1]), 'single')
fig = plt.figure(figsize=(25, 10))
dn = dendrogram(Z)
plt.show()
Silhouette Coefficient: 0.393