#For ease of moving data from R to Python using rpy2
%load_ext rpy2.ipython
#Reading in data for DNA microarrays
%R ata <- read.csv("~/Downloads/dna.csv")
%R utils::str(ata)
'data.frame': 62 obs. of 93 variables: $ T95018 : num 0.2036 -1.2943 0.7084 0.378 0.0201 ... $ T61609 : num -0.222 -1.9 1.244 0.656 0.458 ... $ T71025 : num 0.01447 -0.90494 0.00973 -0.33517 -1.12808 ... $ X55715 : num 1.463 0.365 -0.462 1.722 0.565 ... $ T52185 : num 0.263 -1.735 0.243 0.701 -1.304 ... $ T57633 : num 0.96982 -0.24869 -1.66495 1.07952 0.00331 ... $ R78934 : num 0.193 0.589 0.282 -0.206 -0.459 ... $ T51529 : num -0.343 -1.3 1.018 0.534 0.717 ... $ M26697 : num -1.244 -1.045 0.492 0.883 1.266 ... $ T51023 : num -0.241 -1.169 2.832 -0.466 0.562 ... $ D63874 : num -0.341 -2.685 1.701 -0.109 -0.481 ... $ T57630 : num 0.929 -0.488 -1.386 1.29 -0.595 ... $ M36981 : num -1.012 -1.412 0.449 0.158 1.871 ... $ M76378 : num 0.0495 -0.4712 0.2547 -0.8791 -0.3263 ... $ M63391 : num -0.0866 -1.0607 0.1625 -0.9418 -0.7491 ... $ M76378.2: num -0.00248 -0.5894 0.13031 -1.02753 -0.33742 ... $ T79152 : num 1.9037 0.0691 3.073 0.4315 0.5167 ... $ H64489 : num -0.3955 -2.1448 0.0658 0.3195 -1.3522 ... $ X15183 : num 0.8533 -0.1398 0.0576 1.3885 0.9029 ... $ X14958 : num 0.6959 -0.8503 0.3135 0.0466 0.9907 ... $ Z50753 : num 0.382 0.678 -1.186 -0.204 -1.073 ... $ D31885 : num 0.7306 -2.0175 0.0769 0.6365 0.6402 ... $ U30825 : num -0.668 -1.146 1.021 0.292 2.199 ... $ T60155 : num 0.406 1.028 -0.178 -1.375 -0.758 ... $ H40560 : num -0.137 -2.815 1.418 0.939 -0.942 ... $ R87126 : num 1.022 0.685 -0.63 -0.788 -0.947 ... $ M22382 : num 0.333 -2.616 1.481 0.526 1.213 ... $ R42501 : num 0.0058 -0.7084 0.6827 0.1972 0.8076 ... $ T51571 : num 0.147 0.643 -0.103 -0.627 1.523 ... $ X12671 : num 0.329 -0.272 -0.433 0.742 0.73 ... $ X70944 : num 0.986 -1.2751 -0.5193 0.2931 0.0458 ... $ M76378.3: num 0.3407 -0.08615 0.45389 -0.53595 0.00365 ... $ H40095 : num 0.237 -0.573 0.735 0.224 1.33 ... $ X70326 : num 1.6067 -0.0869 -1.2943 0.6041 0.137 ... $ Z49269 : num 0.3443 -0.1487 0.0849 -1.3263 -0.5519 ... $ T92451 : num 0.0943 0.1826 -0.2065 -1.1536 -0.4986 ... $ Z49269.2: num 0.2549 -0.103 -0.0903 -1.454 -0.3518 ... $ R75843 : num 1.8133 -0.7328 -0.5033 1.2344 -0.0176 ... $ U29092 : num -0.309 -2.676 -0.874 0.273 0.264 ... $ H43887 : num -0.748 -0.112 -1.359 -0.573 -1.459 ... $ H11719 : num -0.9742 2.1191 0.0135 -0.2436 -1.1444 ... $ T86473 : num 0.27442 -0.00375 -0.22985 1.14283 2.03499 ... $ X12466 : num 0.253 -1.004 0.077 -0.193 1.659 ... $ R08183 : num 0.594 -2.092 1.247 0.114 1.794 ... $ R36977 : num -0.206 -0.869 1.446 -0.181 2.479 ... $ M80815 : num -1.0806 -1.5086 0.0945 -0.2324 -0.2714 ... $ U09564 : num -0.257 -1.408 0.169 -0.713 1.856 ... $ L08069 : num 0.115 -1.933 1.094 -0.164 1.31 ... $ R84411 : num 1.059 -0.605 0.407 -0.723 0.656 ... $ T96873 : num -0.944 -1.849 0.464 -0.366 0.322 ... $ X74295 : num -0.902 -0.351 0.465 -1.083 -1.884 ... $ T40454 : num -1.8135 -2.3993 0.1183 0.0117 1.336 ... $ X12496 : num -0.238 1.604 -0.974 -1.111 0.173 ... $ T47377 : num 0.642 -0.915 0.24 1.292 0.879 ... $ T62947 : num -0.328 -0.588 0.79 0.07 1.497 ... $ L05144 : num 0.841 -0.921 -0.628 -0.359 -2.34 ... $ U26312 : num 0.923 -0.654 -0.687 0.495 1.218 ... $ H77597 : num -0.431 -0.647 -0.609 -0.428 -1.472 ... $ R64115 : num 1.903 -1.079 0.438 0.609 0.672 ... $ J02854 : num 0.348 0.74 -0.293 -0.396 -0.293 ... $ L41559 : num 0.6646 0.0567 -0.8675 -0.3398 0.2183 ... $ L25941 : num 0.194 -1.009 -0.345 0.859 1.232 ... $ X86693 : num -0.646 -0.2447 -0.0663 -1.5543 -1.2357 ... $ X74262 : num 0.102 -0.847 -1.715 0.766 1.661 ... $ X63629 : num 0.813 1.029 0.351 0.413 0.989 ... $ X13482 : num 0.657 -0.73 -0.122 0.75 0.196 ... $ T83368 : num 0.584 -0.796 -0.674 0.12 1.866 ... $ M36634 : num -0.775 -0.553 -0.419 -1.01 -1.63 ... $ R52081 : num -1.817 -1.041 0.672 0.262 0.326 ... $ T86749 : num -0.755 -1.009 0.439 -0.433 0.985 ... $ H87135 : num -0.361 -0.905 0.443 -0.383 0.405 ... $ M26383 : num -0.517 1.032 1.085 2.467 0.965 ... $ D42047 : num 0.8651 0.1016 -1.0595 0.3304 -0.0011 ... $ T67077 : num -0.0602 2.118 -0.1452 -0.4449 0.1809 ... $ D00596 : num -0.102 0.336 -1.71 1.041 0.357 ... $ X53586 : num 0.548 -0.844 -1.964 0.756 0.34 ... $ X54942 : num -0.151 -0.73 0.335 -0.466 -0.787 ... $ H20819 : num 0.0442 -0.3943 -1.4682 0.2592 1.2198 ... $ U17899 : num -0.1595 -1.4907 0.5314 -0.0498 -0.2464 ... $ J05032 : num -0.6798 -0.5081 1.0447 0.0255 1.9914 ... $ H08393 : num 0.232 -1.113 1.289 0.17 1.016 ... $ H06524 : num -1.403 -0.196 -0.058 -1.184 -2.166 ... $ U32519 : num 0.689 -1.996 -0.648 -0.608 1.038 ... $ H55916 : num 0.409 0.582 0.92 0.719 0.805 ... $ U25138 : num -0.3714 -0.724 0.0934 -0.6086 -1.2074 ... $ U19969 : num -1.1798 -1.2213 0.0945 -1.1351 -0.5385 ... $ X56597 : num -0.623 -0.564 0.135 0.679 -1.076 ... $ M91463 : num 0.521 0.952 -0.406 0.472 -1.105 ... $ X62048 : num -1.089 -0.63 0.435 -0.79 1.454 ... $ D29808 : num -0.2945 -0.6022 -0.5562 0.0134 -0.2264 ... $ T60778 : num 0.0441 0.9073 -0.4312 -2.4008 -0.8236 ... $ M64110 : num 0.418 0.499 -0.609 -0.849 -1.494 ... $ caseid : chr "T1" "T2" "T3" "T4" ...
%R library(Hmisc)
%R library(reshape)
%R library(ggplot2)
%R library(dplyr)
%R library(corrplot)
%R library(limma)
%R library(anocva)
'anocva' | 'limma' | 'corrplot' | ... | 'datasets' | 'methods' | 'base' |
#Visualization of data with Rs imageplot with tissues as rows and genes as columns
%R imageplot(as.matrix(ata[0:92]),layout=list(ngrid.r=1,ngrid.c=1,nspot.r=62,nspot.c=92), ylab=ata[93])
#Kernel in R to create R value correlation plots between desired genes across all tissue types
ata <- read.csv("~/Downloads/dna.csv")
cata=ata[0:92]
catacanc=ata[41:62,][0:92]
catanorm=ata[1:40,][0:92]
fit <- lm( as.matrix(cata['X55715']) ~ as.matrix(cata['T57630']))
rvalue=sqrt(summary(fit)$r.squared)
plot(t(catacanc['T57630']), t(catacanc['X55715']),pch=2)
points(t(catanorm['T57630']), t(catanorm['X55715']),pch=0)
legend(-1,legend=c("Line 1", "Line 2"))
abline(fit)
title(paste('Rval=',signif(rvalue)))
#### Defines a k-means function that returns cluster labels directly
%R myKmeans = function(dist, k){return(kmeans(dist, k, iter.max = 50, nstart = 5)$cluster)}
# Calculate distances and perform {0,1} normalization
%R distMatrix = as.matrix(dist(ata[0:92]))
%R distMatrix = checkRange01(distMatrix)
# Estimate the optimal number of clusters using silhouette
%R r = nClust(meanDist = distMatrix, p = 1, maxClust = 10,clusteringFunction = myKmeans, criterion = "silhouette")
%R sprintf("The optimal number of clusters found was %d.", r)
'The optimal number of clusters found was 2.' |
# Estimate the optimal number of clusters using slope
%R r = nClust(meanDist = distMatrix, p = 1, maxClust = 10,clusteringFunction = myKmeans, criterion = "slope")
%R sprintf("The optimal number of clusters found was %d.", r)
'The optimal number of clusters found was 2.' |
import pandas
import sklearn
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics.cluster import rand_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN
from sklearn.cluster import AgglomerativeClustering
from sklearn.decomposition import PCA
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.manifold import TSNE
ata=%R ata
true_labels=np.zeros(62)
true_labels[0:40] =1
#preserving only 50% genetic information
pca = PCA(n_components=0.5)
pca.fit(ata.iloc[:,:-1])
print(pca.n_components_)
2
# preserving 90% genetic information
ata=%R ata
true_labels=np.zeros(62)
true_labels[0:40] =1
pca = PCA(n_components=0.9)
pca.fit(ata.iloc[:,:-1])
print(pca.n_components_)
18
#DBSCAN clustering method
dbscan = DBSCAN(eps=9.5, min_samples=10)
dbscan.fit(ata.iloc[:,:-1])
dblabels = dbscan.labels_
# Number of clusters in labels, ignoring noise if present.
n_clusters_ = len(set(dblabels)) - (1 if -1 in dblabels else 0)
n_noise_ = list(dblabels).count(-1)
print("Estimated number of clusters: %d" % n_clusters_)
print("Estimated number of noise points: %d" % n_noise_)
print("Rand index: %0.3f" % sklearn.metrics.rand_score(true_labels,dblabels))
print("Silhouette Coefficient: %0.3f" % sklearn.metrics.silhouette_score(ata.iloc[:,:-1], dblabels))
#representing the DBSCAN representation using TSNE transformation
iata=ata.iloc[:,:-1]
iata['label']=dblabels
catacanc=iata.loc[iata['label'] == 1]
catanorm=iata.loc[iata['label'] == 0]
tsne = TSNE(n_components=2, random_state=0)
controls_tsne = tsne.fit_transform(catanorm)
cancer_tsne = tsne.fit_transform(catacanc)
#Plotting the TSNE transformation
plt.figure(figsize=(50, 50))
fig = plt.figure(figsize = (10, 10))
ax = fig.add_subplot()
ax.scatter(cancer_tsne[:,0],cancer_tsne[:,1], color='red')
ax.scatter(controls_tsne[:,1], controls_tsne[:,0],color='blue')
plt.legend(["Cancer","Control"], loc='right')
plt.show()
Estimated number of clusters: 2 Estimated number of noise points: 5 Rand index: 0.779 Silhouette Coefficient: 0.229
/usr/local/lib/python3.9/site-packages/sklearn/manifold/_t_sne.py:780: FutureWarning: The default initialization in TSNE will change from 'random' to 'pca' in 1.2. warnings.warn( /usr/local/lib/python3.9/site-packages/sklearn/manifold/_t_sne.py:790: FutureWarning: The default learning rate in TSNE will change from 200.0 to 'auto' in 1.2. warnings.warn( /usr/local/lib/python3.9/site-packages/sklearn/manifold/_t_sne.py:780: FutureWarning: The default initialization in TSNE will change from 'random' to 'pca' in 1.2. warnings.warn( /usr/local/lib/python3.9/site-packages/sklearn/manifold/_t_sne.py:790: FutureWarning: The default learning rate in TSNE will change from 200.0 to 'auto' in 1.2. warnings.warn(
<Figure size 3600x3600 with 0 Axes>
#Kmeans clustering method
kmlabel = KMeans(n_clusters=2).fit_predict(ata.iloc[:,:-1])
print("Silhouette Coefficient: %0.3f" % sklearn.metrics.silhouette_score(ata.iloc[:,:-1], kmlabel))
print("Rand index: %0.3f" % sklearn.metrics.rand_score(true_labels,kmlabel))
#representing the DBSCAN representation using TSNE transformation
iata=ata.iloc[:,:-1]
iata['label']=kmlabel
catacanc=iata.loc[iata['label'] == 1]
catanorm=iata.loc[iata['label'] == 0]
tsne = TSNE(n_components=2, random_state=0)
controls_tsne = tsne.fit_transform(catanorm)
cancer_tsne = tsne.fit_transform(catacanc)
#Plotting the TSNE transformation
plt.figure(figsize=(50, 50))
fig = plt.figure(figsize = (10, 10))
ax = fig.add_subplot()
ax.scatter(cancer_tsne[:,0],cancer_tsne[:,1], color='red')
ax.scatter(controls_tsne[:,1], controls_tsne[:,0],color='blue')
plt.legend(["Cancer","Control"], loc='right')
plt.show()
Silhouette Coefficient: 0.334 Rand index: 0.796
/usr/local/lib/python3.9/site-packages/sklearn/manifold/_t_sne.py:780: FutureWarning: The default initialization in TSNE will change from 'random' to 'pca' in 1.2. warnings.warn( /usr/local/lib/python3.9/site-packages/sklearn/manifold/_t_sne.py:790: FutureWarning: The default learning rate in TSNE will change from 200.0 to 'auto' in 1.2. warnings.warn( /usr/local/lib/python3.9/site-packages/sklearn/manifold/_t_sne.py:780: FutureWarning: The default initialization in TSNE will change from 'random' to 'pca' in 1.2. warnings.warn( /usr/local/lib/python3.9/site-packages/sklearn/manifold/_t_sne.py:790: FutureWarning: The default learning rate in TSNE will change from 200.0 to 'auto' in 1.2. warnings.warn(
<Figure size 3600x3600 with 0 Axes>
#chosen ward linkage hierarchal model
model = AgglomerativeClustering(n_clusters=2).fit_predict(ata.iloc[:,:-1])
pred_labels= AgglomerativeClustering(n_clusters=2, linkage='ward').fit(ata.iloc[:,:-1]).labels_
print("Silhouette Coefficient: %0.3f" % sklearn.metrics.silhouette_score(ata.iloc[:,:-1], model))
print("Rand index: %0.3f" % sklearn.metrics.rand_score(true_labels,pred_labels))
#generating dendrogram
Z = linkage(ata.iloc[:,:-1], 'ward')
fig = plt.figure(figsize=(25, 10))
dn = dendrogram(Z)
plt.show()
#representing the DBSCAN representation using TSNE transformation
iata=ata.iloc[:,:-1]
iata['label']=pred_labels
catacanc=iata.loc[iata['label'] == 1]
catanorm=iata.loc[iata['label'] == 0]
tsne = TSNE(n_components=2, random_state=0)
controls_tsne = tsne.fit_transform(catanorm)
cancer_tsne = tsne.fit_transform(catacanc)
#Plotting the TSNE transformation
plt.figure(figsize=(50, 50))
fig = plt.figure(figsize = (10, 10))
ax = fig.add_subplot()
ax.scatter(cancer_tsne[:,0],cancer_tsne[:,1], color='red')
ax.scatter(controls_tsne[:,1], controls_tsne[:,0],color='blue')
plt.legend(["Cancer","Control"], loc='right')
plt.show()
Silhouette Coefficient: 0.332 Rand index: 0.772
/usr/local/lib/python3.9/site-packages/sklearn/manifold/_t_sne.py:780: FutureWarning: The default initialization in TSNE will change from 'random' to 'pca' in 1.2. warnings.warn( /usr/local/lib/python3.9/site-packages/sklearn/manifold/_t_sne.py:790: FutureWarning: The default learning rate in TSNE will change from 200.0 to 'auto' in 1.2. warnings.warn( /usr/local/lib/python3.9/site-packages/sklearn/manifold/_t_sne.py:780: FutureWarning: The default initialization in TSNE will change from 'random' to 'pca' in 1.2. warnings.warn( /usr/local/lib/python3.9/site-packages/sklearn/manifold/_t_sne.py:790: FutureWarning: The default learning rate in TSNE will change from 200.0 to 'auto' in 1.2. warnings.warn(
<Figure size 3600x3600 with 0 Axes>
#single linkage hierarchal model
model = AgglomerativeClustering(n_clusters=2, linkage='single').fit_predict(ata.iloc[:,:-1])
pred_labels= AgglomerativeClustering(n_clusters=2, linkage='single').fit(ata.iloc[:,:-1]).labels_
print("Rand index: %0.3f" % sklearn.metrics.rand_score(true_labels,pred_labels))
print("Silhouette Coefficient: %0.3f" % sklearn.metrics.silhouette_score(ata.iloc[:,:-1], model))
#generating dendrogram
Z = linkage(ata.iloc[:,:-1], 'single')
fig = plt.figure(figsize=(25, 10))
dn = dendrogram(Z)
plt.show()
Rand index: 0.526 Silhouette Coefficient: 0.048
#comparing Rand index for Table 1
print("Rand index: %0.3f" % sklearn.metrics.rand_score(dblabels,kmlabel))
print("Rand index: %0.3f" % sklearn.metrics.rand_score(dblabels,pred_labels))
print("Rand index: %0.3f" % sklearn.metrics.rand_score(kmlabel,pred_labels))
Rand index: 0.895 Rand index: 0.925 Rand index: 0.968
model = AgglomerativeClustering(n_clusters=2).fit_predict(np.transpose(ata.iloc[:,:-1]))
pred_labels= AgglomerativeClustering(n_clusters=2, linkage='single').fit(np.transpose(ata.iloc[:,:-1])).labels_
print("Silhouette Coefficient: %0.3f" % sklearn.metrics.silhouette_score(np.transpose(ata.iloc[:,:-1]), model))
Z = linkage(np.transpose(ata.iloc[:,:-1]), 'ward')
fig = plt.figure(figsize=(25, 10))
dn = dendrogram(Z)
plt.show()
Silhouette Coefficient: 0.393
model = AgglomerativeClustering(n_clusters=2, linkage='single').fit_predict(np.transpose(ata.iloc[:,:-1]))
pred_labels= AgglomerativeClustering(n_clusters=2, linkage='single').fit(np.transpose(ata.iloc[:,:-1])).labels_
print("Silhouette Coefficient: %0.3f" % sklearn.metrics.silhouette_score(np.transpose(ata.iloc[:,:-1]), model))
Z = linkage(np.transpose(ata.iloc[:,:-1]), 'single')
fig = plt.figure(figsize=(25, 10))
dn = dendrogram(Z)
plt.show()
Silhouette Coefficient: 0.393