import pandas as pd
import sklearn
import pwlf
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
import glob
from tensorflow import keras
import tensorflow as tf
from sklearn.model_selection import train_test_split
from Bio import SeqIO
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import KFold
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from nilearn import plotting
# 5-fold cross-validation splitter (shuffled, fixed seed).
crossvalidation = KFold(n_splits=5, shuffle=True, random_state=1)

# Input tables, one file per species.
# NOTE(review): absolute, machine-specific paths -- adjust before running elsewhere.
humanpath, chimppath, dogpath = (
    r'/Users/apoorvsara/Downloads/archive/human.txt',
    r'/Users/apoorvsara/Downloads/archive/chimpanzee.txt',
    r'/Users/apoorvsara/Downloads/archive/dog.txt',
)
human_rawdna, chimp_rawdna, dog_rawdna = (
    pd.read_table(species_path) for species_path in (humanpath, chimppath, dogpath)
)

# Bar chart of how many human rows fall into each value of the 'class' column.
human_class_counts = human_rawdna['class'].value_counts().sort_index()
human_class_counts.plot.bar()
plt.title("Class distribution of Human DNA")
# Output: Text(0.5, 1.0, 'Class distribution of Human DNA')
#Converts each raw DNA sequence into overlapping k-mer "words" of size 3
#Data preprocessing steps taken from Chauhan 2021
def Kmers_funct(seq, size=3):
    """Split a sequence into its overlapping, lower-cased k-mers.

    Args:
        seq: Sequence string to split.
        size: Length of each k-mer; must be at least 1.

    Returns:
        A list with the ``len(seq) - size + 1`` substrings of length
        ``size``, taken left to right with a step of one character and
        lower-cased. The list is empty when ``seq`` is shorter than ``size``.

    Raises:
        ValueError: If ``size`` is smaller than 1 (a zero or negative window
            would otherwise silently yield empty or truncated "k-mers").
    """
    if size < 1:
        raise ValueError(f"size must be a positive integer, got {size!r}")
    return [seq[start:start + size].lower() for start in range(len(seq) - size + 1)]
def _kmer_corpus(raw_frame):
    """Convert one species table into k-mer sentences plus labels.

    Args:
        raw_frame: DataFrame with a 'sequence' column of sequence strings.

    Returns:
        A tuple ``(frame, sentences, labels)`` where ``frame`` is
        ``raw_frame`` with a 'words' column (list of k-mers per row) added and
        the 'sequence' column dropped, ``sentences`` is a list holding each
        row's k-mers joined by single spaces, and ``labels`` is the array of
        values from the first remaining column of ``frame``.
    """
    frame = raw_frame.assign(
        words=raw_frame['sequence'].apply(Kmers_funct)
    ).drop('sequence', axis=1)
    sentences = [' '.join(words) for words in frame['words']]
    # NOTE(review): labels are taken positionally (column 0); presumably this
    # is the 'class' column -- confirm against the input files.
    labels = frame.iloc[:, 0].values
    return frame, sentences, labels


# Same preprocessing for every species: k-mer "sentences" and separate labels.
human_rawdna, human_texts, y_human = _kmer_corpus(human_rawdna)  # y_human for human_dna
chimp_rawdna, chimp_texts, y_chim = _kmer_corpus(chimp_rawdna)  # y_chim for chimp_dna
dog_rawdna, dog_texts, y_dog = _kmer_corpus(dog_rawdna)  # y_dog for dog_dna
# Count features over n-grams of exactly 8 consecutive k-mer words.
cv = CountVectorizer(ngram_range=(8, 8))
# The vocabulary is learned from the human sentences only; chimp and dog are
# projected onto that same vocabulary.
# NOTE(review): the vectorizer is fitted on all human rows before the
# train/test split below, so test sequences contribute to the vocabulary.
X = cv.fit_transform(human_texts)
X_chimp, X_dog = (cv.transform(sentences) for sentences in (chimp_texts, dog_texts))
for feature_matrix in (X, X_chimp, X_dog):
    print(feature_matrix.shape)
# Output: (4380, 602855) (1682, 602855) (820, 602855)
# Both splits below hold out 20% with the same fixed seed.
# (`train_test_split` is already imported at the top of the file.)
_SPLIT_OPTIONS = dict(test_size=0.20, random_state=42)

# Human data -> training set / test set (80/20).
X_train, X_test, y_train, y_test = train_test_split(X, y_human, **_SPLIT_OPTIONS)

# The training set is split again (80/20) into the neural-network training
# set and a validation set; Naive Bayes and Random Forest use all of X_train.
X_ntrain, X_valid, y_ntrain, y_valid = train_test_split(X_train, y_train, **_SPLIT_OPTIONS)
# Multinomial Naive Bayes; alpha taken from Chauhan 2021.
classifier = MultinomialNB(alpha=0.1).fit(X_train, y_train)
y_pred = classifier.predict(X_test)

# Actual-vs-predicted table for the held-out human test set.
print("Confusion matrix for predictions on human test DNA sequence\n")
nb_human_actual = pd.Series(y_test, name='Actual')
nb_human_predicted = pd.Series(y_pred, name='Predicted')
print(pd.crosstab(nb_human_actual, nb_human_predicted))
def get_metrics(y_test, y_predicted, zero_division=0):
    """Compute accuracy and weighted precision, recall and F1.

    Args:
        y_test: True class labels.
        y_predicted: Predicted class labels, aligned with ``y_test``.
        zero_division: Value used for a per-class score whose denominator is
            zero (for example precision of a class that is never predicted).
            The default of 0 yields the same numbers scikit-learn reports by
            default, without emitting ``UndefinedMetricWarning``.

    Returns:
        Tuple ``(accuracy, precision, recall, f1)``; the last three are
        averaged over classes with ``average='weighted'``.
    """
    weighted = dict(average='weighted', zero_division=zero_division)
    accuracy = accuracy_score(y_test, y_predicted)
    precision = precision_score(y_test, y_predicted, **weighted)
    recall = recall_score(y_test, y_predicted, **weighted)
    f1 = f1_score(y_test, y_predicted, **weighted)
    return accuracy, precision, recall, f1
# Summary scores for Naive Bayes on the human test set.
accuracy, precision, recall, f1 = get_metrics(y_test, y_pred)
print(f"accuracy = {accuracy:.3f} \nprecision = {precision:.3f} \nrecall = {recall:.3f} \nf1 = {f1:.3f}")
Confusion matrix for predictions on human test DNA sequence Predicted 0 1 2 3 4 5 6 Actual 0 96 0 0 0 2 4 0 1 0 106 0 0 0 0 0 2 0 0 78 0 0 0 0 3 0 0 0 124 1 0 0 4 0 0 0 0 146 1 2 5 0 0 0 0 0 51 0 6 0 0 0 0 4 2 259 accuracy = 0.982 precision = 0.983 recall = 0.982 f1 = 0.982
# Naive Bayes evaluated on all chimpanzee sequences.
y_pred_chimp = classifier.predict(X_chimp)
print("Confusion matrix for predictions on Chimpanzee test DNA sequence\n")
nb_chimp_table = pd.crosstab(
    pd.Series(y_chim, name='Actual'),
    pd.Series(y_pred_chimp, name='Predicted'),
)
print(nb_chimp_table)
accuracy, precision, recall, f1 = get_metrics(y_chim, y_pred_chimp)
print(f"accuracy = {accuracy:.3f} \nprecision = {precision:.3f} \nrecall = {recall:.3f} \nf1 = {f1:.3f}")
Confusion matrix for predictions on Chimpanzee test DNA sequence Predicted 0 1 2 3 4 5 6 Actual 0 231 0 0 0 1 2 0 1 0 185 0 0 0 0 0 2 0 0 144 0 0 0 0 3 0 0 0 227 1 0 0 4 0 0 0 0 259 1 1 5 0 0 0 0 0 109 0 6 0 0 0 0 5 0 516 accuracy = 0.993 precision = 0.994 recall = 0.993 f1 = 0.993
# Naive Bayes evaluated on all dog sequences.
y_pred_dog = classifier.predict(X_dog)
print("Confusion matrix for predictions on Dog test DNA sequence\n")
nb_dog_table = pd.crosstab(
    pd.Series(y_dog, name='Actual'),
    pd.Series(y_pred_dog, name='Predicted'),
)
print(nb_dog_table)
accuracy, precision, recall, f1 = get_metrics(y_dog, y_pred_dog)
print(f"accuracy = {accuracy:.3f} \nprecision = {precision:.3f} \nrecall = {recall:.3f} \nf1 = {f1:.3f}")
Confusion matrix for predictions on Dog test DNA sequence Predicted 0 1 2 3 4 5 6 Actual 0 129 0 0 0 1 0 1 1 0 72 0 0 2 0 1 2 0 0 64 0 0 0 0 3 1 0 0 91 2 1 0 4 0 0 0 1 134 0 0 5 0 0 0 0 1 59 0 6 1 0 0 0 2 0 257 accuracy = 0.983 precision = 0.983 recall = 0.983 f1 = 0.983
# Random Forest with default hyper-parameters, fitted on the full training set;
# this fitted `rfmodel` is the one used for the predictions further down.
rfmodel = RandomForestClassifier()
rfmodel.fit(X_train, y_train)

# 5-fold cross-validated accuracy on the training set (train and test folds).
rfscoresr2 = cross_validate(
    rfmodel,
    X_train,
    y_train,
    cv=crossvalidation,
    scoring="accuracy",
    return_train_score=True,
    n_jobs=10,
)
for split_label, score_key in (("Training", "train_score"), ("Test", "test_score")):
    fold_scores = rfscoresr2[score_key]
    print(
        f"Random Forest {split_label} Folds: {len(fold_scores)!s}"
        f", Accuracy: {np.mean(np.abs(fold_scores))!s}"
        f", STD: {np.std(fold_scores)!s}"
    )
Random Forest Training Folds: 5, Accuracy: 0.9995005605874692, STD: 0.0002854212184307589 Random Forest Test Folds: 5, Accuracy: 0.8861263501120848, STD: 0.01630437780828705
# Random Forest evaluated on the held-out human test set.
y_rfpred = rfmodel.predict(X_test)
print("Confusion matrix for predictions on human test DNA sequence\n")
rf_human_table = pd.crosstab(
    pd.Series(y_test, name='Actual'),
    pd.Series(y_rfpred, name='Predicted'),
)
print(rf_human_table)
# NOTE: this redefines the `get_metrics` helper declared earlier in this file,
# keeping the same name and positional signature.
def get_metrics(y_test, y_predicted, zero_division=0):
    """Compute accuracy and weighted precision, recall and F1.

    Args:
        y_test: True class labels.
        y_predicted: Predicted class labels, aligned with ``y_test``.
        zero_division: Value used for a per-class score whose denominator is
            zero (for example precision of a class that is never predicted).
            The default of 0 yields the same numbers scikit-learn reports by
            default, without emitting ``UndefinedMetricWarning``.

    Returns:
        Tuple ``(accuracy, precision, recall, f1)``; the last three are
        averaged over classes with ``average='weighted'``.
    """
    weighted = dict(average='weighted', zero_division=zero_division)
    accuracy = accuracy_score(y_test, y_predicted)
    precision = precision_score(y_test, y_predicted, **weighted)
    recall = recall_score(y_test, y_predicted, **weighted)
    f1 = f1_score(y_test, y_predicted, **weighted)
    return accuracy, precision, recall, f1
# Summary scores for Random Forest on the human test set.
accuracy, precision, recall, f1 = get_metrics(y_test, y_rfpred)
print(f"accuracy = {accuracy:.3f} \nprecision = {precision:.3f} \nrecall = {recall:.3f} \nf1 = {f1:.3f}")
Confusion matrix for predictions on human test DNA sequence Predicted 0 1 2 3 4 5 6 Actual 0 92 0 0 0 0 0 10 1 2 95 0 0 0 0 9 2 0 0 70 0 0 0 8 3 0 0 0 116 0 0 9 4 0 0 0 2 128 0 19 5 0 0 0 0 0 44 7 6 0 0 0 1 0 0 264 accuracy = 0.924 precision = 0.936 recall = 0.924 f1 = 0.925
# Random Forest evaluated on all chimpanzee sequences.
y_rfpred_chimp = rfmodel.predict(X_chimp)
print("Confusion matrix for predictions on Chimpanzee test DNA sequence\n")
rf_chimp_table = pd.crosstab(
    pd.Series(y_chim, name='Actual'),
    pd.Series(y_rfpred_chimp, name='Predicted'),
)
print(rf_chimp_table)
accuracy, precision, recall, f1 = get_metrics(y_chim, y_rfpred_chimp)
print(f"accuracy = {accuracy:.3f} \nprecision = {precision:.3f} \nrecall = {recall:.3f} \nf1 = {f1:.3f}")
Confusion matrix for predictions on Chimpanzee test DNA sequence Predicted 0 1 2 3 4 5 6 Actual 0 232 0 0 0 0 0 2 1 0 184 0 0 0 0 1 2 0 0 143 0 0 0 1 3 0 0 0 224 0 0 4 4 0 0 0 3 250 0 8 5 0 0 0 0 0 108 1 6 0 0 0 0 0 0 521 accuracy = 0.988 precision = 0.988 recall = 0.988 f1 = 0.988
# Random Forest evaluated on all dog sequences.
y_rfpred_dog = rfmodel.predict(X_dog)
print("Confusion matrix for predictions on Dog test DNA sequence\n")
rf_dog_table = pd.crosstab(
    pd.Series(y_dog, name='Actual'),
    pd.Series(y_rfpred_dog, name='Predicted'),
)
print(rf_dog_table)
accuracy, precision, recall, f1 = get_metrics(y_dog, y_rfpred_dog)
print(f"accuracy = {accuracy:.3f} \nprecision = {precision:.3f} \nrecall = {recall:.3f} \nf1 = {f1:.3f}")
Confusion matrix for predictions on Dog test DNA sequence Predicted 0 1 2 3 4 5 6 Actual 0 108 1 0 3 1 0 18 1 0 61 0 3 1 0 10 2 1 1 53 0 2 0 7 3 2 0 0 83 0 0 10 4 2 0 0 5 94 0 34 5 12 1 0 0 1 36 10 6 0 0 0 1 0 0 259 accuracy = 0.846 precision = 0.870 recall = 0.846 f1 = 0.844
#Training different 1-D CNNs
def _build_lenet(kernel_size, second_filters):
    """Build and compile one LeNet-style 1-D CNN with a 7-way softmax output.

    Args:
        kernel_size: Kernel width used by both Conv1D layers.
        second_filters: Number of filters in the second Conv1D layer
            (the first always has 16).

    Returns:
        A compiled ``keras.models.Sequential`` (SGD optimiser, sparse
        categorical cross-entropy loss, accuracy metric).
    """
    net = keras.models.Sequential()
    stack = (
        # Input: one channel per sample, one step per n-gram feature column.
        keras.layers.Conv1D(16, kernel_size, activation='relu', input_shape=(X_ntrain.shape[1], 1)),
        keras.layers.MaxPool1D(strides=3),
        keras.layers.Conv1D(filters=second_filters, kernel_size=kernel_size, padding='valid', activation='relu'),
        keras.layers.MaxPool1D(strides=3),
        keras.layers.Flatten(),
        keras.layers.Dense(84, activation='relu'),
        keras.layers.Dense(7, activation='softmax'),
    )
    for layer in stack:
        net.add(layer)
    net.compile(loss="sparse_categorical_crossentropy",
                optimizer="sgd",
                metrics=['accuracy'])
    return net


#Model with kernel size of 5 and 24 filters in the second layer
mlenet5 = _build_lenet(kernel_size=5, second_filters=24)
#Model with kernel size of 5 and 16 filters in the second layer
m2lenet5 = _build_lenet(kernel_size=5, second_filters=16)
#Model with kernel size of 3 and 24 filters in the second layer
m3lenet5 = _build_lenet(kernel_size=3, second_filters=24)
#Model with kernel size of 3 and 16 filters in the second layer
m4lenet5 = _build_lenet(kernel_size=3, second_filters=16)
2022-06-04 18:20:08.968751: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
# Train the kernel-5 / 24-filter CNN for 5 epochs and keep the fit result.
# `.toarray()` is the documented dense conversion; the `.A` shorthand is not
# available on SciPy's newer sparse array types.
lenet1 = mlenet5.fit(X_ntrain.toarray(), y_ntrain, validation_data=(X_valid.toarray(), y_valid), epochs=5)
Epoch 1/5 88/88 [==============================] - 376s 4s/step - loss: 1.7984 - accuracy: 0.3689 - val_loss: 1.6797 - val_accuracy: 0.3195 Epoch 2/5 88/88 [==============================] - 381s 4s/step - loss: 1.4009 - accuracy: 0.5822 - val_loss: 1.1428 - val_accuracy: 0.7561 Epoch 3/5 88/88 [==============================] - 377s 4s/step - loss: 0.7138 - accuracy: 0.8569 - val_loss: 0.6323 - val_accuracy: 0.8274 Epoch 4/5 88/88 [==============================] - 390s 4s/step - loss: 0.3298 - accuracy: 0.9493 - val_loss: 0.4320 - val_accuracy: 0.9001 Epoch 5/5 88/88 [==============================] - 383s 4s/step - loss: 0.1761 - accuracy: 0.9832 - val_loss: 0.3678 - val_accuracy: 0.8973
# Train the kernel-5 / 16-filter CNN for 5 epochs and keep the fit result.
# `.toarray()` is the documented dense conversion; the `.A` shorthand is not
# available on SciPy's newer sparse array types.
lenet2 = m2lenet5.fit(X_ntrain.toarray(), y_ntrain, validation_data=(X_valid.toarray(), y_valid), epochs=5)
Epoch 1/5 88/88 [==============================] - 343s 4s/step - loss: 1.7421 - accuracy: 0.3732 - val_loss: 1.5514 - val_accuracy: 0.4494 Epoch 2/5 88/88 [==============================] - 345s 4s/step - loss: 1.2154 - accuracy: 0.6497 - val_loss: 1.0843 - val_accuracy: 0.8873 Epoch 3/5 88/88 [==============================] - 351s 4s/step - loss: 0.6032 - accuracy: 0.8798 - val_loss: 0.5599 - val_accuracy: 0.8930 Epoch 4/5 88/88 [==============================] - 358s 4s/step - loss: 0.2771 - accuracy: 0.9636 - val_loss: 0.4065 - val_accuracy: 0.9044 Epoch 5/5 88/88 [==============================] - 359s 4s/step - loss: 0.1515 - accuracy: 0.9854 - val_loss: 0.3262 - val_accuracy: 0.9287
# Train the kernel-3 / 24-filter CNN for 5 epochs and keep the fit result.
# `.toarray()` is the documented dense conversion; the `.A` shorthand is not
# available on SciPy's newer sparse array types.
lenet3 = m3lenet5.fit(X_ntrain.toarray(), y_ntrain, validation_data=(X_valid.toarray(), y_valid), epochs=5)
Epoch 1/5 88/88 [==============================] - 346s 4s/step - loss: 1.6002 - accuracy: 0.4631 - val_loss: 1.3236 - val_accuracy: 0.8573 Epoch 2/5 88/88 [==============================] - 347s 4s/step - loss: 0.8466 - accuracy: 0.8052 - val_loss: 0.7240 - val_accuracy: 0.7389 Epoch 3/5 88/88 [==============================] - 343s 4s/step - loss: 0.3757 - accuracy: 0.9486 - val_loss: 0.4535 - val_accuracy: 0.9330 Epoch 4/5 88/88 [==============================] - 346s 4s/step - loss: 0.1920 - accuracy: 0.9790 - val_loss: 0.3411 - val_accuracy: 0.9230 Epoch 5/5 88/88 [==============================] - 353s 4s/step - loss: 0.1124 - accuracy: 0.9900 - val_loss: 0.3018 - val_accuracy: 0.9087
# Train the kernel-3 / 16-filter CNN for 5 epochs and keep the fit result.
# `.toarray()` is the documented dense conversion; the `.A` shorthand is not
# available on SciPy's newer sparse array types.
lenet4 = m4lenet5.fit(X_ntrain.toarray(), y_ntrain, validation_data=(X_valid.toarray(), y_valid), epochs=5)
Epoch 1/5 88/88 [==============================] - 320s 4s/step - loss: 1.7320 - accuracy: 0.3800 - val_loss: 1.5863 - val_accuracy: 0.4251 Epoch 2/5 88/88 [==============================] - 319s 4s/step - loss: 1.2028 - accuracy: 0.6664 - val_loss: 1.0152 - val_accuracy: 0.8745 Epoch 3/5 88/88 [==============================] - 318s 4s/step - loss: 0.5817 - accuracy: 0.8980 - val_loss: 0.5708 - val_accuracy: 0.8773 Epoch 4/5 88/88 [==============================] - 316s 4s/step - loss: 0.2773 - accuracy: 0.9668 - val_loss: 0.3939 - val_accuracy: 0.9158 Epoch 5/5 88/88 [==============================] - 310s 4s/step - loss: 0.1531 - accuracy: 0.9857 - val_loss: 0.3150 - val_accuracy: 0.9258
#Performance of lenet2 on human sequences
# argmax over the 7 softmax outputs gives the predicted class per sequence.
y_cnnpred_hum = np.argmax(m2lenet5.predict(X_test.toarray()), axis=1)
# The banner previously said "Dog" although the human test set is evaluated.
print("Confusion matrix for predictions on human test DNA sequence\n")
print(pd.crosstab(pd.Series(y_test, name='Actual'), pd.Series(y_cnnpred_hum, name='Predicted')))
accuracy, precision, recall, f1 = get_metrics(y_test, y_cnnpred_hum)
print("accuracy = %.3f \nprecision = %.3f \nrecall = %.3f \nf1 = %.3f" % (accuracy, precision, recall, f1))
28/28 [==============================] - 31s 974ms/step Confusion matrix for predictions on Dog test DNA sequence Predicted 0 1 2 3 4 5 6 Actual 0 85 3 0 3 4 0 7 1 0 102 0 1 0 0 3 2 0 3 69 0 0 0 6 3 0 0 0 119 0 0 6 4 0 2 0 2 131 0 14 5 0 1 0 0 0 42 8 6 0 0 0 1 0 0 264 accuracy = 0.927 precision = 0.934 recall = 0.927 f1 = 0.927
#Performance of lenet2 on chimp sequences
# argmax over the 7 softmax outputs gives the predicted class per sequence.
y_cnnpred_chim = np.argmax(m2lenet5.predict(X_chimp.toarray()), axis=1)
# The banner previously said "Dog" although chimpanzee data is evaluated.
print("Confusion matrix for predictions on Chimpanzee test DNA sequence\n")
print(pd.crosstab(pd.Series(y_chim, name='Actual'), pd.Series(y_cnnpred_chim, name='Predicted')))
accuracy, precision, recall, f1 = get_metrics(y_chim, y_cnnpred_chim)
print("accuracy = %.3f \nprecision = %.3f \nrecall = %.3f \nf1 = %.3f" % (accuracy, precision, recall, f1))
53/53 [==============================] - 58s 1s/step Confusion matrix for predictions on Dog test DNA sequence Predicted 0 1 2 3 4 5 6 Actual 0 223 1 0 1 1 0 8 1 0 184 0 0 1 0 0 2 0 0 139 0 0 0 5 3 0 0 0 224 2 0 2 4 0 6 0 2 243 0 10 5 0 0 0 1 0 106 2 6 0 0 0 0 0 0 521 accuracy = 0.975 precision = 0.976 recall = 0.975 f1 = 0.975
#Performance of lenet2 on dog sequences
# argmax over the 7 softmax outputs gives the predicted class per sequence.
y_cnnpred_dog = np.argmax(m2lenet5.predict(X_dog.toarray()), axis=1)
print("Confusion matrix for predictions on Dog test DNA sequence\n")
print(pd.crosstab(pd.Series(y_dog, name='Actual'), pd.Series(y_cnnpred_dog, name='Predicted')))
accuracy, precision, recall, f1 = get_metrics(y_dog, y_cnnpred_dog)
print("accuracy = %.3f \nprecision = %.3f \nrecall = %.3f \nf1 = %.3f" % (accuracy, precision, recall, f1))
26/26 [==============================] - 22s 833ms/step Confusion matrix for predictions on Dog test DNA sequence Predicted 0 1 2 3 4 5 6 Actual 0 117 3 0 1 0 0 10 1 0 71 0 0 0 0 4 2 0 3 54 0 1 0 6 3 0 3 0 87 1 0 4 4 0 2 0 5 106 0 22 5 4 4 0 0 1 47 4 6 0 1 0 0 0 0 259 accuracy = 0.904 precision = 0.915 recall = 0.904 f1 = 0.903
#Recurrent Neural Networks
def _build_conv_rnn(recurrent_layer, units, conv_blocks=4):
    """Build and compile a Conv1D front-end followed by two recurrent layers.

    Args:
        recurrent_layer: Keras recurrent layer class to stack twice
            (``SimpleRNN``, ``LSTM`` or ``GRU``).
        units: Number of units in each recurrent layer.
        conv_blocks: Number of ``Conv1D(8, 3)`` + ``MaxPool1D(strides=3)``
            pairs placed before the recurrent layers.

    Returns:
        A compiled ``keras.models.Sequential`` with a 7-way softmax output
        (SGD optimiser, sparse categorical cross-entropy, accuracy metric).
    """
    layers = []
    for block_index in range(conv_blocks):
        conv_options = {'activation': 'relu'}
        if block_index == 0:
            # Only the first layer declares the input: one channel per
            # sample, one step per n-gram feature column.
            conv_options['input_shape'] = (X_ntrain.shape[1], 1)
        layers.append(keras.layers.Conv1D(8, 3, **conv_options))
        layers.append(keras.layers.MaxPool1D(strides=3))
    layers.extend([
        recurrent_layer(units, return_sequences=True),
        recurrent_layer(units, return_sequences=True),
        keras.layers.Flatten(),
        # The string loss below expects class probabilities (from_logits
        # defaults to False), so the output layer needs a softmax. The
        # original bare Dense(7) fed raw logits into that loss.
        keras.layers.Dense(7, activation='softmax'),
    ])
    model = keras.models.Sequential(layers)
    model.compile(loss="sparse_categorical_crossentropy",
                  optimizer="sgd",
                  metrics=['accuracy'])
    return model


#SimpleRNN layers
rnmodel1 = _build_conv_rnn(keras.layers.SimpleRNN, 10)
#LSTM layers
rnmodel2 = _build_conv_rnn(keras.layers.LSTM, 10)
#GRU layers
rnmodel3 = _build_conv_rnn(keras.layers.GRU, 10)
#GRU layers with 2 units instead of 10
rnmodel4 = _build_conv_rnn(keras.layers.GRU, 2)
#GRU layers with one less pre-Conv1D layer and more parameters in the model
rnmodel5 = _build_conv_rnn(keras.layers.GRU, 10, conv_blocks=3)
# Print the layer-by-layer structure and parameter counts of the SimpleRNN model.
rnmodel1.summary()
Model: "sequential_4" _________________________________________________________________ Layer (type) Output Shape Param # ================================================================= conv1d_8 (Conv1D) (None, 602853, 8) 32 max_pooling1d_8 (MaxPooling (None, 200951, 8) 0 1D) conv1d_9 (Conv1D) (None, 200949, 8) 200 max_pooling1d_9 (MaxPooling (None, 66983, 8) 0 1D) conv1d_10 (Conv1D) (None, 66981, 8) 200 max_pooling1d_10 (MaxPoolin (None, 22327, 8) 0 g1D) conv1d_11 (Conv1D) (None, 22325, 8) 200 max_pooling1d_11 (MaxPoolin (None, 7442, 8) 0 g1D) simple_rnn (SimpleRNN) (None, 7442, 10) 190 simple_rnn_1 (SimpleRNN) (None, 7442, 10) 210 flatten_4 (Flatten) (None, 74420) 0 dense_8 (Dense) (None, 7) 520947 ================================================================= Total params: 521,979 Trainable params: 521,979 Non-trainable params: 0 _________________________________________________________________
# Train rnmodel1 for 2 epochs and keep the fit result.
# `.toarray()` is the documented dense conversion; the `.A` shorthand is not
# available on SciPy's newer sparse array types.
rn1 = rnmodel1.fit(X_ntrain.toarray(), y_ntrain, validation_data=(X_valid.toarray(), y_valid), epochs=2)
Epoch 1/2 88/88 [==============================] - 776s 9s/step - loss: 10.8259 - accuracy: 0.1577 - val_loss: 10.9396 - val_accuracy: 0.1683 Epoch 2/2 88/88 [==============================] - 778s 9s/step - loss: 10.8511 - accuracy: 0.1584 - val_loss: 10.9396 - val_accuracy: 0.1683
# Train rnmodel2 for 2 epochs and keep the fit result.
# `.toarray()` is the documented dense conversion; the `.A` shorthand is not
# available on SciPy's newer sparse array types.
rn2 = rnmodel2.fit(X_ntrain.toarray(), y_ntrain, validation_data=(X_valid.toarray(), y_valid), epochs=2)
Epoch 1/2 88/88 [==============================] - 866s 10s/step - loss: 5.4435 - accuracy: 0.1195 - val_loss: 5.5940 - val_accuracy: 0.1327 Epoch 2/2 88/88 [==============================] - 880s 10s/step - loss: 5.4460 - accuracy: 0.1195 - val_loss: 5.5940 - val_accuracy: 0.1327
# Train rnmodel3 for 2 epochs and keep the fit result.
# `.toarray()` is the documented dense conversion; the `.A` shorthand is not
# available on SciPy's newer sparse array types.
rn3 = rnmodel3.fit(X_ntrain.toarray(), y_ntrain, validation_data=(X_valid.toarray(), y_valid), epochs=2)
Epoch 1/2 88/88 [==============================] - 910s 10s/step - loss: 9.8086 - accuracy: 0.3068 - val_loss: 10.0513 - val_accuracy: 0.2981 Epoch 2/2 88/88 [==============================] - 908s 10s/step - loss: 9.8361 - accuracy: 0.3100 - val_loss: 10.0513 - val_accuracy: 0.2981
# Train rnmodel4 for 2 epochs and keep the fit result.
# `.toarray()` is the documented dense conversion; the `.A` shorthand is not
# available on SciPy's newer sparse array types.
rn4 = rnmodel4.fit(X_ntrain.toarray(), y_ntrain, validation_data=(X_valid.toarray(), y_valid), epochs=2)
Epoch 1/2 88/88 [==============================] - 826s 9s/step - loss: 14.6873 - accuracy: 0.0817 - val_loss: 14.8075 - val_accuracy: 0.0813 Epoch 2/2 88/88 [==============================] - 804s 9s/step - loss: 14.8875 - accuracy: 0.0763 - val_loss: 14.8075 - val_accuracy: 0.0813
# Train rnmodel5 for 2 epochs and keep the fit result.
# `.toarray()` is the documented dense conversion; the `.A` shorthand is not
# available on SciPy's newer sparse array types.
rn5 = rnmodel5.fit(X_ntrain.toarray(), y_ntrain, validation_data=(X_valid.toarray(), y_valid), epochs=2)
Epoch 1/2 88/88 [==============================] - 4766s 54s/step - loss: 6.1604 - accuracy: 0.3086 - val_loss: 6.2770 - val_accuracy: 0.2981 Epoch 2/2 88/88 [==============================] - 4803s 55s/step - loss: 6.1407 - accuracy: 0.3100 - val_loss: 6.2770 - val_accuracy: 0.2981
#Performance of rn5 on human sequences
# argmax over the 7 model outputs gives the predicted class per sequence.
y_rnnpred_hum = np.argmax(rnmodel5.predict(X_test.toarray()), axis=1)
# The banner previously said "Dog" although the human test set is evaluated.
print("Confusion matrix for predictions on human test DNA sequence\n")
print(pd.crosstab(pd.Series(y_test, name='Actual'), pd.Series(y_rnnpred_hum, name='Predicted')))
accuracy, precision, recall, f1 = get_metrics(y_test, y_rnnpred_hum)
print("accuracy = %.3f \nprecision = %.3f \nrecall = %.3f \nf1 = %.3f" % (accuracy, precision, recall, f1))
28/28 [==============================] - 63s 2s/step Confusion matrix for predictions on Dog test DNA sequence Predicted 6 Actual 0 102 1 106 2 78 3 125 4 149 5 51 6 265 accuracy = 0.303 precision = 0.092 recall = 0.303 f1 = 0.141
/usr/local/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1318: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result))
#Performance of rn5 on chimp sequences
# argmax over the 7 model outputs gives the predicted class per sequence.
y_rnnpred_chim = np.argmax(rnmodel5.predict(X_chimp.toarray()), axis=1)
# The banner previously said "Dog" although chimpanzee data is evaluated.
print("Confusion matrix for predictions on Chimpanzee test DNA sequence\n")
print(pd.crosstab(pd.Series(y_chim, name='Actual'), pd.Series(y_rnnpred_chim, name='Predicted')))
accuracy, precision, recall, f1 = get_metrics(y_chim, y_rnnpred_chim)
print("accuracy = %.3f \nprecision = %.3f \nrecall = %.3f \nf1 = %.3f" % (accuracy, precision, recall, f1))
53/53 [==============================] - 121s 2s/step Confusion matrix for predictions on Dog test DNA sequence Predicted 6 Actual 0 234 1 185 2 144 3 228 4 261 5 109 6 521 accuracy = 0.310 precision = 0.096 recall = 0.310 f1 = 0.147
/usr/local/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1318: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result))
#Performance of rn5 on dog sequences
# argmax over the 7 model outputs gives the predicted class per sequence.
# `.toarray()` is the documented dense conversion; the `.A` shorthand is not
# available on SciPy's newer sparse array types.
y_rnnpred_dog = np.argmax(rnmodel5.predict(X_dog.toarray()), axis=1)
print("Confusion matrix for predictions on Dog test DNA sequence\n")
print(pd.crosstab(pd.Series(y_dog, name='Actual'), pd.Series(y_rnnpred_dog, name='Predicted')))
accuracy, precision, recall, f1 = get_metrics(y_dog, y_rnnpred_dog)
print("accuracy = %.3f \nprecision = %.3f \nrecall = %.3f \nf1 = %.3f" % (accuracy, precision, recall, f1))
26/26 [==============================] - 59s 2s/step Confusion matrix for predictions on Dog test DNA sequence Predicted 6 Actual 0 131 1 75 2 64 3 95 4 135 5 60 6 260 accuracy = 0.317 precision = 0.101 recall = 0.317 f1 = 0.153
/usr/local/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1318: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result))