import pandas as pd
import sklearn
import pwlf
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
import glob

from tensorflow import keras
import tensorflow as tf
from sklearn.model_selection import train_test_split
from Bio import SeqIO
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import KFold
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from nilearn import plotting
# 5-fold cross-validation splitter: rows are shuffled with a fixed seed so the
# folds are the same on every run.
crossvalidation = KFold(
    n_splits=5,
    shuffle=True,
    random_state=1,
)


# Locations of the tab-separated DNA datasets, one file per species.
humanpath = r'/Users/apoorvsara/Downloads/archive/human.txt'
chimppath = r'/Users/apoorvsara/Downloads/archive/chimpanzee.txt'
dogpath = r'/Users/apoorvsara/Downloads/archive/dog.txt'

# Read every file into its own DataFrame, in the same order as the paths above.
human_rawdna, chimp_rawdna, dog_rawdna = (
    pd.read_table(path) for path in (humanpath, chimppath, dogpath)
)


# Bar chart of the number of human samples per class label, ordered by label.
(
    human_rawdna['class']
    .value_counts()
    .sort_index()
    .plot.bar()
)
plt.title("Class distribution of Human DNA")

Text(0.5, 1.0, 'Class distribution of Human DNA')


# Converts all raw DNA sequences into "words" (or amino acids) using overlapping k-mers of size 3.
# Data preprocessing steps taken from Chauhan 2021.

def Kmers_funct(seq, size=3):
    """Split *seq* into its overlapping, lower-cased k-mers.

    A window of length ``size`` slides over ``seq`` one position at a time.

    Args:
        seq: The sequence to split (sliceable, with ``lower()`` on its slices).
        size: Length of each k-mer; defaults to 3.

    Returns:
        A list with one lower-cased k-mer per window position; empty when
        ``seq`` is shorter than ``size``.
    """
    window_count = len(seq) - size + 1
    kmers = []
    for start in range(window_count):
        kmers.append(seq[start:start + size].lower())
    return kmers

def _row_to_kmers(row):
    """Return the k-mer list for the 'sequence' field of one DataFrame row."""
    return Kmers_funct(row['sequence'])


# For each species: store the k-mer list in a new 'words' column, then drop
# the raw 'sequence' column, which is no longer needed.
human_rawdna['words'] = human_rawdna.apply(_row_to_kmers, axis=1)
human_rawdna = human_rawdna.drop(columns='sequence')

chimp_rawdna['words'] = chimp_rawdna.apply(_row_to_kmers, axis=1)
chimp_rawdna = chimp_rawdna.drop(columns='sequence')

dog_rawdna['words'] = dog_rawdna.apply(_row_to_kmers, axis=1)
dog_rawdna = dog_rawdna.drop(columns='sequence')

# Join every sample's k-mer list into one space-separated string so it can be
# tokenized like a sentence, and take the labels from the first column.
human_texts = [' '.join(words) for words in human_rawdna['words']]
y_human = human_rawdna.iloc[:, 0].values  # labels for the human sequences

chimp_texts = [' '.join(words) for words in chimp_rawdna['words']]
y_chim = chimp_rawdna.iloc[:, 0].values  # labels for the chimpanzee sequences

dog_texts = [' '.join(words) for words in dog_rawdna['words']]
y_dog = dog_rawdna.iloc[:, 0].values  # labels for the dog sequences

# Bag-of-n-grams features: count only n-grams of exactly 8 consecutive words.
cv = CountVectorizer(ngram_range=(8, 8))

# The vocabulary is learned from the human texts only; the chimpanzee and dog
# texts are encoded with that same vocabulary.
X = cv.fit_transform(human_texts)
X_chimp, X_dog = (cv.transform(texts) for texts in (chimp_texts, dog_texts))


# Report (samples, features) for each species.
for feature_matrix in (X, X_chimp, X_dog):
    print(feature_matrix.shape)

(4380, 602855)
(1682, 602855)
(820, 602855)


# Hold out 20% of the human samples as the test set (fixed seed for repeatability).
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_human,
    random_state=42,
    test_size=0.20,
)


# Split the human training set again (80/20) to get a validation set for the
# neural networks. Naive Bayes and Random Forest are fit on the full X_train.
from sklearn.model_selection import train_test_split
X_ntrain, X_valid, y_ntrain, y_valid = train_test_split(
    X_train,
    y_train,
    random_state=42,
    test_size=0.20,
)


# Multinomial Naive Bayes; the smoothing value alpha=0.1 follows Chauhan 2021.
classifier = MultinomialNB(alpha=0.1)
y_pred = classifier.fit(X_train, y_train).predict(X_test)

# Cross-tabulate true labels (rows) against predicted labels (columns).
print("Confusion matrix for predictions on human test DNA sequence\n")
actual_labels = pd.Series(y_test, name='Actual')
predicted_labels = pd.Series(y_pred, name='Predicted')
print(pd.crosstab(actual_labels, predicted_labels))
def get_metrics(y_test, y_predicted):
    """Score predictions against the true labels.

    Args:
        y_test: True class labels.
        y_predicted: Predicted class labels, aligned with ``y_test``.

    Returns:
        A tuple ``(accuracy, precision, recall, f1)``; the last three are
        computed with ``average='weighted'``.
    """
    accuracy = accuracy_score(y_test, y_predicted)
    precision, recall, f1 = (
        scorer(y_test, y_predicted, average='weighted')
        for scorer in (precision_score, recall_score, f1_score)
    )
    return accuracy, precision, recall, f1
# Score the Naive Bayes predictions on the human test split and print each
# metric rounded to three decimals.
accuracy, precision, recall, f1 = get_metrics(y_test, y_pred)
print(
    "accuracy = %.3f \nprecision = %.3f \nrecall = %.3f \nf1 = %.3f"
    % (accuracy, precision, recall, f1)
)

Confusion matrix for predictions on human test DNA sequence

Predicted   0    1   2    3    4   5    6
Actual                                   
0          96    0   0    0    2   4    0
1           0  106   0    0    0   0    0
2           0    0  78    0    0   0    0
3           0    0   0  124    1   0    0
4           0    0   0    0  146   1    2
5           0    0   0    0    0  51    0
6           0    0   0    0    4   2  259
accuracy = 0.982 
precision = 0.983 
recall = 0.982 
f1 = 0.982


# Naive Bayes (trained on human data) evaluated on the chimpanzee sequences.

y_pred_chimp = classifier.predict(X_chimp)

print("Confusion matrix for predictions on Chimpanzee test DNA sequence\n")
actual_labels = pd.Series(y_chim, name='Actual')
predicted_labels = pd.Series(y_pred_chimp, name='Predicted')
print(pd.crosstab(actual_labels, predicted_labels))

accuracy, precision, recall, f1 = get_metrics(y_chim, y_pred_chimp)
print(
    "accuracy = %.3f \nprecision = %.3f \nrecall = %.3f \nf1 = %.3f"
    % (accuracy, precision, recall, f1)
)

Confusion matrix for predictions on Chimpanzee test DNA sequence

Predicted    0    1    2    3    4    5    6
Actual                                      
0          231    0    0    0    1    2    0
1            0  185    0    0    0    0    0
2            0    0  144    0    0    0    0
3            0    0    0  227    1    0    0
4            0    0    0    0  259    1    1
5            0    0    0    0    0  109    0
6            0    0    0    0    5    0  516
accuracy = 0.993 
precision = 0.994 
recall = 0.993 
f1 = 0.993


# Naive Bayes (trained on human data) evaluated on the dog sequences.

y_pred_dog = classifier.predict(X_dog)

print("Confusion matrix for predictions on Dog test DNA sequence\n")
actual_labels = pd.Series(y_dog, name='Actual')
predicted_labels = pd.Series(y_pred_dog, name='Predicted')
print(pd.crosstab(actual_labels, predicted_labels))

accuracy, precision, recall, f1 = get_metrics(y_dog, y_pred_dog)
print(
    "accuracy = %.3f \nprecision = %.3f \nrecall = %.3f \nf1 = %.3f"
    % (accuracy, precision, recall, f1)
)

Confusion matrix for predictions on Dog test DNA sequence

Predicted    0   1   2   3    4   5    6
Actual                                  
0          129   0   0   0    1   0    1
1            0  72   0   0    2   0    1
2            0   0  64   0    0   0    0
3            1   0   0  91    2   1    0
4            0   0   0   1  134   0    0
5            0   0   0   0    1  59    0
6            1   0   0   0    2   0  257
accuracy = 0.983 
precision = 0.983 
recall = 0.983 
f1 = 0.983


# Random Forest with default hyper-parameters, fit on the human training split.
rfmodel = RandomForestClassifier()
rfmodel.fit(X_train, y_train)

# 5-fold cross-validated accuracy on the training split; train-fold scores are
# kept as well so both can be reported.
rfscoresr2 = cross_validate(
    rfmodel,
    X_train,
    y_train,
    cv=crossvalidation,
    scoring="accuracy",
    return_train_score=True,
    n_jobs=10,
)
for heading, score_key in (
    ("Random Forest Training ", 'train_score'),
    ("Random Forest Test ", 'test_score'),
):
    fold_scores = rfscoresr2[score_key]
    print(
        heading + "Folds: " + str(len(fold_scores))
        + ", Accuracy: " + str(np.mean(np.abs(fold_scores)))
        + ", STD: " + str(np.std(fold_scores))
    )

Random Forest Training Folds: 5, Accuracy: 0.9995005605874692, STD: 0.0002854212184307589
Random Forest Test Folds: 5, Accuracy: 0.8861263501120848, STD: 0.01630437780828705


# Random Forest predictions on the held-out human test split.

y_rfpred = rfmodel.predict(X_test)

print("Confusion matrix for predictions on human test DNA sequence\n")
actual_labels = pd.Series(y_test, name='Actual')
predicted_labels = pd.Series(y_rfpred, name='Predicted')
print(pd.crosstab(actual_labels, predicted_labels))
# NOTE: this repeats the get_metrics definition from the Naive Bayes section
# with the same behaviour.
def get_metrics(y_test, y_predicted):
    """Score predictions against the true labels.

    Args:
        y_test: True class labels.
        y_predicted: Predicted class labels, aligned with ``y_test``.

    Returns:
        A tuple ``(accuracy, precision, recall, f1)``; the last three are
        computed with ``average='weighted'``.
    """
    accuracy = accuracy_score(y_test, y_predicted)
    precision, recall, f1 = (
        scorer(y_test, y_predicted, average='weighted')
        for scorer in (precision_score, recall_score, f1_score)
    )
    return accuracy, precision, recall, f1
# Score the Random Forest predictions on the human test split and print each
# metric rounded to three decimals.
accuracy, precision, recall, f1 = get_metrics(y_test, y_rfpred)
print(
    "accuracy = %.3f \nprecision = %.3f \nrecall = %.3f \nf1 = %.3f"
    % (accuracy, precision, recall, f1)
)

Confusion matrix for predictions on human test DNA sequence

Predicted   0   1   2    3    4   5    6
Actual                                  
0          92   0   0    0    0   0   10
1           2  95   0    0    0   0    9
2           0   0  70    0    0   0    8
3           0   0   0  116    0   0    9
4           0   0   0    2  128   0   19
5           0   0   0    0    0  44    7
6           0   0   0    1    0   0  264
accuracy = 0.924 
precision = 0.936 
recall = 0.924 
f1 = 0.925


# Random Forest (trained on human data) evaluated on the chimpanzee sequences.

y_rfpred_chimp = rfmodel.predict(X_chimp)

print("Confusion matrix for predictions on Chimpanzee test DNA sequence\n")
actual_labels = pd.Series(y_chim, name='Actual')
predicted_labels = pd.Series(y_rfpred_chimp, name='Predicted')
print(pd.crosstab(actual_labels, predicted_labels))

accuracy, precision, recall, f1 = get_metrics(y_chim, y_rfpred_chimp)
print(
    "accuracy = %.3f \nprecision = %.3f \nrecall = %.3f \nf1 = %.3f"
    % (accuracy, precision, recall, f1)
)

Confusion matrix for predictions on Chimpanzee test DNA sequence

Predicted    0    1    2    3    4    5    6
Actual                                      
0          232    0    0    0    0    0    2
1            0  184    0    0    0    0    1
2            0    0  143    0    0    0    1
3            0    0    0  224    0    0    4
4            0    0    0    3  250    0    8
5            0    0    0    0    0  108    1
6            0    0    0    0    0    0  521
accuracy = 0.988 
precision = 0.988 
recall = 0.988 
f1 = 0.988


# Random Forest (trained on human data) evaluated on the dog sequences.

y_rfpred_dog = rfmodel.predict(X_dog)

print("Confusion matrix for predictions on Dog test DNA sequence\n")
actual_labels = pd.Series(y_dog, name='Actual')
predicted_labels = pd.Series(y_rfpred_dog, name='Predicted')
print(pd.crosstab(actual_labels, predicted_labels))

accuracy, precision, recall, f1 = get_metrics(y_dog, y_rfpred_dog)
print(
    "accuracy = %.3f \nprecision = %.3f \nrecall = %.3f \nf1 = %.3f"
    % (accuracy, precision, recall, f1)
)

Confusion matrix for predictions on Dog test DNA sequence

Predicted    0   1   2   3   4   5    6
Actual                                 
0          108   1   0   3   1   0   18
1            0  61   0   3   1   0   10
2            1   1  53   0   2   0    7
3            2   0   0  83   0   0   10
4            2   0   0   5  94   0   34
5           12   1   0   0   1  36   10
6            0   0   0   1   0   0  259
accuracy = 0.846 
precision = 0.870 
recall = 0.846 
f1 = 0.844


#Training different 1-D CNNs

# CNN 1: kernel size 5, with 16 filters in the first and 24 in the second
# convolution. Each feature vector is fed in as a one-channel sequence.
mlenet5 = keras.models.Sequential([
    keras.layers.Conv1D(filters=16, kernel_size=5, activation='relu',
                        input_shape=(X_ntrain.shape[1], 1)),
    keras.layers.MaxPool1D(strides=3),
    keras.layers.Conv1D(filters=24, kernel_size=5, padding='valid', activation='relu'),
    keras.layers.MaxPool1D(strides=3),
    keras.layers.Flatten(),
    keras.layers.Dense(84, activation='relu'),
    # One softmax output per class (7 classes).
    keras.layers.Dense(7, activation='softmax'),
])
mlenet5.compile(
    optimizer="sgd",
    loss="sparse_categorical_crossentropy",
    metrics=['accuracy'],
)

# CNN 2: kernel size 5, with 16 filters in both convolutions.
m2lenet5 = keras.models.Sequential([
    keras.layers.Conv1D(filters=16, kernel_size=5, activation='relu',
                        input_shape=(X_ntrain.shape[1], 1)),
    keras.layers.MaxPool1D(strides=3),
    keras.layers.Conv1D(filters=16, kernel_size=5, padding='valid', activation='relu'),
    keras.layers.MaxPool1D(strides=3),
    keras.layers.Flatten(),
    keras.layers.Dense(84, activation='relu'),
    # One softmax output per class (7 classes).
    keras.layers.Dense(7, activation='softmax'),
])
m2lenet5.compile(
    optimizer="sgd",
    loss="sparse_categorical_crossentropy",
    metrics=['accuracy'],
)

# CNN 3: kernel size 3, with 16 filters in the first and 24 in the second convolution.
m3lenet5 = keras.models.Sequential([
    keras.layers.Conv1D(filters=16, kernel_size=3, activation='relu',
                        input_shape=(X_ntrain.shape[1], 1)),
    keras.layers.MaxPool1D(strides=3),
    keras.layers.Conv1D(filters=24, kernel_size=3, padding='valid', activation='relu'),
    keras.layers.MaxPool1D(strides=3),
    keras.layers.Flatten(),
    keras.layers.Dense(84, activation='relu'),
    # One softmax output per class (7 classes).
    keras.layers.Dense(7, activation='softmax'),
])
m3lenet5.compile(
    optimizer="sgd",
    loss="sparse_categorical_crossentropy",
    metrics=['accuracy'],
)

# CNN 4: kernel size 3, with 16 filters in both convolutions.
m4lenet5 = keras.models.Sequential([
    keras.layers.Conv1D(filters=16, kernel_size=3, activation='relu',
                        input_shape=(X_ntrain.shape[1], 1)),
    keras.layers.MaxPool1D(strides=3),
    keras.layers.Conv1D(filters=16, kernel_size=3, padding='valid', activation='relu'),
    keras.layers.MaxPool1D(strides=3),
    keras.layers.Flatten(),
    keras.layers.Dense(84, activation='relu'),
    # One softmax output per class (7 classes).
    keras.layers.Dense(7, activation='softmax'),
])
m4lenet5.compile(
    optimizer="sgd",
    loss="sparse_categorical_crossentropy",
    metrics=['accuracy'],
)

2022-06-04 18:20:08.968751: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


# Train CNN 1 for 5 epochs; the sparse feature matrices are converted to dense
# arrays before being handed to fit().
lenet1 = mlenet5.fit(
    X_ntrain.toarray(),
    y_ntrain,
    epochs=5,
    validation_data=(X_valid.toarray(), y_valid),
)

Epoch 1/5
88/88 [==============================] - 376s 4s/step - loss: 1.7984 - accuracy: 0.3689 - val_loss: 1.6797 - val_accuracy: 0.3195
Epoch 2/5
88/88 [==============================] - 381s 4s/step - loss: 1.4009 - accuracy: 0.5822 - val_loss: 1.1428 - val_accuracy: 0.7561
Epoch 3/5
88/88 [==============================] - 377s 4s/step - loss: 0.7138 - accuracy: 0.8569 - val_loss: 0.6323 - val_accuracy: 0.8274
Epoch 4/5
88/88 [==============================] - 390s 4s/step - loss: 0.3298 - accuracy: 0.9493 - val_loss: 0.4320 - val_accuracy: 0.9001
Epoch 5/5
88/88 [==============================] - 383s 4s/step - loss: 0.1761 - accuracy: 0.9832 - val_loss: 0.3678 - val_accuracy: 0.8973


# Train CNN 2 for 5 epochs; the sparse feature matrices are converted to dense
# arrays before being handed to fit().
lenet2 = m2lenet5.fit(
    X_ntrain.toarray(),
    y_ntrain,
    epochs=5,
    validation_data=(X_valid.toarray(), y_valid),
)

Epoch 1/5
88/88 [==============================] - 343s 4s/step - loss: 1.7421 - accuracy: 0.3732 - val_loss: 1.5514 - val_accuracy: 0.4494
Epoch 2/5
88/88 [==============================] - 345s 4s/step - loss: 1.2154 - accuracy: 0.6497 - val_loss: 1.0843 - val_accuracy: 0.8873
Epoch 3/5
88/88 [==============================] - 351s 4s/step - loss: 0.6032 - accuracy: 0.8798 - val_loss: 0.5599 - val_accuracy: 0.8930
Epoch 4/5
88/88 [==============================] - 358s 4s/step - loss: 0.2771 - accuracy: 0.9636 - val_loss: 0.4065 - val_accuracy: 0.9044
Epoch 5/5
88/88 [==============================] - 359s 4s/step - loss: 0.1515 - accuracy: 0.9854 - val_loss: 0.3262 - val_accuracy: 0.9287


# Train CNN 3 for 5 epochs; the sparse feature matrices are converted to dense
# arrays before being handed to fit().
lenet3 = m3lenet5.fit(
    X_ntrain.toarray(),
    y_ntrain,
    epochs=5,
    validation_data=(X_valid.toarray(), y_valid),
)

Epoch 1/5
88/88 [==============================] - 346s 4s/step - loss: 1.6002 - accuracy: 0.4631 - val_loss: 1.3236 - val_accuracy: 0.8573
Epoch 2/5
88/88 [==============================] - 347s 4s/step - loss: 0.8466 - accuracy: 0.8052 - val_loss: 0.7240 - val_accuracy: 0.7389
Epoch 3/5
88/88 [==============================] - 343s 4s/step - loss: 0.3757 - accuracy: 0.9486 - val_loss: 0.4535 - val_accuracy: 0.9330
Epoch 4/5
88/88 [==============================] - 346s 4s/step - loss: 0.1920 - accuracy: 0.9790 - val_loss: 0.3411 - val_accuracy: 0.9230
Epoch 5/5
88/88 [==============================] - 353s 4s/step - loss: 0.1124 - accuracy: 0.9900 - val_loss: 0.3018 - val_accuracy: 0.9087


# Train CNN 4 for 5 epochs; the sparse feature matrices are converted to dense
# arrays before being handed to fit().
lenet4 = m4lenet5.fit(
    X_ntrain.toarray(),
    y_ntrain,
    epochs=5,
    validation_data=(X_valid.toarray(), y_valid),
)

Epoch 1/5
88/88 [==============================] - 320s 4s/step - loss: 1.7320 - accuracy: 0.3800 - val_loss: 1.5863 - val_accuracy: 0.4251
Epoch 2/5
88/88 [==============================] - 319s 4s/step - loss: 1.2028 - accuracy: 0.6664 - val_loss: 1.0152 - val_accuracy: 0.8745
Epoch 3/5
88/88 [==============================] - 318s 4s/step - loss: 0.5817 - accuracy: 0.8980 - val_loss: 0.5708 - val_accuracy: 0.8773
Epoch 4/5
88/88 [==============================] - 316s 4s/step - loss: 0.2773 - accuracy: 0.9668 - val_loss: 0.3939 - val_accuracy: 0.9158
Epoch 5/5
88/88 [==============================] - 310s 4s/step - loss: 0.1531 - accuracy: 0.9857 - val_loss: 0.3150 - val_accuracy: 0.9258


#Performance of lenet2 on human sequences

# The predicted class is the index of the largest of the 7 output scores.
# toarray() is the explicit dense conversion (same result as the .A shorthand).
y_cnnpred_hum = np.argmax(m2lenet5.predict(X_test.toarray()), axis=1)
# Header fixed: these predictions are for the human test split, not the dog set.
print("Confusion matrix for predictions on human test DNA sequence\n")
print(pd.crosstab(pd.Series(y_test, name='Actual'), pd.Series(y_cnnpred_hum, name='Predicted')))
accuracy, precision, recall, f1 = get_metrics(y_test, y_cnnpred_hum)
print("accuracy = %.3f \nprecision = %.3f \nrecall = %.3f \nf1 = %.3f" % (accuracy, precision, recall, f1))

28/28 [==============================] - 31s 974ms/step
Confusion matrix for predictions on Dog test DNA sequence

Predicted   0    1   2    3    4   5    6
Actual                                   
0          85    3   0    3    4   0    7
1           0  102   0    1    0   0    3
2           0    3  69    0    0   0    6
3           0    0   0  119    0   0    6
4           0    2   0    2  131   0   14
5           0    1   0    0    0  42    8
6           0    0   0    1    0   0  264
accuracy = 0.927 
precision = 0.934 
recall = 0.927 
f1 = 0.927


#Performance of lenet2 on chimp sequences

# The predicted class is the index of the largest of the 7 output scores.
# toarray() is the explicit dense conversion (same result as the .A shorthand).
y_cnnpred_chim = np.argmax(m2lenet5.predict(X_chimp.toarray()), axis=1)
# Header fixed: these predictions are for the chimpanzee set, not the dog set.
print("Confusion matrix for predictions on Chimpanzee test DNA sequence\n")
print(pd.crosstab(pd.Series(y_chim, name='Actual'), pd.Series(y_cnnpred_chim, name='Predicted')))
accuracy, precision, recall, f1 = get_metrics(y_chim, y_cnnpred_chim)
print("accuracy = %.3f \nprecision = %.3f \nrecall = %.3f \nf1 = %.3f" % (accuracy, precision, recall, f1))

53/53 [==============================] - 58s 1s/step
Confusion matrix for predictions on Dog test DNA sequence

Predicted    0    1    2    3    4    5    6
Actual                                      
0          223    1    0    1    1    0    8
1            0  184    0    0    1    0    0
2            0    0  139    0    0    0    5
3            0    0    0  224    2    0    2
4            0    6    0    2  243    0   10
5            0    0    0    1    0  106    2
6            0    0    0    0    0    0  521
accuracy = 0.975 
precision = 0.976 
recall = 0.975 
f1 = 0.975


#Performance of lenet2 on dog sequences

# The predicted class is the index of the largest of the 7 output scores.
dog_class_scores = m2lenet5.predict(X_dog.toarray())
y_cnnpred_dog = np.argmax(dog_class_scores, axis=1)

print("Confusion matrix for predictions on Dog test DNA sequence\n")
actual_labels = pd.Series(y_dog, name='Actual')
predicted_labels = pd.Series(y_cnnpred_dog, name='Predicted')
print(pd.crosstab(actual_labels, predicted_labels))

accuracy, precision, recall, f1 = get_metrics(y_dog, y_cnnpred_dog)
print(
    "accuracy = %.3f \nprecision = %.3f \nrecall = %.3f \nf1 = %.3f"
    % (accuracy, precision, recall, f1)
)

26/26 [==============================] - 22s 833ms/step
Confusion matrix for predictions on Dog test DNA sequence

Predicted    0   1   2   3    4   5    6
Actual                                  
0          117   3   0   1    0   0   10
1            0  71   0   0    0   0    4
2            0   3  54   0    1   0    6
3            0   3   0  87    1   0    4
4            0   2   0   5  106   0   22
5            4   4   0   0    1  47    4
6            0   1   0   0    0   0  259
accuracy = 0.904 
precision = 0.915 
recall = 0.904 
f1 = 0.903


#Recurrent Neural Networks

# SimpleRNN model: four Conv1D/MaxPool1D stages shorten the sequence before two
# stacked SimpleRNN layers; their per-step outputs are flattened and classified.
rnmodel1 = keras.models.Sequential([
    keras.layers.Conv1D(8, 3, activation='relu', input_shape=(X_ntrain.shape[1],1) ),
    keras.layers.MaxPool1D(strides=3),
    keras.layers.Conv1D(8, 3, activation='relu' ),
    keras.layers.MaxPool1D(strides=3),
    keras.layers.Conv1D(8, 3, activation='relu' ),
    keras.layers.MaxPool1D(strides=3),
    keras.layers.Conv1D(8, 3, activation='relu' ),
    keras.layers.MaxPool1D(strides=3),
    keras.layers.SimpleRNN(10, return_sequences=True),
    keras.layers.SimpleRNN(10, return_sequences=True),
    keras.layers.Flatten(),
    # Softmax output: "sparse_categorical_crossentropy" expects class
    # probabilities by default (from_logits=False). Without it the layer emits
    # raw scores and training stalls. Same output layer as the CNN models.
    keras.layers.Dense(7, activation='softmax')
])
rnmodel1.compile(loss="sparse_categorical_crossentropy",
              optimizer="sgd",
              metrics=['accuracy'])

# LSTM model: same convolutional front end as the SimpleRNN model, with two
# stacked LSTM layers as the recurrent part.
rnmodel2 = keras.models.Sequential([
    keras.layers.Conv1D(8, 3, activation='relu', input_shape=(X_ntrain.shape[1],1) ),
    keras.layers.MaxPool1D(strides=3),
    keras.layers.Conv1D(8, 3, activation='relu' ),
    keras.layers.MaxPool1D(strides=3),
    keras.layers.Conv1D(8, 3, activation='relu' ),
    keras.layers.MaxPool1D(strides=3),
    keras.layers.Conv1D(8, 3, activation='relu' ),
    keras.layers.MaxPool1D(strides=3),
    keras.layers.LSTM(10, return_sequences=True),
    keras.layers.LSTM(10, return_sequences=True),
    keras.layers.Flatten(),
    # Softmax output: "sparse_categorical_crossentropy" expects class
    # probabilities by default (from_logits=False). Without it the layer emits
    # raw scores and training stalls. Same output layer as the CNN models.
    keras.layers.Dense(7, activation='softmax')
])
rnmodel2.compile(loss="sparse_categorical_crossentropy",
              optimizer="sgd",
              metrics=['accuracy'])

# GRU model: same convolutional front end as the SimpleRNN model, with two
# stacked 10-unit GRU layers as the recurrent part.
rnmodel3 = keras.models.Sequential([
    keras.layers.Conv1D(8, 3, activation='relu', input_shape=(X_ntrain.shape[1],1) ),
    keras.layers.MaxPool1D(strides=3),
    keras.layers.Conv1D(8, 3, activation='relu' ),
    keras.layers.MaxPool1D(strides=3),
    keras.layers.Conv1D(8, 3, activation='relu' ),
    keras.layers.MaxPool1D(strides=3),
    keras.layers.Conv1D(8, 3, activation='relu' ),
    keras.layers.MaxPool1D(strides=3),
    keras.layers.GRU(10, return_sequences=True),
    keras.layers.GRU(10, return_sequences=True),
    keras.layers.Flatten(),
    # Softmax output: "sparse_categorical_crossentropy" expects class
    # probabilities by default (from_logits=False). Without it the layer emits
    # raw scores and training stalls. Same output layer as the CNN models.
    keras.layers.Dense(7, activation='softmax')
])
rnmodel3.compile(loss="sparse_categorical_crossentropy",
              optimizer="sgd",
              metrics=['accuracy'])

# GRU model with 2 units per recurrent layer instead of 10.
rnmodel4 = keras.models.Sequential([
    keras.layers.Conv1D(8, 3, activation='relu', input_shape=(X_ntrain.shape[1],1) ),
    keras.layers.MaxPool1D(strides=3),
    keras.layers.Conv1D(8, 3, activation='relu' ),
    keras.layers.MaxPool1D(strides=3),
    keras.layers.Conv1D(8, 3, activation='relu' ),
    keras.layers.MaxPool1D(strides=3),
    keras.layers.Conv1D(8, 3, activation='relu' ),
    keras.layers.MaxPool1D(strides=3),
    keras.layers.GRU(2, return_sequences=True),
    keras.layers.GRU(2, return_sequences=True),
    keras.layers.Flatten(),
    # Softmax output: "sparse_categorical_crossentropy" expects class
    # probabilities by default (from_logits=False). Without it the layer emits
    # raw scores and training stalls. Same output layer as the CNN models.
    keras.layers.Dense(7, activation='softmax')
])
rnmodel4.compile(loss="sparse_categorical_crossentropy",
              optimizer="sgd",
              metrics=['accuracy'])

# GRU model with one fewer Conv1D/MaxPool1D stage in front of the recurrent
# layers, so the GRUs see a longer sequence and the model has more parameters.
rnmodel5 = keras.models.Sequential([
    keras.layers.Conv1D(8, 3, activation='relu', input_shape=(X_ntrain.shape[1],1) ),
    keras.layers.MaxPool1D(strides=3),
    keras.layers.Conv1D(8, 3, activation='relu' ),
    keras.layers.MaxPool1D(strides=3),
    keras.layers.Conv1D(8, 3, activation='relu' ),
    keras.layers.MaxPool1D(strides=3),
    keras.layers.GRU(10, return_sequences=True),
    keras.layers.GRU(10, return_sequences=True),
    keras.layers.Flatten(),
    # Softmax output: "sparse_categorical_crossentropy" expects class
    # probabilities by default (from_logits=False). Without it the layer emits
    # raw scores and training stalls. Same output layer as the CNN models.
    keras.layers.Dense(7, activation='softmax')
])
rnmodel5.compile(loss="sparse_categorical_crossentropy",
              optimizer="sgd",
              metrics=['accuracy'])


# Print the layer-by-layer summary (output shapes and parameter counts) of the SimpleRNN model.
rnmodel1.summary()

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
=================================================================
 conv1d_8 (Conv1D)           (None, 602853, 8)         32        
                                                                 
 max_pooling1d_8 (MaxPooling  (None, 200951, 8)        0         
 1D)                                                             
                                                                 
 conv1d_9 (Conv1D)           (None, 200949, 8)         200       
                                                                 
 max_pooling1d_9 (MaxPooling  (None, 66983, 8)         0         
 1D)                                                             
                                                                 
 conv1d_10 (Conv1D)          (None, 66981, 8)          200       
                                                                 
 max_pooling1d_10 (MaxPoolin  (None, 22327, 8)         0         
 g1D)                                                            
                                                                 
 conv1d_11 (Conv1D)          (None, 22325, 8)          200       
                                                                 
 max_pooling1d_11 (MaxPoolin  (None, 7442, 8)          0         
 g1D)                                                            
                                                                 
 simple_rnn (SimpleRNN)      (None, 7442, 10)          190       
                                                                 
 simple_rnn_1 (SimpleRNN)    (None, 7442, 10)          210       
                                                                 
 flatten_4 (Flatten)         (None, 74420)             0         
                                                                 
 dense_8 (Dense)             (None, 7)                 520947    
                                                                 
=================================================================
Total params: 521,979
Trainable params: 521,979
Non-trainable params: 0
_________________________________________________________________


# Train the SimpleRNN model for 2 epochs; the sparse feature matrices are
# converted to dense arrays before being handed to fit().
rn1 = rnmodel1.fit(
    X_ntrain.toarray(),
    y_ntrain,
    epochs=2,
    validation_data=(X_valid.toarray(), y_valid),
)

Epoch 1/2
88/88 [==============================] - 776s 9s/step - loss: 10.8259 - accuracy: 0.1577 - val_loss: 10.9396 - val_accuracy: 0.1683
Epoch 2/2
88/88 [==============================] - 778s 9s/step - loss: 10.8511 - accuracy: 0.1584 - val_loss: 10.9396 - val_accuracy: 0.1683


# Train the LSTM model for 2 epochs; the sparse feature matrices are converted
# to dense arrays before being handed to fit().
rn2 = rnmodel2.fit(
    X_ntrain.toarray(),
    y_ntrain,
    epochs=2,
    validation_data=(X_valid.toarray(), y_valid),
)

Epoch 1/2
88/88 [==============================] - 866s 10s/step - loss: 5.4435 - accuracy: 0.1195 - val_loss: 5.5940 - val_accuracy: 0.1327
Epoch 2/2
88/88 [==============================] - 880s 10s/step - loss: 5.4460 - accuracy: 0.1195 - val_loss: 5.5940 - val_accuracy: 0.1327


# Train the 10-unit GRU model for 2 epochs; the sparse feature matrices are
# converted to dense arrays before being handed to fit().
rn3 = rnmodel3.fit(
    X_ntrain.toarray(),
    y_ntrain,
    epochs=2,
    validation_data=(X_valid.toarray(), y_valid),
)

Epoch 1/2
88/88 [==============================] - 910s 10s/step - loss: 9.8086 - accuracy: 0.3068 - val_loss: 10.0513 - val_accuracy: 0.2981
Epoch 2/2
88/88 [==============================] - 908s 10s/step - loss: 9.8361 - accuracy: 0.3100 - val_loss: 10.0513 - val_accuracy: 0.2981


# Train the 2-unit GRU model for 2 epochs; the sparse feature matrices are
# converted to dense arrays before being handed to fit().
rn4 = rnmodel4.fit(
    X_ntrain.toarray(),
    y_ntrain,
    epochs=2,
    validation_data=(X_valid.toarray(), y_valid),
)

Epoch 1/2
88/88 [==============================] - 826s 9s/step - loss: 14.6873 - accuracy: 0.0817 - val_loss: 14.8075 - val_accuracy: 0.0813
Epoch 2/2
88/88 [==============================] - 804s 9s/step - loss: 14.8875 - accuracy: 0.0763 - val_loss: 14.8075 - val_accuracy: 0.0813


# Train the GRU model with the shorter convolutional front end for 2 epochs;
# the sparse feature matrices are converted to dense arrays before fit().
rn5 = rnmodel5.fit(
    X_ntrain.toarray(),
    y_ntrain,
    epochs=2,
    validation_data=(X_valid.toarray(), y_valid),
)

Epoch 1/2
88/88 [==============================] - 4766s 54s/step - loss: 6.1604 - accuracy: 0.3086 - val_loss: 6.2770 - val_accuracy: 0.2981
Epoch 2/2
88/88 [==============================] - 4803s 55s/step - loss: 6.1407 - accuracy: 0.3100 - val_loss: 6.2770 - val_accuracy: 0.2981


#Performance of rn5 on human sequences

# The predicted class is the index of the largest of the 7 output scores.
# toarray() is the explicit dense conversion (same result as the .A shorthand).
y_rnnpred_hum = np.argmax(rnmodel5.predict(X_test.toarray()), axis=1)
# Header fixed: these predictions are for the human test split, not the dog set.
print("Confusion matrix for predictions on human test DNA sequence\n")
print(pd.crosstab(pd.Series(y_test, name='Actual'), pd.Series(y_rnnpred_hum, name='Predicted')))
accuracy, precision, recall, f1 = get_metrics(y_test, y_rnnpred_hum)
print("accuracy = %.3f \nprecision = %.3f \nrecall = %.3f \nf1 = %.3f" % (accuracy, precision, recall, f1))

28/28 [==============================] - 63s 2s/step
Confusion matrix for predictions on Dog test DNA sequence

Predicted    6
Actual        
0          102
1          106
2           78
3          125
4          149
5           51
6          265
accuracy = 0.303 
precision = 0.092 
recall = 0.303 
f1 = 0.141

/usr/local/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1318: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))


#Performance of rn5 on chimp sequences

# The predicted class is the index of the largest of the 7 output scores.
# toarray() is the explicit dense conversion (same result as the .A shorthand).
y_rnnpred_chim = np.argmax(rnmodel5.predict(X_chimp.toarray()), axis=1)
# Header fixed: these predictions are for the chimpanzee set, not the dog set.
print("Confusion matrix for predictions on Chimpanzee test DNA sequence\n")
print(pd.crosstab(pd.Series(y_chim, name='Actual'), pd.Series(y_rnnpred_chim, name='Predicted')))
accuracy, precision, recall, f1 = get_metrics(y_chim, y_rnnpred_chim)
print("accuracy = %.3f \nprecision = %.3f \nrecall = %.3f \nf1 = %.3f" % (accuracy, precision, recall, f1))

53/53 [==============================] - 121s 2s/step
Confusion matrix for predictions on Dog test DNA sequence

Predicted    6
Actual        
0          234
1          185
2          144
3          228
4          261
5          109
6          521
accuracy = 0.310 
precision = 0.096 
recall = 0.310 
f1 = 0.147

/usr/local/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1318: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))


#Performance of rn5 on dog sequences

# The predicted class is the index of the largest of the 7 output scores.
dog_class_scores = rnmodel5.predict(X_dog.toarray())
y_rnnpred_dog = np.argmax(dog_class_scores, axis=1)

print("Confusion matrix for predictions on Dog test DNA sequence\n")
actual_labels = pd.Series(y_dog, name='Actual')
predicted_labels = pd.Series(y_rnnpred_dog, name='Predicted')
print(pd.crosstab(actual_labels, predicted_labels))

accuracy, precision, recall, f1 = get_metrics(y_dog, y_rnnpred_dog)
print(
    "accuracy = %.3f \nprecision = %.3f \nrecall = %.3f \nf1 = %.3f"
    % (accuracy, precision, recall, f1)
)

26/26 [==============================] - 59s 2s/step
Confusion matrix for predictions on Dog test DNA sequence

Predicted    6
Actual        
0          131
1           75
2           64
3           95
4          135
5           60
6          260
accuracy = 0.317 
precision = 0.101 
recall = 0.317 
f1 = 0.153

/usr/local/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1318: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))

MSDS Assignment 6 Practical Machine Learning

Data Analysis in Python for DNA sequencing gene family data

Naive Bayes Model

Random Forest Model

1-D Convolutional Neural Networks

Recurrent Neural Networks