Decision Tree Classifier
In [1]:
import pickle as pkl

import graphviz
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
# a plain `import sklearn` does not pull in submodules, so the ones used below are imported explicitly
import sklearn
import sklearn.metrics
import sklearn.model_selection
import sklearn.tree
from rdkit import Chem
from rdkit.Chem import AllChem, Descriptors, Draw, PandasTools, rdMolDescriptors
from rdkit.ML.Descriptors.MoleculeDescriptors import MolecularDescriptorCalculator
from sklearn.datasets import make_blobs, make_classification
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
In [2]:
# load the pickled ChEMBL DataFrame prepared earlier
Chembl_df = pkl.load(open('datasets/Chembl_df.p', 'rb'))
In [3]:
Chembl_df
Out[3]: (DataFrame preview not shown; the columns used below are Smiles, Classes and Morgan3)
In [4]:
smi = Chembl_df['Smiles']
In [5]:
Mol = []
for x in smi:
    mol = Chem.MolFromSmiles(x)  # returns None if a SMILES cannot be parsed
    Mol.append(mol)
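Chem.MolFromSmiles returns None for SMILES it cannot parse, so a quick sanity check (not part of the original notebook) before fingerprinting can save confusing errors later:

# count entries that failed to parse; a nonzero count would desync Mol from y
n_failed = sum(mol is None for mol in Mol)
print(f'{n_failed} of {len(Mol)} SMILES failed to parse')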
In [6]:
y = list(Chembl_df['Classes'])  # class labels (prediction targets)
In [7]:
fmorgan3 = list(Chembl_df['Morgan3'])  # precomputed Morgan fingerprints (radius 3)
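The Morgan3 column holds precomputed fingerprints, so they are only loaded here. For reference, a minimal sketch of how such a column could be generated, assuming the same settings the DecisionTree function below uses for its bit-info pass (radius 3, 4096 bits, feature-based invariants):

# hypothetical regeneration of the Morgan3 column from the mol list
fmorgan3 = [list(rdMolDescriptors.GetMorganFingerprintAsBitVect(
    mol, radius=3, nBits=4096, useFeatures=True)) for mol in Mol]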
Creating the Decision Tree, Overtraining Plot & Feature Importances
In [8]:
# create decision trees of increasing depth, score them with Cohen's kappa,
# plot training vs. validation performance, and draw the most important Morgan bits
# x = Morgan fingerprints, y = classes, r = radius of the Morgan fingerprints, mols = list of mols
def DecisionTree(x, y, r, mols):
    x_train, x_test, y_train, y_test = sklearn.model_selection.train_test_split(
        x, y, test_size=0.20, train_size=0.80, random_state=42, stratify=y)
    coh_kappa_test = []
    coh_kappa_train = []
    depths = list(range(1, 100, 5))
    for ix in depths:
        clf = sklearn.tree.DecisionTreeClassifier(max_depth=ix, random_state=42)
        clf.fit(x_train, y_train)
        y_test_pred = clf.predict(x_test, check_input=True)
        y_train_pred = clf.predict(x_train, check_input=True)
        kappa_test = sklearn.metrics.cohen_kappa_score(y_test, y_test_pred)
        kappa_train = sklearn.metrics.cohen_kappa_score(y_train, y_train_pred)
        coh_kappa_test.append(kappa_test)
        coh_kappa_train.append(kappa_train)
    # overtraining plot
    fig, ax1 = plt.subplots()
    ax1.plot(depths, coh_kappa_test, label='Validation Data')
    ax1.plot(depths, coh_kappa_train, label='Training Data')
    ax1.set_xlabel('Depth')
    ax1.set_ylabel("Cohen's Kappa")
    ax1.legend()
    ax1.set_title('Overtraining Plot')
    print(f"Max Cohen's Kappa = {np.max(coh_kappa_test)}")
    i_maximum = np.argmax(coh_kappa_test)
    best_depth = depths[i_maximum]
    print(f'Best Depth = {best_depth}')
    # feature importances of a tree refit at the best depth
    fig, ax2 = plt.subplots()
    ax2.set_title('Feature Importances')
    clf_best_depth = sklearn.tree.DecisionTreeClassifier(max_depth=best_depth, random_state=42)
    clf_best_depth.fit(x_train, y_train)
    importances = clf_best_depth.feature_importances_
    ax2.plot(range(len(importances)), importances)
    # ten most important fingerprint bits, highest first
    vib = importances.argsort()[-10:][::-1]
    print(f'Important Features = {vib}')
    # collect one example molecule per important bit for drawing
    moi = []
    double = []
    for mol in mols:
        bi = {}
        rdMolDescriptors.GetMorganFingerprintAsBitVect(mol, radius=r, bitInfo=bi, nBits=4096, useFeatures=True)
        for b in vib:
            if b in bi.keys() and b not in double:
                double.append(b)
                moi.append((mol, b, bi))
    return Draw.DrawMorganBits(moi[:12], molsPerRow=4, legends=[str(tupl[1]) for tupl in moi[:12]])
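Draw.DrawMorganBits takes the collected (mol, bit, bitInfo) tuples and draws, for each important bit, the atom environment that sets it, which ties the tree's top features back to concrete substructures.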
Visualisation
In [9]:
# split dataset into train and test data
x_train, x_test, y_train, y_test = sklearn.model_selection.train_test_split(
    fmorgan3, y, test_size=0.20, train_size=0.80, random_state=42, stratify=y)
# create a decision tree classifier
clf = sklearn.tree.DecisionTreeClassifier(max_depth=16, random_state=42)
# train the decision tree classifier
clf.fit(x_train, y_train)
# decision tree classifier visualisation
fig, ax = plt.subplots(figsize=(16, 10), dpi=300)
sklearn.tree.plot_tree(clf, label='all', filled=True)
plt.show()
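The graphviz package imported above goes unused in this section; as an alternative to plot_tree, the fitted tree can be exported through Graphviz for a cleaner rendering. A minimal sketch, assuming the Graphviz system binaries are installed:

# sketch: export the fitted tree to DOT and render it as a PNG
dot_data = sklearn.tree.export_graphviz(clf, out_file=None, filled=True, rounded=True)
graphviz.Source(dot_data).render('decision_tree', format='png')  # writes decision_tree.png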
Overtraining Plot & Feature Importances from Dataset
In [10]:
mols2 = DecisionTree(fmorgan3, y, 3, Mol)
plt.show()
mols2
Out[10]: (image grid of the most important Morgan bit environments not shown)