Author: Marcel Baltruschat (@GitHub)
Date: 16.07.2021
Installation with Conda¶
Tested on DeepChem (96 cores, 377 GB RAM, 4x Nvidia GeForce RTX 2080 Ti, Ubuntu 20.04.2 LTS, x86_64)
Some packages might only be available for Linux machines.
conda create -n rapids -c rapidsai -c nvidia -c conda-forge cudf cuml python=3.8 cudatoolkit=11.2 jupyterlab rdkit seaborn scikit-learn
Imports and Settings¶
% matplotlib inline
import cudf
import cuml
import gc
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from multiprocessing import Pool
from rdkit.Chem import AllChem as Chem
from rdkit.DataStructs import ConvertToNumpyArray
from sklearn.cluster import DBSCAN
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import cohen_kappa_score
from sklearn.model_selection import train_test_split
from time import time
# Seed reused by the stochastic steps below (t-SNE, train/test split, random forests)
random_seed = 42
# Apply seaborn's default theme to all subsequent matplotlib figures
sns.set()
Loading Example Dataset¶
The original dataset was published by Ogura et al. (2019) [1].
The only changes made were the conversion from XLSX to CSV and the filtering out of all molecules with invalid valences.
# Read the hERG example dataset and report its number of rows
dataset_path = 'datasets/example_dataset_hERG.csv'
df = pd.read_csv(dataset_path)
print(len(df))
# Last expression of the cell: shows a preview of the first rows
df.head()
Calculating Morgan Fingerprints (FCFP6) with Multiprocessing¶
def smi_to_FCFP6(smi):
    """Convert a SMILES string into a 4096-bit FCFP6 fingerprint array.

    The fingerprint is a feature-based Morgan fingerprint of radius 3
    (``useFeatures=True``), folded to 4096 bits.

    Args:
        smi: SMILES string of the molecule.

    Returns:
        One-dimensional ``numpy.ndarray`` of dtype ``int32`` and length 4096
        holding the fingerprint bits.

    Raises:
        ValueError: If RDKit cannot parse ``smi`` into a molecule.
    """
    n_bits = 4096
    mol = Chem.MolFromSmiles(smi)
    if mol is None:
        # MolFromSmiles reports a parse failure by returning None; fail here with
        # the offending input instead of a cryptic argument error further down.
        raise ValueError(f'Could not parse SMILES: {smi!r}')
    fp = Chem.GetMorganFingerprintAsBitVect(mol, radius=3, nBits=n_bits, useFeatures=True)
    # Target buffer that ConvertToNumpyArray fills in place
    ar = np.empty(n_bits, dtype=np.int32)
    ConvertToNumpyArray(fp, ar)
    return ar
# Baseline: fingerprint every molecule serially in the main process
start = time()
df.Canonical_Smiles.apply(smi_to_FCFP6)
print(f'1 core (no pool): {time() - start:.2f}s')

# Repeat with worker pools of increasing size. The timer is started after the
# pool has been created, so pool start-up is not part of the measurement.
for cores in (1, 2, 4, 8, 16, 32, 64):
    with Pool(cores) as pool:
        start = time()
        pool.map(smi_to_FCFP6, df.Canonical_Smiles)
        elapsed = time() - start
        print(f'{cores} core(s): {elapsed:.2f}s')
Rule of thumb: The more complex the individual tasks are, the more cores you can use¶
# Number of worker processes / CPU jobs used for the remaining CPU-parallel steps
num_cores = 32
# Fingerprint all molecules in parallel and stack the per-molecule arrays
# into one matrix (one row per molecule)
with Pool(processes=num_cores) as pool:
    X_data = np.array(pool.map(smi_to_FCFP6, df.Canonical_Smiles))
# Last expression of the cell: shows the matrix dimensions
X_data.shape
Exploring Chemical Space with GPU¶
# Copy the fingerprint matrix into a float32 cuDF DataFrame
cX_data = cudf.DataFrame(X_data, dtype=np.float32)
# Embed the fingerprints with cuML's t-SNE and time the fit
ctsne = cuml.TSNE(
    perplexity=50,
    n_neighbors=150,
    random_state=random_seed,
)
t0 = time()
cpc2 = ctsne.fit_transform(cX_data)
elapsed = time() - t0
print(f'TSNE time: {elapsed:.2f}s')
# Convert the embedding to a regular pandas DataFrame
pc2 = cpc2.to_pandas()
# Per-column sample standard deviation (the value describe() reports as 'std')
pc2_std = pc2.std()
# Flag every row whose absolute coordinate exceeds four standard deviations in
# at least one column; computed as one vectorised mask instead of a Python list
# of per-row tuples.
# NOTE(review): the threshold is applied to the raw coordinates, not to the
# distance from the column mean, i.e. it assumes the embedding is roughly
# centred on zero - confirm this is intended.
outlier = (pc2.abs() > pc2_std * 4).any(axis=1).to_numpy()
outlier_ix = pc2.loc[outlier].index
# Remove the same rows from the pandas and the cuDF copy so both stay aligned
pc2.drop(index=outlier_ix, inplace=True)
cpc2.drop(index=outlier_ix, inplace=True)
print(f'{len(outlier_ix)} outlier(s) dropped')
# Scatter plot of the two embedding columns, drawn without axes
plt.figure(dpi=200)
sns.scatterplot(x=0, y=1, data=pc2, s=1)
plt.axis('off')
plt.tight_layout()
plt.show()
Free some CPU and GPU memory¶
# Close the current figure and drop the fitted t-SNE estimator
plt.close()
del ctsne
# Without this call GPU memory is not freed
# (the trailing semicolon suppresses the cell output)
gc.collect();
# Cluster the 2-D embedding with cuML's DBSCAN and time it
cdbscan = cuml.DBSCAN(eps=0.75)
t0 = time()
ccluster = cdbscan.fit_predict(cpc2)
print(f'DBSCAN (GPU) clustering time: {time() - t0:.2f}s')
# Same clustering with scikit-learn's DBSCAN on the pandas copy
dbscan = DBSCAN(n_jobs=num_cores, eps=0.75)
t0 = time()
cluster = dbscan.fit_predict(pc2)
print(f'DBSCAN (CPU) clustering time: {time() - t0:.2f}s')
# Copy the GPU labels to a NumPy array via pandas: cudf.Series.to_array() is
# deprecated and no longer available in newer cuDF releases.
gpu_labels = ccluster.to_pandas().to_numpy()
# NOTE(review): this compares raw label ids position by position, so it assumes
# both implementations number the clusters identically - confirm; a
# permutation-invariant score would be more robust.
n_different = np.count_nonzero(cluster != gpu_labels)
print(f'{n_different}/{len(cluster)} points have different clusters')
Visualize clusters¶
# Same scatter plot as before, coloured by the CPU DBSCAN labels
# (labels are cast to strings before being passed as hue)
cluster_labels = cluster.astype(str)
plt.figure(dpi=200)
sns.scatterplot(x=0, y=1, data=pc2, s=1, hue=cluster_labels, legend=False)
plt.axis('off')
plt.tight_layout()
plt.show()
Clustering directly on the FCFP6 fingerprints¶
# Cluster the full fingerprint matrix with cuML's DBSCAN (default parameters)
cdbscan2 = cuml.DBSCAN()
t0 = time()
ccluster2 = cdbscan2.fit_predict(cX_data)
print(f'DBSCAN (GPU) clustering time: {time() - t0:.2f}s')
# Same clustering with scikit-learn's DBSCAN (default parameters) on the NumPy matrix
dbscan2 = DBSCAN(n_jobs=num_cores)
t0 = time()
cluster2 = dbscan2.fit_predict(X_data)
print(f'DBSCAN (CPU) clustering time: {time() - t0:.2f}s')
# Copy the GPU labels to a NumPy array via pandas: cudf.Series.to_array() is
# deprecated and no longer available in newer cuDF releases.
gpu_labels2 = ccluster2.to_pandas().to_numpy()
# NOTE(review): raw label ids are compared position by position, which assumes
# both implementations number the clusters identically - confirm.
n_different2 = np.count_nonzero(cluster2 != gpu_labels2)
print(f'{n_different2}/{len(cluster2)} points have different clusters')
Free some CPU and GPU memory¶
# Drop the cuML estimators, their label outputs and the cuDF copies of the data
del cdbscan, cdbscan2
del ccluster, ccluster2
del cX_data, cpc2
# Without this call GPU memory is not freed
# (the trailing semicolon suppresses the cell output)
gc.collect();
Train a Random Forest Model on GPU¶
# Stratified 90/10 split of fingerprints and class labels
labels = df['class']
X_train, X_test, Y_train, Y_test = train_test_split(
    X_data,
    labels,
    test_size=0.1,
    stratify=labels,
    random_state=random_seed,
)
# cuDF copies of the split: float32 features, int32 labels
cX_train = cudf.DataFrame(X_train, dtype=np.float32)
cX_test = cudf.DataFrame(X_test, dtype=np.float32)
cY_train = cudf.Series(Y_train, dtype=np.int32)
cY_test = cudf.Series(Y_test, dtype=np.int32)
# Fit cuML's random forest and time the training only
crf = cuml.ensemble.RandomForestClassifier(
    n_estimators=1000,
    max_depth=64,
    random_state=random_seed,
)
t0 = time()
crf.fit(cX_train, cY_train)
print(f'RandomForest (GPU) training time: {time() - t0:.2f}s')
# Predict on the held-out set and score the pandas copy of the predictions
# against the pandas labels
pred = crf.predict(cX_test).to_pandas()
kappa = cohen_kappa_score(Y_test, pred)
print(f'Kappa: {kappa:.3f}')
Free some CPU and GPU memory¶
# Drop the cuML model and the cuDF copies of the train/test split
del crf
del cX_train, cX_test
del cY_train, cY_test
# Without this call GPU memory is not freed
# (the trailing semicolon suppresses the cell output)
gc.collect();
Train a Random Forest Model on CPU (32 cores)¶
# scikit-learn random forest with the same tree count, depth limit and seed as
# the GPU model, trained with num_cores parallel jobs
rf = RandomForestClassifier(
    n_estimators=1000,
    max_depth=64,
    n_jobs=num_cores,
    random_state=random_seed,
)
t0 = time()
rf.fit(X_train, Y_train)
print(f'RandomForest (CPU) training time: {time() - t0:.2f}s')
# Score the held-out predictions with Cohen's kappa
pred = rf.predict(X_test)
kappa = cohen_kappa_score(Y_test, pred)
print(f'Kappa: {kappa:.3f}')
References¶
[1] Ogura, K., Sato, T., Yuki, H. et al. Support Vector Machine model for hERG inhibitory activities based on the integrated hERG database using descriptor selection by NSGA-II. Sci Rep 9, 12220 (2019). https://doi.org/10.1038/s41598-019-47536-3
Useful Resources¶
- https://docs.python.org/3/library/multiprocessing.html
- http://practicalcheminformatics.blogspot.com/2020/06/wicked-fast-cheminformatics-with-nvidia.html
- https://patwalters.github.io/practicalcheminformatics/jupyter/dask/parallel/2021/03/28/dask-cheminformatics.html
- https://rapids.ai/start.html#get-rapids
- https://docs.rapids.ai/api/cudf/stable/10min.html
- https://docs.rapids.ai/api/cuml/stable/cuml_intro.html
Disclaimer¶
The configurations for modelling and clustering might be suboptimal for the specific task and dataset. Since the key point of this notebook is to show the possible runtime advantage of GPU usage and the comparison of results between CPU and GPU runs, configuration optimisation was neglected.