Track Optuna Hyperparameter Tuning with MLflow¶
Author: Marcel Baltruschat (@GitHub)
Date: 18.03.2022
License: MIT
Installation with Conda¶
conda create -n opt_ml -c conda-forge python=3.9 jupyterlab rdkit scikit-learn optuna mlflow ipywidgets multiprocess pytorch
Remark 1: ipywidgets is not used directly, but some imports trigger a warning if it is not installed.
Remark 2: multiprocess is a fork of Python's multiprocessing module that supports interactive usage of Pool on macOS.
Remark 3: On Windows, Pool-based multiprocessing does not appear to work in Jupyter notebooks, so it is disabled there.
Imports and Settings¶
import platform

OS = platform.system()
if OS == 'Linux':
    from multiprocessing import Pool
elif OS == 'Darwin':
    from multiprocess import Pool
import sys
import warnings
from subprocess import Popen
import mlflow
import numpy as np
import optuna
import pandas as pd
import rdkit
import torch
from rdkit.Chem import AllChem as Chem, Descriptors, Crippen
from rdkit.DataStructs import ConvertToNumpyArray
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import cohen_kappa_score, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from torch import nn, optim
from torch.utils.data import DataLoader, Dataset
optuna.logging.set_verbosity(optuna.logging.WARNING)
random_seed = 42
num_cores = 12
Used Versions¶
print(f'Python: {sys.version.split("|")[0]}\nMLflow: {mlflow.__version__}\nOptuna: {optuna.__version__}\nRDKit: {rdkit.__version__}\nPyTorch: {torch.__version__}')
# Adjust host and port as necessary
mlflow_proc = Popen(['mlflow', 'ui']) # '-h', '0.0.0.0', '-p', '8891'
You can already visit http://localhost:5000 (MLflow's default host and port).
Loading Example Dataset¶
The original dataset was published by Ogura et al. (2019) [1].
The only changes made were converting the file from XLSX to CSV and filtering out all molecules with invalid valences.
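For reference, such a valence filter can be reproduced with RDKit: sanitization fails on invalid valences, so MolFromSmiles returns None for the affected SMILES. A minimal sketch of this idea (the exact preprocessing of the published dataset may have differed):

# Hypothetical reproduction of the valence filter:
# RDKit's sanitization rejects invalid valences, so MolFromSmiles returns None
def has_valid_valences(smi):
    return Chem.MolFromSmiles(smi) is not None

# e.g. df = df[df.smi.map(has_valid_valences)]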
df = pd.read_csv('datasets/example_dataset_hERG.csv', names=['smi', 'act'], header=0)
print(len(df))
df.head(2)
df.act.value_counts()
=> Very imbalanced dataset
Perform Undersampling Based on MolWt and LogP¶
def data_from_smi(smi):
    mol = Chem.MolFromSmiles(smi)
    # Combined score of scaled molecular weight and LogP, used for sorting below
    mw_logp = Descriptors.MolWt(mol) / 100 * Crippen.MolLogP(mol)
    return mol, mw_logp

if OS == 'Windows':
    # Pool multiprocessing is disabled on Windows (see Remark 3)
    res = np.array(list(map(data_from_smi, df.smi)))
else:
    with Pool(num_cores) as p:
        res = np.array(p.map(data_from_smi, df.smi))
df['ROMol'], df['mw_logp'] = res[:, 0], res[:, 1]
df.sort_values('mw_logp', inplace=True)
act0 = df.query('act == 0').reset_index(drop=True)
# Pick evenly spaced inactive molecules along the sorted mw_logp axis,
# as many as there are actives, so that both classes end up the same size
ix = np.linspace(0, len(act0) - 1, num=len(df) - len(act0), dtype=np.int32)
df = pd.concat([df.query('act == 1'), act0.loc[ix]])
df.act.value_counts()
Calculating Morgan Fingerprints (FCFP6)¶
def mol_to_FCFP6(mol):
    fp = Chem.GetMorganFingerprintAsBitVect(mol, radius=3, nBits=2048, useFeatures=True)
    ar = np.empty(2048, dtype=np.uint8)
    ConvertToNumpyArray(fp, ar)
    return ar
x_data = np.array(list(map(mol_to_FCFP6, df.ROMol)))
x_data.shape
Split Into Training and Test Datasets¶
y_data = np.array(df.act)
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data,
                                                    test_size=0.1,
                                                    stratify=y_data,
                                                    random_state=random_seed,
                                                    shuffle=True)
Optimize Models with Optuna and Track Results with MLflow¶
# Activate autologging for Scikit-learn
mlflow.sklearn.autolog()
Scikit-Learn Random Forest (with auto logging)¶
If you want to avoid evaluating duplicated parameter sets, uncomment the commented code lines.
# seen_param = []

def rf_obj(trial):
    param = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 1000, step=50),
        'max_depth': trial.suggest_int('max_depth', 1, 20),
        'criterion': trial.suggest_categorical('criterion', ['gini', 'entropy']),
        'max_features': trial.suggest_categorical('max_features', ['auto', 'sqrt', 'log2', None]),
        'random_state': random_seed,
        'n_jobs': num_cores,
    }
    # if param in seen_param:
    #     raise optuna.exceptions.TrialPruned()
    # else:
    #     seen_param.append(param)
    metrics = {}
    model = RandomForestClassifier(**param)
    with mlflow.start_run():
        model.fit(x_train, y_train)
        metrics['test_kappa'] = cohen_kappa_score(y_test, model.predict(x_test))
        metrics['training_kappa'] = cohen_kappa_score(y_train, model.predict(x_train))
        mlflow.log_metrics(metrics)
    return metrics['test_kappa']
# Create a new MLflow experiment and set it as active
mlflow.set_experiment('hERG Random Forest')

# Create a new Optuna study for maximizing an outcome
study = optuna.create_study(direction='maximize')

# MLflow currently uses scikit-learn functions for metric calculation that were deprecated with version 1.0
with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    # Optimize the objective function
    study.optimize(rf_obj, n_trials=100)
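Once the optimization has finished, the best trial can be inspected directly on the study object via Optuna's standard attributes:

# Best result found across all trials
print(f'Best test kappa: {study.best_value:.3f}')
print(f'Best parameters: {study.best_params}')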
Scikit-Learn MLP (with auto logging)¶
# seen_param = []

def nn_obj(trial):
    n_hidden_layers = trial.suggest_int('n_hidden_layers', 0, 4)
    n_neurons = trial.suggest_int('n_neurons', 16, 128) if n_hidden_layers > 0 else 0
    param = {
        'hidden_layer_sizes': [n_neurons] * n_hidden_layers,
        'activation': trial.suggest_categorical('activation', ['identity', 'logistic', 'tanh', 'relu']),
        'solver': trial.suggest_categorical('solver', ['lbfgs', 'sgd', 'adam']),
        'alpha': trial.suggest_float('alpha', 0.00001, 0.1, log=True),
        'learning_rate_init': trial.suggest_float('learning_rate', 0.00001, 0.1, log=True),
        'max_iter': trial.suggest_int('epochs', 20, 300),
        'random_state': random_seed,
    }
    # if param in seen_param:
    #     raise optuna.exceptions.TrialPruned()
    # else:
    #     seen_param.append(param)
    metrics = {}
    model = MLPClassifier(**param)
    with mlflow.start_run():
        mlflow.log_params(dict(n_hidden_layers=n_hidden_layers, n_neurons=n_neurons))
        model.fit(x_train, y_train)
        metrics['test_kappa'] = cohen_kappa_score(y_test, model.predict(x_test))
        metrics['training_kappa'] = cohen_kappa_score(y_train, model.predict(x_train))
        mlflow.log_metrics(metrics)
    return metrics['test_kappa']
# Create a new MLflow experiment and set it as active
mlflow.set_experiment('hERG MLP')

# Create a new Optuna study for maximizing an outcome
study = optuna.create_study(direction='maximize')

# MLflow currently uses scikit-learn functions for metric calculation that were deprecated with version 1.0
with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    # Optimize the objective function
    study.optimize(nn_obj, n_trials=100)
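Optuna also ships plotting helpers for studies; for example, the optimization history can be visualized as follows (a small sketch; optuna.visualization requires plotly to be installed):

# Interactive plot of the objective value over all trials
fig = optuna.visualization.plot_optimization_history(study)
fig.show()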
PyTorch MLP (manual logging)¶
class hERGDataset(Dataset):
    def __init__(self, x_data, y_data):
        self.x_data = x_data.astype(np.float32)
        self.y_data = y_data.astype(np.float32).reshape(-1, 1)

    def __len__(self):
        return len(self.y_data)

    def __getitem__(self, idx):
        return self.x_data[idx], self.y_data[idx]
train_ds = hERGDataset(x_train, y_train)
test_ds = hERGDataset(x_test, y_test)
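An optional quick sanity check confirms that the wrapper returns the expected shapes:

# Each item should be a (2048,) fingerprint and a (1,) label
print(len(train_ds), len(test_ds))
x0, y0 = train_ds[0]
print(x0.shape, y0.shape)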
# seen_param = []

def pt_nn_obj(trial):
    param = dict(
        n_hidden_layers=trial.suggest_int('n_hidden_layers', 0, 3),
        n_neurons=trial.suggest_int('n_neurons', 16, 128),
        act=trial.suggest_categorical('activation', ['Sigmoid', 'Tanh', 'ReLU']),
        lr=trial.suggest_float('learning_rate', 0.00001, 0.1, log=True),
        epochs=trial.suggest_int('epochs', 20, 300),
        batch_size=trial.suggest_int('batch_size', 1, 128),
        random_seed=random_seed,
        optimizer='Adam',
        criterion='binary_crossentropy',
    )
    if param['n_hidden_layers'] == 0:
        param['n_neurons'] = 0
    # if param in seen_param:
    #     raise optuna.exceptions.TrialPruned()
    # else:
    #     seen_param.append(param)
    torch.manual_seed(param['random_seed'])
    np.random.seed(param['random_seed'])
    act_func = getattr(nn, param['act'])  # look up the activation class by name
    if param['n_hidden_layers'] == 0:
        # Without hidden layers the model reduces to a direct 2048 -> 1 mapping
        layers = [nn.Linear(2048, 1)]
    else:
        layers = [nn.Linear(2048, param['n_neurons']), act_func()]  # input layer
        for i in range(param['n_hidden_layers']):
            layers.append(nn.Linear(param['n_neurons'], param['n_neurons']))
            layers.append(act_func())
        layers.append(nn.Linear(param['n_neurons'], 1))  # output layer
    layers.append(nn.Sigmoid())
    model = nn.Sequential(*layers)
    opt = optim.Adam(model.parameters(), lr=param['lr'])
    crit = nn.BCELoss()
    train_loader = DataLoader(train_ds, batch_size=param['batch_size'], shuffle=True)
    test_loader = DataLoader(test_ds, batch_size=param['batch_size'], shuffle=False)
    with mlflow.start_run():
        mlflow.log_params(param)
        for i in range(param['epochs']):
            model.train()
            train_loss = 0
            train_kappa = 0
            for data, labels in train_loader:
                opt.zero_grad()
                out = model(data)
                loss = crit(out, labels)
                loss.backward()
                opt.step()
                train_loss += loss.item()
                train_kappa += cohen_kappa_score(labels.data.numpy(), out[:, -1].detach().numpy().round())
            model.eval()
            metrics = dict(train_loss=train_loss / len(train_loader), train_kappa=train_kappa / len(train_loader))
            with torch.no_grad():
                test_loss = 0
                test_kappa = 0
                for data, labels in test_loader:
                    out = model(data)
                    test_loss += crit(out, labels).item()
                    test_kappa += cohen_kappa_score(labels.data.numpy(), out[:, -1].numpy().round())
            metrics['test_loss'] = test_loss / len(test_loader)
            metrics['test_kappa'] = test_kappa / len(test_loader)
            mlflow.log_metrics(metrics, step=i + 1)
        mlflow.pytorch.log_model(model, 'model')
    return metrics['test_kappa']
# Create a new MLflow experiment and set it as active
mlflow.set_experiment('hERG PyTorch NN')

# Create a new Optuna study for maximizing an outcome
study = optuna.create_study(direction='maximize')

# MLflow currently uses scikit-learn functions for metric calculation that were deprecated with version 1.0
with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    # Optimize the objective function
    study.optimize(pt_nn_obj, n_trials=100)
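Since every trial logs its trained network with mlflow.pytorch.log_model, the best model can later be reloaded from the tracking store. A minimal sketch, assuming the 'hERG PyTorch NN' experiment is still the active one:

# Find the run with the highest test kappa and reload its logged model
runs = mlflow.search_runs(order_by=['metrics.test_kappa DESC'])
best_run_id = runs.loc[0, 'run_id']
best_model = mlflow.pytorch.load_model(f'runs:/{best_run_id}/model')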
While Optuna is "studying", you can already investigate finished results on the MLflow server webpage...¶
All results that are shown on the MLflow page are retrieved from the local folder mlruns in the current directory.
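When you are done exploring, the MLflow UI server that was started at the beginning can be stopped via its Popen handle:

# Shut down the background MLflow UI process started earlier
mlflow_proc.terminate()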
References¶
[1] Ogura, K., Sato, T., Yuki, H. et al. Support Vector Machine model for hERG inhibitory activities based on the integrated hERG database using descriptor selection by NSGA-II. Sci Rep 9, 12220 (2019). https://doi.org/10.1038/s41598-019-47536-3
Disclaimer¶
The configurations for modeling and hyperparameter optimization might be suboptimal for the specific task and dataset. Since the key point of this notebook is to show the usage of the MLflow tracking feature together with Optuna, optimizing these configurations was neglected.