Member tarasio-mirror left a comment
Thanks for the PR @Priyansi!
I've added some suggestions related to coding style.
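The style the suggestions below follow (one argument per line, a dedented closing parenthesis, trailing commas) is the convention produced by auto-formatters such as black — an assumption here, since the review doesn't name a tool. A minimal sketch of the pattern with a plain stand-in function:

```python
def make_loader(dataset, batch_size, shuffle, num_workers):
    # Stand-in for calls like idist.auto_dataloader in the notebook.
    return (dataset, batch_size, shuffle, num_workers)

# Wrapped call: one argument per line, closing parenthesis on its own line.
loader = make_loader(
    "train_subset",
    batch_size=16,
    shuffle=True,
    num_workers=8,
)
print(loader)  # → ('train_subset', 16, True, 8)
```

The same shape applies to every multi-argument call flagged below.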
Comment on lines +132 to +138
| " trainset = CIFAR10(\n", | ||
| " root=data_dir, train=True, download=True, transform=transform)\n", | ||
| "\n", | ||
| " testset = CIFAR10(\n", | ||
| " root=data_dir, train=False, download=True, transform=transform)\n", | ||
| "\n", | ||
| " return trainset, testset" |
Member
Suggested change
| " trainset = CIFAR10(\n", | |
| " root=data_dir, train=True, download=True, transform=transform)\n", | |
| "\n", | |
| " testset = CIFAR10(\n", | |
| " root=data_dir, train=False, download=True, transform=transform)\n", | |
| "\n", | |
| " return trainset, testset" | |
| " trainset = CIFAR10(\n", | |
| " root=data_dir, train=True, download=True, transform=transform\n", | |
| " )\n", | |
| " testset = CIFAR10(\n", | |
| " root=data_dir, train=False, download=True, transform=transform\n", | |
| " )\n", | |
| " return trainset, testset" |
Comment on lines +150 to +164
| " train_subset, val_subset = random_split(\n", | ||
| " trainset, [test_abs, len(trainset) - test_abs])\n", | ||
| "\n", | ||
| " trainloader = idist.auto_dataloader(\n", | ||
| " train_subset,\n", | ||
| " batch_size=int(config[\"batch_size\"]),\n", | ||
| " shuffle=True,\n", | ||
| " num_workers=8)\n", | ||
| " valloader = idist.auto_dataloader(\n", | ||
| " val_subset,\n", | ||
| " batch_size=int(config[\"batch_size\"]),\n", | ||
| " shuffle=True,\n", | ||
| " num_workers=8)\n", | ||
| " \n", | ||
| " return trainloader, valloader" |
Member
Suggested change
| " train_subset, val_subset = random_split(\n", | |
| " trainset, [test_abs, len(trainset) - test_abs])\n", | |
| "\n", | |
| " trainloader = idist.auto_dataloader(\n", | |
| " train_subset,\n", | |
| " batch_size=int(config[\"batch_size\"]),\n", | |
| " shuffle=True,\n", | |
| " num_workers=8)\n", | |
| " valloader = idist.auto_dataloader(\n", | |
| " val_subset,\n", | |
| " batch_size=int(config[\"batch_size\"]),\n", | |
| " shuffle=True,\n", | |
| " num_workers=8)\n", | |
| " \n", | |
| " return trainloader, valloader" | |
| " train_subset, val_subset = random_split(\n", | |
| " trainset, [test_abs, len(trainset) - test_abs]\n", | |
| " )\n", | |
| " trainloader = idist.auto_dataloader(\n", | |
| " train_subset,\n", | |
| " batch_size=int(config[\"batch_size\"]),\n", | |
| " shuffle=True,\n", | |
| " num_workers=8\n", | |
| " )\n", | |
| " valloader = idist.auto_dataloader(\n", | |
| " val_subset,\n", | |
| " batch_size=int(config[\"batch_size\"]),\n", | |
| " shuffle=True,\n", | |
| " num_workers=8\n", | |
| " )\n", | |
| " return trainloader, valloader" |
Comment on lines +217 to +231
| "def initialize(config, checkpoint_dir):\n", | ||
| " model = idist.auto_model(Net(config[\"l1\"], config[\"l2\"]))\n", | ||
| "\n", | ||
| " device = idist.device()\n", | ||
| "\n", | ||
| " criterion = nn.CrossEntropyLoss()\n", | ||
| " optimizer = idist.auto_optim(optim.SGD(model.parameters(), lr=config[\"lr\"], momentum=0.9))\n", | ||
| "\n", | ||
| " if checkpoint_dir:\n", | ||
| " model_state, optimizer_state = torch.load(\n", | ||
| " os.path.join(checkpoint_dir, \"checkpoint\"))\n", | ||
| " model.load_state_dict(model_state)\n", | ||
| " optimizer.load_state_dict(optimizer_state)\n", | ||
| " \n", | ||
| " return model, device, criterion, optimizer" |
Member
Suggested change
| "def initialize(config, checkpoint_dir):\n", | |
| " model = idist.auto_model(Net(config[\"l1\"], config[\"l2\"]))\n", | |
| "\n", | |
| " device = idist.device()\n", | |
| "\n", | |
| " criterion = nn.CrossEntropyLoss()\n", | |
| " optimizer = idist.auto_optim(optim.SGD(model.parameters(), lr=config[\"lr\"], momentum=0.9))\n", | |
| "\n", | |
| " if checkpoint_dir:\n", | |
| " model_state, optimizer_state = torch.load(\n", | |
| " os.path.join(checkpoint_dir, \"checkpoint\"))\n", | |
| " model.load_state_dict(model_state)\n", | |
| " optimizer.load_state_dict(optimizer_state)\n", | |
| " \n", | |
| " return model, device, criterion, optimizer" | |
| "def initialize(config, checkpoint_dir):\n", | |
| " model = idist.auto_model(Net(config[\"l1\"], config[\"l2\"]))\n", | |
| "\n", | |
| " device = idist.device()\n", | |
| "\n", | |
| " criterion = nn.CrossEntropyLoss()\n", | |
| " optimizer = idist.auto_optim(\n", | |
| " optim.SGD(model.parameters(), lr=config[\"lr\"], momentum=0.9)\n", | |
| " )\n", | |
| "\n", | |
| " if checkpoint_dir:\n", | |
| " model_state, optimizer_state = torch.load(\n", | |
| " os.path.join(checkpoint_dir, \"checkpoint\")\n", | |
| " )\n", | |
| " model.load_state_dict(model_state)\n", | |
| " optimizer.load_state_dict(optimizer_state)\n", | |
| "\n", | |
| " return model, device, criterion, optimizer" |
Comment on lines +265 to +293
| "def train_cifar(config, data_dir=None, checkpoint_dir=None):\n", | ||
| " trainloader, valloader = get_train_val_loaders(config, data_dir)\n", | ||
| " model, device, criterion, optimizer = initialize(config, checkpoint_dir)\n", | ||
| " \n", | ||
| " trainer = create_supervised_trainer(model, optimizer, criterion, device=device, non_blocking=True)\n", | ||
| " \n", | ||
| " avg_output = RunningAverage(output_transform=lambda x: x)\n", | ||
| " avg_output.attach(trainer, 'running_avg_loss')\n", | ||
| " \n", | ||
| " val_evaluator = create_supervised_evaluator(model, metrics={ \"accuracy\": Accuracy(), \"loss\": Loss(criterion)}, device=device, non_blocking=True)\n", | ||
| " \n", | ||
| " @trainer.on(Events.ITERATION_COMPLETED(every=2000))\n", | ||
| " def log_training_loss(engine):\n", | ||
| " print(f\"Epoch[{engine.state.epoch}], Iter[{engine.state.iteration}] Loss: {engine.state.output:.2f} Running Avg Loss: {engine.state.metrics['running_avg_loss']:.2f}\")\n", | ||
| "\n", | ||
| "\n", | ||
| " @trainer.on(Events.EPOCH_COMPLETED)\n", | ||
| " def log_validation_results(trainer):\n", | ||
| " val_evaluator.run(valloader)\n", | ||
| " metrics = val_evaluator.state.metrics\n", | ||
| " print(f\"Validation Results - Epoch[{trainer.state.epoch}] Avg accuracy: {metrics['accuracy']:.2f} Avg loss: {metrics['loss']:.2f}\")\n", | ||
| "\n", | ||
| " with tune.checkpoint_dir(trainer.state.epoch) as checkpoint_dir:\n", | ||
| " path = os.path.join(checkpoint_dir, \"checkpoint\")\n", | ||
| " torch.save((model.state_dict(), optimizer.state_dict()), path)\n", | ||
| " \n", | ||
| " tune.report(loss=metrics['loss'], accuracy=metrics['accuracy']) \n", | ||
| "\n", | ||
| " trainer.run(trainloader, max_epochs=10) " |
Member
Suggested change
| "def train_cifar(config, data_dir=None, checkpoint_dir=None):\n", | |
| " trainloader, valloader = get_train_val_loaders(config, data_dir)\n", | |
| " model, device, criterion, optimizer = initialize(config, checkpoint_dir)\n", | |
| " \n", | |
| " trainer = create_supervised_trainer(model, optimizer, criterion, device=device, non_blocking=True)\n", | |
| " \n", | |
| " avg_output = RunningAverage(output_transform=lambda x: x)\n", | |
| " avg_output.attach(trainer, 'running_avg_loss')\n", | |
| " \n", | |
| " val_evaluator = create_supervised_evaluator(model, metrics={ \"accuracy\": Accuracy(), \"loss\": Loss(criterion)}, device=device, non_blocking=True)\n", | |
| " \n", | |
| " @trainer.on(Events.ITERATION_COMPLETED(every=2000))\n", | |
| " def log_training_loss(engine):\n", | |
| " print(f\"Epoch[{engine.state.epoch}], Iter[{engine.state.iteration}] Loss: {engine.state.output:.2f} Running Avg Loss: {engine.state.metrics['running_avg_loss']:.2f}\")\n", | |
| "\n", | |
| "\n", | |
| " @trainer.on(Events.EPOCH_COMPLETED)\n", | |
| " def log_validation_results(trainer):\n", | |
| " val_evaluator.run(valloader)\n", | |
| " metrics = val_evaluator.state.metrics\n", | |
| " print(f\"Validation Results - Epoch[{trainer.state.epoch}] Avg accuracy: {metrics['accuracy']:.2f} Avg loss: {metrics['loss']:.2f}\")\n", | |
| "\n", | |
| " with tune.checkpoint_dir(trainer.state.epoch) as checkpoint_dir:\n", | |
| " path = os.path.join(checkpoint_dir, \"checkpoint\")\n", | |
| " torch.save((model.state_dict(), optimizer.state_dict()), path)\n", | |
| " \n", | |
| " tune.report(loss=metrics['loss'], accuracy=metrics['accuracy']) \n", | |
| "\n", | |
| " trainer.run(trainloader, max_epochs=10) " | |
| "def train_cifar(config, data_dir=None, checkpoint_dir=None):\n", | |
| " trainloader, valloader = get_train_val_loaders(config, data_dir)\n", | |
| " model, device, criterion, optimizer = initialize(config, checkpoint_dir)\n", | |
| "\n", | |
| " trainer = create_supervised_trainer(\n", | |
| " model, optimizer, criterion, device=device, non_blocking=True\n", | |
| " )\n", | |
| "\n", | |
| " avg_output = RunningAverage(output_transform=lambda x: x)\n", | |
| " avg_output.attach(trainer, \"running_avg_loss\")\n", | |
| "\n", | |
| " val_evaluator = create_supervised_evaluator(\n", | |
| " model,\n", | |
| " metrics={\"accuracy\": Accuracy(), \"loss\": Loss(criterion)},\n", | |
| " device=device,\n", | |
| " non_blocking=True,\n", | |
| " )\n", | |
| "\n", | |
| " @trainer.on(Events.ITERATION_COMPLETED(every=2000))\n", | |
| " def log_training_loss(engine):\n", | |
| " print(\n", | |
| " f\"Epoch[{engine.state.epoch}], Iter[{engine.state.iteration}] Loss: {engine.state.output:.2f} Running Avg Loss: {engine.state.metrics['running_avg_loss']:.2f}\"\n", | |
| " )\n", | |
| "\n", | |
| " @trainer.on(Events.EPOCH_COMPLETED)\n", | |
| " def log_validation_results(trainer):\n", | |
| " val_evaluator.run(valloader)\n", | |
| " metrics = val_evaluator.state.metrics\n", | |
| " print(\n", | |
| " f\"Validation Results - Epoch[{trainer.state.epoch}] Avg accuracy: {metrics['accuracy']:.2f} Avg loss: {metrics['loss']:.2f}\"\n", | |
| " )\n", | |
| "\n", | |
| " with tune.checkpoint_dir(trainer.state.epoch) as checkpoint_dir:\n", | |
| " path = os.path.join(checkpoint_dir, \"checkpoint\")\n", | |
| " torch.save((model.state_dict(), optimizer.state_dict()), path)\n", | |
| "\n", | |
| " tune.report(loss=metrics[\"loss\"], accuracy=metrics[\"accuracy\"])\n", | |
| "\n", | |
| " trainer.run(trainloader, max_epochs=10)" |
Comment on lines +311 to +327
| "def test_best_model(best_trial, data_dir=None):\n", | ||
| " _, testset = load_data(data_dir)\n", | ||
| " \n", | ||
| " best_trained_model = idist.auto_model(Net(best_trial.config[\"l1\"], best_trial.config[\"l2\"]))\n", | ||
| " device = idist.device()\n", | ||
| "\n", | ||
| " best_checkpoint_dir = best_trial.checkpoint.value\n", | ||
| " model_state, optimizer_state = torch.load(os.path.join(\n", | ||
| " best_checkpoint_dir, \"checkpoint\"))\n", | ||
| " best_trained_model.load_state_dict(model_state)\n", | ||
| "\n", | ||
| " test_evaluator = create_supervised_evaluator(best_trained_model, metrics={\"Accuracy\": Accuracy()}, device=device, non_blocking=True)\n", | ||
| "\n", | ||
| " testloader = idist.auto_dataloader(testset, batch_size=4, shuffle=False, num_workers=2)\n", | ||
| "\n", | ||
| " test_evaluator.run(testloader)\n", | ||
| " print(f\"Best trial test set accuracy: {test_evaluator.state.metrics}\")" |
Member
Suggested change
| "def test_best_model(best_trial, data_dir=None):\n", | |
| " _, testset = load_data(data_dir)\n", | |
| " \n", | |
| " best_trained_model = idist.auto_model(Net(best_trial.config[\"l1\"], best_trial.config[\"l2\"]))\n", | |
| " device = idist.device()\n", | |
| "\n", | |
| " best_checkpoint_dir = best_trial.checkpoint.value\n", | |
| " model_state, optimizer_state = torch.load(os.path.join(\n", | |
| " best_checkpoint_dir, \"checkpoint\"))\n", | |
| " best_trained_model.load_state_dict(model_state)\n", | |
| "\n", | |
| " test_evaluator = create_supervised_evaluator(best_trained_model, metrics={\"Accuracy\": Accuracy()}, device=device, non_blocking=True)\n", | |
| "\n", | |
| " testloader = idist.auto_dataloader(testset, batch_size=4, shuffle=False, num_workers=2)\n", | |
| "\n", | |
| " test_evaluator.run(testloader)\n", | |
| " print(f\"Best trial test set accuracy: {test_evaluator.state.metrics}\")" | |
| "def test_best_model(best_trial, data_dir=None):\n", | |
| " _, testset = load_data(data_dir)\n", | |
| "\n", | |
| " best_trained_model = idist.auto_model(\n", | |
| " Net(best_trial.config[\"l1\"], best_trial.config[\"l2\"])\n", | |
| " )\n", | |
| " device = idist.device()\n", | |
| "\n", | |
| " best_checkpoint_dir = best_trial.checkpoint.value\n", | |
| " model_state, optimizer_state = torch.load(\n", | |
| " os.path.join(best_checkpoint_dir, \"checkpoint\")\n", | |
| " )\n", | |
| " best_trained_model.load_state_dict(model_state)\n", | |
| "\n", | |
| " test_evaluator = create_supervised_evaluator(\n", | |
| " best_trained_model,\n", | |
| " metrics={\"Accuracy\": Accuracy()},\n", | |
| " device=device,\n", | |
| " non_blocking=True,\n", | |
| " )\n", | |
| "\n", | |
| " testloader = idist.auto_dataloader(\n", | |
| " testset, batch_size=4, shuffle=False, num_workers=2\n", | |
| " )\n", | |
| "\n", | |
| " test_evaluator.run(testloader)\n", | |
| " print(f\"Best trial test set accuracy: {test_evaluator.state.metrics}\")" |
Comment on lines +358 to +389
| "def main(num_samples=10, max_num_epochs=10, gpus_per_trial=1):\n", | ||
| " data_dir = os.path.abspath(\"./data\")\n", | ||
| " load_data(data_dir)\n", | ||
| " \n", | ||
| " config = {\n", | ||
| " \"l1\": tune.sample_from(lambda _: 2**np.random.randint(2, 9)),\n", | ||
| " \"l2\": tune.sample_from(lambda _: 2**np.random.randint(2, 9)),\n", | ||
| " \"lr\": tune.loguniform(1e-4, 1e-1),\n", | ||
| " \"batch_size\": tune.choice([2, 4, 8, 16])\n", | ||
| " }\n", | ||
| " scheduler = ASHAScheduler(\n", | ||
| " metric=\"loss\",\n", | ||
| " mode=\"min\",\n", | ||
| " max_t=max_num_epochs,\n", | ||
| " grace_period=1,\n", | ||
| " reduction_factor=2)\n", | ||
| " reporter = CLIReporter(\n", | ||
| " metric_columns=[\"loss\", \"accuracy\", \"training_iteration\"])\n", | ||
| " result = tune.run(\n", | ||
| " partial(train_cifar, data_dir=data_dir),\n", | ||
| " resources_per_trial={\"cpu\": 2, \"gpu\": gpus_per_trial},\n", | ||
| " config=config,\n", | ||
| " num_samples=num_samples,\n", | ||
| " scheduler=scheduler,\n", | ||
| " progress_reporter=reporter)\n", | ||
| "\n", | ||
| " best_trial = result.get_best_trial(\"loss\", \"min\", \"last\")\n", | ||
| " print(f\"Best trial config: {best_trial.config}\")\n", | ||
| " print(f\"Best trial final validation loss: {best_trial.last_result['loss']}\")\n", | ||
| " print(f\"Best trial final validation accuracy: {best_trial.last_result['accuracy']}\")\n", | ||
| " \n", | ||
| " test_best_model(best_trial, data_dir)" |
Member
Suggested change
| "def main(num_samples=10, max_num_epochs=10, gpus_per_trial=1):\n", | |
| " data_dir = os.path.abspath(\"./data\")\n", | |
| " load_data(data_dir)\n", | |
| " \n", | |
| " config = {\n", | |
| " \"l1\": tune.sample_from(lambda _: 2**np.random.randint(2, 9)),\n", | |
| " \"l2\": tune.sample_from(lambda _: 2**np.random.randint(2, 9)),\n", | |
| " \"lr\": tune.loguniform(1e-4, 1e-1),\n", | |
| " \"batch_size\": tune.choice([2, 4, 8, 16])\n", | |
| " }\n", | |
| " scheduler = ASHAScheduler(\n", | |
| " metric=\"loss\",\n", | |
| " mode=\"min\",\n", | |
| " max_t=max_num_epochs,\n", | |
| " grace_period=1,\n", | |
| " reduction_factor=2)\n", | |
| " reporter = CLIReporter(\n", | |
| " metric_columns=[\"loss\", \"accuracy\", \"training_iteration\"])\n", | |
| " result = tune.run(\n", | |
| " partial(train_cifar, data_dir=data_dir),\n", | |
| " resources_per_trial={\"cpu\": 2, \"gpu\": gpus_per_trial},\n", | |
| " config=config,\n", | |
| " num_samples=num_samples,\n", | |
| " scheduler=scheduler,\n", | |
| " progress_reporter=reporter)\n", | |
| "\n", | |
| " best_trial = result.get_best_trial(\"loss\", \"min\", \"last\")\n", | |
| " print(f\"Best trial config: {best_trial.config}\")\n", | |
| " print(f\"Best trial final validation loss: {best_trial.last_result['loss']}\")\n", | |
| " print(f\"Best trial final validation accuracy: {best_trial.last_result['accuracy']}\")\n", | |
| " \n", | |
| " test_best_model(best_trial, data_dir)" | |
| "def main(num_samples=10, max_num_epochs=10, gpus_per_trial=1):\n", | |
| " data_dir = os.path.abspath(\"./data\")\n", | |
| " load_data(data_dir)\n", | |
| "\n", | |
| " config = {\n", | |
| " \"l1\": tune.sample_from(lambda _: 2 ** np.random.randint(2, 9)),\n", | |
| " \"l2\": tune.sample_from(lambda _: 2 ** np.random.randint(2, 9)),\n", | |
| " \"lr\": tune.loguniform(1e-4, 1e-1),\n", | |
| " \"batch_size\": tune.choice([2, 4, 8, 16]),\n", | |
| " }\n", | |
| " scheduler = ASHAScheduler(\n", | |
| " metric=\"loss\",\n", | |
| " mode=\"min\",\n", | |
| " max_t=max_num_epochs,\n", | |
| " grace_period=1,\n", | |
| " reduction_factor=2,\n", | |
| " )\n", | |
| " reporter = CLIReporter(metric_columns=[\"loss\", \"accuracy\", \"training_iteration\"])\n", | |
| " result = tune.run(\n", | |
| " partial(train_cifar, data_dir=data_dir),\n", | |
| " resources_per_trial={\"cpu\": 2, \"gpu\": gpus_per_trial},\n", | |
| " config=config,\n", | |
| " num_samples=num_samples,\n", | |
| " scheduler=scheduler,\n", | |
| " progress_reporter=reporter,\n", | |
| " )\n", | |
| "\n", | |
| " best_trial = result.get_best_trial(\"loss\", \"min\", \"last\")\n", | |
| " print(f\"Best trial config: {best_trial.config}\")\n", | |
| " print(f\"Best trial final validation loss: {best_trial.last_result['loss']}\")\n", | |
| " print(f\"Best trial final validation accuracy: {best_trial.last_result['accuracy']}\")\n", | |
| "\n", | |
| " test_best_model(best_trial, data_dir)" |
Member
What about adding some summarizing sentences about the best trial and how to interpret the results?
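One possible shape for such a summary sentence, sketched here as an illustration: the field names mirror the keys the notebook passes to `tune.report`, but the values and the `last_result` dict itself are invented for the example.

```python
# Hypothetical best_trial.last_result contents; the keys match what the
# notebook reports via tune.report(...), the values are made up.
last_result = {"loss": 1.23, "accuracy": 0.55, "training_iteration": 10}

summary = (
    f"Best trial reached validation loss {last_result['loss']:.2f} "
    f"and accuracy {last_result['accuracy']:.0%} "
    f"after {last_result['training_iteration']} training iterations."
)
print(summary)
```

A sentence like this after the `main` cell would tell readers which numbers matter and how to read them.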
| "For every trial, Ray Tune will randomly sample a combination of parameters from these search spaces. It will then train a number of models in parallel and find the best performing one among these. \n", | ||
| "We also use the `ASHAScheduler()` which is one of the trial schedulers that aggressively terminate low-performing trials.\n", | ||
| "Apart from that, we leverage the `CLIReporter()` to prettify our outputs.\n", | ||
| "And then, we wrap `train_cifar` in functools.partial and pass it to `tune.run` along with other resources like the CPUs and GPUs available to use, the configurable parameters, the number of trials, scheduler and reporter.\n", |
Member
nit
Suggested change
| "And then, we wrap `train_cifar` in functools.partial and pass it to `tune.run` along with other resources like the CPUs and GPUs available to use, the configurable parameters, the number of trials, scheduler and reporter.\n", | |
| "And then, we wrap `train_cifar` in `functools.partial` and pass it to `tune.run` along with other resources like the CPUs and GPUs available to use, the configurable parameters, the number of trials, scheduler and reporter.\n", |
| "id": "vJgTaKWU8Doq" | ||
| }, | ||
| "source": [ | ||
| "In this tutorial, we will see how [Ray Tune](https://docs.ray.io/en/stable/tune.html) can be used with Ignite for hyperparameter tuning. We will also compare it with other frameworks like [Optuna](https://optuna.org/) and [Ax](https://ax.dev/) for hyperparameter optimization.\n", |
Member
Are we going to add comparisons with Ax and Optuna?
Fixes #29