# xautodl/exps/NATS-algos/bohb.py
#
# 277 lines
# 8.9 KiB
# Python

##################################################
# Copyright (c) Xuanyi Dong [GitHub D-X-Y], 2020 #
###################################################################
# BOHB: Robust and Efficient Hyperparameter Optimization at Scale #
# required to install hpbandster ##################################
# pip install hpbandster ##################################
###################################################################
# OMP_NUM_THREADS=4 python exps/NATS-algos/bohb.py --search_space tss --dataset cifar10 --num_samples 4 --random_fraction 0.0 --bandwidth_factor 3 --rand_seed 1
# OMP_NUM_THREADS=4 python exps/NATS-algos/bohb.py --search_space sss --dataset cifar10 --num_samples 4 --random_fraction 0.0 --bandwidth_factor 3 --rand_seed 1
###################################################################
import os, sys, time, random, argparse, collections
from copy import deepcopy
import torch
from xautodl.config_utils import load_config
from xautodl.datasets import get_datasets, SearchDataset
from xautodl.procedures import prepare_seed, prepare_logger
from xautodl.log_utils import AverageMeter, time_string, convert_secs2time
from xautodl.models import CellStructure, get_search_spaces
from nats_bench import create
# BOHB: Robust and Efficient Hyperparameter Optimization at Scale, ICML 2018
import ConfigSpace
from hpbandster.optimizers.bohb import BOHB
import hpbandster.core.nameserver as hpns
from hpbandster.core.worker import Worker
def get_topology_config_space(search_space, max_nodes=4):
    """Build the configuration space describing a cell topology.

    One categorical hyperparameter named ``"<i><-<j>"`` is registered for
    every node pair with ``0 <= j < i < max_nodes``; each of them chooses
    among the entries of ``search_space``.

    Args:
        search_space: the choices handed to every categorical hyperparameter.
        max_nodes: number of nodes in the cell (node 0 has no incoming edge).

    Returns:
        The populated ``ConfigSpace.ConfigurationSpace``.
    """
    config_space = ConfigSpace.ConfigurationSpace()
    # Same registration order as nested loops: target node first, then source.
    edge_names = (
        "{:}<-{:}".format(dst, src)
        for dst in range(1, max_nodes)
        for src in range(dst)
    )
    for edge_name in edge_names:
        edge_choice = ConfigSpace.CategoricalHyperparameter(edge_name, search_space)
        config_space.add_hyperparameter(edge_choice)
    return config_space
def get_size_config_space(search_space):
    """Build the configuration space describing per-layer size choices.

    Args:
        search_space: mapping with ``"numbers"`` (how many ``"layer-<k>"``
            hyperparameters to create) and ``"candidates"`` (the choices
            offered by each of them).

    Returns:
        The populated ``ConfigSpace.ConfigurationSpace``.
    """
    config_space = ConfigSpace.ConfigurationSpace()
    num_layers = search_space["numbers"]
    for layer_idx in range(num_layers):
        layer_choice = ConfigSpace.CategoricalHyperparameter(
            "layer-{:}".format(layer_idx), search_space["candidates"]
        )
        config_space.add_hyperparameter(layer_choice)
    return config_space
def config2topology_func(max_nodes=4):
    """Return a converter from a sampled configuration to a ``CellStructure``.

    Args:
        max_nodes: number of nodes in the cell; must match the value used to
            build the configuration space so that every ``"<i><-<j>"`` key
            exists in the configuration.

    Returns:
        A function ``config -> CellStructure``.
    """

    def config2structure(config):
        # For each node i >= 1, gather one (operation, source node) pair per
        # predecessor j < i, looked up under the key "<i><-<j>".
        genotypes = [
            tuple((config["{:}<-{:}".format(dst, src)], src) for src in range(dst))
            for dst in range(1, max_nodes)
        ]
        return CellStructure(genotypes)

    return config2structure
def config2size_func(search_space):
    """Return a converter from a sampled configuration to a size string.

    Args:
        search_space: mapping whose ``"numbers"`` entry gives how many
            ``"layer-<k>"`` keys are read from each configuration.

    Returns:
        A function ``config -> str`` that joins the per-layer values, in layer
        order, with ``":"``.
    """

    def config2structure(config):
        layer_keys = ("layer-{:}".format(k) for k in range(search_space["numbers"]))
        return ":".join(str(config[key]) for key in layer_keys)

    return config2structure
class MyWorker(Worker):
    """Worker that evaluates sampled configurations through the benchmark API.

    Besides answering the optimizer, it records every evaluation so the caller
    can reconstruct the search history afterwards:

    * ``trajectory``: list of ``(accuracy, arch)`` in evaluation order.
    * ``total_times``: the ``total_time`` value returned by the API for each
      evaluation, in the same order.
    """

    def __init__(self, *args, convert_func=None, dataset=None, api=None, **kwargs):
        """Store the search helpers; all other arguments go to ``Worker``."""
        super().__init__(*args, **kwargs)
        self._api = api
        self._dataset = dataset
        self.convert_func = convert_func
        self.trajectory = []
        self.total_times = []

    def compute(self, config, budget, **kwargs):
        """Evaluate ``config`` at ``budget`` and report ``100 - accuracy`` as loss.

        The budget is forwarded to the API as ``iepoch=int(budget) - 1`` with
        ``hp="12"``.  The returned dict carries the loss under ``"loss"`` and
        the API index of the architecture under ``"info"``.
        """
        arch = self.convert_func(config)
        epoch_index = int(budget) - 1
        # Latency and per-call time cost are returned by the API but unused here.
        accuracy, _latency, _time_cost, total_time = self._api.simulate_train_eval(
            arch, self._dataset, iepoch=epoch_index, hp="12"
        )
        self.trajectory.append((accuracy, arch))
        self.total_times.append(total_time)
        arch_index = self._api.query_index_by_arch(arch)
        return {"loss": 100 - accuracy, "info": arch_index}
def main(xargs, api):
    """Run one BOHB search on the benchmark and log the best architecture.

    Args:
        xargs: parsed command-line namespace.  This function reads
            ``rand_seed``, ``search_space``, ``dataset``, ``num_samples``,
            ``random_fraction``, ``bandwidth_factor``, ``min_bandwidth`` and
            ``n_iters`` from it, and also hands it to ``prepare_logger``.
        api: benchmark API object used to simulate training and to look up
            architecture indexes / info strings.

    Returns:
        A tuple ``(log_dir, current_best_index, total_times)`` where
        ``current_best_index[k]`` is the API index of the most accurate
        architecture among the first ``k + 1`` evaluations and ``total_times``
        is the list of ``total_time`` values recorded by the worker.

    Raises:
        ValueError: if the search finished without evaluating any
            configuration.
    """
    torch.set_num_threads(4)
    prepare_seed(xargs.rand_seed)
    # Use the function argument, not the module-level ``args``: the global only
    # exists when this file is executed as a script.
    logger = prepare_logger(xargs)
    logger.log("{:} use api : {:}".format(time_string(), api))
    api.reset_time()

    search_space = get_search_spaces(xargs.search_space, "nats-bench")
    if xargs.search_space == "tss":
        cs = get_topology_config_space(search_space)
        config2structure = config2topology_func()
    else:
        cs = get_size_config_space(search_space)
        config2structure = config2size_func(search_space)

    hb_run_id = "0"
    NS = hpns.NameServer(run_id=hb_run_id, host="localhost", port=0)
    ns_host, ns_port = NS.start()
    num_workers = 1
    workers = []
    # The name server and the BOHB master own background resources; make sure
    # they are shut down even when the search raises.
    try:
        for i in range(num_workers):
            w = MyWorker(
                nameserver=ns_host,
                nameserver_port=ns_port,
                convert_func=config2structure,
                dataset=xargs.dataset,
                api=api,
                run_id=hb_run_id,
                id=i,
            )
            w.run(background=True)
            workers.append(w)

        bohb = BOHB(
            configspace=cs,
            run_id=hb_run_id,
            eta=3,
            min_budget=1,
            max_budget=12,
            nameserver=ns_host,
            nameserver_port=ns_port,
            num_samples=xargs.num_samples,
            random_fraction=xargs.random_fraction,
            bandwidth_factor=xargs.bandwidth_factor,
            ping_interval=10,
            min_bandwidth=xargs.min_bandwidth,
        )
        try:
            bohb.run(xargs.n_iters, min_n_workers=num_workers)
        finally:
            bohb.shutdown(shutdown_workers=True)
    finally:
        NS.shutdown()

    trajectory = workers[0].trajectory
    total_times = workers[0].total_times

    # Track the incumbent in a single pass.  The strict ``>`` keeps the
    # earliest record on ties, exactly like ``max(prefix, key=...)`` would.
    current_best_index = []
    best_record = None
    for record in trajectory:
        if best_record is None or record[0] > best_record[0]:
            best_record = record
        current_best_index.append(api.query_index_by_arch(best_record[1]))

    if best_record is None:
        logger.close()
        raise ValueError("BOHB finished without evaluating any configuration.")

    best_arch = best_record[1]
    logger.log(
        "Best found configuration: {:} within {:.3f} s".format(
            best_arch, total_times[-1]
        )
    )
    info = api.query_info_str_by_arch(
        best_arch, "200" if xargs.search_space == "tss" else "90"
    )
    logger.log("{:}".format(info))
    logger.log("-" * 100)
    logger.close()
    return logger.log_dir, current_best_index, total_times
# Script entry point: parse the command line, load the benchmark API and run
# either a single seeded search or ``--loops_if_rand`` randomly seeded searches.
if __name__ == "__main__":
    # The title is the parser *description*; the first positional parameter of
    # ArgumentParser is ``prog`` and would replace the program name in usage.
    parser = argparse.ArgumentParser(
        description="BOHB: Robust and Efficient Hyperparameter Optimization at Scale"
    )
    parser.add_argument(
        "--dataset",
        type=str,
        required=True,
        choices=["cifar10", "cifar100", "ImageNet16-120"],
        help="Choose between Cifar10/100 and ImageNet-16.",
    )
    # general arg
    parser.add_argument(
        "--search_space",
        type=str,
        required=True,
        choices=["tss", "sss"],
        help="Choose the search space.",
    )
    # NOTE: in this file --time_budget is only used to name the save directory.
    parser.add_argument(
        "--time_budget",
        type=int,
        default=20000,
        help="The total time cost budge for searching (in seconds).",
    )
    parser.add_argument(
        "--loops_if_rand", type=int, default=500, help="The total runs for evaluation."
    )
    # BOHB
    # NOTE: --strategy is parsed but not read anywhere else in this file.
    parser.add_argument(
        "--strategy",
        default="sampling",
        type=str,
        nargs="?",
        help="optimization strategy for the acquisition function",
    )
    parser.add_argument(
        "--min_bandwidth",
        default=0.3,
        type=float,
        nargs="?",
        help="minimum bandwidth for KDE",
    )
    parser.add_argument(
        "--num_samples",
        default=64,
        type=int,
        nargs="?",
        help="number of samples for the acquisition function",
    )
    parser.add_argument(
        "--random_fraction",
        default=0.33,
        type=float,
        nargs="?",
        help="fraction of random configurations",
    )
    parser.add_argument(
        "--bandwidth_factor",
        default=3,
        type=int,
        nargs="?",
        help="factor multiplied to the bandwidth",
    )
    parser.add_argument(
        "--n_iters",
        default=300,
        type=int,
        nargs="?",
        help="number of iterations for optimization method",
    )
    # log
    parser.add_argument(
        "--save_dir",
        type=str,
        default="./output/search",
        help="Folder to save checkpoints and log.",
    )
    parser.add_argument("--rand_seed", type=int, default=-1, help="manual seed")
    args = parser.parse_args()

    # With a negative seed the results file is written next to the log of the
    # last run, so at least one run is needed; fail before loading the API.
    if args.rand_seed < 0 and args.loops_if_rand <= 0:
        parser.error("--loops_if_rand must be positive when --rand_seed is negative")

    api = create(None, args.search_space, fast_mode=False, verbose=False)

    args.save_dir = os.path.join(
        "{:}-{:}".format(args.save_dir, args.search_space),
        "{:}-T{:}".format(args.dataset, args.time_budget),
        "BOHB",
    )
    print("save-dir : {:}".format(args.save_dir))

    if args.rand_seed < 0:
        # Repeat the search with a fresh random seed each time and collect the
        # per-run incumbent indexes and time stamps into one file.
        save_dir, all_info = None, collections.OrderedDict()
        for i in range(args.loops_if_rand):
            print("{:} : {:03d}/{:03d}".format(time_string(), i, args.loops_if_rand))
            args.rand_seed = random.randint(1, 100000)
            save_dir, all_archs, all_total_times = main(args, api)
            all_info[i] = {"all_archs": all_archs, "all_total_times": all_total_times}
        save_path = save_dir / "results.pth"
        print("save into {:}".format(save_path))
        torch.save(all_info, save_path)
    else:
        main(args, api)