diff --git a/README.md b/README.md
index da9b7a8..972eaa6 100644
--- a/README.md
+++ b/README.md
@@ -24,8 +24,8 @@ CUDA_VISIBLE_DEVICES=0 bash ./scripts-cnn/train-cifar.sh GDAS_V1 cifar100 cut
 
 Train the searched CNN on ImageNet
 ```
-CUDA_VISIBLE_DEVICES=0 bash ./scripts-cnn/train-imagenet.sh GDAS_F1 52 14
-CUDA_VISIBLE_DEVICES=0 bash ./scripts-cnn/train-imagenet.sh GDAS_V1 50 14
+CUDA_VISIBLE_DEVICES=0 bash ./scripts-cnn/train-imagenet.sh GDAS_F1 52 14 B128 -1
+CUDA_VISIBLE_DEVICES=0 bash ./scripts-cnn/train-imagenet.sh GDAS_V1 50 14 B128 -1
 ```
 
 Evaluate a trained CNN model
diff --git a/configs/nas-imagenet-B128.config b/configs/nas-imagenet-B128.config
new file mode 100644
index 0000000..442b497
--- /dev/null
+++ b/configs/nas-imagenet-B128.config
@@ -0,0 +1,15 @@
+{
+  "type" : ["str", "steplr"],
+  "batch_size": ["int", 128],
+  "epochs" : ["int", 250],
+  "decay_period": ["int", 1],
+  "gamma" : ["float", 0.97],
+  "momentum" : ["float", 0.9],
+  "decay" : ["float", 0.00003],
+  "LR" : ["float", 0.1],
+  "label_smooth": ["float", 0.1],
+  "auxiliary" : ["bool", 1],
+  "auxiliary_weight" : ["float", 0.4],
+  "grad_clip" : ["float", 5],
+  "drop_path_prob" : ["float", 0]
+}
diff --git a/configs/nas-imagenet-B256.config b/configs/nas-imagenet-B256.config
new file mode 100644
index 0000000..a5926fb
--- /dev/null
+++ b/configs/nas-imagenet-B256.config
@@ -0,0 +1,15 @@
+{
+  "type" : ["str", "steplr"],
+  "batch_size": ["int", 256],
+  "epochs" : ["int", 250],
+  "decay_period": ["int", 1],
+  "gamma" : ["float", 0.97],
+  "momentum" : ["float", 0.9],
+  "decay" : ["float", 0.00003],
+  "LR" : ["float", 0.1],
+  "label_smooth": ["float", 0.1],
+  "auxiliary" : ["bool", 1],
+  "auxiliary_weight" : ["float", 0.4],
+  "grad_clip" : ["float", 5],
+  "drop_path_prob" : ["float", 0]
+}
diff --git a/exps-cnn/train_base.py b/exps-cnn/train_base.py
index 6e8c003..560ca5e 100644
--- a/exps-cnn/train_base.py
+++ b/exps-cnn/train_base.py
@@ -42,7 +42,7 @@ else : print('Find CUDA_VISIBLE_DEVICES={:
 
 assert torch.cuda.is_available(), 'torch.cuda is not available'
 
-if args.manualSeed is None:
+if args.manualSeed is None or args.manualSeed < 0:
   args.manualSeed = random.randint(1, 10000)
 random.seed(args.manualSeed)
 cudnn.benchmark = True
@@ -54,10 +54,10 @@ torch.cuda.manual_seed_all(args.manualSeed)
 
 def main():
   # Init logger
-  args.save_path = os.path.join(args.save_path, 'seed-{:}'.format(args.manualSeed))
+  #args.save_path = os.path.join(args.save_path, 'seed-{:}'.format(args.manualSeed))
   if not os.path.isdir(args.save_path):
     os.makedirs(args.save_path)
-  log = open(os.path.join(args.save_path, 'log-seed-{:}.txt'.format(args.manualSeed)), 'w')
+  log = open(os.path.join(args.save_path, 'seed-{:}-log.txt'.format(args.manualSeed)), 'w')
   print_log('Save Path : {:}'.format(args.save_path), log)
   state = {k: v for k, v in args._get_kwargs()}
   print_log(state, log)
diff --git a/exps-cnn/train_utils.py b/exps-cnn/train_utils.py
index 7e0880e..efc90f3 100644
--- a/exps-cnn/train_utils.py
+++ b/exps-cnn/train_utils.py
@@ -59,8 +59,8 @@ def main_procedure(config, dataset, data_path, args, genotype, init_channels, la
     raise ValueError('Can not find the schedular type : {:}'.format(config.type))
 
-  checkpoint_path = os.path.join(args.save_path, 'checkpoint-{:}-model.pth'.format(dataset))
-  checkpoint_best = os.path.join(args.save_path, 'checkpoint-{:}-best.pth'.format(dataset))
+  checkpoint_path = os.path.join(args.save_path, 'seed-{:}-checkpoint-{:}-model.pth'.format(args.manualSeed, dataset))
+  checkpoint_best = os.path.join(args.save_path, 'seed-{:}-checkpoint-{:}-best.pth'.format(args.manualSeed, dataset))
 
   if pure_evaluate:
     print_log('-'*20 + 'Pure Evaluation' + '-'*20, log)
     basemodel.load_state_dict( pure_evaluate )
diff --git a/exps-cnn/train_utils_imagenet.py b/exps-cnn/train_utils_imagenet.py
index 24357cc..d0c8928 100644
--- a/exps-cnn/train_utils_imagenet.py
+++ b/exps-cnn/train_utils_imagenet.py
@@ -81,8 +81,8 @@ def main_procedure_imagenet(config, data_path, args, genotype, init_channels, la
 
     raise ValueError('Can not find the schedular type : {:}'.format(config.type))
 
-  checkpoint_path = os.path.join(args.save_path, 'checkpoint-imagenet-model.pth')
-  checkpoint_best = os.path.join(args.save_path, 'checkpoint-imagenet-best.pth')
+  checkpoint_path = os.path.join(args.save_path, 'seed-{:}-checkpoint-imagenet-model.pth'.format(args.manualSeed))
+  checkpoint_best = os.path.join(args.save_path, 'seed-{:}-checkpoint-imagenet-best.pth'.format(args.manualSeed))
 
   if pure_evaluate:
     print_log('-'*20 + 'Pure Evaluation' + '-'*20, log)
diff --git a/scripts-cluster/job-script.sh b/scripts-cluster/job-script.sh
index 172f563..0c4ebde 100644
--- a/scripts-cluster/job-script.sh
+++ b/scripts-cluster/job-script.sh
@@ -1,15 +1,16 @@
 #!/bin/bash
 #
 echo "CHECK-DATA-DIR START"
-#sh /home/HGCP_Program/software-install/afs_mount/bin/afs_mount.sh \
-#  COMM_KM_Data COMM_km_2018 \
-#  `pwd`/hadoop-data \
-#  afs://xingtian.afs.baidu.com:9902/user/COMM_KM_Data/dongxuanyi/datasets
+sh /home/HGCP_Program/software-install/afs_mount/bin/afs_mount.sh \
+  COMM_KM_Data COMM_km_2018 \
+  `pwd`/hadoop-data \
+  afs://xingtian.afs.baidu.com:9902/user/COMM_KM_Data/dongxuanyi/datasets
 
 export TORCH_HOME="./data/data/"
-wget -q http://10.127.2.44:8000/cifar.python.tar --directory-prefix=${TORCH_HOME}
-tar xvf ${TORCH_HOME}/cifar.python.tar -C ${TORCH_HOME}
-rm ${TORCH_HOME}/cifar.python.tar
+#wget -q http://10.127.2.44:8000/cifar.python.tar --directory-prefix=${TORCH_HOME}
+#tar -xvf ${TORCH_HOME}/cifar.python.tar -C ${TORCH_HOME}
+tar -xf ./hadoop-data/cifar.python.tar -C ${TORCH_HOME}
+#rm ${TORCH_HOME}/cifar.python.tar
 #tar xvf ./hadoop-data/ILSVRC2012.tar -C ${TORCH_HOME}
 
 cifar_dir="${TORCH_HOME}/cifar.python"
diff --git a/scripts-cnn/train-imagenet.sh b/scripts-cnn/train-imagenet.sh
index d934ebd..5569656 100644
--- a/scripts-cnn/train-imagenet.sh
+++ b/scripts-cnn/train-imagenet.sh
@@ -1,7 +1,7 @@
 #!/usr/bin/env sh
-if [ "$#" -ne 3 ] ;then
+if [ "$#" -ne 5 ] ;then
   echo "Input illegal number of parameters " $#
-  echo "Need 3 parameters for the architecture, and the channel and the layers"
+  echo "Need 5 parameters for the architecture, and the channel, and the layers, and the batch-size, and the seed"
   exit 1
 fi
 if [ "$TORCH_HOME" = "" ]; then
@@ -15,7 +15,9 @@ arch=$1
 dataset=imagenet
 channels=$2
 layers=$3
-SAVED=./output/NAS-CNN/${arch}-${dataset}-C${channels}-L${layers}-E250
+BATCH=$4
+seed=$5
+SAVED=./output/NAS-CNN/${arch}-${dataset}-C${channels}-L${layers}-${BATCH}-E250
 
 PY_C="./env/bin/python"
 #PY_C="$CONDA_PYTHON_EXE"
@@ -27,8 +29,8 @@ else
   echo "Cluster Run with Python: "${PY_C}
   echo "Unzip ILSVRC2012"
   tar --version
-  #tar xf ./hadoop-data/ILSVRC2012.tar -C ${TORCH_HOME}
-  commands="./data/data/get_imagenet.sh"
+  tar -xf ./hadoop-data/ILSVRC2012.tar -C ${TORCH_HOME}
+  #commands="./data/data/get_imagenet.sh"
   #${PY_C} ./data/decompress.py ./hadoop-data/ILSVRC2012-TAR ./data/data/ILSVRC2012 tar > ${commands}
   #${PY_C} ./data/decompress.py ./hadoop-data/ILSVRC2012-ZIP ./data/data/ILSVRC2012 zip > ./data/data/get_imagenet.sh
   #bash ./data/data/get_imagenet.sh
@@ -42,16 +44,16 @@ else
   #  free -g
   #done < "${commands}"
   #wget http://10.127.2.44:8000/ILSVRC2012.tar --directory-prefix=${TORCH_HOME}
-  ${PY_C} ./data/decompress.py ./data/classes.txt ${TORCH_HOME}/ILSVRC2012 wget > ${commands}
-  count=0
-  while read -r line; do
-    temp_file="./data/data/TEMP-${count}.sh"
-    echo "${line}" > ${temp_file}
-    bash ${temp_file}
-    count=$((count+1))
+  #${PY_C} ./data/decompress.py ./data/classes.txt ${TORCH_HOME}/ILSVRC2012 wget > ${commands}
+  #count=0
+  #while read -r line; do
+  #  temp_file="./data/data/TEMP-${count}.sh"
+  #  echo "${line}" > ${temp_file}
+  #  bash ${temp_file}
+  #  count=$((count+1))
   #${PY_C} ./data/ps_mem.py -p $$
   #  free -g
-  done < "${commands}"
+  #done < "${commands}"
 #echo "Copy ILSVRC2012 done"
 #tar -xvf ${TORCH_HOME}/ILSVRC2012.tar -C ${TORCH_HOME}
 #rm ${TORCH_HOME}/ILSVRC2012.tar
@@ -66,5 +68,6 @@ ${PY_C} ./exps-cnn/train_base.py \
   --save_path ${SAVED} \
   --grad_clip 5 \
   --init_channels ${channels} --layers ${layers} \
-  --model_config ./configs/nas-imagenet.config \
+  --model_config ./configs/nas-imagenet-${BATCH}.config \
+  --manualSeed ${seed} \
   --print_freq 200 --workers 20
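For context, this patch introduces two conventions: every field in the new `configs/nas-imagenet-B*.config` files is stored as a typed `["type", value]` pair, and a `--manualSeed` of `-1` (the value passed by the updated `train-imagenet.sh`) now requests a randomly drawn seed. Below is a minimal, self-contained sketch of how such a config could be parsed and the seed resolved; `load_config` and `resolve_seed` are illustrative stand-ins, not the repository's actual helpers.

```python
import json
import random
from collections import namedtuple

# Casts for the ["type", value] pairs used in the config files;
# "bool" is stored as 0/1, so convert via int first.
_CASTS = {'str': str, 'int': int, 'float': float, 'bool': lambda v: bool(int(v))}

def load_config(path):
  """Parse a config whose fields are stored as ["type", value] pairs."""
  with open(path) as f:
    raw = json.load(f)
  data = {key: _CASTS[typ](value) for key, (typ, value) in raw.items()}
  # Expose fields as attributes, mirroring the config.type / config.batch_size
  # style of access seen in train_utils.py.
  return namedtuple('Config', data.keys())(**data)

def resolve_seed(manual_seed):
  """Mirror the train_base.py change: None or a negative seed means
  "draw a random one"; any non-negative seed is kept for reproducibility."""
  if manual_seed is None or manual_seed < 0:
    manual_seed = random.randint(1, 10000)
  return manual_seed

if __name__ == '__main__':
  config = load_config('./configs/nas-imagenet-B128.config')
  print(config.batch_size, config.LR, config.auxiliary)  # -> 128 0.1 True
  print(resolve_seed(-1))  # -> some seed in [1, 10000], as with `seed=-1` above
```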