update ImageNet training

Author: Xuanyi Dong, 2019-04-04 20:29:41 +08:00
Parent: 666c105f51
Commit: 4121d1719f
8 changed files with 64 additions and 30 deletions

README.md

@@ -24,8 +24,8 @@ CUDA_VISIBLE_DEVICES=0 bash ./scripts-cnn/train-cifar.sh GDAS_V1 cifar100 cut
 Train the searched CNN on ImageNet
 ```
-CUDA_VISIBLE_DEVICES=0 bash ./scripts-cnn/train-imagenet.sh GDAS_F1 52 14
-CUDA_VISIBLE_DEVICES=0 bash ./scripts-cnn/train-imagenet.sh GDAS_V1 50 14
+CUDA_VISIBLE_DEVICES=0 bash ./scripts-cnn/train-imagenet.sh GDAS_F1 52 14 B128 -1
+CUDA_VISIBLE_DEVICES=0 bash ./scripts-cnn/train-imagenet.sh GDAS_V1 50 14 B128 -1
 ```
 Evaluate a trained CNN model
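
The two new trailing arguments are a batch-size tag and a manual seed: `B128` selects `./configs/nas-imagenet-B128.config` (the new config files below) and `-1` asks for a random seed. A minimal sketch of that resolution logic, using a hypothetical helper `resolve_run_options` that is not part of the repo:

```
import random

def resolve_run_options(batch_tag, seed):
    # Hypothetical helper mirroring the updated scripts: argument 4
    # (e.g. "B128") names the config file, argument 5 is the manual seed.
    config = './configs/nas-imagenet-{:}.config'.format(batch_tag)
    if seed is None or seed < 0:         # -1 means: draw a random seed
        seed = random.randint(1, 10000)  # same range train_base.py uses
    return config, seed

print(resolve_run_options('B128', -1))
```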

configs/nas-imagenet-B128.config (new file)

@@ -0,0 +1,15 @@
+{
+  "type" : ["str", "steplr"],
+  "batch_size": ["int", 128],
+  "epochs" : ["int", 250],
+  "decay_period": ["int", 1],
+  "gamma" : ["float", 0.97],
+  "momentum" : ["float", 0.9],
+  "decay" : ["float", 0.00003],
+  "LR" : ["float", 0.1],
+  "label_smooth": ["float", 0.1],
+  "auxiliary" : ["bool", 1],
+  "auxiliary_weight" : ["float", 0.4],
+  "grad_clip" : ["float", 5],
+  "drop_path_prob" : ["float", 0]
+}

configs/nas-imagenet-B256.config (new file)

@@ -0,0 +1,15 @@
+{
+  "type" : ["str", "steplr"],
+  "batch_size": ["int", 256],
+  "epochs" : ["int", 250],
+  "decay_period": ["int", 1],
+  "gamma" : ["float", 0.97],
+  "momentum" : ["float", 0.9],
+  "decay" : ["float", 0.00003],
+  "LR" : ["float", 0.1],
+  "label_smooth": ["float", 0.1],
+  "auxiliary" : ["bool", 1],
+  "auxiliary_weight" : ["float", 0.4],
+  "grad_clip" : ["float", 5],
+  "drop_path_prob" : ["float", 0]
+}
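
Both new configs store every hyperparameter as a `[type-tag, value]` pair. A hedged sketch of a loader for this format; the repo ships its own loader, so `load_config` below is illustrative only:

```
import json
from collections import namedtuple

def load_config(path):
    # Each field is a [type-tag, value] pair, e.g. "LR": ["float", 0.1],
    # so cast the raw value according to its tag before use.
    with open(path) as f:
        raw = json.load(f)
    casts = {'str': str, 'int': int, 'float': float, 'bool': bool}
    data = {key: casts[tag](value) for key, (tag, value) in raw.items()}
    return namedtuple('Config', data.keys())(**data)

# Assumes the repo layout with the B128 config added in this commit.
config = load_config('./configs/nas-imagenet-B128.config')
print(config.type, config.batch_size, config.LR)  # steplr 128 0.1
```

The B256 variant differs only in `batch_size`, which is exactly what the `${BATCH}` tag in the revised training script selects between.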

exps-cnn/train_base.py

@@ -42,7 +42,7 @@ else : print('Find CUDA_VISIBLE_DEVICES={:
 assert torch.cuda.is_available(), 'torch.cuda is not available'
-if args.manualSeed is None:
+if args.manualSeed is None or args.manualSeed < 0:
   args.manualSeed = random.randint(1, 10000)
 random.seed(args.manualSeed)
 cudnn.benchmark = True
@@ -54,10 +54,10 @@ torch.cuda.manual_seed_all(args.manualSeed)
 def main():
   # Init logger
-  args.save_path = os.path.join(args.save_path, 'seed-{:}'.format(args.manualSeed))
+  #args.save_path = os.path.join(args.save_path, 'seed-{:}'.format(args.manualSeed))
   if not os.path.isdir(args.save_path):
     os.makedirs(args.save_path)
-  log = open(os.path.join(args.save_path, 'log-seed-{:}.txt'.format(args.manualSeed)), 'w')
+  log = open(os.path.join(args.save_path, 'seed-{:}-log.txt'.format(args.manualSeed)), 'w')
   print_log('Save Path : {:}'.format(args.save_path), log)
   state = {k: v for k, v in args._get_kwargs()}
   print_log(state, log)
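
Two behavioral changes here: a negative `--manualSeed` now also triggers a random seed, which is what lets the shell scripts pass `-1`, and the per-seed output subdirectory is replaced by seed-prefixed log filenames in a shared `save_path`. A simplified sketch of the resulting setup flow (requires PyTorch; `setup_run` is a hypothetical condensation, not the file's actual structure):

```
import os
import random

import torch
import torch.backends.cudnn as cudnn

def setup_run(save_path, manual_seed):
    # None or a negative value both mean "pick a random seed".
    if manual_seed is None or manual_seed < 0:
        manual_seed = random.randint(1, 10000)
    random.seed(manual_seed)
    cudnn.benchmark = True
    torch.manual_seed(manual_seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(manual_seed)
    # Several seeds now share one save_path; only filenames differ.
    os.makedirs(save_path, exist_ok=True)
    log_file = os.path.join(save_path, 'seed-{:}-log.txt'.format(manual_seed))
    return manual_seed, open(log_file, 'w')
```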

exps-cnn/train_utils.py

@@ -59,8 +59,8 @@ def main_procedure(config, dataset, data_path, args, genotype, init_channels, la
     raise ValueError('Can not find the schedular type : {:}'.format(config.type))
-  checkpoint_path = os.path.join(args.save_path, 'checkpoint-{:}-model.pth'.format(dataset))
-  checkpoint_best = os.path.join(args.save_path, 'checkpoint-{:}-best.pth'.format(dataset))
+  checkpoint_path = os.path.join(args.save_path, 'seed-{:}-checkpoint-{:}-model.pth'.format(args.manualSeed, dataset))
+  checkpoint_best = os.path.join(args.save_path, 'seed-{:}-checkpoint-{:}-best.pth'.format(args.manualSeed, dataset))
   if pure_evaluate:
     print_log('-'*20 + 'Pure Evaluation' + '-'*20, log)
     basemodel.load_state_dict( pure_evaluate )

exps-cnn/train_utils_imagenet.py

@@ -81,8 +81,8 @@ def main_procedure_imagenet(config, data_path, args, genotype, init_channels, la
     raise ValueError('Can not find the schedular type : {:}'.format(config.type))
-  checkpoint_path = os.path.join(args.save_path, 'checkpoint-imagenet-model.pth')
-  checkpoint_best = os.path.join(args.save_path, 'checkpoint-imagenet-best.pth')
+  checkpoint_path = os.path.join(args.save_path, 'seed-{:}-checkpoint-imagenet-model.pth'.format(args.manualSeed))
+  checkpoint_best = os.path.join(args.save_path, 'seed-{:}-checkpoint-imagenet-best.pth'.format(args.manualSeed))
   if pure_evaluate:
     print_log('-'*20 + 'Pure Evaluation' + '-'*20, log)
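
The same seed-prefixing applies to checkpoint files in both the CIFAR and ImageNet procedures, so runs with different seeds can share one `save_path` without clobbering each other's checkpoints. A small sketch of the naming scheme (`checkpoint_paths` is a hypothetical helper; `dataset=None` stands in for the ImageNet variant):

```
import os

def checkpoint_paths(save_path, manual_seed, dataset=None):
    # Mirrors the new naming: seed first, then the dataset tag.
    tag = 'imagenet' if dataset is None else dataset
    model = os.path.join(save_path, 'seed-{:}-checkpoint-{:}-model.pth'.format(manual_seed, tag))
    best = os.path.join(save_path, 'seed-{:}-checkpoint-{:}-best.pth'.format(manual_seed, tag))
    return model, best

print(checkpoint_paths('./output/NAS-CNN/demo', 42))
```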

scripts-cnn/train-cifar.sh

@@ -1,15 +1,16 @@
 #!/bin/bash
 #
 echo "CHECK-DATA-DIR START"
-#sh /home/HGCP_Program/software-install/afs_mount/bin/afs_mount.sh \
-# COMM_KM_Data COMM_km_2018 \
-# `pwd`/hadoop-data \
-# afs://xingtian.afs.baidu.com:9902/user/COMM_KM_Data/dongxuanyi/datasets
+sh /home/HGCP_Program/software-install/afs_mount/bin/afs_mount.sh \
+  COMM_KM_Data COMM_km_2018 \
+  `pwd`/hadoop-data \
+  afs://xingtian.afs.baidu.com:9902/user/COMM_KM_Data/dongxuanyi/datasets
 export TORCH_HOME="./data/data/"
-wget -q http://10.127.2.44:8000/cifar.python.tar --directory-prefix=${TORCH_HOME}
-tar xvf ${TORCH_HOME}/cifar.python.tar -C ${TORCH_HOME}
-rm ${TORCH_HOME}/cifar.python.tar
+#wget -q http://10.127.2.44:8000/cifar.python.tar --directory-prefix=${TORCH_HOME}
+#tar -xvf ${TORCH_HOME}/cifar.python.tar -C ${TORCH_HOME}
+tar -xf ./hadoop-data/cifar.python.tar -C ${TORCH_HOME}
+#rm ${TORCH_HOME}/cifar.python.tar
 #tar xvf ./hadoop-data/ILSVRC2012.tar -C ${TORCH_HOME}
 cifar_dir="${TORCH_HOME}/cifar.python"
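
The CIFAR script now mounts the AFS share and unpacks `cifar.python.tar` from `./hadoop-data` rather than downloading it from an internal HTTP mirror. A rough Python equivalent of the staging step, assuming the mount has already succeeded:

```
import os
import subprocess

def stage_cifar(torch_home='./data/data/'):
    # Unpack the CIFAR archive from the mounted share if not staged yet.
    archive = './hadoop-data/cifar.python.tar'
    cifar_dir = os.path.join(torch_home, 'cifar.python')
    if not os.path.isdir(cifar_dir):
        subprocess.run(['tar', '-xf', archive, '-C', torch_home], check=True)
    return cifar_dir
```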

scripts-cnn/train-imagenet.sh

@@ -1,7 +1,7 @@
 #!/usr/bin/env sh
-if [ "$#" -ne 3 ] ;then
+if [ "$#" -ne 5 ] ;then
   echo "Input illegal number of parameters " $#
-  echo "Need 3 parameters for the architecture, and the channel and the layers"
+  echo "Need 5 parameters for the architecture, and the channel, and the layers, and the batch-size, and the seed"
   exit 1
 fi
 if [ "$TORCH_HOME" = "" ]; then
@@ -15,7 +15,9 @@ arch=$1
 dataset=imagenet
 channels=$2
 layers=$3
-SAVED=./output/NAS-CNN/${arch}-${dataset}-C${channels}-L${layers}-E250
+BATCH=$4
+seed=$5
+SAVED=./output/NAS-CNN/${arch}-${dataset}-C${channels}-L${layers}-${BATCH}-E250
 PY_C="./env/bin/python"
 #PY_C="$CONDA_PYTHON_EXE"
@@ -27,8 +29,8 @@ else
   echo "Cluster Run with Python: "${PY_C}
   echo "Unzip ILSVRC2012"
   tar --version
-  #tar xf ./hadoop-data/ILSVRC2012.tar -C ${TORCH_HOME}
-  commands="./data/data/get_imagenet.sh"
+  tar -xf ./hadoop-data/ILSVRC2012.tar -C ${TORCH_HOME}
+  #commands="./data/data/get_imagenet.sh"
   #${PY_C} ./data/decompress.py ./hadoop-data/ILSVRC2012-TAR ./data/data/ILSVRC2012 tar > ${commands}
   #${PY_C} ./data/decompress.py ./hadoop-data/ILSVRC2012-ZIP ./data/data/ILSVRC2012 zip > ./data/data/get_imagenet.sh
   #bash ./data/data/get_imagenet.sh
@@ -42,16 +44,16 @@ else
   # free -g
   #done < "${commands}"
   #wget http://10.127.2.44:8000/ILSVRC2012.tar --directory-prefix=${TORCH_HOME}
-  ${PY_C} ./data/decompress.py ./data/classes.txt ${TORCH_HOME}/ILSVRC2012 wget > ${commands}
-  count=0
-  while read -r line; do
-    temp_file="./data/data/TEMP-${count}.sh"
-    echo "${line}" > ${temp_file}
-    bash ${temp_file}
-    count=$((count+1))
+  #${PY_C} ./data/decompress.py ./data/classes.txt ${TORCH_HOME}/ILSVRC2012 wget > ${commands}
+  #count=0
+  #while read -r line; do
+  #  temp_file="./data/data/TEMP-${count}.sh"
+  #  echo "${line}" > ${temp_file}
+  #  bash ${temp_file}
+  #  count=$((count+1))
   #${PY_C} ./data/ps_mem.py -p $$
   # free -g
-  done < "${commands}"
+  #done < "${commands}"
 #echo "Copy ILSVRC2012 done"
 #tar -xvf ${TORCH_HOME}/ILSVRC2012.tar -C ${TORCH_HOME}
 #rm ${TORCH_HOME}/ILSVRC2012.tar
@@ -66,5 +68,6 @@ ${PY_C} ./exps-cnn/train_base.py \
   --save_path ${SAVED} \
   --grad_clip 5 \
   --init_channels ${channels} --layers ${layers} \
-  --model_config ./configs/nas-imagenet.config \
+  --model_config ./configs/nas-imagenet-${BATCH}.config \
+  --manualSeed ${seed} \
   --print_freq 200 --workers 20
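
With the new arguments wired through, the script's final call picks the config by batch tag and forwards the seed to `train_base.py`. A sketch of the assembled command line for the README's `GDAS_F1 52 14 B128 -1` example; only flags visible in this diff are shown, so the real script may pass more:

```
# Values substituted for: bash ./scripts-cnn/train-imagenet.sh GDAS_F1 52 14 B128 -1
cmd = [
    './env/bin/python', './exps-cnn/train_base.py',
    '--save_path', './output/NAS-CNN/GDAS_F1-imagenet-C52-L14-B128-E250',
    '--grad_clip', '5',
    '--init_channels', '52', '--layers', '14',
    '--model_config', './configs/nas-imagenet-B128.config',
    '--manualSeed', '-1',
    '--print_freq', '200', '--workers', '20',
]
print(' '.join(cmd))
```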