update ImageNet training

Xuanyi Dong 2019-04-04 20:29:41 +08:00
parent 666c105f51
commit 4121d1719f
8 changed files with 64 additions and 30 deletions

View File

@@ -24,8 +24,8 @@ CUDA_VISIBLE_DEVICES=0 bash ./scripts-cnn/train-cifar.sh GDAS_V1 cifar100 cut
 Train the searched CNN on ImageNet
 ```
-CUDA_VISIBLE_DEVICES=0 bash ./scripts-cnn/train-imagenet.sh GDAS_F1 52 14
-CUDA_VISIBLE_DEVICES=0 bash ./scripts-cnn/train-imagenet.sh GDAS_V1 50 14
+CUDA_VISIBLE_DEVICES=0 bash ./scripts-cnn/train-imagenet.sh GDAS_F1 52 14 B128 -1
+CUDA_VISIBLE_DEVICES=0 bash ./scripts-cnn/train-imagenet.sh GDAS_V1 50 14 B128 -1
 ```
 Evaluate a trained CNN model
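
Note: the two new trailing arguments are, per the script changes later in this commit, a batch-size tag and a manual seed. A tiny sketch of how they appear to be interpreted (inferred from scripts-cnn/train-imagenet.sh and exps-cnn/train_base.py below; the helper name is illustrative):

```python
def resolve_args(batch_tag: str, seed: int):
  # $4, e.g. "B128", selects the matching config file
  config = './configs/nas-imagenet-{:}.config'.format(batch_tag)
  # $5, e.g. -1; any negative value asks train_base.py to draw a random seed
  randomize = seed < 0
  return config, randomize
```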

View File

@@ -0,0 +1,15 @@
+{
+  "type"      : ["str", "steplr"],
+  "batch_size": ["int", 128],
+  "epochs"    : ["int", 250],
+  "decay_period": ["int", 1],
+  "gamma"     : ["float", 0.97],
+  "momentum"  : ["float", 0.9],
+  "decay"     : ["float", 0.00003],
+  "LR"        : ["float", 0.1],
+  "label_smooth": ["float", 0.1],
+  "auxiliary" : ["bool", 1],
+  "auxiliary_weight" : ["float", 0.4],
+  "grad_clip" : ["float", 5],
+  "drop_path_prob" : ["float", 0]
+}
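
Every field in these new config files is stored as a ["type", value] pair rather than a bare value. A minimal sketch of a loader for this format, using only the standard library (the `load_config` name, the namedtuple return, and the inferred `nas-imagenet-B128.config` filename are illustrative, not the repo's actual API):

```python
import json
from collections import namedtuple

def load_config(path):
  # Each field is declared as ["type", value]; cast the value to its declared type.
  casts = {'str': str, 'int': int, 'float': float, 'bool': lambda v: bool(int(v))}
  with open(path) as f:
    raw = json.load(f)
  data = {key: casts[t](v) for key, (t, v) in raw.items()}
  return namedtuple('Config', data.keys())(**data)

# e.g. the 128-batch file added above (filename inferred from the script's ${BATCH} suffix):
config = load_config('./configs/nas-imagenet-B128.config')
assert config.batch_size == 128 and config.LR == 0.1
```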

View File

@@ -0,0 +1,15 @@
+{
+  "type"      : ["str", "steplr"],
+  "batch_size": ["int", 256],
+  "epochs"    : ["int", 250],
+  "decay_period": ["int", 1],
+  "gamma"     : ["float", 0.97],
+  "momentum"  : ["float", 0.9],
+  "decay"     : ["float", 0.00003],
+  "LR"        : ["float", 0.1],
+  "label_smooth": ["float", 0.1],
+  "auxiliary" : ["bool", 1],
+  "auxiliary_weight" : ["float", 0.4],
+  "grad_clip" : ["float", 5],
+  "drop_path_prob" : ["float", 0]
+}

View File

@@ -42,7 +42,7 @@ else : print('Find CUDA_VISIBLE_DEVICES={:
 assert torch.cuda.is_available(), 'torch.cuda is not available'
-if args.manualSeed is None:
+if args.manualSeed is None or args.manualSeed < 0:
   args.manualSeed = random.randint(1, 10000)
 random.seed(args.manualSeed)
 cudnn.benchmark = True
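
Relaxing the guard to also match negative values lets the shell script pass -1 as an explicit "randomize" sentinel. A sketch of the resulting seeding pattern, assuming the standard PyTorch calls that the surrounding lines and the next hunk's `torch.cuda.manual_seed_all` show:

```python
import random
import torch
import torch.backends.cudnn as cudnn

def setup_seed(manual_seed):
  # None or any negative value (e.g. the script's "-1") means "pick one at random".
  if manual_seed is None or manual_seed < 0:
    manual_seed = random.randint(1, 10000)
  random.seed(manual_seed)
  torch.manual_seed(manual_seed)
  torch.cuda.manual_seed_all(manual_seed)
  cudnn.benchmark = True  # non-deterministic autotuning, as in the original script
  return manual_seed
```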
@@ -54,10 +54,10 @@ torch.cuda.manual_seed_all(args.manualSeed)
 def main():
   # Init logger
-  args.save_path = os.path.join(args.save_path, 'seed-{:}'.format(args.manualSeed))
+  #args.save_path = os.path.join(args.save_path, 'seed-{:}'.format(args.manualSeed))
   if not os.path.isdir(args.save_path):
     os.makedirs(args.save_path)
-  log = open(os.path.join(args.save_path, 'log-seed-{:}.txt'.format(args.manualSeed)), 'w')
+  log = open(os.path.join(args.save_path, 'seed-{:}-log.txt'.format(args.manualSeed)), 'w')
   print_log('Save Path : {:}'.format(args.save_path), log)
   state = {k: v for k, v in args._get_kwargs()}
   print_log(state, log)

View File

@@ -59,8 +59,8 @@ def main_procedure(config, dataset, data_path, args, genotype, init_channels, la
     raise ValueError('Can not find the schedular type : {:}'.format(config.type))
-  checkpoint_path = os.path.join(args.save_path, 'checkpoint-{:}-model.pth'.format(dataset))
-  checkpoint_best = os.path.join(args.save_path, 'checkpoint-{:}-best.pth'.format(dataset))
+  checkpoint_path = os.path.join(args.save_path, 'seed-{:}-checkpoint-{:}-model.pth'.format(args.manualSeed, dataset))
+  checkpoint_best = os.path.join(args.save_path, 'seed-{:}-checkpoint-{:}-best.pth'.format(args.manualSeed, dataset))
   if pure_evaluate:
     print_log('-'*20 + 'Pure Evaluation' + '-'*20, log)
     basemodel.load_state_dict( pure_evaluate )

View File

@@ -81,8 +81,8 @@ def main_procedure_imagenet(config, data_path, args, genotype, init_channels, la
     raise ValueError('Can not find the schedular type : {:}'.format(config.type))
-  checkpoint_path = os.path.join(args.save_path, 'checkpoint-imagenet-model.pth')
-  checkpoint_best = os.path.join(args.save_path, 'checkpoint-imagenet-best.pth')
+  checkpoint_path = os.path.join(args.save_path, 'seed-{:}-checkpoint-imagenet-model.pth'.format(args.manualSeed))
+  checkpoint_best = os.path.join(args.save_path, 'seed-{:}-checkpoint-imagenet-best.pth'.format(args.manualSeed))
   if pure_evaluate:
     print_log('-'*20 + 'Pure Evaluation' + '-'*20, log)
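
Together with the log-file rename in train_base.py, these hunks move the seed out of the directory name and into every artifact's filename, so runs with different seeds can share one save_path. A sketch of the naming scheme the hunks converge on (the helper name is illustrative):

```python
import os

def artifact_paths(save_path, manual_seed, dataset='imagenet'):
  # Every run output carries a 'seed-<N>-' prefix, so runs with different
  # seeds coexist in the same save_path without clobbering each other.
  prefix = 'seed-{:}'.format(manual_seed)
  return {
    'log'  : os.path.join(save_path, '{:}-log.txt'.format(prefix)),
    'model': os.path.join(save_path, '{:}-checkpoint-{:}-model.pth'.format(prefix, dataset)),
    'best' : os.path.join(save_path, '{:}-checkpoint-{:}-best.pth'.format(prefix, dataset)),
  }
```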

View File

@@ -1,15 +1,16 @@
 #!/bin/bash
 #
 echo "CHECK-DATA-DIR START"
-#sh /home/HGCP_Program/software-install/afs_mount/bin/afs_mount.sh \
-#   COMM_KM_Data COMM_km_2018 \
-#   `pwd`/hadoop-data \
-#   afs://xingtian.afs.baidu.com:9902/user/COMM_KM_Data/dongxuanyi/datasets
+sh /home/HGCP_Program/software-install/afs_mount/bin/afs_mount.sh \
+   COMM_KM_Data COMM_km_2018 \
+   `pwd`/hadoop-data \
+   afs://xingtian.afs.baidu.com:9902/user/COMM_KM_Data/dongxuanyi/datasets
 export TORCH_HOME="./data/data/"
-wget -q http://10.127.2.44:8000/cifar.python.tar --directory-prefix=${TORCH_HOME}
-tar xvf ${TORCH_HOME}/cifar.python.tar -C ${TORCH_HOME}
-rm ${TORCH_HOME}/cifar.python.tar
+#wget -q http://10.127.2.44:8000/cifar.python.tar --directory-prefix=${TORCH_HOME}
+#tar -xvf ${TORCH_HOME}/cifar.python.tar -C ${TORCH_HOME}
+tar -xf ./hadoop-data/cifar.python.tar -C ${TORCH_HOME}
+#rm ${TORCH_HOME}/cifar.python.tar
+#tar xvf ./hadoop-data/ILSVRC2012.tar -C ${TORCH_HOME}
 cifar_dir="${TORCH_HOME}/cifar.python"

View File

@@ -1,7 +1,7 @@
 #!/usr/bin/env sh
-if [ "$#" -ne 3 ] ;then
+if [ "$#" -ne 5 ] ;then
   echo "Input illegal number of parameters " $#
-  echo "Need 3 parameters for the architecture, and the channel and the layers"
+  echo "Need 5 parameters for the architecture, and the channel, and the layers, and the batch-size, and the seed"
   exit 1
 fi
 if [ "$TORCH_HOME" = "" ]; then
@@ -15,7 +15,9 @@ arch=$1
 dataset=imagenet
 channels=$2
 layers=$3
-SAVED=./output/NAS-CNN/${arch}-${dataset}-C${channels}-L${layers}-E250
+BATCH=$4
+seed=$5
+SAVED=./output/NAS-CNN/${arch}-${dataset}-C${channels}-L${layers}-${BATCH}-E250
 PY_C="./env/bin/python"
 #PY_C="$CONDA_PYTHON_EXE"
@@ -27,8 +29,8 @@ else
   echo "Cluster Run with Python: "${PY_C}
   echo "Unzip ILSVRC2012"
   tar --version
-  #tar xf ./hadoop-data/ILSVRC2012.tar -C ${TORCH_HOME}
-  commands="./data/data/get_imagenet.sh"
+  tar -xf ./hadoop-data/ILSVRC2012.tar -C ${TORCH_HOME}
+  #commands="./data/data/get_imagenet.sh"
   #${PY_C} ./data/decompress.py ./hadoop-data/ILSVRC2012-TAR ./data/data/ILSVRC2012 tar > ${commands}
   #${PY_C} ./data/decompress.py ./hadoop-data/ILSVRC2012-ZIP ./data/data/ILSVRC2012 zip > ./data/data/get_imagenet.sh
   #bash ./data/data/get_imagenet.sh
@@ -42,16 +44,16 @@ else
   # free -g
   #done < "${commands}"
   #wget http://10.127.2.44:8000/ILSVRC2012.tar --directory-prefix=${TORCH_HOME}
-  ${PY_C} ./data/decompress.py ./data/classes.txt ${TORCH_HOME}/ILSVRC2012 wget > ${commands}
-  count=0
-  while read -r line; do
-    temp_file="./data/data/TEMP-${count}.sh"
-    echo "${line}" > ${temp_file}
-    bash ${temp_file}
-    count=$((count+1))
+  #${PY_C} ./data/decompress.py ./data/classes.txt ${TORCH_HOME}/ILSVRC2012 wget > ${commands}
+  #count=0
+  #while read -r line; do
+  #  temp_file="./data/data/TEMP-${count}.sh"
+  #  echo "${line}" > ${temp_file}
+  #  bash ${temp_file}
+  #  count=$((count+1))
   #${PY_C} ./data/ps_mem.py -p $$
   # free -g
-  done < "${commands}"
+  #done < "${commands}"
   #echo "Copy ILSVRC2012 done"
   #tar -xvf ${TORCH_HOME}/ILSVRC2012.tar -C ${TORCH_HOME}
   #rm ${TORCH_HOME}/ILSVRC2012.tar
@@ -66,5 +68,6 @@ ${PY_C} ./exps-cnn/train_base.py \
   --save_path ${SAVED} \
   --grad_clip 5 \
   --init_channels ${channels} --layers ${layers} \
-  --model_config ./configs/nas-imagenet.config \
+  --model_config ./configs/nas-imagenet-${BATCH}.config \
+  --manualSeed ${seed} \
   --print_freq 200 --workers 20
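
On the Python side, the new `--manualSeed ${seed}` flag pairs with the relaxed seed check in train_base.py, and `--model_config` now resolves per batch size. A hedged argparse sketch of the options visible in this invocation (only the flag names come from the diff; types and defaults are illustrative):

```python
import argparse

parser = argparse.ArgumentParser('train_base')
parser.add_argument('--save_path',     type=str)
parser.add_argument('--grad_clip',     type=float, default=5)
parser.add_argument('--init_channels', type=int)
parser.add_argument('--layers',        type=int)
parser.add_argument('--model_config',  type=str)
parser.add_argument('--manualSeed',    type=int, default=-1)  # negative => random seed
parser.add_argument('--print_freq',    type=int, default=200)
parser.add_argument('--workers',       type=int, default=20)
args = parser.parse_args()
```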