update ImageNet training
@@ -24,8 +24,8 @@ CUDA_VISIBLE_DEVICES=0 bash ./scripts-cnn/train-cifar.sh GDAS_V1 cifar100 cut
 
 Train the searched CNN on ImageNet
 ```
-CUDA_VISIBLE_DEVICES=0 bash ./scripts-cnn/train-imagenet.sh GDAS_F1 52 14
-CUDA_VISIBLE_DEVICES=0 bash ./scripts-cnn/train-imagenet.sh GDAS_V1 50 14
+CUDA_VISIBLE_DEVICES=0 bash ./scripts-cnn/train-imagenet.sh GDAS_F1 52 14 B128 -1
+CUDA_VISIBLE_DEVICES=0 bash ./scripts-cnn/train-imagenet.sh GDAS_V1 50 14 B128 -1
 ```
 
 Evaluate a trained CNN model
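The two new positional arguments select the batch-size preset and the random seed: `B128` expands to `./configs/nas-imagenet-B128.config` inside the script, and a negative seed such as `-1` asks the trainer to draw one at random. A minimal Python sketch of that convention (`resolve_run_args` is a hypothetical helper; the real logic is split between the shell script and `train_base.py`):

```python
import random

def resolve_run_args(batch_tag, seed):
  # 'B128' or 'B256' selects the matching config file added in this commit;
  # a negative seed means "pick one at random", as train_base.py now does.
  config = './configs/nas-imagenet-{:}.config'.format(batch_tag)
  if seed is None or seed < 0:
    seed = random.randint(1, 10000)
  return config, seed
```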
configs/nas-imagenet-B128.config (new file, 15 lines)
@@ -0,0 +1,15 @@
+{
+  "type"      : ["str",   "steplr"],
+  "batch_size": ["int",   128],
+  "epochs"    : ["int",   250],
+  "decay_period": ["int",   1],
+  "gamma"     : ["float", 0.97],
+  "momentum"  : ["float", 0.9],
+  "decay"     : ["float", 0.00003],
+  "LR"        : ["float", 0.1],
+  "label_smooth": ["float", 0.1],
+  "auxiliary" : ["bool", 1],
+  "auxiliary_weight" : ["float", 0.4],
+  "grad_clip" : ["float", 5],
+  "drop_path_prob" : ["float", 0]
+}
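Every field is stored as a `["type", value]` pair, so a reader can cast all values uniformly before handing them to the trainer. A minimal sketch of such a loader, assuming only this convention (the repository's actual config reader may differ):

```python
import json
from collections import namedtuple

def load_config(path):
  # Each value is stored as ["type-tag", raw-value]; cast it accordingly.
  casts = {'str': str, 'int': int, 'float': float, 'bool': bool}
  with open(path) as f:
    raw = json.load(f)
  fields = {k: casts[t](v) for k, (t, v) in raw.items()}
  return namedtuple('Config', fields.keys())(**fields)

config = load_config('./configs/nas-imagenet-B128.config')
print(config.batch_size, config.LR, config.type)  # 128 0.1 steplr
```

The `type` field ("steplr" here) is the scheduler name that the training code checks; an unknown value raises the "Can not find the schedular type" error visible in the hunks below.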
configs/nas-imagenet-B256.config (new file, 15 lines)
@@ -0,0 +1,15 @@
+{
+  "type"      : ["str",   "steplr"],
+  "batch_size": ["int",   256],
+  "epochs"    : ["int",   250],
+  "decay_period": ["int",   1],
+  "gamma"     : ["float", 0.97],
+  "momentum"  : ["float", 0.9],
+  "decay"     : ["float", 0.00003],
+  "LR"        : ["float", 0.1],
+  "label_smooth": ["float", 0.1],
+  "auxiliary" : ["bool", 1],
+  "auxiliary_weight" : ["float", 0.4],
+  "grad_clip" : ["float", 5],
+  "drop_path_prob" : ["float", 0]
+}
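The B256 preset is identical to B128 except for `batch_size`, which is easy to verify mechanically. Reusing the hypothetical `load_config` sketched above:

```python
c128 = load_config('./configs/nas-imagenet-B128.config')
c256 = load_config('./configs/nas-imagenet-B256.config')
# Only the batch size differs between the two presets.
assert c128._replace(batch_size=256) == c256
```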
@@ -42,7 +42,7 @@ else                                       : print('Find CUDA_VISIBLE_DEVICES={:
 assert torch.cuda.is_available(), 'torch.cuda is not available'
 
 
-if args.manualSeed is None:
+if args.manualSeed is None or args.manualSeed < 0:
   args.manualSeed = random.randint(1, 10000)
 random.seed(args.manualSeed)
 cudnn.benchmark = True
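The resolved seed then drives every RNG before the model is built. A hedged sketch of the whole seeding block; the `torch.manual_seed` call is an assumption, since the hunks only show the `random` and CUDA calls:

```python
import random
import torch
import torch.backends.cudnn as cudnn

def seed_everything(manual_seed):
  # Hypothetical helper collecting the script's seeding steps in one place.
  if manual_seed is None or manual_seed < 0:  # a CLI seed of -1 lands here
    manual_seed = random.randint(1, 10000)
  random.seed(manual_seed)
  torch.manual_seed(manual_seed)              # assumed; not shown in the diff
  torch.cuda.manual_seed_all(manual_seed)     # shown in the next hunk header
  cudnn.benchmark = True                      # favor speed over determinism
  return manual_seed
```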
@@ -54,10 +54,10 @@ torch.cuda.manual_seed_all(args.manualSeed)
 def main():
 
   # Init logger
-  args.save_path = os.path.join(args.save_path, 'seed-{:}'.format(args.manualSeed))
+  #args.save_path = os.path.join(args.save_path, 'seed-{:}'.format(args.manualSeed))
   if not os.path.isdir(args.save_path):
     os.makedirs(args.save_path)
-  log = open(os.path.join(args.save_path, 'log-seed-{:}.txt'.format(args.manualSeed)), 'w')
+  log = open(os.path.join(args.save_path, 'seed-{:}-log.txt'.format(args.manualSeed)), 'w')
   print_log('Save Path      : {:}'.format(args.save_path), log)
   state = {k: v for k, v in args._get_kwargs()}
   print_log(state, log)
@@ -59,8 +59,8 @@ def main_procedure(config, dataset, data_path, args, genotype, init_channels, la
     raise ValueError('Can not find the schedular type : {:}'.format(config.type))
 
 
-  checkpoint_path = os.path.join(args.save_path, 'checkpoint-{:}-model.pth'.format(dataset))
-  checkpoint_best = os.path.join(args.save_path, 'checkpoint-{:}-best.pth'.format(dataset))
+  checkpoint_path = os.path.join(args.save_path, 'seed-{:}-checkpoint-{:}-model.pth'.format(args.manualSeed, dataset))
+  checkpoint_best = os.path.join(args.save_path, 'seed-{:}-checkpoint-{:}-best.pth'.format(args.manualSeed, dataset))
   if pure_evaluate:
     print_log('-'*20 + 'Pure Evaluation' + '-'*20, log)
     basemodel.load_state_dict( pure_evaluate )
@@ -81,8 +81,8 @@ def main_procedure_imagenet(config, data_path, args, genotype, init_channels, la
     raise ValueError('Can not find the schedular type : {:}'.format(config.type))
 
 
-  checkpoint_path = os.path.join(args.save_path, 'checkpoint-imagenet-model.pth')
-  checkpoint_best = os.path.join(args.save_path, 'checkpoint-imagenet-best.pth')
+  checkpoint_path = os.path.join(args.save_path, 'seed-{:}-checkpoint-imagenet-model.pth'.format(args.manualSeed))
+  checkpoint_best = os.path.join(args.save_path, 'seed-{:}-checkpoint-imagenet-best.pth'.format(args.manualSeed))
 
   if pure_evaluate:
     print_log('-'*20 + 'Pure Evaluation' + '-'*20, log)
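Folding the seed into each filename, instead of nesting a per-seed subdirectory, lets runs with different seeds share one save directory without overwriting each other's logs or checkpoints. A sketch of the resulting layout, with a hypothetical helper and a made-up seed:

```python
import os

def seed_file(save_path, seed, name):
  # Hypothetical helper: every artifact carries its seed in the filename.
  return os.path.join(save_path, 'seed-{:}-{:}'.format(seed, name))

save_path = './output/NAS-CNN/GDAS_F1-imagenet-C52-L14-B128-E250'
print(seed_file(save_path, 6239, 'log.txt'))                       # seed-6239-log.txt
print(seed_file(save_path, 6239, 'checkpoint-imagenet-best.pth'))
```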
@@ -1,15 +1,16 @@
 #!/bin/bash
 #
 echo "CHECK-DATA-DIR START"
-#sh /home/HGCP_Program/software-install/afs_mount/bin/afs_mount.sh \
-#    COMM_KM_Data COMM_km_2018 \
-#    `pwd`/hadoop-data \
-#    afs://xingtian.afs.baidu.com:9902/user/COMM_KM_Data/dongxuanyi/datasets
+sh /home/HGCP_Program/software-install/afs_mount/bin/afs_mount.sh \
+    COMM_KM_Data COMM_km_2018 \
+    `pwd`/hadoop-data \
+    afs://xingtian.afs.baidu.com:9902/user/COMM_KM_Data/dongxuanyi/datasets
 
 export TORCH_HOME="./data/data/"
-wget -q http://10.127.2.44:8000/cifar.python.tar --directory-prefix=${TORCH_HOME}
-tar xvf ${TORCH_HOME}/cifar.python.tar -C ${TORCH_HOME}
-rm ${TORCH_HOME}/cifar.python.tar
+#wget -q http://10.127.2.44:8000/cifar.python.tar --directory-prefix=${TORCH_HOME}
+#tar -xvf ${TORCH_HOME}/cifar.python.tar -C ${TORCH_HOME}
+tar -xf ./hadoop-data/cifar.python.tar -C ${TORCH_HOME}
+#rm ${TORCH_HOME}/cifar.python.tar
+#tar xvf ./hadoop-data/ILSVRC2012.tar   -C ${TORCH_HOME}
 
 cifar_dir="${TORCH_HOME}/cifar.python"
@@ -1,7 +1,7 @@
 #!/usr/bin/env sh
-if [ "$#" -ne 3 ] ;then
+if [ "$#" -ne 5 ] ;then
   echo "Input illegal number of parameters " $#
-  echo "Need 3 parameters for the architecture, and the channel and the layers"
+  echo "Need 5 parameters for the architecture, and the channel, and the layers, and the batch-size, and the seed"
   exit 1
 fi
 if [ "$TORCH_HOME" = "" ]; then
@@ -15,7 +15,9 @@ arch=$1
 dataset=imagenet
 channels=$2
 layers=$3
-SAVED=./output/NAS-CNN/${arch}-${dataset}-C${channels}-L${layers}-E250
+BATCH=$4
+seed=$5
+SAVED=./output/NAS-CNN/${arch}-${dataset}-C${channels}-L${layers}-${BATCH}-E250
 
 PY_C="./env/bin/python"
 #PY_C="$CONDA_PYTHON_EXE"
@@ -27,8 +29,8 @@ else
   echo "Cluster Run with Python: "${PY_C}
   echo "Unzip ILSVRC2012"
   tar --version
-  #tar xf ./hadoop-data/ILSVRC2012.tar   -C ${TORCH_HOME}
-  commands="./data/data/get_imagenet.sh"
+  tar -xf ./hadoop-data/ILSVRC2012.tar   -C ${TORCH_HOME}
+  #commands="./data/data/get_imagenet.sh"
   #${PY_C} ./data/decompress.py ./hadoop-data/ILSVRC2012-TAR ./data/data/ILSVRC2012 tar > ${commands}
   #${PY_C} ./data/decompress.py ./hadoop-data/ILSVRC2012-ZIP ./data/data/ILSVRC2012 zip > ./data/data/get_imagenet.sh
   #bash ./data/data/get_imagenet.sh
@@ -42,16 +44,16 @@ else
   #  free -g
   #done < "${commands}"
   #wget http://10.127.2.44:8000/ILSVRC2012.tar --directory-prefix=${TORCH_HOME}
-  ${PY_C} ./data/decompress.py ./data/classes.txt ${TORCH_HOME}/ILSVRC2012 wget > ${commands}
-  count=0
-  while read -r line; do
-    temp_file="./data/data/TEMP-${count}.sh"
-    echo "${line}" > ${temp_file}
-    bash ${temp_file}
-    count=$((count+1))
+  #${PY_C} ./data/decompress.py ./data/classes.txt ${TORCH_HOME}/ILSVRC2012 wget > ${commands}
+  #count=0
+  #while read -r line; do
+  #  temp_file="./data/data/TEMP-${count}.sh"
+  #  echo "${line}" > ${temp_file}
+  #  bash ${temp_file}
+  #  count=$((count+1))
    #${PY_C} ./data/ps_mem.py -p $$
   #  free -g
-  done < "${commands}"
+  #done < "${commands}"
   #echo "Copy ILSVRC2012 done"
   #tar -xvf ${TORCH_HOME}/ILSVRC2012.tar -C ${TORCH_HOME}
   #rm ${TORCH_HOME}/ILSVRC2012.tar
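The retired block generated one `wget` command per ImageNet class and ran them one at a time; the commit replaces all of that with a single extraction from the mounted `hadoop-data` share. A Python sketch of the enabled `tar -xf` step, using the same paths as the script:

```python
import tarfile

# Unpack the ImageNet archive from the mounted share into TORCH_HOME,
# replacing the old generate-and-run wget loop.
torch_home = './data/data/'
with tarfile.open('./hadoop-data/ILSVRC2012.tar') as tar:
  tar.extractall(path=torch_home)
```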
@@ -66,5 +68,6 @@ ${PY_C} ./exps-cnn/train_base.py \
 	--save_path ${SAVED} \
 	--grad_clip 5 \
 	--init_channels ${channels} --layers ${layers} \
-	--model_config ./configs/nas-imagenet.config \
+	--model_config ./configs/nas-imagenet-${BATCH}.config \
+	--manualSeed ${seed} \
 	--print_freq 200 --workers 20
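`train_base.py` must accept the two values the script now forwards. A hedged argparse sketch covering just those flags (names match the diff; all other arguments are omitted):

```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--model_config', type=str,
                    help='path to a nas-imagenet-B*.config preset')
parser.add_argument('--manualSeed', type=int, default=-1,
                    help='random seed; negative values pick one at random')
args, _ = parser.parse_known_args()
```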