update ImageNet training
@@ -24,8 +24,8 @@ CUDA_VISIBLE_DEVICES=0 bash ./scripts-cnn/train-cifar.sh GDAS_V1 cifar100 cut
 
 Train the searched CNN on ImageNet
 ```
-CUDA_VISIBLE_DEVICES=0 bash ./scripts-cnn/train-imagenet.sh GDAS_F1 52 14
-CUDA_VISIBLE_DEVICES=0 bash ./scripts-cnn/train-imagenet.sh GDAS_V1 50 14
+CUDA_VISIBLE_DEVICES=0 bash ./scripts-cnn/train-imagenet.sh GDAS_F1 52 14 B128 -1
+CUDA_VISIBLE_DEVICES=0 bash ./scripts-cnn/train-imagenet.sh GDAS_V1 50 14 B128 -1
 ```
 
 Evaluate a trained CNN model
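The two new positional arguments select the batch-size preset and the random seed: `B128` expands to `./configs/nas-imagenet-B128.config` inside the script, and a negative seed such as `-1` asks the trainer to draw one at random. A minimal Python sketch of that convention (`resolve_run_args` is a hypothetical helper; the real logic is split between the shell script and `train_base.py`):

```python
import random

def resolve_run_args(batch_tag, seed):
  # 'B128' or 'B256' selects the matching config file added in this commit;
  # a negative seed means "pick one at random", as train_base.py now does.
  config = './configs/nas-imagenet-{:}.config'.format(batch_tag)
  if seed is None or seed < 0:
    seed = random.randint(1, 10000)
  return config, seed
```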
configs/nas-imagenet-B128.config (new file, 15 lines)
@@ -0,0 +1,15 @@
+{
+  "type"      : ["str",   "steplr"],
+  "batch_size": ["int",   128],
+  "epochs"    : ["int",   250],
+  "decay_period": ["int",   1],
+  "gamma"     : ["float", 0.97],
+  "momentum"  : ["float", 0.9],
+  "decay"     : ["float", 0.00003],
+  "LR"        : ["float", 0.1],
+  "label_smooth": ["float", 0.1],
+  "auxiliary" : ["bool", 1],
+  "auxiliary_weight" : ["float", 0.4],
+  "grad_clip" : ["float", 5],
+  "drop_path_prob" : ["float", 0]
+}
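Every field is stored as a `["type", value]` pair, so a reader can cast all values uniformly before handing them to the trainer. A minimal sketch of such a loader, assuming only this convention (the repository's actual config reader may differ):

```python
import json
from collections import namedtuple

def load_config(path):
  # Each value is stored as ["type-tag", raw-value]; cast it accordingly.
  casts = {'str': str, 'int': int, 'float': float, 'bool': bool}
  with open(path) as f:
    raw = json.load(f)
  fields = {k: casts[t](v) for k, (t, v) in raw.items()}
  return namedtuple('Config', fields.keys())(**fields)

config = load_config('./configs/nas-imagenet-B128.config')
print(config.batch_size, config.LR, config.type)  # 128 0.1 steplr
```

The `type` field ("steplr" here) is the scheduler name that the training code checks; an unknown value raises the "Can not find the schedular type" error visible in the hunks below.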
configs/nas-imagenet-B256.config (new file, 15 lines)
@@ -0,0 +1,15 @@
+{
+  "type"      : ["str",   "steplr"],
+  "batch_size": ["int",   256],
+  "epochs"    : ["int",   250],
+  "decay_period": ["int",   1],
+  "gamma"     : ["float", 0.97],
+  "momentum"  : ["float", 0.9],
+  "decay"     : ["float", 0.00003],
+  "LR"        : ["float", 0.1],
+  "label_smooth": ["float", 0.1],
+  "auxiliary" : ["bool", 1],
+  "auxiliary_weight" : ["float", 0.4],
+  "grad_clip" : ["float", 5],
+  "drop_path_prob" : ["float", 0]
+}
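The B256 preset is identical to B128 except for `batch_size`, which is easy to verify mechanically. Reusing the hypothetical `load_config` sketched above:

```python
c128 = load_config('./configs/nas-imagenet-B128.config')
c256 = load_config('./configs/nas-imagenet-B256.config')
# Only the batch size differs between the two presets.
assert c128._replace(batch_size=256) == c256
```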
@@ -42,7 +42,7 @@ else                                       : print('Find CUDA_VISIBLE_DEVICES={:
 assert torch.cuda.is_available(), 'torch.cuda is not available'
 
 
-if args.manualSeed is None:
+if args.manualSeed is None or args.manualSeed < 0:
   args.manualSeed = random.randint(1, 10000)
 random.seed(args.manualSeed)
 cudnn.benchmark = True
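The resolved seed then drives every RNG before the model is built. A hedged sketch of the whole seeding block; the `torch.manual_seed` call is an assumption, since the hunks only show the `random` and CUDA calls:

```python
import random
import torch
import torch.backends.cudnn as cudnn

def seed_everything(manual_seed):
  # Hypothetical helper collecting the script's seeding steps in one place.
  if manual_seed is None or manual_seed < 0:  # a CLI seed of -1 lands here
    manual_seed = random.randint(1, 10000)
  random.seed(manual_seed)
  torch.manual_seed(manual_seed)              # assumed; not shown in the diff
  torch.cuda.manual_seed_all(manual_seed)     # shown in the next hunk header
  cudnn.benchmark = True                      # favor speed over determinism
  return manual_seed
```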
@@ -54,10 +54,10 @@ torch.cuda.manual_seed_all(args.manualSeed)
 def main():
 
   # Init logger
-  args.save_path = os.path.join(args.save_path, 'seed-{:}'.format(args.manualSeed))
+  #args.save_path = os.path.join(args.save_path, 'seed-{:}'.format(args.manualSeed))
   if not os.path.isdir(args.save_path):
     os.makedirs(args.save_path)
-  log = open(os.path.join(args.save_path, 'log-seed-{:}.txt'.format(args.manualSeed)), 'w')
+  log = open(os.path.join(args.save_path, 'seed-{:}-log.txt'.format(args.manualSeed)), 'w')
   print_log('Save Path      : {:}'.format(args.save_path), log)
   state = {k: v for k, v in args._get_kwargs()}
   print_log(state, log)
@@ -59,8 +59,8 @@ def main_procedure(config, dataset, data_path, args, genotype, init_channels, la
     raise ValueError('Can not find the schedular type : {:}'.format(config.type))
 
 
-  checkpoint_path = os.path.join(args.save_path, 'checkpoint-{:}-model.pth'.format(dataset))
-  checkpoint_best = os.path.join(args.save_path, 'checkpoint-{:}-best.pth'.format(dataset))
+  checkpoint_path = os.path.join(args.save_path, 'seed-{:}-checkpoint-{:}-model.pth'.format(args.manualSeed, dataset))
+  checkpoint_best = os.path.join(args.save_path, 'seed-{:}-checkpoint-{:}-best.pth'.format(args.manualSeed, dataset))
   if pure_evaluate:
     print_log('-'*20 + 'Pure Evaluation' + '-'*20, log)
     basemodel.load_state_dict( pure_evaluate )
@@ -81,8 +81,8 @@ def main_procedure_imagenet(config, data_path, args, genotype, init_channels, la
     raise ValueError('Can not find the schedular type : {:}'.format(config.type))
 
 
-  checkpoint_path = os.path.join(args.save_path, 'checkpoint-imagenet-model.pth')
-  checkpoint_best = os.path.join(args.save_path, 'checkpoint-imagenet-best.pth')
+  checkpoint_path = os.path.join(args.save_path, 'seed-{:}-checkpoint-imagenet-model.pth'.format(args.manualSeed))
+  checkpoint_best = os.path.join(args.save_path, 'seed-{:}-checkpoint-imagenet-best.pth'.format(args.manualSeed))
 
   if pure_evaluate:
     print_log('-'*20 + 'Pure Evaluation' + '-'*20, log)
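Folding the seed into each filename, instead of nesting a per-seed subdirectory, lets runs with different seeds share one save directory without overwriting each other's logs or checkpoints. A sketch of the resulting layout, with a hypothetical helper and a made-up seed:

```python
import os

def seed_file(save_path, seed, name):
  # Hypothetical helper: every artifact carries its seed in the filename.
  return os.path.join(save_path, 'seed-{:}-{:}'.format(seed, name))

save_path = './output/NAS-CNN/GDAS_F1-imagenet-C52-L14-B128-E250'
print(seed_file(save_path, 6239, 'log.txt'))                       # seed-6239-log.txt
print(seed_file(save_path, 6239, 'checkpoint-imagenet-best.pth'))
```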
@@ -1,15 +1,16 @@
 #!/bin/bash
 #
 echo "CHECK-DATA-DIR START"
-#sh /home/HGCP_Program/software-install/afs_mount/bin/afs_mount.sh \
-#    COMM_KM_Data COMM_km_2018 \
-#    `pwd`/hadoop-data \
-#    afs://xingtian.afs.baidu.com:9902/user/COMM_KM_Data/dongxuanyi/datasets
+sh /home/HGCP_Program/software-install/afs_mount/bin/afs_mount.sh \
+    COMM_KM_Data COMM_km_2018 \
+    `pwd`/hadoop-data \
+    afs://xingtian.afs.baidu.com:9902/user/COMM_KM_Data/dongxuanyi/datasets
 
 export TORCH_HOME="./data/data/"
-wget -q http://10.127.2.44:8000/cifar.python.tar --directory-prefix=${TORCH_HOME}
-tar xvf ${TORCH_HOME}/cifar.python.tar -C ${TORCH_HOME}
-rm ${TORCH_HOME}/cifar.python.tar
+#wget -q http://10.127.2.44:8000/cifar.python.tar --directory-prefix=${TORCH_HOME}
+#tar -xvf ${TORCH_HOME}/cifar.python.tar -C ${TORCH_HOME}
+tar -xf ./hadoop-data/cifar.python.tar -C ${TORCH_HOME}
+#rm ${TORCH_HOME}/cifar.python.tar
+#tar xvf ./hadoop-data/ILSVRC2012.tar   -C ${TORCH_HOME}
 
 cifar_dir="${TORCH_HOME}/cifar.python"
@@ -1,7 +1,7 @@
 #!/usr/bin/env sh
-if [ "$#" -ne 3 ] ;then
+if [ "$#" -ne 5 ] ;then
   echo "Input illegal number of parameters " $#
-  echo "Need 3 parameters for the architecture, and the channel and the layers"
+  echo "Need 5 parameters for the architecture, and the channel, and the layers, and the batch-size, and the seed"
   exit 1
 fi
 if [ "$TORCH_HOME" = "" ]; then
@@ -15,7 +15,9 @@ arch=$1
 dataset=imagenet
 channels=$2
 layers=$3
-SAVED=./output/NAS-CNN/${arch}-${dataset}-C${channels}-L${layers}-E250
+BATCH=$4
+seed=$5
+SAVED=./output/NAS-CNN/${arch}-${dataset}-C${channels}-L${layers}-${BATCH}-E250
 
 PY_C="./env/bin/python"
 #PY_C="$CONDA_PYTHON_EXE"
@@ -27,8 +29,8 @@ else
   echo "Cluster Run with Python: "${PY_C}
   echo "Unzip ILSVRC2012"
   tar --version
-  #tar xf ./hadoop-data/ILSVRC2012.tar   -C ${TORCH_HOME}
-  commands="./data/data/get_imagenet.sh"
+  tar -xf ./hadoop-data/ILSVRC2012.tar   -C ${TORCH_HOME}
+  #commands="./data/data/get_imagenet.sh"
   #${PY_C} ./data/decompress.py ./hadoop-data/ILSVRC2012-TAR ./data/data/ILSVRC2012 tar > ${commands}
   #${PY_C} ./data/decompress.py ./hadoop-data/ILSVRC2012-ZIP ./data/data/ILSVRC2012 zip > ./data/data/get_imagenet.sh
   #bash ./data/data/get_imagenet.sh
@@ -42,16 +44,16 @@ else
   #  free -g
   #done < "${commands}"
   #wget http://10.127.2.44:8000/ILSVRC2012.tar --directory-prefix=${TORCH_HOME}
-  ${PY_C} ./data/decompress.py ./data/classes.txt ${TORCH_HOME}/ILSVRC2012 wget > ${commands}
-  count=0
-  while read -r line; do
-    temp_file="./data/data/TEMP-${count}.sh"
-    echo "${line}" > ${temp_file}
-    bash ${temp_file}
-    count=$((count+1))
+  #${PY_C} ./data/decompress.py ./data/classes.txt ${TORCH_HOME}/ILSVRC2012 wget > ${commands}
+  #count=0
+  #while read -r line; do
+  #  temp_file="./data/data/TEMP-${count}.sh"
+  #  echo "${line}" > ${temp_file}
+  #  bash ${temp_file}
+  #  count=$((count+1))
    #${PY_C} ./data/ps_mem.py -p $$
   #  free -g
-  done < "${commands}"
+  #done < "${commands}"
   #echo "Copy ILSVRC2012 done"
   #tar -xvf ${TORCH_HOME}/ILSVRC2012.tar -C ${TORCH_HOME}
   #rm ${TORCH_HOME}/ILSVRC2012.tar
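The retired block generated one `wget` command per ImageNet class and ran them one at a time; the commit replaces all of that with a single extraction from the mounted `hadoop-data` share. A Python sketch of the enabled `tar -xf` step, using the same paths as the script:

```python
import tarfile

# Unpack the ImageNet archive from the mounted share into TORCH_HOME,
# replacing the old generate-and-run wget loop.
torch_home = './data/data/'
with tarfile.open('./hadoop-data/ILSVRC2012.tar') as tar:
  tar.extractall(path=torch_home)
```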
@@ -66,5 +68,6 @@ ${PY_C} ./exps-cnn/train_base.py \
 	--save_path ${SAVED} \
 	--grad_clip 5 \
 	--init_channels ${channels} --layers ${layers} \
-	--model_config ./configs/nas-imagenet.config \
+	--model_config ./configs/nas-imagenet-${BATCH}.config \
+	--manualSeed ${seed} \
 	--print_freq 200 --workers 20
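`train_base.py` must accept the two values the script now forwards. A hedged argparse sketch covering just those flags (names match the diff; all other arguments are omitted):

```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--model_config', type=str,
                    help='path to a nas-imagenet-B*.config preset')
parser.add_argument('--manualSeed', type=int, default=-1,
                    help='random seed; negative values pick one at random')
args, _ = parser.parse_known_args()
```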