update
Changed files:
  README.md         | 19
  output/.gitignore |  1
README.md
@@ -13,13 +13,6 @@ conda install pytorch torchvision cuda100 -c pytorch
 
 ## Algorithm
 
-Searching CNNs
-```
-bash ./scripts-cnn/search.sh 1 base cifar10
-bash ./scripts-cnn/DMS-V-TrainV3.sh 1
-bash ./scripts-cnn/search-acc-v2.sh 3 acc2
-```
-
 Train the searched CNN on CIFAR
 ```
 bash ./scripts-cnn/train-cifar.sh 0 GDAS_FG cifar10 cut
@@ -36,10 +29,10 @@ bash ./scripts-cnn/train-imagenet.sh 0 GDAS_V1 50 14
 
 Train the searched RNN
 ```
-bash ./scripts-rnn/train-PTB.sh 0 DARTS_V1
-bash ./scripts-rnn/train-PTB.sh 0 DARTS_V2
-bash ./scripts-rnn/train-PTB.sh 0 GDAS
-bash ./scripts-rnn/train-WT2.sh 0 DARTS_V1
-bash ./scripts-rnn/train-WT2.sh 0 DARTS_V2
-bash ./scripts-rnn/train-WT2.sh 0 GDAS
+CUDA_VISIBLE_DEVICES=0 bash ./scripts-rnn/train-PTB.sh DARTS_V1
+CUDA_VISIBLE_DEVICES=0 bash ./scripts-rnn/train-PTB.sh DARTS_V2
+CUDA_VISIBLE_DEVICES=0 bash ./scripts-rnn/train-PTB.sh GDAS
+CUDA_VISIBLE_DEVICES=0 bash ./scripts-rnn/train-WT2.sh DARTS_V1
+CUDA_VISIBLE_DEVICES=0 bash ./scripts-rnn/train-WT2.sh DARTS_V2
+CUDA_VISIBLE_DEVICES=0 bash ./scripts-rnn/train-WT2.sh GDAS
 ```
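Note: with this change the RNN scripts take the GPU through the CUDA_VISIBLE_DEVICES environment variable instead of a positional argument. A minimal sketch of the assumed new interface of train-PTB.sh (the updated script itself is not part of this diff):

```
#!/usr/bin/env sh
# hypothetical sketch of the updated interface; the real train-PTB.sh is not shown in this diff
if [ "$#" -ne 1 ]; then
  echo "Need 1 parameter: the architecture (e.g. DARTS_V1, DARTS_V2, GDAS)"
  exit 1
fi
arch=$1
# GPU selection is now inherited from the caller's environment, e.g.:
#   CUDA_VISIBLE_DEVICES=0 bash ./scripts-rnn/train-PTB.sh ${arch}
# ... then launch the PTB training entry point (not shown in this commit)
```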
@@ -1,30 +0,0 @@ (deleted file)
#!/usr/bin/env sh
if [ "$#" -ne 3 ]; then
  echo "Illegal number of parameters: $#"
  echo "Need 3 parameters: the GPUs, the epochs, and the cutout"
  exit 1
fi
if [ "$TORCH_HOME" = "" ]; then
  echo "Must set the TORCH_HOME environment variable for the data directory"
  exit 1
else
  echo "TORCH_HOME : $TORCH_HOME"
fi

gpus=$1
arch=acc2
cutout=$3
dataset=cifar10
epoch=$2
SAVED=./snapshots/NAS/ACC-V2-Search-${arch}-${dataset}-cut${cutout}-${epoch}-E600

CUDA_VISIBLE_DEVICES=${gpus} python ./exps-nas/acc_search_v2.py \
  --data_path $TORCH_HOME/cifar.python \
  --arch ${arch} --dataset ${dataset} --batch_size 128 \
  --save_path ${SAVED} \
  --learning_rate_max 0.05 --learning_rate_min 0.001 --momentum 0.9 --weight_decay 0.0003 \
  --epochs ${epoch} --cutout ${cutout} --validate --grad_clip 5 \
  --init_channels 16 --layers 8 \
  --tau_max 10 --tau_min 4 \
  --model_config ./configs/nas-cifar-cos.config \
  --print_freq 100 --workers 8
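This deleted script hard-codes arch=acc2 and anneals the Gumbel-softmax temperature from tau_max 10 down to tau_min 4 (see the training code later in this commit). Its on-disk path is not shown here, so the name below is a placeholder; an invocation with GPU 0, a 200-epoch budget, and cutout length 16 would have been:

```
bash ./scripts-cnn/<search-script>.sh 0 200 16
```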
@@ -1,57 +0,0 @@ (deleted file)
# Neural-Architecture-Search

### Baseline
```
bash ./scripts-nas/search.sh 1 base cifar10
bash ./scripts-nas/search.sh 1 share
bash ./scripts-nas/batch-base-search.sh 1
bash ./scripts-nas/batch-base-model.sh 1
```

### Meta
```
bash ./scripts-nas/meta-search.sh 0 meta 20 5
```

### Acceleration
```
bash ./scripts-nas/search-acc-v2.sh 3 acc2
bash ./scripts-nas/DMS-V-Train.sh 0

bash ./scripts-nas/search-acc-simple.sh 3 NetworkV2
```

### Base Model Training
```
bash ./scripts-nas/train-model.sh 3 AmoebaNet
bash ./scripts-nas/train-model.sh 3 NASNet
bash ./scripts-nas/train-model.sh 3 DARTS_V1
bash ./scripts-nas/train-model-simple.sh 3 AmoebaNet
bash ./scripts-nas/train-imagenet.sh 3 DARTS_V2 50 14

bash scripts-nas/TRAIN-BASE.sh 0 PNASNet cifar10 nocut 48 11
bash scripts-nas/TRAIN-BASE.sh 0 AmoebaNet cifar10 nocut 36 20
bash scripts-nas/TRAIN-BASE.sh 0 NASNet cifar10 nocut 33 20

bash scripts-nas/TRAIN-BASE.sh 0 DMS_F1 cifar10 nocut 36 20
bash scripts-nas/TRAIN-BASE.sh 0 DMS_V1 cifar10 nocut 36 20
bash scripts-nas/TRAIN-BASE.sh 0 GDAS_CC cifar10 nocut 36 20
bash scripts-nas/train-imagenet.sh 3 DMS_F1 52 14
bash scripts-nas/train-imagenet.sh 3 DMS_V1 50 14
```

### Visualization
```
python ./exps-nas/vis-arch.py --checkpoint  --save_dir
python ./exps-nas/cvpr-vis.py --save_dir ./snapshots/NAS-VIS/
```

### Test datasets
```
cd ./lib/datasets/
python test_NLP.py
```
@@ -1,30 +0,0 @@ (deleted file)
#!/usr/bin/env sh
# bash scripts-nas/TRAIN-BASE.sh 0 DMS_V1 cifar10 nocut init-channel layers
if [ "$#" -ne 6 ]; then
  echo "Illegal number of parameters: $#"
  echo "Need 6 parameters: the GPUs, the architecture, the dataset, the config, the initial channels, and the number of layers"
  exit 1
fi
if [ "$TORCH_HOME" = "" ]; then
  echo "Must set the TORCH_HOME environment variable for the data directory"
  exit 1
else
  echo "TORCH_HOME : $TORCH_HOME"
fi

gpus=$1
arch=$2
dataset=$3
config=$4
C=$5
N=$6
SAVED=./snapshots/NAS/${arch}-${C}-${N}-${dataset}-${config}-E600

CUDA_VISIBLE_DEVICES=${gpus} python ./exps-nas/train_base.py \
  --data_path $TORCH_HOME/cifar.python \
  --dataset ${dataset} --arch ${arch} \
  --save_path ${SAVED} \
  --grad_clip 5 \
  --init_channels ${C} --layers ${N} \
  --model_config ./configs/nas-cifar-cos-${config}.config \
  --print_freq 100 --workers 8
@@ -1,23 +0,0 @@ (deleted file)
#!/usr/bin/env sh
set -e
if [ "$#" -ne 1 ]; then
  echo "Illegal number of parameters: $#"
  echo "Need 1 parameter: the GPUs"
  exit 1
fi
if [ "$TORCH_HOME" = "" ]; then
  echo "Must set the TORCH_HOME environment variable for the data directory"
  exit 1
else
  echo "TORCH_HOME : $TORCH_HOME"
fi

gpus=$1

bash ./scripts-nas/train-model.sh ${gpus} AmoebaNet 0

bash ./scripts-nas/train-model.sh ${gpus} NASNet    0

bash ./scripts-nas/train-model.sh ${gpus} DARTS_V1  0

bash ./scripts-nas/train-model.sh ${gpus} DARTS_V2  0
@@ -1,19 +0,0 @@ (deleted file)
#!/usr/bin/env sh
if [ "$#" -ne 1 ]; then
  echo "Illegal number of parameters: $#"
  echo "Need 1 parameter: the GPUs"
  exit 1
fi
if [ "$TORCH_HOME" = "" ]; then
  echo "Must set the TORCH_HOME environment variable for the data directory"
  exit 1
else
  echo "TORCH_HOME : $TORCH_HOME"
fi

gpus=$1
Times="1 2 3"

for time in ${Times}; do
  bash ./scripts-nas/search.sh ${gpus} base cifar10
done
@@ -1,30 +0,0 @@ (deleted file)
#!/usr/bin/env sh
if [ "$#" -ne 4 ]; then
  echo "Illegal number of parameters: $#"
  echo "Need 4 parameters: the GPUs, the network, the N-way, and the K-shot"
  exit 1
fi
if [ "$TORCH_HOME" = "" ]; then
  echo "Must set the TORCH_HOME environment variable for the data directory"
  exit 1
else
  echo "TORCH_HOME : $TORCH_HOME"
fi

gpus=$1
arch=$2
n_way=$3
k_shot=$4
cutout=16
epoch=60
SAVED=./snapshots/NAS/Meta-Search-${arch}-N${n_way}-K${k_shot}-cut${cutout}-${epoch}

CUDA_VISIBLE_DEVICES=${gpus} python ./exps-nas/meta_search.py \
  --data_path $TORCH_HOME/tiered-imagenet \
  --arch ${arch} --n_way ${n_way} --k_shot ${k_shot} \
  --save_path ${SAVED} \
  --learning_rate_max 0.001 --learning_rate_min 0.0001 --momentum 0.9 --weight_decay 0.0003 \
  --epochs ${epoch} --cutout ${cutout} --validate --grad_clip 5 \
  --init_channels 16 --layers 8 \
  --model_config ./configs/nas-cifar-cos-cut.config \
  --print_freq 200 --workers 16
@@ -1,29 +0,0 @@ (deleted file)
#!/usr/bin/env sh
if [ "$#" -ne 2 ]; then
  echo "Illegal number of parameters: $#"
  echo "Need 2 parameters: the GPUs and the network"
  exit 1
fi
if [ "$TORCH_HOME" = "" ]; then
  echo "Must set the TORCH_HOME environment variable for the data directory"
  exit 1
else
  echo "TORCH_HOME : $TORCH_HOME"
fi

gpus=$1
arch=$2
cutout=0
dataset=cifar10
epoch=100
SAVED=./snapshots/NAS/ACC-V2-Search-${arch}-${dataset}-cut${cutout}-${epoch}-E100

CUDA_VISIBLE_DEVICES=${gpus} python ./exps-nas/acc_search_v2.py \
  --data_path $TORCH_HOME/cifar.python \
  --arch ${arch} --dataset ${dataset} --batch_size 128 \
  --save_path ${SAVED} \
  --learning_rate_max 0.025 --learning_rate_min 0.001 --momentum 0.9 --weight_decay 0.0003 \
  --epochs ${epoch} --cutout ${cutout} --validate --grad_clip 5 \
  --init_channels 16 --layers 8 \
  --model_config ./configs/nas-cifar-cos-simple.config \
  --print_freq 100 --workers 8
@@ -1,29 +0,0 @@ (deleted file)
#!/usr/bin/env sh
if [ "$#" -ne 2 ]; then
  echo "Illegal number of parameters: $#"
  echo "Need 2 parameters: the GPUs and the network"
  exit 1
fi
if [ "$TORCH_HOME" = "" ]; then
  echo "Must set the TORCH_HOME environment variable for the data directory"
  exit 1
else
  echo "TORCH_HOME : $TORCH_HOME"
fi

gpus=$1
arch=$2
cutout=0
dataset=cifar10
epoch=150
SAVED=./snapshots/NAS/ACC-V2-Search-${arch}-${dataset}-cut${cutout}-${epoch}-E600

CUDA_VISIBLE_DEVICES=${gpus} python ./exps-nas/acc_search_v2.py \
  --data_path $TORCH_HOME/cifar.python \
  --arch ${arch} --dataset ${dataset} --batch_size 128 \
  --save_path ${SAVED} \
  --learning_rate_max 0.025 --learning_rate_min 0.001 --momentum 0.9 --weight_decay 0.0003 \
  --epochs ${epoch} --cutout ${cutout} --validate --grad_clip 5 \
  --init_channels 16 --layers 8 \
  --model_config ./configs/nas-cifar-cos.config \
  --print_freq 100 --workers 8
@@ -1,29 +0,0 @@ (deleted file)
#!/usr/bin/env sh
if [ "$#" -ne 2 ]; then
  echo "Illegal number of parameters: $#"
  echo "Need 2 parameters: the GPUs and the network"
  exit 1
fi
if [ "$TORCH_HOME" = "" ]; then
  echo "Must set the TORCH_HOME environment variable for the data directory"
  exit 1
else
  echo "TORCH_HOME : $TORCH_HOME"
fi

gpus=$1
arch=$2
cutout=0
dataset=cifar10
epoch=200
SAVED=./snapshots/NAS/ACC-V2-Search-${arch}-${dataset}-cut${cutout}-${epoch}-E600

CUDA_VISIBLE_DEVICES=${gpus} python ./exps-nas/acc_search_v2.py \
  --data_path $TORCH_HOME/cifar.python \
  --arch ${arch} --dataset ${dataset} --batch_size 128 \
  --save_path ${SAVED} \
  --learning_rate_max 0.025 --learning_rate_min 0.001 --momentum 0.9 --weight_decay 0.0003 \
  --epochs ${epoch} --cutout ${cutout} --validate --grad_clip 5 \
  --init_channels 16 --layers 8 \
  --model_config ./configs/nas-cifar-cos.config \
  --print_freq 100 --workers 8
@@ -1,29 +0,0 @@ (deleted file)
#!/usr/bin/env sh
if [ "$#" -ne 2 ]; then
  echo "Illegal number of parameters: $#"
  echo "Need 2 parameters: the GPUs and the network"
  exit 1
fi
if [ "$TORCH_HOME" = "" ]; then
  echo "Must set the TORCH_HOME environment variable for the data directory"
  exit 1
else
  echo "TORCH_HOME : $TORCH_HOME"
fi

gpus=$1
arch=$2
cutout=0
dataset=cifar10
epoch=300
SAVED=./snapshots/NAS/ACC-V2-Search-${arch}-${dataset}-cut${cutout}-${epoch}-E600

CUDA_VISIBLE_DEVICES=${gpus} python ./exps-nas/acc_search_v2.py \
  --data_path $TORCH_HOME/cifar.python \
  --arch ${arch} --dataset ${dataset} --batch_size 128 \
  --save_path ${SAVED} \
  --learning_rate_max 0.025 --learning_rate_min 0.001 --momentum 0.9 --weight_decay 0.0003 \
  --epochs ${epoch} --cutout ${cutout} --validate --grad_clip 5 \
  --init_channels 16 --layers 8 \
  --model_config ./configs/nas-cifar-cos.config \
  --print_freq 100 --workers 8
@@ -1,29 +0,0 @@ (deleted file)
#!/usr/bin/env sh
if [ "$#" -ne 2 ]; then
  echo "Illegal number of parameters: $#"
  echo "Need 2 parameters: the GPUs and the network"
  exit 1
fi
if [ "$TORCH_HOME" = "" ]; then
  echo "Must set the TORCH_HOME environment variable for the data directory"
  exit 1
else
  echo "TORCH_HOME : $TORCH_HOME"
fi

gpus=$1
arch=$2
cutout=0
dataset=cifar10
epoch=50
SAVED=./snapshots/NAS/ACC-V2-Search-${arch}-${dataset}-cut${cutout}-${epoch}-E600

CUDA_VISIBLE_DEVICES=${gpus} python ./exps-nas/acc_search_v2.py \
  --data_path $TORCH_HOME/cifar.python \
  --arch ${arch} --dataset ${dataset} --batch_size 128 \
  --save_path ${SAVED} \
  --learning_rate_max 0.025 --learning_rate_min 0.001 --momentum 0.9 --weight_decay 0.0003 \
  --epochs ${epoch} --cutout ${cutout} --validate --grad_clip 5 \
  --init_channels 16 --layers 8 \
  --model_config ./configs/nas-cifar-cos.config \
  --print_freq 100 --workers 8
@@ -1,29 +0,0 @@ (deleted file)
#!/usr/bin/env sh
if [ "$#" -ne 2 ]; then
  echo "Illegal number of parameters: $#"
  echo "Need 2 parameters: the GPUs and the network"
  exit 1
fi
if [ "$TORCH_HOME" = "" ]; then
  echo "Must set the TORCH_HOME environment variable for the data directory"
  exit 1
else
  echo "TORCH_HOME : $TORCH_HOME"
fi

gpus=$1
arch=$2
cutout=0
dataset=cifar10
epoch=100
SAVED=./snapshots/NAS/ACC-V2-Search-${arch}-${dataset}-cut${cutout}-${epoch}-E600

CUDA_VISIBLE_DEVICES=${gpus} python ./exps-nas/acc_search_v2.py \
  --data_path $TORCH_HOME/cifar.python \
  --arch ${arch} --dataset ${dataset} --batch_size 128 \
  --save_path ${SAVED} \
  --learning_rate_max 0.025 --learning_rate_min 0.001 --momentum 0.9 --weight_decay 0.0003 \
  --epochs ${epoch} --cutout ${cutout} --validate --grad_clip 5 \
  --init_channels 16 --layers 8 \
  --model_config ./configs/nas-cifar-cos.config \
  --print_freq 100 --workers 8
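The five deleted scripts above are identical apart from the hard-coded epoch budget (150, 200, 300, 50, and 100). For reference, a single consolidated sketch that takes the epoch count as a third argument would cover the whole sweep; this script is illustrative and not part of the commit:

```
#!/usr/bin/env sh
# hypothetical consolidation of the epoch-sweep scripts; usage: <gpus> <arch> <epochs>
gpus=$1
arch=$2
epoch=$3
SAVED=./snapshots/NAS/ACC-V2-Search-${arch}-cifar10-cut0-${epoch}-E600
CUDA_VISIBLE_DEVICES=${gpus} python ./exps-nas/acc_search_v2.py \
  --data_path $TORCH_HOME/cifar.python \
  --arch ${arch} --dataset cifar10 --batch_size 128 \
  --save_path ${SAVED} \
  --learning_rate_max 0.025 --learning_rate_min 0.001 --momentum 0.9 --weight_decay 0.0003 \
  --epochs ${epoch} --cutout 0 --validate --grad_clip 5 \
  --init_channels 16 --layers 8 \
  --model_config ./configs/nas-cifar-cos.config \
  --print_freq 100 --workers 8
```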
@@ -1,45 +0,0 @@ (deleted file)
#!/usr/bin/env sh
if [ "$#" -ne 3 ]; then
  echo "Illegal number of parameters: $#"
  echo "Need 3 parameters: the GPUs, the network, and the dataset"
  exit 1
fi
if [ "$TORCH_HOME" = "" ]; then
  echo "Must set the TORCH_HOME environment variable for the data directory"
  exit 1
else
  echo "TORCH_HOME : $TORCH_HOME"
fi

gpus=$1
arch=$2
cutout=0
dataset=$3
epoch=50
SAVED=./snapshots/NAS/Search-${arch}-${dataset}-cut${cutout}-${epoch}

if [ "$dataset" = "cifar10" ]; then
  dataset_root=$TORCH_HOME/cifar.python
  print_freq=100
elif [ "$dataset" = "cifar100" ]; then
  dataset_root=$TORCH_HOME/cifar.python
  print_freq=100
elif [ "$dataset" = "tiered" ]; then
  dataset_root=$TORCH_HOME/tiered-imagenet
  print_freq=500
else
  echo "invalid dataset name: ${dataset}"
  exit 1
fi

CUDA_VISIBLE_DEVICES=${gpus} python ./exps-nas/train_search.py \
  --data_path ${dataset_root} \
  --arch ${arch} \
  --dataset ${dataset} --batch_size 64 \
  --save_path ${SAVED} \
  --learning_rate_max 0.025 --learning_rate_min 0.001 --momentum 0.9 --weight_decay 0.0003 \
  --epochs ${epoch} --cutout ${cutout} --validate --grad_clip 5 \
  --init_channels 16 --layers 8 \
  --manualSeed 3858 \
  --model_config ./configs/nas-cifar-cos.config \
  --print_freq ${print_freq} --workers 8
@@ -1,9 +0,0 @@ (deleted file)
#!/usr/bin/env sh

seeds="seed-8167 seed-908 seed-9242"
for seed in ${seeds}; do
  python ./exps-nas/vis-arch.py --checkpoint ./snapshots/NAS/Search-cifar10-cut16-100/${seed}/checkpoint-search.pth \
                                --save_dir   ./snapshots/NAS-VIS/Search-cut16-100/${seed}
done
@@ -1,313 +0,0 @@ (deleted file)
# DARTS First Order, Refer to https://github.com/quark0/darts
import os, sys, time, glob, random, argparse
import numpy as np
from copy import deepcopy
import torch
import torch.nn.functional as F
import torchvision.datasets as dset
import torch.backends.cudnn as cudnn
import torchvision.transforms as transforms
from pathlib import Path
lib_dir = (Path(__file__).parent / '..' / 'lib').resolve()
if str(lib_dir) not in sys.path: sys.path.insert(0, str(lib_dir))
from utils import AverageMeter, time_string, convert_secs2time
from utils import print_log, obtain_accuracy
from utils import Cutout, count_parameters_in_MB
from datasets import TieredImageNet
from nas import return_alphas_str, Network, NetworkV1, NetworkF1
from train_utils import main_procedure
from scheduler import load_config

Networks = {'base': Network, 'share': NetworkV1, 'fix': NetworkF1}

parser = argparse.ArgumentParser("CNN")
parser.add_argument('--data_path',         type=str,   help='Path to dataset')
parser.add_argument('--dataset',           type=str,   choices=['cifar10', 'cifar100', 'tiered'], help='Choose between cifar10/cifar100 and TieredImageNet.')
parser.add_argument('--arch',              type=str,   choices=Networks.keys(), help='Choose networks.')
parser.add_argument('--batch_size',        type=int,   help='the batch size')
parser.add_argument('--learning_rate_max', type=float, help='initial learning rate')
parser.add_argument('--learning_rate_min', type=float, help='minimum learning rate')
parser.add_argument('--momentum',          type=float, help='momentum')
parser.add_argument('--weight_decay',      type=float, help='weight decay')
parser.add_argument('--epochs',            type=int,   help='num of training epochs')
# architecture learning rate
parser.add_argument('--arch_learning_rate', type=float, default=3e-4, help='learning rate for arch encoding')
parser.add_argument('--arch_weight_decay',  type=float, default=1e-3, help='weight decay for arch encoding')
#
parser.add_argument('--init_channels',      type=int, help='num of init channels')
parser.add_argument('--layers',             type=int, help='total number of layers')
#
parser.add_argument('--cutout',         type=int,   help='cutout length, negative means no cutout')
parser.add_argument('--grad_clip',      type=float, help='gradient clipping')
parser.add_argument('--model_config',   type=str,   help='the model configuration')

# resume
parser.add_argument('--resume',         type=str,   help='the resume path')
parser.add_argument('--only_base', action='store_true', default=False, help='only train the searched model')
# split data
parser.add_argument('--validate', action='store_true', default=False, help='split the train data into train/val or not')
parser.add_argument('--train_portion', type=float, default=0.5, help='portion of training data')
# log
parser.add_argument('--workers',       type=int, default=2, help='number of data loading workers (default: 2)')
parser.add_argument('--save_path',     type=str, help='Folder to save checkpoints and log.')
parser.add_argument('--print_freq',    type=int, help='print frequency (default: 200)')
parser.add_argument('--manualSeed',    type=int, help='manual seed')
args = parser.parse_args()

assert torch.cuda.is_available(), 'torch.cuda is not available'

if args.manualSeed is None:
  args.manualSeed = random.randint(1, 10000)
random.seed(args.manualSeed)
cudnn.benchmark = True
cudnn.enabled   = True
torch.manual_seed(args.manualSeed)
torch.cuda.manual_seed_all(args.manualSeed)


def main():

  # Init logger
  args.save_path = os.path.join(args.save_path, 'seed-{:}'.format(args.manualSeed))
  if not os.path.isdir(args.save_path):
    os.makedirs(args.save_path)
  log = open(os.path.join(args.save_path, 'log-seed-{:}.txt'.format(args.manualSeed)), 'w')
  print_log('save path : {}'.format(args.save_path), log)
  state = {k: v for k, v in args._get_kwargs()}
  print_log(state, log)
  print_log("Random Seed: {}".format(args.manualSeed), log)
  print_log("Python version : {}".format(sys.version.replace('\n', ' ')), log)
  print_log("Torch  version : {}".format(torch.__version__), log)
  print_log("CUDA   version : {}".format(torch.version.cuda), log)
  print_log("cuDNN  version : {}".format(cudnn.version()), log)
  print_log("Num of GPUs    : {}".format(torch.cuda.device_count()), log)
  args.dataset = args.dataset.lower()

  # Mean + Std
  if args.dataset == 'cifar10':
    mean = [x / 255 for x in [125.3, 123.0, 113.9]]
    std  = [x / 255 for x in [63.0, 62.1, 66.7]]
  elif args.dataset == 'cifar100':
    mean = [x / 255 for x in [129.3, 124.1, 112.4]]
    std  = [x / 255 for x in [68.2, 65.4, 70.4]]
  elif args.dataset == 'tiered':
    mean, std = [0.485, 0.456, 0.406], [0.229, 0.224, 0.225]
  else:
    raise ValueError("Unknown dataset : {:}".format(args.dataset))
  # Data Augmentation
  if args.dataset == 'cifar10' or args.dataset == 'cifar100':
    lists = [transforms.RandomHorizontalFlip(), transforms.RandomCrop(32, padding=4), transforms.ToTensor(),
             transforms.Normalize(mean, std)]
    if args.cutout > 0: lists += [Cutout(args.cutout)]
    train_transform = transforms.Compose(lists)
    test_transform  = transforms.Compose([transforms.ToTensor(), transforms.Normalize(mean, std)])
  elif args.dataset == 'tiered':
    lists = [transforms.RandomHorizontalFlip(), transforms.RandomCrop(80, padding=4), transforms.ToTensor(), transforms.Normalize(mean, std)]
    if args.cutout > 0: lists += [Cutout(args.cutout)]
    train_transform = transforms.Compose(lists)
    test_transform  = transforms.Compose([transforms.CenterCrop(80), transforms.ToTensor(), transforms.Normalize(mean, std)])
  else:
    raise ValueError("Unknown dataset : {:}".format(args.dataset))
  # Datasets
  if args.dataset == 'cifar10':
    train_data = dset.CIFAR10(args.data_path, train=True,  transform=train_transform, download=True)
    test_data  = dset.CIFAR10(args.data_path, train=False, transform=test_transform,  download=True)
    num_classes, head = 10, 'cifar'
  elif args.dataset == 'cifar100':
    train_data = dset.CIFAR100(args.data_path, train=True,  transform=train_transform, download=True)
    test_data  = dset.CIFAR100(args.data_path, train=False, transform=test_transform,  download=True)
    num_classes, head = 100, 'cifar'
  elif args.dataset == 'tiered':
    train_data = TieredImageNet(args.data_path, 'train-val', train_transform)
    test_data  = None
    num_classes, head = train_data.n_classes, 'imagenet'
  else:
    raise ValueError("Unknown dataset : {:}".format(args.dataset))
  # Data Loader
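  # when --validate is given, the official train set is split by train_portion (default 0.5)
  # into a weight-training half and an architecture-validation half via SubsetRandomSamplers;
  # otherwise the full train set and the official test set are used directly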
  if args.validate:
    indices = list(range(len(train_data)))
    split   = int(args.train_portion * len(indices))
    random.shuffle(indices)
    train_loader = torch.utils.data.DataLoader(train_data, batch_size=args.batch_size,
                      sampler=torch.utils.data.sampler.SubsetRandomSampler(indices[:split]),
                      pin_memory=True, num_workers=args.workers)
    test_loader  = torch.utils.data.DataLoader(train_data, batch_size=args.batch_size,
                      sampler=torch.utils.data.sampler.SubsetRandomSampler(indices[split:]),
                      pin_memory=True, num_workers=args.workers)
  else:
    train_loader = torch.utils.data.DataLoader(train_data, batch_size=args.batch_size, shuffle=True, num_workers=args.workers, pin_memory=True)
    test_loader  = torch.utils.data.DataLoader(test_data,  batch_size=args.batch_size, shuffle=True, num_workers=args.workers, pin_memory=True)

  # network and criterion
  criterion = torch.nn.CrossEntropyLoss().cuda()
  basemodel = Networks[args.arch](args.init_channels, num_classes, args.layers, head=head)
  model     = torch.nn.DataParallel(basemodel).cuda()
  print_log("Network : {:}".format(model), log)
  print_log("Parameter size = {:.3f} MB".format(count_parameters_in_MB(basemodel.base_parameters())), log)
  print_log("Train-transformation : {:}\nTest--transformation : {:}\nClass number : {:}".format(train_transform, test_transform, num_classes), log)

  # optimizer and LR-scheduler
  base_optimizer = torch.optim.SGD(basemodel.base_parameters(), args.learning_rate_max, momentum=args.momentum, weight_decay=args.weight_decay)
  base_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(base_optimizer, float(args.epochs), eta_min=args.learning_rate_min)
  arch_optimizer = torch.optim.Adam(basemodel.arch_parameters(), lr=args.arch_learning_rate, betas=(0.5, 0.999), weight_decay=args.arch_weight_decay)

  # snapshot
  checkpoint_path = os.path.join(args.save_path, 'checkpoint-search.pth')
  if args.resume is not None and os.path.isfile(args.resume):
    checkpoint  = torch.load(args.resume)
    start_epoch = checkpoint['epoch']
    basemodel.load_state_dict( checkpoint['state_dict'] )
    base_optimizer.load_state_dict( checkpoint['base_optimizer'] )
    arch_optimizer.load_state_dict( checkpoint['arch_optimizer'] )
    base_scheduler.load_state_dict( checkpoint['base_scheduler'] )
    genotypes   = checkpoint['genotypes']
    print_log('Load resume from {:} with start-epoch = {:}'.format(args.resume, start_epoch), log)
  elif os.path.isfile(checkpoint_path):
    checkpoint  = torch.load(checkpoint_path)
    start_epoch = checkpoint['epoch']
    basemodel.load_state_dict( checkpoint['state_dict'] )
    base_optimizer.load_state_dict( checkpoint['base_optimizer'] )
    arch_optimizer.load_state_dict( checkpoint['arch_optimizer'] )
    base_scheduler.load_state_dict( checkpoint['base_scheduler'] )
    genotypes   = checkpoint['genotypes']
    print_log('Load checkpoint from {:} with start-epoch = {:}'.format(checkpoint_path, start_epoch), log)
  else:
    start_epoch, genotypes = 0, {}
    print_log('Train model-search from scratch.', log)

  config = load_config(args.model_config)

  if args.only_base:
    print_log('---- Only Train the Searched Model ----', log)
    main_procedure(config, args.dataset, args.data_path, args, basemodel.genotype(), 36, 20, log)
    return

  # Main loop
  start_time, epoch_time, total_train_time = time.time(), AverageMeter(), 0
  for epoch in range(start_epoch, args.epochs):
    base_scheduler.step()

    need_time = convert_secs2time(epoch_time.val * (args.epochs-epoch), True)
    print_log('\n==>>{:s} [Epoch={:03d}/{:03d}] {:s} [LR={:6.4f} ~ {:6.4f}] [Batch={:d}]'.format(time_string(), epoch, args.epochs, need_time, min(base_scheduler.get_lr()), max(base_scheduler.get_lr()), args.batch_size), log)

    genotype = basemodel.genotype()
    print_log('genotype = {:}'.format(genotype), log)

    print_log('{:03d}/{:03d} alphas :\n{:}'.format(epoch, args.epochs, return_alphas_str(basemodel)), log)

    # training
    train_acc1, train_acc5, train_obj, train_time \
                                      = train(train_loader, test_loader, model, criterion, base_optimizer, arch_optimizer, epoch, log)
    total_train_time += train_time
    # validation
    valid_acc1, valid_acc5, valid_obj = infer(test_loader, model, criterion, epoch, log)
    print_log('Base-Search : {:03d}/{:03d} : Train-Acc={:.3f}, Test-Acc={:.3f}'.format(epoch, args.epochs, train_acc1, valid_acc1), log)
    # save genotype
    genotypes[epoch] = basemodel.genotype()
    # save checkpoint
    torch.save({'epoch' : epoch + 1,
                'args'  : deepcopy(args),
                'state_dict': basemodel.state_dict(),
                'genotypes' : genotypes,
                'base_optimizer' : base_optimizer.state_dict(),
                'arch_optimizer' : arch_optimizer.state_dict(),
                'base_scheduler' : base_scheduler.state_dict()},
                checkpoint_path)
    print_log('----> Save into {:}'.format(checkpoint_path), log)

    # measure elapsed time
    epoch_time.update(time.time() - start_time)
    start_time = time.time()

  print_log('Finish with training time = {:}'.format( convert_secs2time(total_train_time, True) ), log)

  # clear GPU cache
  #torch.cuda.empty_cache()
  #main_procedure(config, 'cifar10', os.environ['TORCH_HOME'] + '/cifar.python', args, basemodel.genotype(), 36, 20, log)
  log.close()


def train(train_queue, valid_queue, model, criterion, base_optimizer, arch_optimizer, epoch, log):
  data_time, batch_time = AverageMeter(), AverageMeter()
  objs, top1, top5 = AverageMeter(), AverageMeter(), AverageMeter()
  model.train()

  valid_iter = iter(valid_queue)
  end = time.time()
  for step, (inputs, targets) in enumerate(train_queue):
    batch, C, H, W = inputs.size()

    #inputs, targets = inputs.cuda(), targets.cuda(non_blocking=True)
    targets = targets.cuda(non_blocking=True)
    data_time.update(time.time() - end)

    # get a random minibatch from the search queue with replacement
    try:
      input_search, target_search = next(valid_iter)
    except StopIteration:
      valid_iter = iter(valid_queue)
      input_search, target_search = next(valid_iter)

    target_search = target_search.cuda(non_blocking=True)

    # update the architecture
    arch_optimizer.zero_grad()
    output_search = model(input_search)
    arch_loss = criterion(output_search, target_search)
    arch_loss.backward()
    arch_optimizer.step()

    # update the parameters
    base_optimizer.zero_grad()
    logits = model(inputs)
    loss = criterion(logits, targets)

    loss.backward()
    torch.nn.utils.clip_grad_norm_(model.module.base_parameters(), args.grad_clip)
    base_optimizer.step()

    prec1, prec5 = obtain_accuracy(logits.data, targets.data, topk=(1, 5))
    objs.update(loss.item(),  batch)
    top1.update(prec1.item(), batch)
    top5.update(prec5.item(), batch)

    # measure elapsed time
    batch_time.update(time.time() - end)
    end = time.time()

    if step % args.print_freq == 0 or (step+1) == len(train_queue):
      Sstr = ' TRAIN-SEARCH ' + time_string() + ' Epoch: [{:03d}][{:03d}/{:03d}]'.format(epoch, step, len(train_queue))
      Tstr = 'Time {batch_time.val:.2f} ({batch_time.avg:.2f}) Data {data_time.val:.2f} ({data_time.avg:.2f})'.format(batch_time=batch_time, data_time=data_time)
      Lstr = 'Loss {loss.val:.3f} ({loss.avg:.3f}) Prec@1 {top1.val:.2f} ({top1.avg:.2f}) Prec@5 {top5.val:.2f} ({top5.avg:.2f})'.format(loss=objs, top1=top1, top5=top5)
      print_log(Sstr + ' ' + Tstr + ' ' + Lstr, log)

  return top1.avg, top5.avg, objs.avg, batch_time.sum


def infer(valid_queue, model, criterion, epoch, log):
  objs, top1, top5 = AverageMeter(), AverageMeter(), AverageMeter()

  model.eval()
  with torch.no_grad():
    for step, (inputs, targets) in enumerate(valid_queue):
      batch, C, H, W = inputs.size()
      targets = targets.cuda(non_blocking=True)

      logits = model(inputs)
      loss = criterion(logits, targets)

      prec1, prec5 = obtain_accuracy(logits.data, targets.data, topk=(1, 5))
      objs.update(loss.item(),  batch)
      top1.update(prec1.item(), batch)
      top5.update(prec5.item(), batch)

      if step % args.print_freq == 0 or (step+1) == len(valid_queue):
        Sstr = ' VALID-SEARCH ' + time_string() + ' Epoch: [{:03d}][{:03d}/{:03d}]'.format(epoch, step, len(valid_queue))
        Lstr = 'Loss {loss.val:.3f} ({loss.avg:.3f}) Prec@1 {top1.val:.2f} ({top1.avg:.2f}) Prec@5 {top5.val:.2f} ({top5.avg:.2f})'.format(loss=objs, top1=top1, top5=top5)
        print_log(Sstr + ' ' + Lstr, log)

  return top1.avg, top5.avg, objs.avg


if __name__ == '__main__':
  main()
@@ -1,310 +0,0 @@ (deleted file)
import os, sys, time, glob, random, argparse
import numpy as np
from copy import deepcopy
import torch
import torch.nn.functional as F
import torchvision.datasets as dset
import torch.backends.cudnn as cudnn
import torchvision.transforms as transforms
from pathlib import Path
lib_dir = (Path(__file__).parent / '..' / 'lib').resolve()
if str(lib_dir) not in sys.path: sys.path.insert(0, str(lib_dir))
from utils import AverageMeter, time_string, convert_secs2time
from utils import print_log, obtain_accuracy
from utils import Cutout, count_parameters_in_MB
from nas import Network, NetworkACC2, NetworkV3, NetworkV4, NetworkV5, NetworkFACC1
from nas import return_alphas_str
from train_utils import main_procedure
from scheduler import load_config

Networks = {'base': Network, 'acc2': NetworkACC2, 'facc1': NetworkFACC1, 'NetworkV3': NetworkV3, 'NetworkV4': NetworkV4, 'NetworkV5': NetworkV5}


parser = argparse.ArgumentParser("cifar")
parser.add_argument('--data_path',         type=str,   help='Path to dataset')
parser.add_argument('--dataset',           type=str,   choices=['cifar10', 'cifar100'], help='Choose between cifar10 and cifar100.')
parser.add_argument('--arch',              type=str,   choices=Networks.keys(),         help='Choose networks.')
parser.add_argument('--batch_size',        type=int,   help='the batch size')
parser.add_argument('--learning_rate_max', type=float, help='initial learning rate')
parser.add_argument('--learning_rate_min', type=float, help='minimum learning rate')
parser.add_argument('--tau_max',           type=float, help='initial tau')
parser.add_argument('--tau_min',           type=float, help='minimum tau')
parser.add_argument('--momentum',          type=float, help='momentum')
parser.add_argument('--weight_decay',      type=float, help='weight decay')
parser.add_argument('--epochs',            type=int,   help='num of training epochs')
# architecture learning rate
parser.add_argument('--arch_learning_rate', type=float, default=3e-4, help='learning rate for arch encoding')
parser.add_argument('--arch_weight_decay',  type=float, default=1e-3, help='weight decay for arch encoding')
#
parser.add_argument('--init_channels',      type=int, help='num of init channels')
parser.add_argument('--layers',             type=int, help='total number of layers')
#
parser.add_argument('--cutout',         type=int,   help='cutout length, negative means no cutout')
parser.add_argument('--grad_clip',      type=float, help='gradient clipping')
parser.add_argument('--model_config',   type=str,   help='the model configuration')

# resume
parser.add_argument('--resume',         type=str,   help='the resume path')
parser.add_argument('--only_base', action='store_true', default=False, help='only train the searched model')
# split data
parser.add_argument('--validate', action='store_true', default=False, help='split the train data into train/val or not')
parser.add_argument('--train_portion', type=float, default=0.5, help='portion of training data')
# log
parser.add_argument('--workers',       type=int, default=2, help='number of data loading workers (default: 2)')
parser.add_argument('--save_path',     type=str, help='Folder to save checkpoints and log.')
parser.add_argument('--print_freq',    type=int, help='print frequency (default: 200)')
parser.add_argument('--manualSeed',    type=int, help='manual seed')
args = parser.parse_args()

assert torch.cuda.is_available(), 'torch.cuda is not available'

if args.manualSeed is None:
  args.manualSeed = random.randint(1, 10000)
random.seed(args.manualSeed)
cudnn.benchmark = True
cudnn.enabled   = True
torch.manual_seed(args.manualSeed)
torch.cuda.manual_seed_all(args.manualSeed)


def main():

  # Init logger
  args.save_path = os.path.join(args.save_path, 'seed-{:}'.format(args.manualSeed))
  if not os.path.isdir(args.save_path):
    os.makedirs(args.save_path)
  log = open(os.path.join(args.save_path, 'log-seed-{:}.txt'.format(args.manualSeed)), 'w')
  print_log('save path : {}'.format(args.save_path), log)
  state = {k: v for k, v in args._get_kwargs()}
  print_log(state, log)
  print_log("Random Seed: {}".format(args.manualSeed), log)
  print_log("Python version : {}".format(sys.version.replace('\n', ' ')), log)
  print_log("Torch  version : {}".format(torch.__version__), log)
  print_log("CUDA   version : {}".format(torch.version.cuda), log)
  print_log("cuDNN  version : {}".format(cudnn.version()), log)
  print_log("Num of GPUs    : {}".format(torch.cuda.device_count()), log)
  args.dataset = args.dataset.lower()

  # Mean + Std
  if args.dataset == 'cifar10':
    mean = [x / 255 for x in [125.3, 123.0, 113.9]]
    std  = [x / 255 for x in [63.0, 62.1, 66.7]]
  elif args.dataset == 'cifar100':
    mean = [x / 255 for x in [129.3, 124.1, 112.4]]
    std  = [x / 255 for x in [68.2, 65.4, 70.4]]
  else:
    raise ValueError("Unknown dataset : {:}".format(args.dataset))
  # Data Augmentation
  if args.dataset == 'cifar10' or args.dataset == 'cifar100':
    lists = [transforms.RandomHorizontalFlip(), transforms.RandomCrop(32, padding=4), transforms.ToTensor(),
             transforms.Normalize(mean, std)]
    if args.cutout > 0: lists += [Cutout(args.cutout)]
    train_transform = transforms.Compose(lists)
    test_transform  = transforms.Compose([transforms.ToTensor(), transforms.Normalize(mean, std)])
  else:
    raise ValueError("Unknown dataset : {:}".format(args.dataset))
  # Datasets
  if args.dataset == 'cifar10':
    train_data = dset.CIFAR10(args.data_path, train=True,  transform=train_transform, download=True)
    test_data  = dset.CIFAR10(args.data_path, train=False, transform=test_transform,  download=True)
    num_classes = 10
  elif args.dataset == 'cifar100':
    train_data = dset.CIFAR100(args.data_path, train=True,  transform=train_transform, download=True)
    test_data  = dset.CIFAR100(args.data_path, train=False, transform=test_transform,  download=True)
    num_classes = 100
  else:
    raise ValueError("Unknown dataset : {:}".format(args.dataset))
  # Data Loader
  if args.validate:
    indices = list(range(len(train_data)))
    split   = int(args.train_portion * len(indices))
    random.shuffle(indices)
    train_loader = torch.utils.data.DataLoader(train_data, batch_size=args.batch_size,
                      sampler=torch.utils.data.sampler.SubsetRandomSampler(indices[:split]),
                      pin_memory=True, num_workers=args.workers)
    test_loader  = torch.utils.data.DataLoader(train_data, batch_size=args.batch_size,
                      sampler=torch.utils.data.sampler.SubsetRandomSampler(indices[split:]),
                      pin_memory=True, num_workers=args.workers)
  else:
    train_loader = torch.utils.data.DataLoader(train_data, batch_size=args.batch_size, shuffle=True, num_workers=args.workers, pin_memory=True)
    test_loader  = torch.utils.data.DataLoader(test_data,  batch_size=args.batch_size, shuffle=True, num_workers=args.workers, pin_memory=True)

  # network and criterion
  criterion = torch.nn.CrossEntropyLoss().cuda()
  basemodel = Networks[args.arch](args.init_channels, num_classes, args.layers)
  model     = torch.nn.DataParallel(basemodel).cuda()
  print_log("Parameter size = {:.3f} MB".format(count_parameters_in_MB(basemodel.base_parameters())), log)
  print_log("Train-transformation : {:}\nTest--transformation : {:}".format(train_transform, test_transform), log)

  # optimizer and LR-scheduler
  base_optimizer = torch.optim.SGD(basemodel.base_parameters(), args.learning_rate_max, momentum=args.momentum, weight_decay=args.weight_decay)
  #base_optimizer = torch.optim.Adam(basemodel.base_parameters(), lr=args.learning_rate_max, betas=(0.5, 0.999), weight_decay=args.weight_decay)
  base_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(base_optimizer, float(args.epochs), eta_min=args.learning_rate_min)
  arch_optimizer = torch.optim.Adam(basemodel.arch_parameters(), lr=args.arch_learning_rate, betas=(0.5, 0.999), weight_decay=args.arch_weight_decay)

  # snapshot
  checkpoint_path = os.path.join(args.save_path, 'checkpoint-search.pth')
  if args.resume is not None and os.path.isfile(args.resume):
    checkpoint  = torch.load(args.resume)
    start_epoch = checkpoint['epoch']
    basemodel.load_state_dict( checkpoint['state_dict'] )
    base_optimizer.load_state_dict( checkpoint['base_optimizer'] )
    arch_optimizer.load_state_dict( checkpoint['arch_optimizer'] )
    base_scheduler.load_state_dict( checkpoint['base_scheduler'] )
    genotypes   = checkpoint['genotypes']
    print_log('Load resume from {:} with start-epoch = {:}'.format(args.resume, start_epoch), log)
  elif os.path.isfile(checkpoint_path):
    checkpoint  = torch.load(checkpoint_path)
    start_epoch = checkpoint['epoch']
    basemodel.load_state_dict( checkpoint['state_dict'] )
    base_optimizer.load_state_dict( checkpoint['base_optimizer'] )
    arch_optimizer.load_state_dict( checkpoint['arch_optimizer'] )
    base_scheduler.load_state_dict( checkpoint['base_scheduler'] )
    genotypes   = checkpoint['genotypes']
    print_log('Load checkpoint from {:} with start-epoch = {:}'.format(checkpoint_path, start_epoch), log)
  else:
    start_epoch, genotypes = 0, {}
    print_log('Train model-search from scratch.', log)

  config = load_config(args.model_config)

  if args.only_base:
    print_log('---- Only Train the Searched Model ----', log)
    main_procedure(config, args.dataset, args.data_path, args, basemodel.genotype(), 36, 20, log)
    return

  # Main loop
  start_time, epoch_time, total_train_time = time.time(), AverageMeter(), 0
  for epoch in range(start_epoch, args.epochs):
    base_scheduler.step()

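    # anneal the Gumbel-softmax temperature linearly from tau_max down to tau_min over
    # training: a large tau keeps the sampled architecture distribution soft (exploration),
    # while a small tau pushes it toward a near one-hot choice (exploitation)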
|     basemodel.set_tau( args.tau_max - epoch*1.0/args.epochs*(args.tau_max-args.tau_min) ) |  | ||||||
|     #if epoch + 2 == args.epochs: |  | ||||||
|     #  torch.cuda.empty_cache() |  | ||||||
|     #  basemodel.set_gumbel(False) |  | ||||||
|  |  | ||||||
|     need_time = convert_secs2time(epoch_time.val * (args.epochs-epoch), True) |  | ||||||
|     print_log('\n==>>{:s} [Epoch={:03d}/{:03d}] {:s} [LR={:6.4f} ~ {:6.4f}] [Batch={:d}], tau={:}'.format(time_string(), epoch, args.epochs, need_time, min(base_scheduler.get_lr()), max(base_scheduler.get_lr()), args.batch_size, basemodel.get_tau()), log) |  | ||||||
|  |  | ||||||
|     genotype = basemodel.genotype() |  | ||||||
|     print_log('genotype = {:}'.format(genotype), log) |  | ||||||
|  |  | ||||||
|     print_log('{:03d}/{:03d} alphas :\n{:}'.format(epoch, args.epochs, return_alphas_str(basemodel)), log) |  | ||||||
|  |  | ||||||
|     # training |  | ||||||
|     train_acc1, train_acc5, train_obj, train_time \ |  | ||||||
|                                       = train(train_loader, test_loader, model, criterion, base_optimizer, arch_optimizer, epoch, log) |  | ||||||
|     total_train_time += train_time |  | ||||||
|     # validation |  | ||||||
|     valid_acc1, valid_acc5, valid_obj = infer(test_loader, model, criterion, epoch, log) |  | ||||||
|     print_log('{:03d}/{:03d}, Train-Accuracy = {:.2f}, Test-Accuracy = {:.2f}'.format(epoch, args.epochs, train_acc1, valid_acc1), log) |  | ||||||
|     # save genotype |  | ||||||
|     genotypes[epoch] = basemodel.genotype() |  | ||||||
|     # save checkpoint |  | ||||||
|     torch.save({'epoch' : epoch + 1, |  | ||||||
|                 'args'  : deepcopy(args), |  | ||||||
|                 'state_dict': basemodel.state_dict(), |  | ||||||
|                 'genotypes' : genotypes, |  | ||||||
|                 'base_optimizer' : base_optimizer.state_dict(), |  | ||||||
|                 'arch_optimizer' : arch_optimizer.state_dict(), |  | ||||||
|                 'base_scheduler' : base_scheduler.state_dict()}, |  | ||||||
|                 checkpoint_path) |  | ||||||
|     print_log('----> Save into {:}'.format(checkpoint_path), log) |  | ||||||
|  |  | ||||||
|  |  | ||||||
|     # measure elapsed time |  | ||||||
|     epoch_time.update(time.time() - start_time) |  | ||||||
|     start_time = time.time() |  | ||||||
|  |  | ||||||
|   print_log('Finish with training time = {:}'.format( convert_secs2time(total_train_time, True) ), log) |  | ||||||
|  |  | ||||||
|   # clear GPU cache |  | ||||||
|   #torch.cuda.empty_cache() |  | ||||||
|   #main_procedure(config, args.dataset, args.data_path, args, basemodel.genotype(), 36, 20, log) |  | ||||||
|   log.close() |  | ||||||
|  |  | ||||||
|  |  | ||||||
| def train(train_queue, valid_queue, model, criterion, base_optimizer, arch_optimizer, epoch, log): |  | ||||||
|   data_time, batch_time = AverageMeter(), AverageMeter() |  | ||||||
|   objs, top1, top5 = AverageMeter(), AverageMeter(), AverageMeter() |  | ||||||
|   model.train() |  | ||||||
|  |  | ||||||
|   valid_iter = iter(valid_queue) |  | ||||||
|   end = time.time() |  | ||||||
|   for step, (inputs, targets) in enumerate(train_queue): |  | ||||||
|     batch, C, H, W = inputs.size() |  | ||||||
|  |  | ||||||
|     #inputs, targets = inputs.cuda(), targets.cuda(non_blocking=True) |  | ||||||
|     targets = targets.cuda(non_blocking=True) |  | ||||||
|  |  | ||||||
|     # get a random minibatch from the search queue with replacement |  | ||||||
|     try: |  | ||||||
|       input_search, target_search = next(valid_iter) |  | ||||||
|     except: |  | ||||||
|       valid_iter = iter(valid_queue) |  | ||||||
|       input_search, target_search = next(valid_iter) |  | ||||||
|      |  | ||||||
|     target_search = target_search.cuda(non_blocking=True) |  | ||||||
|     data_time.update(time.time() - end) |  | ||||||
|  |  | ||||||
|     # update the architecture |  | ||||||
|     arch_optimizer.zero_grad() |  | ||||||
|     output_search = model(input_search) |  | ||||||
|     arch_loss = criterion(output_search, target_search) |  | ||||||
|     arch_loss.backward() |  | ||||||
|     arch_optimizer.step() |  | ||||||
|  |  | ||||||
|     # update the parameters |  | ||||||
|     base_optimizer.zero_grad() |  | ||||||
|     logits = model(inputs) |  | ||||||
|     loss = criterion(logits, targets) |  | ||||||
|  |  | ||||||
|     loss.backward() |  | ||||||
|     torch.nn.utils.clip_grad_norm_(model.module.base_parameters(), args.grad_clip) |  | ||||||
|     base_optimizer.step() |  | ||||||
|  |  | ||||||
|     prec1, prec5 = obtain_accuracy(logits.data, targets.data, topk=(1, 5)) |  | ||||||
|     objs.update(loss.item() , batch) |  | ||||||
|     top1.update(prec1.item(), batch) |  | ||||||
|     top5.update(prec5.item(), batch) |  | ||||||
|  |  | ||||||
|     # measure elapsed time |  | ||||||
|     batch_time.update(time.time() - end) |  | ||||||
|     end = time.time() |  | ||||||
|  |  | ||||||
|     if step % args.print_freq == 0 or (step+1) == len(train_queue): |  | ||||||
|       Sstr = ' TRAIN-SEARCH ' + time_string() + ' Epoch: [{:03d}][{:03d}/{:03d}]'.format(epoch, step, len(train_queue)) |  | ||||||
|       Tstr = 'Time {batch_time.val:.2f} ({batch_time.avg:.2f}) Data {data_time.val:.2f} ({data_time.avg:.2f})'.format(batch_time=batch_time, data_time=data_time) |  | ||||||
|       Lstr = 'Loss {loss.val:.3f} ({loss.avg:.3f}) Prec@1 {top1.val:.2f} ({top1.avg:.2f}) Prec@5 {top5.val:.2f} ({top5.avg:.2f})'.format(loss=objs, top1=top1, top5=top5) |  | ||||||
|       print_log(Sstr + ' ' + Tstr + ' ' + Lstr, log) |  | ||||||
|  |  | ||||||
|   return top1.avg, top5.avg, objs.avg, batch_time.sum |  | ||||||
|  |  | ||||||
|  |  | ||||||
| def infer(valid_queue, model, criterion, epoch, log): |  | ||||||
|   objs, top1, top5 = AverageMeter(), AverageMeter(), AverageMeter() |  | ||||||
|    |  | ||||||
|   model.eval() |  | ||||||
|   with torch.no_grad(): |  | ||||||
|     for step, (inputs, targets) in enumerate(valid_queue): |  | ||||||
|       batch, C, H, W = inputs.size() |  | ||||||
|       targets = targets.cuda(non_blocking=True) |  | ||||||
|  |  | ||||||
|       logits = model(inputs) |  | ||||||
|       loss = criterion(logits, targets) |  | ||||||
|  |  | ||||||
|       prec1, prec5 = obtain_accuracy(logits.data, targets.data, topk=(1, 5)) |  | ||||||
|       objs.update(loss.item() , batch) |  | ||||||
|       top1.update(prec1.item(), batch) |  | ||||||
|       top5.update(prec5.item(), batch) |  | ||||||
|  |  | ||||||
|       if step % args.print_freq == 0 or (step+1) == len(valid_queue): |  | ||||||
|         Sstr = ' VALID-SEARCH ' + time_string() + ' Epoch: [{:03d}][{:03d}/{:03d}]'.format(epoch, step, len(valid_queue)) |  | ||||||
|         Lstr = 'Loss {loss.val:.3f} ({loss.avg:.3f}) Prec@1 {top1.val:.2f} ({top1.avg:.2f}) Prec@5 {top5.val:.2f} ({top5.avg:.2f})'.format(loss=objs, top1=top1, top5=top5) |  | ||||||
|         print_log(Sstr + ' ' + Lstr, log) |  | ||||||
|  |  | ||||||
|   return top1.avg, top5.avg, objs.avg |  | ||||||
|  |  | ||||||
|  |  | ||||||
| if __name__ == '__main__': |  | ||||||
|   main()  |  | ||||||
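The deleted search script above alternates two optimizers inside its train() loop: an architecture step on a validation minibatch, then a weight step on the training minibatch. Below is a condensed sketch of that pattern, not the script verbatim; `model`, `criterion`, and the two optimizers are assumed to be constructed as in the script, and for simplicity it clips all parameters where the original clips only the non-architecture weights via `model.module.base_parameters()`:

```python
import torch

def search_step(model, criterion, base_optimizer, arch_optimizer,
                train_batch, valid_batch, grad_clip=5.0):
    inputs, targets = train_batch
    input_search, target_search = valid_batch

    # 1) architecture step: minimize the validation loss w.r.t. the
    #    architecture parameters held by arch_optimizer
    arch_optimizer.zero_grad()
    arch_loss = criterion(model(input_search), target_search)
    arch_loss.backward()
    arch_optimizer.step()

    # 2) weight step: minimize the training loss w.r.t. the network
    #    weights, with gradient clipping as in the original loop
    base_optimizer.zero_grad()
    loss = criterion(model(inputs), targets)
    loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
    base_optimizer.step()
    return loss.item()
```

The per-epoch checkpoint saved in main() stores everything needed to resume. A minimal resume sketch, keyed to the torch.save() call above (object construction is assumed to happen as in the script):

```python
ckpt = torch.load(checkpoint_path)
start_epoch = ckpt['epoch']        # 'epoch + 1' was stored
genotypes   = ckpt['genotypes']    # per-epoch genotype history
basemodel.load_state_dict(ckpt['state_dict'])
base_optimizer.load_state_dict(ckpt['base_optimizer'])
arch_optimizer.load_state_dict(ckpt['arch_optimizer'])
base_scheduler.load_state_dict(ckpt['base_scheduler'])
```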
| @@ -1,7 +1,6 @@ | |||||||
| import os, sys, time | import os, sys, time | ||||||
| from copy import deepcopy | from copy import deepcopy | ||||||
| import torch | import torch | ||||||
| import torchvision.datasets as dset |  | ||||||
| import torchvision.transforms as transforms | import torchvision.transforms as transforms | ||||||
|  |  | ||||||
|  |  | ||||||
|   | |||||||
| @@ -2,7 +2,6 @@ import os, sys, time | |||||||
| from copy import deepcopy | from copy import deepcopy | ||||||
| import torch | import torch | ||||||
| import torch.nn as nn | import torch.nn as nn | ||||||
| import torchvision.datasets as dset |  | ||||||
| import torchvision.transforms as transforms | import torchvision.transforms as transforms | ||||||
|  |  | ||||||
|  |  | ||||||
|   | |||||||
							
								
								
									
output/.gitignore (new file, 1 line, vendored)
							| @@ -0,0 +1 @@ | |||||||
|  | * | ||||||
							
								
								
									
scripts-cluster/job-script.sh (new file, 29 lines)
							| @@ -0,0 +1,29 @@ | |||||||
|  | #!/bin/bash | ||||||
|  | # | ||||||
|  | echo "CHECK-DATA-DIR START" | ||||||
|  | cifar_dir="./data/data/cifar.python" | ||||||
|  | if [ -d ${cifar_dir} ]; then | ||||||
|  |   echo "Find cifar-dir: "${cifar_dir} | ||||||
|  | else | ||||||
|  |   echo "Can not find cifar-dir: "${cifar_dir} | ||||||
|  |   exit 1 | ||||||
|  | fi | ||||||
|  | echo "CHECK-DATA-DIR DONE" | ||||||
|  |  | ||||||
|  | sh /home/HGCP_Program/software-install/afs_mount/bin/afs_mount.sh \ | ||||||
|  |     COMM_KM_Data COMM_km_2018 \ | ||||||
|  |     `pwd`/hadoop-data \ | ||||||
|  |     afs://xingtian.afs.baidu.com:9902/user/COMM_KM_Data/dongxuanyi/datasets | ||||||
|  |  | ||||||
|  | echo "PWD: " `pwd` | ||||||
|  | echo "files::  " `ls` | ||||||
|  | echo "CUDA_VISIBLE_DEVICES: " ${CUDA_VISIBLE_DEVICES} | ||||||
|  |  | ||||||
|  | # config python | ||||||
|  | PYTHON_ENV=py36_pytorch1.0_env0.1.3.tar.gz | ||||||
|  | wget -e "http_proxy=cp01-sys-hic-gpu-02.cp01:8888" http://cp01-sys-hic-gpu-02.cp01/HGCP_DEMO/$PYTHON_ENV > screen.log 2>&1 | ||||||
|  | tar xzf $PYTHON_ENV | ||||||
|  |  | ||||||
|  | alias python="./env/bin/python" | ||||||
|  |  | ||||||
|  | echo "Python:  " `which python` | ||||||
							
								
								
									
scripts-cluster/submit.sh (new file, 44 lines)
							| @@ -0,0 +1,44 @@ | |||||||
|  | #!/bin/bash | ||||||
|  | # bash ./scripts-cluster/submit.sh ${QUEUE} ${JOB-NAME} ${GPUs} ${CMD} | ||||||
|  | #find -name "._*" | xargs rm -rf | ||||||
|  | ODIR=$(pwd) | ||||||
|  | FDIR=$(cd $(dirname $0); pwd) | ||||||
|  | echo "Bash-Dir : "${ODIR} | ||||||
|  | echo "File-Dir : "${FDIR} | ||||||
|  | echo "File-Name: "${0} | ||||||
|  |  | ||||||
|  | if [ "$#" -ne 4 ] ;then | ||||||
|  |   echo "Input illegal number of parameters " $# | ||||||
|  |   echo "Need 4 parameters for the queue-name, the job-name, and the number-of-GPUs" | ||||||
|  |   exit 1                | ||||||
|  | fi | ||||||
|  | find -name "__pycache__" | xargs rm -rf | ||||||
|  |  | ||||||
|  | QUEUE=$1 | ||||||
|  | NAME=$2 | ||||||
|  | GPUs=$3 | ||||||
|  | CMD=$4 | ||||||
|  | TIME=$(date +"%Y-%h-%d-%T") | ||||||
|  |  | ||||||
|  | JOB_SCRIPT="${FDIR}/tmps/job-${TIME}.sh" | ||||||
|  |  | ||||||
|  | cat ${FDIR}/job-script.sh > ${JOB_SCRIPT} | ||||||
|  | echo ${CMD}              >> ${JOB_SCRIPT} | ||||||
|  |  | ||||||
|  | HGCP_CLIENT_BIN="${HOME}/.hgcp/software-install/HGCP_client/bin" | ||||||
|  |  | ||||||
|  |  | ||||||
|  | ${HGCP_CLIENT_BIN}/submit \ | ||||||
|  |     --hdfs afs://xingtian.afs.baidu.com:9902 \ | ||||||
|  |     --hdfs-user COMM_KM_Data \ | ||||||
|  |     --hdfs-passwd COMM_km_2018 \ | ||||||
|  |     --hdfs-path /user/COMM_KM_Data/dongxuanyi/logs \ | ||||||
|  |     --file-dir ./ \ | ||||||
|  |     --job-name ${NAME} \ | ||||||
|  |     --queue-name ${QUEUE} \ | ||||||
|  |     --num-nodes 1 \ | ||||||
|  |     --num-task-pernode 1 \ | ||||||
|  |     --gpu-pnode ${GPUs} \ | ||||||
|  |     --time-limit 0 \ | ||||||
|  |     --job-script ${JOB_SCRIPT} | ||||||
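For reference, a hypothetical invocation of the submit script; the queue name and bracketed values are placeholders, and the fourth argument is the training command that gets appended to the generated job script:

```bash
# QUEUE NAME GPUs CMD -- all four arguments are required
bash ./scripts-cluster/submit.sh <queue-name> GDAS-search 1 \
     "bash ./scripts-cnn/train-cifar.sh 0 <arch> <dataset> <cutout>"
```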
							
								
								
									
scripts-cluster/tmps/.gitignore (new file, 1 line, vendored)
							| @@ -0,0 +1 @@ | |||||||
|  | * | ||||||
| @@ -1,31 +0,0 @@ | |||||||
| #!/usr/bin/env sh |  | ||||||
| # bash scripts-cnn/DMS-V-TrainV3.sh 1 |  | ||||||
| if [ "$#" -ne 1 ] ;then |  | ||||||
|   echo "Input illegal number of parameters " $# |  | ||||||
|   echo "Need 1 parameters for the GPUs and the epochs" |  | ||||||
|   exit 1                |  | ||||||
| fi  |  | ||||||
| if [ "$TORCH_HOME" = "" ]; then |  | ||||||
|   echo "Must set TORCH_HOME envoriment variable for data dir saving" |  | ||||||
|   exit 1 |  | ||||||
| else |  | ||||||
|   echo "TORCH_HOME : $TORCH_HOME" |  | ||||||
| fi |  | ||||||
|  |  | ||||||
| gpus=$1 |  | ||||||
| arch=acc2 |  | ||||||
| cutout=0 |  | ||||||
| dataset=cifar10 |  | ||||||
| epoch=200 |  | ||||||
| SAVED=./snapshots/NAS/ACC-V3-Search-${arch}-${dataset}-cut${cutout}-${epoch}-E600 |  | ||||||
|  |  | ||||||
| CUDA_VISIBLE_DEVICES=${gpus} python ./exps-cnn/GDAS-Search.py \ |  | ||||||
| 	--data_path $TORCH_HOME/cifar.python \ |  | ||||||
| 	--arch ${arch} --dataset ${dataset} --batch_size 128 \ |  | ||||||
| 	--save_path ${SAVED} \ |  | ||||||
| 	--learning_rate_max 0.01 --learning_rate_min 0.001 --momentum 0.9 --weight_decay 0.0003 \ |  | ||||||
| 	--epochs ${epoch} --cutout ${cutout} --validate --grad_clip 5 \ |  | ||||||
| 	--init_channels 16 --layers 8 \ |  | ||||||
| 	--tau_max 10 --tau_min 1 \ |  | ||||||
| 	--model_config ./configs/nas-cifar-cos-cut.config \ |  | ||||||
| 	--print_freq 100 --workers 10 |  | ||||||
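The `--tau_max`/`--tau_min` pair in the deleted script above sets the range of the Gumbel-softmax temperature used during the GDAS search. The exact schedule lives in the (unshown) Python entry point, so treat the following linear anneal over the search epochs as an illustration only:

```python
def tau_at(epoch, total_epochs, tau_max=10.0, tau_min=1.0):
    # high tau -> soft, exploratory architecture sampling;
    # low tau  -> near one-hot (almost discrete) operation choices
    return tau_max - (tau_max - tau_min) * epoch / max(1, total_epochs - 1)
```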
| @@ -1,29 +0,0 @@ | |||||||
| #!/usr/bin/env sh |  | ||||||
| if [ "$#" -ne 2 ] ;then |  | ||||||
|   echo "Input illegal number of parameters " $# |  | ||||||
|   echo "Need 2 parameters for the GPUs and the network" |  | ||||||
|   exit 1                |  | ||||||
| fi  |  | ||||||
| if [ "$TORCH_HOME" = "" ]; then |  | ||||||
|   echo "Must set TORCH_HOME envoriment variable for data dir saving" |  | ||||||
|   exit 1 |  | ||||||
| else |  | ||||||
|   echo "TORCH_HOME : $TORCH_HOME" |  | ||||||
| fi |  | ||||||
|  |  | ||||||
| gpus=$1 |  | ||||||
| arch=$2 |  | ||||||
| cutout=0 |  | ||||||
| dataset=cifar10 |  | ||||||
| epoch=200 |  | ||||||
| SAVED=./snapshots/NAS/ACC-V2-Search-${arch}-${dataset}-cut${cutout}-${epoch}-E600 |  | ||||||
|  |  | ||||||
| CUDA_VISIBLE_DEVICES=${gpus} python ./exps-cnn/GDAS-Search.py \ |  | ||||||
| 	--data_path $TORCH_HOME/cifar.python \ |  | ||||||
| 	--arch ${arch} --dataset ${dataset} --batch_size 128 \ |  | ||||||
| 	--save_path ${SAVED} \ |  | ||||||
| 	--learning_rate_max 0.025 --learning_rate_min 0.001 --momentum 0.9 --weight_decay 0.0003 \ |  | ||||||
| 	--epochs ${epoch} --cutout ${cutout} --validate --grad_clip 5 \ |  | ||||||
| 	--init_channels 16 --layers 8 \ |  | ||||||
| 	--model_config ./configs/nas-cifar-cos-cut.config \ |  | ||||||
| 	--print_freq 100 --workers 10 |  | ||||||
| @@ -1,44 +0,0 @@ | |||||||
| #!/usr/bin/env sh |  | ||||||
| if [ "$#" -ne 3 ] ;then |  | ||||||
|   echo "Input illegal number of parameters " $# |  | ||||||
|   echo "Need 3 parameters for the GPUs and the network and the dataset" |  | ||||||
|   exit 1                |  | ||||||
| fi  |  | ||||||
| if [ "$TORCH_HOME" = "" ]; then |  | ||||||
|   echo "Must set TORCH_HOME envoriment variable for data dir saving" |  | ||||||
|   exit 1 |  | ||||||
| else |  | ||||||
|   echo "TORCH_HOME : $TORCH_HOME" |  | ||||||
| fi |  | ||||||
|  |  | ||||||
| gpus=$1 |  | ||||||
| arch=$2 |  | ||||||
| cutout=0 |  | ||||||
| dataset=$3 |  | ||||||
| epoch=50 |  | ||||||
| SAVED=./snapshots/NAS/Search-${arch}-${dataset}-cut${cutout}-${epoch} |  | ||||||
|  |  | ||||||
| if [ "$dataset" == "cifar10" ] ;then |  | ||||||
|   dataset_root=$TORCH_HOME/cifar.python |  | ||||||
|   print_freq=100 |  | ||||||
| elif [ "$dataset" == "cifar100" ] ;then |  | ||||||
|   dataset_root=$TORCH_HOME/cifar.python |  | ||||||
|   print_freq=100 |  | ||||||
| elif [ "$dataset" == "tiered" ] ;then |  | ||||||
|   dataset_root=$TORCH_HOME/tiered-imagenet |  | ||||||
|   print_freq=500 |  | ||||||
| else |  | ||||||
|   echo 'invalid dataset-name :'${dataset} |  | ||||||
|   exit 1 |  | ||||||
| fi |  | ||||||
|  |  | ||||||
| CUDA_VISIBLE_DEVICES=${gpus} python ./exps-cnn/DARTS-Search.py \ |  | ||||||
| 	--data_path ${dataset_root} \ |  | ||||||
| 	--arch ${arch} \ |  | ||||||
| 	--dataset ${dataset} --batch_size 64 \ |  | ||||||
| 	--save_path ${SAVED} \ |  | ||||||
| 	--learning_rate_max 0.025 --learning_rate_min 0.001 --momentum 0.9 --weight_decay 0.0003 \ |  | ||||||
| 	--epochs ${epoch} --cutout ${cutout} --validate --grad_clip 5 \ |  | ||||||
| 	--init_channels 16 --layers 8 \ |  | ||||||
| 	--model_config ./configs/nas-cifar-cos-cut.config \ |  | ||||||
| 	--print_freq ${print_freq} --workers 8 |  | ||||||
| @@ -17,9 +17,10 @@ arch=$2 | |||||||
| dataset=$3 | dataset=$3 | ||||||
| cutout=$4 | cutout=$4 | ||||||
| SAVED=./snapshots/NAS/${arch}-${dataset}-${cutout}-E600 | SAVED=./snapshots/NAS/${arch}-${dataset}-${cutout}-E600 | ||||||
|  | #--data_path $TORCH_HOME/cifar.python \ | ||||||
|  |  | ||||||
| CUDA_VISIBLE_DEVICES=${gpus} python ./exps-cnn/train_base.py \ | CUDA_VISIBLE_DEVICES=${gpus} python ./exps-cnn/train_base.py \ | ||||||
| 	--data_path $TORCH_HOME/cifar.python \ | 	--data_path ./data/data/cifar.python \ | ||||||
| 	--dataset ${dataset} --arch ${arch} \ | 	--dataset ${dataset} --arch ${arch} \ | ||||||
| 	--save_path ${SAVED} \ | 	--save_path ${SAVED} \ | ||||||
| 	--grad_clip 5 \ | 	--grad_clip 5 \ | ||||||
|   | |||||||
| @@ -1,21 +1,14 @@ | |||||||
| #!/usr/bin/env sh | #!/usr/bin/env sh | ||||||
| if [ "$#" -ne 2 ] ;then | if [ "$#" -ne 1 ] ;then | ||||||
|   echo "Input illegal number of parameters " $# |   echo "Input illegal number of parameters " $# | ||||||
|   echo "Need 2 parameters for the GPU and the architecture" |   echo "Need 1 parameters for the GPU and the architecture" | ||||||
|   exit 1                |   exit 1                | ||||||
| fi  | fi  | ||||||
| if [ "$TORCH_HOME" = "" ]; then |  | ||||||
|   echo "Must set TORCH_HOME envoriment variable for data dir saving" |  | ||||||
|   exit 1 |  | ||||||
| else |  | ||||||
|   echo "TORCH_HOME : $TORCH_HOME" |  | ||||||
| fi |  | ||||||
|  |  | ||||||
| gpus=$1 | arch=$1 | ||||||
| arch=$2 | SAVED=./output/NAS-RNN/Search-${arch}-PTB | ||||||
| SAVED=./snapshots/NAS-RNN/Search-${arch}-PTB |  | ||||||
|  |  | ||||||
| CUDA_VISIBLE_DEVICES=${gpus} python ./exps-rnn/train_rnn_base.py \ | python ./exps-rnn/train_rnn_base.py \ | ||||||
| 	--arch ${arch} \ | 	--arch ${arch} \ | ||||||
| 	--save_path ${SAVED} \ | 	--save_path ${SAVED} \ | ||||||
| 	--config_path ./configs/NAS-PTB-BASE.config \ | 	--config_path ./configs/NAS-PTB-BASE.config \ | ||||||
|   | |||||||
| @@ -1,21 +1,14 @@ | |||||||
| #!/usr/bin/env sh | #!/usr/bin/env sh | ||||||
| if [ "$#" -ne 2 ] ;then | if [ "$#" -ne 1 ] ;then | ||||||
|   echo "Input illegal number of parameters " $# |   echo "Input illegal number of parameters " $# | ||||||
|   echo "Need 2 parameters for the GPU and the architecture" |   echo "Need 1 parameters for the architectures" | ||||||
|   exit 1                |   exit 1                | ||||||
| fi  | fi  | ||||||
| if [ "$TORCH_HOME" = "" ]; then |  | ||||||
|   echo "Must set TORCH_HOME envoriment variable for data dir saving" |  | ||||||
|   exit 1 |  | ||||||
| else |  | ||||||
|   echo "TORCH_HOME : $TORCH_HOME" |  | ||||||
| fi |  | ||||||
|  |  | ||||||
| gpus=$1 | arch=$1 | ||||||
| arch=$2 | SAVED=./output/NAS-RNN/Search-${arch}-WT2 | ||||||
| SAVED=./snapshots/NAS-RNN/Search-${arch}-WT2 |  | ||||||
|  |  | ||||||
| CUDA_VISIBLE_DEVICES=${gpus} python ./exps-nas/rnn/train_rnn_base.py \ | python ./exps-rnn/train_rnn_base.py \ | ||||||
| 	--arch ${arch} \ | 	--arch ${arch} \ | ||||||
| 	--save_path ${SAVED} \ | 	--save_path ${SAVED} \ | ||||||
| 	--config_path ./configs/NAS-WT2-BASE.config \ | 	--config_path ./configs/NAS-WT2-BASE.config \ | ||||||
|   | |||||||