From 3734384b684bfeae67c20fdc9b7bd4061e503c19 Mon Sep 17 00:00:00 2001
From: Xuanyi Dong <280835372@qq.com>
Date: Sat, 30 Mar 2019 00:50:18 +0800
Subject: [PATCH] Remove obsolete TEMP and CNN search scripts; add cluster
 job-submission scripts; select GPUs for RNN scripts via CUDA_VISIBLE_DEVICES
---
 README.md | 19 +-
 TEMP/DMS-V-Train.sh | 30 ---
 TEMP/README.md | 57 ------
 TEMP/TRAIN-BASE.sh | 30 ---
 TEMP/batch-base-model.sh | 23 ---
 TEMP/batch-base-search.sh | 19 --
 TEMP/meta-search.sh | 30 ---
 TEMP/search-acc-simple.sh | 29 ---
 TEMP/search-acc-v2-E150.sh | 29 ---
 TEMP/search-acc-v2-E200.sh | 29 ---
 TEMP/search-acc-v2-E300.sh | 29 ---
 TEMP/search-acc-v2-E50.sh | 29 ---
 TEMP/search-acc-v2.sh | 29 ---
 TEMP/search.sh | 45 -----
 TEMP/vis.sh | 9 -
 exps-cnn/DARTS-Search.py | 313 -------------------------------
 exps-cnn/GDAS-Search.py | 310 ------------------------------
 exps-cnn/train_utils.py | 1 -
 exps-cnn/train_utils_imagenet.py | 1 -
 output/.gitignore | 1 +
 scripts-cluster/job-script.sh | 29 +++
 scripts-cluster/submit.sh | 44 +++++
 scripts-cluster/tmps/.gitignore | 1 +
 scripts-cnn/DMS-V-TrainV3.sh | 31 ---
 scripts-cnn/search-acc-v2.sh | 29 ---
 scripts-cnn/search.sh | 44 -----
 scripts-cnn/train-cifar.sh | 3 +-
 scripts-rnn/train-PTB.sh | 17 +-
 scripts-rnn/train-WT2.sh | 17 +-
 29 files changed, 93 insertions(+), 1184 deletions(-)
 delete mode 100644 TEMP/DMS-V-Train.sh
 delete mode 100644 TEMP/README.md
 delete mode 100644 TEMP/TRAIN-BASE.sh
 delete mode 100644 TEMP/batch-base-model.sh
 delete mode 100644 TEMP/batch-base-search.sh
 delete mode 100644 TEMP/meta-search.sh
 delete mode 100644 TEMP/search-acc-simple.sh
 delete mode 100644 TEMP/search-acc-v2-E150.sh
 delete mode 100644 TEMP/search-acc-v2-E200.sh
 delete mode 100644 TEMP/search-acc-v2-E300.sh
 delete mode 100644 TEMP/search-acc-v2-E50.sh
 delete mode 100644 TEMP/search-acc-v2.sh
 delete mode 100644 TEMP/search.sh
 delete mode 100644 TEMP/vis.sh
 delete mode 100644 exps-cnn/DARTS-Search.py
 delete mode 100644 exps-cnn/GDAS-Search.py
 create mode 100644 output/.gitignore
 create mode 100644 scripts-cluster/job-script.sh
 create mode 100644 scripts-cluster/submit.sh
 create mode 100644 scripts-cluster/tmps/.gitignore
 delete mode 100644 scripts-cnn/DMS-V-TrainV3.sh
 delete mode 100644 scripts-cnn/search-acc-v2.sh
 delete mode 100644 scripts-cnn/search.sh

diff --git a/README.md b/README.md
index 79ddf02..4eea8c4 100644
--- a/README.md
+++ b/README.md
@@ -13,13 +13,6 @@ conda install pytorch torchvision cuda100 -c pytorch
 
 ## Algorithm
 
-Searching CNNs
-```
-bash ./scripts-cnn/search.sh 1 base cifar10
-bash ./scripts-cnn/DMS-V-TrainV3.sh 1
-bash ./scripts-cnn/search-acc-v2.sh 3 acc2
-```
-
 Train the searched CNN on CIFAR
 ```
 bash ./scripts-cnn/train-cifar.sh 0 GDAS_FG cifar10 cut
@@ -36,10 +29,10 @@ bash ./scripts-cnn/train-imagenet.sh 0 GDAS_V1 50 14
 
 Train the searched RNN
 ```
-bash ./scripts-rnn/train-PTB.sh 0 DARTS_V1
-bash ./scripts-rnn/train-PTB.sh 0 DARTS_V2
-bash ./scripts-rnn/train-PTB.sh 0 GDAS
-bash ./scripts-rnn/train-WT2.sh 0 DARTS_V1
-bash ./scripts-rnn/train-WT2.sh 0 DARTS_V2
-bash ./scripts-rnn/train-WT2.sh 0 GDAS
+CUDA_VISIBLE_DEVICES=0 bash ./scripts-rnn/train-PTB.sh DARTS_V1
+CUDA_VISIBLE_DEVICES=0 bash ./scripts-rnn/train-PTB.sh DARTS_V2
+CUDA_VISIBLE_DEVICES=0 bash ./scripts-rnn/train-PTB.sh GDAS
+CUDA_VISIBLE_DEVICES=0 bash ./scripts-rnn/train-WT2.sh DARTS_V1
+CUDA_VISIBLE_DEVICES=0 bash ./scripts-rnn/train-WT2.sh DARTS_V2
+CUDA_VISIBLE_DEVICES=0 bash ./scripts-rnn/train-WT2.sh GDAS
 ```
diff --git a/TEMP/DMS-V-Train.sh b/TEMP/DMS-V-Train.sh
deleted file mode 100644
index 456f90c..0000000
--- a/TEMP/DMS-V-Train.sh
+++ /dev/null
@@ -1,30 +0,0 @@
-#!/usr/bin/env sh
-if 
[ "$#" -ne 3 ] ;then - echo "Input illegal number of parameters " $# - echo "Need 3 parameters for the GPUs and the epochs and the cutout" - exit 1 -fi -if [ "$TORCH_HOME" = "" ]; then - echo "Must set TORCH_HOME envoriment variable for data dir saving" - exit 1 -else - echo "TORCH_HOME : $TORCH_HOME" -fi - -gpus=$1 -arch=acc2 -cutout=$3 -dataset=cifar10 -epoch=$2 -SAVED=./snapshots/NAS/ACC-V2-Search-${arch}-${dataset}-cut${cutout}-${epoch}-E600 - -CUDA_VISIBLE_DEVICES=${gpus} python ./exps-nas/acc_search_v2.py \ - --data_path $TORCH_HOME/cifar.python \ - --arch ${arch} --dataset ${dataset} --batch_size 128 \ - --save_path ${SAVED} \ - --learning_rate_max 0.05 --learning_rate_min 0.001 --momentum 0.9 --weight_decay 0.0003 \ - --epochs ${epoch} --cutout ${cutout} --validate --grad_clip 5 \ - --init_channels 16 --layers 8 \ - --tau_max 10 --tau_min 4 \ - --model_config ./configs/nas-cifar-cos.config \ - --print_freq 100 --workers 8 diff --git a/TEMP/README.md b/TEMP/README.md deleted file mode 100644 index d0a3b72..0000000 --- a/TEMP/README.md +++ /dev/null @@ -1,57 +0,0 @@ -# Neural-Architecture-Search - -### Baseline -``` -bash ./scripts-nas/search.sh 1 base cifar10 -bash ./scripts-nas/search.sh 1 share -bash ./scripts-nas/batch-base-search.sh 1 -bash ./scripts-nas/batch-base-model.sh 1 -``` - -### Meta -``` -bash ./scripts-nas/meta-search.sh 0 meta 20 5 -``` - -### Acceleration -``` -bash ./scripts-nas/search-acc-v2.sh 3 acc2 -bash ./scripts-nas/DMS-V-Train.sh 0 - -bash ./scripts-nas/search-acc-simple.sh 3 NetworkV2 -``` - -### Base Model Training -``` -bash ./scripts-nas/train-model.sh 3 AmoebaNet -bash ./scripts-nas/train-model.sh 3 NASNet -bash ./scripts-nas/train-model.sh 3 DARTS_V1 -bash ./scripts-nas/train-model-simple.sh 3 AmoebaNet -bash ./scripts-nas/train-imagenet.sh 3 DARTS_V2 50 14 - -bash scripts-nas/TRAIN-BASE.sh 0 PNASNet cifar10 nocut 48 11 -bash scripts-nas/TRAIN-BASE.sh 0 AmoebaNet cifar10 nocut 36 20 -bash scripts-nas/TRAIN-BASE.sh 0 NASNet cifar10 nocut 33 20 - -bash scripts-nas/TRAIN-BASE.sh 0 DMS_F1 cifar10 nocut 36 20 -bash scripts-nas/TRAIN-BASE.sh 0 DMS_V1 cifar10 nocut 36 20 -bash scripts-nas/TRAIN-BASE.sh 0 GDAS_CC cifar10 nocut 36 20 -bash scripts-nas/train-imagenet.sh 3 DMS_F1 52 14 -bash scripts-nas/train-imagenet.sh 3 DMS_V1 50 14 - - -bash scripts-nas/TRAIN-BASE.sh 0 DMS_V1 cifar10 nocut 36 20 -``` - - -### Visualization -``` -python ./exps-nas/vis-arch.py --checkpoint --save_dir -python ./exps-nas/cvpr-vis.py --save_dir ./snapshots/NAS-VIS/ -``` - -### Test datasets -``` -cd ./lib/datasets/ -python test_NLP.py -``` diff --git a/TEMP/TRAIN-BASE.sh b/TEMP/TRAIN-BASE.sh deleted file mode 100644 index 97c73b8..0000000 --- a/TEMP/TRAIN-BASE.sh +++ /dev/null @@ -1,30 +0,0 @@ -#!/usr/bin/env sh -# bash scripts-nas/TRAIN-BASE.sh 0 DMS_V1 cifar10 nocut init-channel layers -if [ "$#" -ne 6 ] ;then - echo "Input illegal number of parameters " $# - echo "Need 6 parameters for the GPUs, the architecture, the dataset, the config, the initial channel, and the number of layers" - exit 1 -fi -if [ "$TORCH_HOME" = "" ]; then - echo "Must set TORCH_HOME envoriment variable for data dir saving" - exit 1 -else - echo "TORCH_HOME : $TORCH_HOME" -fi - -gpus=$1 -arch=$2 -dataset=$3 -config=$4 -C=$5 -N=$6 -SAVED=./snapshots/NAS/${arch}-${C}-${N}-${dataset}-${config}-E600 - -CUDA_VISIBLE_DEVICES=${gpus} python ./exps-nas/train_base.py \ - --data_path $TORCH_HOME/cifar.python \ - --dataset ${dataset} --arch ${arch} \ - --save_path ${SAVED} \ - --grad_clip 5 \ - --init_channels ${C} 
--layers ${N} \ - --model_config ./configs/nas-cifar-cos-${config}.config \ - --print_freq 100 --workers 8 diff --git a/TEMP/batch-base-model.sh b/TEMP/batch-base-model.sh deleted file mode 100644 index 7cbd0d7..0000000 --- a/TEMP/batch-base-model.sh +++ /dev/null @@ -1,23 +0,0 @@ -#!/usr/bin/env sh -set -e -if [ "$#" -ne 1 ] ;then - echo "Input illegal number of parameters " $# - echo "Need 1 parameters for the GPUs" - exit 1 -fi -if [ "$TORCH_HOME" = "" ]; then - echo "Must set TORCH_HOME envoriment variable for data dir saving" - exit 1 -else - echo "TORCH_HOME : $TORCH_HOME" -fi - -gpus=$1 - -bash ./scripts-nas/train-model.sh ${gpus} AmoebaNet 0 - -bash ./scripts-nas/train-model.sh ${gpus} NASNet 0 - -bash ./scripts-nas/train-model.sh ${gpus} DARTS_V1 0 - -bash ./scripts-nas/train-model.sh ${gpus} DARTS_V2 0 diff --git a/TEMP/batch-base-search.sh b/TEMP/batch-base-search.sh deleted file mode 100644 index e353dac..0000000 --- a/TEMP/batch-base-search.sh +++ /dev/null @@ -1,19 +0,0 @@ -#!/usr/bin/env sh -if [ "$#" -ne 1 ] ;then - echo "Input illegal number of parameters " $# - echo "Need 1 parameters for the GPUs" - exit 1 -fi -if [ "$TORCH_HOME" = "" ]; then - echo "Must set TORCH_HOME envoriment variable for data dir saving" - exit 1 -else - echo "TORCH_HOME : $TORCH_HOME" -fi - -gpus=$1 -Times="1 2 3" - -for time in ${Times}; do - bash ./scripts-nas/search.sh ${gpus} -done diff --git a/TEMP/meta-search.sh b/TEMP/meta-search.sh deleted file mode 100644 index c2d6777..0000000 --- a/TEMP/meta-search.sh +++ /dev/null @@ -1,30 +0,0 @@ -#!/usr/bin/env sh -if [ "$#" -ne 4 ] ;then - echo "Input illegal number of parameters " $# - echo "Need 4 parameters for the GPUs and the network and N-way and K-shot" - exit 1 -fi -if [ "$TORCH_HOME" = "" ]; then - echo "Must set TORCH_HOME envoriment variable for data dir saving" - exit 1 -else - echo "TORCH_HOME : $TORCH_HOME" -fi - -gpus=$1 -arch=$2 -n_way=$3 -k_shot=$4 -cutout=16 -epoch=60 -SAVED=./snapshots/NAS/Meta-Search-${arch}-N${n_way}-K${k_shot}-cut${cutout}-${epoch} - -CUDA_VISIBLE_DEVICES=${gpus} python ./exps-nas/meta_search.py \ - --data_path $TORCH_HOME/tiered-imagenet \ - --arch ${arch} --n_way ${n_way} --k_shot ${k_shot} \ - --save_path ${SAVED} \ - --learning_rate_max 0.001 --learning_rate_min 0.0001 --momentum 0.9 --weight_decay 0.0003 \ - --epochs ${epoch} --cutout ${cutout} --validate --grad_clip 5 \ - --init_channels 16 --layers 8 \ - --model_config ./configs/nas-cifar-cos-cut.config \ - --print_freq 200 --workers 16 diff --git a/TEMP/search-acc-simple.sh b/TEMP/search-acc-simple.sh deleted file mode 100644 index 03b5efd..0000000 --- a/TEMP/search-acc-simple.sh +++ /dev/null @@ -1,29 +0,0 @@ -#!/usr/bin/env sh -if [ "$#" -ne 2 ] ;then - echo "Input illegal number of parameters " $# - echo "Need 2 parameters for the GPUs and the network" - exit 1 -fi -if [ "$TORCH_HOME" = "" ]; then - echo "Must set TORCH_HOME envoriment variable for data dir saving" - exit 1 -else - echo "TORCH_HOME : $TORCH_HOME" -fi - -gpus=$1 -arch=$2 -cutout=0 -dataset=cifar10 -epoch=100 -SAVED=./snapshots/NAS/ACC-V2-Search-${arch}-${dataset}-cut${cutout}-${epoch}-E100 - -CUDA_VISIBLE_DEVICES=${gpus} python ./exps-nas/acc_search_v2.py \ - --data_path $TORCH_HOME/cifar.python \ - --arch ${arch} --dataset ${dataset} --batch_size 128 \ - --save_path ${SAVED} \ - --learning_rate_max 0.025 --learning_rate_min 0.001 --momentum 0.9 --weight_decay 0.0003 \ - --epochs ${epoch} --cutout ${cutout} --validate --grad_clip 5 \ - --init_channels 16 --layers 8 \ - --model_config 
./configs/nas-cifar-cos-simple.config \ - --print_freq 100 --workers 8 diff --git a/TEMP/search-acc-v2-E150.sh b/TEMP/search-acc-v2-E150.sh deleted file mode 100644 index c25c29e..0000000 --- a/TEMP/search-acc-v2-E150.sh +++ /dev/null @@ -1,29 +0,0 @@ -#!/usr/bin/env sh -if [ "$#" -ne 2 ] ;then - echo "Input illegal number of parameters " $# - echo "Need 2 parameters for the GPUs and the network" - exit 1 -fi -if [ "$TORCH_HOME" = "" ]; then - echo "Must set TORCH_HOME envoriment variable for data dir saving" - exit 1 -else - echo "TORCH_HOME : $TORCH_HOME" -fi - -gpus=$1 -arch=$2 -cutout=0 -dataset=cifar10 -epoch=150 -SAVED=./snapshots/NAS/ACC-V2-Search-${arch}-${dataset}-cut${cutout}-${epoch}-E600 - -CUDA_VISIBLE_DEVICES=${gpus} python ./exps-nas/acc_search_v2.py \ - --data_path $TORCH_HOME/cifar.python \ - --arch ${arch} --dataset ${dataset} --batch_size 128 \ - --save_path ${SAVED} \ - --learning_rate_max 0.025 --learning_rate_min 0.001 --momentum 0.9 --weight_decay 0.0003 \ - --epochs ${epoch} --cutout ${cutout} --validate --grad_clip 5 \ - --init_channels 16 --layers 8 \ - --model_config ./configs/nas-cifar-cos.config \ - --print_freq 100 --workers 8 diff --git a/TEMP/search-acc-v2-E200.sh b/TEMP/search-acc-v2-E200.sh deleted file mode 100644 index 4f4812d..0000000 --- a/TEMP/search-acc-v2-E200.sh +++ /dev/null @@ -1,29 +0,0 @@ -#!/usr/bin/env sh -if [ "$#" -ne 2 ] ;then - echo "Input illegal number of parameters " $# - echo "Need 2 parameters for the GPUs and the network" - exit 1 -fi -if [ "$TORCH_HOME" = "" ]; then - echo "Must set TORCH_HOME envoriment variable for data dir saving" - exit 1 -else - echo "TORCH_HOME : $TORCH_HOME" -fi - -gpus=$1 -arch=$2 -cutout=0 -dataset=cifar10 -epoch=200 -SAVED=./snapshots/NAS/ACC-V2-Search-${arch}-${dataset}-cut${cutout}-${epoch}-E600 - -CUDA_VISIBLE_DEVICES=${gpus} python ./exps-nas/acc_search_v2.py \ - --data_path $TORCH_HOME/cifar.python \ - --arch ${arch} --dataset ${dataset} --batch_size 128 \ - --save_path ${SAVED} \ - --learning_rate_max 0.025 --learning_rate_min 0.001 --momentum 0.9 --weight_decay 0.0003 \ - --epochs ${epoch} --cutout ${cutout} --validate --grad_clip 5 \ - --init_channels 16 --layers 8 \ - --model_config ./configs/nas-cifar-cos.config \ - --print_freq 100 --workers 8 diff --git a/TEMP/search-acc-v2-E300.sh b/TEMP/search-acc-v2-E300.sh deleted file mode 100644 index 31b609c..0000000 --- a/TEMP/search-acc-v2-E300.sh +++ /dev/null @@ -1,29 +0,0 @@ -#!/usr/bin/env sh -if [ "$#" -ne 2 ] ;then - echo "Input illegal number of parameters " $# - echo "Need 2 parameters for the GPUs and the network" - exit 1 -fi -if [ "$TORCH_HOME" = "" ]; then - echo "Must set TORCH_HOME envoriment variable for data dir saving" - exit 1 -else - echo "TORCH_HOME : $TORCH_HOME" -fi - -gpus=$1 -arch=$2 -cutout=0 -dataset=cifar10 -epoch=300 -SAVED=./snapshots/NAS/ACC-V2-Search-${arch}-${dataset}-cut${cutout}-${epoch}-E600 - -CUDA_VISIBLE_DEVICES=${gpus} python ./exps-nas/acc_search_v2.py \ - --data_path $TORCH_HOME/cifar.python \ - --arch ${arch} --dataset ${dataset} --batch_size 128 \ - --save_path ${SAVED} \ - --learning_rate_max 0.025 --learning_rate_min 0.001 --momentum 0.9 --weight_decay 0.0003 \ - --epochs ${epoch} --cutout ${cutout} --validate --grad_clip 5 \ - --init_channels 16 --layers 8 \ - --model_config ./configs/nas-cifar-cos.config \ - --print_freq 100 --workers 8 diff --git a/TEMP/search-acc-v2-E50.sh b/TEMP/search-acc-v2-E50.sh deleted file mode 100644 index 5154eb1..0000000 --- a/TEMP/search-acc-v2-E50.sh +++ /dev/null @@ -1,29 
+0,0 @@ -#!/usr/bin/env sh -if [ "$#" -ne 2 ] ;then - echo "Input illegal number of parameters " $# - echo "Need 2 parameters for the GPUs and the network" - exit 1 -fi -if [ "$TORCH_HOME" = "" ]; then - echo "Must set TORCH_HOME envoriment variable for data dir saving" - exit 1 -else - echo "TORCH_HOME : $TORCH_HOME" -fi - -gpus=$1 -arch=$2 -cutout=0 -dataset=cifar10 -epoch=50 -SAVED=./snapshots/NAS/ACC-V2-Search-${arch}-${dataset}-cut${cutout}-${epoch}-E600 - -CUDA_VISIBLE_DEVICES=${gpus} python ./exps-nas/acc_search_v2.py \ - --data_path $TORCH_HOME/cifar.python \ - --arch ${arch} --dataset ${dataset} --batch_size 128 \ - --save_path ${SAVED} \ - --learning_rate_max 0.025 --learning_rate_min 0.001 --momentum 0.9 --weight_decay 0.0003 \ - --epochs ${epoch} --cutout ${cutout} --validate --grad_clip 5 \ - --init_channels 16 --layers 8 \ - --model_config ./configs/nas-cifar-cos.config \ - --print_freq 100 --workers 8 diff --git a/TEMP/search-acc-v2.sh b/TEMP/search-acc-v2.sh deleted file mode 100644 index a30f0a6..0000000 --- a/TEMP/search-acc-v2.sh +++ /dev/null @@ -1,29 +0,0 @@ -#!/usr/bin/env sh -if [ "$#" -ne 2 ] ;then - echo "Input illegal number of parameters " $# - echo "Need 2 parameters for the GPUs and the network" - exit 1 -fi -if [ "$TORCH_HOME" = "" ]; then - echo "Must set TORCH_HOME envoriment variable for data dir saving" - exit 1 -else - echo "TORCH_HOME : $TORCH_HOME" -fi - -gpus=$1 -arch=$2 -cutout=0 -dataset=cifar10 -epoch=100 -SAVED=./snapshots/NAS/ACC-V2-Search-${arch}-${dataset}-cut${cutout}-${epoch}-E600 - -CUDA_VISIBLE_DEVICES=${gpus} python ./exps-nas/acc_search_v2.py \ - --data_path $TORCH_HOME/cifar.python \ - --arch ${arch} --dataset ${dataset} --batch_size 128 \ - --save_path ${SAVED} \ - --learning_rate_max 0.025 --learning_rate_min 0.001 --momentum 0.9 --weight_decay 0.0003 \ - --epochs ${epoch} --cutout ${cutout} --validate --grad_clip 5 \ - --init_channels 16 --layers 8 \ - --model_config ./configs/nas-cifar-cos.config \ - --print_freq 100 --workers 8 diff --git a/TEMP/search.sh b/TEMP/search.sh deleted file mode 100644 index c82f839..0000000 --- a/TEMP/search.sh +++ /dev/null @@ -1,45 +0,0 @@ -#!/usr/bin/env sh -if [ "$#" -ne 3 ] ;then - echo "Input illegal number of parameters " $# - echo "Need 3 parameters for the GPUs and the network and the dataset" - exit 1 -fi -if [ "$TORCH_HOME" = "" ]; then - echo "Must set TORCH_HOME envoriment variable for data dir saving" - exit 1 -else - echo "TORCH_HOME : $TORCH_HOME" -fi - -gpus=$1 -arch=$2 -cutout=0 -dataset=$3 -epoch=50 -SAVED=./snapshots/NAS/Search-${arch}-${dataset}-cut${cutout}-${epoch} - -if [ "$dataset" == "cifar10" ] ;then - dataset_root=$TORCH_HOME/cifar.python - print_freq=100 -elif [ "$dataset" == "cifar100" ] ;then - dataset_root=$TORCH_HOME/cifar.python - print_freq=100 -elif [ "$dataset" == "tiered" ] ;then - dataset_root=$TORCH_HOME/tiered-imagenet - print_freq=500 -else - echo 'invalid dataset-name :'${dataset} - exit 1 -fi - -CUDA_VISIBLE_DEVICES=${gpus} python ./exps-nas/train_search.py \ - --data_path ${dataset_root} \ - --arch ${arch} \ - --dataset ${dataset} --batch_size 64 \ - --save_path ${SAVED} \ - --learning_rate_max 0.025 --learning_rate_min 0.001 --momentum 0.9 --weight_decay 0.0003 \ - --epochs ${epoch} --cutout ${cutout} --validate --grad_clip 5 \ - --init_channels 16 --layers 8 \ - --manualSeed 3858 \ - --model_config ./configs/nas-cifar-cos.config \ - --print_freq ${print_freq} --workers 8 diff --git a/TEMP/vis.sh b/TEMP/vis.sh deleted file mode 100644 index 704917d..0000000 
--- a/TEMP/vis.sh +++ /dev/null @@ -1,9 +0,0 @@ -#!/usr/bin/env sh - -seeds="seed-8167 seed-908 seed-9242" -for seed in ${seeds}; do -python ./exps-nas/vis-arch.py --checkpoint ./snapshots/NAS/Search-cifar10-cut16-100/${seed}/checkpoint-search.pth \ - --save_dir ./snapshots/NAS-VIS/Search-cut16-100/${seed} -done - - diff --git a/exps-cnn/DARTS-Search.py b/exps-cnn/DARTS-Search.py deleted file mode 100644 index c6d6d65..0000000 --- a/exps-cnn/DARTS-Search.py +++ /dev/null @@ -1,313 +0,0 @@ -# DARTS First Order, Refer to https://github.com/quark0/darts -import os, sys, time, glob, random, argparse -import numpy as np -from copy import deepcopy -import torch -import torch.nn.functional as F -import torchvision.datasets as dset -import torch.backends.cudnn as cudnn -import torchvision.transforms as transforms -from pathlib import Path -lib_dir = (Path(__file__).parent / '..' / 'lib').resolve() -if str(lib_dir) not in sys.path: sys.path.insert(0, str(lib_dir)) -from utils import AverageMeter, time_string, convert_secs2time -from utils import print_log, obtain_accuracy -from utils import Cutout, count_parameters_in_MB -from datasets import TieredImageNet -from nas import return_alphas_str, Network, NetworkV1, NetworkF1 -from train_utils import main_procedure -from scheduler import load_config - -Networks = {'base': Network, 'share': NetworkV1, 'fix': NetworkF1} - -parser = argparse.ArgumentParser("CNN") -parser.add_argument('--data_path', type=str, help='Path to dataset') -parser.add_argument('--dataset', type=str, choices=['cifar10', 'cifar100', 'tiered'], help='Choose between Cifar10/100 and TieredImageNet.') -parser.add_argument('--arch', type=str, choices=Networks.keys(), help='Choose networks.') -parser.add_argument('--batch_size', type=int, help='the batch size') -parser.add_argument('--learning_rate_max', type=float, help='initial learning rate') -parser.add_argument('--learning_rate_min', type=float, help='minimum learning rate') -parser.add_argument('--momentum', type=float, help='momentum') -parser.add_argument('--weight_decay', type=float, help='weight decay') -parser.add_argument('--epochs', type=int, help='num of training epochs') -# architecture leraning rate -parser.add_argument('--arch_learning_rate', type=float, default=3e-4, help='learning rate for arch encoding') -parser.add_argument('--arch_weight_decay', type=float, default=1e-3, help='weight decay for arch encoding') -# -parser.add_argument('--init_channels', type=int, help='num of init channels') -parser.add_argument('--layers', type=int, help='total number of layers') -# -parser.add_argument('--cutout', type=int, help='cutout length, negative means no cutout') -parser.add_argument('--grad_clip', type=float, help='gradient clipping') -parser.add_argument('--model_config', type=str , help='the model configuration') - -# resume -parser.add_argument('--resume', type=str , help='the resume path') -parser.add_argument('--only_base',action='store_true', default=False, help='only train the searched model') -# split data -parser.add_argument('--validate', action='store_true', default=False, help='split train-data int train/val or not') -parser.add_argument('--train_portion', type=float, default=0.5, help='portion of training data') -# log -parser.add_argument('--workers', type=int, default=2, help='number of data loading workers (default: 2)') -parser.add_argument('--save_path', type=str, help='Folder to save checkpoints and log.') -parser.add_argument('--print_freq', type=int, help='print frequency (default: 200)') 
-parser.add_argument('--manualSeed', type=int, help='manual seed') -args = parser.parse_args() - -assert torch.cuda.is_available(), 'torch.cuda is not available' - -if args.manualSeed is None: - args.manualSeed = random.randint(1, 10000) -random.seed(args.manualSeed) -cudnn.benchmark = True -cudnn.enabled = True -torch.manual_seed(args.manualSeed) -torch.cuda.manual_seed_all(args.manualSeed) - - -def main(): - - # Init logger - args.save_path = os.path.join(args.save_path, 'seed-{:}'.format(args.manualSeed)) - if not os.path.isdir(args.save_path): - os.makedirs(args.save_path) - log = open(os.path.join(args.save_path, 'log-seed-{:}.txt'.format(args.manualSeed)), 'w') - print_log('save path : {}'.format(args.save_path), log) - state = {k: v for k, v in args._get_kwargs()} - print_log(state, log) - print_log("Random Seed: {}".format(args.manualSeed), log) - print_log("Python version : {}".format(sys.version.replace('\n', ' ')), log) - print_log("Torch version : {}".format(torch.__version__), log) - print_log("CUDA version : {}".format(torch.version.cuda), log) - print_log("cuDNN version : {}".format(cudnn.version()), log) - print_log("Num of GPUs : {}".format(torch.cuda.device_count()), log) - args.dataset = args.dataset.lower() - - # Mean + Std - if args.dataset == 'cifar10': - mean = [x / 255 for x in [125.3, 123.0, 113.9]] - std = [x / 255 for x in [63.0, 62.1, 66.7]] - elif args.dataset == 'cifar100': - mean = [x / 255 for x in [129.3, 124.1, 112.4]] - std = [x / 255 for x in [68.2, 65.4, 70.4]] - elif args.dataset == 'tiered': - mean, std = [0.485, 0.456, 0.406], [0.229, 0.224, 0.225] - else: - raise TypeError("Unknow dataset : {:}".format(args.dataset)) - # Data Argumentation - if args.dataset == 'cifar10' or args.dataset == 'cifar100': - lists = [transforms.RandomHorizontalFlip(), transforms.RandomCrop(32, padding=4), transforms.ToTensor(), - transforms.Normalize(mean, std)] - if args.cutout > 0 : lists += [Cutout(args.cutout)] - train_transform = transforms.Compose(lists) - test_transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize(mean, std)]) - elif args.dataset == 'tiered': - lists = [transforms.RandomHorizontalFlip(), transforms.RandomCrop(80, padding=4), transforms.ToTensor(), transforms.Normalize(mean, std)] - if args.cutout > 0 : lists += [Cutout(args.cutout)] - train_transform = transforms.Compose(lists) - test_transform = transforms.Compose([transforms.CenterCrop(80), transforms.ToTensor(), transforms.Normalize(mean, std)]) - else: - raise TypeError("Unknow dataset : {:}".format(args.dataset)) - # Datasets - if args.dataset == 'cifar10': - train_data = dset.CIFAR10(args.data_path, train= True, transform=train_transform, download=True) - test_data = dset.CIFAR10(args.data_path, train=False, transform=test_transform , download=True) - num_classes, head = 10, 'cifar' - elif args.dataset == 'cifar100': - train_data = dset.CIFAR100(args.data_path, train= True, transform=train_transform, download=True) - test_data = dset.CIFAR100(args.data_path, train=False, transform=test_transform , download=True) - num_classes, head = 100, 'cifar' - elif args.dataset == 'tiered': - train_data = TieredImageNet(args.data_path, 'train-val', train_transform) - test_data = None - num_classes, head = train_data.n_classes, 'imagenet' - else: - raise TypeError("Unknow dataset : {:}".format(args.dataset)) - # Data Loader - if args.validate: - indices = list(range(len(train_data))) - split = int(args.train_portion * len(indices)) - random.shuffle(indices) - train_loader = 
torch.utils.data.DataLoader(train_data, batch_size=args.batch_size, - sampler=torch.utils.data.sampler.SubsetRandomSampler(indices[:split]), - pin_memory=True, num_workers=args.workers) - test_loader = torch.utils.data.DataLoader(train_data, batch_size=args.batch_size, - sampler=torch.utils.data.sampler.SubsetRandomSampler(indices[split:]), - pin_memory=True, num_workers=args.workers) - else: - train_loader = torch.utils.data.DataLoader(train_data, batch_size=args.batch_size, shuffle=True, num_workers=args.workers, pin_memory=True) - test_loader = torch.utils.data.DataLoader(test_data, batch_size=args.batch_size, shuffle=True, num_workers=args.workers, pin_memory=True) - - # network and criterion - criterion = torch.nn.CrossEntropyLoss().cuda() - basemodel = Networks[args.arch](args.init_channels, num_classes, args.layers, head=head) - model = torch.nn.DataParallel(basemodel).cuda() - print_log("Network : {:}".format(model), log) - print_log("Parameter size = {:.3f} MB".format(count_parameters_in_MB(basemodel.base_parameters())), log) - print_log("Train-transformation : {:}\nTest--transformation : {:}\nClass number : {:}".format(train_transform, test_transform, num_classes), log) - - # optimizer and LR-scheduler - base_optimizer = torch.optim.SGD (basemodel.base_parameters(), args.learning_rate_max, momentum=args.momentum, weight_decay=args.weight_decay) - base_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(base_optimizer, float(args.epochs), eta_min=args.learning_rate_min) - arch_optimizer = torch.optim.Adam(basemodel.arch_parameters(), lr=args.arch_learning_rate, betas=(0.5, 0.999), weight_decay=args.arch_weight_decay) - - # snapshot - checkpoint_path = os.path.join(args.save_path, 'checkpoint-search.pth') - if args.resume is not None and os.path.isfile(args.resume): - checkpoint = torch.load(args.resume) - start_epoch = checkpoint['epoch'] - basemodel.load_state_dict( checkpoint['state_dict'] ) - base_optimizer.load_state_dict( checkpoint['base_optimizer'] ) - arch_optimizer.load_state_dict( checkpoint['arch_optimizer'] ) - base_scheduler.load_state_dict( checkpoint['base_scheduler'] ) - genotypes = checkpoint['genotypes'] - print_log('Load resume from {:} with start-epoch = {:}'.format(args.resume, start_epoch), log) - elif os.path.isfile(checkpoint_path): - checkpoint = torch.load(checkpoint_path) - start_epoch = checkpoint['epoch'] - basemodel.load_state_dict( checkpoint['state_dict'] ) - base_optimizer.load_state_dict( checkpoint['base_optimizer'] ) - arch_optimizer.load_state_dict( checkpoint['arch_optimizer'] ) - base_scheduler.load_state_dict( checkpoint['base_scheduler'] ) - genotypes = checkpoint['genotypes'] - print_log('Load checkpoint from {:} with start-epoch = {:}'.format(checkpoint_path, start_epoch), log) - else: - start_epoch, genotypes = 0, {} - print_log('Train model-search from scratch.', log) - - config = load_config(args.model_config) - - if args.only_base: - print_log('---- Only Train the Searched Model ----', log) - main_procedure(config, args.dataset, args.data_path, args, basemodel.genotype(), 36, 20, log) - return - - # Main loop - start_time, epoch_time, total_train_time = time.time(), AverageMeter(), 0 - for epoch in range(start_epoch, args.epochs): - base_scheduler.step() - - need_time = convert_secs2time(epoch_time.val * (args.epochs-epoch), True) - print_log('\n==>>{:s} [Epoch={:03d}/{:03d}] {:s} [LR={:6.4f} ~ {:6.4f}] [Batch={:d}]'.format(time_string(), epoch, args.epochs, need_time, min(base_scheduler.get_lr()), max(base_scheduler.get_lr()), 
args.batch_size), log) - - genotype = basemodel.genotype() - print_log('genotype = {:}'.format(genotype), log) - - print_log('{:03d}/{:03d} alphas :\n{:}'.format(epoch, args.epochs, return_alphas_str(basemodel)), log) - - # training - train_acc1, train_acc5, train_obj, train_time \ - = train(train_loader, test_loader, model, criterion, base_optimizer, arch_optimizer, epoch, log) - total_train_time += train_time - # validation - valid_acc1, valid_acc5, valid_obj = infer(test_loader, model, criterion, epoch, log) - print_log('Base-Search : {:03d}/{:03d} : Train-Acc={:.3f}, Test-Acc={:.3f}'.format(epoch, args.epochs, train_acc1, valid_acc1), log) - # save genotype - genotypes[epoch] = basemodel.genotype() - # save checkpoint - torch.save({'epoch' : epoch + 1, - 'args' : deepcopy(args), - 'state_dict': basemodel.state_dict(), - 'genotypes' : genotypes, - 'base_optimizer' : base_optimizer.state_dict(), - 'arch_optimizer' : arch_optimizer.state_dict(), - 'base_scheduler' : base_scheduler.state_dict()}, - checkpoint_path) - print_log('----> Save into {:}'.format(checkpoint_path), log) - - # measure elapsed time - epoch_time.update(time.time() - start_time) - start_time = time.time() - - print_log('Finish with training time = {:}'.format( convert_secs2time(total_train_time, True) ), log) - - # clear GPU cache - #torch.cuda.empty_cache() - #main_procedure(config, 'cifar10', os.environ['TORCH_HOME'] + '/cifar.python', args, basemodel.genotype(), 36, 20, log) - log.close() - - -def train(train_queue, valid_queue, model, criterion, base_optimizer, arch_optimizer, epoch, log): - data_time, batch_time = AverageMeter(), AverageMeter() - objs, top1, top5 = AverageMeter(), AverageMeter(), AverageMeter() - model.train() - - valid_iter = iter(valid_queue) - end = time.time() - for step, (inputs, targets) in enumerate(train_queue): - batch, C, H, W = inputs.size() - - #inputs, targets = inputs.cuda(), targets.cuda(non_blocking=True) - targets = targets.cuda(non_blocking=True) - data_time.update(time.time() - end) - - # get a random minibatch from the search queue with replacement - try: - input_search, target_search = next(valid_iter) - except: - valid_iter = iter(valid_queue) - input_search, target_search = next(valid_iter) - - target_search = target_search.cuda(non_blocking=True) - - # update the architecture - arch_optimizer.zero_grad() - output_search = model(input_search) - arch_loss = criterion(output_search, target_search) - arch_loss.backward() - arch_optimizer.step() - - # update the parameters - base_optimizer.zero_grad() - logits = model(inputs) - loss = criterion(logits, targets) - - loss.backward() - torch.nn.utils.clip_grad_norm_(model.module.base_parameters(), args.grad_clip) - base_optimizer.step() - - prec1, prec5 = obtain_accuracy(logits.data, targets.data, topk=(1, 5)) - objs.update(loss.item() , batch) - top1.update(prec1.item(), batch) - top5.update(prec5.item(), batch) - - # measure elapsed time - batch_time.update(time.time() - end) - end = time.time() - - if step % args.print_freq == 0 or (step+1) == len(train_queue): - Sstr = ' TRAIN-SEARCH ' + time_string() + ' Epoch: [{:03d}][{:03d}/{:03d}]'.format(epoch, step, len(train_queue)) - Tstr = 'Time {batch_time.val:.2f} ({batch_time.avg:.2f}) Data {data_time.val:.2f} ({data_time.avg:.2f})'.format(batch_time=batch_time, data_time=data_time) - Lstr = 'Loss {loss.val:.3f} ({loss.avg:.3f}) Prec@1 {top1.val:.2f} ({top1.avg:.2f}) Prec@5 {top5.val:.2f} ({top5.avg:.2f})'.format(loss=objs, top1=top1, top5=top5) - print_log(Sstr + ' ' + Tstr + ' ' 
+ Lstr, log) - - return top1.avg, top5.avg, objs.avg, batch_time.sum - - -def infer(valid_queue, model, criterion, epoch, log): - objs, top1, top5 = AverageMeter(), AverageMeter(), AverageMeter() - - model.eval() - with torch.no_grad(): - for step, (inputs, targets) in enumerate(valid_queue): - batch, C, H, W = inputs.size() - targets = targets.cuda(non_blocking=True) - - logits = model(inputs) - loss = criterion(logits, targets) - - prec1, prec5 = obtain_accuracy(logits.data, targets.data, topk=(1, 5)) - objs.update(loss.item() , batch) - top1.update(prec1.item(), batch) - top5.update(prec5.item(), batch) - - if step % args.print_freq == 0 or (step+1) == len(valid_queue): - Sstr = ' VALID-SEARCH ' + time_string() + ' Epoch: [{:03d}][{:03d}/{:03d}]'.format(epoch, step, len(valid_queue)) - Lstr = 'Loss {loss.val:.3f} ({loss.avg:.3f}) Prec@1 {top1.val:.2f} ({top1.avg:.2f}) Prec@5 {top5.val:.2f} ({top5.avg:.2f})'.format(loss=objs, top1=top1, top5=top5) - print_log(Sstr + ' ' + Lstr, log) - - return top1.avg, top5.avg, objs.avg - - -if __name__ == '__main__': - main() diff --git a/exps-cnn/GDAS-Search.py b/exps-cnn/GDAS-Search.py deleted file mode 100644 index 1930201..0000000 --- a/exps-cnn/GDAS-Search.py +++ /dev/null @@ -1,310 +0,0 @@ -import os, sys, time, glob, random, argparse -import numpy as np -from copy import deepcopy -import torch -import torch.nn.functional as F -import torchvision.datasets as dset -import torch.backends.cudnn as cudnn -import torchvision.transforms as transforms -from pathlib import Path -lib_dir = (Path(__file__).parent / '..' / 'lib').resolve() -if str(lib_dir) not in sys.path: sys.path.insert(0, str(lib_dir)) -from utils import AverageMeter, time_string, convert_secs2time -from utils import print_log, obtain_accuracy -from utils import Cutout, count_parameters_in_MB -from nas import Network, NetworkACC2, NetworkV3, NetworkV4, NetworkV5, NetworkFACC1 -from nas import return_alphas_str -from train_utils import main_procedure -from scheduler import load_config - -Networks = {'base': Network, 'acc2': NetworkACC2, 'facc1': NetworkFACC1, 'NetworkV3': NetworkV3, 'NetworkV4': NetworkV4, 'NetworkV5': NetworkV5} - - -parser = argparse.ArgumentParser("cifar") -parser.add_argument('--data_path', type=str, help='Path to dataset') -parser.add_argument('--dataset', type=str, choices=['cifar10', 'cifar100'], help='Choose between Cifar10/100 and ImageNet.') -parser.add_argument('--arch', type=str, choices=Networks.keys(), help='Choose networks.') -parser.add_argument('--batch_size', type=int, help='the batch size') -parser.add_argument('--learning_rate_max', type=float, help='initial learning rate') -parser.add_argument('--learning_rate_min', type=float, help='minimum learning rate') -parser.add_argument('--tau_max', type=float, help='initial tau') -parser.add_argument('--tau_min', type=float, help='minimum tau') -parser.add_argument('--momentum', type=float, help='momentum') -parser.add_argument('--weight_decay', type=float, help='weight decay') -parser.add_argument('--epochs', type=int, help='num of training epochs') -# architecture leraning rate -parser.add_argument('--arch_learning_rate', type=float, default=3e-4, help='learning rate for arch encoding') -parser.add_argument('--arch_weight_decay', type=float, default=1e-3, help='weight decay for arch encoding') -# -parser.add_argument('--init_channels', type=int, help='num of init channels') -parser.add_argument('--layers', type=int, help='total number of layers') -# -parser.add_argument('--cutout', type=int, help='cutout 
length, negative means no cutout') -parser.add_argument('--grad_clip', type=float, help='gradient clipping') -parser.add_argument('--model_config', type=str , help='the model configuration') - -# resume -parser.add_argument('--resume', type=str , help='the resume path') -parser.add_argument('--only_base',action='store_true', default=False, help='only train the searched model') -# split data -parser.add_argument('--validate', action='store_true', default=False, help='split train-data int train/val or not') -parser.add_argument('--train_portion', type=float, default=0.5, help='portion of training data') -# log -parser.add_argument('--workers', type=int, default=2, help='number of data loading workers (default: 2)') -parser.add_argument('--save_path', type=str, help='Folder to save checkpoints and log.') -parser.add_argument('--print_freq', type=int, help='print frequency (default: 200)') -parser.add_argument('--manualSeed', type=int, help='manual seed') -args = parser.parse_args() - -assert torch.cuda.is_available(), 'torch.cuda is not available' - -if args.manualSeed is None: - args.manualSeed = random.randint(1, 10000) -random.seed(args.manualSeed) -cudnn.benchmark = True -cudnn.enabled = True -torch.manual_seed(args.manualSeed) -torch.cuda.manual_seed_all(args.manualSeed) - - -def main(): - - # Init logger - args.save_path = os.path.join(args.save_path, 'seed-{:}'.format(args.manualSeed)) - if not os.path.isdir(args.save_path): - os.makedirs(args.save_path) - log = open(os.path.join(args.save_path, 'log-seed-{:}.txt'.format(args.manualSeed)), 'w') - print_log('save path : {}'.format(args.save_path), log) - state = {k: v for k, v in args._get_kwargs()} - print_log(state, log) - print_log("Random Seed: {}".format(args.manualSeed), log) - print_log("Python version : {}".format(sys.version.replace('\n', ' ')), log) - print_log("Torch version : {}".format(torch.__version__), log) - print_log("CUDA version : {}".format(torch.version.cuda), log) - print_log("cuDNN version : {}".format(cudnn.version()), log) - print_log("Num of GPUs : {}".format(torch.cuda.device_count()), log) - args.dataset = args.dataset.lower() - - # Mean + Std - if args.dataset == 'cifar10': - mean = [x / 255 for x in [125.3, 123.0, 113.9]] - std = [x / 255 for x in [63.0, 62.1, 66.7]] - elif args.dataset == 'cifar100': - mean = [x / 255 for x in [129.3, 124.1, 112.4]] - std = [x / 255 for x in [68.2, 65.4, 70.4]] - else: - raise TypeError("Unknow dataset : {:}".format(args.dataset)) - # Data Argumentation - if args.dataset == 'cifar10' or args.dataset == 'cifar100': - lists = [transforms.RandomHorizontalFlip(), transforms.RandomCrop(32, padding=4), transforms.ToTensor(), - transforms.Normalize(mean, std)] - if args.cutout > 0 : lists += [Cutout(args.cutout)] - train_transform = transforms.Compose(lists) - test_transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize(mean, std)]) - else: - raise TypeError("Unknow dataset : {:}".format(args.dataset)) - # Datasets - if args.dataset == 'cifar10': - train_data = dset.CIFAR10(args.data_path, train= True, transform=train_transform, download=True) - test_data = dset.CIFAR10(args.data_path, train=False, transform=test_transform , download=True) - num_classes = 10 - elif args.dataset == 'cifar100': - train_data = dset.CIFAR100(args.data_path, train= True, transform=train_transform, download=True) - test_data = dset.CIFAR100(args.data_path, train=False, transform=test_transform , download=True) - num_classes = 100 - else: - raise TypeError("Unknow dataset : 
{:}".format(args.dataset)) - # Data Loader - if args.validate: - indices = list(range(len(train_data))) - split = int(args.train_portion * len(indices)) - random.shuffle(indices) - train_loader = torch.utils.data.DataLoader(train_data, batch_size=args.batch_size, - sampler=torch.utils.data.sampler.SubsetRandomSampler(indices[:split]), - pin_memory=True, num_workers=args.workers) - test_loader = torch.utils.data.DataLoader(train_data, batch_size=args.batch_size, - sampler=torch.utils.data.sampler.SubsetRandomSampler(indices[split:]), - pin_memory=True, num_workers=args.workers) - else: - train_loader = torch.utils.data.DataLoader(train_data, batch_size=args.batch_size, shuffle=True, num_workers=args.workers, pin_memory=True) - test_loader = torch.utils.data.DataLoader(test_data, batch_size=args.batch_size, shuffle=True, num_workers=args.workers, pin_memory=True) - - # network and criterion - criterion = torch.nn.CrossEntropyLoss().cuda() - basemodel = Networks[args.arch](args.init_channels, num_classes, args.layers) - model = torch.nn.DataParallel(basemodel).cuda() - print_log("Parameter size = {:.3f} MB".format(count_parameters_in_MB(basemodel.base_parameters())), log) - print_log("Train-transformation : {:}\nTest--transformation : {:}".format(train_transform, test_transform), log) - - # optimizer and LR-scheduler - base_optimizer = torch.optim.SGD (basemodel.base_parameters(), args.learning_rate_max, momentum=args.momentum, weight_decay=args.weight_decay) - #base_optimizer = torch.optim.Adam(basemodel.base_parameters(), lr=args.learning_rate_max, betas=(0.5, 0.999), weight_decay=args.weight_decay) - base_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(base_optimizer, float(args.epochs), eta_min=args.learning_rate_min) - arch_optimizer = torch.optim.Adam(basemodel.arch_parameters(), lr=args.arch_learning_rate, betas=(0.5, 0.999), weight_decay=args.arch_weight_decay) - - # snapshot - checkpoint_path = os.path.join(args.save_path, 'checkpoint-search.pth') - if args.resume is not None and os.path.isfile(args.resume): - checkpoint = torch.load(args.resume) - start_epoch = checkpoint['epoch'] - basemodel.load_state_dict( checkpoint['state_dict'] ) - base_optimizer.load_state_dict( checkpoint['base_optimizer'] ) - arch_optimizer.load_state_dict( checkpoint['arch_optimizer'] ) - base_scheduler.load_state_dict( checkpoint['base_scheduler'] ) - genotypes = checkpoint['genotypes'] - print_log('Load resume from {:} with start-epoch = {:}'.format(args.resume, start_epoch), log) - elif os.path.isfile(checkpoint_path): - checkpoint = torch.load(checkpoint_path) - start_epoch = checkpoint['epoch'] - basemodel.load_state_dict( checkpoint['state_dict'] ) - base_optimizer.load_state_dict( checkpoint['base_optimizer'] ) - arch_optimizer.load_state_dict( checkpoint['arch_optimizer'] ) - base_scheduler.load_state_dict( checkpoint['base_scheduler'] ) - genotypes = checkpoint['genotypes'] - print_log('Load checkpoint from {:} with start-epoch = {:}'.format(checkpoint_path, start_epoch), log) - else: - start_epoch, genotypes = 0, {} - print_log('Train model-search from scratch.', log) - - config = load_config(args.model_config) - - if args.only_base: - print_log('---- Only Train the Searched Model ----', log) - main_procedure(config, args.dataset, args.data_path, args, basemodel.genotype(), 36, 20, log) - return - - # Main loop - start_time, epoch_time, total_train_time = time.time(), AverageMeter(), 0 - for epoch in range(start_epoch, args.epochs): - base_scheduler.step() - - basemodel.set_tau( 
args.tau_max - epoch*1.0/args.epochs*(args.tau_max-args.tau_min) ) - #if epoch + 2 == args.epochs: - # torch.cuda.empty_cache() - # basemodel.set_gumbel(False) - - need_time = convert_secs2time(epoch_time.val * (args.epochs-epoch), True) - print_log('\n==>>{:s} [Epoch={:03d}/{:03d}] {:s} [LR={:6.4f} ~ {:6.4f}] [Batch={:d}], tau={:}'.format(time_string(), epoch, args.epochs, need_time, min(base_scheduler.get_lr()), max(base_scheduler.get_lr()), args.batch_size, basemodel.get_tau()), log) - - genotype = basemodel.genotype() - print_log('genotype = {:}'.format(genotype), log) - - print_log('{:03d}/{:03d} alphas :\n{:}'.format(epoch, args.epochs, return_alphas_str(basemodel)), log) - - # training - train_acc1, train_acc5, train_obj, train_time \ - = train(train_loader, test_loader, model, criterion, base_optimizer, arch_optimizer, epoch, log) - total_train_time += train_time - # validation - valid_acc1, valid_acc5, valid_obj = infer(test_loader, model, criterion, epoch, log) - print_log('{:03d}/{:03d}, Train-Accuracy = {:.2f}, Test-Accuracy = {:.2f}'.format(epoch, args.epochs, train_acc1, valid_acc1), log) - # save genotype - genotypes[epoch] = basemodel.genotype() - # save checkpoint - torch.save({'epoch' : epoch + 1, - 'args' : deepcopy(args), - 'state_dict': basemodel.state_dict(), - 'genotypes' : genotypes, - 'base_optimizer' : base_optimizer.state_dict(), - 'arch_optimizer' : arch_optimizer.state_dict(), - 'base_scheduler' : base_scheduler.state_dict()}, - checkpoint_path) - print_log('----> Save into {:}'.format(checkpoint_path), log) - - - # measure elapsed time - epoch_time.update(time.time() - start_time) - start_time = time.time() - - print_log('Finish with training time = {:}'.format( convert_secs2time(total_train_time, True) ), log) - - # clear GPU cache - #torch.cuda.empty_cache() - #main_procedure(config, args.dataset, args.data_path, args, basemodel.genotype(), 36, 20, log) - log.close() - - -def train(train_queue, valid_queue, model, criterion, base_optimizer, arch_optimizer, epoch, log): - data_time, batch_time = AverageMeter(), AverageMeter() - objs, top1, top5 = AverageMeter(), AverageMeter(), AverageMeter() - model.train() - - valid_iter = iter(valid_queue) - end = time.time() - for step, (inputs, targets) in enumerate(train_queue): - batch, C, H, W = inputs.size() - - #inputs, targets = inputs.cuda(), targets.cuda(non_blocking=True) - targets = targets.cuda(non_blocking=True) - - # get a random minibatch from the search queue with replacement - try: - input_search, target_search = next(valid_iter) - except: - valid_iter = iter(valid_queue) - input_search, target_search = next(valid_iter) - - target_search = target_search.cuda(non_blocking=True) - data_time.update(time.time() - end) - - # update the architecture - arch_optimizer.zero_grad() - output_search = model(input_search) - arch_loss = criterion(output_search, target_search) - arch_loss.backward() - arch_optimizer.step() - - # update the parameters - base_optimizer.zero_grad() - logits = model(inputs) - loss = criterion(logits, targets) - - loss.backward() - torch.nn.utils.clip_grad_norm_(model.module.base_parameters(), args.grad_clip) - base_optimizer.step() - - prec1, prec5 = obtain_accuracy(logits.data, targets.data, topk=(1, 5)) - objs.update(loss.item() , batch) - top1.update(prec1.item(), batch) - top5.update(prec5.item(), batch) - - # measure elapsed time - batch_time.update(time.time() - end) - end = time.time() - - if step % args.print_freq == 0 or (step+1) == len(train_queue): - Sstr = ' TRAIN-SEARCH ' + 
time_string() + ' Epoch: [{:03d}][{:03d}/{:03d}]'.format(epoch, step, len(train_queue)) - Tstr = 'Time {batch_time.val:.2f} ({batch_time.avg:.2f}) Data {data_time.val:.2f} ({data_time.avg:.2f})'.format(batch_time=batch_time, data_time=data_time) - Lstr = 'Loss {loss.val:.3f} ({loss.avg:.3f}) Prec@1 {top1.val:.2f} ({top1.avg:.2f}) Prec@5 {top5.val:.2f} ({top5.avg:.2f})'.format(loss=objs, top1=top1, top5=top5) - print_log(Sstr + ' ' + Tstr + ' ' + Lstr, log) - - return top1.avg, top5.avg, objs.avg, batch_time.sum - - -def infer(valid_queue, model, criterion, epoch, log): - objs, top1, top5 = AverageMeter(), AverageMeter(), AverageMeter() - - model.eval() - with torch.no_grad(): - for step, (inputs, targets) in enumerate(valid_queue): - batch, C, H, W = inputs.size() - targets = targets.cuda(non_blocking=True) - - logits = model(inputs) - loss = criterion(logits, targets) - - prec1, prec5 = obtain_accuracy(logits.data, targets.data, topk=(1, 5)) - objs.update(loss.item() , batch) - top1.update(prec1.item(), batch) - top5.update(prec5.item(), batch) - - if step % args.print_freq == 0 or (step+1) == len(valid_queue): - Sstr = ' VALID-SEARCH ' + time_string() + ' Epoch: [{:03d}][{:03d}/{:03d}]'.format(epoch, step, len(valid_queue)) - Lstr = 'Loss {loss.val:.3f} ({loss.avg:.3f}) Prec@1 {top1.val:.2f} ({top1.avg:.2f}) Prec@5 {top5.val:.2f} ({top5.avg:.2f})'.format(loss=objs, top1=top1, top5=top5) - print_log(Sstr + ' ' + Lstr, log) - - return top1.avg, top5.avg, objs.avg - - -if __name__ == '__main__': - main() diff --git a/exps-cnn/train_utils.py b/exps-cnn/train_utils.py index ca1a51d..c5e29fb 100644 --- a/exps-cnn/train_utils.py +++ b/exps-cnn/train_utils.py @@ -1,7 +1,6 @@ import os, sys, time from copy import deepcopy import torch -import torchvision.datasets as dset import torchvision.transforms as transforms diff --git a/exps-cnn/train_utils_imagenet.py b/exps-cnn/train_utils_imagenet.py index 6173c9a..76fdd42 100644 --- a/exps-cnn/train_utils_imagenet.py +++ b/exps-cnn/train_utils_imagenet.py @@ -2,7 +2,6 @@ import os, sys, time from copy import deepcopy import torch import torch.nn as nn -import torchvision.datasets as dset import torchvision.transforms as transforms diff --git a/output/.gitignore b/output/.gitignore new file mode 100644 index 0000000..72e8ffc --- /dev/null +++ b/output/.gitignore @@ -0,0 +1 @@ +* diff --git a/scripts-cluster/job-script.sh b/scripts-cluster/job-script.sh new file mode 100644 index 0000000..ffbce42 --- /dev/null +++ b/scripts-cluster/job-script.sh @@ -0,0 +1,29 @@ +#!/bin/bash +# +echo "CHECK-DATA-DIR START" +cifar_dir="./data/data/cifar.python" +if [ -d ${cifar_dir} ]; then + echo "Find cifar-dir: "${cifar_dir} +else + echo "Can not find cifar-dir: "${cifar_dir} + exit 1 +fi +echo "CHECK-DATA-DIR DONE" + +sh /home/HGCP_Program/software-install/afs_mount/bin/afs_mount.sh \ + COMM_KM_Data COMM_km_2018 \ + `pwd`/hadoop-data \ + afs://xingtian.afs.baidu.com:9902/user/COMM_KM_Data/dongxuanyi/datasets + +echo "PWD: " `pwd` +echo "files:: " `ls` +echo "CUDA_VISIBLE_DEVICES: " ${CUDA_VISIBLE_DEVICES} + +# config python +PYTHON_ENV=py36_pytorch1.0_env0.1.3.tar.gz +wget -e "http_proxy=cp01-sys-hic-gpu-02.cp01:8888" http://cp01-sys-hic-gpu-02.cp01/HGCP_DEMO/$PYTHON_ENV > screen.log 2>&1 +tar xzf $PYTHON_ENV + +alias python="./env/bin/python" + +echo "Python: " `which python` diff --git a/scripts-cluster/submit.sh b/scripts-cluster/submit.sh new file mode 100644 index 0000000..c0e0bd0 --- /dev/null +++ b/scripts-cluster/submit.sh @@ -0,0 +1,44 @@ +#!/bin/bash +# bash 
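./scripts-cluster/submit.sh ${QUEUE} ${JOB-NAME} ${GPUs} ${CMD}
+#
+# This script builds a one-off job script under tmps/ by concatenating
+# job-script.sh with the given command, then hands it to the HGCP submit
+# client below; the queue names, AFS paths, and credentials are specific
+# to the author's cluster.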
+#find -name "._*" | xargs rm -rf
+ODIR=$(pwd)
+FDIR=$(cd $(dirname $0); pwd)
+echo "Bash-Dir : "${ODIR}
+echo "File-Dir : "${FDIR}
+echo "File-Name: "${0}
+
+if [ "$#" -ne 4 ] ;then
+ echo "Input illegal number of parameters " $#
+ echo "Need 4 parameters for the queue name, the job name, the number of GPUs, and the command"
+ exit 1
+fi
+find -name "__pycache__" | xargs rm -rf
+
+QUEUE=$1
+NAME=$2
+GPUs=$3
+CMD=$4
+TIME=$(date +"%Y-%h-%d-%T")
+
+JOB_SCRIPT="${FDIR}/tmps/job-${TIME}.sh"
+
+cat ${FDIR}/job-script.sh > ${JOB_SCRIPT}
+echo ${CMD} >> ${JOB_SCRIPT}
+
+#exit 1
+HGCP_CLIENT_BIN="${HOME}/.hgcp/software-install/HGCP_client/bin"
+
+
+${HGCP_CLIENT_BIN}/submit \
+ --hdfs afs://xingtian.afs.baidu.com:9902 \
+ --hdfs-user COMM_KM_Data \
+ --hdfs-passwd COMM_km_2018 \
+ --hdfs-path /user/COMM_KM_Data/dongxuanyi/logs \
+ --file-dir ./ \
+ --job-name ${NAME} \
+ --queue-name ${QUEUE} \
+ --num-nodes 1 \
+ --num-task-pernode 1 \
+ --gpu-pnode ${GPUs} \
+ --time-limit 0 \
+ --job-script ${JOB_SCRIPT}
diff --git a/scripts-cluster/tmps/.gitignore b/scripts-cluster/tmps/.gitignore
new file mode 100644
index 0000000..72e8ffc
--- /dev/null
+++ b/scripts-cluster/tmps/.gitignore
@@ -0,0 +1 @@
+*
diff --git a/scripts-cnn/DMS-V-TrainV3.sh b/scripts-cnn/DMS-V-TrainV3.sh
deleted file mode 100644
index 1fe6ddd..0000000
--- a/scripts-cnn/DMS-V-TrainV3.sh
+++ /dev/null
@@ -1,31 +0,0 @@
-#!/usr/bin/env sh
-# bash scripts-cnn/DMS-V-TrainV3.sh 1
-if [ "$#" -ne 1 ] ;then
- echo "Input illegal number of parameters " $#
- echo "Need 1 parameters for the GPUs and the epochs"
- exit 1
-fi
-if [ "$TORCH_HOME" = "" ]; then
- echo "Must set TORCH_HOME envoriment variable for data dir saving"
- exit 1
-else
- echo "TORCH_HOME : $TORCH_HOME"
-fi
-
-gpus=$1
-arch=acc2
-cutout=0
-dataset=cifar10
-epoch=200
-SAVED=./snapshots/NAS/ACC-V3-Search-${arch}-${dataset}-cut${cutout}-${epoch}-E600
-
-CUDA_VISIBLE_DEVICES=${gpus} python ./exps-cnn/GDAS-Search.py \
- --data_path $TORCH_HOME/cifar.python \
- --arch ${arch} --dataset ${dataset} --batch_size 128 \
- --save_path ${SAVED} \
- --learning_rate_max 0.01 --learning_rate_min 0.001 --momentum 0.9 --weight_decay 0.0003 \
- --epochs ${epoch} --cutout ${cutout} --validate --grad_clip 5 \
- --init_channels 16 --layers 8 \
- --tau_max 10 --tau_min 1 \
- --model_config ./configs/nas-cifar-cos-cut.config \
- --print_freq 100 --workers 10
diff --git a/scripts-cnn/search-acc-v2.sh b/scripts-cnn/search-acc-v2.sh
deleted file mode 100644
index 9438f5c..0000000
--- a/scripts-cnn/search-acc-v2.sh
+++ /dev/null
@@ -1,29 +0,0 @@
-#!/usr/bin/env sh
-if [ "$#" -ne 2 ] ;then
- echo "Input illegal number of parameters " $#
- echo "Need 2 parameters for the GPUs and the network"
- exit 1
-fi
-if [ "$TORCH_HOME" = "" ]; then
- echo "Must set TORCH_HOME envoriment variable for data dir saving"
- exit 1
-else
- echo "TORCH_HOME : $TORCH_HOME"
-fi
-
-gpus=$1
-arch=$2
-cutout=0
-dataset=cifar10
-epoch=200
-SAVED=./snapshots/NAS/ACC-V2-Search-${arch}-${dataset}-cut${cutout}-${epoch}-E600
-
-CUDA_VISIBLE_DEVICES=${gpus} python ./exps-cnn/GDAS-Search.py \
- --data_path $TORCH_HOME/cifar.python \
- --arch ${arch} --dataset ${dataset} --batch_size 128 \
- --save_path ${SAVED} \
- --learning_rate_max 0.025 --learning_rate_min 0.001 --momentum 0.9 --weight_decay 0.0003 \
- --epochs ${epoch} --cutout ${cutout} --validate --grad_clip 5 \
- --init_channels 16 --layers 8 \
- --model_config ./configs/nas-cifar-cos-cut.config \
- --print_freq 100 --workers 10
diff 
--git a/scripts-cnn/search.sh b/scripts-cnn/search.sh
deleted file mode 100644
index eefcbca..0000000
--- a/scripts-cnn/search.sh
+++ /dev/null
@@ -1,44 +0,0 @@
-#!/usr/bin/env sh
-if [ "$#" -ne 3 ] ;then
- echo "Input illegal number of parameters " $#
- echo "Need 3 parameters for the GPUs and the network and the dataset"
- exit 1
-fi
-if [ "$TORCH_HOME" = "" ]; then
- echo "Must set TORCH_HOME envoriment variable for data dir saving"
- exit 1
-else
- echo "TORCH_HOME : $TORCH_HOME"
-fi
-
-gpus=$1
-arch=$2
-cutout=0
-dataset=$3
-epoch=50
-SAVED=./snapshots/NAS/Search-${arch}-${dataset}-cut${cutout}-${epoch}
-
-if [ "$dataset" == "cifar10" ] ;then
- dataset_root=$TORCH_HOME/cifar.python
- print_freq=100
-elif [ "$dataset" == "cifar100" ] ;then
- dataset_root=$TORCH_HOME/cifar.python
- print_freq=100
-elif [ "$dataset" == "tiered" ] ;then
- dataset_root=$TORCH_HOME/tiered-imagenet
- print_freq=500
-else
- echo 'invalid dataset-name :'${dataset}
- exit 1
-fi
-
-CUDA_VISIBLE_DEVICES=${gpus} python ./exps-cnn/DARTS-Search.py \
- --data_path ${dataset_root} \
- --arch ${arch} \
- --dataset ${dataset} --batch_size 64 \
- --save_path ${SAVED} \
- --learning_rate_max 0.025 --learning_rate_min 0.001 --momentum 0.9 --weight_decay 0.0003 \
- --epochs ${epoch} --cutout ${cutout} --validate --grad_clip 5 \
- --init_channels 16 --layers 8 \
- --model_config ./configs/nas-cifar-cos-cut.config \
- --print_freq ${print_freq} --workers 8
diff --git a/scripts-cnn/train-cifar.sh b/scripts-cnn/train-cifar.sh
index 695d96e..ad2e769 100644
--- a/scripts-cnn/train-cifar.sh
+++ b/scripts-cnn/train-cifar.sh
@@ -17,9 +17,10 @@ arch=$2
 dataset=$3
 cutout=$4
 SAVED=./snapshots/NAS/${arch}-${dataset}-${cutout}-E600
+#--data_path $TORCH_HOME/cifar.python \
 
 CUDA_VISIBLE_DEVICES=${gpus} python ./exps-cnn/train_base.py \
- --data_path $TORCH_HOME/cifar.python \
+ --data_path ./data/data/cifar.python \
 --dataset ${dataset} --arch ${arch} \
 --save_path ${SAVED} \
 --grad_clip 5 \
diff --git a/scripts-rnn/train-PTB.sh b/scripts-rnn/train-PTB.sh
index 661dced..e667fd6 100644
--- a/scripts-rnn/train-PTB.sh
+++ b/scripts-rnn/train-PTB.sh
@@ -1,21 +1,14 @@
 #!/usr/bin/env sh
-if [ "$#" -ne 2 ] ;then
+if [ "$#" -ne 1 ] ;then
 echo "Input illegal number of parameters " $#
- echo "Need 2 parameters for the GPU and the architecture"
+ echo "Need 1 parameter for the architecture"
 exit 1
 fi
-if [ "$TORCH_HOME" = "" ]; then
- echo "Must set TORCH_HOME envoriment variable for data dir saving"
- exit 1
-else
- echo "TORCH_HOME : $TORCH_HOME"
-fi
 
-gpus=$1
-arch=$2
-SAVED=./snapshots/NAS-RNN/Search-${arch}-PTB
+arch=$1
+SAVED=./output/NAS-RNN/Search-${arch}-PTB
 
-CUDA_VISIBLE_DEVICES=${gpus} python ./exps-rnn/train_rnn_base.py \
+python ./exps-rnn/train_rnn_base.py \
 --arch ${arch} \
 --save_path ${SAVED} \
 --config_path ./configs/NAS-PTB-BASE.config \
diff --git a/scripts-rnn/train-WT2.sh b/scripts-rnn/train-WT2.sh
index 1f29ddf..fd61800 100644
--- a/scripts-rnn/train-WT2.sh
+++ b/scripts-rnn/train-WT2.sh
@@ -1,21 +1,14 @@
 #!/usr/bin/env sh
-if [ "$#" -ne 2 ] ;then
+if [ "$#" -ne 1 ] ;then
 echo "Input illegal number of parameters " $#
- echo "Need 2 parameters for the GPU and the architecture"
+ echo "Need 1 parameter for the architecture"
 exit 1
 fi
-if [ "$TORCH_HOME" = "" ]; then
- echo "Must set TORCH_HOME envoriment variable for data dir saving"
- exit 1
-else
- echo "TORCH_HOME : $TORCH_HOME"
-fi
 
-gpus=$1
-arch=$2
-SAVED=./snapshots/NAS-RNN/Search-${arch}-WT2
+arch=$1
+SAVED=./output/NAS-RNN/Search-${arch}-WT2 
-CUDA_VISIBLE_DEVICES=${gpus} python ./exps-nas/rnn/train_rnn_base.py \ +python ./exps-rnn/train_rnn_base.py \ --arch ${arch} \ --save_path ${SAVED} \ --config_path ./configs/NAS-WT2-BASE.config \