update scripts
This commit is contained in:
		| @@ -7,6 +7,7 @@ import torch.nn.functional as F | |||||||
| import torchvision.datasets as dset | import torchvision.datasets as dset | ||||||
| import torch.backends.cudnn as cudnn | import torch.backends.cudnn as cudnn | ||||||
| import torchvision.transforms as transforms | import torchvision.transforms as transforms | ||||||
|  | import multiprocessing | ||||||
| from pathlib import Path | from pathlib import Path | ||||||
| lib_dir = (Path(__file__).parent / '..' / 'lib').resolve() | lib_dir = (Path(__file__).parent / '..' / 'lib').resolve() | ||||||
| print ('lib-dir : {:}'.format(lib_dir)) | print ('lib-dir : {:}'.format(lib_dir)) | ||||||
| @@ -29,7 +30,7 @@ parser.add_argument('--config_path',       type=str, help='the training configur | |||||||
| parser.add_argument('--save_path',         type=str, help='Folder to save checkpoints and log.') | parser.add_argument('--save_path',         type=str, help='Folder to save checkpoints and log.') | ||||||
| parser.add_argument('--print_freq',        type=int, help='print frequency (default: 200)') | parser.add_argument('--print_freq',        type=int, help='print frequency (default: 200)') | ||||||
| parser.add_argument('--manualSeed',        type=int, help='manual seed') | parser.add_argument('--manualSeed',        type=int, help='manual seed') | ||||||
| parser.add_argument('--threads',           type=int, default=10, help='the number of threads') | parser.add_argument('--threads',           type=int, default=4, help='the number of threads') | ||||||
| args = parser.parse_args() | args = parser.parse_args() | ||||||
|  |  | ||||||
| assert torch.cuda.is_available(), 'torch.cuda is not available' | assert torch.cuda.is_available(), 'torch.cuda is not available' | ||||||
| @@ -50,7 +51,7 @@ def main(): | |||||||
|   if not os.path.isdir(args.save_path): |   if not os.path.isdir(args.save_path): | ||||||
|     os.makedirs(args.save_path) |     os.makedirs(args.save_path) | ||||||
|   log = open(os.path.join(args.save_path, 'log-seed-{:}-{:}.txt'.format(args.manualSeed, time_file_str())), 'w') |   log = open(os.path.join(args.save_path, 'log-seed-{:}-{:}.txt'.format(args.manualSeed, time_file_str())), 'w') | ||||||
|   print_log('save path : {}'.format(args.save_path), log) |   print_log('save path : {:}'.format(args.save_path), log) | ||||||
|   state = {k: v for k, v in args._get_kwargs()} |   state = {k: v for k, v in args._get_kwargs()} | ||||||
|   print_log(state, log) |   print_log(state, log) | ||||||
|   print_log("Random Seed: {}".format(args.manualSeed), log) |   print_log("Random Seed: {}".format(args.manualSeed), log) | ||||||
| @@ -59,6 +60,7 @@ def main(): | |||||||
|   print_log("CUDA   version : {}".format(torch.version.cuda), log) |   print_log("CUDA   version : {}".format(torch.version.cuda), log) | ||||||
|   print_log("cuDNN  version : {}".format(cudnn.version()), log) |   print_log("cuDNN  version : {}".format(cudnn.version()), log) | ||||||
|   print_log("Num of GPUs    : {}".format(torch.cuda.device_count()), log) |   print_log("Num of GPUs    : {}".format(torch.cuda.device_count()), log) | ||||||
|  |   print_log("Num of CPUs    : {}".format(multiprocessing.cpu_count()), log) | ||||||
|  |  | ||||||
|   config = load_config( args.config_path ) |   config = load_config( args.config_path ) | ||||||
|   genotype = Networks[ args.arch ] |   genotype = Networks[ args.arch ] | ||||||
|   | |||||||
| @@ -3,7 +3,7 @@ from .utils import time_file_str, time_string | |||||||
| from .utils import test_imagenet_data | from .utils import test_imagenet_data | ||||||
| from .utils import print_log | from .utils import print_log | ||||||
| from .evaluation_utils import obtain_accuracy | from .evaluation_utils import obtain_accuracy | ||||||
| from .draw_pts import draw_points | #from .draw_pts import draw_points | ||||||
| from .gpu_manager import GPUManager | from .gpu_manager import GPUManager | ||||||
|  |  | ||||||
| from .save_meta import Save_Meta | from .save_meta import Save_Meta | ||||||
|   | |||||||
| @@ -1,9 +1,6 @@ | |||||||
| import os, sys, time | import os, sys, time | ||||||
| import numpy as np | import numpy as np | ||||||
| import matplotlib |  | ||||||
| import random | import random | ||||||
| matplotlib.use('agg') |  | ||||||
| import matplotlib.pyplot as plt |  | ||||||
|  |  | ||||||
| class AverageMeter(object): | class AverageMeter(object): | ||||||
|   """Computes and stores the average and current value""" |   """Computes and stores the average and current value""" | ||||||
| @@ -53,6 +50,9 @@ class RecorderMeter(object): | |||||||
|     else:       return self.epoch_accuracy[:self.current_epoch, 1].max() |     else:       return self.epoch_accuracy[:self.current_epoch, 1].max() | ||||||
|  |  | ||||||
|   def plot_curve(self, save_path): |   def plot_curve(self, save_path): | ||||||
|  |     import matplotlib | ||||||
|  |     matplotlib.use('agg') | ||||||
|  |     import matplotlib.pyplot as plt | ||||||
|     title = 'the accuracy/loss curve of train/val' |     title = 'the accuracy/loss curve of train/val' | ||||||
|     dpi = 100  |     dpi = 100  | ||||||
|     width, height = 1600, 1000 |     width, height = 1600, 1000 | ||||||
| @@ -97,7 +97,7 @@ class RecorderMeter(object): | |||||||
|     plt.close(fig) |     plt.close(fig) | ||||||
|      |      | ||||||
| def print_log(print_string, log): | def print_log(print_string, log): | ||||||
|   print("{}".format(print_string)) |   print ("{:}".format(print_string)) | ||||||
|   if log is not None: |   if log is not None: | ||||||
|     log.write('{}\n'.format(print_string)) |     log.write('{}\n'.format(print_string)) | ||||||
|     log.flush() |     log.flush() | ||||||
|   | |||||||
							
								
								
									
										1
									
								
								output/.gitignore
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										1
									
								
								output/.gitignore
									
									
									
									
										vendored
									
									
								
							| @@ -1 +0,0 @@ | |||||||
| * |  | ||||||
							
								
								
									
										9
									
								
								scripts-cluster/README.md
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										9
									
								
								scripts-cluster/README.md
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,9 @@ | |||||||
|  | # Commands on Cluster | ||||||
|  |  | ||||||
|  | ## RNN | ||||||
|  | ``` | ||||||
|  | bash scripts-cluster/submit.sh yq01-v100-box-idl-2-8 WT2-GDAS 1 "bash ./scripts-rnn/train-WT2.sh GDAS" | ||||||
|  | bash scripts-cluster/submit.sh yq01-v100-box-idl-2-8 PTB-GDAS 1 "bash ./scripts-rnn/train-PTB.sh GDAS" | ||||||
|  | ``` | ||||||
|  |  | ||||||
|  | ## CNN | ||||||
| @@ -1,6 +1,13 @@ | |||||||
| #!/bin/bash | #!/bin/bash | ||||||
| # | # | ||||||
| echo "CHECK-DATA-DIR START" | echo "CHECK-DATA-DIR START" | ||||||
|  | sh /home/HGCP_Program/software-install/afs_mount/bin/afs_mount.sh \ | ||||||
|  |     COMM_KM_Data COMM_km_2018 \ | ||||||
|  |     `pwd`/hadoop-data \ | ||||||
|  |     afs://xingtian.afs.baidu.com:9902/user/COMM_KM_Data/dongxuanyi/datasets | ||||||
|  |  | ||||||
|  | tar xvf ./hadoop-data/cifar.python.tar -C ./data/data/ | ||||||
|  |  | ||||||
| cifar_dir="./data/data/cifar.python" | cifar_dir="./data/data/cifar.python" | ||||||
| if [ -d ${cifar_dir} ]; then | if [ -d ${cifar_dir} ]; then | ||||||
|   echo "Find cifar-dir: "${cifar_dir} |   echo "Find cifar-dir: "${cifar_dir} | ||||||
| @@ -10,20 +17,17 @@ else | |||||||
| fi | fi | ||||||
| echo "CHECK-DATA-DIR DONE" | echo "CHECK-DATA-DIR DONE" | ||||||
|  |  | ||||||
| sh /home/HGCP_Program/software-install/afs_mount/bin/afs_mount.sh \ |  | ||||||
|     COMM_KM_Data COMM_km_2018 \ |  | ||||||
|     `pwd`/hadoop-data \ |  | ||||||
|     afs://xingtian.afs.baidu.com:9902/user/COMM_KM_Data/dongxuanyi/datasets |  | ||||||
|  |  | ||||||
| echo "PWD: " `pwd` |  | ||||||
| echo "files::  " `ls` |  | ||||||
| echo "CUDA_VISIBLE_DEVICES: " ${CUDA_VISIBLE_DEVICES} |  | ||||||
|  |  | ||||||
| # config python | # config python | ||||||
| PYTHON_ENV=py36_pytorch1.0_env0.1.3.tar.gz | PYTHON_ENV=py36_pytorch1.0_env0.1.3.tar.gz | ||||||
| wget -e "http_proxy=cp01-sys-hic-gpu-02.cp01:8888" http://cp01-sys-hic-gpu-02.cp01/HGCP_DEMO/$PYTHON_ENV > screen.log 2>&1 | wget -e "http_proxy=cp01-sys-hic-gpu-02.cp01:8888" http://cp01-sys-hic-gpu-02.cp01/HGCP_DEMO/$PYTHON_ENV > screen.log 2>&1 | ||||||
| tar xzf $PYTHON_ENV | tar xzf $PYTHON_ENV | ||||||
|  |  | ||||||
| alias python="./env/bin/python" | echo "JOB-PWD   : " `pwd` | ||||||
|  | echo "JOB-files :  " `ls` | ||||||
|  | echo "JOB-CUDA_VISIBLE_DEVICES: " ${CUDA_VISIBLE_DEVICES} | ||||||
|  |  | ||||||
| echo "Python:  " `which python` | echo `./env/bin/python --version` | ||||||
|  |  | ||||||
|  | # real commands | ||||||
|  | bash ./scripts-rnn/train-WT2.sh GDAS | ||||||
|   | |||||||
| @@ -18,14 +18,15 @@ QUEUE=$1 | |||||||
| NAME=$2 | NAME=$2 | ||||||
| GPUs=$3 | GPUs=$3 | ||||||
| CMD=$4 | CMD=$4 | ||||||
| TIME=$(date +"%Y-%h-%d-%T") | TIME=$(date +"%Y-%h-%d--%T") | ||||||
|  | TIME="${TIME//:/-}" | ||||||
|  |  | ||||||
| JOB_SCRIPT="${FDIR}/tmps/job-${TIME}.sh" | JOB_SCRIPT="${FDIR}/tmps/job-${TIME}.sh" | ||||||
|  | echo "JOB-SCRIPT: " ${JOB_SCRIPT} | ||||||
|  |  | ||||||
| cat ${FDIR}/job-script.sh > ${JOB_SCRIPT} | cat ${FDIR}/job-script.sh > ${JOB_SCRIPT} | ||||||
| echo ${CMD}              >> ${JOB_SCRIPT} | echo ${CMD}              >> ${JOB_SCRIPT} | ||||||
|  |  | ||||||
| exit 1 |  | ||||||
| HGCP_CLIENT_BIN="${HOME}/.hgcp/software-install/HGCP_client/bin" | HGCP_CLIENT_BIN="${HOME}/.hgcp/software-install/HGCP_client/bin" | ||||||
|  |  | ||||||
|  |  | ||||||
| @@ -42,3 +43,6 @@ ${HGCP_CLIENT_BIN}/submit \ | |||||||
|     --gpu-pnode ${GPUs} \ |     --gpu-pnode ${GPUs} \ | ||||||
|     --time-limit 0 \ |     --time-limit 0 \ | ||||||
|     --job-script ${JOB_SCRIPT} |     --job-script ${JOB_SCRIPT} | ||||||
|  |  | ||||||
|  | #--job-script ${FDIR}/job-script.sh | ||||||
|  | #echo "JOB-SCRIPT: " ${JOB_SCRIPT} | ||||||
|   | |||||||
| @@ -7,8 +7,18 @@ fi | |||||||
|  |  | ||||||
| arch=$1 | arch=$1 | ||||||
| SAVED=./output/NAS-RNN/Search-${arch}-PTB | SAVED=./output/NAS-RNN/Search-${arch}-PTB | ||||||
|  | PY_C="./env/bin/python" | ||||||
|  |  | ||||||
| python ./exps-rnn/train_rnn_base.py \ | if [ ! -f ${PY_C} ]; then | ||||||
|  |   echo "Local Run with Python: "`which python` | ||||||
|  |   PY_C="python" | ||||||
|  | else | ||||||
|  |   echo "Cluster Run with Python: "${PY_C} | ||||||
|  | fi | ||||||
|  |  | ||||||
|  | ${PY_C} --version | ||||||
|  |  | ||||||
|  | ${PY_C} ./exps-rnn/train_rnn_base.py \ | ||||||
| 	--arch ${arch} \ | 	--arch ${arch} \ | ||||||
| 	--save_path ${SAVED} \ | 	--save_path ${SAVED} \ | ||||||
| 	--config_path ./configs/NAS-PTB-BASE.config \ | 	--config_path ./configs/NAS-PTB-BASE.config \ | ||||||
|   | |||||||
| @@ -1,4 +1,4 @@ | |||||||
| #!/usr/bin/env sh | #!/bin/bash | ||||||
| if [ "$#" -ne 1 ] ;then | if [ "$#" -ne 1 ] ;then | ||||||
|   echo "Input illegal number of parameters " $# |   echo "Input illegal number of parameters " $# | ||||||
|   echo "Need 1 parameters for the architectures" |   echo "Need 1 parameters for the architectures" | ||||||
| @@ -7,8 +7,18 @@ fi | |||||||
|  |  | ||||||
| arch=$1 | arch=$1 | ||||||
| SAVED=./output/NAS-RNN/Search-${arch}-WT2 | SAVED=./output/NAS-RNN/Search-${arch}-WT2 | ||||||
|  | PY_C="./env/bin/python" | ||||||
|  |  | ||||||
| python ./exps-rnn/train_rnn_base.py \ | if [ ! -f ${PY_C} ]; then | ||||||
|  |   echo "Local Run with Python: "`which python` | ||||||
|  |   PY_C="python" | ||||||
|  | else | ||||||
|  |   echo "Cluster Run with Python: "${PY_C} | ||||||
|  | fi | ||||||
|  |  | ||||||
|  | ${PY_C} --version | ||||||
|  |  | ||||||
|  | ${PY_C} ./exps-rnn/train_rnn_base.py \ | ||||||
| 	--arch ${arch} \ | 	--arch ${arch} \ | ||||||
| 	--save_path ${SAVED} \ | 	--save_path ${SAVED} \ | ||||||
| 	--config_path ./configs/NAS-WT2-BASE.config \ | 	--config_path ./configs/NAS-WT2-BASE.config \ | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user