update scripts
This commit is contained in:
		| @@ -7,6 +7,7 @@ import torch.nn.functional as F | ||||
| import torchvision.datasets as dset | ||||
| import torch.backends.cudnn as cudnn | ||||
| import torchvision.transforms as transforms | ||||
| import multiprocessing | ||||
| from pathlib import Path | ||||
| lib_dir = (Path(__file__).parent / '..' / 'lib').resolve() | ||||
| print ('lib-dir : {:}'.format(lib_dir)) | ||||
| @@ -29,7 +30,7 @@ parser.add_argument('--config_path',       type=str, help='the training configur | ||||
| parser.add_argument('--save_path',         type=str, help='Folder to save checkpoints and log.') | ||||
| parser.add_argument('--print_freq',        type=int, help='print frequency (default: 200)') | ||||
| parser.add_argument('--manualSeed',        type=int, help='manual seed') | ||||
| parser.add_argument('--threads',           type=int, default=10, help='the number of threads') | ||||
| parser.add_argument('--threads',           type=int, default=4, help='the number of threads') | ||||
| args = parser.parse_args() | ||||
|  | ||||
| assert torch.cuda.is_available(), 'torch.cuda is not available' | ||||
| @@ -50,7 +51,7 @@ def main(): | ||||
|   if not os.path.isdir(args.save_path): | ||||
|     os.makedirs(args.save_path) | ||||
|   log = open(os.path.join(args.save_path, 'log-seed-{:}-{:}.txt'.format(args.manualSeed, time_file_str())), 'w') | ||||
|   print_log('save path : {}'.format(args.save_path), log) | ||||
|   print_log('save path : {:}'.format(args.save_path), log) | ||||
|   state = {k: v for k, v in args._get_kwargs()} | ||||
|   print_log(state, log) | ||||
|   print_log("Random Seed: {}".format(args.manualSeed), log) | ||||
| @@ -59,6 +60,7 @@ def main(): | ||||
|   print_log("CUDA   version : {}".format(torch.version.cuda), log) | ||||
|   print_log("cuDNN  version : {}".format(cudnn.version()), log) | ||||
|   print_log("Num of GPUs    : {}".format(torch.cuda.device_count()), log) | ||||
|   print_log("Num of CPUs    : {}".format(multiprocessing.cpu_count()), log) | ||||
|  | ||||
|   config = load_config( args.config_path ) | ||||
|   genotype = Networks[ args.arch ] | ||||
|   | ||||
| @@ -3,7 +3,7 @@ from .utils import time_file_str, time_string | ||||
| from .utils import test_imagenet_data | ||||
| from .utils import print_log | ||||
| from .evaluation_utils import obtain_accuracy | ||||
| from .draw_pts import draw_points | ||||
| #from .draw_pts import draw_points | ||||
| from .gpu_manager import GPUManager | ||||
|  | ||||
| from .save_meta import Save_Meta | ||||
|   | ||||
| @@ -1,9 +1,6 @@ | ||||
| import os, sys, time | ||||
| import numpy as np | ||||
| import matplotlib | ||||
| import random | ||||
| matplotlib.use('agg') | ||||
| import matplotlib.pyplot as plt | ||||
|  | ||||
| class AverageMeter(object): | ||||
|   """Computes and stores the average and current value""" | ||||
| @@ -53,6 +50,9 @@ class RecorderMeter(object): | ||||
|     else:       return self.epoch_accuracy[:self.current_epoch, 1].max() | ||||
|  | ||||
|   def plot_curve(self, save_path): | ||||
|     import matplotlib | ||||
|     matplotlib.use('agg') | ||||
|     import matplotlib.pyplot as plt | ||||
|     title = 'the accuracy/loss curve of train/val' | ||||
|     dpi = 100  | ||||
|     width, height = 1600, 1000 | ||||
| @@ -97,7 +97,7 @@ class RecorderMeter(object): | ||||
|     plt.close(fig) | ||||
|      | ||||
| def print_log(print_string, log): | ||||
|   print("{}".format(print_string)) | ||||
|   print ("{:}".format(print_string)) | ||||
|   if log is not None: | ||||
|     log.write('{}\n'.format(print_string)) | ||||
|     log.flush() | ||||
|   | ||||
							
								
								
									
										1
									
								
								output/.gitignore
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										1
									
								
								output/.gitignore
									
									
									
									
										vendored
									
									
								
							| @@ -1 +0,0 @@ | ||||
| * | ||||
							
								
								
									
										9
									
								
								scripts-cluster/README.md
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										9
									
								
								scripts-cluster/README.md
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,9 @@ | ||||
| # Commands on the Cluster | ||||
|  | ||||
| ## RNN | ||||
| ``` | ||||
| bash scripts-cluster/submit.sh yq01-v100-box-idl-2-8 WT2-GDAS 1 "bash ./scripts-rnn/train-WT2.sh GDAS" | ||||
| bash scripts-cluster/submit.sh yq01-v100-box-idl-2-8 PTB-GDAS 1 "bash ./scripts-rnn/train-PTB.sh GDAS" | ||||
| ``` | ||||
|  | ||||
| ## CNN | ||||
| @@ -1,6 +1,13 @@ | ||||
| #!/bin/bash | ||||
| # | ||||
| echo "CHECK-DATA-DIR START" | ||||
| sh /home/HGCP_Program/software-install/afs_mount/bin/afs_mount.sh \ | ||||
|     COMM_KM_Data COMM_km_2018 \ | ||||
|     `pwd`/hadoop-data \ | ||||
|     afs://xingtian.afs.baidu.com:9902/user/COMM_KM_Data/dongxuanyi/datasets | ||||
|  | ||||
| tar xvf ./hadoop-data/cifar.python.tar -C ./data/data/ | ||||
|  | ||||
| cifar_dir="./data/data/cifar.python" | ||||
| if [ -d ${cifar_dir} ]; then | ||||
|   echo "Find cifar-dir: "${cifar_dir} | ||||
| @@ -10,20 +17,17 @@ else | ||||
| fi | ||||
| echo "CHECK-DATA-DIR DONE" | ||||
|  | ||||
| sh /home/HGCP_Program/software-install/afs_mount/bin/afs_mount.sh \ | ||||
|     COMM_KM_Data COMM_km_2018 \ | ||||
|     `pwd`/hadoop-data \ | ||||
|     afs://xingtian.afs.baidu.com:9902/user/COMM_KM_Data/dongxuanyi/datasets | ||||
|  | ||||
| echo "PWD: " `pwd` | ||||
| echo "files::  " `ls` | ||||
| echo "CUDA_VISIBLE_DEVICES: " ${CUDA_VISIBLE_DEVICES} | ||||
|  | ||||
| # download and unpack the packaged Python environment | ||||
| PYTHON_ENV=py36_pytorch1.0_env0.1.3.tar.gz | ||||
| wget -e "http_proxy=cp01-sys-hic-gpu-02.cp01:8888" http://cp01-sys-hic-gpu-02.cp01/HGCP_DEMO/$PYTHON_ENV > screen.log 2>&1 | ||||
| tar xzf $PYTHON_ENV | ||||
|  | ||||
| alias python="./env/bin/python" | ||||
| echo "JOB-PWD   : " `pwd` | ||||
| echo "JOB-files :  " `ls` | ||||
| echo "JOB-CUDA_VISIBLE_DEVICES: " ${CUDA_VISIBLE_DEVICES} | ||||
|  | ||||
| echo "Python:  " `which python` | ||||
| echo `./env/bin/python --version` | ||||
|  | ||||
| # the actual job command to run | ||||
| bash ./scripts-rnn/train-WT2.sh GDAS | ||||
|   | ||||
| @@ -18,14 +18,15 @@ QUEUE=$1 | ||||
| NAME=$2 | ||||
| GPUs=$3 | ||||
| CMD=$4 | ||||
| TIME=$(date +"%Y-%h-%d-%T") | ||||
| TIME=$(date +"%Y-%h-%d--%T") | ||||
| TIME="${TIME//:/-}" | ||||
|  | ||||
| JOB_SCRIPT="${FDIR}/tmps/job-${TIME}.sh" | ||||
| echo "JOB-SCRIPT: " ${JOB_SCRIPT} | ||||
|  | ||||
| cat ${FDIR}/job-script.sh > ${JOB_SCRIPT} | ||||
| echo ${CMD}              >> ${JOB_SCRIPT} | ||||
|  | ||||
| exit 1 | ||||
| HGCP_CLIENT_BIN="${HOME}/.hgcp/software-install/HGCP_client/bin" | ||||
|  | ||||
|  | ||||
| @@ -42,3 +43,6 @@ ${HGCP_CLIENT_BIN}/submit \ | ||||
|     --gpu-pnode ${GPUs} \ | ||||
|     --time-limit 0 \ | ||||
|     --job-script ${JOB_SCRIPT} | ||||
|  | ||||
| #--job-script ${FDIR}/job-script.sh | ||||
| #echo "JOB-SCRIPT: " ${JOB_SCRIPT} | ||||
|   | ||||
| @@ -7,8 +7,18 @@ fi | ||||
|  | ||||
| arch=$1 | ||||
| SAVED=./output/NAS-RNN/Search-${arch}-PTB | ||||
| PY_C="./env/bin/python" | ||||
|  | ||||
| python ./exps-rnn/train_rnn_base.py \ | ||||
| if [ ! -f ${PY_C} ]; then | ||||
|   echo "Local Run with Python: "`which python` | ||||
|   PY_C="python" | ||||
| else | ||||
|   echo "Cluster Run with Python: "${PY_C} | ||||
| fi | ||||
|  | ||||
| ${PY_C} --version | ||||
|  | ||||
| ${PY_C} ./exps-rnn/train_rnn_base.py \ | ||||
| 	--arch ${arch} \ | ||||
| 	--save_path ${SAVED} \ | ||||
| 	--config_path ./configs/NAS-PTB-BASE.config \ | ||||
|   | ||||
| @@ -1,4 +1,4 @@ | ||||
| #!/usr/bin/env sh | ||||
| #!/bin/bash | ||||
| if [ "$#" -ne 1 ] ;then | ||||
|   echo "Input illegal number of parameters " $# | ||||
|   echo "Need 1 parameters for the architectures" | ||||
| @@ -7,8 +7,18 @@ fi | ||||
|  | ||||
| arch=$1 | ||||
| SAVED=./output/NAS-RNN/Search-${arch}-WT2 | ||||
| PY_C="./env/bin/python" | ||||
|  | ||||
| python ./exps-rnn/train_rnn_base.py \ | ||||
| if [ ! -f ${PY_C} ]; then | ||||
|   echo "Local Run with Python: "`which python` | ||||
|   PY_C="python" | ||||
| else | ||||
|   echo "Cluster Run with Python: "${PY_C} | ||||
| fi | ||||
|  | ||||
| ${PY_C} --version | ||||
|  | ||||
| ${PY_C} ./exps-rnn/train_rnn_base.py \ | ||||
| 	--arch ${arch} \ | ||||
| 	--save_path ${SAVED} \ | ||||
| 	--config_path ./configs/NAS-WT2-BASE.config \ | ||||
|   | ||||
		Reference in New Issue
	
	Block a user