update scripts

This commit is contained in:
Xuanyi Dong 2019-03-30 02:10:20 +08:00
parent 3734384b68
commit c8dddf9cf9
9 changed files with 61 additions and 23 deletions

View File

@ -7,6 +7,7 @@ import torch.nn.functional as F
import torchvision.datasets as dset
import torch.backends.cudnn as cudnn
import torchvision.transforms as transforms
import multiprocessing
from pathlib import Path
lib_dir = (Path(__file__).parent / '..' / 'lib').resolve()
print ('lib-dir : {:}'.format(lib_dir))
@ -29,7 +30,7 @@ parser.add_argument('--config_path', type=str, help='the training configur
parser.add_argument('--save_path', type=str, help='Folder to save checkpoints and log.')
parser.add_argument('--print_freq', type=int, help='print frequency (default: 200)')
parser.add_argument('--manualSeed', type=int, help='manual seed')
parser.add_argument('--threads', type=int, default=10, help='the number of threads')
parser.add_argument('--threads', type=int, default=4, help='the number of threads')
args = parser.parse_args()
assert torch.cuda.is_available(), 'torch.cuda is not available'
@ -50,7 +51,7 @@ def main():
if not os.path.isdir(args.save_path):
os.makedirs(args.save_path)
log = open(os.path.join(args.save_path, 'log-seed-{:}-{:}.txt'.format(args.manualSeed, time_file_str())), 'w')
print_log('save path : {}'.format(args.save_path), log)
print_log('save path : {:}'.format(args.save_path), log)
state = {k: v for k, v in args._get_kwargs()}
print_log(state, log)
print_log("Random Seed: {}".format(args.manualSeed), log)
@ -59,6 +60,7 @@ def main():
print_log("CUDA version : {}".format(torch.version.cuda), log)
print_log("cuDNN version : {}".format(cudnn.version()), log)
print_log("Num of GPUs : {}".format(torch.cuda.device_count()), log)
print_log("Num of CPUs : {}".format(multiprocessing.cpu_count()), log)
config = load_config( args.config_path )
genotype = Networks[ args.arch ]

View File

@ -3,7 +3,7 @@ from .utils import time_file_str, time_string
from .utils import test_imagenet_data
from .utils import print_log
from .evaluation_utils import obtain_accuracy
from .draw_pts import draw_points
#from .draw_pts import draw_points
from .gpu_manager import GPUManager
from .save_meta import Save_Meta

View File

@ -1,9 +1,6 @@
import os, sys, time
import numpy as np
import matplotlib
import random
matplotlib.use('agg')
import matplotlib.pyplot as plt
class AverageMeter(object):
"""Computes and stores the average and current value"""
@ -53,6 +50,9 @@ class RecorderMeter(object):
else: return self.epoch_accuracy[:self.current_epoch, 1].max()
def plot_curve(self, save_path):
import matplotlib
matplotlib.use('agg')
import matplotlib.pyplot as plt
title = 'the accuracy/loss curve of train/val'
dpi = 100
width, height = 1600, 1000
@ -97,7 +97,7 @@ class RecorderMeter(object):
plt.close(fig)
def print_log(print_string, log):
print("{}".format(print_string))
print ("{:}".format(print_string))
if log is not None:
log.write('{}\n'.format(print_string))
log.flush()

1
output/.gitignore vendored
View File

@ -1 +0,0 @@
*

View File

@ -0,0 +1,9 @@
# Commands on Cluster
## RNN
```
bash scripts-cluster/submit.sh yq01-v100-box-idl-2-8 WT2-GDAS 1 "bash ./scripts-rnn/train-WT2.sh GDAS"
bash scripts-cluster/submit.sh yq01-v100-box-idl-2-8 PTB-GDAS 1 "bash ./scripts-rnn/train-PTB.sh GDAS"
```
## CNN

View File

@ -1,6 +1,13 @@
#!/bin/bash
#
echo "CHECK-DATA-DIR START"
sh /home/HGCP_Program/software-install/afs_mount/bin/afs_mount.sh \
COMM_KM_Data COMM_km_2018 \
`pwd`/hadoop-data \
afs://xingtian.afs.baidu.com:9902/user/COMM_KM_Data/dongxuanyi/datasets
tar xvf ./hadoop-data/cifar.python.tar -C ./data/data/
cifar_dir="./data/data/cifar.python"
if [ -d ${cifar_dir} ]; then
echo "Find cifar-dir: "${cifar_dir}
@ -10,20 +17,17 @@ else
fi
echo "CHECK-DATA-DIR DONE"
sh /home/HGCP_Program/software-install/afs_mount/bin/afs_mount.sh \
COMM_KM_Data COMM_km_2018 \
`pwd`/hadoop-data \
afs://xingtian.afs.baidu.com:9902/user/COMM_KM_Data/dongxuanyi/datasets
echo "PWD: " `pwd`
echo "files:: " `ls`
echo "CUDA_VISIBLE_DEVICES: " ${CUDA_VISIBLE_DEVICES}
# Set up Python: download and unpack the prebuilt environment archive
PYTHON_ENV=py36_pytorch1.0_env0.1.3.tar.gz
wget -e "http_proxy=cp01-sys-hic-gpu-02.cp01:8888" http://cp01-sys-hic-gpu-02.cp01/HGCP_DEMO/$PYTHON_ENV > screen.log 2>&1
tar xzf $PYTHON_ENV
alias python="./env/bin/python"
echo "JOB-PWD : " `pwd`
echo "JOB-files : " `ls`
echo "JOB-CUDA_VISIBLE_DEVICES: " ${CUDA_VISIBLE_DEVICES}
echo "Python: " `which python`
echo `./env/bin/python --version`
# real commands
bash ./scripts-rnn/train-WT2.sh GDAS

View File

@ -18,14 +18,15 @@ QUEUE=$1
NAME=$2
GPUs=$3
CMD=$4
TIME=$(date +"%Y-%h-%d-%T")
TIME=$(date +"%Y-%h-%d--%T")
TIME="${TIME//:/-}"
JOB_SCRIPT="${FDIR}/tmps/job-${TIME}.sh"
echo "JOB-SCRIPT: " ${JOB_SCRIPT}
cat ${FDIR}/job-script.sh > ${JOB_SCRIPT}
echo ${CMD} >> ${JOB_SCRIPT}
exit 1
HGCP_CLIENT_BIN="${HOME}/.hgcp/software-install/HGCP_client/bin"
@ -42,3 +43,6 @@ ${HGCP_CLIENT_BIN}/submit \
--gpu-pnode ${GPUs} \
--time-limit 0 \
--job-script ${JOB_SCRIPT}
#--job-script ${FDIR}/job-script.sh
#echo "JOB-SCRIPT: " ${JOB_SCRIPT}

View File

@ -7,8 +7,18 @@ fi
arch=$1
SAVED=./output/NAS-RNN/Search-${arch}-PTB
PY_C="./env/bin/python"
python ./exps-rnn/train_rnn_base.py \
if [ ! -f ${PY_C} ]; then
echo "Local Run with Python: "`which python`
PY_C="python"
else
echo "Cluster Run with Python: "${PY_C}
fi
${PY_C} --version
${PY_C} ./exps-rnn/train_rnn_base.py \
--arch ${arch} \
--save_path ${SAVED} \
--config_path ./configs/NAS-PTB-BASE.config \

View File

@ -1,4 +1,4 @@
#!/usr/bin/env sh
#!/bin/bash
if [ "$#" -ne 1 ] ;then
echo "Input illegal number of parameters " $#
echo "Need 1 parameters for the architectures"
@ -7,8 +7,18 @@ fi
arch=$1
SAVED=./output/NAS-RNN/Search-${arch}-WT2
PY_C="./env/bin/python"
python ./exps-rnn/train_rnn_base.py \
if [ ! -f ${PY_C} ]; then
echo "Local Run with Python: "`which python`
PY_C="python"
else
echo "Cluster Run with Python: "${PY_C}
fi
${PY_C} --version
${PY_C} ./exps-rnn/train_rnn_base.py \
--arch ${arch} \
--save_path ${SAVED} \
--config_path ./configs/NAS-WT2-BASE.config \