update scripts
This commit is contained in:
parent
3734384b68
commit
c8dddf9cf9
@ -7,6 +7,7 @@ import torch.nn.functional as F
|
||||
import torchvision.datasets as dset
|
||||
import torch.backends.cudnn as cudnn
|
||||
import torchvision.transforms as transforms
|
||||
import multiprocessing
|
||||
from pathlib import Path
|
||||
lib_dir = (Path(__file__).parent / '..' / 'lib').resolve()
|
||||
print ('lib-dir : {:}'.format(lib_dir))
|
||||
@ -29,7 +30,7 @@ parser.add_argument('--config_path', type=str, help='the training configur
|
||||
parser.add_argument('--save_path', type=str, help='Folder to save checkpoints and log.')
|
||||
parser.add_argument('--print_freq', type=int, help='print frequency (default: 200)')
|
||||
parser.add_argument('--manualSeed', type=int, help='manual seed')
|
||||
parser.add_argument('--threads', type=int, default=10, help='the number of threads')
|
||||
parser.add_argument('--threads', type=int, default=4, help='the number of threads')
|
||||
args = parser.parse_args()
|
||||
|
||||
assert torch.cuda.is_available(), 'torch.cuda is not available'
|
||||
@ -50,7 +51,7 @@ def main():
|
||||
if not os.path.isdir(args.save_path):
|
||||
os.makedirs(args.save_path)
|
||||
log = open(os.path.join(args.save_path, 'log-seed-{:}-{:}.txt'.format(args.manualSeed, time_file_str())), 'w')
|
||||
print_log('save path : {}'.format(args.save_path), log)
|
||||
print_log('save path : {:}'.format(args.save_path), log)
|
||||
state = {k: v for k, v in args._get_kwargs()}
|
||||
print_log(state, log)
|
||||
print_log("Random Seed: {}".format(args.manualSeed), log)
|
||||
@ -59,6 +60,7 @@ def main():
|
||||
print_log("CUDA version : {}".format(torch.version.cuda), log)
|
||||
print_log("cuDNN version : {}".format(cudnn.version()), log)
|
||||
print_log("Num of GPUs : {}".format(torch.cuda.device_count()), log)
|
||||
print_log("Num of CPUs : {}".format(multiprocessing.cpu_count()), log)
|
||||
|
||||
config = load_config( args.config_path )
|
||||
genotype = Networks[ args.arch ]
|
||||
|
@ -3,7 +3,7 @@ from .utils import time_file_str, time_string
|
||||
from .utils import test_imagenet_data
|
||||
from .utils import print_log
|
||||
from .evaluation_utils import obtain_accuracy
|
||||
from .draw_pts import draw_points
|
||||
#from .draw_pts import draw_points
|
||||
from .gpu_manager import GPUManager
|
||||
|
||||
from .save_meta import Save_Meta
|
||||
|
@ -1,9 +1,6 @@
|
||||
import os, sys, time
|
||||
import numpy as np
|
||||
import matplotlib
|
||||
import random
|
||||
matplotlib.use('agg')
|
||||
import matplotlib.pyplot as plt
|
||||
|
||||
class AverageMeter(object):
|
||||
"""Computes and stores the average and current value"""
|
||||
@ -53,6 +50,9 @@ class RecorderMeter(object):
|
||||
else: return self.epoch_accuracy[:self.current_epoch, 1].max()
|
||||
|
||||
def plot_curve(self, save_path):
|
||||
import matplotlib
|
||||
matplotlib.use('agg')
|
||||
import matplotlib.pyplot as plt
|
||||
title = 'the accuracy/loss curve of train/val'
|
||||
dpi = 100
|
||||
width, height = 1600, 1000
|
||||
@ -97,7 +97,7 @@ class RecorderMeter(object):
|
||||
plt.close(fig)
|
||||
|
||||
def print_log(print_string, log):
|
||||
print("{}".format(print_string))
|
||||
print ("{:}".format(print_string))
|
||||
if log is not None:
|
||||
log.write('{}\n'.format(print_string))
|
||||
log.flush()
|
||||
|
1
output/.gitignore
vendored
1
output/.gitignore
vendored
@ -1 +0,0 @@
|
||||
*
|
9
scripts-cluster/README.md
Normal file
9
scripts-cluster/README.md
Normal file
@ -0,0 +1,9 @@
|
||||
# Commands on Cluster

## RNN
```
bash scripts-cluster/submit.sh yq01-v100-box-idl-2-8 WT2-GDAS 1 "bash ./scripts-rnn/train-WT2.sh GDAS"
bash scripts-cluster/submit.sh yq01-v100-box-idl-2-8 PTB-GDAS 1 "bash ./scripts-rnn/train-PTB.sh GDAS"
```

## CNN
|
@ -1,6 +1,13 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
echo "CHECK-DATA-DIR START"
|
||||
sh /home/HGCP_Program/software-install/afs_mount/bin/afs_mount.sh \
|
||||
COMM_KM_Data COMM_km_2018 \
|
||||
`pwd`/hadoop-data \
|
||||
afs://xingtian.afs.baidu.com:9902/user/COMM_KM_Data/dongxuanyi/datasets
|
||||
|
||||
tar xvf ./hadoop-data/cifar.python.tar -C ./data/data/
|
||||
|
||||
cifar_dir="./data/data/cifar.python"
|
||||
if [ -d ${cifar_dir} ]; then
|
||||
echo "Find cifar-dir: "${cifar_dir}
|
||||
@ -10,20 +17,17 @@ else
|
||||
fi
|
||||
echo "CHECK-DATA-DIR DONE"
|
||||
|
||||
sh /home/HGCP_Program/software-install/afs_mount/bin/afs_mount.sh \
|
||||
COMM_KM_Data COMM_km_2018 \
|
||||
`pwd`/hadoop-data \
|
||||
afs://xingtian.afs.baidu.com:9902/user/COMM_KM_Data/dongxuanyi/datasets
|
||||
|
||||
echo "PWD: " `pwd`
|
||||
echo "files:: " `ls`
|
||||
echo "CUDA_VISIBLE_DEVICES: " ${CUDA_VISIBLE_DEVICES}
|
||||
|
||||
# config python
|
||||
PYTHON_ENV=py36_pytorch1.0_env0.1.3.tar.gz
|
||||
wget -e "http_proxy=cp01-sys-hic-gpu-02.cp01:8888" http://cp01-sys-hic-gpu-02.cp01/HGCP_DEMO/$PYTHON_ENV > screen.log 2>&1
|
||||
tar xzf $PYTHON_ENV
|
||||
|
||||
alias python="./env/bin/python"
|
||||
echo "JOB-PWD : " `pwd`
|
||||
echo "JOB-files : " `ls`
|
||||
echo "JOB-CUDA_VISIBLE_DEVICES: " ${CUDA_VISIBLE_DEVICES}
|
||||
|
||||
echo "Python: " `which python`
|
||||
echo `./env/bin/python --version`
|
||||
|
||||
# real commands
|
||||
bash ./scripts-rnn/train-WT2.sh GDAS
|
||||
|
@ -18,14 +18,15 @@ QUEUE=$1
|
||||
NAME=$2
|
||||
GPUs=$3
|
||||
CMD=$4
|
||||
TIME=$(date +"%Y-%h-%d-%T")
|
||||
TIME=$(date +"%Y-%h-%d--%T")
|
||||
TIME="${TIME//:/-}"
|
||||
|
||||
JOB_SCRIPT="${FDIR}/tmps/job-${TIME}.sh"
|
||||
echo "JOB-SCRIPT: " ${JOB_SCRIPT}
|
||||
|
||||
cat ${FDIR}/job-script.sh > ${JOB_SCRIPT}
|
||||
echo ${CMD} >> ${JOB_SCRIPT}
|
||||
|
||||
exit 1
|
||||
HGCP_CLIENT_BIN="${HOME}/.hgcp/software-install/HGCP_client/bin"
|
||||
|
||||
|
||||
@ -42,3 +43,6 @@ ${HGCP_CLIENT_BIN}/submit \
|
||||
--gpu-pnode ${GPUs} \
|
||||
--time-limit 0 \
|
||||
--job-script ${JOB_SCRIPT}
|
||||
|
||||
#--job-script ${FDIR}/job-script.sh
|
||||
#echo "JOB-SCRIPT: " ${JOB_SCRIPT}
|
||||
|
@ -7,8 +7,18 @@ fi
|
||||
|
||||
arch=$1
|
||||
SAVED=./output/NAS-RNN/Search-${arch}-PTB
|
||||
PY_C="./env/bin/python"
|
||||
|
||||
python ./exps-rnn/train_rnn_base.py \
|
||||
if [ ! -f ${PY_C} ]; then
|
||||
echo "Local Run with Python: "`which python`
|
||||
PY_C="python"
|
||||
else
|
||||
echo "Cluster Run with Python: "${PY_C}
|
||||
fi
|
||||
|
||||
${PY_C} --version
|
||||
|
||||
${PY_C} ./exps-rnn/train_rnn_base.py \
|
||||
--arch ${arch} \
|
||||
--save_path ${SAVED} \
|
||||
--config_path ./configs/NAS-PTB-BASE.config \
|
||||
|
@ -1,4 +1,4 @@
|
||||
#!/usr/bin/env sh
|
||||
#!/bin/bash
|
||||
if [ "$#" -ne 1 ] ;then
|
||||
echo "Input illegal number of parameters " $#
|
||||
echo "Need 1 parameters for the architectures"
|
||||
@ -7,8 +7,18 @@ fi
|
||||
|
||||
arch=$1
|
||||
SAVED=./output/NAS-RNN/Search-${arch}-WT2
|
||||
PY_C="./env/bin/python"
|
||||
|
||||
python ./exps-rnn/train_rnn_base.py \
|
||||
if [ ! -f ${PY_C} ]; then
|
||||
echo "Local Run with Python: "`which python`
|
||||
PY_C="python"
|
||||
else
|
||||
echo "Cluster Run with Python: "${PY_C}
|
||||
fi
|
||||
|
||||
${PY_C} --version
|
||||
|
||||
${PY_C} ./exps-rnn/train_rnn_base.py \
|
||||
--arch ${arch} \
|
||||
--save_path ${SAVED} \
|
||||
--config_path ./configs/NAS-WT2-BASE.config \
|
||||
|
Loading…
Reference in New Issue
Block a user