update scripts

2019-03-30 02:10:20 +08:00 · 2019-03-30 02:10:20 +08:00 · c8dddf9cf9
commit c8dddf9cf9
parent 3734384b68
9 changed files with 61 additions and 23 deletions
--- a/exps-rnn/train_rnn_base.py
+++ b/exps-rnn/train_rnn_base.py
@@ -7,6 +7,7 @@ import torch.nn.functional as F
 import torchvision.datasets as dset
 import torch.backends.cudnn as cudnn
 import torchvision.transforms as transforms
+import multiprocessing
 from pathlib import Path
 lib_dir = (Path(__file__).parent / '..' / 'lib').resolve()
 print ('lib-dir : {:}'.format(lib_dir))
@@ -29,7 +30,7 @@ parser.add_argument('--config_path',       type=str, help='the training configur
 parser.add_argument('--save_path',         type=str, help='Folder to save checkpoints and log.')
 parser.add_argument('--print_freq',        type=int, help='print frequency (default: 200)')
 parser.add_argument('--manualSeed',        type=int, help='manual seed')
-parser.add_argument('--threads',           type=int, default=10, help='the number of threads')
+parser.add_argument('--threads',           type=int, default=4, help='the number of threads')
 args = parser.parse_args()

 assert torch.cuda.is_available(), 'torch.cuda is not available'
@@ -50,7 +51,7 @@ def main():
  if not os.path.isdir(args.save_path):
    os.makedirs(args.save_path)
  log = open(os.path.join(args.save_path, 'log-seed-{:}-{:}.txt'.format(args.manualSeed, time_file_str())), 'w')
-  print_log('save path : {}'.format(args.save_path), log)
+  print_log('save path : {:}'.format(args.save_path), log)
  state = {k: v for k, v in args._get_kwargs()}
  print_log(state, log)
  print_log("Random Seed: {}".format(args.manualSeed), log)
@@ -59,6 +60,7 @@ def main():
  print_log("CUDA   version : {}".format(torch.version.cuda), log)
  print_log("cuDNN  version : {}".format(cudnn.version()), log)
  print_log("Num of GPUs    : {}".format(torch.cuda.device_count()), log)
+  print_log("Num of CPUs    : {}".format(multiprocessing.cpu_count()), log)

  config = load_config( args.config_path )
  genotype = Networks[ args.arch ]
--- a/lib/utils/__init__.py
+++ b/lib/utils/__init__.py
@@ -3,7 +3,7 @@ from .utils import time_file_str, time_string
 from .utils import test_imagenet_data
 from .utils import print_log
 from .evaluation_utils import obtain_accuracy
-from .draw_pts import draw_points
+#from .draw_pts import draw_points
 from .gpu_manager import GPUManager

 from .save_meta import Save_Meta
--- a/lib/utils/utils.py
+++ b/lib/utils/utils.py
@@ -1,9 +1,6 @@
 import os, sys, time
 import numpy as np
-import matplotlib
 import random
-matplotlib.use('agg')
-import matplotlib.pyplot as plt

 class AverageMeter(object):
  """Computes and stores the average and current value"""
@@ -53,6 +50,9 @@ class RecorderMeter(object):
    else:       return self.epoch_accuracy[:self.current_epoch, 1].max()

  def plot_curve(self, save_path):
+    import matplotlib
+    matplotlib.use('agg')
+    import matplotlib.pyplot as plt
    title = 'the accuracy/loss curve of train/val'
    dpi = 100 
    width, height = 1600, 1000
@@ -97,7 +97,7 @@ class RecorderMeter(object):
    plt.close(fig)
    
 def print_log(print_string, log):
-  print("{}".format(print_string))
+  print ("{:}".format(print_string))
  if log is not None:
    log.write('{}\n'.format(print_string))
    log.flush()
--- a/output/.gitignore
+++ b/output/.gitignore
@@ -1 +0,0 @@
-*
--- a/scripts-cluster/README.md
+++ b/scripts-cluster/README.md
@@ -0,0 +1,9 @@
+# Commands on Cluster
+
+## RNN
+```
+bash scripts-cluster/submit.sh yq01-v100-box-idl-2-8 WT2-GDAS 1 "bash ./scripts-rnn/train-WT2.sh GDAS"
+bash scripts-cluster/submit.sh yq01-v100-box-idl-2-8 PTB-GDAS 1 "bash ./scripts-rnn/train-PTB.sh GDAS"
+```
+
+## CNN
--- a/scripts-cluster/job-script.sh
+++ b/scripts-cluster/job-script.sh
@@ -1,6 +1,13 @@
 #!/bin/bash
 #
 echo "CHECK-DATA-DIR START"
+sh /home/HGCP_Program/software-install/afs_mount/bin/afs_mount.sh \
+    COMM_KM_Data COMM_km_2018 \
+    `pwd`/hadoop-data \
+    afs://xingtian.afs.baidu.com:9902/user/COMM_KM_Data/dongxuanyi/datasets
+
+tar xvf ./hadoop-data/cifar.python.tar -C ./data/data/
+
 cifar_dir="./data/data/cifar.python"
 if [ -d ${cifar_dir} ]; then
  echo "Find cifar-dir: "${cifar_dir}
@@ -10,20 +17,17 @@ else
 fi
 echo "CHECK-DATA-DIR DONE"

-sh /home/HGCP_Program/software-install/afs_mount/bin/afs_mount.sh \
-    COMM_KM_Data COMM_km_2018 \
-    `pwd`/hadoop-data \
-    afs://xingtian.afs.baidu.com:9902/user/COMM_KM_Data/dongxuanyi/datasets
-
-echo "PWD: " `pwd`
-echo "files::  " `ls`
-echo "CUDA_VISIBLE_DEVICES: " ${CUDA_VISIBLE_DEVICES}

 # config python
 PYTHON_ENV=py36_pytorch1.0_env0.1.3.tar.gz
 wget -e "http_proxy=cp01-sys-hic-gpu-02.cp01:8888" http://cp01-sys-hic-gpu-02.cp01/HGCP_DEMO/$PYTHON_ENV > screen.log 2>&1
 tar xzf $PYTHON_ENV

-alias python="./env/bin/python"
+echo "JOB-PWD   : " `pwd`
+echo "JOB-files :  " `ls`
+echo "JOB-CUDA_VISIBLE_DEVICES: " ${CUDA_VISIBLE_DEVICES}

-echo "Python:  " `which python`
+echo `./env/bin/python --version`
+
+# real commands
+bash ./scripts-rnn/train-WT2.sh GDAS
--- a/scripts-cluster/submit.sh
+++ b/scripts-cluster/submit.sh
@@ -18,14 +18,15 @@ QUEUE=$1
 NAME=$2
 GPUs=$3
 CMD=$4
-TIME=$(date +"%Y-%h-%d-%T")
+TIME=$(date +"%Y-%h-%d--%T")
+TIME="${TIME//:/-}"

 JOB_SCRIPT="${FDIR}/tmps/job-${TIME}.sh"
+echo "JOB-SCRIPT: " ${JOB_SCRIPT}

 cat ${FDIR}/job-script.sh > ${JOB_SCRIPT}
 echo ${CMD}              >> ${JOB_SCRIPT}

-exit 1
 HGCP_CLIENT_BIN="${HOME}/.hgcp/software-install/HGCP_client/bin"


@@ -42,3 +43,6 @@ ${HGCP_CLIENT_BIN}/submit \
    --gpu-pnode ${GPUs} \
    --time-limit 0 \
    --job-script ${JOB_SCRIPT}
+
+#--job-script ${FDIR}/job-script.sh
+#echo "JOB-SCRIPT: " ${JOB_SCRIPT}
--- a/scripts-rnn/train-PTB.sh
+++ b/scripts-rnn/train-PTB.sh
@@ -7,8 +7,18 @@ fi

 arch=$1
 SAVED=./output/NAS-RNN/Search-${arch}-PTB
+PY_C="./env/bin/python"

-python ./exps-rnn/train_rnn_base.py \
+if [ ! -f ${PY_C} ]; then
+  echo "Local Run with Python: "`which python`
+  PY_C="python"
+else
+  echo "Cluster Run with Python: "${PY_C}
+fi
+
+${PY_C} --version
+
+${PY_C} ./exps-rnn/train_rnn_base.py \
 	--arch ${arch} \
 	--save_path ${SAVED} \
 	--config_path ./configs/NAS-PTB-BASE.config \
--- a/scripts-rnn/train-WT2.sh
+++ b/scripts-rnn/train-WT2.sh
@@ -1,4 +1,4 @@
-#!/usr/bin/env sh
+#!/bin/bash
 if [ "$#" -ne 1 ] ;then
  echo "Input illegal number of parameters " $#
  echo "Need 1 parameters for the architectures"
@@ -7,8 +7,18 @@ fi

 arch=$1
 SAVED=./output/NAS-RNN/Search-${arch}-WT2
+PY_C="./env/bin/python"

-python ./exps-rnn/train_rnn_base.py \
+if [ ! -f ${PY_C} ]; then
+  echo "Local Run with Python: "`which python`
+  PY_C="python"
+else
+  echo "Cluster Run with Python: "${PY_C}
+fi
+
+${PY_C} --version
+
+${PY_C} ./exps-rnn/train_rnn_base.py \
 	--arch ${arch} \
 	--save_path ${SAVED} \
 	--config_path ./configs/NAS-WT2-BASE.config \