Add more algorithms
This commit is contained in:
		
							
								
								
									
										14
									
								
								others/GDAS/scripts-cluster/README.md
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										14
									
								
								others/GDAS/scripts-cluster/README.md
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,14 @@ | ||||
| # Commands on Cluster | ||||
|  | ||||
| ## RNN | ||||
| ``` | ||||
| bash scripts-cluster/submit.sh yq01-v100-box-idl-2-8 WT2-GDAS 1 "bash ./scripts-rnn/train-WT2.sh GDAS" | ||||
| bash scripts-cluster/submit.sh yq01-v100-box-idl-2-8 PTB-GDAS 1 "bash ./scripts-rnn/train-PTB.sh GDAS" | ||||
| ``` | ||||
|  | ||||
| ## CNN | ||||
| ``` | ||||
| bash scripts-cluster/submit.sh yq01-v100-box-idl-2-8 CIFAR10-CUT-GDAS-F1 1 "bash ./scripts-cnn/train-cifar.sh GDAS_F1 cifar10  cut" | ||||
| bash scripts-cluster/submit.sh yq01-v100-box-idl-2-8 IMAGENET-GDAS-F1    1 "bash ./scripts-cnn/train-imagenet.sh GDAS_F1 52 14" | ||||
| bash scripts-cluster/submit.sh yq01-v100-box-idl-2-8 IMAGENET-GDAS-V1    1 "bash ./scripts-cnn/train-imagenet.sh GDAS_V1 50 14" | ||||
| ``` | ||||
							
								
								
									
										36
									
								
								others/GDAS/scripts-cluster/job-script.sh
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										36
									
								
								others/GDAS/scripts-cluster/job-script.sh
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,36 @@ | ||||
#!/bin/bash
#
# Cluster-node bootstrap: mount the AFS dataset share, unpack the CIFAR
# data, download and unpack the pre-built Python environment, and log job
# metadata.  submit.sh appends the real training command after the
# "# real commands" marker below, so this file must remain a valid prefix.
echo "CHECK-DATA-DIR START"
# Mount the remote dataset directory onto ./hadoop-data
# (fixed credentials for the COMM_KM_Data account).
sh /home/HGCP_Program/software-install/afs_mount/bin/afs_mount.sh \
    COMM_KM_Data COMM_km_2018 \
    "$(pwd)/hadoop-data" \
    afs://xingtian.afs.baidu.com:9902/user/COMM_KM_Data/dongxuanyi/datasets

export TORCH_HOME="./data/data/"
# Ensure the extraction target exists; tar -C fails on a missing directory.
mkdir -p "${TORCH_HOME}"
tar -xf ./hadoop-data/cifar.python.tar -C "${TORCH_HOME}" \
  || { echo "Failed to extract cifar.python.tar" >&2; exit 1; }

cifar_dir="${TORCH_HOME}/cifar.python"
if [ -d "${cifar_dir}" ]; then
  echo "Find cifar-dir: ${cifar_dir}"
else
  echo "Can not find cifar-dir: ${cifar_dir}"
  exit 1
fi
echo "CHECK-DATA-DIR DONE"

PID=$$

# config python: fetch the packaged environment through the cluster proxy
# and unpack it into ./env.  A failed download must abort the job instead
# of silently running the training command without Python.
PYTHON_ENV=py36_pytorch1.0_env0.1.3.tar.gz
wget -e "http_proxy=cp01-sys-hic-gpu-02.cp01:8888" \
    "http://cp01-sys-hic-gpu-02.cp01/HGCP_DEMO/${PYTHON_ENV}" > screen.log 2>&1 \
  || { echo "Failed to download ${PYTHON_ENV}" >&2; exit 1; }
tar xzf "${PYTHON_ENV}" \
  || { echo "Failed to unpack ${PYTHON_ENV}" >&2; exit 1; }

echo "JOB-PID   : ${PID}"
echo "JOB-PWD   : $(pwd)"
echo "JOB-files : $(ls)"
echo "JOB-CUDA_VISIBLE_DEVICES: ${CUDA_VISIBLE_DEVICES}"

./env/bin/python --version
echo "JOB-TORCH_HOME: ${TORCH_HOME}"

# real commands
							
								
								
									
										52
									
								
								others/GDAS/scripts-cluster/submit.sh
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										52
									
								
								others/GDAS/scripts-cluster/submit.sh
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,52 @@ | ||||
#!/bin/bash
# Submit a training job to the HGCP cluster.
# Usage: bash ./scripts-cluster/submit.sh ${QUEUE} ${JOB-NAME} ${GPUs} ${CMD}
#find -name "._*" | xargs rm -rf
ODIR=$(pwd)
FDIR=$(cd "$(dirname "$0")"; pwd)
echo "Bash-Dir  : ${ODIR}"
echo "File-Dir  : ${FDIR}"
echo "File-Name : ${0}"

if [ "$#" -ne 4 ]; then
  echo "Input illegal number of parameters $#"
  echo "Need 4 parameters for the queue-name, the job-name, the number-of-GPUs, and the command"
  exit 1
fi
# Drop stale bytecode caches so they are not shipped with --file-dir.
# -prune stops find from descending into directories rm just deleted.
find . -type d -name "__pycache__" -prune -exec rm -rf -- {} +

QUEUE=$1
NAME=$2
GPUs=$3
CMD=$4
TIME=$(date +"%Y-%h-%d--%T")
TIME="${TIME//:/-}"   # ':' is awkward in file names; make a safe timestamp

JOB_SCRIPT="${FDIR}/tmps/job-${TIME}.sh"
HDFS_DIR="/user/COMM_KM_Data/${USER}/logs/alljobs/${NAME}-${TIME}"
echo "JOB-SCRIPT: ${JOB_SCRIPT}"

# Compose the node-side script: shared bootstrap + the user command.
mkdir -p "${FDIR}/tmps"
cat "${FDIR}/job-script.sh" > "${JOB_SCRIPT}"
echo "${CMD}"              >> "${JOB_SCRIPT}"

# NOTE(review): HDP is presumably the hadoop/afs client wrapper set by the
# caller's environment — abort loudly if it is missing instead of running
# a bare "-mkdir".
: "${HDP:?environment variable HDP (hadoop client) must be set}"
${HDP} -mkdir "${HDFS_DIR}"
echo "Create ${HDFS_DIR} done!"
sleep 1s

HGCP_CLIENT_BIN="${HOME}/.hgcp/software-install/HGCP_client/bin"

"${HGCP_CLIENT_BIN}/submit" \
    --hdfs afs://xingtian.afs.baidu.com:9902 \
    --hdfs-user COMM_KM_Data \
    --hdfs-passwd COMM_km_2018 \
    --hdfs-path "${HDFS_DIR}" \
    --file-dir ./ \
    --job-name "${NAME}" \
    --queue-name "${QUEUE}" \
    --num-nodes 1 \
    --num-task-pernode 1 \
    --gpu-pnode "${GPUs}" \
    --time-limit 0 \
    --job-script "${JOB_SCRIPT}"

#--job-script ${FDIR}/job-script.sh
#echo "JOB-SCRIPT: " ${JOB_SCRIPT}
		Reference in New Issue
	
	Block a user