From 3f483c37e70e94fe3804d0e148284f0672d4c1fa Mon Sep 17 00:00:00 2001 From: Xuanyi Dong <280835372@qq.com> Date: Tue, 2 Apr 2019 14:58:25 +0800 Subject: [PATCH] update ZIP for imagenet --- data/compress.py | 17 +++++++++++------ data/decompress.py | 26 +++++++++++++++++--------- scripts-cluster/submit.sh | 2 +- scripts-cnn/train-imagenet.sh | 4 +++- 4 files changed, 32 insertions(+), 17 deletions(-) diff --git a/data/compress.py b/data/compress.py index 9ce97a8..e84df9d 100644 --- a/data/compress.py +++ b/data/compress.py @@ -1,4 +1,5 @@ -# python ./data/compress.py $TORCH_HOME/ILSVRC2012/ $TORCH_HOME/ILSVRC2012-TAR +# python ./data/compress.py $TORCH_HOME/ILSVRC2012/ $TORCH_HOME/ILSVRC2012-TAR tar +# python ./data/compress.py $TORCH_HOME/ILSVRC2012/ $TORCH_HOME/ILSVRC2012-ZIP zip import os, sys from pathlib import Path @@ -8,7 +9,7 @@ def command(prefix, cmd): os.system(cmd) -def main(source, destination): +def main(source, destination, xtype): assert source.exists(), '{:} does not exist'.format(source) assert (source/'train').exists(), '{:}/train does not exist'.format(source) assert (source/'val' ).exists(), '{:}/val does not exist'.format(source) @@ -21,13 +22,17 @@ def main(source, destination): subdirs = list( (source / 'train').glob('n*') ) assert len(subdirs) == 1000, 'ILSVRC2012 should contain 1000 classes instead of {:}.'.format( len(subdirs) ) - command('', 'tar -cf {:} -C {:} val'.format(destination/'val.tar', source)) + if xtype == 'tar' : command('', 'tar -cf {:} -C {:} val'.format(destination/'val.tar', source)) + elif xtype == 'zip': command('', '(cd {:} ; zip -r {:} val)'.format(source, destination/'val.zip')) + else: raise ValueError('invalid compress type : {:}'.format(xtype)) for idx, subdir in enumerate(subdirs): name = subdir.name - command('{:03d}/{:03d}-th: '.format(idx, len(subdirs)), 'tar -cf {:} -C {:} {:}'.format(destination/'train'/'{:}.tar'.format(name), source / 'train', name)) + if xtype == 'tar' : command('{:03d}/{:03d}-th: '.format(idx, len(subdirs)), 'tar -cf {:} -C {:} {:}'.format(destination/'train'/'{:}.tar'.format(name), source / 'train', name)) + elif xtype == 'zip': command('{:03d}/{:03d}-th: '.format(idx, len(subdirs)), '(cd {:}; zip -r {:} {:})'.format(source / 'train', destination/'train'/'{:}.zip'.format(name), name)) + else: raise ValueError('invalid compress type : {:}'.format(xtype)) if __name__ == '__main__': - assert len(sys.argv) == 3, 'invalid argv : {:}'.format(sys.argv) + assert len(sys.argv) == 4, 'invalid argv : {:}'.format(sys.argv) source, destination = Path(sys.argv[1]), Path(sys.argv[2]) - main(source, destination) + main(source, destination, sys.argv[3]) diff --git a/data/decompress.py b/data/decompress.py index 14897f8..9811dd5 100644 --- a/data/decompress.py +++ b/data/decompress.py @@ -1,4 +1,5 @@ -# python ./data/decompress.py $TORCH_HOME/ILSVRC2012-TAR/ ./data/data/ILSVRC2012 +# python ./data/decompress.py $TORCH_HOME/ILSVRC2012-TAR/ ./data/data/ILSVRC2012 tar +# python ./data/decompress.py $TORCH_HOME/ILSVRC2012-ZIP/ ./data/data/ILSVRC2012 zip import os, gc, sys from pathlib import Path import multiprocessing @@ -15,14 +16,17 @@ def execute(cmds, idx, num): def command(prefix, cmd): #print ('{:}{:}'.format(prefix, cmd)) #if execute: os.system(cmd) - return cmd + xcmd = '(echo {:}; {:}; sleep 0.1s)'.format(prefix, cmd) + return xcmd -def main(source, destination, num_process): +def main(source, destination, xtype): assert source.exists(), '{:} does not exist'.format(source) assert (source/'train' ).exists(), '{:}/train does not exist'.format(source) - assert (source/'val.tar').exists(), '{:}/val does not exist'.format(source) - assert num_process > 0, 'invalid num_process : {:}'.format(num_process) + if xtype == 'tar' : assert (source/'val.tar').exists(), '{:}/val does not exist'.format(source) + elif xtype == 'zip': assert (source/'val.zip').exists(), '{:}/val does not exist'.format(source) + else : raise ValueError('invalid unzip type : {:}'.format(xtype)) + #assert num_process > 0, 'invalid num_process : {:}'.format(num_process) source = source.resolve() destination = destination.resolve() destination.mkdir(parents=True, exist_ok=True) @@ -33,11 +37,15 @@ def main(source, destination, num_process): subdirs = list( (source / 'train').glob('n*') ) all_commands = [] assert len(subdirs) == 1000, 'ILSVRC2012 should contain 1000 classes instead of {:}.'.format( len(subdirs) ) - cmd = command('', 'tar -xf {:} -C {:}'.format(source/'val.tar', destination)) + if xtype == 'tar' : cmd = command('', 'tar -xf {:} -C {:}'.format(source/'val.tar', destination)) + elif xtype == 'zip': cmd = command('', 'unzip -qd {:} {:}'.format(destination, source/'val.zip')) + else : raise ValueError('invalid unzip type : {:}'.format(xtype)) all_commands.append( cmd ) for idx, subdir in enumerate(subdirs): name = subdir.name - cmd = command('{:03d}/{:03d}-th: '.format(idx, len(subdirs)), 'tar -xf {:} -C {:}'.format(source/'train'/'{:}'.format(name), destination / 'train')) + if xtype == 'tar' : cmd = command('{:03d}/{:03d}-th: '.format(idx, len(subdirs)), 'tar -xf {:} -C {:}'.format(source/'train'/'{:}'.format(name), destination / 'train')) + elif xtype == 'zip': cmd = command('{:03d}/{:03d}-th: '.format(idx, len(subdirs)), 'unzip -qd {:} {:}'.format(destination / 'train', source/'train'/'{:}'.format(name))) + else : raise ValueError('invalid unzip type : {:}'.format(xtype)) all_commands.append( cmd ) #print ('Collect all commands done : {:} lines'.format( len(all_commands) )) @@ -61,5 +69,5 @@ def main(source, destination, num_process): if __name__ == '__main__': assert len(sys.argv) == 4, 'invalid argv : {:}'.format(sys.argv) source, destination = Path(sys.argv[1]), Path(sys.argv[2]) - num_process = int(sys.argv[3]) - main(source, destination, num_process) + #num_process = int(sys.argv[3]) + main(source, destination, sys.argv[3]) diff --git a/scripts-cluster/submit.sh b/scripts-cluster/submit.sh index 43c26bb..59f2017 100644 --- a/scripts-cluster/submit.sh +++ b/scripts-cluster/submit.sh @@ -22,7 +22,7 @@ TIME=$(date +"%Y-%h-%d--%T") TIME="${TIME//:/-}" JOB_SCRIPT="${FDIR}/tmps/job-${TIME}.sh" -HDFS_DIR="/user/COMM_KM_Data/${USER}/logs/alljobs/${TIME}" +HDFS_DIR="/user/COMM_KM_Data/${USER}/logs/alljobs/${NAME}-${TIME}" echo "JOB-SCRIPT: "${JOB_SCRIPT} cat ${FDIR}/job-script.sh > ${JOB_SCRIPT} diff --git a/scripts-cnn/train-imagenet.sh b/scripts-cnn/train-imagenet.sh index a53d7fb..d182060 100644 --- a/scripts-cnn/train-imagenet.sh +++ b/scripts-cnn/train-imagenet.sh @@ -27,7 +27,9 @@ else echo "Unzip ILSVRC2012" tar --version #tar xf ./hadoop-data/ILSVRC2012.tar -C ${TORCH_HOME} - ${PY_C} ./data/decompress.py ./hadoop-data/ILSVRC2012-TAR ./data/data/ILSVRC2012 5 | bash + #${PY_C} ./data/decompress.py ./hadoop-data/ILSVRC2012-TAR ./data/data/ILSVRC2012 tar > ./data/data/get_imagenet.sh + ${PY_C} ./data/decompress.py ./hadoop-data/ILSVRC2012-ZIP ./data/data/ILSVRC2012 zip > ./data/data/get_imagenet.sh + bash ./data/data/get_imagenet.sh echo "Unzip ILSVRC2012 done" fi