From 785fefe1d66e1bd694350ae91c4a27776b93eeda Mon Sep 17 00:00:00 2001
From: Xuanyi Dong <280835372@qq.com>
Date: Mon, 1 Apr 2019 21:12:50 +0800
Subject: [PATCH] update de-compress and scripts

---
Reviewer revision of the patch (the line structure was rebuilt from a
whitespace-collapsed copy; the context indentation of the shell hunk is
assumed to be two spaces and should be checked against the target file):
- both scripts: wipe the output tree with shutil.rmtree instead of an
  unquoted `rm -rf` shell string, and refuse a destination that is the
  source or one of its parent directories.
- both scripts: quote every path that ends up in a shell command, sort
  the class list, and raise explicit errors instead of using assert.
- compress.py: a failing tar now stops the run instead of being ignored.
- decompress.py: the missing-archive message now names val.tar; the
  commented-out code is removed.
- the blob ids of the two new files are omitted because their contents
  differ from the commit the patch was generated from.

 data/compress.py              | 67 ++++++++++++++++++++++++++++
 data/decompress.py            | 83 +++++++++++++++++++++++++++++++++++
 scripts-cnn/train-imagenet.sh |  5 ++-
 3 files changed, 154 insertions(+), 1 deletion(-)
 create mode 100644 data/compress.py
 create mode 100644 data/decompress.py

diff --git a/data/compress.py b/data/compress.py
new file mode 100644
--- /dev/null
+++ b/data/compress.py
@@ -0,0 +1,67 @@
+"""Pack an extracted ILSVRC2012 tree into one tar archive per class.
+
+Usage:
+    python ./data/compress.py $TORCH_HOME/ILSVRC2012/ $TORCH_HOME/ILSVRC2012-TAR
+
+Writes DESTINATION/val.tar and DESTINATION/train/CLASS.tar.
+"""
+import shlex
+import shutil
+import subprocess
+import sys
+from pathlib import Path
+
+NUM_CLASSES = 1000
+
+
+def command(prefix, cmd):
+    """Echo the shell command ``cmd`` after ``prefix``, then run it.
+
+    Raises subprocess.CalledProcessError when the command exits non-zero.
+    """
+    print('{:}{:}'.format(prefix, cmd))
+    subprocess.run(cmd, shell=True, check=True)
+
+
+def main(source, destination):
+    """Archive ``source``/val and every ``source``/train/n* class folder.
+
+    ``destination`` is wiped and recreated first, so it must not be
+    ``source`` itself or one of its parent directories.
+    """
+    for required in (source, source / 'train', source / 'val'):
+        if not required.exists():
+            raise FileNotFoundError('{:} does not exist'.format(required))
+    source = source.resolve()
+    destination = destination.resolve()
+    if destination == source or destination in source.parents:
+        raise ValueError('refusing to wipe {:}'.format(destination))
+    # Start from an empty tree so archives from earlier runs cannot linger.
+    if destination.is_dir():
+        shutil.rmtree(destination)
+    (destination / 'train').mkdir(parents=True, exist_ok=True)
+
+    # Sorted so the progress counter follows a reproducible order.
+    subdirs = sorted((source / 'train').glob('n*'))
+    if len(subdirs) != NUM_CLASSES:
+        raise ValueError(
+            'ILSVRC2012 should contain {:} classes instead of {:}.'.format(
+                NUM_CLASSES, len(subdirs)))
+
+    def tar_create(archive, parent, member):
+        # Quote each path: the command string is handed to a shell.
+        return 'tar -cf {:} -C {:} {:}'.format(
+            shlex.quote(str(archive)), shlex.quote(str(parent)), shlex.quote(member))
+
+    command('', tar_create(destination / 'val.tar', source, 'val'))
+    for idx, subdir in enumerate(subdirs):
+        name = subdir.name
+        prefix = '{:03d}/{:03d}-th: '.format(idx, len(subdirs))
+        archive = destination / 'train' / '{:}.tar'.format(name)
+        command(prefix, tar_create(archive, source / 'train', name))
+
+
+if __name__ == '__main__':
+    if len(sys.argv) != 3:
+        sys.exit('usage: {:} SOURCE DESTINATION'.format(sys.argv[0]))
+    main(Path(sys.argv[1]), Path(sys.argv[2]))
diff --git a/data/decompress.py b/data/decompress.py
new file mode 100644
--- /dev/null
+++ b/data/decompress.py
@@ -0,0 +1,83 @@
+"""Emit the shell commands that unpack an ILSVRC2012 per-class tar tree.
+
+Usage:
+    python ./data/decompress.py $TORCH_HOME/ILSVRC2012-TAR/ ./data/data/ILSVRC2012 5 | bash
+
+Only the tar commands go to stdout, one per line, so the output can be piped
+straight into a shell; main() must not print anything else there.
+"""
+import shlex
+import shutil
+import subprocess
+import sys
+from pathlib import Path
+
+NUM_CLASSES = 1000
+
+
+def execute(cmds, idx, num):
+    """Run the commands of ``cmds`` whose index ``i`` has ``i % num == idx``.
+
+    Not called by main(), which only prints the commands.
+    Raises subprocess.CalledProcessError when a command exits non-zero.
+    """
+    for i, cmd in enumerate(cmds):
+        if i % num == idx:
+            print('{:03d} :: {:03d} :: {:03d}/{:03d} : {:}'.format(
+                idx, num, i, len(cmds), cmd))
+            subprocess.run(cmd, shell=True, check=True)
+
+
+def command(prefix, cmd):
+    """Return ``cmd`` unchanged; ``prefix`` is accepted but not used."""
+    return cmd
+
+
+def main(source, destination, num_process):
+    """Recreate ``destination`` and print one ``tar -xf`` command per archive.
+
+    ``num_process`` is validated but otherwise unused: the commands are
+    printed, not executed, and the consumer of stdout decides how to run them.
+    ``destination`` is wiped first, so it must not be ``source`` itself or one
+    of its parent directories.
+    """
+    for required in (source, source / 'train', source / 'val.tar'):
+        if not required.exists():
+            raise FileNotFoundError('{:} does not exist'.format(required))
+    if num_process <= 0:
+        raise ValueError('invalid num_process : {:}'.format(num_process))
+    source = source.resolve()
+    destination = destination.resolve()
+    if destination == source or destination in source.parents:
+        raise ValueError('refusing to wipe {:}'.format(destination))
+    # Start from an empty tree so files from earlier runs cannot linger.
+    if destination.is_dir():
+        shutil.rmtree(destination)
+    (destination / 'train').mkdir(parents=True, exist_ok=True)
+
+    # Sorted so the emitted command list is reproducible.
+    archives = sorted((source / 'train').glob('n*'))
+    if len(archives) != NUM_CLASSES:
+        raise ValueError(
+            'ILSVRC2012 should contain {:} classes instead of {:}.'.format(
+                NUM_CLASSES, len(archives)))
+
+    def tar_extract(archive, target):
+        # Quote each path: the printed command is interpreted by a shell.
+        return 'tar -xf {:} -C {:}'.format(
+            shlex.quote(str(archive)), shlex.quote(str(target)))
+
+    all_commands = [command('', tar_extract(source / 'val.tar', destination))]
+    for idx, archive in enumerate(archives):
+        prefix = '{:03d}/{:03d}-th: '.format(idx, len(archives))
+        all_commands.append(
+            command(prefix, tar_extract(archive, destination / 'train')))
+
+    for cmd in all_commands:
+        print(cmd)
+
+
+if __name__ == '__main__':
+    if len(sys.argv) != 4:
+        sys.exit('usage: {:} SOURCE DESTINATION NUM_PROCESS'.format(sys.argv[0]))
+    main(Path(sys.argv[1]), Path(sys.argv[2]), int(sys.argv[3]))
diff --git a/scripts-cnn/train-imagenet.sh b/scripts-cnn/train-imagenet.sh
index 9cb5167..a53d7fb 100644
--- a/scripts-cnn/train-imagenet.sh
+++ b/scripts-cnn/train-imagenet.sh
@@ -25,7 +25,10 @@ if [ ! -f ${PY_C} ]; then
 else
   echo "Cluster Run with Python: "${PY_C}
   echo "Unzip ILSVRC2012"
-  tar xf ./hadoop-data/ILSVRC2012.tar -C ${TORCH_HOME}
+  tar --version
+  #tar xf ./hadoop-data/ILSVRC2012.tar -C ${TORCH_HOME}
+  ${PY_C} ./data/decompress.py ./hadoop-data/ILSVRC2012-TAR ./data/data/ILSVRC2012 5 | bash
+  echo "Unzip ILSVRC2012 done"
 fi
 
 ${PY_C} --version