update de-compress and scripts

This commit is contained in:
Xuanyi Dong 2019-04-01 21:12:50 +08:00
parent 90fb659b28
commit 785fefe1d6
3 changed files with 102 additions and 1 deletions

33
data/compress.py Normal file
View File

@ -0,0 +1,33 @@
# python ./data/compress.py $TORCH_HOME/ILSVRC2012/ $TORCH_HOME/ILSVRC2012-TAR
import os, sys
from pathlib import Path
def command(prefix, cmd):
    """Echo a shell command with a progress prefix, run it, and report its status.

    Args:
        prefix: Text printed immediately before the command (e.g. a progress
            counter); it is not part of what gets executed.
        cmd: Shell command line handed to ``os.system``.

    Returns:
        The value returned by ``os.system(cmd)``; 0 means the command succeeded.
    """
    print ('{:}{:}'.format(prefix, cmd))
    status = os.system(cmd)
    if status != 0:
        # Stay best-effort (no exception), but never let a failed command pass
        # silently: a broken archive is much harder to diagnose later.
        print('[FAILED status={:}] {:}'.format(status, cmd), file=sys.stderr)
    return status
def main(source, destination):
    """Pack an ILSVRC2012 tree into one tar archive per class plus ``val.tar``.

    Layout produced under ``destination``::

        val.tar            <- archive of ``source/val``
        train/<name>.tar   <- one archive per ``source/train/n*`` directory

    Args:
        source: ``pathlib.Path`` of the dataset root; must contain ``train/``
            (with exactly 1000 ``n*`` entries) and ``val/``.
        destination: ``pathlib.Path`` of the output directory.  Any existing
            content is deleted before the archives are written.

    Raises:
        AssertionError: if ``source`` is missing, lacks ``train``/``val``, or
            does not contain exactly 1000 ``n*`` entries under ``train``.
        ValueError: if ``destination`` is ``source`` itself or one of its
            parent directories (wiping it would delete the dataset).
    """
    # Local imports: these modules are not part of the file-level import line.
    import shlex
    import shutil

    assert source.exists(), '{:} does not exist'.format(source)
    assert (source/'train').exists(), '{:}/train does not exist'.format(source)
    assert (source/'val'  ).exists(), '{:}/val   does not exist'.format(source)
    source      = source.resolve()
    destination = destination.resolve()
    if destination == source or destination in source.parents:
        raise ValueError('refusing to wipe {:}: it contains the source {:}'.format(destination, source))

    # Validate the input BEFORE touching the destination, so a bad source
    # never costs us a previously generated output directory.
    # Sorted so the archive order (and the progress log) is reproducible.
    subdirs = sorted( (source / 'train').glob('n*') )
    assert len(subdirs) == 1000, 'ILSVRC2012 should contain 1000 classes instead of {:}.'.format( len(subdirs) )

    # Start from an empty destination.  The first mkdir guarantees the path
    # is a directory (it raises if it is a regular file) so rmtree is safe.
    destination.mkdir(parents=True, exist_ok=True)
    shutil.rmtree(str(destination))
    (destination/'train').mkdir(parents=True, exist_ok=True)

    def quote(path):
        # Paths are interpolated into a shell command line; quote them so
        # spaces or shell metacharacters cannot split or alter the command.
        return shlex.quote(str(path))

    command('', 'tar -cf {:} -C {:} val'.format(quote(destination/'val.tar'), quote(source)))
    train_root = quote(source / 'train')
    for idx, subdir in enumerate(subdirs):
        name = subdir.name
        archive = destination/'train'/'{:}.tar'.format(name)
        # -C keeps archive members relative to train/, i.e. "<name>/...".
        command('{:03d}/{:03d}-th: '.format(idx, len(subdirs)), 'tar -cf {:} -C {:} {:}'.format(quote(archive), train_root, quote(name)))
if __name__ == '__main__':
    # CLI: compress.py <source-dir> <destination-dir>
    argv = sys.argv
    assert len(argv) == 3, 'invalid argv : {:}'.format(argv)
    source = Path(argv[1])
    destination = Path(argv[2])
    main(source, destination)

65
data/decompress.py Normal file
View File

@ -0,0 +1,65 @@
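# python ./data/decompress.py $TORCH_HOME/ILSVRC2012-TAR/ ./data/data/ILSVRC2012 5 | bash
# (the third argument, num_process, is required; the script only prints the tar commands, so pipe them into a shell)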
import os, gc, sys
from pathlib import Path
import multiprocessing
def execute(cmds, idx, num):
    """Run this worker's share of ``cmds`` and count the commands that failed.

    The list is split round-robin: the worker with index ``idx`` runs every
    command whose position ``i`` satisfies ``i % num == idx``.

    Args:
        cmds: Sequence of shell command lines.
        idx: Index of this worker.
        num: Total number of workers.

    Returns:
        The number of commands run by this worker for which ``os.system``
        returned a non-zero status.
    """
    #print ('{:03d} :: {:03d} :: {:03d}'.format(idx, num, len(cmds)))
    total = len(cmds)
    failures = 0
    for i, cmd in enumerate(cmds):
        if i % num != idx:
            continue  # belongs to another worker
        print ('{:03d} :: {:03d} :: {:03d}/{:03d} : {:}'.format(idx, num, i, total, cmd))
        status = os.system(cmd)
        if status != 0:
            # Keep processing the remaining commands, but make the failure
            # visible instead of silently dropping the exit status.
            failures += 1
            print('[FAILED status={:}] {:}'.format(status, cmd), file=sys.stderr)
    return failures
def command(prefix, cmd):
    """Hand ``cmd`` back unchanged.

    Nothing is printed or executed here; ``prefix`` is accepted but unused.
    """
    return cmd
def main(source, destination, num_process):
    """Print the ``tar`` commands that unpack the per-class ILSVRC2012 archives.

    The commands are written to stdout, one per line (``val.tar`` first, then
    every ``train/n*`` archive in sorted order).  They are NOT executed here:
    pipe the output into a shell to run them.  For that reason nothing else
    may be printed to stdout by this function.

    Args:
        source: ``pathlib.Path`` of the archive directory; must contain
            ``val.tar`` and ``train/`` with exactly 1000 ``n*`` entries.
        destination: ``pathlib.Path`` of the output directory.  Any existing
            content is deleted and an empty ``train/`` sub-directory is
            created for the extracted classes.
        num_process: Positive int.  It is validated but otherwise unused
            while the commands are only printed.

    Raises:
        AssertionError: if ``source`` is missing, lacks ``train``/``val.tar``,
            ``num_process`` is not positive, or ``train`` does not contain
            exactly 1000 ``n*`` entries.
        ValueError: if ``destination`` is ``source`` itself or one of its
            parent directories (wiping it would delete the archives).
    """
    # Local imports: these modules are not part of the file-level import line.
    import shlex
    import shutil

    assert source.exists(), '{:} does not exist'.format(source)
    assert (source/'train'  ).exists(), '{:}/train does not exist'.format(source)
    assert (source/'val.tar').exists(), '{:}/val.tar does not exist'.format(source)
    assert num_process > 0, 'invalid num_process : {:}'.format(num_process)
    source      = source.resolve()
    destination = destination.resolve()
    if destination == source or destination in source.parents:
        raise ValueError('refusing to wipe {:}: it contains the source {:}'.format(destination, source))

    # Validate the input BEFORE touching the destination, so a bad source
    # never costs us a previously extracted dataset.
    # Sorted so the emitted command list is reproducible.
    archives = sorted( (source / 'train').glob('n*') )
    assert len(archives) == 1000, 'ILSVRC2012 should contain 1000 classes instead of {:}.'.format( len(archives) )

    # Start from an empty destination.  The first mkdir guarantees the path
    # is a directory (it raises if it is a regular file) so rmtree is safe.
    destination.mkdir(parents=True, exist_ok=True)
    shutil.rmtree(str(destination))
    (destination/'train').mkdir(parents=True, exist_ok=True)

    def quote(path):
        # The lines are meant to be executed by a shell; quote every path so
        # spaces or shell metacharacters cannot split or alter a command.
        return shlex.quote(str(path))

    all_commands = ['tar -xf {:} -C {:}'.format(quote(source/'val.tar'), quote(destination))]
    train_out = quote(destination / 'train')
    for archive in archives:
        all_commands.append('tar -xf {:} -C {:}'.format(quote(archive), train_out))

    # To run the commands in-process instead, start ``num_process``
    # multiprocessing.Process workers with target=execute and
    # args=(all_commands, i, num_process), then join them.
    for cmd in all_commands:
        print(cmd)
if __name__ == '__main__':
    # CLI: decompress.py <source-tar-dir> <destination-dir> <num_process>
    argv = sys.argv
    assert len(argv) == 4, 'invalid argv : {:}'.format(argv)
    source = Path(argv[1])
    destination = Path(argv[2])
    num_process = int(argv[3])
    main(source, destination, num_process)

View File

@ -25,7 +25,10 @@ if [ ! -f ${PY_C} ]; then
else
echo "Cluster Run with Python: "${PY_C}
echo "Unzip ILSVRC2012"
tar xf ./hadoop-data/ILSVRC2012.tar -C ${TORCH_HOME}
tar --version
#tar xf ./hadoop-data/ILSVRC2012.tar -C ${TORCH_HOME}
${PY_C} ./data/decompress.py ./hadoop-data/ILSVRC2012-TAR ./data/data/ILSVRC2012 5 | bash
echo "Unzip ILSVRC2012 done"
fi
${PY_C} --version