# python ./data/decompress.py $TORCH_HOME/ILSVRC2012-TAR/ ./data/data/ILSVRC2012 tar # python ./data/decompress.py $TORCH_HOME/ILSVRC2012-ZIP/ ./data/data/ILSVRC2012 zip import os, gc, sys from pathlib import Path import multiprocessing def execute(cmds, idx, num): #print ('{:03d} :: {:03d} :: {:03d}'.format(idx, num, len(cmds))) for i, cmd in enumerate(cmds): if i % num == idx: print ('{:03d} :: {:03d} :: {:03d}/{:03d} : {:}'.format(idx, num, i, len(cmds), cmd)) os.system(cmd) def command(prefix, cmd): #print ('{:}{:}'.format(prefix, cmd)) #if execute: os.system(cmd) #xcmd = '(echo {:} $(date +\"%Y-%h-%d--%T\") \"PID:\"$$; {:}; sleep 0.1s)'.format(prefix, cmd) #xcmd = '(echo {:} $(date +\"%Y-%h-%d--%T\") \"PID:\"$$; {:}; sleep 0.1s; pmap $$; echo \"\")'.format(prefix, cmd) #xcmd = '(echo {:} $(date +\"%Y-%h-%d--%T\") \"PID:\"$$; {:}; sleep 0.1s; pmap $$; echo \"\")'.format(prefix, cmd) xcmd = '(echo {:} $(date +\"%Y-%h-%d--%T\") \"PID:\"$$; {:}; sleep 0.1s)'.format(prefix, cmd) return xcmd def mkILSVRC2012(destination): destination = destination.resolve() destination.mkdir(parents=True, exist_ok=True) os.system('rm -rf {:}'.format(destination)) destination.mkdir(parents=True, exist_ok=True) (destination/'train').mkdir(parents=True, exist_ok=True) def main(source, destination, xtype): assert source.exists(), '{:} does not exist'.format(source) assert (source/'train' ).exists(), '{:}/train does not exist'.format(source) if xtype == 'tar' : assert (source/'val.tar').exists(), '{:}/val does not exist'.format(source) elif xtype == 'zip': assert (source/'val.zip').exists(), '{:}/val does not exist'.format(source) else : raise ValueError('invalid unzip type : {:}'.format(xtype)) #assert num_process > 0, 'invalid num_process : {:}'.format(num_process) source = source.resolve() mkILSVRC2012(destination) subdirs = list( (source / 'train').glob('n*') ) all_commands = [] assert len(subdirs) == 1000, 'ILSVRC2012 should contain 1000 classes instead of {:}.'.format( len(subdirs) ) for idx, subdir in enumerate(subdirs): name = subdir.name if xtype == 'tar' : cmd = command('{:03d}/{:03d}-th: '.format(idx, len(subdirs)), 'tar -xf {:} -C {:}'.format(source/'train'/'{:}'.format(name), destination / 'train')) elif xtype == 'zip': cmd = command('{:03d}/{:03d}-th: '.format(idx, len(subdirs)), 'unzip -qd {:} {:}'.format(destination / 'train', source/'train'/'{:}'.format(name))) else : raise ValueError('invalid unzip type : {:}'.format(xtype)) all_commands.append( cmd ) if xtype == 'tar' : cmd = command('', 'tar -xf {:} -C {:}'.format(source/'val.tar', destination)) elif xtype == 'zip': cmd = command('', 'unzip -qd {:} {:}'.format(destination, source/'val.zip')) else : raise ValueError('invalid unzip type : {:}'.format(xtype)) all_commands.append( cmd ) #print ('Collect all commands done : {:} lines'.format( len(all_commands) )) for i, cmd in enumerate(all_commands): print(cmd) # os.system(cmd) # print ('{:03d}/{:03d} : {:}'.format(i, len(all_commands), cmd)) # gc.collect() """ records = [] for i in range(num_process): process = multiprocessing.Process(target=execute, args=(all_commands, i, num_process)) process.start() records.append(process) for process in records: process.join() """ if __name__ == '__main__': assert len(sys.argv) == 4, 'invalid argv : {:}'.format(sys.argv) source, destination = Path(sys.argv[1]), Path(sys.argv[2]) #num_process = int(sys.argv[3]) if sys.argv[3] == 'wget': with open(source) as f: content = f.readlines() content = [x.strip() for x in content] assert len(content) == 1000, 'invalid lines={:} from {:}'.format( len(content), source ) mkILSVRC2012(destination) all_commands = [] cmd = command('make-val', 'wget -q http://10.127.2.44:8000/ILSVRC2012-TAR/val.tar --directory-prefix={:} ; tar -xf {:} -C {:} ; rm {:}'.format(destination, destination / 'val.tar', destination, destination / 'val.tar')) all_commands.append(cmd) for idx, name in enumerate(content): cmd = command('{:03d}/{:03d}-th: '.format(idx, len(content)), 'wget -q http://10.127.2.44:8000/ILSVRC2012-TAR/train/{:}.tar --directory-prefix={:} ; tar -xf {:}.tar -C {:} ; rm {:}.tar'.format(name, destination / 'train', destination / 'train' / name, destination / 'train', destination / 'train' / name)) all_commands.append(cmd) for i, cmd in enumerate(all_commands): print(cmd) else: main(source, destination, sys.argv[3])