update missing data of GDAS

This commit is contained in:
D-X-Y 2019-09-28 20:10:23 +10:00
parent 0f46f63a25
commit 180702ab8e
22 changed files with 122858 additions and 0 deletions

BIN
others/GDAS/data/GDAS.pdf Normal file

Binary file not shown.

BIN
others/GDAS/data/GDAS.png Executable file

Binary file not shown.

After

Width:  |  Height:  |  Size: 514 KiB

View File

@ -0,0 +1,49 @@
# https://github.com/salesforce/awd-lstm-lm
# Download and normalize the four language-modeling datasets
# (WikiText-2, WikiText-103, word-level PTB, character-level PTB)
# into data/<name>/{train,valid,test}.txt.
echo "=== Acquiring datasets ==="
echo "---"
mkdir -p save
mkdir -p data
cd data
echo "- Downloading WikiText-2 (WT2)"
wget --quiet --continue https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-v1.zip
unzip -q wikitext-2-v1.zip
cd wikitext-2
mv wiki.train.tokens train.txt
mv wiki.valid.tokens valid.txt
mv wiki.test.tokens test.txt
cd ..
# Fixed label: this is WikiText-103, not WT2; also silence wget like the other downloads.
echo "- Downloading WikiText-103 (WT103)"
wget --quiet --continue https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-v1.zip
unzip -q wikitext-103-v1.zip
cd wikitext-103
mv wiki.train.tokens train.txt
mv wiki.valid.tokens valid.txt
mv wiki.test.tokens test.txt
cd ..
echo "- Downloading Penn Treebank (PTB)"
wget --quiet --continue http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz
tar -xzf simple-examples.tgz
mkdir -p penn
cd penn
mv ../simple-examples/data/ptb.train.txt train.txt
mv ../simple-examples/data/ptb.test.txt test.txt
mv ../simple-examples/data/ptb.valid.txt valid.txt
cd ..
echo "- Downloading Penn Treebank (Character)"
mkdir -p pennchar
cd pennchar
mv ../simple-examples/data/ptb.char.train.txt train.txt
mv ../simple-examples/data/ptb.char.test.txt test.txt
mv ../simple-examples/data/ptb.char.valid.txt valid.txt
cd ..
rm -rf simple-examples/
echo "---"
echo "Happy language modeling :)"

View File

@ -0,0 +1,100 @@
n01532829
n01560419
n01580077
n01614925
n01664065
n01751748
n01871265
n01924916
n02087394
n02091134
n02091244
n02094433
n02097209
n02102040
n02102480
n02105251
n02106662
n02108422
n02108551
n02123597
n02165105
n02190166
n02268853
n02279972
n02408429
n02412080
n02443114
n02488702
n02509815
n02606052
n02701002
n02782093
n02794156
n02802426
n02804414
n02808440
n02906734
n02917067
n02950826
n02963159
n03017168
n03042490
n03045698
n03063689
n03065424
n03100240
n03109150
n03124170
n03131574
n03272562
n03345487
n03443371
n03461385
n03527444
n03690938
n03692522
n03721384
n03729826
n03792782
n03838899
n03843555
n03874293
n03877472
n03877845
n03908618
n03929660
n03930630
n03933933
n03970156
n03976657
n03982430
n04004767
n04065272
n04141975
n04146614
n04152593
n04192698
n04200800
n04204347
n04317175
n04326547
n04344873
n04370456
n04389033
n04501370
n04515003
n04542943
n04554684
n04562935
n04596742
n04597913
n04606251
n07583066
n07718472
n07734744
n07873807
n07880968
n09229709
n12768682
n12998815

View File

@ -0,0 +1,15 @@
# ImageNet
The class names of ImageNet-1K are in `classes.txt`.
# A 100-class subset of ImageNet-1K : ImageNet-100
The class names of ImageNet-100 are in `ImageNet-100.txt`.
Running `python split-imagenet.py` will automatically create ImageNet-100 from the data of ImageNet-1K. By default, we assume the ImageNet-1K data is located at `~/.torch/ILSVRC2012`. If your data is in a different location, you need to modify line-19 and line-20 in `split-imagenet.py`.
# Tiny-ImageNet
The official website is [here](https://tiny-imagenet.herokuapp.com/). Please run `python tiny-imagenet.py` to generate the correct format of Tiny ImageNet for training.
# PTB and WT2
Run `bash Get-PTB-WT2.sh` to download the data.

1000
others/GDAS/data/classes.txt Normal file

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,38 @@
# python ./data/compress.py $TORCH_HOME/ILSVRC2012/ $TORCH_HOME/ILSVRC2012-TAR tar
# python ./data/compress.py $TORCH_HOME/ILSVRC2012/ $TORCH_HOME/ILSVRC2012-ZIP zip
import os, sys
from pathlib import Path
def command(prefix, cmd):
    """Print *prefix* followed by *cmd*, then run *cmd* through the shell."""
    line = '{:}{:}'.format(prefix, cmd)
    print(line)
    os.system(cmd)
def main(source, destination, xtype):
    """Archive an ILSVRC2012 tree into per-class tar/zip files.

    source: Path to an ImageNet root containing train/ and val/.
    destination: Path that will be wiped and recreated with val.<ext> plus
                 one archive per class under train/.
    xtype: 'tar' or 'zip'.
    """
    assert source.exists(), '{:} does not exist'.format(source)
    assert (source/'train').exists(), '{:}/train does not exist'.format(source)
    assert (source/'val' ).exists(), '{:}/val does not exist'.format(source)
    source = source.resolve()
    destination = destination.resolve()
    # mkdir before rm -rf so the resolved path exists; then recreate cleanly.
    destination.mkdir(parents=True, exist_ok=True)
    os.system('rm -rf {:}'.format(destination))
    destination.mkdir(parents=True, exist_ok=True)
    (destination/'train').mkdir(parents=True, exist_ok=True)
    subdirs = list( (source / 'train').glob('n*') )
    assert len(subdirs) == 1000, 'ILSVRC2012 should contain 1000 classes instead of {:}.'.format( len(subdirs) )
    # Archive the whole validation split as a single val.tar / val.zip.
    if xtype == 'tar' : command('', 'tar -cf {:} -C {:} val'.format(destination/'val.tar', source))
    elif xtype == 'zip': command('', '(cd {:} ; zip -r {:} val)'.format(source, destination/'val.zip'))
    else: raise ValueError('invalid compress type : {:}'.format(xtype))
    # One archive per class directory under train/.
    for idx, subdir in enumerate(subdirs):
        name = subdir.name
        if xtype == 'tar' : command('{:03d}/{:03d}-th: '.format(idx, len(subdirs)), 'tar -cf {:} -C {:} {:}'.format(destination/'train'/'{:}.tar'.format(name), source / 'train', name))
        elif xtype == 'zip': command('{:03d}/{:03d}-th: '.format(idx, len(subdirs)), '(cd {:}; zip -r {:} {:})'.format(source / 'train', destination/'train'/'{:}.zip'.format(name), name))
        else: raise ValueError('invalid compress type : {:}'.format(xtype))
if __name__ == '__main__':
    # Usage: python compress.py <source-imagenet-dir> <destination-dir> <tar|zip>
    assert len(sys.argv) == 4, 'invalid argv : {:}'.format(sys.argv)
    source, destination = Path(sys.argv[1]), Path(sys.argv[2])
    main(source, destination, sys.argv[3])

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,94 @@
# python ./data/decompress.py $TORCH_HOME/ILSVRC2012-TAR/ ./data/data/ILSVRC2012 tar
# python ./data/decompress.py $TORCH_HOME/ILSVRC2012-ZIP/ ./data/data/ILSVRC2012 zip
import os, gc, sys
from pathlib import Path
import multiprocessing
def execute(cmds, idx, num):
    """Run one worker's share of *cmds*: every *num*-th command, offset *idx*."""
    total = len(cmds)
    for pos, cmd in enumerate(cmds):
        if pos % num != idx:
            continue
        print ('{:03d} :: {:03d} :: {:03d}/{:03d} : {:}'.format(idx, num, pos, total, cmd))
        os.system(cmd)
def command(prefix, cmd):
    """Wrap *cmd* in a subshell that first echoes *prefix*, a timestamp and the PID."""
    template = '(echo {:} $(date +"%Y-%h-%d--%T") "PID:"$$; {:}; sleep 0.1s)'
    return template.format(prefix, cmd)
def mkILSVRC2012(destination):
    """Wipe *destination* and recreate it with an empty train/ subdirectory."""
    destination = destination.resolve()
    # mkdir first so the path exists before the wipe, then start from a clean slate.
    destination.mkdir(parents=True, exist_ok=True)
    os.system('rm -rf {:}'.format(destination))
    for sub in (destination, destination / 'train'):
        sub.mkdir(parents=True, exist_ok=True)
def main(source, destination, xtype):
    """Emit (print) the shell commands that unpack per-class ImageNet archives.

    source: Path containing train/<class-archives> and val.tar or val.zip.
    destination: Path to be wiped and re-populated (via mkILSVRC2012).
    xtype: 'tar' or 'zip'.

    NOTE(review): the commands are only printed, not executed — the
    os.system call and the multiprocessing fan-out are deliberately disabled
    below; the output is meant to be piped to a shell/runner.
    """
    assert source.exists(), '{:} does not exist'.format(source)
    assert (source/'train' ).exists(), '{:}/train does not exist'.format(source)
    if xtype == 'tar' : assert (source/'val.tar').exists(), '{:}/val does not exist'.format(source)
    elif xtype == 'zip': assert (source/'val.zip').exists(), '{:}/val does not exist'.format(source)
    else : raise ValueError('invalid unzip type : {:}'.format(xtype))
    #assert num_process > 0, 'invalid num_process : {:}'.format(num_process)
    source = source.resolve()
    mkILSVRC2012(destination)
    subdirs = list( (source / 'train').glob('n*') )
    all_commands = []
    assert len(subdirs) == 1000, 'ILSVRC2012 should contain 1000 classes instead of {:}.'.format( len(subdirs) )
    # One extraction command per class archive under train/.
    for idx, subdir in enumerate(subdirs):
        name = subdir.name
        if xtype == 'tar' : cmd = command('{:03d}/{:03d}-th: '.format(idx, len(subdirs)), 'tar -xf {:} -C {:}'.format(source/'train'/'{:}'.format(name), destination / 'train'))
        elif xtype == 'zip': cmd = command('{:03d}/{:03d}-th: '.format(idx, len(subdirs)), 'unzip -qd {:} {:}'.format(destination / 'train', source/'train'/'{:}'.format(name)))
        else : raise ValueError('invalid unzip type : {:}'.format(xtype))
        all_commands.append( cmd )
    # Finally, the validation split as one archive.
    if xtype == 'tar' : cmd = command('', 'tar -xf {:} -C {:}'.format(source/'val.tar', destination))
    elif xtype == 'zip': cmd = command('', 'unzip -qd {:} {:}'.format(destination, source/'val.zip'))
    else : raise ValueError('invalid unzip type : {:}'.format(xtype))
    all_commands.append( cmd )
    #print ('Collect all commands done : {:} lines'.format( len(all_commands) ))
    for i, cmd in enumerate(all_commands):
        print(cmd)
        # os.system(cmd)
        # print ('{:03d}/{:03d} : {:}'.format(i, len(all_commands), cmd))
        # gc.collect()
    # Disabled parallel execution path, kept for reference:
    """
    records = []
    for i in range(num_process):
        process = multiprocessing.Process(target=execute, args=(all_commands, i, num_process))
        process.start()
        records.append(process)
    for process in records:
        process.join()
    """
if __name__ == '__main__':
    # Usage: python decompress.py <source> <destination> <tar|zip|wget>
    assert len(sys.argv) == 4, 'invalid argv : {:}'.format(sys.argv)
    source, destination = Path(sys.argv[1]), Path(sys.argv[2])
    #num_process = int(sys.argv[3])
    if sys.argv[3] == 'wget':
        # wget mode: <source> is a text file listing the 1000 class names;
        # print commands that fetch each class archive from an internal
        # mirror (10.127.2.44), extract it, and delete the archive.
        with open(source) as f:
            content = f.readlines()
        content = [x.strip() for x in content]
        assert len(content) == 1000, 'invalid lines={:} from {:}'.format( len(content), source )
        mkILSVRC2012(destination)
        all_commands = []
        cmd = command('make-val', 'wget -q http://10.127.2.44:8000/ILSVRC2012-TAR/val.tar --directory-prefix={:} ; tar -xf {:} -C {:} ; rm {:}'.format(destination, destination / 'val.tar', destination, destination / 'val.tar'))
        all_commands.append(cmd)
        for idx, name in enumerate(content):
            cmd = command('{:03d}/{:03d}-th: '.format(idx, len(content)), 'wget -q http://10.127.2.44:8000/ILSVRC2012-TAR/train/{:}.tar --directory-prefix={:} ; tar -xf {:}.tar -C {:} ; rm {:}.tar'.format(name, destination / 'train', destination / 'train' / name, destination / 'train', destination / 'train' / name))
            all_commands.append(cmd)
        for i, cmd in enumerate(all_commands): print(cmd)
    else:
        main(source, destination, sys.argv[3])

Binary file not shown.

After

Width:  |  Height:  |  Size: 139 KiB

View File

@ -0,0 +1,15 @@
import json
def main():
    """Sanity-check caption_all.json: every record must have at least one
    caption; print the image count and id statistics."""
    with open('caption_all.json', 'r') as cfile:
        cap_data = json.load(cfile)
    print ('There are {:} images'.format( len(cap_data) ))
    for idx, data in enumerate( cap_data ):
        assert len( data['captions'] ) > 0, 'invalid {:}-th caption length : {:} {:}'.format(idx, data['captions'], len(data['captions']))
    IDs = {data['id'] for data in cap_data}
    print ('IDs :: min={:}, max={:}, num={:}'.format(min(IDs), max(IDs), len(IDs)))
if __name__ == '__main__':
    # Validate the caption file in the current working directory.
    main()

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

661
others/GDAS/data/ps_mem.py Normal file
View File

@ -0,0 +1,661 @@
#!/usr/bin/env python
# Try to determine how much RAM is currently being used per program.
# Note per _program_, not per process. So for example this script
# will report RAM used by all httpd process together. In detail it reports:
# sum(private RAM for program processes) + sum(Shared RAM for program processes)
# The shared RAM is problematic to calculate, and this script automatically
# selects the most accurate method available for your kernel.
# Licence: LGPLv2
# Author: P@draigBrady.com
# Source: http://www.pixelbeat.org/scripts/ps_mem.py
# V1.0 06 Jul 2005 Initial release
# V1.1 11 Aug 2006 root permission required for accuracy
# V1.2 08 Nov 2006 Add total to output
# Use KiB,MiB,... for units rather than K,M,...
# V1.3 22 Nov 2006 Ignore shared col from /proc/$pid/statm for
# 2.6 kernels up to and including 2.6.9.
# There it represented the total file backed extent
# V1.4 23 Nov 2006 Remove total from output as it's meaningless
# (the shared values overlap with other programs).
# Display the shared column. This extra info is
# useful, especially as it overlaps between programs.
# V1.5 26 Mar 2007 Remove redundant recursion from human()
# V1.6 05 Jun 2007 Also report number of processes with a given name.
# Patch from riccardo.murri@gmail.com
# V1.7 20 Sep 2007 Use PSS from /proc/$pid/smaps if available, which
# fixes some over-estimation and allows totalling.
# Enumerate the PIDs directly rather than using ps,
# which fixes the possible race between reading
# RSS with ps, and shared memory with this program.
# Also we can show non truncated command names.
# V1.8 28 Sep 2007 More accurate matching for stats in /proc/$pid/smaps
# as otherwise could match libraries causing a crash.
# Patch from patrice.bouchand.fedora@gmail.com
# V1.9 20 Feb 2008 Fix invalid values reported when PSS is available.
# Reported by Andrey Borzenkov <arvidjaar@mail.ru>
# V3.13 17 Sep 2018
# http://github.com/pixelb/scripts/commits/master/scripts/ps_mem.py
# Notes:
#
# All interpreted programs where the interpreter is started
# by the shell or with env, will be merged to the interpreter
# (as that's what's given to exec). For e.g. all python programs
# starting with "#!/usr/bin/env python" will be grouped under python.
# You can change this by using the full command line but that will
# have the undesirable affect of splitting up programs started with
# differing parameters (for e.g. mingetty tty[1-6]).
#
# For 2.6 kernels up to and including 2.6.13 and later 2.4 redhat kernels
# (rmap vm without smaps) it can not be accurately determined how many pages
# are shared between processes in general or within a program in our case:
# http://lkml.org/lkml/2005/7/6/250
# A warning is printed if overestimation is possible.
# In addition for 2.6 kernels up to 2.6.9 inclusive, the shared
# value in /proc/$pid/statm is the total file-backed extent of a process.
# We ignore that, introducing more overestimation, again printing a warning.
# Since kernel 2.6.23-rc8-mm1 PSS is available in smaps, which allows
# us to calculate a more accurate value for the total RAM used by programs.
#
# Programs that use CLONE_VM without CLONE_THREAD are discounted by assuming
# they're the only programs that have the same /proc/$PID/smaps file for
# each instance. This will fail if there are multiple real instances of a
# program that then use CLONE_VM without CLONE_THREAD, or if a clone changes
# its memory map while we're checksumming each /proc/$PID/smaps.
#
# I don't take account of memory allocated for a program
# by other programs. For e.g. memory used in the X server for
# a program could be determined, but is not.
#
# FreeBSD is supported if linprocfs is mounted at /compat/linux/proc/
# FreeBSD 8.0 supports up to a level of Linux 2.6.16
import getopt
import time
import errno
import os
import sys
# The following exits cleanly on Ctrl-C or EPIPE
# while treating other exceptions as before.
def std_exceptions(etype, value, tb):
    """Exit quietly on Ctrl-C or a broken pipe; defer to the default hook otherwise."""
    sys.excepthook = sys.__excepthook__
    quiet = issubclass(etype, KeyboardInterrupt) or (
        issubclass(etype, IOError) and value.errno == errno.EPIPE)
    if not quiet:
        sys.__excepthook__(etype, value, tb)
sys.excepthook = std_exceptions
#
# Define some global variables
#
# Memory-page size in KiB: /proc/<pid>/statm reports figures in pages.
PAGESIZE = os.sysconf("SC_PAGE_SIZE") / 1024 #KiB
# Our own pid, so the report can exclude this script itself.
our_pid = os.getpid()
# Capability flags discovered while scanning smaps (set in getMemStats).
have_pss = 0
have_swap_pss = 0
class Unbuffered(object):
    """File-like wrapper that flushes the underlying stream after every write."""

    def __init__(self, stream):
        self.stream = stream

    def write(self, data):
        out = self.stream
        out.write(data)
        out.flush()

    def flush(self):
        self.stream.flush()

    def close(self):
        self.stream.close()
class Proc:
    """Access point for the procfs mount (/proc, or linprocfs on FreeBSD)."""

    def __init__(self):
        self.proc = '/compat/linux/proc' if os.uname()[0] == "FreeBSD" else '/proc'

    def path(self, *args):
        """Join *args* (stringified) under the procfs root."""
        parts = [str(a) for a in args]
        return os.path.join(self.proc, *parts)

    def open(self, *args):
        """Open a procfs file, mapping expected access errors to LookupError."""
        try:
            if sys.version_info < (3,):
                return open(self.path(*args))
            return open(self.path(*args), errors='ignore')
        except (IOError, OSError):
            err = sys.exc_info()[1]
            # Kernel thread, vanished process, or insufficient permission.
            if err.errno in (errno.ENOENT, errno.EPERM, errno.EACCES):
                raise LookupError
            raise
# Shared procfs accessor used by all helpers below.
proc = Proc()
#
# Functions
#
def parse_options():
    """Parse sys.argv for ps_mem.

    Returns the tuple (split_args, pids_to_show, watch, only_total,
    discriminate_by_pid, show_swap).  Exits with status 3 on bad usage
    and status 0 after --help / --version.
    """
    known_long = ['split-args', 'help', 'version', 'total',
                  'discriminate-by-pid', 'swap']
    try:
        opts, args = getopt.getopt(sys.argv[1:], "shtdSp:w:", known_long)
    except getopt.GetoptError:
        sys.stderr.write(help())
        sys.exit(3)
    if len(args):
        sys.stderr.write("Extraneous arguments: %s\n" % args)
        sys.exit(3)
    # Defaults.
    split_args = False
    pids_to_show = None
    discriminate_by_pid = False
    show_swap = False
    watch = None
    only_total = False
    for opt, arg in opts:
        if opt in ('-s', '--split-args'):
            split_args = True
        elif opt in ('-t', '--total'):
            only_total = True
        elif opt in ('-d', '--discriminate-by-pid'):
            discriminate_by_pid = True
        elif opt in ('-S', '--swap'):
            show_swap = True
        elif opt in ('-h', '--help'):
            sys.stdout.write(help())
            sys.exit(0)
        elif opt in ('--version',):
            sys.stdout.write('3.13'+'\n')
            sys.exit(0)
        elif opt in ('-p',):
            try:
                pids_to_show = [int(x) for x in arg.split(',')]
            except:
                sys.stderr.write(help())
                sys.exit(3)
        elif opt in ('-w',):
            try:
                watch = int(arg)
            except:
                sys.stderr.write(help())
                sys.exit(3)
    return (
        split_args,
        pids_to_show,
        watch,
        only_total,
        discriminate_by_pid,
        show_swap
    )
def help():
    """Return the usage text shown for -h/--help and on option errors."""
    usage = (
        'Usage: ps_mem [OPTION]...\n'
        'Show program core memory usage\n'
        '\n'
        ' -h, -help Show this help\n'
        ' -p <pid>[,pid2,...pidN] Only show memory usage PIDs in the '
        'specified list\n'
        ' -s, --split-args Show and separate by, all command line'
        ' arguments\n'
        ' -t, --total Show only the total value\n'
        ' -d, --discriminate-by-pid Show by process rather than by program\n'
        ' -S, --swap Show swap information\n'
        ' -w <N> Measure and show process memory every'
        ' N seconds\n'
    )
    return usage
# (major,minor,release)
def kernel_ver():
    """Return the running kernel version as a tuple of three ints.

    Reads /proc/sys/kernel/osrelease via the global ``proc`` helper.  A
    missing third component is padded, and any non-numeric suffix after
    '-' or '_' (e.g. '-generic') is stripped; unparsable parts become 0.
    """
    kv = proc.open('sys/kernel/osrelease').readline().split(".")[:3]
    last = len(kv)
    if last == 2:
        # e.g. '2.6' -> pad to three components.
        kv.append('0')
    last -= 1
    # Clean components from the end down to (but not including) the major.
    while last > 0:
        for char in "-_":
            kv[last] = kv[last].split(char)[0]
        try:
            int(kv[last])
        except:
            kv[last] = 0
        last -= 1
    return (int(kv[0]), int(kv[1]), int(kv[2]))
#return Private,Shared,Swap(Pss),unique_id
#Note shared is always a subset of rss (trs is not always)
def getMemStats(pid):
    """Return (Private, Shared, Swap, mem_id) for *pid*, values in KiB.

    Prefers /proc/<pid>/smaps (or the faster smaps_rollup) when present;
    falls back to statm heuristics on old kernels.  Side effect: sets the
    module-level flags ``have_pss`` / ``have_swap_pss`` when those fields
    are seen, which other functions use to decide whether totals are valid.
    ``mem_id`` is a hash of the smaps content, used to detect CLONE_VM
    clones sharing one address space.
    """
    global have_pss
    global have_swap_pss
    mem_id = pid #unique
    Private_lines = []
    Shared_lines = []
    Pss_lines = []
    # statm field 1 is resident size in pages; convert to KiB.
    Rss = (int(proc.open(pid, 'statm').readline().split()[1])
           * PAGESIZE)
    Swap_lines = []
    Swap_pss_lines = []
    Swap = 0
    if os.path.exists(proc.path(pid, 'smaps')): # stat
        smaps = 'smaps'
        if os.path.exists(proc.path(pid, 'smaps_rollup')):
            smaps = 'smaps_rollup' # faster to process
        lines = proc.open(pid, smaps).readlines() # open
        # Note we checksum smaps as maps is usually but
        # not always different for separate processes.
        mem_id = hash(''.join(lines))
        for line in lines:
            if line.startswith("Shared"):
                Shared_lines.append(line)
            elif line.startswith("Private"):
                Private_lines.append(line)
            elif line.startswith("Pss"):
                have_pss = 1
                Pss_lines.append(line)
            elif line.startswith("Swap:"):
                Swap_lines.append(line)
            elif line.startswith("SwapPss:"):
                have_swap_pss = 1
                Swap_pss_lines.append(line)
        Shared = sum([int(line.split()[1]) for line in Shared_lines])
        Private = sum([int(line.split()[1]) for line in Private_lines])
        #Note Shared + Private = Rss above
        #The Rss in smaps includes video card mem etc.
        if have_pss:
            pss_adjust = 0.5 # add 0.5KiB as this avg error due to truncation
            Pss = sum([float(line.split()[1])+pss_adjust for line in Pss_lines])
            Shared = Pss - Private
        if have_swap_pss:
            # The kernel supports SwapPss, that shows proportional swap share.
            # Note that Swap - SwapPss is not Private Swap.
            Swap = sum([int(line.split()[1]) for line in Swap_pss_lines])
        else:
            # Note that Swap = Private swap + Shared swap.
            Swap = sum([int(line.split()[1]) for line in Swap_lines])
    elif (2,6,1) <= kernel_ver() <= (2,6,9):
        # These kernels report total file-backed extent in statm's shared
        # column, which is useless here.
        Shared = 0 #lots of overestimation, but what can we do?
        Private = Rss
    else:
        Shared = int(proc.open(pid, 'statm').readline().split()[2])
        Shared *= PAGESIZE
        Private = Rss - Shared
    return (Private, Shared, Swap, mem_id)
def getCmdName(pid, split_args, discriminate_by_pid, exe_only=False):
    """Return the display name for *pid*'s program.

    Resolves the exe symlink, annotates replaced/removed binaries with
    [updated]/[deleted], and merges child workers with the parent program
    when their executables match.  Raises LookupError for kernel threads,
    vanished processes, or permission errors (propagated from proc.open /
    the readlink handling below).
    """
    cmdline = proc.open(pid, 'cmdline').read().split("\0")
    if cmdline[-1] == '' and len(cmdline) > 1:
        cmdline = cmdline[:-1]
    path = proc.path(pid, 'exe')
    try:
        path = os.readlink(path)
        # Some symlink targets were seen to contain NULs on RHEL 5 at least
        # https://github.com/pixelb/scripts/pull/10, so take string up to NUL
        path = path.split('\0')[0]
    except OSError:
        val = sys.exc_info()[1]
        if (val.errno == errno.ENOENT or # either kernel thread or process gone
            val.errno == errno.EPERM or
            val.errno == errno.EACCES):
            raise LookupError
        raise
    if split_args:
        return ' '.join(cmdline).replace('\n', ' ')
    if path.endswith(" (deleted)"):
        # Binary replaced or removed since the process started.
        path = path[:-10]
        if os.path.exists(path):
            path += " [updated]"
        else:
            #The path could be have prelink stuff so try cmdline
            #which might have the full path present. This helped for:
            #/usr/libexec/notification-area-applet.#prelink#.fX7LCT (deleted)
            if os.path.exists(cmdline[0]):
                path = cmdline[0] + " [updated]"
            else:
                path += " [deleted]"
    exe = os.path.basename(path)
    if exe_only: return exe
    proc_status = proc.open(pid, 'status').readlines()
    # "Name:\t<cmd>\n" -> truncated command name from the kernel.
    cmd = proc_status[0][6:-1]
    if exe.startswith(cmd):
        cmd = exe #show non truncated version
        #Note because we show the non truncated name
        #one can have separated programs as follows:
        #584.0 KiB +   1.0 MiB =   1.6 MiB    mozilla-thunder (exe -> bash)
        # 56.0 MiB +  22.2 MiB =  78.2 MiB    mozilla-thunderbird-bin
    else:
        #Lookup the parent's exe and use that if matching
        #which will merge "Web Content" with "firefox" for example
        ppid = 0
        for l in range(10):
            ps_line = proc_status[l]
            if ps_line.startswith('PPid:'):
                ppid = int(ps_line[6:-1])
                break
        if ppid:
            p_exe = getCmdName(ppid, False, False, exe_only=True)
            if exe == p_exe:
                cmd = exe
    if sys.version_info >= (3,):
        # Replace undecodable bytes so the name always prints cleanly.
        cmd = cmd.encode(errors='replace').decode()
    if discriminate_by_pid:
        cmd = '%s [%d]' % (cmd, pid)
    return cmd
#The following matches "du -h" output
#see also human.py
def human(num, power="Ki", units=None):
    """Format *num* (in KiB) like `du -h`, or scale it to *units* when given."""
    if units is not None:
        # Raw figure in the requested unit size (num is KiB -> bytes / units).
        return "%.f" % ((num * 1024) / units)
    scale = ["Ki", "Mi", "Gi", "Ti"]
    idx = scale.index(power)
    while num >= 1000:  # keep at most 4 digits
        num /= 1024.0
        idx += 1
    return "%.1f %sB" % (num, scale[idx])
def cmd_with_count(cmd, count):
    """Append the process count to *cmd* when more than one process shares the name."""
    return "%s (%u)" % (cmd, count) if count > 1 else cmd
#Warn of possible inaccuracies
#RAM:
#2 = accurate & can total
#1 = accurate only considering each process in isolation
#0 = some shared mem not reported
#-1= all shared mem not reported
#SWAP:
#2 = accurate & can total
#1 = accurate only considering each process in isolation
#-1= not available
def val_accuracy(show_swap):
    """http://wiki.apache.org/spamassassin/TopSharedMemoryBug

    Probe this kernel's capabilities (using our own pid's smaps) and
    return (ram_accuracy, swap_accuracy) per the legend above.
    """
    kv = kernel_ver()
    pid = os.getpid()
    swap_accuracy = -1
    if kv[:2] == (2,4):
        # 2.4 without Inact_ split can't separate shared pages at all.
        if proc.open('meminfo').read().find("Inact_") == -1:
            return 1, swap_accuracy
        return 0, swap_accuracy
    elif kv[:2] == (2,6):
        if os.path.exists(proc.path(pid, 'smaps')):
            swap_accuracy = 1
            # Pss present -> proportional shares, so totalling is valid.
            if proc.open(pid, 'smaps').read().find("Pss:")!=-1:
                return 2, swap_accuracy
            else:
                return 1, swap_accuracy
        if (2,6,1) <= kv <= (2,6,9):
            return -1, swap_accuracy
        return 0, swap_accuracy
    elif kv[0] > 2 and os.path.exists(proc.path(pid, 'smaps')):
        swap_accuracy = 1
        if show_swap and proc.open(pid, 'smaps').read().find("SwapPss:")!=-1:
            swap_accuracy = 2
        return 2, swap_accuracy
    else:
        return 1, swap_accuracy
def show_val_accuracy( ram_inacc, swap_inacc, only_total, show_swap ):
    """Print warnings for the accuracy levels from val_accuracy().

    With --total, inaccuracies are fatal: the process exits with status 1
    when the requested total cannot be trusted.
    """
    level = ("Warning","Error")[only_total]
    # Only show significant warnings
    if not show_swap:
        swap_inacc = 2
    elif only_total:
        ram_inacc = 2
    if ram_inacc == -1:
        sys.stderr.write(
            "%s: Shared memory is not reported by this system.\n" % level
        )
        sys.stderr.write(
            "Values reported will be too large, and totals are not reported\n"
        )
    elif ram_inacc == 0:
        sys.stderr.write(
            "%s: Shared memory is not reported accurately by this system.\n" % level
        )
        sys.stderr.write(
            "Values reported could be too large, and totals are not reported\n"
        )
    elif ram_inacc == 1:
        sys.stderr.write(
            "%s: Shared memory is slightly over-estimated by this system\n"
            "for each program, so totals are not reported.\n" % level
        )
    if swap_inacc == -1:
        sys.stderr.write(
            "%s: Swap is not reported by this system.\n" % level
        )
    elif swap_inacc == 1:
        sys.stderr.write(
            "%s: Swap is over-estimated by this system for each program,\n"
            "so totals are not reported.\n" % level
        )
    # Close stderr now so EPIPE surfaces through our excepthook.
    sys.stderr.close()
    if only_total:
        if show_swap:
            accuracy = swap_inacc
        else:
            accuracy = ram_inacc
        if accuracy != 2:
            sys.exit(1)
def get_memory_usage(pids_to_show, split_args, discriminate_by_pid,
                     include_self=False, only_self=False):
    """Scan /proc and aggregate memory per program name.

    Returns (sorted_cmds, shareds, count, total, swaps, total_swap) where
    sorted_cmds is a list of (name, ram_kib) sorted ascending by usage and
    filtered of zero entries.  Totals are only meaningful when PSS is
    available (see have_pss).
    """
    cmds = {}
    shareds = {}
    mem_ids = {}
    count = {}
    swaps = {}
    for pid in os.listdir(proc.path('')):
        if not pid.isdigit():
            continue
        pid = int(pid)
        # Some filters
        if only_self and pid != our_pid:
            continue
        if pid == our_pid and not include_self:
            continue
        if pids_to_show is not None and pid not in pids_to_show:
            continue
        try:
            cmd = getCmdName(pid, split_args, discriminate_by_pid)
        except LookupError:
            #operation not permitted
            #kernel threads don't have exe links or
            #process gone
            continue
        try:
            private, shared, swap, mem_id = getMemStats(pid)
        except RuntimeError:
            continue #process gone
        if shareds.get(cmd):
            if have_pss: #add shared portion of PSS together
                shareds[cmd] += shared
            elif shareds[cmd] < shared: #just take largest shared val
                shareds[cmd] = shared
        else:
            shareds[cmd] = shared
        cmds[cmd] = cmds.setdefault(cmd, 0) + private
        if cmd in count:
            count[cmd] += 1
        else:
            count[cmd] = 1
        # mem_id set per name detects processes sharing one address space.
        mem_ids.setdefault(cmd, {}).update({mem_id: None})
        # Swap (overcounting for now...)
        swaps[cmd] = swaps.setdefault(cmd, 0) + swap
    # Total swaped mem for each program
    total_swap = 0
    # Add shared mem for each program
    total = 0
    for cmd in cmds:
        cmd_count = count[cmd]
        if len(mem_ids[cmd]) == 1 and cmd_count > 1:
            # Assume this program is using CLONE_VM without CLONE_THREAD
            # so only account for one of the processes
            cmds[cmd] /= cmd_count
            if have_pss:
                shareds[cmd] /= cmd_count
        cmds[cmd] = cmds[cmd] + shareds[cmd]
        total += cmds[cmd] # valid if PSS available
        total_swap += swaps[cmd]
    sorted_cmds = sorted(cmds.items(), key=lambda x:x[1])
    sorted_cmds = [x for x in sorted_cmds if x[1]]
    return sorted_cmds, shareds, count, total, swaps, total_swap
def print_header(show_swap, discriminate_by_pid):
    """Write the column header for the human-readable report."""
    parts = [" Private + Shared = RAM used"]
    if show_swap:
        parts.append(" Swap used")
    parts.append("\tProgram")
    if discriminate_by_pid:
        parts.append("[pid]")
    parts.append("\n\n")
    sys.stdout.write("".join(parts))
def print_memory_usage(sorted_cmds, shareds, count, total, swaps, total_swap,
                       show_swap):
    """Write one formatted row per program, then totals when they are valid.

    Reads the module-level have_pss / have_swap_pss flags: totals are only
    printed when PSS (and SwapPss, for swap) made them meaningful.
    """
    for cmd in sorted_cmds:
        output_string = "%9s + %9s = %9s"
        # cmd is (name, ram_kib); private = ram - shared.
        output_data = (human(cmd[1]-shareds[cmd[0]]),
                       human(shareds[cmd[0]]), human(cmd[1]))
        if show_swap:
            output_string += " %9s"
            output_data += (human(swaps[cmd[0]]),)
        output_string += "\t%s\n"
        output_data += (cmd_with_count(cmd[0], count[cmd[0]]),)
        sys.stdout.write(output_string % output_data)
    # Only show totals if appropriate
    if have_swap_pss and show_swap: # kernel will have_pss
        sys.stdout.write("%s\n%s%9s%s%9s\n%s\n" %
                         ("-" * 45, " " * 24, human(total), " " * 3,
                          human(total_swap), "=" * 45))
    elif have_pss:
        sys.stdout.write("%s\n%s%9s\n%s\n" %
                         ("-" * 33, " " * 24, human(total), "=" * 33))
def verify_environment(pids_to_show):
    """Exit early unless we can produce meaningful output.

    Requires root for a full-system report (exit 1 otherwise), and a
    readable procfs (exit 2 when /proc is absent, i.e. unsupported OS).
    """
    if os.geteuid() != 0 and not pids_to_show:
        sys.stderr.write("Sorry, root permission required, or specify pids with -p\n")
        sys.stderr.close()
        sys.exit(1)
    try:
        # Probing the kernel version doubles as a procfs availability check.
        kernel_ver()
    except (IOError, OSError):
        val = sys.exc_info()[1]
        if val.errno == errno.ENOENT:
            sys.stderr.write(
                "Couldn't access " + proc.path('') + "\n"
                "Only GNU/Linux and FreeBSD (with linprocfs) are supported\n")
            sys.exit(2)
        else:
            raise
def main():
    """Entry point: parse options, then report once or loop in watch mode."""
    # Force the stdout and stderr streams to be unbuffered
    sys.stdout = Unbuffered(sys.stdout)
    sys.stderr = Unbuffered(sys.stderr)
    split_args, pids_to_show, watch, only_total, discriminate_by_pid, \
        show_swap = parse_options()
    verify_environment(pids_to_show)
    if not only_total:
        print_header(show_swap, discriminate_by_pid)
    if watch is not None:
        # Watch mode: repeat every `watch` seconds until the watched
        # processes disappear (empty result) or Ctrl-C.
        try:
            sorted_cmds = True
            while sorted_cmds:
                sorted_cmds, shareds, count, total, swaps, total_swap = \
                    get_memory_usage(pids_to_show, split_args,
                                     discriminate_by_pid)
                if only_total and show_swap and have_swap_pss:
                    sys.stdout.write(human(total_swap, units=1)+'\n')
                elif only_total and not show_swap and have_pss:
                    sys.stdout.write(human(total, units=1)+'\n')
                elif not only_total:
                    print_memory_usage(sorted_cmds, shareds, count, total,
                                       swaps, total_swap, show_swap)
                sys.stdout.flush()
                time.sleep(watch)
            else:
                # while/else: reached when the loop condition turns falsy.
                sys.stdout.write('Process does not exist anymore.\n')
        except KeyboardInterrupt:
            pass
    else:
        # This is the default behavior
        sorted_cmds, shareds, count, total, swaps, total_swap = \
            get_memory_usage(pids_to_show, split_args,
                             discriminate_by_pid)
        if only_total and show_swap and have_swap_pss:
            sys.stdout.write(human(total_swap, units=1)+'\n')
        elif only_total and not show_swap and have_pss:
            sys.stdout.write(human(total, units=1)+'\n')
        elif not only_total:
            print_memory_usage(sorted_cmds, shareds, count, total, swaps,
                               total_swap, show_swap)
    # We must close explicitly, so that any EPIPE exception
    # is handled by our excepthook, rather than the default
    # one which is reenabled after this script finishes.
    sys.stdout.close()
    ram_accuracy, swap_accuracy = val_accuracy( show_swap )
    show_val_accuracy( ram_accuracy, swap_accuracy, only_total, show_swap )
if __name__ == '__main__':
    # Script entry point.
    main()

View File

@ -0,0 +1,35 @@
#!/bin/bash
# Print the pending jobs for each group of cluster queues, highest priority first.

# show_section <title> <queue>...
# Prints a titled banner, then the jobs of every listed queue.
show_section() {
  local title="$1"
  shift
  echo '-------------------------------'
  echo "${title}"
  echo '-------------------------------'
  local queue
  for queue in "$@"
  do
    showjob -p ${queue}
    sleep 0.3s
  done
}

show_section 'Queue in high-priority clusters' yq01-v100-box-1-8 yq01-v100-box-idl-2-8
#queues="yq01-p40-3-8 yq01-p40-2-8 yq01-p40-box-1-8 yq01-v100-box-2-8"
show_section 'Queue in low-priority clusters' yq01-p40-3-8 yq01-p40-box-1-8 yq01-v100-box-2-8
show_section 'Queue for other IDL teams' yq01-v100-box-idl-8 yq01-v100-box-idl-3-8

View File

@ -0,0 +1,37 @@
import os, sys, random
from pathlib import Path
def sample_100_cls():
    """Draw a fixed (seed=111) 100-class sample from classes.txt and write
    the sorted names to ImageNet-100.txt."""
    with open('classes.txt') as f:
        content = f.readlines()
    content = [x.strip() for x in content]
    random.seed(111)  # fixed seed keeps the subset reproducible
    classes = sorted(random.sample(content, 100))
    with open('ImageNet-100.txt', 'w') as f:
        for cls in classes:
            f.write('{:}\n'.format(cls))
    print('-'*100)
if __name__ == "__main__":
#sample_100_cls()
IN1K_root = Path.home() / '.torch' / 'ILSVRC2012'
IN100_root = Path.home() / '.torch' / 'ILSVRC2012-100'
assert IN1K_root.exists(), 'ImageNet directory does not exist : {:}'.format(IN1K_root)
print ('Create soft link from ImageNet directory into : {:}'.format(IN100_root))
with open('ImageNet-100.txt', 'r') as f:
classes = f.readlines()
classes = [x.strip() for x in classes]
for sub in ['train', 'val']:
xdir = IN100_root / sub
if not xdir.exists(): xdir.mkdir(parents=True, exist_ok=True)
for idx, cls in enumerate(classes):
xdir = IN1K_root / 'train' / cls
assert xdir.exists(), '{:} does not exist'.format(xdir)
os.system('ln -s {:} {:}'.format(xdir, IN100_root / 'train' / cls))
xdir = IN1K_root / 'val' / cls
assert xdir.exists(), '{:} does not exist'.format(xdir)
os.system('ln -s {:} {:}'.format(xdir, IN100_root / 'val' / cls))

View File

@ -0,0 +1,53 @@
import os, sys
from pathlib import Path
url = "http://cs231n.stanford.edu/tiny-imagenet-200.zip"
def load_val():
    """Parse tiny-imagenet's val_annotations.txt and return two parallel
    lists: (image filenames, wnid labels)."""
    path = 'tiny-imagenet-200/val/val_annotations.txt'
    with open(path, 'r') as cfile:
        rows = [line.strip().split('\t') for line in cfile.readlines()]
    images = [row[0] for row in rows]
    labels = [row[1] for row in rows]
    return images, labels
def main():
    """Download tiny-imagenet-200 and rebuild val/ into per-class folders.

    Upstream keeps all validation images in one directory with labels in
    val_annotations.txt; this reshapes it to val/<wnid>/<image>, matching
    the layout of train/.
    """
    os.system("wget {:}".format(url))
    os.system("rm -rf tiny-imagenet-200")
    os.system("unzip -o tiny-imagenet-200.zip")
    images, labels = load_val()
    savedir = 'tiny-imagenet-200/new_val'
    if not os.path.exists(savedir): os.makedirs(savedir)
    for image, label in zip(images, labels):
        # Copy each image into its class directory.
        cdir = savedir + '/' + label
        if not os.path.exists(cdir): os.makedirs(cdir)
        ori_path = 'tiny-imagenet-200/val/images/' + image
        os.system("cp {:} {:}".format(ori_path, cdir))
    # Replace the flat val/ with the per-class tree.
    os.system("rm -rf tiny-imagenet-200/val")
    os.system("mv {:} tiny-imagenet-200/val".format(savedir))
def generate_salt_pepper():
    """Create val-noise/: every validation image corrupted with
    salt-and-pepper noise (p=0.2), mirroring the val/ class layout.

    Requires the third-party `imgaug` and `cv2` packages (imported lazily
    so the download path above works without them).
    """
    targetdir = Path('tiny-imagenet-200/val')
    noisedir = Path('tiny-imagenet-200/val-noise')
    assert targetdir.exists(), '{:} does not exist'.format(targetdir)
    from imgaug import augmenters as iaa
    import cv2
    aug = iaa.SaltAndPepper(p=0.2)
    for sub in targetdir.iterdir():
        if not sub.is_dir(): continue
        subdir = noisedir / sub.name
        if not subdir.exists(): os.makedirs('{:}'.format(subdir))
        images = sub.glob('*.JPEG')
        for image in images:
            I = cv2.imread(str(image))
            Inoise = aug.augment_image(I)
            savepath = subdir / image.name
            cv2.imwrite(str(savepath), Inoise)
        print ('{:} done'.format(sub))
if __name__ == "__main__":
#main()
generate_salt_pepper()