import argparse
import collections
import math
import os
import re
import resource
import shutil
import sys
import textwrap
import time
import numpy as np
# Read the system's page and THP geometry once at import time. PMD_SIZE comes
# from sysfs, so this requires a kernel with transparent hugepage support.
with open('/sys/kernel/mm/transparent_hugepage/hpage_pmd_size') as f:
    PAGE_SIZE = resource.getpagesize()
    PAGE_SHIFT = int(math.log2(PAGE_SIZE))
    PMD_SIZE = int(f.read())
    PMD_ORDER = int(math.log2(PMD_SIZE / PAGE_SIZE))
def align_forward(v, a):
    # Round v up to the next multiple of a. a must be a power of 2.
    mask = a - 1
    return (v + mask) & ~mask
def align_offset(v, a):
    # Offset of v within its enclosing a-sized block. a must be a power of 2.
    return v & (a - 1)
def kbnr(kb):
    # Convert KB to number of pages.
    # (Restored: the return statement had been fused into the trailing
    # comment, leaving the function with no body.)
    return (kb << 10) >> PAGE_SHIFT
def nrkb(nr):
    # Convert number of pages to KB.
    # (Restored: the return statement had been fused into the trailing
    # comment, leaving the function with no body.)
    return (nr << PAGE_SHIFT) >> 10
def odkb(order):
    # Convert page order to KB.
    # (Restored: the return statement had been fused into the trailing
    # comment, leaving the function with no body.)
    return (PAGE_SIZE << order) >> 10
def cont_ranges_all(search, index):
    # Given a list of arrays, find the ranges for which values are
    # monotonically incrementing in all arrays. All arrays in search and
    # index must be the same size.
    size = len(search[0])
    # Each element starts as both a potential range start and end (count 2);
    # interior points of a contiguous run are decremented to 0.
    repeats = np.full(size, 2)
    contig = np.diff(search[0]) == 1
    for other in search[1:]:
        contig &= np.diff(other) == 1
    repeats[1:] -= contig
    repeats[:-1] -= contig
    # Repeating by the counts leaves exactly the [start, end] pairs.
    return [np.repeat(arr, repeats).reshape(-1, 2) for arr in index]
class ArgException(Exception):
    """Raised when a command-line argument value fails validation."""
class FileIOException(Exception):
    """Raised to signal a file I/O failure while reading kernel interfaces."""
class BinArrayFile:
    # Base class used to read /proc/<pid>/pagemap and /proc/kpageflags into a
    # numpy array. Use inherited class in a with clause to ensure file is
    # closed when it goes out of scope.
    def __init__(self, filename, element_size):
        # element_size: bytes per record in the binary file (8 for pagemap).
        self.element_size = element_size
        self.filename = filename
        # Raw fd rather than a file object; reads are done with os.pread-style
        # access by the subclasses.
        self.fd = os.open(self.filename, os.O_RDONLY)
class PageMap(BinArrayFile):
    # Read ranges of a given pid's pagemap into a numpy array.
    def __init__(self, pid='self'):
        # pagemap records are 8 bytes per virtual page.
        super().__init__(f'/proc/{pid}/pagemap', 8)
# NOTE(review): fragment — the enclosing function (apparently a thp_parse-style
# accounting routine) and the code that sets up stats, folios, vfns, anons,
# index_next, index_end, pfn_end and KPF_COMPOUND_TAIL is not in this chunk.
# Whatever stripped the formatting also swallowed several `if ...:`/`for ...:`
# headers into the trailing comments below, so this text is not runnable as-is;
# restore the original layout before editing any logic here.
# Account pages for any partially mapped THP at the front. In that case, # the first page of the range is a tail.
nr = (int(folios[0]) if len(folios) else index_end) - index_next
stats['anon'if anons[index_next] else'file']['partial'] += nr
# NOTE(review): the comment line below ends with "if len(folios):" — an
# if-header fused into the comment text; the following lines were its body.
# Account pages for any partially mapped THP at the back. In that case, # the next page after the range is a tail. if len(folios):
# NOTE(review): "if flags & KPF_COMPOUND_TAIL:" is fused onto the end of the
# next line; it guarded the three statements after it.
flags = int(kpageflags.get(pfn_end)[0]) if flags & KPF_COMPOUND_TAIL:
nr = index_end - int(folios[-1])
folios = folios[:-1]
index_end -= nr
stats['anon'if anons[index_end - 1] else'file']['partial'] += nr
# NOTE(review): another fused "if len(folios):" at the end of the comment below.
# Account fully mapped THPs in the middle of the range. if len(folios):
folio_nrs = np.append(np.diff(folios), np.uint64(index_end - folios[-1]))
# NOTE(review): "for index, order in zip(folios, folio_orders):" is fused onto
# the end of the next line; the per-folio accounting below was its body.
folio_orders = np.log2(folio_nrs).astype(np.uint64) for index, order in zip(folios, folio_orders):
index = int(index)
order = int(order)
nr = 1 << order
vfn = int(vfns[index])
align = 'aligned'if align_forward(vfn, nr) == vfn else'unaligned'
anon = 'anon'if anons[index] else'file'
stats[anon][align][order] += nr
# Account PMD-mapped THPs spearately, so filter out of the stats. There is a # race between acquiring the smaps stats and reading pagemap, where memory # could be deallocated. So clamp to zero incase it would have gone negative.
anon_pmd_mapped = vma.stats['AnonHugePages']['value']
file_pmd_mapped = vma.stats['ShmemPmdMapped']['value'] + \
vma.stats['FilePmdMapped']['value']
stats['anon']['aligned'][PMD_ORDER] = max(0, stats['anon']['aligned'][PMD_ORDER] - kbnr(anon_pmd_mapped))
stats['file']['aligned'][PMD_ORDER] = max(0, stats['file']['aligned'][PMD_ORDER] - kbnr(file_pmd_mapped))
# NOTE(review): truncated — the scanning loop that walks index_next through
# [index_next, index_end) accumulating nr_anon/nr_file for each aligned
# contiguous block, and the final construction/return of the stats dictionary,
# are missing from this chunk. As shown, nothing ever increments nr_anon or
# nr_file before the clamping at the bottom, and the function returns None.
def cont_parse(vma, order, ranges, anons, heads): # Given 4 same-sized arrays representing a range within a page table backed # by THPs (vfns: virtual frame numbers, pfns: physical frame numbers, anons: # Trueif page is anonymous, heads: Trueif page is head of a THP), return a # dictionary of statistics describing the contiguous blocks.
nr_cont = 1 << order
nr_anon = 0
nr_file = 0
for rindex, rvfn, rpfn in zip(*ranges):
index_next = int(rindex[0])
index_end = int(rindex[1]) + 1
vfn_start = int(rvfn[0])
pfn_start = int(rpfn[0])
# A block only counts if its physical and virtual alignment agree; otherwise
# skip this range entirely.
if align_offset(pfn_start, nr_cont) != align_offset(vfn_start, nr_cont): continue
# Advance to the first nr_cont-aligned virtual frame in the range.
off = align_forward(vfn_start, nr_cont) - vfn_start
index_next += off
# Account blocks that are PMD-mapped spearately, so filter out of the stats. # There is a race between acquiring the smaps stats and reading pagemap, # where memory could be deallocated. So clamp to zero incase it would have # gone negative.
anon_pmd_mapped = vma.stats['AnonHugePages']['value']
file_pmd_mapped = vma.stats['ShmemPmdMapped']['value'] + \
vma.stats['FilePmdMapped']['value']
nr_anon = max(0, nr_anon - kbnr(anon_pmd_mapped))
nr_file = max(0, nr_file - kbnr(file_pmd_mapped))
def vma_print(vma, pid):
    # Prints a VMA instance in a format similar to smaps. The main difference
    # is that the pid is included as the first value.
    prot = ''.join([
        'r' if vma.read else '-',
        'w' if vma.write else '-',
        'x' if vma.execute else '-',
        'p' if vma.private else 's',
    ])
    print("{:010d}: {:016x}-{:016x} {} {:08x} {:02x}:{:02x} {:08x} {}"
          .format(pid, vma.start, vma.end, prot, vma.pgoff, vma.major,
                  vma.minor, vma.inode, vma.name))
def stats_print(stats, tot_anon, tot_file, inc_empty):
    # Print a statistics dictionary.
    label_field = 32
    for label, stat in stats.items():
        stat_type = stat['type']
        value = stat['value']
        # Zero-valued stats are hidden unless the caller asked for them.
        if not value and not inc_empty:
            continue
        # Pad so values line up in a 32-character label column.
        pad = max(0, label_field - len(label) - 1)
        if stat_type == 'anon' and tot_anon > 0:
            percent = f' ({value / tot_anon:3.0%})'
        elif stat_type == 'file' and tot_file > 0:
            percent = f' ({value / tot_file:3.0%})'
        else:
            percent = ''
        print(f"{label}:{' ' * pad}{value:8} kB{percent}")
# NOTE(review): truncated — only the first two statements of the body are in
# this chunk. The call site later in the file unpacks the result as
# (stats, vma_anon, vma_file), so the missing remainder must build and return
# that triple; restore it from the original layout before editing.
def vma_parse(vma, pagemap, kpageflags, contorders): # Generate thp and cont statistics for a single VMA.
start = vma.start >> PAGE_SHIFT
end = vma.end >> PAGE_SHIFT
# NOTE(review): fragment — this looks like the body of the main driver
# (do_main(args)): its `def` line and the initialisation of pids, rollup,
# rollup_anon and rollup_file are missing from this chunk, and the names
# KPageFlags, VMAList, vma_all_stats and vma_min_stats are defined elsewhere
# in the original file. Many `for`/`if`/`try`/`else` headers below have been
# fused onto the ends of preceding lines (e.g. "strict = False for ...",
# "ifnot args.rollup") by whatever stripped the formatting; the statements
# themselves look intact. Not runnable as-is.
if args.cgroup:
strict = False for walk_info in os.walk(args.cgroup):
cgroup = walk_info[0]
with open(f'{cgroup}/cgroup.procs') as pidfile: for line in pidfile.readlines():
pids.add(int(line.strip()))
elif args.pid:
strict = True
pids = pids.union(args.pid) else:
strict = False for pid in os.listdir('/proc'): if pid.isdigit():
pids.add(int(pid))
ifnot args.rollup:
print(" PID START END PROT OFFSET DEV INODE OBJECT")
for pid in pids: try:
with PageMap(pid) as pagemap:
with KPageFlags() as kpageflags: for vma in VMAList(pid, vma_all_stats if args.inc_smaps else vma_min_stats): if (vma.read or vma.write or vma.execute) and vma.stats['Rss']['value'] > 0:
stats, vma_anon, vma_file = vma_parse(vma, pagemap, kpageflags, args.cont) else:
stats = {}
vma_anon = 0
vma_file = 0 if args.inc_smaps:
stats = {**vma.stats, **stats} if args.rollup: for k, v in stats.items(): if k in rollup:
assert(rollup[k]['type'] == v['type'])
rollup[k]['value'] += v['value'] else:
rollup[k] = v
rollup_anon += vma_anon
rollup_file += vma_file else:
vma_print(vma, pid)
stats_print(stats, vma_anon, vma_file, args.inc_empty)
# NOTE(review): a pid can exit between listing and scanning; in non-strict
# (system-wide) mode those races are ignored rather than fatal.
except (FileNotFoundError, ProcessLookupError, FileIOException): if strict:
raise
if args.rollup:
stats_print(rollup, rollup_anon, rollup_file, args.inc_empty)
def format(string):
    # Reflow a documentation string: collapse every whitespace run to a single
    # space, treat literal '\n' markers in the text as paragraph breaks, and
    # wrap each paragraph to docs_width columns (module-level setting).
    text = re.sub(r'\s+', ' ', string)
    text = re.sub(r'\s*\\n\s*', '\n', text)
    paragraphs = [textwrap.fill(p, width=docs_width) for p in text.split('\n')]
    return '\n'.join(paragraphs)
def size2order(human):
    # Convert a human-readable size string (e.g. "64K", "2M") into a page
    # order. Raises ArgException unless the size is a power-of-2 multiple of
    # the page size, at least 2 pages, and no bigger than the PMD size.
    units = {
        "K": 2**10, "M": 2**20, "G": 2**30,
        "k": 2**10, "m": 2**20, "g": 2**30,
    }
    unit = 1
    if human[-1] in units:
        unit = units[human[-1]]
        human = human[:-1]
    try:
        size = int(human)
    except ValueError:
        raise ArgException('error: --cont value must be integer size with optional KMG unit')
    size *= unit
    order = int(math.log2(size / PAGE_SIZE))
    if order < 1:
        raise ArgException('error: --cont value must be size of at least 2 pages')
    if (1 << order) * PAGE_SIZE != size:
        raise ArgException('error: --cont value must be size of power-of-2 pages')
    if order > PMD_ORDER:
        raise ArgException('error: --cont value must be less than or equal to PMD order')
    return order
# Build the CLI. NOTE(review): `formatter` is defined in a portion of the
# original file not present in this chunk; `format` is the reflow helper above.
# Several words inside these help strings had been fused together ("orfor",
# "forthiscase", ...) by the formatting mangling; restored here.
parser = argparse.ArgumentParser(formatter_class=formatter,
    description=format("""Prints information about how transparent huge
                pages are mapped, either system-wide, or for a specified
                process or cgroup.\\n
                \\n
                When run with --pid, the user explicitly specifies the set
                of pids to scan. e.g. "--pid 10 [--pid 134 ...]". When run
                with --cgroup, the user passes either a v1 or v2 cgroup and
                all pids that belong to the cgroup subtree are scanned. When
                run with neither --pid nor --cgroup, the full set of pids on
                the system is gathered from /proc and scanned as if the user
                had provided "--pid 1 --pid 2 ...".\\n
                \\n
                A default set of statistics is always generated for THP
                mappings. However, it is also possible to generate
                additional statistics for "contiguous block mappings" where
                the block size is user-defined.\\n
                \\n
                Statistics are maintained independently for anonymous and
                file-backed (pagecache) memory and are shown both in kB and
                as a percentage of either total anonymous or total
                file-backed memory as appropriate.\\n
                \\n
                THP Statistics\\n
                --------------\\n
                \\n
                Statistics are always generated for fully- and
                contiguously-mapped THPs whose mapping address is aligned to
                their size, for each <size> supported by the system.
                Separate counters describe THPs mapped by PTE vs those
                mapped by PMD. (Although note a THP can only be mapped by
                PMD if it is PMD-sized):\\n
                \\n
                - anon-thp-pte-aligned-<size>kB\\n
                - file-thp-pte-aligned-<size>kB\\n
                - anon-thp-pmd-aligned-<size>kB\\n
                - file-thp-pmd-aligned-<size>kB\\n
                \\n
                Similarly, statistics are always generated for fully- and
                contiguously-mapped THPs whose mapping address is *not*
                aligned to their size, for each <size> supported by the
                system. Due to the unaligned mapping, it is impossible to
                map by PMD, so there are only PTE counters for this case:\\n
                \\n
                - anon-thp-pte-unaligned-<size>kB\\n
                - file-thp-pte-unaligned-<size>kB\\n
                \\n
                Statistics are also always generated for mapped pages that
                belong to a THP but where the THP is *not* fully- and
                contiguously- mapped. These "partial" mappings are all
                counted in the same counter regardless of the size of the
                THP that is partially mapped:\\n
                \\n
                - anon-thp-pte-partial\\n
                - file-thp-pte-partial\\n
                \\n
                Contiguous Block Statistics\\n
                ---------------------------\\n
                \\n
                An optional, additional set of statistics is generated for
                every contiguous block size specified with `--cont <size>`.
                These statistics show how much memory is mapped in
                contiguous blocks of <size> and also aligned to <size>. A
                given contiguous block must all belong to the same THP, but
                there is no requirement for it to be the *whole* THP.
                Separate counters describe contiguous blocks mapped by PTE
                vs those mapped by PMD:\\n
                \\n
                - anon-cont-pte-aligned-<size>kB\\n
                - file-cont-pte-aligned-<size>kB\\n
                - anon-cont-pmd-aligned-<size>kB\\n
                - file-cont-pmd-aligned-<size>kB\\n
                \\n
                As an example, if monitoring 64K contiguous blocks (--cont
                64K), there are a number of sources that could provide such
                blocks: a fully- and contiguously-mapped 64K THP that is
                aligned to a 64K boundary would provide 1 block. A fully-
                and contiguously-mapped 128K THP that is aligned to at least
                a 64K boundary would provide 2 blocks. Or a 128K THP that
                maps its first 100K, but contiguously and starting at a 64K
                boundary would provide 1 block. A fully- and
                contiguously-mapped 2M THP would provide 32 blocks. There
                are many other possible permutations.\\n"""),
    epilog=format("""Requires root privilege to access pagemap and
                kpageflags."""))
# --pid and --cgroup select the scan scope and are mutually exclusive.
# Fixed "Maybe issued" -> "May be issued" in the help text (matches the
# phrasing used by --cont).
group = parser.add_mutually_exclusive_group(required=False)

group.add_argument('--pid',
    metavar='pid', required=False, type=int, default=[], action='append',
    help="""Process id of the target process. May be issued multiple times to
        scan multiple processes. --pid and --cgroup are mutually exclusive.
        If neither are provided, all processes are scanned to provide
        system-wide information.""")

group.add_argument('--cgroup',
    metavar='path', required=False,
    help="""Path to the target cgroup in sysfs. Iterates over every pid in
        the cgroup and its children. --pid and --cgroup are mutually
        exclusive. If neither are provided, all processes are scanned to
        provide system-wide information.""")
# Remaining options. Fixed the --inc-smaps help text: "/proc//smaps" had lost
# its "<pid>" component (angle-bracketed text eaten by the same mangling that
# hit the rest of the file).
parser.add_argument('--rollup',
    required=False, default=False, action='store_true',
    help="""Sum the per-vma statistics to provide a summary over the whole
        system, process or cgroup.""")

parser.add_argument('--cont',
    metavar='size[KMG]', required=False, default=[], action='append',
    help="""Adds stats for memory that is mapped in contiguous blocks of
        <size> and also aligned to <size>. May be issued multiple times to
        track multiple sized blocks. Useful to infer e.g. arm64 contpte and
        hpa mappings. Size must be a power-of-2 number of pages.""")

parser.add_argument('--inc-smaps',
    required=False, default=False, action='store_true',
    help="""Include all numerical, additive /proc/<pid>/smaps stats in the
        output.""")

parser.add_argument('--inc-empty',
    required=False, default=False, action='store_true',
    help="""Show all statistics including those whose value is 0.""")

parser.add_argument('--periodic',
    metavar='sleep_ms', required=False, type=int,
    help="""Run in a loop, polling every sleep_ms milliseconds.""")
args = parser.parse_args()

# Convert each --cont size string to a page order up front so that bad values
# are reported as a usage error before any scanning starts.
try:
    args.cont = [size2order(cont) for cont in args.cont]
except ArgException:
    parser.print_usage()
    raise
# Run once, or poll forever at the requested interval when --periodic is set.
if args.periodic:
    while True:
        do_main(args)
        print()
        time.sleep(args.periodic / 1000)
else:
    do_main(args)
if __name__ == "__main__": try:
main()
except Exception as e:
prog = os.path.basename(sys.argv[0])
print(f'{prog}: {e}') exit(1)
# NOTE(review): the following German website-disclaimer text is extraction
# residue, not part of the program; commented out so it cannot be mistaken for
# code. (Translation: "The information on this website was compiled carefully
# to the best of our knowledge. However, neither completeness, nor
# correctness, nor quality of the provided information is guaranteed.
# Note: the coloured syntax display and the measurement are still
# experimental.")
# Die Informationen auf dieser Webseite wurden nach bestem Wissen sorgfältig
# zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
# noch Qualität der bereitgestellten Informationen zugesichert.
# Bemerkung: Die farbliche Syntaxdarstellung und die Messung sind noch
# experimentell.