Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
mxtools/forensics/mxvmem
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
executable file
534 lines (434 sloc)
14.7 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /usr/bin/env python3
'''
show memory consumption per user

The script analyses actual or recorded memory usage on the
host where it is run.

The default is to read the information live from the /proc
directory. When called with the '-c' option, /var/log is
searched for the most recent forensics dump.

Setting the FORENSICS_STORE environment variable overrides the
default directory where the forensics logs are searched.

What to do if too much memory is burned on a given host?
    #> ps -u $USER -o pid,state,lstart,vsz,cmd
This helps finding old processes, or one may just end _all_
own sessions on this host:
    #> killall -u $USER

History:
    13.10.2022, created, Kreitler
    08.05.2024, default to reading the state 'live'
'''
import os | |
import re | |
import pwd | |
import sys | |
import time | |
import argparse | |
# # Debug helper | |
# try: | |
# from dumper import dump | |
# except ModuleNotFoundError: | |
# from pprint import pprint as dump | |
# ------------------------------------------ group/summarize data in a tree | |
class Classifier():
    ''' Use to group/summarize values into a tree structure.

    Each node keeps a visit count (cnt), an accumulated value (val)
    and a dict of child nodes (fld) keyed by the classification field.
    '''
    def __init__(self):
        self.cnt = 0        # number of cfy() calls that touched this node
        self.fld = dict()   # child nodes, keyed by field value
        self.val = 0        # value accumulated at this node
        self._ik = None     # iterator state for the __iter__/__next__ protocol

    def __iter__(self):  # this is either expensive, stupid, or already present elsewhere in python
        # iterate direct children as (key, Classifier) pairs, sorted by key
        self._ik = iter(sorted(self.fld.keys()))
        return self

    def __next__(self):
        n = next(self._ik)
        return (n, self.fld[n])

    def cfy(self, fields, val=0):
        ''' Classify *val* along the path given by the *fields* list.

        Adds val (and one count) to this node and, recursively, to the
        child node for each field.  The caller's list is left untouched
        (the old implementation consumed it via pop(0)).  Returns val.
        '''
        self.cnt += 1
        self.val += val
        if fields:
            f = fields[0]
            if f not in self.fld:
                self.fld[f] = Classifier()
            # recurse on a slice instead of pop(0): no caller-visible mutation
            self.fld[f].cfy(fields[1:], val)
        return val

    def get(self, key):
        ''' Return the child node for *key* (KeyError if absent). '''
        return self.fld[key]

    def getval(self):
        ''' Return the value accumulated at this node. '''
        return self.val

    def dump_simple(self, lbl='.', ind=0):
        ''' Debug helper: print the whole tree, indented per level. '''
        print('# %s %-8s %6d ,' % (' ' * ind, lbl + ':', self.cnt), self.val)
        for k in sorted(self.fld.keys()):
            self.fld[k].dump_simple(k, ind + 2)
# ------------------------------------ access to proc data (stored or live) | |
class ProcBuffer():
    '''
    Reads from /proc, provides an iterator. Keeps ProcStream simple,
    allows handling /proc the same way as a 'forensics' log.
    Example usage:
        pb = ProcBuffer()
        print(pb.read())
        for line in pb:
            print(line)
    '''
    def __init__(self, pdir='/proc'):
        self.pdir = pdir
        self.buf = []                 # "<filename> <stripped line>" records
        self.bufit = iter(self.buf)

    def __iter__(self):
        return self

    def __next__(self):
        return next(self.bufit)

    def readfile(self, fn):
        ''' Append all lines of *fn*, prefixed with the file name, to the
        buffer.  Returns the number of lines read; 0 if the file is
        unreadable (processes may vanish between scandir and open). '''
        lines = 0
        try:
            # use a context manager so the handle is closed on any path
            with open(fn, errors='backslashreplace') as f:
                for line in f:
                    self.buf.append(fn + ' ' + line.strip())
                    lines += 1
        except (FileNotFoundError, PermissionError):  # do nothing
            return 0
        return lines

    def read(self):
        ''' Get machine info and per-PID data, return number of items. '''
        # get machine info
        for pf in ('uptime', 'meminfo'):
            self.readfile(os.path.join(self.pdir, pf))
        # go for the PIDs
        for pd in os.scandir(self.pdir):
            if pd.name.isnumeric() and pd.is_dir():  # isdigit(), isdecimal() -- which one ?
                for pf in ('stat', 'status'):
                    self.readfile(os.path.join(self.pdir, pd.name, pf))
        return len(self.buf)

    def get_iter(self):
        ''' (Re)arm and return the buffer iterator. '''
        self.bufit = iter(self.buf)
        return self.bufit
class ProcInfoBase(): | |
def __iter__(self): | |
return self | |
# today nobody closes files, but I like to ... | |
def close(self): | |
pass | |
class ProcInfo(ProcInfoBase):
    ''' Live data source: read from /proc via a freshly filled ProcBuffer. '''
    def __init__(self, pdir='/proc'):
        buffer = ProcBuffer(pdir)
        buffer.read()
        self.pbi = buffer.get_iter()
        self.source = pdir

    def __next__(self):
        return next(self.pbi)
class ProcInfoSaved(ProcInfoBase):  # aka forensics
    ''' Data source reading a previously conserved forensics log file. '''
    def __init__(self, logfile):
        self.file = open(logfile, errors='backslashreplace')
        self.source = logfile

    def __next__(self):
        line = self.file.readline()
        if line:
            return line.strip()
        # end of file: release the handle before signalling exhaustion
        self.close()
        raise StopIteration

    def close(self):
        ''' Close the log file; safe to call more than once. '''
        if self.file:
            self.file.close()
            self.file = None
# ------------------------------------------------------- collect proc data | |
class ProcStreamParser():
    '''
    Parses proc-style lines and feeds the extracted values into the
    given handler (see ProcFsHandler).

    state: R is running, S is sleeping,
        D is sleeping in an uninterruptible wait,
        Z is zombie, T is traced or stopped,
        I is idle, and not in the docs ?
    '''
    def __init__(self, procfshandler):
        self.pfh = procfshandler
        self.pidrex = re.compile(r'/proc/(\d+)/(\S+)\s+(.*)')  # :xxx: hardcoded '/proc'

    def parse(self, line):
        ''' Dispatch one "<path> <content>" record to the handler. '''
        m = self.pidrex.match(line)
        if not m:
            if line.startswith('/proc/uptime '):
                wall_clock = float(line.split()[1])
                self.pfh.set_uptime(wall_clock)
                self.pfh.report_append( '# uptime: %.2f s, %.2f d' % (wall_clock, wall_clock/3600.0/24.0) )
            elif line.startswith('/proc/meminfo MemTotal'):
                memtotal = int(line.split()[2])
                self.pfh.set_memtotal(memtotal)
                self.pfh.report_append( '# memtotal: %d kB, %.2f GB' % (memtotal, memtotal/1024/1024) )
        else:
            pid = int(m.group(1))
            pfile = m.group(2)
            entry = m.group(3)
            if pfile == 'stat':
                # the comm field '(...)' may contain spaces; split only after
                # the closing parenthesis so the field indices stay correct
                rest = entry.rsplit(')', 1)[-1].split()
                # rest[19] is 'starttime' (field 22 of proc(5) stat) in clock
                # ticks; assumes CLK_TCK == 100 -- TODO confirm via sysconf
                start_time = float(rest[19])/100.0  # seconds please ...
                self.pfh.set_info(pid, 'start_time', start_time)
            elif pfile == 'status':
                if entry.startswith('VmData:'):
                    val = int(entry.split()[1])
                    self.pfh.set_info(pid, 'vmdata', val)
                elif entry.startswith('State:'):  # redundant
                    val = entry.split()[1]
                    self.pfh.set_info(pid, 'state', val)
                elif entry.startswith('Uid:'):
                    val = int(entry.split()[1])
                    self.pfh.set_info(pid, 'uid', val)
# --------------------------------------------------------------- workhorse | |
class ProcFsHandler():
    ''' Collects per-PID records and classifies memory usage per user/state. '''
    def __init__(self, classifier=None, age_thresh=2):
        # no Classifier() instance as default argument: mutable defaults
        # are created once and shared across all instances
        self.Cfy = classifier if classifier is not None else Classifier()
        self.store = {}        # pid -> {'uid':, 'state':, 'vmdata':, 'start_time':}
        self.usermap = {}      # uid -> resolved user name (cache)
        self.report = []
        self.uptime = -1       # seconds, set from /proc/uptime
        self.memtotal = -1     # kB, set from /proc/meminfo
        self.age_threshold = age_thresh * 60*60*24  # days -> seconds
        self.supr_sys = False  # used to suppress system accounts

    def set_uptime(self, t):
        self.uptime = t

    def set_memtotal(self, m):
        self.memtotal = m

    def suppress_system_acc(self):
        self.supr_sys = True

    def report_append(self, s):
        self.report.append(s)

    # fill hash
    def set_info(self, pid, key, val):
        if pid not in self.store:
            self.store[pid] = {}
        self.store[pid][key] = val

    def analyze(self):
        ''' Classify the collected records as [user, state] -> vmdata sum. '''
        for p in self.store:
            info = self.store[p]
            # a process may vanish between reading its stat and status
            # files; skip incomplete records instead of raising KeyError
            if 'uid' not in info or 'state' not in info or 'start_time' not in info:
                continue
            # --------- resolve_user_names
            uid = info['uid']
            if self.supr_sys and (uid < 100 or uid >= 65533):
                continue
            if uid not in self.usermap:
                try:
                    name = pwd.getpwuid(uid).pw_name
                except KeyError:
                    name = str(uid)  # unknown account: report the raw uid
                self.usermap[uid] = name
            name = self.usermap[uid]
            # ------------------- classify
            state = info['state']
            if state in 'DZT':
                state = 'DZT'    # fold 'problem' states into one bucket
            if 'vmdata' not in info:
                continue         # nothing to sum up
            vmdata = int(info['vmdata'])
            if (self.uptime - info['start_time']) < self.age_threshold:
                # 'young' ones get a lower case letter
                state = state.lower()
            self.Cfy.cfy([name, state], vmdata)
# -------------------------- memory classifier that 'reports' formated bars | |
class ProcMemClassifier(Classifier):
    ''' Classifier specialization that formats per-user memory for display.

    Expects the tree layout built by ProcFsHandler.analyze():
    level 1 = user name, level 2 = state bucket.
    '''
    # display order of the state buckets; upper case letters come from
    # 'old' processes, lower case from 'young' ones (see ProcFsHandler)
    keys = ('DZT','S','R','dzt','s','r')
    def maxMem(self):
        ''' Return the largest per-user value (kB) among the children. '''
        m = max(k[1].val for k in self)
        return m
    def barStrings(self, scmax = 100, width = 80):
        '''
        returns a string array for display:
        string has 3 components: 'user |X-----| memory' -- name bar amount

        scmax: the value that maps to a full-width bar (usually maxMem())
        width: total line width; each element is (user, total_kB, line)
        '''
        # 10 for the user, 12 is default width for the number,
        # 2 for fillers and such, 2 for the border
        bar_width = width - 10 - 12 - 2 - 2
        ret = list()
        for k in self:  # k is a (user, sub-classifier) pair
            user = k[0]
            mtot = k[1].val
            bar = ' %-10s' % str(user)[:9] + '|'
            nused = 0  # bar columns consumed so far
            for key in self.keys:
                if key in k[1].fld:
                    ks = k[1].fld[key]
                    n = int(ks.val/scmax * bar_width)
                    if n > 0:
                        # todo: mind max clipping!
                        # first char of the segment marks the state bucket
                        bar += key[0] + '-' * (n-1)
                        nused += n
            remain = bar_width - nused
            if remain <= 0:
                bar += '|'
            else:
                # pad to full width; '.' marks the end of the scale
                bar += '|' + ' ' * (remain-1) + '.'
            bar += sep_fmt12(k[1].val)
            ret.append((user, mtot, bar))
        return ret
    def summaryStrings(self):
        '''
        returns a string array for condensed display:
        string has 3 components: 'user memory states'
        Each element is a (user, total_kB, formatted_line) tuple.
        '''
        ret = list()
        for k in self:
            states = ''  # collects one letter per occupied bucket
            user = k[0]
            mtot = k[1].val
            line = ' %-10s' % str(user)[:9] + ':'
            line += sep_fmt12(k[1].val)
            for key in self.keys:
                if key in k[1].fld:
                    states += key[0]
            line += ' : [' + states + ']'
            ret.append((user, mtot, line))
        return ret
    def collectusage(self, keys= '', memlimit=0):
        ''' Sum per-user memory for the requested state categories.

        keys: state letters; 'D'/'d' expand to the 'DZT'/'dzt' buckets
        memlimit: only per-bucket sums above this threshold (kB) count
        Returns a list of (user, kB) tuples for users with any usage.
        '''
        ret = list()
        for k in self:
            memused = 0
            for key in keys:
                if key == 'D': key = 'DZT' # ugly
                if key == 'd': key = 'dzt'
                if key in k[1].fld:
                    if k[1].fld[key].val > memlimit:
                        memused += k[1].fld[key].val
            if memused:
                ret.append((k[0], memused))
        return ret
# ------------------------------------------------------------------- tools | |
def register_logs(logdir):
    ''' Returns forensics-logs ordered by age, newest first. '''
    found = []
    for minute in ('00', '10', '20', '30', '40', '50'):
        candidate = os.path.join(logdir, 'forensics-%sth_min.log' % minute)
        if os.access(candidate, os.R_OK):
            found.append((candidate, os.stat(candidate).st_mtime))
    # newest first == largest mtime first
    return sorted(found, key=lambda rec: rec[1], reverse=True)
def sep_fmt12(n = 0.0):
    ''' Gives readable positive numbers of width 12. '''
    # thousands separators, but European style: dots instead of commas
    grouped = format(n, ',').replace(',', '.')
    padded = '%12s' % grouped
    if len(padded) > 12:
        # too wide even unpadded: fall back to a compact general format
        padded = '%12.4g' % n
    return padded
def get_term_size():
    ''' Say 'no' to shutil. '''
    # fall back to the classic 80x24 when no terminal is attached
    try:
        return os.get_terminal_size()
    except (ValueError, OSError):
        return (80, 24)
def chk_term_size_usable():
    ''' Make sure terminal area is large enough, tell user. '''
    min_w = 40
    min_h = 12
    cols, lines = get_term_size()
    too_small = False
    if cols < min_w:
        print('# Fatal: Terminal window is not wide enough. Columns needed:', min_w,
              '( got', cols, ')', file=sys.stderr )
        too_small = True
    if lines < min_h:
        print('# Fatal: Terminal window is not high enough. Lines needed:', min_h,
              '( got', lines, ')', file=sys.stderr )
        too_small = True
    if not too_small:
        return True
    print('# Will end now ...', file=sys.stderr)
    return False
# ---------------------------------------------------------- user interface | |
def handle_args():
    ''' Build the option parser and return the parsed command line. '''
    parser = argparse.ArgumentParser(
        allow_abbrev=True,
        description='Read conserved or live information from the /proc directory, '
                    'and shows memory consumption per user.')
    parser.add_argument('-H', dest='pydoc', action='store_true',
                        help="show documentation")
    parser.add_argument("-a", dest='allentries', action='store_true',
                        help='print all entries, makes you scroll, helps when piping')
    parser.add_argument('-c', dest='readconserved', action='store_true',
                        help='read data from a conserved forensics dump')
    parser.add_argument("-d", dest='logdir', metavar='dir', default=None,
                        help='location of forensics logs (/var/log)')
    parser.add_argument('-m', dest='memthresh', metavar='percent', type=float, default=10.0,
                        help='threshold for memory usage report (10%%)')
    parser.add_argument('-o', dest='no_sysacc', action='store_true', default=False,
                        help='omit system accounts from being reported')
    parser.add_argument('-q', dest='query', metavar='query', default='',
                        help='report memory usage for given categories (eg. \'SD\')')
    parser.add_argument('-s', dest='summary', action='store_true',
                        help='print short summary')
    parser.add_argument('-t', dest='durationthresh', metavar='days', default=2,
                        help='time in days when a job is considered as old')
    parser.add_argument('-v', dest='verbose', action='store_true',
                        help='be a bit more verbose')
    parser.add_argument('forensicsfile', metavar='file', nargs='?',
                        help='forensics file (defaults to most recent log found)')
    return parser.parse_args()
# ------------------------------------------------------------- ab die post | |
if __name__ == '__main__':
    args = handle_args()
    if args.pydoc:
        # '-H': show the module documentation via pydoc, then leave
        import subprocess
        subprocess.run(('pydoc3', sys.argv[0]))
        quit()
    if not chk_term_size_usable(): quit()
    # ---- pick the data source: explicit file > conserved log > live /proc
    proc = None
    if args.forensicsfile:
        if os.access(args.forensicsfile, os.R_OK):
            proc = ProcInfoSaved(args.forensicsfile)
        else:
            print('# Error: can not read', args.forensicsfile, file=sys.stderr)
            quit()
    elif args.readconserved:
        # log directory precedence: '-d' option, then FORENSICS_STORE
        # environment variable, then the default /var/log
        default_logdir = '/var/log'
        logdir = None
        if 'FORENSICS_STORE' in os.environ:
            logdir = os.environ['FORENSICS_STORE']
        if args.logdir:
            logdir = args.logdir
        if not logdir:
            logdir = default_logdir
        if not os.access(logdir, os.R_OK):
            print('# Error: can not access', logdir, file=sys.stderr)
            quit()
        logs = register_logs(logdir)
        if not len(logs):
            print('# Fatal: no logs found in', logdir, file=sys.stderr)
            quit()
        proc = ProcInfoSaved(logs[0][0])  # newest log first
    else:
        proc = ProcInfo()  # default since 05/2024: read live from /proc
    if args.verbose:
        print(' Reading:', "'%s'" % proc.source, '...')
    # ---- parse the stream, classify memory per user and state
    pmc = ProcMemClassifier()
    whs = ProcFsHandler(pmc, float(args.durationthresh))
    if args.no_sysacc: whs.suppress_system_acc()
    psp = ProcStreamParser(whs)
    for line in proc:
        psp.parse(line)
    proc.close()
    whs.analyze()
    if args.query:
        # '-q': terse per-user report (values scaled to GB), then exit
        memthresh = args.memthresh
        keys = args.query
        result = pmc.collectusage(keys, whs.memtotal*(memthresh/100))
        for r in result:
            print('%s %.1f' % (r[0], r[1]/1e6))
        quit()
    print(' Memory: %.1f Gb available, %.1f Gb in use (%.1f %%)\n' %
        ( whs.memtotal/1024**2,
          pmc.getval()/1024**2,
          100 * pmc.getval()/whs.memtotal )
        )
    if args.summary:
        # '-s': one condensed line per user, biggest consumers first
        for i in sorted(pmc.summaryStrings(), key=lambda L: L[1], reverse=True):
            print(i[2])
    else:
        # default: bar chart scaled to the biggest consumer, clipped to
        # the terminal height unless '-a' asked for all entries
        cols, lines = get_term_size()
        maxmem_used = pmc.maxMem()
        print(' USER ', '*** OLD/young processes ***'.center(cols-23) , 'Amount [kb]')
        mem_bars = pmc.barStrings(maxmem_used, cols)
        limit = lines-11 if not args.allentries else len(mem_bars)
        for i in sorted(mem_bars, key=lambda L: L[1], reverse=True)[:limit]:
            print(i[2])
        print('*** R = running, S = sleeping, D = deep sleep, zombie, or debugged ***'.center(cols))
        print()