root/tools/accounting/delaytop.c
// SPDX-License-Identifier: GPL-2.0
/*
 * delaytop.c - system-wide delay monitoring tool.
 *
 * This tool provides real-time monitoring and statistics of
 * system, container, and task-level delays, including CPU,
 * memory, IO, and IRQ. It supports both an interactive (top-like)
 * mode and one-shot output, and can report delay information for the
 * whole system, specific containers (cgroups), or individual tasks (PIDs).
 *
 * Key features:
 *      - Collects per-task delay accounting statistics via taskstats.
 *      - Collects system-wide PSI information.
 *      - Supports sorting, filtering.
 *      - Supports both interactive (screen refresh) and one-shot output.
 *
 * Copyright (C) Fan Yu, ZTE Corp. 2025
 * Copyright (C) Wang Yaxin, ZTE Corp. 2025
 *
 * Compile with
 *      gcc -I/usr/src/linux/include delaytop.c -o delaytop
 */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>
#include <unistd.h>
#include <fcntl.h>
#include <getopt.h>
#include <signal.h>
#include <time.h>
#include <dirent.h>
#include <ctype.h>
#include <stdbool.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/socket.h>
#include <sys/select.h>
#include <termios.h>
#include <limits.h>
#include <linux/genetlink.h>
#include <linux/taskstats.h>
#include <linux/cgroupstats.h>
#include <stddef.h>

/* procfs entries exposing Pressure Stall Information (PSI) */
#define PSI_PATH        "/proc/pressure"
#define PSI_CPU_PATH    "/proc/pressure/cpu"
#define PSI_MEMORY_PATH "/proc/pressure/memory"
#define PSI_IO_PATH     "/proc/pressure/io"
#define PSI_IRQ_PATH    "/proc/pressure/irq"

/*
 * Netlink attribute helpers.
 * NLA_NEXT:    attribute that follows 'na' (advances by the aligned length)
 * NLA_DATA:    payload of attribute 'na' (skips the attribute header)
 * NLA_PAYLOAD: payload size for an attribute whose total length is 'len'
 *
 * Every macro argument is parenthesised so that expressions can be passed
 * safely.
 */
#define NLA_NEXT(na)                    ((struct nlattr *)((char *)(na) + NLA_ALIGN((na)->nla_len)))
#define NLA_DATA(na)                    ((void *)((char *)(na) + NLA_HDRLEN))
#define NLA_PAYLOAD(len)                ((len) - NLA_HDRLEN)

/*
 * Generic netlink helpers: data pointer and payload size located behind the
 * genetlink header. The pointer is advanced through a char * so the macro
 * does not rely on arithmetic on void *.
 */
#define GENLMSG_DATA(glh)               ((void *)((char *)NLMSG_DATA(glh) + GENL_HDRLEN))
#define GENLMSG_PAYLOAD(glh)    (NLMSG_PAYLOAD(glh, 0) - GENL_HDRLEN)

#define TASK_COMM_LEN   16
#define MAX_MSG_SIZE    1024
#define MAX_TASKS               1000
#define MAX_BUF_LEN             256
/* Copy stats.<field> into tasks[idx].<field>; expects a local named 'stats' */
#define SET_TASK_STAT(task_count, field) (tasks[(task_count)].field = stats.field)
/* fprintf() wrapper that evaluates to true when the write succeeded */
#define BOOL_FPRINT(stream, fmt, ...) \
({ \
        int ret = fprintf(stream, fmt, ##__VA_ARGS__); \
        ret >= 0; \
})
/* Average delay in ms of <field> for a task, via average_ms() */
#define TASK_AVG(task, field) average_ms((task).field##_delay_total, (task).field##_count)
#define PSI_LINE_FORMAT "%-12s %6.1f%%/%6.1f%%/%6.1f%%/%8llu(ms)\n"
#define DELAY_FMT_DEFAULT "%8.2f %8.2f %8.2f %8.2f\n"
#define DELAY_FMT_MEMVERBOSE "%8.2f %8.2f %8.2f %8.2f %8.2f %8.2f\n"
/*
 * Build one struct field_desc initializer: field name, interactive command
 * character, offsets of <name>_delay_total / <name>_count in struct
 * task_info, and the display modes supporting the field.
 */
#define SORT_FIELD(name, cmd, modes) \
        {#name, #cmd, \
        offsetof(struct task_info, name##_delay_total), \
        offsetof(struct task_info, name##_count), \
        modes}
/* Table terminator: recognised by its NULL name; all members spelled out */
#define END_FIELD {NULL, NULL, 0, 0, 0}

/* Display mode types */
#define MODE_TYPE_ALL   (0xFFFFFFFF)
#define MODE_DEFAULT    (1 << 0)
#define MODE_MEMVERBOSE (1 << 1)

/*
 * PSI statistics structure.
 *
 * One group of four values per parsed PSI line: the avg10/avg60/avg300
 * figures and the cumulative "total" counter. CPU, memory and IO carry
 * both a "some" and a "full" group; IRQ only has "full".
 */
struct psi_stats {
        /* CPU pressure, "some" line */
        double cpu_some_avg10;
        double cpu_some_avg60;
        double cpu_some_avg300;
        unsigned long long cpu_some_total;

        /* CPU pressure, "full" line */
        double cpu_full_avg10;
        double cpu_full_avg60;
        double cpu_full_avg300;
        unsigned long long cpu_full_total;

        /* Memory pressure, "some" line */
        double memory_some_avg10;
        double memory_some_avg60;
        double memory_some_avg300;
        unsigned long long memory_some_total;

        /* Memory pressure, "full" line */
        double memory_full_avg10;
        double memory_full_avg60;
        double memory_full_avg300;
        unsigned long long memory_full_total;

        /* IO pressure, "some" line */
        double io_some_avg10;
        double io_some_avg60;
        double io_some_avg300;
        unsigned long long io_some_total;

        /* IO pressure, "full" line */
        double io_full_avg10;
        double io_full_avg60;
        double io_full_avg300;
        unsigned long long io_full_total;

        /* IRQ pressure, "full" line */
        double irq_full_avg10;
        double irq_full_avg60;
        double irq_full_avg300;
        unsigned long long irq_full_total;
};

/*
 * Task delay information structure.
 *
 * Each delay source is stored as a <name>_count / <name>_delay_total pair,
 * in that order; the sort table addresses these members by offset. The
 * mem_* pair is the sum of the swapin, freepages, thrashing, compact and
 * wpcopy pairs.
 */
struct task_info {
        int pid, tgid;
        char command[TASK_COMM_LEN];
        unsigned long long cpu_count, cpu_delay_total;
        unsigned long long blkio_count, blkio_delay_total;
        unsigned long long swapin_count, swapin_delay_total;
        unsigned long long freepages_count, freepages_delay_total;
        unsigned long long thrashing_count, thrashing_delay_total;
        unsigned long long compact_count, compact_delay_total;
        unsigned long long wpcopy_count, wpcopy_delay_total;
        unsigned long long irq_count, irq_delay_total;
        unsigned long long mem_count, mem_delay_total;
};

/* Container statistics structure: per-state process counts of a cgroup */
struct container_stats {
        /* Number of sleeping processes */
        int nr_sleeping;
        /* Number of running processes */
        int nr_running;
        /* Number of stopped processes */
        int nr_stopped;
        /* Number of uninterruptible processes */
        int nr_uninterruptible;
        /* Number of processes in IO wait */
        int nr_io_wait;
};

/* Delay field structure: describes one sortable delay column */
struct field_desc {
        /* Field name for cmdline argument */
        const char *name;
        /* Interactive command */
        const char *cmd_char;
        /* Offset of total delay in task_info */
        unsigned long total_offset;
        /* Offset of count in task_info */
        unsigned long count_offset;
        /* Supported display modes */
        size_t supported_modes;
};

/* Program settings structure, populated from the command line */
struct config {
        /* Update interval in seconds */
        int delay;
        /* Number of iterations, 0 == infinite */
        int iterations;
        /* Maximum number of processes to show */
        int max_processes;
        /* Output once and exit */
        int output_one_time;
        /* Monitor specific PID */
        int monitor_pid;
        /* Path to container cgroup */
        char *container_path;
        /* Current sort field */
        const struct field_desc *sort_field;
        /* Current display mode */
        size_t display_mode;
};

/* Global variables */
/* Runtime settings; defaults and overrides are written by parse_args() */
static struct config cfg;
/* Most recent PSI snapshot; zeroed and refilled by read_psi_stats() */
static struct psi_stats psi;
/* Task table filled by fetch_and_fill_task_info(); task_count entries are valid */
static struct task_info tasks[MAX_TASKS];
/* Number of valid entries in tasks[]; reset to 0 by get_task_delays() */
static int task_count;
/* NOTE(review): presumably the main-loop run flag; its users are outside this view */
static int running = 1;
/* Per-state process counts of the monitored cgroup, set by get_container_stats() */
static struct container_stats container_stats;
/*
 * Sortable delay fields: SORT_FIELD(name, interactive key, supported modes).
 * Entry 0 (cpu) is the default sort field chosen by parse_args(); the table
 * is terminated by END_FIELD, whose name is NULL.
 */
static const struct field_desc sort_fields[] = {
        SORT_FIELD(cpu,         c,      MODE_DEFAULT),
        SORT_FIELD(blkio,       i,      MODE_DEFAULT),
        SORT_FIELD(irq,         q,      MODE_DEFAULT),
        SORT_FIELD(mem,         m,      MODE_DEFAULT | MODE_MEMVERBOSE),
        SORT_FIELD(swapin,      s,      MODE_MEMVERBOSE),
        SORT_FIELD(freepages,   r,      MODE_MEMVERBOSE),
        SORT_FIELD(thrashing,   t,      MODE_MEMVERBOSE),
        SORT_FIELD(compact,     p,      MODE_MEMVERBOSE),
        SORT_FIELD(wpcopy,      w,      MODE_MEMVERBOSE),
        END_FIELD
};
/* NOTE(review): not referenced in the visible part of the file; purpose to be confirmed */
static int sort_selected;

/* Netlink socket variables */
/* Netlink socket used for taskstats/cgroupstats requests; starts out as -1 */
static int nl_sd = -1;
/* Value passed as nlmsg_type for taskstats requests; assigned outside this view */
static int family_id;

/*
 * Terminal handling for interactive mode (q-to-quit).
 *
 * orig_termios holds the settings captured by enable_raw_mode() and
 * raw_mode_active records whether they are valid. The flag ensures that
 * disable_raw_mode() never writes settings that were not captured (e.g.
 * stdin is not a terminal, or raw mode was never enabled) and that a
 * repeated enable_raw_mode() cannot overwrite the saved settings with the
 * already-modified ones.
 */
static struct termios orig_termios;
static bool raw_mode_active;

/* Switch stdin to non-canonical, no-echo mode; a no-op if that is impossible */
static void enable_raw_mode(void)
{
        struct termios raw;

        if (raw_mode_active)
                return;

        /* Without a valid snapshot there is nothing safe to restore later */
        if (tcgetattr(STDIN_FILENO, &orig_termios) < 0)
                return;

        raw = orig_termios;
        raw.c_lflag &= ~(ICANON | ECHO);
        if (tcsetattr(STDIN_FILENO, TCSAFLUSH, &raw) == 0)
                raw_mode_active = true;
}

/* Restore the terminal settings saved by enable_raw_mode(), if any */
static void disable_raw_mode(void)
{
        if (!raw_mode_active)
                return;

        tcsetattr(STDIN_FILENO, TCSAFLUSH, &orig_termios);
        raw_mode_active = false;
}

/*
 * Look up a sort field by its interactive command character.
 * Returns the matching sort_fields[] entry, or NULL when no field uses 'ch'.
 */
static const struct field_desc *get_field_by_cmd_char(char ch)
{
        const struct field_desc *entry = sort_fields;

        /* The table ends at the entry whose name is NULL */
        while (entry->name) {
                if (*entry->cmd_char == ch)
                        return entry;
                entry++;
        }

        return NULL;
}

/*
 * Look up a sort field by its exact name.
 * Returns the matching sort_fields[] entry, or NULL when the name is unknown.
 */
static const struct field_desc *get_field_by_name(const char *name)
{
        size_t idx;

        /* Equal length plus equal prefix is the same as full string equality */
        for (idx = 0; sort_fields[idx].name; idx++) {
                if (strcmp(sort_fields[idx].name, name) == 0)
                        return &sort_fields[idx];
        }

        return NULL;
}

/* Return the name of a field descriptor, or "UNKNOWN" for a NULL descriptor */
static const char *get_name_by_field(const struct field_desc *field)
{
        if (!field)
                return "UNKNOWN";

        return field->name;
}

/*
 * Print the names of all sort fields supported by 'mode' to stderr,
 * separated by '|' (e.g. "cpu|blkio|irq|mem").
 *
 * The separator is only emitted between names, so the list no longer starts
 * with a stray '|'. The list is built with bounded snprintf() calls; if it
 * ever outgrows the buffer, the remaining names are dropped.
 */
static void display_available_fields(size_t mode)
{
        const struct field_desc *field;
        char buf[MAX_BUF_LEN];
        size_t used = 0;
        int n;

        buf[0] = '\0';

        for (field = sort_fields; field->name != NULL; field++) {
                if (!(field->supported_modes & mode))
                        continue;

                n = snprintf(buf + used, sizeof(buf) - used, "%s%s",
                             used ? "|" : "", field->name);
                /* Stop on error or truncation; buf stays NUL-terminated */
                if (n < 0 || (size_t)n >= sizeof(buf) - used)
                        break;
                used += (size_t)n;
        }

        fprintf(stderr, "Available fields: %s\n", buf);
}

/* Print the command line help to stdout and terminate with exit status 0 */
static void usage(void)
{
        static const char help_text[] =
                "Usage: delaytop [Options]\n"
                "Options:\n"
                "  -h, --help               Show this help message and exit\n"
                "  -d, --delay=SECONDS      Set refresh interval (default: 2 seconds, min: 1)\n"
                "  -n, --iterations=COUNT   Set number of updates (default: 0 = infinite)\n"
                "  -P, --processes=NUMBER   Set maximum number of processes to show (default: 20, max: 1000)\n"
                "  -o, --once               Display once and exit\n"
                "  -p, --pid=PID            Monitor only the specified PID\n"
                "  -C, --container=PATH     Monitor the container at specified cgroup path\n"
                "  -s, --sort=FIELD         Sort by delay field (default: cpu)\n"
                "  -M, --memverbose         Display memory detailed information\n";

        /* The text contains no conversion specifiers, so it is written verbatim */
        fputs(help_text, stdout);
        exit(0);
}

/*
 * Convert a numeric option argument to int.
 *
 * Unlike atoi(), this rejects empty input, trailing garbage ("5abc") and
 * values that do not fit in an int. On any error, or when the value is
 * below 'min', a message is printed to stderr and the program exits with
 * status 1.
 */
static int parse_int_option(const char *arg, const char *name, int min)
{
        char *end;
        long val;

        errno = 0;
        val = strtol(arg, &end, 10);
        if (end == arg || *end != '\0' || errno == ERANGE || val > INT_MAX) {
                fprintf(stderr, "Error: invalid %s value '%s'.\n", name, arg);
                exit(1);
        }
        if (val < min) {
                fprintf(stderr, "Error: %s must be >= %d.\n", name, min);
                exit(1);
        }

        return (int)val;
}

/*
 * Parse command line arguments and set configuration.
 *
 * Fills the global cfg with defaults and then applies the options in the
 * order given. Invalid input terminates the program with exit status 1;
 * -h prints the help text and exits with status 0.
 *
 * Note: -M switches to memverbose mode and also resets the sort field to
 * "mem", so a -s given before -M is overridden.
 */
static void parse_args(int argc, char **argv)
{
        int c;
        const struct field_desc *field;
        static const struct option long_options[] = {
                {"help", no_argument, 0, 'h'},
                {"delay", required_argument, 0, 'd'},
                {"iterations", required_argument, 0, 'n'},
                {"pid", required_argument, 0, 'p'},
                {"once", no_argument, 0, 'o'},
                {"processes", required_argument, 0, 'P'},
                {"sort", required_argument, 0, 's'},
                {"container", required_argument, 0, 'C'},
                {"memverbose", no_argument, 0, 'M'},
                {0, 0, 0, 0}
        };

        /* Set defaults */
        cfg.delay = 2;
        cfg.iterations = 0;
        cfg.max_processes = 20;
        cfg.sort_field = &sort_fields[0];       /* Default sorted by CPU delay */
        cfg.output_one_time = 0;
        cfg.monitor_pid = 0;    /* 0 means monitor all PIDs */
        cfg.container_path = NULL;
        cfg.display_mode = MODE_DEFAULT;

        while (1) {
                int option_index = 0;

                c = getopt_long(argc, argv, "hd:n:p:oP:C:s:M", long_options, &option_index);
                if (c == -1)
                        break;

                switch (c) {
                case 'h':
                        usage();
                        break;
                case 'd':
                        cfg.delay = parse_int_option(optarg, "delay", 1);
                        break;
                case 'n':
                        cfg.iterations = parse_int_option(optarg, "iterations", 0);
                        break;
                case 'p':
                        cfg.monitor_pid = parse_int_option(optarg, "pid", 1);
                        break;
                case 'o':
                        cfg.output_one_time = 1;
                        break;
                case 'P':
                        cfg.max_processes = parse_int_option(optarg, "processes", 1);
                        if (cfg.max_processes > MAX_TASKS) {
                                fprintf(stderr, "Warning: processes capped to %d.\n",
                                        MAX_TASKS);
                                cfg.max_processes = MAX_TASKS;
                        }
                        break;
                case 'C':
                        /* A repeated -C replaces the earlier path instead of leaking it */
                        free(cfg.container_path);
                        cfg.container_path = strdup(optarg);
                        if (!cfg.container_path) {
                                fprintf(stderr, "Error: out of memory\n");
                                exit(1);
                        }
                        break;
                case 's':
                        if (strlen(optarg) == 0) {
                                fprintf(stderr, "Error: empty sort field\n");
                                exit(1);
                        }

                        field = get_field_by_name(optarg);
                        /* Show available fields if invalid option provided */
                        if (!field) {
                                fprintf(stderr, "Error: invalid sort field '%s'\n", optarg);
                                display_available_fields(MODE_TYPE_ALL);
                                exit(1);
                        }

                        cfg.sort_field = field;
                        break;
                case 'M':
                        cfg.display_mode = MODE_MEMVERBOSE;
                        cfg.sort_field = get_field_by_name("mem");
                        break;
                default:
                        fprintf(stderr, "Try 'delaytop --help' for more information.\n");
                        exit(1);
                }
        }
}

/*
 * Set the overall memory delay total of a task to the sum of its swapin,
 * freepages, thrashing, compact and wpcopy delay totals.
 */
static void set_mem_delay_total(struct task_info *t)
{
        unsigned long long sum = t->swapin_delay_total;

        sum += t->freepages_delay_total;
        sum += t->thrashing_delay_total;
        sum += t->compact_delay_total;
        sum += t->wpcopy_delay_total;

        t->mem_delay_total = sum;
}

/*
 * Set the overall memory delay count of a task to the sum of its swapin,
 * freepages, thrashing, compact and wpcopy counts.
 */
static void set_mem_count(struct task_info *t)
{
        unsigned long long sum = t->swapin_count;

        sum += t->freepages_count;
        sum += t->thrashing_count;
        sum += t->compact_count;
        sum += t->wpcopy_count;

        t->mem_count = sum;
}

/*
 * Open a raw NETLINK_GENERIC socket and bind it to a zeroed AF_NETLINK
 * address. Returns the socket descriptor, or -1 on failure.
 */
static int create_nl_socket(void)
{
        struct sockaddr_nl addr;
        int sock = socket(AF_NETLINK, SOCK_RAW, NETLINK_GENERIC);

        if (sock < 0)
                return -1;

        memset(&addr, 0, sizeof(addr));
        addr.nl_family = AF_NETLINK;

        if (bind(sock, (struct sockaddr *) &addr, sizeof(addr)) >= 0)
                return sock;

        /* Binding failed: report it and do not leak the descriptor */
        fprintf(stderr, "Failed to bind socket when create nl_socket\n");
        close(sock);
        return -1;
}

/*
 * Send a generic netlink command carrying a single attribute.
 *
 * @sd:         netlink socket
 * @nlmsg_type: netlink message type (family id)
 * @nlmsg_pid:  sender port id placed in the netlink header
 * @genl_cmd:   generic netlink command
 * @nla_type:   type of the attribute
 * @nla_data:   attribute payload
 * @nla_len:    payload length in bytes
 *
 * Returns 0 once the whole message has been handed to sendto(), -1 on a
 * send error or when the payload does not fit into the message buffer.
 */
static int send_cmd(int sd, __u16 nlmsg_type, __u32 nlmsg_pid,
                         __u8 genl_cmd, __u16 nla_type,
                         void *nla_data, int nla_len)
{
        struct sockaddr_nl nladdr;
        struct nlattr *na;
        int r, buflen;
        char *buf;

        struct {
                struct nlmsghdr n;
                struct genlmsghdr g;
                char buf[MAX_MSG_SIZE];
        } msg;

        /* Refuse payloads that would overflow msg.buf in the memcpy() below */
        if (nla_len < 0 || nla_len > (int)sizeof(msg.buf) - NLA_HDRLEN)
                return -1;
        if (nla_len > 0 && !nla_data)
                return -1;

        /* Start from zero so no uninitialised stack bytes (reserved fields,
         * alignment padding) are sent to the kernel.
         */
        memset(&msg, 0, sizeof(msg));

        msg.n.nlmsg_len = NLMSG_LENGTH(GENL_HDRLEN);
        msg.n.nlmsg_type = nlmsg_type;
        msg.n.nlmsg_flags = NLM_F_REQUEST;
        msg.n.nlmsg_seq = 0;
        msg.n.nlmsg_pid = nlmsg_pid;
        msg.g.cmd = genl_cmd;
        msg.g.version = 0x1;
        na = (struct nlattr *) GENLMSG_DATA(&msg);
        na->nla_type = nla_type;
        na->nla_len = nla_len + NLA_HDRLEN;
        if (nla_len > 0)
                memcpy(NLA_DATA(na), nla_data, nla_len);
        msg.n.nlmsg_len += NLMSG_ALIGN(na->nla_len);

        buf = (char *) &msg;
        buflen = msg.n.nlmsg_len;
        memset(&nladdr, 0, sizeof(nladdr));
        nladdr.nl_family = AF_NETLINK;
        /* Keep sending until everything went out; retry only on EAGAIN */
        while ((r = sendto(sd, buf, buflen, 0, (struct sockaddr *) &nladdr,
                                        sizeof(nladdr))) < buflen) {
                if (r > 0) {
                        buf += r;
                        buflen -= r;
                } else if (errno != EAGAIN)
                        return -1;
        }
        return 0;
}

/*
 * Get family ID for taskstats via netlink.
 *
 * Sends CTRL_CMD_GETFAMILY for TASKSTATS_GENL_NAME on @sd and searches the
 * reply for the CTRL_ATTR_FAMILY_ID attribute.
 *
 * Returns the family id, or 0 when the request fails or the reply does not
 * contain a usable id.
 */
static int get_family_id(int sd)
{
        struct {
                struct nlmsghdr n;
                struct genlmsghdr g;
                char buf[256];
        } ans;

        int id = 0, rc;
        struct nlattr *na;
        int rep_len;
        int attr_len;
        __u16 family;
        char name[100];

        strncpy(name, TASKSTATS_GENL_NAME, sizeof(name) - 1);
        name[sizeof(name) - 1] = '\0';
        rc = send_cmd(sd, GENL_ID_CTRL, getpid(), CTRL_CMD_GETFAMILY,
                        CTRL_ATTR_FAMILY_NAME, (void *)name,
                        strlen(TASKSTATS_GENL_NAME)+1);
        if (rc < 0) {
                fprintf(stderr, "Failed to send cmd for family id\n");
                return 0;
        }

        memset(&ans, 0, sizeof(ans));
        rep_len = recv(sd, &ans, sizeof(ans), 0);
        /* Check the receive result before looking at the message contents */
        if ((rep_len < 0) || !NLMSG_OK((&ans.n), rep_len) ||
                ans.n.nlmsg_type == NLMSG_ERROR) {
                fprintf(stderr, "Failed to receive response for family id\n");
                return 0;
        }

        /* Walk the attributes, never stepping outside the received message */
        attr_len = GENLMSG_PAYLOAD(&ans.n);
        na = (struct nlattr *) GENLMSG_DATA(&ans);
        while (attr_len >= NLA_HDRLEN) {
                if (na->nla_len < NLA_HDRLEN || na->nla_len > attr_len)
                        break;  /* malformed attribute */

                if (na->nla_type == CTRL_ATTR_FAMILY_ID) {
                        if (na->nla_len - NLA_HDRLEN >= (int)sizeof(family)) {
                                memcpy(&family, NLA_DATA(na), sizeof(family));
                                id = family;
                        }
                        break;
                }

                attr_len -= NLA_ALIGN(na->nla_len);
                na = NLA_NEXT(na);
        }
        return id;
}

/* Destination fields for the values of one "some"/"full" PSI line */
struct psi_line_dest {
        double *avg10;
        double *avg60;
        double *avg300;
        unsigned long long *total;
};

/*
 * Parse the "avg10=.. avg60=.. avg300=.. total=.." part of a PSI line whose
 * first four characters ("some" or "full") were already matched by the
 * caller. Returns 0 when all four values were read, -1 otherwise.
 */
static int parse_psi_line(const char *line, const struct psi_line_dest *dest)
{
        int n = sscanf(line + 4, " avg10=%lf avg60=%lf avg300=%lf total=%llu",
                       dest->avg10, dest->avg60, dest->avg300, dest->total);

        return n == 4 ? 0 : -1;
}

/*
 * Read one PSI file and store its "some" and "full" lines.
 *
 * @path:  file to read
 * @label: resource name used in parse error messages ("CPU", "IO", ...)
 * @some:  destination for the "some" line, or NULL to ignore such lines
 * @full:  destination for the "full" line, or NULL to ignore such lines
 *
 * Returns the number of problems encountered (open failure or unparsable
 * lines); each one is also reported on stderr.
 */
static int read_psi_file(const char *path, const char *label,
                         const struct psi_line_dest *some,
                         const struct psi_line_dest *full)
{
        char line[256];
        int errors = 0;
        FILE *fp;

        fp = fopen(path, "r");
        if (!fp) {
                fprintf(stderr, "Warning: Failed to open %s\n", path);
                return 1;
        }

        while (fgets(line, sizeof(line), fp)) {
                if (some && strncmp(line, "some", 4) == 0) {
                        if (parse_psi_line(line, some) < 0) {
                                fprintf(stderr, "Failed to parse %s some PSI data\n", label);
                                errors++;
                        }
                } else if (full && strncmp(line, "full", 4) == 0) {
                        if (parse_psi_line(line, full) < 0) {
                                fprintf(stderr, "Failed to parse %s full PSI data\n", label);
                                errors++;
                        }
                }
        }
        fclose(fp);

        return errors;
}

/* Build a psi_line_dest pointing at the psi.<res>_<kind>_* members */
#define PSI_DEST(res, kind) { \
        &psi.res##_##kind##_avg10, &psi.res##_##kind##_avg60, \
        &psi.res##_##kind##_avg300, &psi.res##_##kind##_total }

/*
 * Refresh the global psi snapshot from the PSI files under /proc/pressure.
 *
 * Returns -1 when the PSI directory does not exist (psi is left untouched),
 * 0 when everything was read, or the number of warnings (files that could
 * not be opened, lines that could not be parsed) otherwise. Fields that
 * could not be read stay zero.
 */
static int read_psi_stats(void)
{
        static const struct psi_line_dest cpu_some = PSI_DEST(cpu, some);
        static const struct psi_line_dest cpu_full = PSI_DEST(cpu, full);
        static const struct psi_line_dest memory_some = PSI_DEST(memory, some);
        static const struct psi_line_dest memory_full = PSI_DEST(memory, full);
        static const struct psi_line_dest io_some = PSI_DEST(io, some);
        static const struct psi_line_dest io_full = PSI_DEST(io, full);
        static const struct psi_line_dest irq_full = PSI_DEST(irq, full);
        int error_count = 0;

        /* Check if PSI path exists */
        if (access(PSI_PATH, F_OK) != 0) {
                fprintf(stderr, "Error: PSI interface not found at %s\n", PSI_PATH);
                fprintf(stderr, "Please ensure your kernel supports PSI (Pressure Stall Information)\n");
                return -1;
        }

        /* Zero all fields */
        memset(&psi, 0, sizeof(psi));

        error_count += read_psi_file(PSI_CPU_PATH, "CPU", &cpu_some, &cpu_full);
        error_count += read_psi_file(PSI_MEMORY_PATH, "Memory", &memory_some, &memory_full);
        error_count += read_psi_file(PSI_IO_PATH, "IO", &io_some, &io_full);
        /* IRQ pressure (only full) */
        error_count += read_psi_file(PSI_IRQ_PATH, "IRQ", NULL, &irq_full);

        /* Return error count: 0 means success, >0 means warnings, -1 means fatal error */
        if (error_count > 0) {
                fprintf(stderr, "PSI stats reading completed with %d warnings\n", error_count);
                return error_count;
        }

        return 0;
}

#undef PSI_DEST

/*
 * Read the command name of a process from /proc/<pid>/comm.
 *
 * @pid:      process id
 * @comm_buf: receives the name, without the trailing newline
 * @buf_size: size of comm_buf in bytes
 *
 * Returns 0 on success, -1 when the file cannot be opened (reported on
 * stderr) or nothing could be read from it.
 */
static int read_comm(int pid, char *comm_buf, size_t buf_size)
{
        char proc_path[64];
        char *newline;
        FILE *comm_file;
        int status;

        snprintf(proc_path, sizeof(proc_path), "/proc/%d/comm", pid);

        comm_file = fopen(proc_path, "r");
        if (comm_file == NULL) {
                fprintf(stderr, "Failed to open comm file /proc/%d/comm\n", pid);
                return -1;
        }

        if (fgets(comm_buf, buf_size, comm_file) == NULL) {
                status = -1;
        } else {
                /* fgets() stops at the newline, so it can only be the last char */
                newline = strchr(comm_buf, '\n');
                if (newline != NULL)
                        *newline = '\0';
                status = 0;
        }

        fclose(comm_file);

        return status;
}

static void fetch_and_fill_task_info(int pid, const char *comm)
{
        struct {
                struct nlmsghdr n;
                struct genlmsghdr g;
                char buf[MAX_MSG_SIZE];
        } resp;
        struct taskstats stats;
        struct nlattr *nested;
        struct nlattr *na;
        int nested_len;
        int nl_len;
        int rc;

        /* Send request for task stats */
        if (send_cmd(nl_sd, family_id, getpid(), TASKSTATS_CMD_GET,
                                 TASKSTATS_CMD_ATTR_PID, &pid, sizeof(pid)) < 0) {
                fprintf(stderr, "Failed to send request for task stats\n");
                return;
        }

        /* Receive response */
        rc = recv(nl_sd, &resp, sizeof(resp), 0);
        if (rc < 0 || resp.n.nlmsg_type == NLMSG_ERROR) {
                fprintf(stderr, "Failed to receive response for task stats\n");
                return;
        }

        /* Parse response */
        nl_len = GENLMSG_PAYLOAD(&resp.n);
        na = (struct nlattr *) GENLMSG_DATA(&resp);
        while (nl_len > 0) {
                if (na->nla_type == TASKSTATS_TYPE_AGGR_PID) {
                        nested = (struct nlattr *) NLA_DATA(na);
                        nested_len = NLA_PAYLOAD(na->nla_len);
                        while (nested_len > 0) {
                                if (nested->nla_type == TASKSTATS_TYPE_STATS) {
                                        memcpy(&stats, NLA_DATA(nested), sizeof(stats));
                                        if (task_count < MAX_TASKS) {
                                                tasks[task_count].pid = pid;
                                                tasks[task_count].tgid = pid;
                                                strncpy(tasks[task_count].command, comm,
                                                        TASK_COMM_LEN - 1);
                                                tasks[task_count].command[TASK_COMM_LEN - 1] = '\0';
                                                SET_TASK_STAT(task_count, cpu_count);
                                                SET_TASK_STAT(task_count, cpu_delay_total);
                                                SET_TASK_STAT(task_count, blkio_count);
                                                SET_TASK_STAT(task_count, blkio_delay_total);
                                                SET_TASK_STAT(task_count, swapin_count);
                                                SET_TASK_STAT(task_count, swapin_delay_total);
                                                SET_TASK_STAT(task_count, freepages_count);
                                                SET_TASK_STAT(task_count, freepages_delay_total);
                                                SET_TASK_STAT(task_count, thrashing_count);
                                                SET_TASK_STAT(task_count, thrashing_delay_total);
                                                SET_TASK_STAT(task_count, compact_count);
                                                SET_TASK_STAT(task_count, compact_delay_total);
                                                SET_TASK_STAT(task_count, wpcopy_count);
                                                SET_TASK_STAT(task_count, wpcopy_delay_total);
                                                SET_TASK_STAT(task_count, irq_count);
                                                SET_TASK_STAT(task_count, irq_delay_total);
                                                set_mem_count(&tasks[task_count]);
                                                set_mem_delay_total(&tasks[task_count]);
                                                task_count++;
                                        }
                                        break;
                                }
                                nested_len -= NLA_ALIGN(nested->nla_len);
                                nested = NLA_NEXT(nested);
                        }
                }
                nl_len -= NLA_ALIGN(na->nla_len);
                na = NLA_NEXT(na);
        }
        return;
}

/*
 * Rebuild tasks[] from scratch.
 *
 * With cfg.monitor_pid set, only that process is queried. Otherwise every
 * numeric entry of /proc is queried until the directory is exhausted or
 * tasks[] is full. Processes whose comm file cannot be read are skipped.
 */
static void get_task_delays(void)
{
        char comm[TASK_COMM_LEN];
        struct dirent *entry;
        char *end;
        long val;
        DIR *dir;

        task_count = 0;
        if (cfg.monitor_pid > 0) {
                if (read_comm(cfg.monitor_pid, comm, sizeof(comm)) == 0)
                        fetch_and_fill_task_info(cfg.monitor_pid, comm);
                return;
        }

        dir = opendir("/proc");
        if (!dir) {
                fprintf(stderr, "Error opening /proc directory\n");
                return;
        }

        while ((entry = readdir(dir)) != NULL && task_count < MAX_TASKS) {
                /* <ctype.h> functions need an unsigned char value */
                if (!isdigit((unsigned char)entry->d_name[0]))
                        continue;

                /* Accept only names that are entirely a positive int */
                errno = 0;
                val = strtol(entry->d_name, &end, 10);
                if (*end != '\0' || errno == ERANGE || val <= 0 || val > INT_MAX)
                        continue;

                if (read_comm((int)val, comm, sizeof(comm)) != 0)
                        continue;
                fetch_and_fill_task_info((int)val, comm);
        }
        closedir(dir);
}

/*
 * Average of 'total' over 'count' events, with 'total' scaled down by
 * 1000000 (nanoseconds to milliseconds). Returns 0 when count is 0.
 */
static double average_ms(unsigned long long total, unsigned long long count)
{
        double scaled;

        if (!count)
                return 0;

        scaled = (double)total / 1000000.0;
        return scaled / count;
}

/*
 * Comparison function for sorting tasks.
 *
 * Orders two struct task_info entries by the average delay of the field
 * selected in cfg.sort_field, largest average first. Returns 1 when @b
 * sorts before @a, -1 when @a sorts before @b, and 0 when the averages are
 * equal.
 *
 * Both the total and the count are unsigned long long members of
 * struct task_info, so both are read with that type. They are fetched with
 * memcpy() from the byte offsets stored in the field descriptor, which
 * avoids reading a count through a narrower unsigned long and keeps the
 * const qualification of the arguments.
 */
static int compare_tasks(const void *a, const void *b)
{
        const char *t1 = (const char *)a;
        const char *t2 = (const char *)b;
        unsigned long long total1;
        unsigned long long total2;
        unsigned long long count1;
        unsigned long long count2;
        double avg1, avg2;

        memcpy(&total1, t1 + cfg.sort_field->total_offset, sizeof(total1));
        memcpy(&total2, t2 + cfg.sort_field->total_offset, sizeof(total2));
        memcpy(&count1, t1 + cfg.sort_field->count_offset, sizeof(count1));
        memcpy(&count2, t2 + cfg.sort_field->count_offset, sizeof(count2));

        avg1 = average_ms(total1, count1);
        avg2 = average_ms(total2, count2);
        if (avg1 != avg2)
                return avg2 > avg1 ? 1 : -1;

        return 0;
}

/* Sort the collected tasks[] entries with compare_tasks(); no-op when empty */
static void sort_tasks(void)
{
        if (task_count <= 0)
                return;

        qsort(tasks, task_count, sizeof(tasks[0]), compare_tasks);
}

/*
 * Get container statistics via cgroupstats.
 *
 * Opens cfg.container_path, sends a CGROUPSTATS_CMD_GET request carrying
 * that file descriptor over the netlink socket nl_sd, and copies the
 * task-state counters found in the CGROUPSTATS_TYPE_CGROUP_STATS attribute
 * of the reply into container_stats.
 *
 * Returns immediately when no container path is configured. Open, send and
 * receive failures are reported on stderr. On any failure, or when the
 * reply carries no usable attribute, container_stats is left unchanged.
 */
static void get_container_stats(void)
{
        struct {
                struct nlmsghdr n;
                struct genlmsghdr g;
                char buf[MAX_MSG_SIZE];
        } resp;
        struct cgroupstats stats;
        struct nlattr *na;
        size_t payload;
        int nl_len;
        int rc, cfd;

        /* Check if container path is set */
        if (!cfg.container_path)
                return;

        /* Open container cgroup */
        cfd = open(cfg.container_path, O_RDONLY);
        if (cfd < 0) {
                fprintf(stderr, "Error opening container path: %s\n", cfg.container_path);
                return;
        }

        /* Send request for container stats */
        if (send_cmd(nl_sd, family_id, getpid(), CGROUPSTATS_CMD_GET,
                                CGROUPSTATS_CMD_ATTR_FD, &cfd, sizeof(__u32)) < 0) {
                fprintf(stderr, "Failed to send request for container stats\n");
                goto out;
        }

        /*
         * Receive response. NLMSG_OK() rejects replies shorter than a
         * netlink header or whose nlmsg_len exceeds what was received, so
         * the header fields are only trusted once they are really there.
         */
        rc = recv(nl_sd, &resp, sizeof(resp), 0);
        if (rc < 0 || !NLMSG_OK(&resp.n, rc) ||
            resp.n.nlmsg_type == NLMSG_ERROR) {
                fprintf(stderr, "Failed to receive response for container stats\n");
                goto out;
        }

        /* Parse response */
        nl_len = GENLMSG_PAYLOAD(&resp.n);
        na = (struct nlattr *) GENLMSG_DATA(&resp);
        while (nl_len >= NLA_HDRLEN) {
                /*
                 * Stop on a malformed attribute: a length below the header
                 * size (including 0) would never advance the cursor, and a
                 * length beyond the remaining payload would walk off the
                 * message.
                 */
                if (na->nla_len < NLA_HDRLEN || na->nla_len > nl_len)
                        break;

                if (na->nla_type == CGROUPSTATS_TYPE_CGROUP_STATS) {
                        /* Copy no more than the attribute actually carries */
                        payload = NLA_PAYLOAD(na->nla_len);
                        if (payload > sizeof(stats))
                                payload = sizeof(stats);
                        memset(&stats, 0, sizeof(stats));
                        memcpy(&stats, NLA_DATA(na), payload);

                        /* Fill container stats */
                        container_stats.nr_sleeping = stats.nr_sleeping;
                        container_stats.nr_running = stats.nr_running;
                        container_stats.nr_stopped = stats.nr_stopped;
                        container_stats.nr_uninterruptible = stats.nr_uninterruptible;
                        container_stats.nr_io_wait = stats.nr_io_wait;
                        break;
                }
                nl_len -= NLA_ALIGN(na->nla_len);
                na = NLA_NEXT(na);
        }

out:
        close(cfd);
}

/*
 * Render one full screen of output on stdout.
 *
 * @psi_ret: non-zero means PSI data is unavailable; a hint is printed
 *           instead of the pressure lines.
 *
 * The screen is cleared first, then the following sections are printed in
 * order: system pressure (PSI), container task-state counters (only when
 * cfg.container_path is set), the interactive key help, and the per-task
 * delay table limited to cfg.max_processes rows. The table columns depend
 * on cfg.display_mode.
 *
 * Every write goes through BOOL_FPRINT; if any of them reports failure a
 * single diagnostic is emitted with perror() at the end.
 */
static void display_results(int psi_ret)
{
        FILE *out = stdout;
        bool suc = true;
        int i, count;

        /* Clear terminal screen */
        suc &= BOOL_FPRINT(out, "\033[H\033[J");

        /* PSI output (one-line, no cat style) */
        suc &= BOOL_FPRINT(out, "System Pressure Information: (avg10/avg60/avg300/total)\n");
        if (psi_ret) {
                suc &= BOOL_FPRINT(out, "  PSI not found: check if psi=1 enabled in cmdline\n");
        } else {
                suc &= BOOL_FPRINT(out, PSI_LINE_FORMAT,
                        "CPU some:",
                        psi.cpu_some_avg10,
                        psi.cpu_some_avg60,
                        psi.cpu_some_avg300,
                        psi.cpu_some_total / 1000);
                suc &= BOOL_FPRINT(out, PSI_LINE_FORMAT,
                        "CPU full:",
                        psi.cpu_full_avg10,
                        psi.cpu_full_avg60,
                        psi.cpu_full_avg300,
                        psi.cpu_full_total / 1000);
                suc &= BOOL_FPRINT(out, PSI_LINE_FORMAT,
                        "Memory full:",
                        psi.memory_full_avg10,
                        psi.memory_full_avg60,
                        psi.memory_full_avg300,
                        psi.memory_full_total / 1000);
                suc &= BOOL_FPRINT(out, PSI_LINE_FORMAT,
                        "Memory some:",
                        psi.memory_some_avg10,
                        psi.memory_some_avg60,
                        psi.memory_some_avg300,
                        psi.memory_some_total / 1000);
                suc &= BOOL_FPRINT(out, PSI_LINE_FORMAT,
                        "IO full:",
                        psi.io_full_avg10,
                        psi.io_full_avg60,
                        psi.io_full_avg300,
                        psi.io_full_total / 1000);
                suc &= BOOL_FPRINT(out, PSI_LINE_FORMAT,
                        "IO some:",
                        psi.io_some_avg10,
                        psi.io_some_avg60,
                        psi.io_some_avg300,
                        psi.io_some_total / 1000);
                suc &= BOOL_FPRINT(out, PSI_LINE_FORMAT,
                        "IRQ full:",
                        psi.irq_full_avg10,
                        psi.irq_full_avg60,
                        psi.irq_full_avg300,
                        psi.irq_full_total / 1000);
        }

        /* Container task-state counters, only when a container is monitored */
        if (cfg.container_path) {
                suc &= BOOL_FPRINT(out, "Container Information (%s):\n", cfg.container_path);
                suc &= BOOL_FPRINT(out, "Processes: running=%d, sleeping=%d, ",
                        container_stats.nr_running, container_stats.nr_sleeping);
                suc &= BOOL_FPRINT(out, "stopped=%d, uninterruptible=%d, io_wait=%d\n\n",
                        container_stats.nr_stopped, container_stats.nr_uninterruptible,
                        container_stats.nr_io_wait);
        }

        /* Interactive command help; the sort keys are shown only after 'o' */
        suc &= BOOL_FPRINT(out, "[o]sort [M]memverbose [q]quit\n");
        if (sort_selected) {
                if (cfg.display_mode == MODE_MEMVERBOSE)
                        suc &= BOOL_FPRINT(out,
                                "sort selection: [m]MEM [r]RCL [t]THR [p]CMP [w]WP\n");
                else
                        suc &= BOOL_FPRINT(out,
                                "sort selection: [c]CPU [i]IO [m]MEM [q]IRQ\n");
        }

        /* Task delay output */
        suc &= BOOL_FPRINT(out, "Top %d processes (sorted by %s delay):\n",
                        cfg.max_processes, get_name_by_field(cfg.sort_field));

        /* Table header and separator, one layout per display mode */
        suc &= BOOL_FPRINT(out, "%8s  %8s  %-17s", "PID", "TGID", "COMMAND");
        if (cfg.display_mode == MODE_MEMVERBOSE) {
                suc &= BOOL_FPRINT(out, "%8s %8s %8s %8s %8s %8s\n",
                        "MEM(ms)", "SWAP(ms)", "RCL(ms)",
                        "THR(ms)", "CMP(ms)", "WP(ms)");
                suc &= BOOL_FPRINT(out, "-----------------------");
                suc &= BOOL_FPRINT(out, "-----------------------");
                suc &= BOOL_FPRINT(out, "-----------------------");
                suc &= BOOL_FPRINT(out, "---------------------\n");
        } else {
                suc &= BOOL_FPRINT(out, "%8s %8s %8s %8s\n",
                        "CPU(ms)", "IO(ms)", "IRQ(ms)", "MEM(ms)");
                suc &= BOOL_FPRINT(out, "-----------------------");
                suc &= BOOL_FPRINT(out, "-----------------------");
                suc &= BOOL_FPRINT(out, "--------------------------\n");
        }

        /* Never print more rows than were collected or than requested */
        count = task_count < cfg.max_processes ? task_count : cfg.max_processes;

        for (i = 0; i < count; i++) {
                suc &= BOOL_FPRINT(out, "%8d  %8d  %-15s",
                        tasks[i].pid, tasks[i].tgid, tasks[i].command);
                if (cfg.display_mode == MODE_MEMVERBOSE) {
                        suc &= BOOL_FPRINT(out, DELAY_FMT_MEMVERBOSE,
                                TASK_AVG(tasks[i], mem),
                                TASK_AVG(tasks[i], swapin),
                                TASK_AVG(tasks[i], freepages),
                                TASK_AVG(tasks[i], thrashing),
                                TASK_AVG(tasks[i], compact),
                                TASK_AVG(tasks[i], wpcopy));
                } else {
                        suc &= BOOL_FPRINT(out, DELAY_FMT_DEFAULT,
                                TASK_AVG(tasks[i], cpu),
                                TASK_AVG(tasks[i], blkio),
                                TASK_AVG(tasks[i], irq),
                                TASK_AVG(tasks[i], mem));
                }
        }

        suc &= BOOL_FPRINT(out, "\n");

        if (!suc)
                perror("Error writing to output");
}

/*
 * Check for keyboard input with a timeout based on cfg.delay.
 *
 * Blocks in select() on stdin for at most cfg.delay seconds.
 *
 * Returns the byte that was read, or 0 when the timeout expired, select()
 * failed, or no byte could be read from stdin (end of file or read error).
 */
static char check_for_keypress(void)
{
        struct timeval tv = {cfg.delay, 0};
        fd_set readfds;
        char ch;
        int ready;

        FD_ZERO(&readfds);
        FD_SET(STDIN_FILENO, &readfds);

        ready = select(STDIN_FILENO + 1, &readfds, NULL, NULL, &tv);
        if (ready <= 0 || !FD_ISSET(STDIN_FILENO, &readfds))
                return 0;

        /* Only trust ch when exactly one byte was actually delivered */
        if (read(STDIN_FILENO, &ch, 1) != 1)
                return 0;

        return ch;
}

#define MAX_MODE_SIZE 2
/*
 * Switch cfg.display_mode to the next entry of the mode cycle
 * (MODE_DEFAULT -> MODE_MEMVERBOSE -> MODE_DEFAULT ...).
 *
 * The successor is looked up from the current value of cfg.display_mode
 * instead of a private counter, so the toggle stays correct even if the
 * mode was assigned somewhere other than this function.
 * NOTE(review): whether such an assignment exists (e.g. in option parsing)
 * is not visible from here -- confirm.
 *
 * If the current mode is not part of the cycle, MODE_DEFAULT is selected.
 */
static void toggle_display_mode(void)
{
        static const size_t modes[MAX_MODE_SIZE] = {MODE_DEFAULT, MODE_MEMVERBOSE};
        size_t next = 0;
        size_t i;

        for (i = 0; i < MAX_MODE_SIZE; i++) {
                if (modes[i] == (size_t)cfg.display_mode) {
                        next = (i + 1) % MAX_MODE_SIZE;
                        break;
                }
        }

        cfg.display_mode = modes[next];
}

/*
 * React to one key typed by the user.
 *
 * @ch:      the key that was read
 * @running: main-loop flag; set to 0 when the user asks to quit
 *
 * While a sort selection is pending (sort_selected is set), @ch is looked
 * up with get_field_by_cmd_char(); the result becomes the sort field only
 * if it is supported in the current display mode, and the pending state is
 * cleared either way. Otherwise:
 *   'o'        arms the sort selection for the next key,
 *   'M'        toggles the display mode and picks the first entry of
 *              sort_fields that the new mode supports,
 *   'q' / 'Q'  clears *@running.
 * Any other key is ignored.
 */
static void handle_keypress(char ch, int *running)
{
        const struct field_desc *field;

        /* Second key of the "o" sequence: choose the sort field */
        if (sort_selected) {
                field = get_field_by_cmd_char(ch);
                if (field && (field->supported_modes & cfg.display_mode))
                        cfg.sort_field = field;

                sort_selected = 0;
                return;
        }

        if (ch == 'o') {
                sort_selected = 1;
        } else if (ch == 'M') {
                toggle_display_mode();

                /* Scan for the first field usable in the new mode */
                field = sort_fields;
                while (field->name != NULL &&
                       !(field->supported_modes & cfg.display_mode))
                        field++;
                if (field->name != NULL)
                        cfg.sort_field = field;
        } else if (ch == 'q' || ch == 'Q') {
                *running = 0;
        }
}

/*
 * Program entry point.
 *
 * Parses the command line, opens the netlink socket and resolves the
 * taskstats family id (exiting with status 1 if either step fails), puts
 * the terminal into raw mode and then loops: collect PSI, optional
 * container stats and task delays, sort, display, and finally wait for a
 * key. The loop ends when `running` is cleared, when cfg.iterations
 * refreshes have been shown, or after one pass if cfg.output_one_time is
 * set. The terminal mode is restored and resources released before
 * returning 0.
 */
int main(int argc, char **argv)
{
        const struct field_desc *candidate;
        int completed = 0;
        int psi_ret = 0;
        char key;

        parse_args(argc, argv);

        /* Netlink socket used for all taskstats/cgroupstats queries */
        nl_sd = create_nl_socket();
        if (nl_sd < 0) {
                fprintf(stderr, "Error creating netlink socket\n");
                exit(1);
        }

        family_id = get_family_id(nl_sd);
        if (!family_id) {
                fprintf(stderr, "Error getting taskstats family ID\n");
                close(nl_sd);
                exit(1);
        }

        /* Raw terminal mode for the interactive key handling below */
        enable_raw_mode();

        while (running) {
                /*
                 * The active sort field may not be valid for the current
                 * display mode; fall back to the first one that is.
                 */
                if (!(cfg.sort_field->supported_modes & cfg.display_mode)) {
                        candidate = sort_fields;
                        while (candidate->name != NULL &&
                               !(candidate->supported_modes & cfg.display_mode))
                                candidate++;
                        if (candidate->name != NULL) {
                                cfg.sort_field = candidate;
                                printf("Auto-switched sort field to: %s\n",
                                       candidate->name);
                        }
                }

                /* Gather one sample of every data source */
                psi_ret = read_psi_stats();
                if (cfg.container_path)
                        get_container_stats();
                get_task_delays();

                sort_tasks();
                display_results(psi_ret);

                /* Stop after the requested refresh count or a single pass */
                if ((cfg.iterations > 0 && ++completed >= cfg.iterations) ||
                    cfg.output_one_time)
                        break;

                /* Doubles as the refresh delay: waits for a key or timeout */
                key = check_for_keypress();
                if (key)
                        handle_keypress(key, &running);
        }

        disable_raw_mode();

        close(nl_sd);
        free(cfg.container_path);

        return 0;
}