mirror of
				git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
				synced 2025-09-18 22:14:16 +00:00 
			
		
		
		
	 0e0af57e0e
			
		
	
	
		0e0af57e0e
		
	
	
	
	
		
			
			The task exit struct needs some crucial information to be able to provide an enhanced version of process and thread accounting. This change provides: 1. ac_tgid in addition to ac_pid 2. thread group execution walltime in ac_tgetime 3. flag AGROUP in ac_flag to indicate the last task in a thread group / process 4. device ID and inode of task's /proc/self/exe in ac_exe_dev and ac_exe_inode 5. tools/accounting/procacct as demonstrator When a task exits, taskstats are reported to userspace including the task's pid and ppid, but without the id of the thread group this task is part of. Without the tgid, the stats of single tasks cannot be correlated to each other as a thread group (process). The taskstats documentation suggests that on process exit a data set consisting of accumulated stats for the whole group is produced. But such an additional set of stats is only produced for actually multithreaded processes, not groups that had only one thread, and also those stats only contain data about delay accounting and not the more basic information about CPU and memory resource usage. Adding the AGROUP flag to be set when the last task of a group exited enables determination of process end also for single-threaded processes. My application basically does enhanced process accounting with summed cputime, biggest maxrss, tasks per process. The data is not available with the traditional BSD process accounting (which is not designed to be extensible) and the taskstats interface allows more efficient on-the-fly grouping and summing of the stats, anyway, without intermediate disk writes. Furthermore, I do carry statistics on which exact program binary is used how often with associated resources, getting a picture on how important which parts of a collection of installed scientific software in different versions are, and how well they put load on the machine. This is enabled by providing information on /proc/self/exe for each task. 
I assume the two 64-bit fields for device ID and inode are more appropriate than the possibly large resolved path to keep the data volume down. Add the tgid to the stats to complete task identification, the flag AGROUP to mark the last task of a group, the group wallclock time, and inode-based identification of the associated executable file. Add tools/accounting/procacct.c as a simplified fork of getdelays.c to demonstrate process and thread accounting. [thomas.orgis@uni-hamburg.de: fix version number in comment] Link: https://lkml.kernel.org/r/20220405003601.7a5f6008@plasteblaster Link: https://lkml.kernel.org/r/20220331004106.64e5616b@plasteblaster Signed-off-by: Dr. Thomas Orgis <thomas.orgis@uni-hamburg.de> Reviewed-by: Ismael Luceno <ismael@iodev.co.uk> Cc: Balbir Singh <bsingharora@gmail.com> Cc: Eric W. Biederman <ebiederm@xmission.com> Cc: xu xin <xu.xin16@zte.com.cn> Cc: Yang Yang <yang.yang29@zte.com.cn> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
		
			
				
	
	
		
			182 lines
		
	
	
	
		
			5 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
			
		
		
	
	
			182 lines
		
	
	
	
		
			5 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
| // SPDX-License-Identifier: GPL-2.0-or-later
 | |
| /*
 | |
|  * tsacct.c - System accounting over taskstats interface
 | |
|  *
 | |
|  * Copyright (C) Jay Lan,	<jlan@sgi.com>
 | |
|  */
 | |
| 
 | |
| #include <linux/kernel.h>
 | |
| #include <linux/sched/signal.h>
 | |
| #include <linux/sched/mm.h>
 | |
| #include <linux/sched/cputime.h>
 | |
| #include <linux/tsacct_kern.h>
 | |
| #include <linux/acct.h>
 | |
| #include <linux/jiffies.h>
 | |
| #include <linux/mm.h>
 | |
| 
 | |
| /*
 | |
|  * fill in basic accounting fields
 | |
|  */
 | |
| void bacct_add_tsk(struct user_namespace *user_ns,
 | |
| 		   struct pid_namespace *pid_ns,
 | |
| 		   struct taskstats *stats, struct task_struct *tsk)
 | |
| {
 | |
| 	const struct cred *tcred;
 | |
| 	u64 utime, stime, utimescaled, stimescaled;
 | |
| 	u64 now_ns, delta;
 | |
| 	time64_t btime;
 | |
| 
 | |
| 	BUILD_BUG_ON(TS_COMM_LEN < TASK_COMM_LEN);
 | |
| 
 | |
| 	/* calculate task elapsed time in nsec */
 | |
| 	now_ns = ktime_get_ns();
 | |
| 	/* store whole group time first */
 | |
| 	delta = now_ns - tsk->group_leader->start_time;
 | |
| 	/* Convert to micro seconds */
 | |
| 	do_div(delta, NSEC_PER_USEC);
 | |
| 	stats->ac_tgetime = delta;
 | |
| 	delta = now_ns - tsk->start_time;
 | |
| 	do_div(delta, NSEC_PER_USEC);
 | |
| 	stats->ac_etime = delta;
 | |
| 	/* Convert to seconds for btime (note y2106 limit) */
 | |
| 	btime = ktime_get_real_seconds() - div_u64(delta, USEC_PER_SEC);
 | |
| 	stats->ac_btime = clamp_t(time64_t, btime, 0, U32_MAX);
 | |
| 	stats->ac_btime64 = btime;
 | |
| 
 | |
| 	if (tsk->flags & PF_EXITING)
 | |
| 		stats->ac_exitcode = tsk->exit_code;
 | |
| 	if (thread_group_leader(tsk) && (tsk->flags & PF_FORKNOEXEC))
 | |
| 		stats->ac_flag |= AFORK;
 | |
| 	if (tsk->flags & PF_SUPERPRIV)
 | |
| 		stats->ac_flag |= ASU;
 | |
| 	if (tsk->flags & PF_DUMPCORE)
 | |
| 		stats->ac_flag |= ACORE;
 | |
| 	if (tsk->flags & PF_SIGNALED)
 | |
| 		stats->ac_flag |= AXSIG;
 | |
| 	stats->ac_nice	 = task_nice(tsk);
 | |
| 	stats->ac_sched	 = tsk->policy;
 | |
| 	stats->ac_pid	 = task_pid_nr_ns(tsk, pid_ns);
 | |
| 	stats->ac_tgid   = task_tgid_nr_ns(tsk, pid_ns);
 | |
| 	rcu_read_lock();
 | |
| 	tcred = __task_cred(tsk);
 | |
| 	stats->ac_uid	 = from_kuid_munged(user_ns, tcred->uid);
 | |
| 	stats->ac_gid	 = from_kgid_munged(user_ns, tcred->gid);
 | |
| 	stats->ac_ppid	 = pid_alive(tsk) ?
 | |
| 		task_tgid_nr_ns(rcu_dereference(tsk->real_parent), pid_ns) : 0;
 | |
| 	rcu_read_unlock();
 | |
| 
 | |
| 	task_cputime(tsk, &utime, &stime);
 | |
| 	stats->ac_utime = div_u64(utime, NSEC_PER_USEC);
 | |
| 	stats->ac_stime = div_u64(stime, NSEC_PER_USEC);
 | |
| 
 | |
| 	task_cputime_scaled(tsk, &utimescaled, &stimescaled);
 | |
| 	stats->ac_utimescaled = div_u64(utimescaled, NSEC_PER_USEC);
 | |
| 	stats->ac_stimescaled = div_u64(stimescaled, NSEC_PER_USEC);
 | |
| 
 | |
| 	stats->ac_minflt = tsk->min_flt;
 | |
| 	stats->ac_majflt = tsk->maj_flt;
 | |
| 
 | |
| 	strncpy(stats->ac_comm, tsk->comm, sizeof(stats->ac_comm));
 | |
| }
 | |
| 
 | |
| 
 | |
| #ifdef CONFIG_TASK_XACCT
 | |
| 
 | |
| #define KB 1024
 | |
| #define MB (1024*KB)
 | |
| #define KB_MASK (~(KB-1))
 | |
| /*
 | |
|  * fill in extended accounting fields
 | |
|  */
 | |
| void xacct_add_tsk(struct taskstats *stats, struct task_struct *p)
 | |
| {
 | |
| 	struct mm_struct *mm;
 | |
| 
 | |
| 	/* convert pages-nsec/1024 to Mbyte-usec, see __acct_update_integrals */
 | |
| 	stats->coremem = p->acct_rss_mem1 * PAGE_SIZE;
 | |
| 	do_div(stats->coremem, 1000 * KB);
 | |
| 	stats->virtmem = p->acct_vm_mem1 * PAGE_SIZE;
 | |
| 	do_div(stats->virtmem, 1000 * KB);
 | |
| 	mm = get_task_mm(p);
 | |
| 	if (mm) {
 | |
| 		/* adjust to KB unit */
 | |
| 		stats->hiwater_rss   = get_mm_hiwater_rss(mm) * PAGE_SIZE / KB;
 | |
| 		stats->hiwater_vm    = get_mm_hiwater_vm(mm)  * PAGE_SIZE / KB;
 | |
| 		mmput(mm);
 | |
| 	}
 | |
| 	stats->read_char	= p->ioac.rchar & KB_MASK;
 | |
| 	stats->write_char	= p->ioac.wchar & KB_MASK;
 | |
| 	stats->read_syscalls	= p->ioac.syscr & KB_MASK;
 | |
| 	stats->write_syscalls	= p->ioac.syscw & KB_MASK;
 | |
| #ifdef CONFIG_TASK_IO_ACCOUNTING
 | |
| 	stats->read_bytes	= p->ioac.read_bytes & KB_MASK;
 | |
| 	stats->write_bytes	= p->ioac.write_bytes & KB_MASK;
 | |
| 	stats->cancelled_write_bytes = p->ioac.cancelled_write_bytes & KB_MASK;
 | |
| #else
 | |
| 	stats->read_bytes	= 0;
 | |
| 	stats->write_bytes	= 0;
 | |
| 	stats->cancelled_write_bytes = 0;
 | |
| #endif
 | |
| }
 | |
| #undef KB
 | |
| #undef MB
 | |
| 
 | |
| static void __acct_update_integrals(struct task_struct *tsk,
 | |
| 				    u64 utime, u64 stime)
 | |
| {
 | |
| 	u64 time, delta;
 | |
| 
 | |
| 	if (!likely(tsk->mm))
 | |
| 		return;
 | |
| 
 | |
| 	time = stime + utime;
 | |
| 	delta = time - tsk->acct_timexpd;
 | |
| 
 | |
| 	if (delta < TICK_NSEC)
 | |
| 		return;
 | |
| 
 | |
| 	tsk->acct_timexpd = time;
 | |
| 	/*
 | |
| 	 * Divide by 1024 to avoid overflow, and to avoid division.
 | |
| 	 * The final unit reported to userspace is Mbyte-usecs,
 | |
| 	 * the rest of the math is done in xacct_add_tsk.
 | |
| 	 */
 | |
| 	tsk->acct_rss_mem1 += delta * get_mm_rss(tsk->mm) >> 10;
 | |
| 	tsk->acct_vm_mem1 += delta * READ_ONCE(tsk->mm->total_vm) >> 10;
 | |
| }
 | |
| 
 | |
| /**
 | |
|  * acct_update_integrals - update mm integral fields in task_struct
 | |
|  * @tsk: task_struct for accounting
 | |
|  */
 | |
| void acct_update_integrals(struct task_struct *tsk)
 | |
| {
 | |
| 	u64 utime, stime;
 | |
| 	unsigned long flags;
 | |
| 
 | |
| 	local_irq_save(flags);
 | |
| 	task_cputime(tsk, &utime, &stime);
 | |
| 	__acct_update_integrals(tsk, utime, stime);
 | |
| 	local_irq_restore(flags);
 | |
| }
 | |
| 
 | |
| /**
 | |
|  * acct_account_cputime - update mm integral after cputime update
 | |
|  * @tsk: task_struct for accounting
 | |
|  */
 | |
| void acct_account_cputime(struct task_struct *tsk)
 | |
| {
 | |
| 	__acct_update_integrals(tsk, tsk->utime, tsk->stime);
 | |
| }
 | |
| 
 | |
| /**
 | |
|  * acct_clear_integrals - clear the mm integral fields in task_struct
 | |
|  * @tsk: task_struct whose accounting fields are cleared
 | |
|  */
 | |
| void acct_clear_integrals(struct task_struct *tsk)
 | |
| {
 | |
| 	tsk->acct_timexpd = 0;
 | |
| 	tsk->acct_rss_mem1 = 0;
 | |
| 	tsk->acct_vm_mem1 = 0;
 | |
| }
 | |
| #endif
 |