mirror of
git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
synced 2025-08-05 16:54:27 +00:00
Merge patch series "net, pidfs: enable handing out pidfds for reaped sk->sk_peer_pid"
Christian Brauner <brauner@kernel.org> says:

SO_PEERPIDFD currently doesn't support handing out pidfds if the
sk->sk_peer_pid thread-group leader has already been reaped. In this case it
currently returns EINVAL. Userspace still wants to get a pidfd for a reaped
process to have a stable handle it can pass on. This is especially useful now
that it is possible to retrieve exit information through a pidfd via the
PIDFD_GET_INFO ioctl()'s PIDFD_INFO_EXIT flag.

Another summary has been provided by David in [1]:

> A pidfd can outlive the task it refers to, and thus user-space must
> already be prepared that the task underlying a pidfd is gone at the time
> they get their hands on the pidfd. For instance, resolving the pidfd to
> a PID via the fdinfo must be prepared to read `-1`.
>
> Despite user-space knowing that a pidfd might be stale, several kernel
> APIs currently add another layer that checks for this. In particular,
> SO_PEERPIDFD returns `EINVAL` if the peer-task was already reaped,
> but returns a stale pidfd if the task is reaped immediately after the
> respective alive-check.
>
> This has the unfortunate effect that user-space now has two ways to
> check for the exact same scenario: A syscall might return
> EINVAL/ESRCH/... *or* the pidfd might be stale, even though there is no
> particular reason to distinguish both cases. This also propagates
> through user-space APIs, which pass on pidfds. They must be prepared to
> pass on `-1` *or* the pidfd, because there is no guaranteed way to get a
> stale pidfd from the kernel.
>
> Userspace must already deal with a pidfd referring to a reaped task as
> the task may exit and get reaped at any time while there are still many
> pidfds referring to it.

In order to allow handing out reaped pidfds, SO_PEERPIDFD needs to ensure
that PIDFD_INFO_EXIT information is available whenever a pidfd for a reaped
task is created. The uapi promises that reaped pidfds are only handed out if
it is guaranteed that the caller sees the exit information:

TEST_F(pidfd_info, success_reaped)
{
	struct pidfd_info info = {
		.mask = PIDFD_INFO_CGROUPID | PIDFD_INFO_EXIT,
	};

	/*
	 * Process has already been reaped and PIDFD_INFO_EXIT been set.
	 * Verify that we can retrieve the exit status of the process.
	 */
	ASSERT_EQ(ioctl(self->child_pidfd4, PIDFD_GET_INFO, &info), 0);
	ASSERT_FALSE(!!(info.mask & PIDFD_INFO_CREDS));
	ASSERT_TRUE(!!(info.mask & PIDFD_INFO_EXIT));
	ASSERT_TRUE(WIFEXITED(info.exit_code));
	ASSERT_EQ(WEXITSTATUS(info.exit_code), 0);
}

To hand out pidfds for reaped processes we thus allocate a pidfs entry for
the relevant sk->sk_peer_pid at the time the sk->sk_peer_pid is stashed and
drop it when the socket is destroyed. This guarantees that exit information
will always be recorded for the sk->sk_peer_pid task and we can hand out
pidfds for reaped processes.

* patches from https://lore.kernel.org/20250425-work-pidfs-net-v2-0-450a19461e75@kernel.org:
  net, pidfs: enable handing out pidfds for reaped sk->sk_peer_pid
  pidfs: get rid of __pidfd_prepare()
  net, pidfs: prepare for handing out pidfds for reaped sk->sk_peer_pid
  pidfs: register pid in pidfs

Link: https://lore.kernel.org/20250425-work-pidfs-net-v2-0-450a19461e75@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
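For reference, a minimal userspace sketch (not part of the patch series) of the
flow this enables: fetch the peer's pidfd via SO_PEERPIDFD and read its exit
information via PIDFD_GET_INFO, which works even if the peer was already
reaped. It assumes a connected AF_UNIX socket `sockfd`, uapi headers new
enough to provide struct pidfd_info / PIDFD_GET_INFO / PIDFD_INFO_EXIT, and
the helper name peer_exit_status is purely illustrative.

	#include <errno.h>
	#include <stdio.h>
	#include <sys/ioctl.h>
	#include <sys/socket.h>
	#include <sys/wait.h>
	#include <linux/pidfd.h>

	/* Returns the peer pidfd (caller owns it) or a negative errno. */
	static int peer_exit_status(int sockfd)
	{
		struct pidfd_info info = { .mask = PIDFD_INFO_EXIT };
		socklen_t len = sizeof(int);
		int pidfd;

		/* With this series the call succeeds even for a reaped peer. */
		if (getsockopt(sockfd, SOL_SOCKET, SO_PEERPIDFD, &pidfd, &len) < 0)
			return -errno;

		if (ioctl(pidfd, PIDFD_GET_INFO, &info) < 0)
			return -errno;

		if ((info.mask & PIDFD_INFO_EXIT) && WIFEXITED(info.exit_code))
			printf("peer exited with status %d\n", WEXITSTATUS(info.exit_code));
		else
			printf("peer still running, no exit information yet\n");

		return pidfd;
	}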
This commit is contained in:
commit
923ea4d448
7 changed files with 192 additions and 86 deletions
fs/pidfs.c | 81
@@ -768,7 +768,7 @@ static inline bool pidfs_pid_valid(struct pid *pid, const struct path *path,
 {
 	enum pid_type type;
 
-	if (flags & PIDFD_CLONE)
+	if (flags & PIDFD_STALE)
 		return true;
 
 	/*
@@ -777,10 +777,14 @@ static inline bool pidfs_pid_valid(struct pid *pid, const struct path *path,
 	 * pidfd has been allocated perform another check that the pid
 	 * is still alive. If it is exit information is available even
 	 * if the task gets reaped before the pidfd is returned to
-	 * userspace. The only exception is PIDFD_CLONE where no task
-	 * linkage has been established for @pid yet and the kernel is
-	 * in the middle of process creation so there's nothing for
-	 * pidfs to miss.
+	 * userspace. The only exception are indicated by PIDFD_STALE:
+	 *
+	 * (1) The kernel is in the middle of task creation and thus no
+	 *     task linkage has been established yet.
+	 * (2) The caller knows @pid has been registered in pidfs at a
+	 *     time when the task was still alive.
+	 *
+	 * In both cases exit information will have been reported.
 	 */
 	if (flags & PIDFD_THREAD)
 		type = PIDTYPE_PID;
@@ -874,11 +878,11 @@ struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags)
 	int ret;
 
 	/*
-	 * Ensure that PIDFD_CLONE can be passed as a flag without
+	 * Ensure that PIDFD_STALE can be passed as a flag without
 	 * overloading other uapi pidfd flags.
 	 */
-	BUILD_BUG_ON(PIDFD_CLONE == PIDFD_THREAD);
-	BUILD_BUG_ON(PIDFD_CLONE == PIDFD_NONBLOCK);
+	BUILD_BUG_ON(PIDFD_STALE == PIDFD_THREAD);
+	BUILD_BUG_ON(PIDFD_STALE == PIDFD_NONBLOCK);
 
 	ret = path_from_stashed(&pid->stashed, pidfs_mnt, get_pid(pid), &path);
 	if (ret < 0)
@@ -887,7 +891,7 @@ struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags)
 	if (!pidfs_pid_valid(pid, &path, flags))
 		return ERR_PTR(-ESRCH);
 
-	flags &= ~PIDFD_CLONE;
+	flags &= ~PIDFD_STALE;
 	pidfd_file = dentry_open(&path, flags, current_cred());
 	/* Raise PIDFD_THREAD explicitly as do_dentry_open() strips it. */
 	if (!IS_ERR(pidfd_file))
@@ -896,6 +900,65 @@ struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags)
 	return pidfd_file;
 }
 
+/**
+ * pidfs_register_pid - register a struct pid in pidfs
+ * @pid: pid to pin
+ *
+ * Register a struct pid in pidfs. Needs to be paired with
+ * pidfs_put_pid() to not risk leaking the pidfs dentry and inode.
+ *
+ * Return: On success zero, on error a negative error code is returned.
+ */
+int pidfs_register_pid(struct pid *pid)
+{
+	struct path path __free(path_put) = {};
+	int ret;
+
+	might_sleep();
+
+	if (!pid)
+		return 0;
+
+	ret = path_from_stashed(&pid->stashed, pidfs_mnt, get_pid(pid), &path);
+	if (unlikely(ret))
+		return ret;
+	/* Keep the dentry and only put the reference to the mount. */
+	path.dentry = NULL;
+	return 0;
+}
+
+/**
+ * pidfs_get_pid - pin a struct pid through pidfs
+ * @pid: pid to pin
+ *
+ * Similar to pidfs_register_pid() but only valid if the caller knows
+ * there's a reference to the @pid through a dentry already that can't
+ * go away.
+ */
+void pidfs_get_pid(struct pid *pid)
+{
+	if (!pid)
+		return;
+	WARN_ON_ONCE(!stashed_dentry_get(&pid->stashed));
+}
+
+/**
+ * pidfs_put_pid - drop a pidfs reference
+ * @pid: pid to drop
+ *
+ * Drop a reference to @pid via pidfs. This is only safe if the
+ * reference has been taken via pidfs_get_pid().
+ */
+void pidfs_put_pid(struct pid *pid)
+{
+	might_sleep();
+
+	if (!pid)
+		return;
+	VFS_WARN_ON_ONCE(!pid->stashed);
+	dput(pid->stashed);
+}
+
 static void pidfs_inode_init_once(void *data)
 {
 	struct pidfs_inode *pi = data;

include/linux/pid.h
@@ -77,7 +77,7 @@ struct file;
 struct pid *pidfd_pid(const struct file *file);
 struct pid *pidfd_get_pid(unsigned int fd, unsigned int *flags);
 struct task_struct *pidfd_get_task(int pidfd, unsigned int *flags);
-int pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret);
+int pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret_file);
 void do_notify_pidfd(struct task_struct *task);
 
 static inline struct pid *get_pid(struct pid *pid)

include/linux/pidfs.h
@@ -8,5 +8,8 @@ void pidfs_add_pid(struct pid *pid);
 void pidfs_remove_pid(struct pid *pid);
 void pidfs_exit(struct task_struct *tsk);
 extern const struct dentry_operations pidfs_dentry_operations;
+int pidfs_register_pid(struct pid *pid);
+void pidfs_get_pid(struct pid *pid);
+void pidfs_put_pid(struct pid *pid);
 
 #endif /* _LINUX_PID_FS_H */

include/uapi/linux/pidfd.h
@@ -12,7 +12,7 @@
 #define PIDFD_THREAD	O_EXCL
 #ifdef __KERNEL__
 #include <linux/sched.h>
-#define PIDFD_CLONE	CLONE_PIDFD
+#define PIDFD_STALE	CLONE_PIDFD
 #endif
 
 /* Flags for pidfd_send_signal(). */

kernel/fork.c
@@ -2035,55 +2035,11 @@ static inline void rcu_copy_process(struct task_struct *p)
 #endif /* #ifdef CONFIG_TASKS_TRACE_RCU */
 }
 
-/**
- * __pidfd_prepare - allocate a new pidfd_file and reserve a pidfd
- * @pid: the struct pid for which to create a pidfd
- * @flags: flags of the new @pidfd
- * @ret: Where to return the file for the pidfd.
- *
- * Allocate a new file that stashes @pid and reserve a new pidfd number in the
- * caller's file descriptor table. The pidfd is reserved but not installed yet.
- *
- * The helper doesn't perform checks on @pid which makes it useful for pidfds
- * created via CLONE_PIDFD where @pid has no task attached when the pidfd and
- * pidfd file are prepared.
- *
- * If this function returns successfully the caller is responsible to either
- * call fd_install() passing the returned pidfd and pidfd file as arguments in
- * order to install the pidfd into its file descriptor table or they must use
- * put_unused_fd() and fput() on the returned pidfd and pidfd file
- * respectively.
- *
- * This function is useful when a pidfd must already be reserved but there
- * might still be points of failure afterwards and the caller wants to ensure
- * that no pidfd is leaked into its file descriptor table.
- *
- * Return: On success, a reserved pidfd is returned from the function and a new
- *         pidfd file is returned in the last argument to the function. On
- *         error, a negative error code is returned from the function and the
- *         last argument remains unchanged.
- */
-static int __pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret)
-{
-	struct file *pidfd_file;
-
-	CLASS(get_unused_fd, pidfd)(O_CLOEXEC);
-	if (pidfd < 0)
-		return pidfd;
-
-	pidfd_file = pidfs_alloc_file(pid, flags | O_RDWR);
-	if (IS_ERR(pidfd_file))
-		return PTR_ERR(pidfd_file);
-
-	*ret = pidfd_file;
-	return take_fd(pidfd);
-}
-
 /**
  * pidfd_prepare - allocate a new pidfd_file and reserve a pidfd
  * @pid: the struct pid for which to create a pidfd
  * @flags: flags of the new @pidfd
- * @ret: Where to return the pidfd.
+ * @ret_file: return the new pidfs file
  *
  * Allocate a new file that stashes @pid and reserve a new pidfd number in the
  * caller's file descriptor table. The pidfd is reserved but not installed yet.
@@ -2106,16 +2062,26 @@ static int __pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret)
  *         error, a negative error code is returned from the function and the
  *         last argument remains unchanged.
  */
-int pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret)
+int pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret_file)
 {
+	struct file *pidfs_file;
+
 	/*
-	 * While holding the pidfd waitqueue lock removing the task
-	 * linkage for the thread-group leader pid (PIDTYPE_TGID) isn't
-	 * possible. Thus, if there's still task linkage for PIDTYPE_PID
-	 * not having thread-group leader linkage for the pid means it
-	 * wasn't a thread-group leader in the first place.
+	 * PIDFD_STALE is only allowed to be passed if the caller knows
+	 * that @pid is already registered in pidfs and thus
+	 * PIDFD_INFO_EXIT information is guaranteed to be available.
 	 */
-	scoped_guard(spinlock_irq, &pid->wait_pidfd.lock) {
+	if (!(flags & PIDFD_STALE)) {
+		/*
+		 * While holding the pidfd waitqueue lock removing the
+		 * task linkage for the thread-group leader pid
+		 * (PIDTYPE_TGID) isn't possible. Thus, if there's still
+		 * task linkage for PIDTYPE_PID not having thread-group
+		 * leader linkage for the pid means it wasn't a
+		 * thread-group leader in the first place.
+		 */
+		guard(spinlock_irq)(&pid->wait_pidfd.lock);
+
 		/* Task has already been reaped. */
 		if (!pid_has_task(pid, PIDTYPE_PID))
 			return -ESRCH;
@@ -2128,7 +2094,16 @@ int pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret)
 			return -ENOENT;
 	}
 
-	return __pidfd_prepare(pid, flags, ret);
+	CLASS(get_unused_fd, pidfd)(O_CLOEXEC);
+	if (pidfd < 0)
+		return pidfd;
+
+	pidfs_file = pidfs_alloc_file(pid, flags | O_RDWR);
+	if (IS_ERR(pidfs_file))
+		return PTR_ERR(pidfs_file);
+
+	*ret_file = pidfs_file;
+	return take_fd(pidfd);
 }
 
 static void __delayed_free_task(struct rcu_head *rhp)
@@ -2477,7 +2452,7 @@ __latent_entropy struct task_struct *copy_process(
 	 * Note that no task has been attached to @pid yet indicate
 	 * that via CLONE_PIDFD.
 	 */
-	retval = __pidfd_prepare(pid, flags | PIDFD_CLONE, &pidfile);
+	retval = pidfd_prepare(pid, flags | PIDFD_STALE, &pidfile);
 	if (retval < 0)
 		goto bad_fork_free_pid;
 	pidfd = retval;

net/core/sock.c
@@ -148,6 +148,8 @@
 
 #include <linux/ethtool.h>
 
+#include <uapi/linux/pidfd.h>
+
 #include "dev.h"
 
 static DEFINE_MUTEX(proto_list_mutex);
@@ -1879,6 +1881,7 @@ int sk_getsockopt(struct sock *sk, int level, int optname,
 	{
 		struct pid *peer_pid;
 		struct file *pidfd_file = NULL;
+		unsigned int flags = 0;
 		int pidfd;
 
 		if (len > sizeof(pidfd))
@@ -1891,18 +1894,17 @@ int sk_getsockopt(struct sock *sk, int level, int optname,
 		if (!peer_pid)
 			return -ENODATA;
 
-		pidfd = pidfd_prepare(peer_pid, 0, &pidfd_file);
+		/* The use of PIDFD_STALE requires stashing of struct pid
+		 * on pidfs with pidfs_register_pid() and only AF_UNIX
+		 * were prepared for this.
+		 */
+		if (sk->sk_family == AF_UNIX)
+			flags = PIDFD_STALE;
+
+		pidfd = pidfd_prepare(peer_pid, flags, &pidfd_file);
 		put_pid(peer_pid);
-		if (pidfd < 0) {
-			/*
-			 * dbus-broker relies on -EINVAL being returned
-			 * to indicate ESRCH. Paper over it until this
-			 * is fixed in userspace.
-			 */
-			if (pidfd == -ESRCH)
-				pidfd = -EINVAL;
+		if (pidfd < 0)
 			return pidfd;
-		}
 
 		if (copy_to_sockptr(optval, &pidfd, len) ||
 		    copy_to_sockptr(optlen, &len, sizeof(int))) {

net/unix/af_unix.c
@@ -100,6 +100,7 @@
 #include <linux/splice.h>
 #include <linux/string.h>
 #include <linux/uaccess.h>
+#include <linux/pidfs.h>
 #include <net/af_unix.h>
 #include <net/net_namespace.h>
 #include <net/scm.h>
@@ -643,6 +644,9 @@ static void unix_sock_destructor(struct sock *sk)
 		return;
 	}
 
+	if (sk->sk_peer_pid)
+		pidfs_put_pid(sk->sk_peer_pid);
+
 	if (u->addr)
 		unix_release_addr(u->addr);
 
@@ -734,13 +738,48 @@ static void unix_release_sock(struct sock *sk, int embrion)
 		unix_gc();	/* Garbage collect fds */
 }
 
-static void init_peercred(struct sock *sk)
+struct unix_peercred {
+	struct pid *peer_pid;
+	const struct cred *peer_cred;
+};
+
+static inline int prepare_peercred(struct unix_peercred *peercred)
 {
-	sk->sk_peer_pid = get_pid(task_tgid(current));
-	sk->sk_peer_cred = get_current_cred();
+	struct pid *pid;
+	int err;
+
+	pid = task_tgid(current);
+	err = pidfs_register_pid(pid);
+	if (likely(!err)) {
+		peercred->peer_pid = get_pid(pid);
+		peercred->peer_cred = get_current_cred();
+	}
+	return err;
 }
 
-static void update_peercred(struct sock *sk)
+static void drop_peercred(struct unix_peercred *peercred)
+{
+	const struct cred *cred = NULL;
+	struct pid *pid = NULL;
+
+	might_sleep();
+
+	swap(peercred->peer_pid, pid);
+	swap(peercred->peer_cred, cred);
+
+	pidfs_put_pid(pid);
+	put_pid(pid);
+	put_cred(cred);
+}
+
+static inline void init_peercred(struct sock *sk,
+				 const struct unix_peercred *peercred)
+{
+	sk->sk_peer_pid = peercred->peer_pid;
+	sk->sk_peer_cred = peercred->peer_cred;
+}
+
+static void update_peercred(struct sock *sk, struct unix_peercred *peercred)
 {
 	const struct cred *old_cred;
 	struct pid *old_pid;
@@ -748,11 +787,11 @@ static void update_peercred(struct sock *sk)
 	spin_lock(&sk->sk_peer_lock);
 	old_pid = sk->sk_peer_pid;
 	old_cred = sk->sk_peer_cred;
-	init_peercred(sk);
+	init_peercred(sk, peercred);
 	spin_unlock(&sk->sk_peer_lock);
 
-	put_pid(old_pid);
-	put_cred(old_cred);
+	peercred->peer_pid = old_pid;
+	peercred->peer_cred = old_cred;
 }
 
 static void copy_peercred(struct sock *sk, struct sock *peersk)
@@ -761,6 +800,7 @@ static void copy_peercred(struct sock *sk, struct sock *peersk)
 
 	spin_lock(&sk->sk_peer_lock);
 	sk->sk_peer_pid = get_pid(peersk->sk_peer_pid);
+	pidfs_get_pid(sk->sk_peer_pid);
 	sk->sk_peer_cred = get_cred(peersk->sk_peer_cred);
 	spin_unlock(&sk->sk_peer_lock);
 }
@@ -770,6 +810,7 @@ static int unix_listen(struct socket *sock, int backlog)
 	int err;
 	struct sock *sk = sock->sk;
 	struct unix_sock *u = unix_sk(sk);
+	struct unix_peercred peercred = {};
 
 	err = -EOPNOTSUPP;
 	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
@@ -777,6 +818,9 @@ static int unix_listen(struct socket *sock, int backlog)
 	err = -EINVAL;
 	if (!READ_ONCE(u->addr))
 		goto out;	/* No listens on an unbound socket */
+	err = prepare_peercred(&peercred);
+	if (err)
+		goto out;
 	unix_state_lock(sk);
 	if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN)
 		goto out_unlock;
@@ -786,11 +830,12 @@ static int unix_listen(struct socket *sock, int backlog)
 	WRITE_ONCE(sk->sk_state, TCP_LISTEN);
 
 	/* set credentials so connect can copy them */
-	update_peercred(sk);
+	update_peercred(sk, &peercred);
 	err = 0;
 
 out_unlock:
 	unix_state_unlock(sk);
+	drop_peercred(&peercred);
 out:
 	return err;
 }
@@ -1525,6 +1570,7 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
 	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
 	struct sock *sk = sock->sk, *newsk = NULL, *other = NULL;
 	struct unix_sock *u = unix_sk(sk), *newu, *otheru;
+	struct unix_peercred peercred = {};
 	struct net *net = sock_net(sk);
 	struct sk_buff *skb = NULL;
 	unsigned char state;
@@ -1561,6 +1607,10 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
 		goto out;
 	}
 
+	err = prepare_peercred(&peercred);
+	if (err)
+		goto out;
+
 	/* Allocate skb for sending to listening sock */
 	skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
 	if (!skb) {
@@ -1636,7 +1686,7 @@ restart:
 	unix_peer(newsk) = sk;
 	newsk->sk_state = TCP_ESTABLISHED;
 	newsk->sk_type = sk->sk_type;
-	init_peercred(newsk);
+	init_peercred(newsk, &peercred);
 	newu = unix_sk(newsk);
 	newu->listener = other;
 	RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq);
@@ -1695,20 +1745,33 @@ out_free_skb:
 out_free_sk:
 	unix_release_sock(newsk, 0);
 out:
+	drop_peercred(&peercred);
 	return err;
 }
 
 static int unix_socketpair(struct socket *socka, struct socket *sockb)
 {
+	struct unix_peercred ska_peercred = {}, skb_peercred = {};
 	struct sock *ska = socka->sk, *skb = sockb->sk;
+	int err;
+
+	err = prepare_peercred(&ska_peercred);
+	if (err)
+		return err;
+
+	err = prepare_peercred(&skb_peercred);
+	if (err) {
+		drop_peercred(&ska_peercred);
+		return err;
+	}
 
 	/* Join our sockets back to back */
 	sock_hold(ska);
 	sock_hold(skb);
 	unix_peer(ska) = skb;
	unix_peer(skb) = ska;
-	init_peercred(ska);
-	init_peercred(skb);
+	init_peercred(ska, &ska_peercred);
+	init_peercred(skb, &skb_peercred);
 
 	ska->sk_state = TCP_ESTABLISHED;
 	skb->sk_state = TCP_ESTABLISHED;