linux/fs/kernfs/kernfs-internal.h
Imran Khan 9caf696142 kernfs: Introduce separate rwsem to protect inode attributes.
Right now a global per-fs rwsem (kernfs_rwsem) synchronizes multiple
kernfs operations. On a large system with few hundred CPUs and few
hundred applications simultaneoulsy trying to access sysfs, this
results in multiple sys_open(s) contending on kernfs_rwsem via
kernfs_iop_permission and kernfs_dop_revalidate.

For example on a system with 384 cores, if I run 200 instances of an
application which is mostly executing the following loop:

  for (int loop = 0; loop <100 ; loop++)
  {
    for (int port_num = 1; port_num < 2; port_num++)
    {
      for (int gid_index = 0; gid_index < 254; gid_index++ )
      {
        char ret_buf[64], ret_buf_lo[64];
        char gid_file_path[1024];

        int      ret_len;
        int      ret_fd;
        ssize_t  ret_rd;

        ub4  i, saved_errno;

        memset(ret_buf, 0, sizeof(ret_buf));
        memset(gid_file_path, 0, sizeof(gid_file_path));

        ret_len = snprintf(gid_file_path, sizeof(gid_file_path),
                           "/sys/class/infiniband/%s/ports/%d/gids/%d",
                           dev_name,
                           port_num,
                           gid_index);

        ret_fd = open(gid_file_path, O_RDONLY | O_CLOEXEC);
        if (ret_fd < 0)
        {
          printf("Failed to open %s\n", gid_file_path);
          continue;
        }

        /* Read the GID */
        ret_rd = read(ret_fd, ret_buf, 40);

        if (ret_rd == -1)
        {
          printf("Failed to read from file %s, errno: %u\n",
                 gid_file_path, saved_errno);

          continue;
        }

        close(ret_fd);
      }
    }

I see contention around kernfs_rwsem as follows:

path_openat
|
|----link_path_walk.part.0.constprop.0
|      |
|      |--49.92%--inode_permission
|      |          |
|      |           --48.69%--kernfs_iop_permission
|      |                     |
|      |                     |--18.16%--down_read
|      |                     |
|      |                     |--15.38%--up_read
|      |                     |
|      |                      --14.58%--_raw_spin_lock
|      |                                |
|      |                                 -----
|      |
|      |--29.08%--walk_component
|      |          |
|      |           --29.02%--lookup_fast
|      |                     |
|      |                     |--24.26%--kernfs_dop_revalidate
|      |                     |          |
|      |                     |          |--14.97%--down_read
|      |                     |          |
|      |                     |           --9.01%--up_read
|      |                     |
|      |                      --4.74%--__d_lookup
|      |                                |
|      |                                 --4.64%--_raw_spin_lock
|      |                                           |
|      |                                            ----

Having a separate per-fs rwsem to protect kernfs inode attributes,
will avoid the above mentioned contention and result in better
performance as can bee seen below:

path_openat
|
|----link_path_walk.part.0.constprop.0
|     |
|     |
|     |--27.06%--inode_permission
|     |          |
|     |           --25.84%--kernfs_iop_permission
|     |                     |
|     |                     |--9.29%--up_read
|     |                     |
|     |                     |--8.19%--down_read
|     |                     |
|     |                      --7.89%--_raw_spin_lock
|     |                                |
|     |                                 ----
|     |
|     |--22.42%--walk_component
|     |          |
|     |           --22.36%--lookup_fast
|     |                     |
|     |                     |--16.07%--__d_lookup
|     |                     |          |
|     |                     |           --16.01%--_raw_spin_lock
|     |                     |                     |
|     |                     |                      ----
|     |                     |
|     |                      --6.28%--kernfs_dop_revalidate
|     |                                |
|     |                                |--3.76%--down_read
|     |                                |
|     |                                 --2.26%--up_read

As can be seen from the above data the overhead due to both
kerfs_iop_permission and kernfs_dop_revalidate have gone down and
this also reduces overall run time of the earlier mentioned loop.

Signed-off-by: Imran Khan <imran.f.khan@oracle.com>
Link: https://lore.kernel.org/r/20230309110932.2889010-2-imran.f.khan@oracle.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
2023-03-29 12:23:45 +02:00

173 lines
4.5 KiB
C

/* SPDX-License-Identifier: GPL-2.0-only */
/*
* fs/kernfs/kernfs-internal.h - kernfs internal header file
*
* Copyright (c) 2001-3 Patrick Mochel
* Copyright (c) 2007 SUSE Linux Products GmbH
* Copyright (c) 2007, 2013 Tejun Heo <teheo@suse.de>
*/
#ifndef __KERNFS_INTERNAL_H
#define __KERNFS_INTERNAL_H
#include <linux/lockdep.h>
#include <linux/fs.h>
#include <linux/mutex.h>
#include <linux/rwsem.h>
#include <linux/xattr.h>
#include <linux/kernfs.h>
#include <linux/fs_context.h>
struct kernfs_iattrs {
kuid_t ia_uid;
kgid_t ia_gid;
struct timespec64 ia_atime;
struct timespec64 ia_mtime;
struct timespec64 ia_ctime;
struct simple_xattrs xattrs;
atomic_t nr_user_xattrs;
atomic_t user_xattr_size;
};
struct kernfs_root {
/* published fields */
struct kernfs_node *kn;
unsigned int flags; /* KERNFS_ROOT_* flags */
/* private fields, do not use outside kernfs proper */
struct idr ino_idr;
u32 last_id_lowbits;
u32 id_highbits;
struct kernfs_syscall_ops *syscall_ops;
/* list of kernfs_super_info of this root, protected by kernfs_rwsem */
struct list_head supers;
wait_queue_head_t deactivate_waitq;
struct rw_semaphore kernfs_rwsem;
struct rw_semaphore kernfs_iattr_rwsem;
};
/* +1 to avoid triggering overflow warning when negating it */
#define KN_DEACTIVATED_BIAS (INT_MIN + 1)
/* KERNFS_TYPE_MASK and types are defined in include/linux/kernfs.h */
/**
* kernfs_root - find out the kernfs_root a kernfs_node belongs to
* @kn: kernfs_node of interest
*
* Return: the kernfs_root @kn belongs to.
*/
static inline struct kernfs_root *kernfs_root(struct kernfs_node *kn)
{
/* if parent exists, it's always a dir; otherwise, @sd is a dir */
if (kn->parent)
kn = kn->parent;
return kn->dir.root;
}
/*
* mount.c
*/
struct kernfs_super_info {
struct super_block *sb;
/*
* The root associated with this super_block. Each super_block is
* identified by the root and ns it's associated with.
*/
struct kernfs_root *root;
/*
* Each sb is associated with one namespace tag, currently the
* network namespace of the task which mounted this kernfs
* instance. If multiple tags become necessary, make the following
* an array and compare kernfs_node tag against every entry.
*/
const void *ns;
/* anchored at kernfs_root->supers, protected by kernfs_rwsem */
struct list_head node;
};
#define kernfs_info(SB) ((struct kernfs_super_info *)(SB->s_fs_info))
static inline struct kernfs_node *kernfs_dentry_node(struct dentry *dentry)
{
if (d_really_is_negative(dentry))
return NULL;
return d_inode(dentry)->i_private;
}
static inline void kernfs_set_rev(struct kernfs_node *parent,
struct dentry *dentry)
{
dentry->d_time = parent->dir.rev;
}
static inline void kernfs_inc_rev(struct kernfs_node *parent)
{
parent->dir.rev++;
}
static inline bool kernfs_dir_changed(struct kernfs_node *parent,
struct dentry *dentry)
{
if (parent->dir.rev != dentry->d_time)
return true;
return false;
}
extern const struct super_operations kernfs_sops;
extern struct kmem_cache *kernfs_node_cache, *kernfs_iattrs_cache;
/*
* inode.c
*/
extern const struct xattr_handler *kernfs_xattr_handlers[];
void kernfs_evict_inode(struct inode *inode);
int kernfs_iop_permission(struct mnt_idmap *idmap,
struct inode *inode, int mask);
int kernfs_iop_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
struct iattr *iattr);
int kernfs_iop_getattr(struct mnt_idmap *idmap,
const struct path *path, struct kstat *stat,
u32 request_mask, unsigned int query_flags);
ssize_t kernfs_iop_listxattr(struct dentry *dentry, char *buf, size_t size);
int __kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr);
/*
* dir.c
*/
extern const struct dentry_operations kernfs_dops;
extern const struct file_operations kernfs_dir_fops;
extern const struct inode_operations kernfs_dir_iops;
struct kernfs_node *kernfs_get_active(struct kernfs_node *kn);
void kernfs_put_active(struct kernfs_node *kn);
int kernfs_add_one(struct kernfs_node *kn);
struct kernfs_node *kernfs_new_node(struct kernfs_node *parent,
const char *name, umode_t mode,
kuid_t uid, kgid_t gid,
unsigned flags);
/*
* file.c
*/
extern const struct file_operations kernfs_file_fops;
bool kernfs_should_drain_open_files(struct kernfs_node *kn);
void kernfs_drain_open_files(struct kernfs_node *kn);
/*
* symlink.c
*/
extern const struct inode_operations kernfs_symlink_iops;
/*
* kernfs locks
*/
extern struct kernfs_global_locks *kernfs_locks;
#endif /* __KERNFS_INTERNAL_H */