linux/fs/nilfs2/nilfs.h

416 lines
14 KiB
C
Raw Permalink Normal View History

/* SPDX-License-Identifier: GPL-2.0+ */
/*
* NILFS local header file.
*
* Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
*
* Written by Koji Sato and Ryusuke Konishi.
*/
#ifndef _NILFS_H
#define _NILFS_H
#include <linux/kernel.h>
#include <linux/buffer_head.h>
#include <linux/spinlock.h>
#include <linux/blkdev.h>
#include <linux/nilfs2_api.h>
#include <linux/nilfs2_ondisk.h>
#include "the_nilfs.h"
#include "bmap.h"
/**
* struct nilfs_inode_info - nilfs inode data in memory
* @i_flags: inode flags
* @i_type: inode type (combination of flags that inidicate usage)
* @i_state: dynamic state flags
* @i_bmap: pointer on i_bmap_data
* @i_bmap_data: raw block mapping
* @i_xattr: <TODO>
* @i_dir_start_lookup: page index of last successful search
* @i_cno: checkpoint number for GC inode
nilfs2: fix lockdep warnings in page operations for btree nodes Patch series "nilfs2 lockdep warning fixes". The first two are to resolve the lockdep warning issue, and the last one is the accompanying cleanup and low priority. Based on your comment, this series solves the issue by separating inode object as needed. Since I was worried about the impact of the object composition changes, I tested the series carefully not to cause regressions especially for delicate functions such like disk space reclamation and snapshots. This patch (of 3): If CONFIG_LOCKDEP is enabled, nilfs2 hits lockdep warnings at inode_to_wb() during page/folio operations for btree nodes: WARNING: CPU: 0 PID: 6575 at include/linux/backing-dev.h:269 inode_to_wb include/linux/backing-dev.h:269 [inline] WARNING: CPU: 0 PID: 6575 at include/linux/backing-dev.h:269 folio_account_dirtied mm/page-writeback.c:2460 [inline] WARNING: CPU: 0 PID: 6575 at include/linux/backing-dev.h:269 __folio_mark_dirty+0xa7c/0xe30 mm/page-writeback.c:2509 Modules linked in: ... RIP: 0010:inode_to_wb include/linux/backing-dev.h:269 [inline] RIP: 0010:folio_account_dirtied mm/page-writeback.c:2460 [inline] RIP: 0010:__folio_mark_dirty+0xa7c/0xe30 mm/page-writeback.c:2509 ... Call Trace: __set_page_dirty include/linux/pagemap.h:834 [inline] mark_buffer_dirty+0x4e6/0x650 fs/buffer.c:1145 nilfs_btree_propagate_p fs/nilfs2/btree.c:1889 [inline] nilfs_btree_propagate+0x4ae/0xea0 fs/nilfs2/btree.c:2085 nilfs_bmap_propagate+0x73/0x170 fs/nilfs2/bmap.c:337 nilfs_collect_dat_data+0x45/0xd0 fs/nilfs2/segment.c:625 nilfs_segctor_apply_buffers+0x14a/0x470 fs/nilfs2/segment.c:1009 nilfs_segctor_scan_file+0x47a/0x700 fs/nilfs2/segment.c:1048 nilfs_segctor_collect_blocks fs/nilfs2/segment.c:1224 [inline] nilfs_segctor_collect fs/nilfs2/segment.c:1494 [inline] nilfs_segctor_do_construct+0x14f3/0x6c60 fs/nilfs2/segment.c:2036 nilfs_segctor_construct+0x7a7/0xb30 fs/nilfs2/segment.c:2372 nilfs_segctor_thread_construct fs/nilfs2/segment.c:2480 [inline] nilfs_segctor_thread+0x3c3/0xf90 fs/nilfs2/segment.c:2563 kthread+0x405/0x4f0 kernel/kthread.c:327 ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:295 This is because nilfs2 uses two page caches for each inode and inode->i_mapping never points to one of them, the btree node cache. This causes inode_to_wb(inode) to refer to a different page cache than the caller page/folio operations such like __folio_start_writeback(), __folio_end_writeback(), or __folio_mark_dirty() acquired the lock. This patch resolves the issue by allocating and using an additional inode to hold the page cache of btree nodes. The inode is attached one-to-one to the traditional nilfs2 inode if it requires a block mapping with b-tree. This setup change is in memory only and does not affect the disk format. Link: https://lkml.kernel.org/r/1647867427-30498-1-git-send-email-konishi.ryusuke@gmail.com Link: https://lkml.kernel.org/r/1647867427-30498-2-git-send-email-konishi.ryusuke@gmail.com Link: https://lore.kernel.org/r/YXrYvIo8YRnAOJCj@casper.infradead.org Link: https://lore.kernel.org/r/9a20b33d-b38f-b4a2-4742-c1eb5b8e4d6c@redhat.com Signed-off-by: Ryusuke Konishi <konishi.ryusuke@gmail.com> Reported-by: syzbot+0d5b462a6f07447991b3@syzkaller.appspotmail.com Reported-by: syzbot+34ef28bb2aeb28724aa0@syzkaller.appspotmail.com Reported-by: Hao Sun <sunhao.th@gmail.com> Reported-by: David Hildenbrand <david@redhat.com> Tested-by: Ryusuke Konishi <konishi.ryusuke@gmail.com> Cc: Matthew Wilcox <willy@infradead.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2022-04-01 11:28:18 -07:00
* @i_assoc_inode: associated inode (B-tree node cache holder or back pointer)
* @i_dirty: list for connecting dirty files
* @xattr_sem: semaphore for extended attributes processing
* @i_bh: buffer contains disk inode
* @i_root: root object of the current filesystem tree
* @vfs_inode: VFS inode object
*/
struct nilfs_inode_info {
__u32 i_flags;
unsigned int i_type;
unsigned long i_state; /* Dynamic state flags */
struct nilfs_bmap *i_bmap;
struct nilfs_bmap i_bmap_data;
__u64 i_xattr; /* sector_t ??? */
__u32 i_dir_start_lookup;
__u64 i_cno; /* check point number for GC inode */
nilfs2: fix lockdep warnings in page operations for btree nodes Patch series "nilfs2 lockdep warning fixes". The first two are to resolve the lockdep warning issue, and the last one is the accompanying cleanup and low priority. Based on your comment, this series solves the issue by separating inode object as needed. Since I was worried about the impact of the object composition changes, I tested the series carefully not to cause regressions especially for delicate functions such like disk space reclamation and snapshots. This patch (of 3): If CONFIG_LOCKDEP is enabled, nilfs2 hits lockdep warnings at inode_to_wb() during page/folio operations for btree nodes: WARNING: CPU: 0 PID: 6575 at include/linux/backing-dev.h:269 inode_to_wb include/linux/backing-dev.h:269 [inline] WARNING: CPU: 0 PID: 6575 at include/linux/backing-dev.h:269 folio_account_dirtied mm/page-writeback.c:2460 [inline] WARNING: CPU: 0 PID: 6575 at include/linux/backing-dev.h:269 __folio_mark_dirty+0xa7c/0xe30 mm/page-writeback.c:2509 Modules linked in: ... RIP: 0010:inode_to_wb include/linux/backing-dev.h:269 [inline] RIP: 0010:folio_account_dirtied mm/page-writeback.c:2460 [inline] RIP: 0010:__folio_mark_dirty+0xa7c/0xe30 mm/page-writeback.c:2509 ... Call Trace: __set_page_dirty include/linux/pagemap.h:834 [inline] mark_buffer_dirty+0x4e6/0x650 fs/buffer.c:1145 nilfs_btree_propagate_p fs/nilfs2/btree.c:1889 [inline] nilfs_btree_propagate+0x4ae/0xea0 fs/nilfs2/btree.c:2085 nilfs_bmap_propagate+0x73/0x170 fs/nilfs2/bmap.c:337 nilfs_collect_dat_data+0x45/0xd0 fs/nilfs2/segment.c:625 nilfs_segctor_apply_buffers+0x14a/0x470 fs/nilfs2/segment.c:1009 nilfs_segctor_scan_file+0x47a/0x700 fs/nilfs2/segment.c:1048 nilfs_segctor_collect_blocks fs/nilfs2/segment.c:1224 [inline] nilfs_segctor_collect fs/nilfs2/segment.c:1494 [inline] nilfs_segctor_do_construct+0x14f3/0x6c60 fs/nilfs2/segment.c:2036 nilfs_segctor_construct+0x7a7/0xb30 fs/nilfs2/segment.c:2372 nilfs_segctor_thread_construct fs/nilfs2/segment.c:2480 [inline] nilfs_segctor_thread+0x3c3/0xf90 fs/nilfs2/segment.c:2563 kthread+0x405/0x4f0 kernel/kthread.c:327 ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:295 This is because nilfs2 uses two page caches for each inode and inode->i_mapping never points to one of them, the btree node cache. This causes inode_to_wb(inode) to refer to a different page cache than the caller page/folio operations such like __folio_start_writeback(), __folio_end_writeback(), or __folio_mark_dirty() acquired the lock. This patch resolves the issue by allocating and using an additional inode to hold the page cache of btree nodes. The inode is attached one-to-one to the traditional nilfs2 inode if it requires a block mapping with b-tree. This setup change is in memory only and does not affect the disk format. Link: https://lkml.kernel.org/r/1647867427-30498-1-git-send-email-konishi.ryusuke@gmail.com Link: https://lkml.kernel.org/r/1647867427-30498-2-git-send-email-konishi.ryusuke@gmail.com Link: https://lore.kernel.org/r/YXrYvIo8YRnAOJCj@casper.infradead.org Link: https://lore.kernel.org/r/9a20b33d-b38f-b4a2-4742-c1eb5b8e4d6c@redhat.com Signed-off-by: Ryusuke Konishi <konishi.ryusuke@gmail.com> Reported-by: syzbot+0d5b462a6f07447991b3@syzkaller.appspotmail.com Reported-by: syzbot+34ef28bb2aeb28724aa0@syzkaller.appspotmail.com Reported-by: Hao Sun <sunhao.th@gmail.com> Reported-by: David Hildenbrand <david@redhat.com> Tested-by: Ryusuke Konishi <konishi.ryusuke@gmail.com> Cc: Matthew Wilcox <willy@infradead.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2022-04-01 11:28:18 -07:00
struct inode *i_assoc_inode;
struct list_head i_dirty; /* List for connecting dirty files */
#ifdef CONFIG_NILFS_XATTR
/*
* Extended attributes can be read independently of the main file
* data. Taking i_sem even when reading would cause contention
* between readers of EAs and writers of regular file data, so
* instead we synchronize on xattr_sem when reading or changing
* EAs.
*/
struct rw_semaphore xattr_sem;
#endif
struct buffer_head *i_bh; /*
* i_bh contains a new or dirty
* disk inode.
*/
struct nilfs_root *i_root;
struct inode vfs_inode;
};
static inline struct nilfs_inode_info *NILFS_I(const struct inode *inode)
{
return container_of(inode, struct nilfs_inode_info, vfs_inode);
}
static inline struct nilfs_inode_info *
NILFS_BMAP_I(const struct nilfs_bmap *bmap)
{
return container_of(bmap, struct nilfs_inode_info, i_bmap_data);
}
/*
* Dynamic state flags of NILFS on-memory inode (i_state)
*/
enum {
NILFS_I_NEW = 0, /* Inode is newly created */
NILFS_I_DIRTY, /* The file is dirty */
NILFS_I_QUEUED, /* inode is in dirty_files list */
NILFS_I_BUSY, /*
* Inode is grabbed by a segment
* constructor
*/
NILFS_I_COLLECTED, /* All dirty blocks are collected */
NILFS_I_UPDATED, /* The file has been written back */
NILFS_I_INODE_SYNC, /* dsync is not allowed for inode */
NILFS_I_BMAP, /* has bmap and btnode_cache */
};
/*
* Flags to identify the usage of on-memory inodes (i_type)
*/
enum {
NILFS_I_TYPE_NORMAL = 0,
NILFS_I_TYPE_GC = 0x0001, /* For data caching during GC */
NILFS_I_TYPE_BTNC = 0x0002, /* For btree node cache */
NILFS_I_TYPE_SHADOW = 0x0004, /* For shadowed page cache */
};
/*
* commit flags for nilfs_commit_super and nilfs_sync_super
*/
enum {
NILFS_SB_COMMIT = 0, /* Commit a super block alternately */
NILFS_SB_COMMIT_ALL /* Commit both super blocks */
};
/**
* define NILFS_MAX_VOLUME_NAME - maximum number of characters (bytes) in a
* file system volume name
*
* Defined by the size of the volume name field in the on-disk superblocks.
* This volume name does not include the terminating NULL byte if the string
* length matches the field size, so use (NILFS_MAX_VOLUME_NAME + 1) for the
* size of the buffer that requires a NULL byte termination.
*/
#define NILFS_MAX_VOLUME_NAME \
sizeof_field(struct nilfs_super_block, s_volume_name)
/*
* Macros to check inode numbers
*/
#define NILFS_MDT_INO_BITS \
(BIT(NILFS_DAT_INO) | BIT(NILFS_CPFILE_INO) | \
BIT(NILFS_SUFILE_INO) | BIT(NILFS_IFILE_INO) | \
BIT(NILFS_ATIME_INO) | BIT(NILFS_SKETCH_INO))
#define NILFS_SYS_INO_BITS (BIT(NILFS_ROOT_INO) | NILFS_MDT_INO_BITS)
#define NILFS_FIRST_INO(sb) (((struct the_nilfs *)sb->s_fs_info)->ns_first_ino)
#define NILFS_MDT_INODE(sb, ino) \
nilfs2: fix inode number range checks Patch series "nilfs2: fix potential issues related to reserved inodes". This series fixes one use-after-free issue reported by syzbot, caused by nilfs2's internal inode being exposed in the namespace on a corrupted filesystem, and a couple of flaws that cause problems if the starting number of non-reserved inodes written in the on-disk super block is intentionally (or corruptly) changed from its default value. This patch (of 3): In the current implementation of nilfs2, "nilfs->ns_first_ino", which gives the first non-reserved inode number, is read from the superblock, but its lower limit is not checked. As a result, if a number that overlaps with the inode number range of reserved inodes such as the root directory or metadata files is set in the super block parameter, the inode number test macros (NILFS_MDT_INODE and NILFS_VALID_INODE) will not function properly. In addition, these test macros use left bit-shift calculations using with the inode number as the shift count via the BIT macro, but the result of a shift calculation that exceeds the bit width of an integer is undefined in the C specification, so if "ns_first_ino" is set to a large value other than the default value NILFS_USER_INO (=11), the macros may potentially malfunction depending on the environment. Fix these issues by checking the lower bound of "nilfs->ns_first_ino" and by preventing bit shifts equal to or greater than the NILFS_USER_INO constant in the inode number test macros. Also, change the type of "ns_first_ino" from signed integer to unsigned integer to avoid the need for type casting in comparisons such as the lower bound check introduced this time. Link: https://lkml.kernel.org/r/20240623051135.4180-1-konishi.ryusuke@gmail.com Link: https://lkml.kernel.org/r/20240623051135.4180-2-konishi.ryusuke@gmail.com Signed-off-by: Ryusuke Konishi <konishi.ryusuke@gmail.com> Cc: Hillf Danton <hdanton@sina.com> Cc: Jan Kara <jack@suse.cz> Cc: Matthew Wilcox (Oracle) <willy@infradead.org> Cc: <stable@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-06-23 14:11:33 +09:00
((ino) < NILFS_USER_INO && (NILFS_MDT_INO_BITS & BIT(ino)))
#define NILFS_VALID_INODE(sb, ino) \
nilfs2: fix inode number range checks Patch series "nilfs2: fix potential issues related to reserved inodes". This series fixes one use-after-free issue reported by syzbot, caused by nilfs2's internal inode being exposed in the namespace on a corrupted filesystem, and a couple of flaws that cause problems if the starting number of non-reserved inodes written in the on-disk super block is intentionally (or corruptly) changed from its default value. This patch (of 3): In the current implementation of nilfs2, "nilfs->ns_first_ino", which gives the first non-reserved inode number, is read from the superblock, but its lower limit is not checked. As a result, if a number that overlaps with the inode number range of reserved inodes such as the root directory or metadata files is set in the super block parameter, the inode number test macros (NILFS_MDT_INODE and NILFS_VALID_INODE) will not function properly. In addition, these test macros use left bit-shift calculations using with the inode number as the shift count via the BIT macro, but the result of a shift calculation that exceeds the bit width of an integer is undefined in the C specification, so if "ns_first_ino" is set to a large value other than the default value NILFS_USER_INO (=11), the macros may potentially malfunction depending on the environment. Fix these issues by checking the lower bound of "nilfs->ns_first_ino" and by preventing bit shifts equal to or greater than the NILFS_USER_INO constant in the inode number test macros. Also, change the type of "ns_first_ino" from signed integer to unsigned integer to avoid the need for type casting in comparisons such as the lower bound check introduced this time. Link: https://lkml.kernel.org/r/20240623051135.4180-1-konishi.ryusuke@gmail.com Link: https://lkml.kernel.org/r/20240623051135.4180-2-konishi.ryusuke@gmail.com Signed-off-by: Ryusuke Konishi <konishi.ryusuke@gmail.com> Cc: Hillf Danton <hdanton@sina.com> Cc: Jan Kara <jack@suse.cz> Cc: Matthew Wilcox (Oracle) <willy@infradead.org> Cc: <stable@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-06-23 14:11:33 +09:00
((ino) >= NILFS_FIRST_INO(sb) || \
((ino) < NILFS_USER_INO && (NILFS_SYS_INO_BITS & BIT(ino))))
2024-06-23 14:11:34 +09:00
#define NILFS_PRIVATE_INODE(ino) ({ \
ino_t __ino = (ino); \
((__ino) < NILFS_USER_INO && (__ino) != NILFS_ROOT_INO && \
(__ino) != NILFS_SKETCH_INO); })
/**
* struct nilfs_transaction_info: context information for synchronization
* @ti_magic: Magic number
* @ti_save: Backup of journal_info field of task_struct
* @ti_flags: Flags
* @ti_count: Nest level
*/
struct nilfs_transaction_info {
u32 ti_magic;
void *ti_save;
/*
* This should never be used. If it happens,
* one of other filesystems has a bug.
*/
unsigned short ti_flags;
unsigned short ti_count;
};
/* ti_magic */
#define NILFS_TI_MAGIC 0xd9e392fb
/* ti_flags */
#define NILFS_TI_DYNAMIC_ALLOC 0x0001 /* Allocated from slab */
#define NILFS_TI_SYNC 0x0002 /*
* Force to construct segment at the
* end of transaction.
*/
#define NILFS_TI_GC 0x0004 /* GC context */
#define NILFS_TI_COMMIT 0x0008 /* Change happened or not */
#define NILFS_TI_WRITER 0x0010 /* Constructor context */
int nilfs_transaction_begin(struct super_block *,
struct nilfs_transaction_info *, int);
int nilfs_transaction_commit(struct super_block *);
void nilfs_transaction_abort(struct super_block *);
static inline void nilfs_set_transaction_flag(unsigned int flag)
{
struct nilfs_transaction_info *ti = current->journal_info;
ti->ti_flags |= flag;
}
static inline int nilfs_test_transaction_flag(unsigned int flag)
{
struct nilfs_transaction_info *ti = current->journal_info;
if (ti == NULL || ti->ti_magic != NILFS_TI_MAGIC)
return 0;
return !!(ti->ti_flags & flag);
}
static inline int nilfs_doing_gc(void)
{
return nilfs_test_transaction_flag(NILFS_TI_GC);
}
static inline int nilfs_doing_construction(void)
{
return nilfs_test_transaction_flag(NILFS_TI_WRITER);
}
/*
* function prototype
*/
#ifdef CONFIG_NILFS_POSIX_ACL
#error "NILFS: not yet supported POSIX ACL"
extern int nilfs_acl_chmod(struct inode *);
extern int nilfs_init_acl(struct inode *, struct inode *);
#else
static inline int nilfs_acl_chmod(struct inode *inode)
{
return 0;
}
static inline int nilfs_init_acl(struct inode *inode, struct inode *dir)
{
if (S_ISLNK(inode->i_mode))
return 0;
inode->i_mode &= ~current_umask();
return 0;
}
#endif
#define NILFS_ATIME_DISABLE
/* Flags that should be inherited by new inodes from their parent. */
#define NILFS_FL_INHERITED \
(FS_SECRM_FL | FS_UNRM_FL | FS_COMPR_FL | FS_SYNC_FL | \
FS_IMMUTABLE_FL | FS_APPEND_FL | FS_NODUMP_FL | FS_NOATIME_FL |\
FS_COMPRBLK_FL | FS_NOCOMP_FL | FS_NOTAIL_FL | FS_DIRSYNC_FL)
/* Mask out flags that are inappropriate for the given type of inode. */
static inline __u32 nilfs_mask_flags(umode_t mode, __u32 flags)
{
if (S_ISDIR(mode))
return flags;
else if (S_ISREG(mode))
return flags & ~(FS_DIRSYNC_FL | FS_TOPDIR_FL);
else
return flags & (FS_NODUMP_FL | FS_NOATIME_FL);
}
/* dir.c */
int nilfs_add_link(struct dentry *, struct inode *);
int nilfs_inode_by_name(struct inode *dir, const struct qstr *qstr, ino_t *ino);
int nilfs_make_empty(struct inode *, struct inode *);
struct nilfs_dir_entry *nilfs_find_entry(struct inode *, const struct qstr *,
struct folio **);
int nilfs_delete_entry(struct nilfs_dir_entry *, struct folio *);
int nilfs_empty_dir(struct inode *);
struct nilfs_dir_entry *nilfs_dotdot(struct inode *, struct folio **);
nilfs2: handle errors that nilfs_prepare_chunk() may return Patch series "nilfs2: fix issues with rename operations". This series fixes BUG_ON check failures reported by syzbot around rename operations, and a minor behavioral issue where the mtime of a child directory changes when it is renamed instead of moved. This patch (of 2): The directory manipulation routines nilfs_set_link() and nilfs_delete_entry() rewrite the directory entry in the folio/page previously read by nilfs_find_entry(), so error handling is omitted on the assumption that nilfs_prepare_chunk(), which prepares the buffer for rewriting, will always succeed for these. And if an error is returned, it triggers the legacy BUG_ON() checks in each routine. This assumption is wrong, as proven by syzbot: the buffer layer called by nilfs_prepare_chunk() may call nilfs_get_block() if necessary, which may fail due to metadata corruption or other reasons. This has been there all along, but improved sanity checks and error handling may have made it more reproducible in fuzzing tests. Fix this issue by adding missing error paths in nilfs_set_link(), nilfs_delete_entry(), and their caller nilfs_rename(). Link: https://lkml.kernel.org/r/20250111143518.7901-1-konishi.ryusuke@gmail.com Link: https://lkml.kernel.org/r/20250111143518.7901-2-konishi.ryusuke@gmail.com Signed-off-by: Ryusuke Konishi <konishi.ryusuke@gmail.com> Reported-by: syzbot+32c3706ebf5d95046ea1@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=32c3706ebf5d95046ea1 Reported-by: syzbot+1097e95f134f37d9395c@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=1097e95f134f37d9395c Fixes: 2ba466d74ed7 ("nilfs2: directory entry operations") Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-01-11 23:26:35 +09:00
int nilfs_set_link(struct inode *dir, struct nilfs_dir_entry *de,
struct folio *folio, struct inode *inode);
/* file.c */
extern int nilfs_sync_file(struct file *, loff_t, loff_t, int);
/* ioctl.c */
int nilfs_fileattr_get(struct dentry *dentry, struct file_kattr *m);
int nilfs_fileattr_set(struct mnt_idmap *idmap,
struct dentry *dentry, struct file_kattr *fa);
long nilfs_ioctl(struct file *, unsigned int, unsigned long);
long nilfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
nilfs2: fix lock order reversal in nilfs_clean_segments ioctl This is a companion patch to ("nilfs2: fix possible circular locking for get information ioctls"). This corrects lock order reversal between mm->mmap_sem and nilfs->ns_segctor_sem in nilfs_clean_segments() which was detected by lockdep check: ======================================================= [ INFO: possible circular locking dependency detected ] 2.6.30-rc3-nilfs-00003-g360bdc1 #7 ------------------------------------------------------- mmap/5294 is trying to acquire lock: (&nilfs->ns_segctor_sem){++++.+}, at: [<d0d0e846>] nilfs_transaction_begin+0xb6/0x10c [nilfs2] but task is already holding lock: (&mm->mmap_sem){++++++}, at: [<c043700a>] do_page_fault+0x1d8/0x30a which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #1 (&mm->mmap_sem){++++++}: [<c01470a5>] __lock_acquire+0x1066/0x13b0 [<c01474a9>] lock_acquire+0xba/0xdd [<c01836bc>] might_fault+0x68/0x88 [<c023c61d>] copy_from_user+0x2a/0x111 [<d0d120d0>] nilfs_ioctl_prepare_clean_segments+0x1d/0xf1 [nilfs2] [<d0d0e2aa>] nilfs_clean_segments+0x6d/0x1b9 [nilfs2] [<d0d11f68>] nilfs_ioctl+0x2ad/0x318 [nilfs2] [<c01a3be7>] vfs_ioctl+0x22/0x69 [<c01a408e>] do_vfs_ioctl+0x460/0x499 [<c01a4107>] sys_ioctl+0x40/0x5a [<c01031a4>] sysenter_do_call+0x12/0x38 [<ffffffff>] 0xffffffff -> #0 (&nilfs->ns_segctor_sem){++++.+}: [<c0146e0b>] __lock_acquire+0xdcc/0x13b0 [<c01474a9>] lock_acquire+0xba/0xdd [<c0433f1d>] down_read+0x2a/0x3e [<d0d0e846>] nilfs_transaction_begin+0xb6/0x10c [nilfs2] [<d0cfe0e5>] nilfs_page_mkwrite+0xe7/0x154 [nilfs2] [<c0183b0b>] __do_fault+0x165/0x376 [<c01855cd>] handle_mm_fault+0x287/0x5d1 [<c043712d>] do_page_fault+0x2fb/0x30a [<c0435462>] error_code+0x72/0x78 [<ffffffff>] 0xffffffff where nilfs_clean_segments() holds: nilfs->ns_segctor_sem -> copy_from_user() --> page fault -> mm->mmap_sem And, page fault path may hold: page fault -> mm->mmap_sem --> nilfs_page_mkwrite() -> nilfs->ns_segctor_sem Even though nilfs_clean_segments() does not perform write access on given user pages, it may cause deadlock because nilfs->ns_segctor_sem is shared per device and mm->mmap_sem can be shared with other tasks. To avoid this problem, this patch moves all calls of copy_from_user() outside the nilfs->ns_segctor_sem lock in the ioctl. Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
2009-05-10 22:41:43 +09:00
int nilfs_ioctl_prepare_clean_segments(struct the_nilfs *, struct nilfs_argv *,
void **);
/* inode.c */
void nilfs_inode_add_blocks(struct inode *inode, int n);
void nilfs_inode_sub_blocks(struct inode *inode, int n);
extern struct inode *nilfs_new_inode(struct inode *, umode_t);
extern int nilfs_get_block(struct inode *, sector_t, struct buffer_head *, int);
extern void nilfs_set_inode_flags(struct inode *);
extern int nilfs_read_inode_common(struct inode *, struct nilfs_inode *);
void nilfs_write_inode_common(struct inode *inode,
struct nilfs_inode *raw_inode);
struct inode *nilfs_ilookup(struct super_block *sb, struct nilfs_root *root,
unsigned long ino);
struct inode *nilfs_iget_locked(struct super_block *sb, struct nilfs_root *root,
unsigned long ino);
struct inode *nilfs_iget(struct super_block *sb, struct nilfs_root *root,
unsigned long ino);
extern struct inode *nilfs_iget_for_gc(struct super_block *sb,
unsigned long ino, __u64 cno);
nilfs2: fix lockdep warnings in page operations for btree nodes Patch series "nilfs2 lockdep warning fixes". The first two are to resolve the lockdep warning issue, and the last one is the accompanying cleanup and low priority. Based on your comment, this series solves the issue by separating inode object as needed. Since I was worried about the impact of the object composition changes, I tested the series carefully not to cause regressions especially for delicate functions such like disk space reclamation and snapshots. This patch (of 3): If CONFIG_LOCKDEP is enabled, nilfs2 hits lockdep warnings at inode_to_wb() during page/folio operations for btree nodes: WARNING: CPU: 0 PID: 6575 at include/linux/backing-dev.h:269 inode_to_wb include/linux/backing-dev.h:269 [inline] WARNING: CPU: 0 PID: 6575 at include/linux/backing-dev.h:269 folio_account_dirtied mm/page-writeback.c:2460 [inline] WARNING: CPU: 0 PID: 6575 at include/linux/backing-dev.h:269 __folio_mark_dirty+0xa7c/0xe30 mm/page-writeback.c:2509 Modules linked in: ... RIP: 0010:inode_to_wb include/linux/backing-dev.h:269 [inline] RIP: 0010:folio_account_dirtied mm/page-writeback.c:2460 [inline] RIP: 0010:__folio_mark_dirty+0xa7c/0xe30 mm/page-writeback.c:2509 ... Call Trace: __set_page_dirty include/linux/pagemap.h:834 [inline] mark_buffer_dirty+0x4e6/0x650 fs/buffer.c:1145 nilfs_btree_propagate_p fs/nilfs2/btree.c:1889 [inline] nilfs_btree_propagate+0x4ae/0xea0 fs/nilfs2/btree.c:2085 nilfs_bmap_propagate+0x73/0x170 fs/nilfs2/bmap.c:337 nilfs_collect_dat_data+0x45/0xd0 fs/nilfs2/segment.c:625 nilfs_segctor_apply_buffers+0x14a/0x470 fs/nilfs2/segment.c:1009 nilfs_segctor_scan_file+0x47a/0x700 fs/nilfs2/segment.c:1048 nilfs_segctor_collect_blocks fs/nilfs2/segment.c:1224 [inline] nilfs_segctor_collect fs/nilfs2/segment.c:1494 [inline] nilfs_segctor_do_construct+0x14f3/0x6c60 fs/nilfs2/segment.c:2036 nilfs_segctor_construct+0x7a7/0xb30 fs/nilfs2/segment.c:2372 nilfs_segctor_thread_construct fs/nilfs2/segment.c:2480 [inline] nilfs_segctor_thread+0x3c3/0xf90 fs/nilfs2/segment.c:2563 kthread+0x405/0x4f0 kernel/kthread.c:327 ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:295 This is because nilfs2 uses two page caches for each inode and inode->i_mapping never points to one of them, the btree node cache. This causes inode_to_wb(inode) to refer to a different page cache than the caller page/folio operations such like __folio_start_writeback(), __folio_end_writeback(), or __folio_mark_dirty() acquired the lock. This patch resolves the issue by allocating and using an additional inode to hold the page cache of btree nodes. The inode is attached one-to-one to the traditional nilfs2 inode if it requires a block mapping with b-tree. This setup change is in memory only and does not affect the disk format. Link: https://lkml.kernel.org/r/1647867427-30498-1-git-send-email-konishi.ryusuke@gmail.com Link: https://lkml.kernel.org/r/1647867427-30498-2-git-send-email-konishi.ryusuke@gmail.com Link: https://lore.kernel.org/r/YXrYvIo8YRnAOJCj@casper.infradead.org Link: https://lore.kernel.org/r/9a20b33d-b38f-b4a2-4742-c1eb5b8e4d6c@redhat.com Signed-off-by: Ryusuke Konishi <konishi.ryusuke@gmail.com> Reported-by: syzbot+0d5b462a6f07447991b3@syzkaller.appspotmail.com Reported-by: syzbot+34ef28bb2aeb28724aa0@syzkaller.appspotmail.com Reported-by: Hao Sun <sunhao.th@gmail.com> Reported-by: David Hildenbrand <david@redhat.com> Tested-by: Ryusuke Konishi <konishi.ryusuke@gmail.com> Cc: Matthew Wilcox <willy@infradead.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2022-04-01 11:28:18 -07:00
int nilfs_attach_btree_node_cache(struct inode *inode);
void nilfs_detach_btree_node_cache(struct inode *inode);
struct inode *nilfs_iget_for_shadow(struct inode *inode);
extern void nilfs_update_inode(struct inode *, struct buffer_head *, int);
extern void nilfs_truncate(struct inode *);
extern void nilfs_evict_inode(struct inode *);
extern int nilfs_setattr(struct mnt_idmap *, struct dentry *,
struct iattr *);
extern void nilfs_write_failed(struct address_space *mapping, loff_t to);
int nilfs_permission(struct mnt_idmap *idmap, struct inode *inode,
int mask);
int nilfs_load_inode_block(struct inode *inode, struct buffer_head **pbh);
extern int nilfs_inode_dirty(struct inode *);
int nilfs_set_file_dirty(struct inode *inode, unsigned int nr_dirty);
extern int __nilfs_mark_inode_dirty(struct inode *, int);
extern void nilfs_dirty_inode(struct inode *, int flags);
int nilfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
__u64 start, __u64 len);
static inline int nilfs_mark_inode_dirty(struct inode *inode)
{
return __nilfs_mark_inode_dirty(inode, I_DIRTY);
}
static inline int nilfs_mark_inode_dirty_sync(struct inode *inode)
{
return __nilfs_mark_inode_dirty(inode, I_DIRTY_SYNC);
}
/* super.c */
extern struct inode *nilfs_alloc_inode(struct super_block *);
__printf(2, 3)
void __nilfs_msg(struct super_block *sb, const char *fmt, ...);
extern __printf(3, 4)
void __nilfs_error(struct super_block *sb, const char *function,
const char *fmt, ...);
#ifdef CONFIG_PRINTK
#define nilfs_msg(sb, level, fmt, ...) \
__nilfs_msg(sb, level fmt, ##__VA_ARGS__)
#define nilfs_error(sb, fmt, ...) \
__nilfs_error(sb, __func__, fmt, ##__VA_ARGS__)
#else
#define nilfs_msg(sb, level, fmt, ...) \
do { \
no_printk(level fmt, ##__VA_ARGS__); \
(void)(sb); \
} while (0)
#define nilfs_error(sb, fmt, ...) \
do { \
no_printk(fmt, ##__VA_ARGS__); \
__nilfs_error(sb, "", " "); \
} while (0)
#endif /* CONFIG_PRINTK */
#define nilfs_crit(sb, fmt, ...) \
nilfs_msg(sb, KERN_CRIT, fmt, ##__VA_ARGS__)
#define nilfs_err(sb, fmt, ...) \
nilfs_msg(sb, KERN_ERR, fmt, ##__VA_ARGS__)
#define nilfs_warn(sb, fmt, ...) \
nilfs_msg(sb, KERN_WARNING, fmt, ##__VA_ARGS__)
#define nilfs_info(sb, fmt, ...) \
nilfs_msg(sb, KERN_INFO, fmt, ##__VA_ARGS__)
extern struct nilfs_super_block *
nilfs_read_super_block(struct super_block *, u64, int, struct buffer_head **);
extern int nilfs_store_magic(struct super_block *sb,
struct nilfs_super_block *sbp);
extern int nilfs_check_feature_compatibility(struct super_block *,
struct nilfs_super_block *);
extern void nilfs_set_log_cursor(struct nilfs_super_block *,
struct the_nilfs *);
struct nilfs_super_block **nilfs_prepare_super(struct super_block *sb,
int flip);
int nilfs_commit_super(struct super_block *sb, int flag);
int nilfs_cleanup_super(struct super_block *sb);
int nilfs_resize_fs(struct super_block *sb, __u64 newsize);
int nilfs_attach_checkpoint(struct super_block *sb, __u64 cno, int curr_mnt,
struct nilfs_root **root);
int nilfs_checkpoint_is_mounted(struct super_block *sb, __u64 cno);
/* gcinode.c */
int nilfs_gccache_submit_read_data(struct inode *, sector_t, sector_t, __u64,
struct buffer_head **);
int nilfs_gccache_submit_read_node(struct inode *, sector_t, __u64,
struct buffer_head **);
int nilfs_gccache_wait_and_mark_dirty(struct buffer_head *);
int nilfs_init_gcinode(struct inode *inode);
void nilfs_remove_all_gcinodes(struct the_nilfs *nilfs);
nilfs2: integrate sysfs support into driver This patch integrates creation of sysfs groups and attributes into NILFS file system driver. It was found the issue with nilfs_sysfs_{create/delete}_snapshot_group functions by Michael L Semon <mlsemon35@gmail.com> in the first version of the patch: BUG: sleeping function called from invalid context at kernel/locking/mutex.c:579 in_atomic(): 1, irqs_disabled(): 0, pid: 32676, name: umount.nilfs2 2 locks held by umount.nilfs2/32676: #0: (&type->s_umount_key#21){++++..}, at: [<790c18e2>] deactivate_super+0x37/0x58 #1: (&(&nilfs->ns_cptree_lock)->rlock){+.+...}, at: [<791bf659>] nilfs_put_root+0x23/0x5a Preemption disabled at:[<791bf659>] nilfs_put_root+0x23/0x5a CPU: 0 PID: 32676 Comm: umount.nilfs2 Not tainted 3.14.0+ #2 Hardware name: Dell Computer Corporation Dimension 2350/07W080, BIOS A01 12/17/2002 Call Trace: dump_stack+0x4b/0x75 __might_sleep+0x111/0x16f mutex_lock_nested+0x1e/0x3ad kernfs_remove+0x12/0x26 sysfs_remove_dir+0x3d/0x62 kobject_del+0x13/0x38 nilfs_sysfs_delete_snapshot_group+0xb/0xd nilfs_put_root+0x2a/0x5a nilfs_detach_log_writer+0x1ab/0x2c1 nilfs_put_super+0x13/0x68 generic_shutdown_super+0x60/0xd1 kill_block_super+0x1d/0x60 deactivate_locked_super+0x22/0x3f deactivate_super+0x3e/0x58 mntput_no_expire+0xe2/0x141 SyS_oldumount+0x70/0xa5 syscall_call+0x7/0xb The reason of the issue was placement of nilfs_sysfs_{create/delete}_snapshot_group() call under nilfs->ns_cptree_lock protection. But this protection is unnecessary and wrong solution. The second version of the patch fixes this issue. [fengguang.wu@intel.com: nilfs_sysfs_create_mounted_snapshots_group can be static] Reported-by: Michael L. Semon <mlsemon35@gmail.com> Signed-off-by: Vyacheslav Dubeyko <Vyacheslav.Dubeyko@hgst.com> Cc: Vyacheslav Dubeyko <slava@dubeyko.com> Cc: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp> Tested-by: Michael L. Semon <mlsemon35@gmail.com> Signed-off-by: Fengguang Wu <fengguang.wu@intel.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-08-08 14:20:55 -07:00
/* sysfs.c */
int __init nilfs_sysfs_init(void);
void nilfs_sysfs_exit(void);
int nilfs_sysfs_create_device_group(struct super_block *);
void nilfs_sysfs_delete_device_group(struct the_nilfs *);
int nilfs_sysfs_create_snapshot_group(struct nilfs_root *);
void nilfs_sysfs_delete_snapshot_group(struct nilfs_root *);
/*
* Inodes and files operations
*/
extern const struct file_operations nilfs_dir_operations;
extern const struct inode_operations nilfs_file_inode_operations;
extern const struct file_operations nilfs_file_operations;
extern const struct address_space_operations nilfs_aops;
nilfs2: fix buffer head leaks in calls to truncate_inode_pages() When block_invalidatepage was converted to block_invalidate_folio, the fallback to block_invalidatepage in folio_invalidate() if the address_space_operations method invalidatepage (currently invalidate_folio) was not set, was removed. Unfortunately, some pseudo-inodes in nilfs2 use empty_aops set by inode_init_always_gfp() as is, or explicitly set it to address_space_operations. Therefore, with this change, block_invalidatepage() is no longer called from folio_invalidate(), and as a result, the buffer_head structures attached to these pages/folios are no longer freed via try_to_free_buffers(). Thus, these buffer heads are now leaked by truncate_inode_pages(), which cleans up the page cache from inode evict(), etc. Three types of caches use empty_aops: gc inode caches and the DAT shadow inode used by GC, and b-tree node caches. Of these, b-tree node caches explicitly call invalidate_mapping_pages() during cleanup, which involves calling try_to_free_buffers(), so the leak was not visible during normal operation but worsened when GC was performed. Fix this issue by using address_space_operations with invalidate_folio set to block_invalidate_folio instead of empty_aops, which will ensure the same behavior as before. Link: https://lkml.kernel.org/r/20241212164556.21338-1-konishi.ryusuke@gmail.com Fixes: 7ba13abbd31e ("fs: Turn block_invalidatepage into block_invalidate_folio") Signed-off-by: Ryusuke Konishi <konishi.ryusuke@gmail.com> Cc: <stable@vger.kernel.org> [5.18+] Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-12-13 01:43:28 +09:00
extern const struct address_space_operations nilfs_buffer_cache_aops;
extern const struct inode_operations nilfs_dir_inode_operations;
extern const struct inode_operations nilfs_special_inode_operations;
extern const struct inode_operations nilfs_symlink_inode_operations;
/*
* filesystem type
*/
extern struct file_system_type nilfs_fs_type;
#endif /* _NILFS_H */