mirror of
git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
synced 2025-08-05 16:54:27 +00:00
ceph: track pending caps flushing accurately
Previously we did not track accurate TIDs for flushing caps. When the MDS fails over, we have no choice but to re-send all flushing caps with a new TID. This can cause problems because the MDS may have already flushed some caps and issued the same caps to another client. The re-sent cap flush has a new TID, which makes the MDS unable to detect whether it has already processed the cap flush. This patch adds code to track pending cap flushes accurately. When re-sending a cap flush is needed, we use its original flush TID. Signed-off-by: Yan, Zheng <zyan@redhat.com>
This commit is contained in:
parent
6c13a6bb55
commit
553adfd941
5 changed files with 192 additions and 88 deletions
245
fs/ceph/caps.c
245
fs/ceph/caps.c
|
@ -1097,7 +1097,8 @@ void ceph_queue_caps_release(struct inode *inode)
|
||||||
* caller should hold snap_rwsem (read), s_mutex.
|
* caller should hold snap_rwsem (read), s_mutex.
|
||||||
*/
|
*/
|
||||||
static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
|
static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
|
||||||
int op, int used, int want, int retain, int flushing)
|
int op, int used, int want, int retain, int flushing,
|
||||||
|
u64 flush_tid)
|
||||||
__releases(cap->ci->i_ceph_lock)
|
__releases(cap->ci->i_ceph_lock)
|
||||||
{
|
{
|
||||||
struct ceph_inode_info *ci = cap->ci;
|
struct ceph_inode_info *ci = cap->ci;
|
||||||
|
@ -1115,8 +1116,6 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
|
||||||
u64 xattr_version = 0;
|
u64 xattr_version = 0;
|
||||||
struct ceph_buffer *xattr_blob = NULL;
|
struct ceph_buffer *xattr_blob = NULL;
|
||||||
int delayed = 0;
|
int delayed = 0;
|
||||||
u64 flush_tid = 0;
|
|
||||||
int i;
|
|
||||||
int ret;
|
int ret;
|
||||||
bool inline_data;
|
bool inline_data;
|
||||||
|
|
||||||
|
@ -1160,24 +1159,7 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
|
||||||
cap->implemented &= cap->issued | used;
|
cap->implemented &= cap->issued | used;
|
||||||
cap->mds_wanted = want;
|
cap->mds_wanted = want;
|
||||||
|
|
||||||
if (flushing) {
|
follows = flushing ? ci->i_head_snapc->seq : 0;
|
||||||
/*
|
|
||||||
* assign a tid for flush operations so we can avoid
|
|
||||||
* flush1 -> dirty1 -> flush2 -> flushack1 -> mark
|
|
||||||
* clean type races. track latest tid for every bit
|
|
||||||
* so we can handle flush AxFw, flush Fw, and have the
|
|
||||||
* first ack clean Ax.
|
|
||||||
*/
|
|
||||||
flush_tid = ++ci->i_cap_flush_last_tid;
|
|
||||||
dout(" cap_flush_tid %d\n", (int)flush_tid);
|
|
||||||
for (i = 0; i < CEPH_CAP_BITS; i++)
|
|
||||||
if (flushing & (1 << i))
|
|
||||||
ci->i_cap_flush_tid[i] = flush_tid;
|
|
||||||
|
|
||||||
follows = ci->i_head_snapc->seq;
|
|
||||||
} else {
|
|
||||||
follows = 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
keep = cap->implemented;
|
keep = cap->implemented;
|
||||||
seq = cap->seq;
|
seq = cap->seq;
|
||||||
|
@ -1311,7 +1293,10 @@ retry:
|
||||||
goto retry;
|
goto retry;
|
||||||
}
|
}
|
||||||
|
|
||||||
capsnap->flush_tid = ++ci->i_cap_flush_last_tid;
|
spin_lock(&mdsc->cap_dirty_lock);
|
||||||
|
capsnap->flush_tid = ++mdsc->last_cap_flush_tid;
|
||||||
|
spin_unlock(&mdsc->cap_dirty_lock);
|
||||||
|
|
||||||
atomic_inc(&capsnap->nref);
|
atomic_inc(&capsnap->nref);
|
||||||
if (list_empty(&capsnap->flushing_item))
|
if (list_empty(&capsnap->flushing_item))
|
||||||
list_add_tail(&capsnap->flushing_item,
|
list_add_tail(&capsnap->flushing_item,
|
||||||
|
@ -1407,6 +1392,29 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)
|
||||||
return dirty;
|
return dirty;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static void __add_cap_flushing_to_inode(struct ceph_inode_info *ci,
|
||||||
|
struct ceph_cap_flush *cf)
|
||||||
|
{
|
||||||
|
struct rb_node **p = &ci->i_cap_flush_tree.rb_node;
|
||||||
|
struct rb_node *parent = NULL;
|
||||||
|
struct ceph_cap_flush *other = NULL;
|
||||||
|
|
||||||
|
while (*p) {
|
||||||
|
parent = *p;
|
||||||
|
other = rb_entry(parent, struct ceph_cap_flush, i_node);
|
||||||
|
|
||||||
|
if (cf->tid < other->tid)
|
||||||
|
p = &(*p)->rb_left;
|
||||||
|
else if (cf->tid > other->tid)
|
||||||
|
p = &(*p)->rb_right;
|
||||||
|
else
|
||||||
|
BUG();
|
||||||
|
}
|
||||||
|
|
||||||
|
rb_link_node(&cf->i_node, parent, p);
|
||||||
|
rb_insert_color(&cf->i_node, &ci->i_cap_flush_tree);
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Add dirty inode to the flushing list. Assigned a seq number so we
|
* Add dirty inode to the flushing list. Assigned a seq number so we
|
||||||
* can wait for caps to flush without starving.
|
* can wait for caps to flush without starving.
|
||||||
|
@ -1414,10 +1422,12 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)
|
||||||
* Called under i_ceph_lock.
|
* Called under i_ceph_lock.
|
||||||
*/
|
*/
|
||||||
static int __mark_caps_flushing(struct inode *inode,
|
static int __mark_caps_flushing(struct inode *inode,
|
||||||
struct ceph_mds_session *session)
|
struct ceph_mds_session *session,
|
||||||
|
u64 *flush_tid)
|
||||||
{
|
{
|
||||||
struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
|
struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
|
||||||
struct ceph_inode_info *ci = ceph_inode(inode);
|
struct ceph_inode_info *ci = ceph_inode(inode);
|
||||||
|
struct ceph_cap_flush *cf;
|
||||||
int flushing;
|
int flushing;
|
||||||
|
|
||||||
BUG_ON(ci->i_dirty_caps == 0);
|
BUG_ON(ci->i_dirty_caps == 0);
|
||||||
|
@ -1432,9 +1442,14 @@ static int __mark_caps_flushing(struct inode *inode,
|
||||||
ci->i_dirty_caps = 0;
|
ci->i_dirty_caps = 0;
|
||||||
dout(" inode %p now !dirty\n", inode);
|
dout(" inode %p now !dirty\n", inode);
|
||||||
|
|
||||||
|
cf = kmalloc(sizeof(*cf), GFP_ATOMIC);
|
||||||
|
cf->caps = flushing;
|
||||||
|
|
||||||
spin_lock(&mdsc->cap_dirty_lock);
|
spin_lock(&mdsc->cap_dirty_lock);
|
||||||
list_del_init(&ci->i_dirty_item);
|
list_del_init(&ci->i_dirty_item);
|
||||||
|
|
||||||
|
cf->tid = ++mdsc->last_cap_flush_tid;
|
||||||
|
|
||||||
if (list_empty(&ci->i_flushing_item)) {
|
if (list_empty(&ci->i_flushing_item)) {
|
||||||
ci->i_cap_flush_seq = ++mdsc->cap_flush_seq;
|
ci->i_cap_flush_seq = ++mdsc->cap_flush_seq;
|
||||||
list_add_tail(&ci->i_flushing_item, &session->s_cap_flushing);
|
list_add_tail(&ci->i_flushing_item, &session->s_cap_flushing);
|
||||||
|
@ -1448,6 +1463,9 @@ static int __mark_caps_flushing(struct inode *inode,
|
||||||
}
|
}
|
||||||
spin_unlock(&mdsc->cap_dirty_lock);
|
spin_unlock(&mdsc->cap_dirty_lock);
|
||||||
|
|
||||||
|
__add_cap_flushing_to_inode(ci, cf);
|
||||||
|
|
||||||
|
*flush_tid = cf->tid;
|
||||||
return flushing;
|
return flushing;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1493,6 +1511,7 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
|
||||||
struct ceph_mds_client *mdsc = fsc->mdsc;
|
struct ceph_mds_client *mdsc = fsc->mdsc;
|
||||||
struct inode *inode = &ci->vfs_inode;
|
struct inode *inode = &ci->vfs_inode;
|
||||||
struct ceph_cap *cap;
|
struct ceph_cap *cap;
|
||||||
|
u64 flush_tid;
|
||||||
int file_wanted, used, cap_used;
|
int file_wanted, used, cap_used;
|
||||||
int took_snap_rwsem = 0; /* true if mdsc->snap_rwsem held */
|
int took_snap_rwsem = 0; /* true if mdsc->snap_rwsem held */
|
||||||
int issued, implemented, want, retain, revoking, flushing = 0;
|
int issued, implemented, want, retain, revoking, flushing = 0;
|
||||||
|
@ -1711,17 +1730,20 @@ ack:
|
||||||
took_snap_rwsem = 1;
|
took_snap_rwsem = 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (cap == ci->i_auth_cap && ci->i_dirty_caps)
|
if (cap == ci->i_auth_cap && ci->i_dirty_caps) {
|
||||||
flushing = __mark_caps_flushing(inode, session);
|
flushing = __mark_caps_flushing(inode, session,
|
||||||
else
|
&flush_tid);
|
||||||
|
} else {
|
||||||
flushing = 0;
|
flushing = 0;
|
||||||
|
flush_tid = 0;
|
||||||
|
}
|
||||||
|
|
||||||
mds = cap->mds; /* remember mds, so we don't repeat */
|
mds = cap->mds; /* remember mds, so we don't repeat */
|
||||||
sent++;
|
sent++;
|
||||||
|
|
||||||
/* __send_cap drops i_ceph_lock */
|
/* __send_cap drops i_ceph_lock */
|
||||||
delayed += __send_cap(mdsc, cap, CEPH_CAP_OP_UPDATE, cap_used,
|
delayed += __send_cap(mdsc, cap, CEPH_CAP_OP_UPDATE, cap_used,
|
||||||
want, retain, flushing);
|
want, retain, flushing, flush_tid);
|
||||||
goto retry; /* retake i_ceph_lock and restart our cap scan. */
|
goto retry; /* retake i_ceph_lock and restart our cap scan. */
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1750,12 +1772,13 @@ ack:
|
||||||
/*
|
/*
|
||||||
* Try to flush dirty caps back to the auth mds.
|
* Try to flush dirty caps back to the auth mds.
|
||||||
*/
|
*/
|
||||||
static int try_flush_caps(struct inode *inode, u16 flush_tid[])
|
static int try_flush_caps(struct inode *inode, u64 *ptid)
|
||||||
{
|
{
|
||||||
struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
|
struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
|
||||||
struct ceph_inode_info *ci = ceph_inode(inode);
|
struct ceph_inode_info *ci = ceph_inode(inode);
|
||||||
struct ceph_mds_session *session = NULL;
|
struct ceph_mds_session *session = NULL;
|
||||||
int flushing = 0;
|
int flushing = 0;
|
||||||
|
u64 flush_tid = 0;
|
||||||
|
|
||||||
retry:
|
retry:
|
||||||
spin_lock(&ci->i_ceph_lock);
|
spin_lock(&ci->i_ceph_lock);
|
||||||
|
@ -1780,46 +1803,52 @@ retry:
|
||||||
if (cap->session->s_state < CEPH_MDS_SESSION_OPEN)
|
if (cap->session->s_state < CEPH_MDS_SESSION_OPEN)
|
||||||
goto out;
|
goto out;
|
||||||
|
|
||||||
flushing = __mark_caps_flushing(inode, session);
|
flushing = __mark_caps_flushing(inode, session, &flush_tid);
|
||||||
|
|
||||||
/* __send_cap drops i_ceph_lock */
|
/* __send_cap drops i_ceph_lock */
|
||||||
delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, used, want,
|
delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, used, want,
|
||||||
cap->issued | cap->implemented, flushing);
|
(cap->issued | cap->implemented),
|
||||||
|
flushing, flush_tid);
|
||||||
|
|
||||||
spin_lock(&ci->i_ceph_lock);
|
if (delayed) {
|
||||||
if (delayed)
|
spin_lock(&ci->i_ceph_lock);
|
||||||
__cap_delay_requeue(mdsc, ci);
|
__cap_delay_requeue(mdsc, ci);
|
||||||
|
spin_unlock(&ci->i_ceph_lock);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
struct rb_node *n = rb_last(&ci->i_cap_flush_tree);
|
||||||
|
if (n) {
|
||||||
|
struct ceph_cap_flush *cf =
|
||||||
|
rb_entry(n, struct ceph_cap_flush, i_node);
|
||||||
|
flush_tid = cf->tid;
|
||||||
|
}
|
||||||
|
flushing = ci->i_flushing_caps;
|
||||||
|
spin_unlock(&ci->i_ceph_lock);
|
||||||
}
|
}
|
||||||
|
|
||||||
flushing = ci->i_flushing_caps;
|
|
||||||
if (flushing)
|
|
||||||
memcpy(flush_tid, ci->i_cap_flush_tid,
|
|
||||||
sizeof(ci->i_cap_flush_tid));
|
|
||||||
out:
|
out:
|
||||||
spin_unlock(&ci->i_ceph_lock);
|
|
||||||
if (session)
|
if (session)
|
||||||
mutex_unlock(&session->s_mutex);
|
mutex_unlock(&session->s_mutex);
|
||||||
|
|
||||||
|
*ptid = flush_tid;
|
||||||
return flushing;
|
return flushing;
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Return true if we've flushed caps through the given flush_tid.
|
* Return true if we've flushed caps through the given flush_tid.
|
||||||
*/
|
*/
|
||||||
static int caps_are_flushed(struct inode *inode, u16 flush_tid[])
|
static int caps_are_flushed(struct inode *inode, u64 flush_tid)
|
||||||
{
|
{
|
||||||
struct ceph_inode_info *ci = ceph_inode(inode);
|
struct ceph_inode_info *ci = ceph_inode(inode);
|
||||||
int i, ret = 1;
|
struct ceph_cap_flush *cf;
|
||||||
|
struct rb_node *n;
|
||||||
|
int ret = 1;
|
||||||
|
|
||||||
spin_lock(&ci->i_ceph_lock);
|
spin_lock(&ci->i_ceph_lock);
|
||||||
for (i = 0; i < CEPH_CAP_BITS; i++) {
|
n = rb_first(&ci->i_cap_flush_tree);
|
||||||
if (!(ci->i_flushing_caps & (1 << i)))
|
if (n) {
|
||||||
continue;
|
cf = rb_entry(n, struct ceph_cap_flush, i_node);
|
||||||
// tid only has 16 bits. we need to handle wrapping
|
if (cf->tid <= flush_tid)
|
||||||
if ((s16)(ci->i_cap_flush_tid[i] - flush_tid[i]) <= 0) {
|
|
||||||
/* still flushing this bit */
|
|
||||||
ret = 0;
|
ret = 0;
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
spin_unlock(&ci->i_ceph_lock);
|
spin_unlock(&ci->i_ceph_lock);
|
||||||
return ret;
|
return ret;
|
||||||
|
@ -1922,7 +1951,7 @@ int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync)
|
||||||
{
|
{
|
||||||
struct inode *inode = file->f_mapping->host;
|
struct inode *inode = file->f_mapping->host;
|
||||||
struct ceph_inode_info *ci = ceph_inode(inode);
|
struct ceph_inode_info *ci = ceph_inode(inode);
|
||||||
u16 flush_tid[CEPH_CAP_BITS];
|
u64 flush_tid;
|
||||||
int ret;
|
int ret;
|
||||||
int dirty;
|
int dirty;
|
||||||
|
|
||||||
|
@ -1938,7 +1967,7 @@ int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync)
|
||||||
|
|
||||||
mutex_lock(&inode->i_mutex);
|
mutex_lock(&inode->i_mutex);
|
||||||
|
|
||||||
dirty = try_flush_caps(inode, flush_tid);
|
dirty = try_flush_caps(inode, &flush_tid);
|
||||||
dout("fsync dirty caps are %s\n", ceph_cap_string(dirty));
|
dout("fsync dirty caps are %s\n", ceph_cap_string(dirty));
|
||||||
|
|
||||||
ret = unsafe_dirop_wait(inode);
|
ret = unsafe_dirop_wait(inode);
|
||||||
|
@ -1967,14 +1996,14 @@ out:
|
||||||
int ceph_write_inode(struct inode *inode, struct writeback_control *wbc)
|
int ceph_write_inode(struct inode *inode, struct writeback_control *wbc)
|
||||||
{
|
{
|
||||||
struct ceph_inode_info *ci = ceph_inode(inode);
|
struct ceph_inode_info *ci = ceph_inode(inode);
|
||||||
u16 flush_tid[CEPH_CAP_BITS];
|
u64 flush_tid;
|
||||||
int err = 0;
|
int err = 0;
|
||||||
int dirty;
|
int dirty;
|
||||||
int wait = wbc->sync_mode == WB_SYNC_ALL;
|
int wait = wbc->sync_mode == WB_SYNC_ALL;
|
||||||
|
|
||||||
dout("write_inode %p wait=%d\n", inode, wait);
|
dout("write_inode %p wait=%d\n", inode, wait);
|
||||||
if (wait) {
|
if (wait) {
|
||||||
dirty = try_flush_caps(inode, flush_tid);
|
dirty = try_flush_caps(inode, &flush_tid);
|
||||||
if (dirty)
|
if (dirty)
|
||||||
err = wait_event_interruptible(ci->i_cap_wq,
|
err = wait_event_interruptible(ci->i_cap_wq,
|
||||||
caps_are_flushed(inode, flush_tid));
|
caps_are_flushed(inode, flush_tid));
|
||||||
|
@ -2022,6 +2051,51 @@ static void kick_flushing_capsnaps(struct ceph_mds_client *mdsc,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static int __kick_flushing_caps(struct ceph_mds_client *mdsc,
|
||||||
|
struct ceph_mds_session *session,
|
||||||
|
struct ceph_inode_info *ci)
|
||||||
|
{
|
||||||
|
struct inode *inode = &ci->vfs_inode;
|
||||||
|
struct ceph_cap *cap;
|
||||||
|
struct ceph_cap_flush *cf;
|
||||||
|
struct rb_node *n;
|
||||||
|
int delayed = 0;
|
||||||
|
u64 first_tid = 0;
|
||||||
|
|
||||||
|
while (true) {
|
||||||
|
spin_lock(&ci->i_ceph_lock);
|
||||||
|
cap = ci->i_auth_cap;
|
||||||
|
if (!(cap && cap->session == session)) {
|
||||||
|
pr_err("%p auth cap %p not mds%d ???\n", inode,
|
||||||
|
cap, session->s_mds);
|
||||||
|
spin_unlock(&ci->i_ceph_lock);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
for (n = rb_first(&ci->i_cap_flush_tree); n; n = rb_next(n)) {
|
||||||
|
cf = rb_entry(n, struct ceph_cap_flush, i_node);
|
||||||
|
if (cf->tid >= first_tid)
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
if (!n) {
|
||||||
|
spin_unlock(&ci->i_ceph_lock);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
cf = rb_entry(n, struct ceph_cap_flush, i_node);
|
||||||
|
first_tid = cf->tid + 1;
|
||||||
|
|
||||||
|
dout("kick_flushing_caps %p cap %p tid %llu %s\n", inode,
|
||||||
|
cap, cf->tid, ceph_cap_string(cf->caps));
|
||||||
|
delayed |= __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH,
|
||||||
|
__ceph_caps_used(ci),
|
||||||
|
__ceph_caps_wanted(ci),
|
||||||
|
cap->issued | cap->implemented,
|
||||||
|
cf->caps, cf->tid);
|
||||||
|
}
|
||||||
|
return delayed;
|
||||||
|
}
|
||||||
|
|
||||||
void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc,
|
void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc,
|
||||||
struct ceph_mds_session *session)
|
struct ceph_mds_session *session)
|
||||||
{
|
{
|
||||||
|
@ -2031,28 +2105,10 @@ void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc,
|
||||||
|
|
||||||
dout("kick_flushing_caps mds%d\n", session->s_mds);
|
dout("kick_flushing_caps mds%d\n", session->s_mds);
|
||||||
list_for_each_entry(ci, &session->s_cap_flushing, i_flushing_item) {
|
list_for_each_entry(ci, &session->s_cap_flushing, i_flushing_item) {
|
||||||
struct inode *inode = &ci->vfs_inode;
|
int delayed = __kick_flushing_caps(mdsc, session, ci);
|
||||||
struct ceph_cap *cap;
|
if (delayed) {
|
||||||
int delayed = 0;
|
spin_lock(&ci->i_ceph_lock);
|
||||||
|
__cap_delay_requeue(mdsc, ci);
|
||||||
spin_lock(&ci->i_ceph_lock);
|
|
||||||
cap = ci->i_auth_cap;
|
|
||||||
if (cap && cap->session == session) {
|
|
||||||
dout("kick_flushing_caps %p cap %p %s\n", inode,
|
|
||||||
cap, ceph_cap_string(ci->i_flushing_caps));
|
|
||||||
delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH,
|
|
||||||
__ceph_caps_used(ci),
|
|
||||||
__ceph_caps_wanted(ci),
|
|
||||||
cap->issued | cap->implemented,
|
|
||||||
ci->i_flushing_caps);
|
|
||||||
if (delayed) {
|
|
||||||
spin_lock(&ci->i_ceph_lock);
|
|
||||||
__cap_delay_requeue(mdsc, ci);
|
|
||||||
spin_unlock(&ci->i_ceph_lock);
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
pr_err("%p auth cap %p not mds%d ???\n", inode,
|
|
||||||
cap, session->s_mds);
|
|
||||||
spin_unlock(&ci->i_ceph_lock);
|
spin_unlock(&ci->i_ceph_lock);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -2064,7 +2120,6 @@ static void kick_flushing_inode_caps(struct ceph_mds_client *mdsc,
|
||||||
{
|
{
|
||||||
struct ceph_inode_info *ci = ceph_inode(inode);
|
struct ceph_inode_info *ci = ceph_inode(inode);
|
||||||
struct ceph_cap *cap;
|
struct ceph_cap *cap;
|
||||||
int delayed = 0;
|
|
||||||
|
|
||||||
spin_lock(&ci->i_ceph_lock);
|
spin_lock(&ci->i_ceph_lock);
|
||||||
cap = ci->i_auth_cap;
|
cap = ci->i_auth_cap;
|
||||||
|
@ -2074,16 +2129,16 @@ static void kick_flushing_inode_caps(struct ceph_mds_client *mdsc,
|
||||||
__ceph_flush_snaps(ci, &session, 1);
|
__ceph_flush_snaps(ci, &session, 1);
|
||||||
|
|
||||||
if (ci->i_flushing_caps) {
|
if (ci->i_flushing_caps) {
|
||||||
|
int delayed;
|
||||||
|
|
||||||
spin_lock(&mdsc->cap_dirty_lock);
|
spin_lock(&mdsc->cap_dirty_lock);
|
||||||
list_move_tail(&ci->i_flushing_item,
|
list_move_tail(&ci->i_flushing_item,
|
||||||
&cap->session->s_cap_flushing);
|
&cap->session->s_cap_flushing);
|
||||||
spin_unlock(&mdsc->cap_dirty_lock);
|
spin_unlock(&mdsc->cap_dirty_lock);
|
||||||
|
|
||||||
delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH,
|
spin_unlock(&ci->i_ceph_lock);
|
||||||
__ceph_caps_used(ci),
|
|
||||||
__ceph_caps_wanted(ci),
|
delayed = __kick_flushing_caps(mdsc, session, ci);
|
||||||
cap->issued | cap->implemented,
|
|
||||||
ci->i_flushing_caps);
|
|
||||||
if (delayed) {
|
if (delayed) {
|
||||||
spin_lock(&ci->i_ceph_lock);
|
spin_lock(&ci->i_ceph_lock);
|
||||||
__cap_delay_requeue(mdsc, ci);
|
__cap_delay_requeue(mdsc, ci);
|
||||||
|
@ -2836,16 +2891,29 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
|
||||||
{
|
{
|
||||||
struct ceph_inode_info *ci = ceph_inode(inode);
|
struct ceph_inode_info *ci = ceph_inode(inode);
|
||||||
struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
|
struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
|
||||||
|
struct ceph_cap_flush *cf;
|
||||||
|
struct rb_node *n;
|
||||||
|
LIST_HEAD(to_remove);
|
||||||
unsigned seq = le32_to_cpu(m->seq);
|
unsigned seq = le32_to_cpu(m->seq);
|
||||||
int dirty = le32_to_cpu(m->dirty);
|
int dirty = le32_to_cpu(m->dirty);
|
||||||
int cleaned = 0;
|
int cleaned = 0;
|
||||||
int drop = 0;
|
int drop = 0;
|
||||||
int i;
|
|
||||||
|
|
||||||
for (i = 0; i < CEPH_CAP_BITS; i++)
|
n = rb_first(&ci->i_cap_flush_tree);
|
||||||
if ((dirty & (1 << i)) &&
|
while (n) {
|
||||||
(u16)flush_tid == ci->i_cap_flush_tid[i])
|
cf = rb_entry(n, struct ceph_cap_flush, i_node);
|
||||||
cleaned |= 1 << i;
|
n = rb_next(&cf->i_node);
|
||||||
|
if (cf->tid == flush_tid)
|
||||||
|
cleaned = cf->caps;
|
||||||
|
if (cf->tid <= flush_tid) {
|
||||||
|
rb_erase(&cf->i_node, &ci->i_cap_flush_tree);
|
||||||
|
list_add_tail(&cf->list, &to_remove);
|
||||||
|
} else {
|
||||||
|
cleaned &= ~cf->caps;
|
||||||
|
if (!cleaned)
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
dout("handle_cap_flush_ack inode %p mds%d seq %d on %s cleaned %s,"
|
dout("handle_cap_flush_ack inode %p mds%d seq %d on %s cleaned %s,"
|
||||||
" flushing %s -> %s\n",
|
" flushing %s -> %s\n",
|
||||||
|
@ -2890,6 +2958,13 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
|
||||||
|
|
||||||
out:
|
out:
|
||||||
spin_unlock(&ci->i_ceph_lock);
|
spin_unlock(&ci->i_ceph_lock);
|
||||||
|
|
||||||
|
while (!list_empty(&to_remove)) {
|
||||||
|
cf = list_first_entry(&to_remove,
|
||||||
|
struct ceph_cap_flush, list);
|
||||||
|
list_del(&cf->list);
|
||||||
|
kfree(cf);
|
||||||
|
}
|
||||||
if (drop)
|
if (drop)
|
||||||
iput(inode);
|
iput(inode);
|
||||||
}
|
}
|
||||||
|
|
|
@ -417,8 +417,7 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
|
||||||
INIT_LIST_HEAD(&ci->i_dirty_item);
|
INIT_LIST_HEAD(&ci->i_dirty_item);
|
||||||
INIT_LIST_HEAD(&ci->i_flushing_item);
|
INIT_LIST_HEAD(&ci->i_flushing_item);
|
||||||
ci->i_cap_flush_seq = 0;
|
ci->i_cap_flush_seq = 0;
|
||||||
ci->i_cap_flush_last_tid = 0;
|
ci->i_cap_flush_tree = RB_ROOT;
|
||||||
memset(&ci->i_cap_flush_tid, 0, sizeof(ci->i_cap_flush_tid));
|
|
||||||
init_waitqueue_head(&ci->i_cap_wq);
|
init_waitqueue_head(&ci->i_cap_wq);
|
||||||
ci->i_hold_caps_min = 0;
|
ci->i_hold_caps_min = 0;
|
||||||
ci->i_hold_caps_max = 0;
|
ci->i_hold_caps_max = 0;
|
||||||
|
|
|
@ -1142,6 +1142,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
|
||||||
void *arg)
|
void *arg)
|
||||||
{
|
{
|
||||||
struct ceph_inode_info *ci = ceph_inode(inode);
|
struct ceph_inode_info *ci = ceph_inode(inode);
|
||||||
|
LIST_HEAD(to_remove);
|
||||||
int drop = 0;
|
int drop = 0;
|
||||||
|
|
||||||
dout("removing cap %p, ci is %p, inode is %p\n",
|
dout("removing cap %p, ci is %p, inode is %p\n",
|
||||||
|
@ -1149,9 +1150,19 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
|
||||||
spin_lock(&ci->i_ceph_lock);
|
spin_lock(&ci->i_ceph_lock);
|
||||||
__ceph_remove_cap(cap, false);
|
__ceph_remove_cap(cap, false);
|
||||||
if (!ci->i_auth_cap) {
|
if (!ci->i_auth_cap) {
|
||||||
|
struct ceph_cap_flush *cf;
|
||||||
struct ceph_mds_client *mdsc =
|
struct ceph_mds_client *mdsc =
|
||||||
ceph_sb_to_client(inode->i_sb)->mdsc;
|
ceph_sb_to_client(inode->i_sb)->mdsc;
|
||||||
|
|
||||||
|
while (true) {
|
||||||
|
struct rb_node *n = rb_first(&ci->i_cap_flush_tree);
|
||||||
|
if (!n)
|
||||||
|
break;
|
||||||
|
cf = rb_entry(n, struct ceph_cap_flush, i_node);
|
||||||
|
rb_erase(&cf->i_node, &ci->i_cap_flush_tree);
|
||||||
|
list_add(&cf->list, &to_remove);
|
||||||
|
}
|
||||||
|
|
||||||
spin_lock(&mdsc->cap_dirty_lock);
|
spin_lock(&mdsc->cap_dirty_lock);
|
||||||
if (!list_empty(&ci->i_dirty_item)) {
|
if (!list_empty(&ci->i_dirty_item)) {
|
||||||
pr_warn_ratelimited(
|
pr_warn_ratelimited(
|
||||||
|
@ -1173,8 +1184,16 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
|
||||||
drop = 1;
|
drop = 1;
|
||||||
}
|
}
|
||||||
spin_unlock(&mdsc->cap_dirty_lock);
|
spin_unlock(&mdsc->cap_dirty_lock);
|
||||||
|
|
||||||
}
|
}
|
||||||
spin_unlock(&ci->i_ceph_lock);
|
spin_unlock(&ci->i_ceph_lock);
|
||||||
|
while (!list_empty(&to_remove)) {
|
||||||
|
struct ceph_cap_flush *cf;
|
||||||
|
cf = list_first_entry(&to_remove,
|
||||||
|
struct ceph_cap_flush, list);
|
||||||
|
list_del(&cf->list);
|
||||||
|
kfree(cf);
|
||||||
|
}
|
||||||
while (drop--)
|
while (drop--)
|
||||||
iput(inode);
|
iput(inode);
|
||||||
return 0;
|
return 0;
|
||||||
|
@ -3408,6 +3427,7 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc)
|
||||||
INIT_LIST_HEAD(&mdsc->snap_flush_list);
|
INIT_LIST_HEAD(&mdsc->snap_flush_list);
|
||||||
spin_lock_init(&mdsc->snap_flush_lock);
|
spin_lock_init(&mdsc->snap_flush_lock);
|
||||||
mdsc->cap_flush_seq = 0;
|
mdsc->cap_flush_seq = 0;
|
||||||
|
mdsc->last_cap_flush_tid = 1;
|
||||||
INIT_LIST_HEAD(&mdsc->cap_dirty);
|
INIT_LIST_HEAD(&mdsc->cap_dirty);
|
||||||
INIT_LIST_HEAD(&mdsc->cap_dirty_migrating);
|
INIT_LIST_HEAD(&mdsc->cap_dirty_migrating);
|
||||||
mdsc->num_cap_flushing = 0;
|
mdsc->num_cap_flushing = 0;
|
||||||
|
|
|
@ -307,6 +307,7 @@ struct ceph_mds_client {
|
||||||
spinlock_t snap_flush_lock;
|
spinlock_t snap_flush_lock;
|
||||||
|
|
||||||
u64 cap_flush_seq;
|
u64 cap_flush_seq;
|
||||||
|
u64 last_cap_flush_tid;
|
||||||
struct list_head cap_dirty; /* inodes with dirty caps */
|
struct list_head cap_dirty; /* inodes with dirty caps */
|
||||||
struct list_head cap_dirty_migrating; /* ...that are migration... */
|
struct list_head cap_dirty_migrating; /* ...that are migration... */
|
||||||
int num_cap_flushing; /* # caps we are flushing */
|
int num_cap_flushing; /* # caps we are flushing */
|
||||||
|
|
|
@ -186,6 +186,15 @@ static inline void ceph_put_cap_snap(struct ceph_cap_snap *capsnap)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
struct ceph_cap_flush {
|
||||||
|
u64 tid;
|
||||||
|
int caps;
|
||||||
|
union {
|
||||||
|
struct rb_node i_node;
|
||||||
|
struct list_head list;
|
||||||
|
};
|
||||||
|
};
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* The frag tree describes how a directory is fragmented, potentially across
|
* The frag tree describes how a directory is fragmented, potentially across
|
||||||
* multiple metadata servers. It is also used to indicate points where
|
* multiple metadata servers. It is also used to indicate points where
|
||||||
|
@ -299,7 +308,7 @@ struct ceph_inode_info {
|
||||||
/* we need to track cap writeback on a per-cap-bit basis, to allow
|
/* we need to track cap writeback on a per-cap-bit basis, to allow
|
||||||
* overlapping, pipelined cap flushes to the mds. we can probably
|
* overlapping, pipelined cap flushes to the mds. we can probably
|
||||||
* reduce the tid to 8 bits if we're concerned about inode size. */
|
* reduce the tid to 8 bits if we're concerned about inode size. */
|
||||||
u16 i_cap_flush_last_tid, i_cap_flush_tid[CEPH_CAP_BITS];
|
struct rb_root i_cap_flush_tree;
|
||||||
wait_queue_head_t i_cap_wq; /* threads waiting on a capability */
|
wait_queue_head_t i_cap_wq; /* threads waiting on a capability */
|
||||||
unsigned long i_hold_caps_min; /* jiffies */
|
unsigned long i_hold_caps_min; /* jiffies */
|
||||||
unsigned long i_hold_caps_max; /* jiffies */
|
unsigned long i_hold_caps_max; /* jiffies */
|
||||||
|
|
Loading…
Add table
Reference in a new issue