linux/fs/afs/server_list.c
David Howells 4882ba7857 afs: Fix afs_server ref accounting
The current way that afs_server refs are accounted and cleaned up sometimes
causes rmmod to hang when it is waiting for cell records to be removed.  The
problem is that the cell cleanup might occasionally happen before the
server cleanup and then there's nothing that causes the cell to
garbage-collect the remaining servers as they become inactive.

Partially fix this by:

 (1) Give each afs_server record its own management timer rather than
     relying on the cell manager's central timer to drive each individual
     cell's maintenance work item to garbage collect servers.

     This timer is set when afs_unuse_server() reduces a server's activity
     count to zero and will schedule the server's destroyer work item upon
     firing.

 (2) Give each afs_server record its own destroyer work item that removes
     the record from the cell's database, shuts down the timer, cancels any
     pending work for itself, and sends an RPC to the server to cancel
     outstanding callbacks.

     This change, in combination with the timer, obviates the need to try
     and coordinate so closely between the cell record and a bunch of other
     server records to try and tear everything down in a coordinated
     fashion.  With this, the cell record is pinned until the server RCU is
     complete and namespace/module removal will wait until all the cell
     records are removed.

 (3) Now that incoming calls are mapped to servers (and thus cells) using
     data attached to an rxrpc_peer, the UUID-to-server mapping tree is
     moved from the namespace to the cell (cell->fs_servers).  This means
     there can no longer be duplicates therein - and that allows the
     mapping tree to be simpler as there doesn't need to be a chain of
     same-UUID servers that are in different cells.

 (4) The lock protecting the UUID mapping tree is switched to an
     rw_semaphore on the cell rather than a seqlock on the namespace as
     it's now only used during mounting in contexts in which we're allowed
     to sleep.

 (5) When it comes time for a cell that is being removed to purge its set
     of servers, it just needs to iterate over them and wake them up.  Once
     a server becomes inactive, its destroyer work item will observe the
     state of the cell and immediately remove that record.

 (6) When a server record is removed, it is marked AFS_SERVER_FL_EXPIRED to
     prevent reattempts at removal.  The record will be dispatched to RCU
     for destruction once its refcount reaches 0.

 (7) The AFS_SERVER_FL_UNCREATED/CREATING flags are used to synchronise
     simultaneous creation attempts.  If one attempt fails, it will abandon
     the attempt and allow another to try again.

     Note that the record can't just be abandoned when dead as it's bound
     into a server list attached to a volume and only subject to
     replacement if the server list obtained for the volume from the VLDB
     changes.

Signed-off-by: David Howells <dhowells@redhat.com>
cc: Marc Dionne <marc.dionne@auristor.com>
cc: linux-afs@lists.infradead.org
cc: linux-fsdevel@vger.kernel.org
Link: https://lore.kernel.org/r/20250224234154.2014840-15-dhowells@redhat.com/ # v1
Link: https://lore.kernel.org/r/20250310094206.801057-11-dhowells@redhat.com/ # v4
2025-03-10 09:47:15 +00:00

249 lines
6.2 KiB
C

// SPDX-License-Identifier: GPL-2.0-or-later
/* AFS fileserver list management.
*
* Copyright (C) 2017 Red Hat, Inc. All Rights Reserved.
* Written by David Howells (dhowells@redhat.com)
*/
#include <linux/kernel.h>
#include <linux/slab.h>
#include "internal.h"
/*
 * Drop a reference on a server list.  A NULL list is ignored.
 *
 * When the usage count reaches zero, afs_unuse_server() is called on the
 * server in each entry and the list is then handed to kfree_rcu().
 */
void afs_put_serverlist(struct afs_net *net, struct afs_server_list *slist)
{
	int idx;

	if (!slist)
		return;
	if (!refcount_dec_and_test(&slist->usage))
		return;

	/* That was the last ref: release every server we were holding. */
	for (idx = 0; idx < slist->nr_servers; idx++)
		afs_unuse_server(net, slist->servers[idx].server,
				 afs_server_trace_unuse_slist);

	kfree_rcu(slist, rcu);
}
/*
 * Build a server list from a VLDB record.
 *
 * Only the VLDB sites whose fs_mask includes the volume's type are considered.
 * A server record is looked up for each of them with afs_lookup_server() and
 * the resulting entries are kept sorted by server UUID, with a server that
 * turns up more than once collapsed into a single entry.  Sites flagged
 * DONTUSE, and sites on the side of an in-progress RO release that we're not
 * using, stay in the list but are marked AFS_SE_EXCLUDED.
 *
 * Lookups that fail with -ENOENT or -ENOMEDIUM merely cause that site to be
 * skipped.
 *
 * Returns the new list with its usage count set to 1, or an ERR_PTR():
 * -ENOMEM if the list can't be allocated, -EDESTADDRREQ if no server ended up
 * in the list, or any other error returned by afs_lookup_server().
 */
struct afs_server_list *afs_alloc_server_list(struct afs_volume *volume,
					      struct key *key,
					      struct afs_vldb_entry *vldb)
{
	struct afs_server_list *slist;
	struct afs_server *server;
	unsigned int type_mask = 1 << volume->type;
	bool use_newrepsites = false;
	int ret = -ENOMEM, nr_servers = 0, newrep = 0, i, j, usable = 0;

	/* Work out if we're going to restrict to NEWREPSITE-marked servers or
	 * not.  If at least one site is marked as NEWREPSITE, then it's likely
	 * that "vos release" is busy updating RO sites.  We cut over from one
	 * to the other when >=50% of the sites have been updated.  Sites that
	 * are in the process of being updated are marked DONTUSE.
	 */
	for (i = 0; i < vldb->nr_servers; i++) {
		if (!(vldb->fs_mask[i] & type_mask))
			continue;
		nr_servers++;
		if (vldb->vlsf_flags[i] & AFS_VLSF_DONTUSE)
			continue;
		usable++;
		if (vldb->vlsf_flags[i] & AFS_VLSF_NEWREPSITE)
			newrep++;
	}

	/* Size the array for every matching site, including the ones that
	 * will end up excluded, skipped or merged as duplicates.
	 */
	slist = kzalloc(struct_size(slist, servers, nr_servers), GFP_KERNEL);
	if (!slist)
		goto error;

	if (newrep) {
		/* Compare 2*newrep against usable rather than newrep against
		 * usable/2 so that integer division can't round the threshold
		 * down when there's an odd number of usable sites (1 updated
		 * out of 3 is only 33% and must stay on the old sites).
		 */
		if (newrep * 2 < usable) {
			slist->ro_replicating = AFS_RO_REPLICATING_USE_OLD;
		} else {
			slist->ro_replicating = AFS_RO_REPLICATING_USE_NEW;
			use_newrepsites = true;
		}
	}

	refcount_set(&slist->usage, 1);
	rwlock_init(&slist->lock);

	/* Make sure a record exists for each server in the list. */
	for (i = 0; i < vldb->nr_servers; i++) {
		unsigned long se_flags = 0;
		bool newrepsite = vldb->vlsf_flags[i] & AFS_VLSF_NEWREPSITE;

		if (!(vldb->fs_mask[i] & type_mask))
			continue;
		if (vldb->vlsf_flags[i] & AFS_VLSF_DONTUSE)
			__set_bit(AFS_SE_EXCLUDED, &se_flags);
		/* Whilst a release is in progress, exclude whichever set of
		 * sites (updated or not yet updated) we decided not to use.
		 */
		if (newrep && (newrepsite ^ use_newrepsites))
			__set_bit(AFS_SE_EXCLUDED, &se_flags);

		server = afs_lookup_server(volume->cell, key, &vldb->fs_server[i],
					   vldb->addr_version[i]);
		if (IS_ERR(server)) {
			ret = PTR_ERR(server);
			if (ret == -ENOENT ||
			    ret == -ENOMEDIUM)
				continue;
			goto error_2;
		}

		/* Insertion-sort by UUID */
		for (j = 0; j < slist->nr_servers; j++)
			if (memcmp(&slist->servers[j].server->uuid,
				   &server->uuid,
				   sizeof(server->uuid)) >= 0)
				break;
		if (j < slist->nr_servers) {
			if (slist->servers[j].server == server) {
				/* Already listed: give back the use that the
				 * lookup just took and keep the existing slot.
				 */
				afs_unuse_server_notime(volume->cell->net, server,
							afs_server_trace_unuse_slist_isort);
				continue;
			}

			/* Open up a gap at slot j for the new entry. */
			memmove(slist->servers + j + 1,
				slist->servers + j,
				(slist->nr_servers - j) * sizeof(struct afs_server_entry));
		}

		slist->servers[j].server = server;
		slist->servers[j].volume = volume;
		slist->servers[j].flags = se_flags;
		slist->servers[j].cb_expires_at = AFS_NO_CB_PROMISE;
		slist->nr_servers++;
	}

	if (slist->nr_servers == 0) {
		ret = -EDESTADDRREQ;
		goto error_2;
	}

	return slist;

error_2:
	/* Dropping the only ref releases the servers gathered so far. */
	afs_put_serverlist(volume->cell->net, slist);
error:
	return ERR_PTR(ret);
}
/*
* Copy the annotations from an old server list to its potential replacement.
*/
bool afs_annotate_server_list(struct afs_server_list *new,
struct afs_server_list *old)
{
unsigned long mask = 1UL << AFS_SE_EXCLUDED;
int i;
if (old->nr_servers != new->nr_servers ||
old->ro_replicating != new->ro_replicating)
goto changed;
for (i = 0; i < old->nr_servers; i++) {
if (old->servers[i].server != new->servers[i].server)
goto changed;
if ((old->servers[i].flags & mask) != (new->servers[i].flags & mask))
goto changed;
}
return false;
changed:
return true;
}
/*
* Attach a volume to the servers it is going to use.
*/
void afs_attach_volume_to_servers(struct afs_volume *volume, struct afs_server_list *slist)
{
struct afs_server_entry *se, *pe;
struct afs_server *server;
struct list_head *p;
unsigned int i;
down_write(&volume->cell->vs_lock);
for (i = 0; i < slist->nr_servers; i++) {
se = &slist->servers[i];
server = se->server;
list_for_each(p, &server->volumes) {
pe = list_entry(p, struct afs_server_entry, slink);
if (volume->vid <= pe->volume->vid)
break;
}
list_add_tail(&se->slink, p);
}
slist->attached = true;
up_write(&volume->cell->vs_lock);
}
/*
 * Reattach a volume to the servers it is going to use when its server list is
 * replaced.  We try to switch the attachment points to avoid rewalking the
 * lists.
 *
 * The two lists are walked in step, comparing servers by UUID:
 *
 *  - a server in both lists has the new entry take over the old entry's place
 *    in the server's volume list and inherit its cb_expires_at;
 *
 *  - a server only in the new list has the new entry inserted into that
 *    server's volume list in volume ID order;
 *
 *  - a server only in the old list has the old entry unlinked.
 *
 * On return, every entry in @new is linked and no entry in @old is, so the
 * attached flags are updated to match; otherwise a later call to
 * afs_detach_volume_from_servers() on @new would be skipped, leaving its
 * entries on the servers' volume lists.
 */
void afs_reattach_volume_to_servers(struct afs_volume *volume, struct afs_server_list *new,
				    struct afs_server_list *old)
{
	unsigned int n = 0, o = 0;

	down_write(&volume->cell->vs_lock);

	while (n < new->nr_servers || o < old->nr_servers) {
		struct afs_server_entry *pn = n < new->nr_servers ? &new->servers[n] : NULL;
		struct afs_server_entry *po = o < old->nr_servers ? &old->servers[o] : NULL;
		struct afs_server_entry *s;
		struct list_head *p;
		int diff;

		if (pn && po && pn->server == po->server) {
			/* Same server: swap the entries over in place. */
			pn->cb_expires_at = po->cb_expires_at;
			list_replace(&po->slink, &pn->slink);
			n++;
			o++;
			continue;
		}

		/* If one list has run out, treat the other as being next. */
		if (pn && po)
			diff = memcmp(&pn->server->uuid, &po->server->uuid,
				      sizeof(pn->server->uuid));
		else
			diff = pn ? -1 : 1;

		if (diff < 0) {
			/* Server is new to the volume: insert by volume ID. */
			list_for_each(p, &pn->server->volumes) {
				s = list_entry(p, struct afs_server_entry, slink);
				if (volume->vid <= s->volume->vid)
					break;
			}
			list_add_tail(&pn->slink, p);
			n++;
		} else {
			/* Server is no longer used by the volume. */
			list_del(&po->slink);
			o++;
		}
	}

	new->attached = true;
	old->attached = false;
	up_write(&volume->cell->vs_lock);
}
/*
 * Detach a volume from the servers it has been using.
 *
 * If the server list is marked attached, each of its entries is unlinked from
 * its server's volume list under the cell's vs_lock and the mark is cleared.
 * A list that isn't marked attached is left untouched.
 */
void afs_detach_volume_from_servers(struct afs_volume *volume, struct afs_server_list *slist)
{
	unsigned int idx = 0;

	if (slist->attached) {
		down_write(&volume->cell->vs_lock);

		while (idx < slist->nr_servers) {
			list_del(&slist->servers[idx].slink);
			idx++;
		}

		slist->attached = false;
		up_write(&volume->cell->vs_lock);
	}
}