linux/include/trace/events/afs.h

1836 lines
58 KiB
C
Raw Permalink Normal View History

/* SPDX-License-Identifier: GPL-2.0-or-later */
/* AFS tracepoints
*
* Copyright (C) 2016 Red Hat, Inc. All Rights Reserved.
* Written by David Howells (dhowells@redhat.com)
*/
#undef TRACE_SYSTEM
#define TRACE_SYSTEM afs
#if !defined(_TRACE_AFS_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_AFS_H
#include <linux/tracepoint.h>
afs: Refcount the afs_call struct A static checker warning occurs in the AFS filesystem: fs/afs/cmservice.c:155 SRXAFSCB_CallBack() error: dereferencing freed memory 'call' due to the reply being sent before we access the server it points to. The act of sending the reply causes the call to be freed if an error occurs (but not if it doesn't). On top of this, the lifetime handling of afs_call structs is fragile because they get passed around through workqueues without any sort of refcounting. Deal with the issues by: (1) Fix the maybe/maybe not nature of the reply sending functions with regards to whether they release the call struct. (2) Refcount the afs_call struct and sort out places that need to get/put references. (3) Pass a ref through the work queue and release (or pass on) that ref in the work function. Care has to be taken because a work queue may already own a ref to the call. (4) Do the cleaning up in the put function only. (5) Simplify module cleanup by always incrementing afs_outstanding_calls whenever a call is allocated. (6) Set the backlog to 0 with kernel_listen() at the beginning of the process of closing the socket to prevent new incoming calls from occurring and to remove the contribution of preallocated calls from afs_outstanding_calls before we wait on it. A tracepoint is also added to monitor the afs_call refcount and lifetime. Reported-by: Dan Carpenter <dan.carpenter@oracle.com> Signed-off-by: David Howells <dhowells@redhat.com> Fixes: 08e0e7c82eea: "[AF_RXRPC]: Make the in-kernel AFS filesystem use AF_RXRPC."
2017-01-05 10:38:36 +00:00
/*
* Define enums for tracing information.
*/
#ifndef __AFS_DECLARE_TRACE_ENUMS_ONCE_ONLY
#define __AFS_DECLARE_TRACE_ENUMS_ONCE_ONLY
enum afs_fs_operation {
afs_FS_FetchData = 130, /* AFS Fetch file data */
afs_FS_FetchACL = 131, /* AFS Fetch file ACL */
afs_FS_FetchStatus = 132, /* AFS Fetch file status */
afs_FS_StoreData = 133, /* AFS Store file data */
afs_FS_StoreACL = 134, /* AFS Store file ACL */
afs_FS_StoreStatus = 135, /* AFS Store file status */
afs_FS_RemoveFile = 136, /* AFS Remove a file */
afs_FS_CreateFile = 137, /* AFS Create a file */
afs_FS_Rename = 138, /* AFS Rename or move a file or directory */
afs_FS_Symlink = 139, /* AFS Create a symbolic link */
afs_FS_Link = 140, /* AFS Create a hard link */
afs_FS_MakeDir = 141, /* AFS Create a directory */
afs_FS_RemoveDir = 142, /* AFS Remove a directory */
afs_FS_GetVolumeInfo = 148, /* AFS Get information about a volume */
afs_FS_GetVolumeStatus = 149, /* AFS Get volume status information */
afs_FS_GetRootVolume = 151, /* AFS Get root volume name */
afs_FS_SetLock = 156, /* AFS Request a file lock */
afs_FS_ExtendLock = 157, /* AFS Extend a file lock */
afs_FS_ReleaseLock = 158, /* AFS Release a file lock */
afs_FS_Lookup = 161, /* AFS lookup file in directory */
afs_FS_InlineBulkStatus = 65536, /* AFS Fetch multiple file statuses with errors */
afs_FS_FetchData64 = 65537, /* AFS Fetch file data */
afs_FS_StoreData64 = 65538, /* AFS Store file data */
afs_FS_GiveUpAllCallBacks = 65539, /* AFS Give up all our callbacks on a server */
afs_FS_GetCapabilities = 65540, /* AFS Get FS server capabilities */
yfs_FS_FetchData = 130, /* YFS Fetch file data */
yfs_FS_FetchACL = 64131, /* YFS Fetch file ACL */
yfs_FS_FetchStatus = 64132, /* YFS Fetch file status */
yfs_FS_StoreACL = 64134, /* YFS Store file ACL */
yfs_FS_StoreStatus = 64135, /* YFS Store file status */
yfs_FS_RemoveFile = 64136, /* YFS Remove a file */
yfs_FS_CreateFile = 64137, /* YFS Create a file */
yfs_FS_Rename = 64138, /* YFS Rename or move a file or directory */
yfs_FS_Symlink = 64139, /* YFS Create a symbolic link */
yfs_FS_Link = 64140, /* YFS Create a hard link */
yfs_FS_MakeDir = 64141, /* YFS Create a directory */
yfs_FS_RemoveDir = 64142, /* YFS Remove a directory */
yfs_FS_GetVolumeStatus = 64149, /* YFS Get volume status information */
yfs_FS_SetVolumeStatus = 64150, /* YFS Set volume status information */
yfs_FS_SetLock = 64156, /* YFS Request a file lock */
yfs_FS_ExtendLock = 64157, /* YFS Extend a file lock */
yfs_FS_ReleaseLock = 64158, /* YFS Release a file lock */
yfs_FS_Lookup = 64161, /* YFS lookup file in directory */
yfs_FS_FlushCPS = 64165,
yfs_FS_FetchOpaqueACL = 64168,
yfs_FS_WhoAmI = 64170,
yfs_FS_RemoveACL = 64171,
yfs_FS_RemoveFile2 = 64173,
yfs_FS_StoreOpaqueACL2 = 64174,
yfs_FS_InlineBulkStatus = 64536, /* YFS Fetch multiple file statuses with errors */
yfs_FS_FetchData64 = 64537, /* YFS Fetch file data */
yfs_FS_StoreData64 = 64538, /* YFS Store file data */
yfs_FS_UpdateSymlink = 64540,
};
enum afs_vl_operation {
afs_VL_GetEntryByNameU = 527, /* AFS Get Vol Entry By Name operation ID */
afs_VL_GetAddrsU = 533, /* AFS Get FS server addresses */
afs_YFSVL_GetEndpoints = 64002, /* YFS Get FS & Vol server addresses */
afs_YFSVL_GetCellName = 64014, /* YFS Get actual cell name */
afs_VL_GetCapabilities = 65537, /* AFS Get VL server capabilities */
};
afs: Fix tracepoint string placement with built-in AFS To quote Alexey[1]: I was adding custom tracepoint to the kernel, grabbed full F34 kernel .config, disabled modules and booted whole shebang as VM kernel. Then did perf record -a -e ... It crashed: general protection fault, probably for non-canonical address 0x435f5346592e4243: 0000 [#1] SMP PTI CPU: 1 PID: 842 Comm: cat Not tainted 5.12.6+ #26 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.14.0-1.fc33 04/01/2014 RIP: 0010:t_show+0x22/0xd0 Then reproducer was narrowed to # cat /sys/kernel/tracing/printk_formats Original F34 kernel with modules didn't crash. So I started to disable options and after disabling AFS everything started working again. The root cause is that AFS was placing char arrays content into a section full of _pointers_ to strings with predictable consequences. Non canonical address 435f5346592e4243 is "CB.YFS_" which came from CM_NAME macro. Steps to reproduce: CONFIG_AFS=y CONFIG_TRACING=y # cat /sys/kernel/tracing/printk_formats Fix this by the following means: (1) Add enum->string translation tables in the event header with the AFS and YFS cache/callback manager operations listed by RPC operation ID. (2) Modify the afs_cb_call tracepoint to print the string from the translation table rather than using the string at the afs_call name pointer. (3) Switch translation table depending on the service we're being accessed as (AFS or YFS) in the tracepoint print clause. Will this cause problems to userspace utilities? Note that the symbolic representation of the YFS service ID isn't available to this header, so I've put it in as a number. I'm not sure if this is the best way to do this. (4) Remove the name wrangling (CM_NAME) macro and put the names directly into the afs_call_type structs in cmservice.c. Fixes: 8e8d7f13b6d5a9 ("afs: Add some tracepoints") Reported-by: Alexey Dobriyan (SK hynix) <adobriyan@gmail.com> Signed-off-by: David Howells <dhowells@redhat.com> Reviewed-by: Steven Rostedt (VMware) <rostedt@goodmis.org> Reviewed-by: Marc Dionne <marc.dionne@auristor.com> cc: Andrew Morton <akpm@linux-foundation.org> cc: linux-afs@lists.infradead.org Link: https://lore.kernel.org/r/YLAXfvZ+rObEOdc%2F@localhost.localdomain/ [1] Link: https://lore.kernel.org/r/643721.1623754699@warthog.procyon.org.uk/ Link: https://lore.kernel.org/r/162430903582.2896199.6098150063997983353.stgit@warthog.procyon.org.uk/ # v1 Link: https://lore.kernel.org/r/162609463957.3133237.15916579353149746363.stgit@warthog.procyon.org.uk/ # v1 (repost) Link: https://lore.kernel.org/r/162610726860.3408253.445207609466288531.stgit@warthog.procyon.org.uk/ # v2
2021-06-15 11:57:26 +01:00
enum afs_cm_operation {
afs_CB_CallBack = 204, /* AFS break callback promises */
afs_CB_InitCallBackState = 205, /* AFS initialise callback state */
afs_CB_Probe = 206, /* AFS probe client */
afs_CB_GetLock = 207, /* AFS get contents of CM lock table */
afs_CB_GetCE = 208, /* AFS get cache file description */
afs_CB_GetXStatsVersion = 209, /* AFS get version of extended statistics */
afs_CB_GetXStats = 210, /* AFS get contents of extended statistics data */
afs_CB_InitCallBackState3 = 213, /* AFS initialise callback state, version 3 */
afs_CB_ProbeUuid = 214, /* AFS check the client hasn't rebooted */
};
enum yfs_cm_operation {
yfs_CB_Probe = 206, /* YFS probe client */
yfs_CB_GetLock = 207, /* YFS get contents of CM lock table */
yfs_CB_XStatsVersion = 209, /* YFS get version of extended statistics */
yfs_CB_GetXStats = 210, /* YFS get contents of extended statistics data */
yfs_CB_InitCallBackState3 = 213, /* YFS initialise callback state, version 3 */
yfs_CB_ProbeUuid = 214, /* YFS check the client hasn't rebooted */
yfs_CB_GetServerPrefs = 215,
yfs_CB_GetCellServDV = 216,
yfs_CB_GetLocalCell = 217,
yfs_CB_GetCacheConfig = 218,
yfs_CB_GetCellByNum = 65537,
yfs_CB_TellMeAboutYourself = 65538, /* get client capabilities */
yfs_CB_CallBack = 64204,
};
afs: Refcount the afs_call struct A static checker warning occurs in the AFS filesystem: fs/afs/cmservice.c:155 SRXAFSCB_CallBack() error: dereferencing freed memory 'call' due to the reply being sent before we access the server it points to. The act of sending the reply causes the call to be freed if an error occurs (but not if it doesn't). On top of this, the lifetime handling of afs_call structs is fragile because they get passed around through workqueues without any sort of refcounting. Deal with the issues by: (1) Fix the maybe/maybe not nature of the reply sending functions with regards to whether they release the call struct. (2) Refcount the afs_call struct and sort out places that need to get/put references. (3) Pass a ref through the work queue and release (or pass on) that ref in the work function. Care has to be taken because a work queue may already own a ref to the call. (4) Do the cleaning up in the put function only. (5) Simplify module cleanup by always incrementing afs_outstanding_calls whenever a call is allocated. (6) Set the backlog to 0 with kernel_listen() at the beginning of the process of closing the socket to prevent new incoming calls from occurring and to remove the contribution of preallocated calls from afs_outstanding_calls before we wait on it. A tracepoint is also added to monitor the afs_call refcount and lifetime. Reported-by: Dan Carpenter <dan.carpenter@oracle.com> Signed-off-by: David Howells <dhowells@redhat.com> Fixes: 08e0e7c82eea: "[AF_RXRPC]: Make the in-kernel AFS filesystem use AF_RXRPC."
2017-01-05 10:38:36 +00:00
#endif /* end __AFS_DECLARE_TRACE_ENUMS_ONCE_ONLY */
/*
* Declare tracing information enums and their string mappings for display.
*/
#define afs_call_traces \
EM(afs_call_trace_alloc, "ALLOC") \
EM(afs_call_trace_async_abort, "ASYAB") \
EM(afs_call_trace_async_kill, "ASYKL") \
afs: Refcount the afs_call struct A static checker warning occurs in the AFS filesystem: fs/afs/cmservice.c:155 SRXAFSCB_CallBack() error: dereferencing freed memory 'call' due to the reply being sent before we access the server it points to. The act of sending the reply causes the call to be freed if an error occurs (but not if it doesn't). On top of this, the lifetime handling of afs_call structs is fragile because they get passed around through workqueues without any sort of refcounting. Deal with the issues by: (1) Fix the maybe/maybe not nature of the reply sending functions with regards to whether they release the call struct. (2) Refcount the afs_call struct and sort out places that need to get/put references. (3) Pass a ref through the work queue and release (or pass on) that ref in the work function. Care has to be taken because a work queue may already own a ref to the call. (4) Do the cleaning up in the put function only. (5) Simplify module cleanup by always incrementing afs_outstanding_calls whenever a call is allocated. (6) Set the backlog to 0 with kernel_listen() at the beginning of the process of closing the socket to prevent new incoming calls from occurring and to remove the contribution of preallocated calls from afs_outstanding_calls before we wait on it. A tracepoint is also added to monitor the afs_call refcount and lifetime. Reported-by: Dan Carpenter <dan.carpenter@oracle.com> Signed-off-by: David Howells <dhowells@redhat.com> Fixes: 08e0e7c82eea: "[AF_RXRPC]: Make the in-kernel AFS filesystem use AF_RXRPC."
2017-01-05 10:38:36 +00:00
EM(afs_call_trace_free, "FREE ") \
EM(afs_call_trace_get, "GET ") \
afs: Refcount the afs_call struct A static checker warning occurs in the AFS filesystem: fs/afs/cmservice.c:155 SRXAFSCB_CallBack() error: dereferencing freed memory 'call' due to the reply being sent before we access the server it points to. The act of sending the reply causes the call to be freed if an error occurs (but not if it doesn't). On top of this, the lifetime handling of afs_call structs is fragile because they get passed around through workqueues without any sort of refcounting. Deal with the issues by: (1) Fix the maybe/maybe not nature of the reply sending functions with regards to whether they release the call struct. (2) Refcount the afs_call struct and sort out places that need to get/put references. (3) Pass a ref through the work queue and release (or pass on) that ref in the work function. Care has to be taken because a work queue may already own a ref to the call. (4) Do the cleaning up in the put function only. (5) Simplify module cleanup by always incrementing afs_outstanding_calls whenever a call is allocated. (6) Set the backlog to 0 with kernel_listen() at the beginning of the process of closing the socket to prevent new incoming calls from occurring and to remove the contribution of preallocated calls from afs_outstanding_calls before we wait on it. A tracepoint is also added to monitor the afs_call refcount and lifetime. Reported-by: Dan Carpenter <dan.carpenter@oracle.com> Signed-off-by: David Howells <dhowells@redhat.com> Fixes: 08e0e7c82eea: "[AF_RXRPC]: Make the in-kernel AFS filesystem use AF_RXRPC."
2017-01-05 10:38:36 +00:00
EM(afs_call_trace_put, "PUT ") \
EM(afs_call_trace_wake, "WAKE ") \
E_(afs_call_trace_work, "QUEUE")
afs: Refcount the afs_call struct A static checker warning occurs in the AFS filesystem: fs/afs/cmservice.c:155 SRXAFSCB_CallBack() error: dereferencing freed memory 'call' due to the reply being sent before we access the server it points to. The act of sending the reply causes the call to be freed if an error occurs (but not if it doesn't). On top of this, the lifetime handling of afs_call structs is fragile because they get passed around through workqueues without any sort of refcounting. Deal with the issues by: (1) Fix the maybe/maybe not nature of the reply sending functions with regards to whether they release the call struct. (2) Refcount the afs_call struct and sort out places that need to get/put references. (3) Pass a ref through the work queue and release (or pass on) that ref in the work function. Care has to be taken because a work queue may already own a ref to the call. (4) Do the cleaning up in the put function only. (5) Simplify module cleanup by always incrementing afs_outstanding_calls whenever a call is allocated. (6) Set the backlog to 0 with kernel_listen() at the beginning of the process of closing the socket to prevent new incoming calls from occurring and to remove the contribution of preallocated calls from afs_outstanding_calls before we wait on it. A tracepoint is also added to monitor the afs_call refcount and lifetime. Reported-by: Dan Carpenter <dan.carpenter@oracle.com> Signed-off-by: David Howells <dhowells@redhat.com> Fixes: 08e0e7c82eea: "[AF_RXRPC]: Make the in-kernel AFS filesystem use AF_RXRPC."
2017-01-05 10:38:36 +00:00
#define afs_server_traces \
EM(afs_server_trace_callback, "CALLBACK ") \
EM(afs_server_trace_destroy, "DESTROY ") \
EM(afs_server_trace_free, "FREE ") \
EM(afs_server_trace_gc, "GC ") \
afs: Actively poll fileservers to maintain NAT or firewall openings When an AFS client accesses a file, it receives a limited-duration callback promise that the server will notify it if another client changes a file. This callback duration can be a few hours in length. If a client mounts a volume and then an application prevents it from being unmounted, say by chdir'ing into it, but then does nothing for some time, the rxrpc_peer record will expire and rxrpc-level keepalive will cease. If there is NAT or a firewall between the client and the server, the route back for the server may close after a comparatively short duration, meaning that attempts by the server to notify the client may then bounce. The client, however, may (so far as it knows) still have a valid unexpired promise and will then rely on its cached data and will not see changes made on the server by a third party until it incidentally rechecks the status or the promise needs renewal. To deal with this, the client needs to regularly probe the server. This has two effects: firstly, it keeps a route open back for the server, and secondly, it causes the server to disgorge any notifications that got queued up because they couldn't be sent. Fix this by adding a mechanism to emit regular probes. Two levels of probing are made available: Under normal circumstances the 'slow' queue will be used for a fileserver - this just probes the preferred address once every 5 mins or so; however, if server fails to respond to any probes, the server will shift to the 'fast' queue from which all its interfaces will be probed every 30s. When it finally responds, the record will switch back to the slow queue. Further notes: (1) Probing is now no longer driven from the fileserver rotation algorithm. (2) Probes are dispatched to all interfaces on a fileserver when that an afs_server object is set up to record it. (3) The afs_server object is removed from the probe queues when we start to probe it. afs_is_probing_server() returns true if it's not listed - ie. it's undergoing probing. (4) The afs_server object is added back on to the probe queue when the final outstanding probe completes, but the probed_at time is set when we're about to launch a probe so that it's not dependent on the probe duration. (5) The timer and the work item added for this must be handed a count on net->servers_outstanding, which they hand on or release. This makes sure that network namespace cleanup waits for them. Fixes: d2ddc776a458 ("afs: Overhaul volume and server record caching and fileserver rotation") Reported-by: Dave Botsch <botsch@cnf.cornell.edu> Signed-off-by: David Howells <dhowells@redhat.com>
2020-04-24 15:10:00 +01:00
EM(afs_server_trace_get_probe, "GET probe") \
EM(afs_server_trace_purging, "PURGE ") \
EM(afs_server_trace_put_cbi, "PUT cbi ") \
afs: Actively poll fileservers to maintain NAT or firewall openings When an AFS client accesses a file, it receives a limited-duration callback promise that the server will notify it if another client changes a file. This callback duration can be a few hours in length. If a client mounts a volume and then an application prevents it from being unmounted, say by chdir'ing into it, but then does nothing for some time, the rxrpc_peer record will expire and rxrpc-level keepalive will cease. If there is NAT or a firewall between the client and the server, the route back for the server may close after a comparatively short duration, meaning that attempts by the server to notify the client may then bounce. The client, however, may (so far as it knows) still have a valid unexpired promise and will then rely on its cached data and will not see changes made on the server by a third party until it incidentally rechecks the status or the promise needs renewal. To deal with this, the client needs to regularly probe the server. This has two effects: firstly, it keeps a route open back for the server, and secondly, it causes the server to disgorge any notifications that got queued up because they couldn't be sent. Fix this by adding a mechanism to emit regular probes. Two levels of probing are made available: Under normal circumstances the 'slow' queue will be used for a fileserver - this just probes the preferred address once every 5 mins or so; however, if server fails to respond to any probes, the server will shift to the 'fast' queue from which all its interfaces will be probed every 30s. When it finally responds, the record will switch back to the slow queue. Further notes: (1) Probing is now no longer driven from the fileserver rotation algorithm. (2) Probes are dispatched to all interfaces on a fileserver when that an afs_server object is set up to record it. (3) The afs_server object is removed from the probe queues when we start to probe it. afs_is_probing_server() returns true if it's not listed - ie. it's undergoing probing. (4) The afs_server object is added back on to the probe queue when the final outstanding probe completes, but the probed_at time is set when we're about to launch a probe so that it's not dependent on the probe duration. (5) The timer and the work item added for this must be handed a count on net->servers_outstanding, which they hand on or release. This makes sure that network namespace cleanup waits for them. Fixes: d2ddc776a458 ("afs: Overhaul volume and server record caching and fileserver rotation") Reported-by: Dave Botsch <botsch@cnf.cornell.edu> Signed-off-by: David Howells <dhowells@redhat.com>
2020-04-24 15:10:00 +01:00
EM(afs_server_trace_put_probe, "PUT probe") \
afs: Fix afs_server ref accounting The current way that afs_server refs are accounted and cleaned up sometimes cause rmmod to hang when it is waiting for cell records to be removed. The problem is that the cell cleanup might occasionally happen before the server cleanup and then there's nothing that causes the cell to garbage-collect the remaining servers as they become inactive. Partially fix this by: (1) Give each afs_server record its own management timer that rather than relying on the cell manager's central timer to drive each individual cell's maintenance work item to garbage collect servers. This timer is set when afs_unuse_server() reduces a server's activity count to zero and will schedule the server's destroyer work item upon firing. (2) Give each afs_server record its own destroyer work item that removes the record from the cell's database, shuts down the timer, cancels any pending work for itself, sends an RPC to the server to cancel outstanding callbacks. This change, in combination with the timer, obviates the need to try and coordinate so closely between the cell record and a bunch of other server records to try and tear everything down in a coordinated fashion. With this, the cell record is pinned until the server RCU is complete and namespace/module removal will wait until all the cell records are removed. (3) Now that incoming calls are mapped to servers (and thus cells) using data attached to an rxrpc_peer, the UUID-to-server mapping tree is moved from the namespace to the cell (cell->fs_servers). This means there can no longer be duplicates therein - and that allows the mapping tree to be simpler as there doesn't need to be a chain of same-UUID servers that are in different cells. (4) The lock protecting the UUID mapping tree is switched to an rw_semaphore on the cell rather than a seqlock on the namespace as it's now only used during mounting in contexts in which we're allowed to sleep. (5) When it comes time for a cell that is being removed to purge its set of servers, it just needs to iterate over them and wake them up. Once a server becomes inactive, its destroyer work item will observe the state of the cell and immediately remove that record. (6) When a server record is removed, it is marked AFS_SERVER_FL_EXPIRED to prevent reattempts at removal. The record will be dispatched to RCU for destruction once its refcount reaches 0. (7) The AFS_SERVER_FL_UNCREATED/CREATING flags are used to synchronise simultaneous creation attempts. If one attempt fails, it will abandon the attempt and allow another to try again. Note that the record can't just be abandoned when dead as it's bound into a server list attached to a volume and only subject to replacement if the server list obtained for the volume from the VLDB changes. Signed-off-by: David Howells <dhowells@redhat.com> cc: Marc Dionne <marc.dionne@auristor.com> cc: linux-afs@lists.infradead.org cc: linux-fsdevel@vger.kernel.org Link: https://lore.kernel.org/r/20250224234154.2014840-15-dhowells@redhat.com/ # v1 Link: https://lore.kernel.org/r/20250310094206.801057-11-dhowells@redhat.com/ # v4
2025-02-24 16:51:36 +00:00
EM(afs_server_trace_see_destroyer, "SEE destr") \
EM(afs_server_trace_see_expired, "SEE expd ") \
afs: Fix afs_server ref accounting The current way that afs_server refs are accounted and cleaned up sometimes cause rmmod to hang when it is waiting for cell records to be removed. The problem is that the cell cleanup might occasionally happen before the server cleanup and then there's nothing that causes the cell to garbage-collect the remaining servers as they become inactive. Partially fix this by: (1) Give each afs_server record its own management timer that rather than relying on the cell manager's central timer to drive each individual cell's maintenance work item to garbage collect servers. This timer is set when afs_unuse_server() reduces a server's activity count to zero and will schedule the server's destroyer work item upon firing. (2) Give each afs_server record its own destroyer work item that removes the record from the cell's database, shuts down the timer, cancels any pending work for itself, sends an RPC to the server to cancel outstanding callbacks. This change, in combination with the timer, obviates the need to try and coordinate so closely between the cell record and a bunch of other server records to try and tear everything down in a coordinated fashion. With this, the cell record is pinned until the server RCU is complete and namespace/module removal will wait until all the cell records are removed. (3) Now that incoming calls are mapped to servers (and thus cells) using data attached to an rxrpc_peer, the UUID-to-server mapping tree is moved from the namespace to the cell (cell->fs_servers). This means there can no longer be duplicates therein - and that allows the mapping tree to be simpler as there doesn't need to be a chain of same-UUID servers that are in different cells. (4) The lock protecting the UUID mapping tree is switched to an rw_semaphore on the cell rather than a seqlock on the namespace as it's now only used during mounting in contexts in which we're allowed to sleep. (5) When it comes time for a cell that is being removed to purge its set of servers, it just needs to iterate over them and wake them up. Once a server becomes inactive, its destroyer work item will observe the state of the cell and immediately remove that record. (6) When a server record is removed, it is marked AFS_SERVER_FL_EXPIRED to prevent reattempts at removal. The record will be dispatched to RCU for destruction once its refcount reaches 0. (7) The AFS_SERVER_FL_UNCREATED/CREATING flags are used to synchronise simultaneous creation attempts. If one attempt fails, it will abandon the attempt and allow another to try again. Note that the record can't just be abandoned when dead as it's bound into a server list attached to a volume and only subject to replacement if the server list obtained for the volume from the VLDB changes. Signed-off-by: David Howells <dhowells@redhat.com> cc: Marc Dionne <marc.dionne@auristor.com> cc: linux-afs@lists.infradead.org cc: linux-fsdevel@vger.kernel.org Link: https://lore.kernel.org/r/20250224234154.2014840-15-dhowells@redhat.com/ # v1 Link: https://lore.kernel.org/r/20250310094206.801057-11-dhowells@redhat.com/ # v4
2025-02-24 16:51:36 +00:00
EM(afs_server_trace_see_purge, "SEE purge") \
EM(afs_server_trace_see_timer, "SEE timer") \
EM(afs_server_trace_unuse_call, "UNU call ") \
EM(afs_server_trace_unuse_create_fail, "UNU cfail") \
EM(afs_server_trace_unuse_slist, "UNU slist") \
EM(afs_server_trace_unuse_slist_isort, "UNU isort") \
EM(afs_server_trace_update, "UPDATE ") \
EM(afs_server_trace_use_by_uuid, "USE uuid ") \
EM(afs_server_trace_use_cm_call, "USE cm-cl") \
EM(afs_server_trace_use_get_caps, "USE gcaps") \
EM(afs_server_trace_use_give_up_cb, "USE gvupc") \
afs: Simplify cell record handling Simplify afs_cell record handling to avoid very occasional races that cause module removal to hang (it waits for all cell records to be removed). There are two things that particularly contribute to the difficulty: firstly, the code tries to pass a ref on the cell to the cell's maintenance work item (which gets awkward if the work item is already queued); and, secondly, there's an overall cell manager that tries to use just one timer for the entire cell collection (to avoid having loads of timers). However, both of these are probably unnecessarily restrictive. To simplify this, the following changes are made: (1) The cell record collection manager is removed. Each cell record manages itself individually. (2) Each afs_cell is given a second work item (cell->destroyer) that is queued when its refcount reaches zero. This is not done in the context of the putting thread as it might be in an inconvenient place to sleep. (3) Each afs_cell is given its own timer. The timer is used to expire the cell record after a period of unuse if not otherwise pinned and can also be used for other maintenance tasks if necessary (of which there are currently none as DNS refresh is triggered by filesystem operations). (4) The afs_cell manager work item (cell->manager) is no longer given a ref on the cell when queued; rather, the manager must be deleted. This does away with the need to deal with the consequences of losing a race to queue cell->manager. Clean up of extra queuing is deferred to the destroyer. (5) The cell destroyer work item makes sure the cell timer is removed and that the normal cell work is cancelled before farming the actual destruction off to RCU. (6) When a network namespace is destroyed or the kafs module is unloaded, it's now a simple matter of marking the namespace as dead then just waking up all the cell work items. They will then remove and destroy themselves once all remaining activity counts and/or a ref counts are dropped. This makes sure that all server records are dropped first. (7) The cell record state set is reduced to just four states: SETTING_UP, ACTIVE, REMOVING and DEAD. The record persists in the active state even when it's not being used until the time comes to remove it rather than downgrading it to an inactive state from whence it can be restored. This means that the cell still appears in /proc and /afs when not in use until it switches to the REMOVING state - at which point it is removed. Note that the REMOVING state is included so that someone wanting to resurrect the cell record is forced to wait whilst the cell is torn down in that state. Once it's in the DEAD state, it has been removed from net->cells tree and is no longer findable and can be replaced. Signed-off-by: David Howells <dhowells@redhat.com> cc: Marc Dionne <marc.dionne@auristor.com> cc: linux-afs@lists.infradead.org cc: linux-fsdevel@vger.kernel.org Link: https://lore.kernel.org/r/20250224234154.2014840-16-dhowells@redhat.com/ # v1 Link: https://lore.kernel.org/r/20250310094206.801057-12-dhowells@redhat.com/ # v4
2025-02-24 16:06:03 +00:00
EM(afs_server_trace_use_install, "USE inst ") \
E_(afs_server_trace_wait_create, "WAIT crt ")
#define afs_volume_traces \
EM(afs_volume_trace_alloc, "ALLOC ") \
EM(afs_volume_trace_free, "FREE ") \
EM(afs_volume_trace_get_alloc_sbi, "GET sbi-alloc ") \
EM(afs_volume_trace_get_callback, "GET callback ") \
EM(afs_volume_trace_get_cell_insert, "GET cell-insrt") \
EM(afs_volume_trace_get_new_op, "GET op-new ") \
EM(afs_volume_trace_get_query_alias, "GET cell-alias") \
EM(afs_volume_trace_put_callback, "PUT callback ") \
EM(afs_volume_trace_put_cell_dup, "PUT cell-dup ") \
EM(afs_volume_trace_put_cell_root, "PUT cell-root ") \
EM(afs_volume_trace_put_destroy_sbi, "PUT sbi-destry") \
EM(afs_volume_trace_put_free_fc, "PUT fc-free ") \
EM(afs_volume_trace_put_put_op, "PUT op-put ") \
EM(afs_volume_trace_put_query_alias, "PUT cell-alias") \
EM(afs_volume_trace_put_validate_fc, "PUT fc-validat") \
E_(afs_volume_trace_remove, "REMOVE ")
#define afs_cell_traces \
EM(afs_cell_trace_alloc, "ALLOC ") \
afs: Simplify cell record handling Simplify afs_cell record handling to avoid very occasional races that cause module removal to hang (it waits for all cell records to be removed). There are two things that particularly contribute to the difficulty: firstly, the code tries to pass a ref on the cell to the cell's maintenance work item (which gets awkward if the work item is already queued); and, secondly, there's an overall cell manager that tries to use just one timer for the entire cell collection (to avoid having loads of timers). However, both of these are probably unnecessarily restrictive. To simplify this, the following changes are made: (1) The cell record collection manager is removed. Each cell record manages itself individually. (2) Each afs_cell is given a second work item (cell->destroyer) that is queued when its refcount reaches zero. This is not done in the context of the putting thread as it might be in an inconvenient place to sleep. (3) Each afs_cell is given its own timer. The timer is used to expire the cell record after a period of unuse if not otherwise pinned and can also be used for other maintenance tasks if necessary (of which there are currently none as DNS refresh is triggered by filesystem operations). (4) The afs_cell manager work item (cell->manager) is no longer given a ref on the cell when queued; rather, the manager must be deleted. This does away with the need to deal with the consequences of losing a race to queue cell->manager. Clean up of extra queuing is deferred to the destroyer. (5) The cell destroyer work item makes sure the cell timer is removed and that the normal cell work is cancelled before farming the actual destruction off to RCU. (6) When a network namespace is destroyed or the kafs module is unloaded, it's now a simple matter of marking the namespace as dead then just waking up all the cell work items. They will then remove and destroy themselves once all remaining activity counts and/or a ref counts are dropped. This makes sure that all server records are dropped first. (7) The cell record state set is reduced to just four states: SETTING_UP, ACTIVE, REMOVING and DEAD. The record persists in the active state even when it's not being used until the time comes to remove it rather than downgrading it to an inactive state from whence it can be restored. This means that the cell still appears in /proc and /afs when not in use until it switches to the REMOVING state - at which point it is removed. Note that the REMOVING state is included so that someone wanting to resurrect the cell record is forced to wait whilst the cell is torn down in that state. Once it's in the DEAD state, it has been removed from net->cells tree and is no longer findable and can be replaced. Signed-off-by: David Howells <dhowells@redhat.com> cc: Marc Dionne <marc.dionne@auristor.com> cc: linux-afs@lists.infradead.org cc: linux-fsdevel@vger.kernel.org Link: https://lore.kernel.org/r/20250224234154.2014840-16-dhowells@redhat.com/ # v1 Link: https://lore.kernel.org/r/20250310094206.801057-12-dhowells@redhat.com/ # v4
2025-02-24 16:06:03 +00:00
EM(afs_cell_trace_destroy, "DESTROY ") \
EM(afs_cell_trace_free, "FREE ") \
EM(afs_cell_trace_get_atcell, "GET atcell") \
EM(afs_cell_trace_get_server, "GET server") \
EM(afs_cell_trace_get_vol, "GET vol ") \
afs: Simplify cell record handling Simplify afs_cell record handling to avoid very occasional races that cause module removal to hang (it waits for all cell records to be removed). There are two things that particularly contribute to the difficulty: firstly, the code tries to pass a ref on the cell to the cell's maintenance work item (which gets awkward if the work item is already queued); and, secondly, there's an overall cell manager that tries to use just one timer for the entire cell collection (to avoid having loads of timers). However, both of these are probably unnecessarily restrictive. To simplify this, the following changes are made: (1) The cell record collection manager is removed. Each cell record manages itself individually. (2) Each afs_cell is given a second work item (cell->destroyer) that is queued when its refcount reaches zero. This is not done in the context of the putting thread as it might be in an inconvenient place to sleep. (3) Each afs_cell is given its own timer. The timer is used to expire the cell record after a period of unuse if not otherwise pinned and can also be used for other maintenance tasks if necessary (of which there are currently none as DNS refresh is triggered by filesystem operations). (4) The afs_cell manager work item (cell->manager) is no longer given a ref on the cell when queued; rather, the manager must be deleted. This does away with the need to deal with the consequences of losing a race to queue cell->manager. Clean up of extra queuing is deferred to the destroyer. (5) The cell destroyer work item makes sure the cell timer is removed and that the normal cell work is cancelled before farming the actual destruction off to RCU. (6) When a network namespace is destroyed or the kafs module is unloaded, it's now a simple matter of marking the namespace as dead then just waking up all the cell work items. They will then remove and destroy themselves once all remaining activity counts and/or a ref counts are dropped. This makes sure that all server records are dropped first. (7) The cell record state set is reduced to just four states: SETTING_UP, ACTIVE, REMOVING and DEAD. The record persists in the active state even when it's not being used until the time comes to remove it rather than downgrading it to an inactive state from whence it can be restored. This means that the cell still appears in /proc and /afs when not in use until it switches to the REMOVING state - at which point it is removed. Note that the REMOVING state is included so that someone wanting to resurrect the cell record is forced to wait whilst the cell is torn down in that state. Once it's in the DEAD state, it has been removed from net->cells tree and is no longer findable and can be replaced. Signed-off-by: David Howells <dhowells@redhat.com> cc: Marc Dionne <marc.dionne@auristor.com> cc: linux-afs@lists.infradead.org cc: linux-fsdevel@vger.kernel.org Link: https://lore.kernel.org/r/20250224234154.2014840-16-dhowells@redhat.com/ # v1 Link: https://lore.kernel.org/r/20250310094206.801057-12-dhowells@redhat.com/ # v4
2025-02-24 16:06:03 +00:00
EM(afs_cell_trace_purge, "PURGE ") \
EM(afs_cell_trace_put_atcell, "PUT atcell") \
EM(afs_cell_trace_put_candidate, "PUT candid") \
afs: Simplify cell record handling Simplify afs_cell record handling to avoid very occasional races that cause module removal to hang (it waits for all cell records to be removed). There are two things that particularly contribute to the difficulty: firstly, the code tries to pass a ref on the cell to the cell's maintenance work item (which gets awkward if the work item is already queued); and, secondly, there's an overall cell manager that tries to use just one timer for the entire cell collection (to avoid having loads of timers). However, both of these are probably unnecessarily restrictive. To simplify this, the following changes are made: (1) The cell record collection manager is removed. Each cell record manages itself individually. (2) Each afs_cell is given a second work item (cell->destroyer) that is queued when its refcount reaches zero. This is not done in the context of the putting thread as it might be in an inconvenient place to sleep. (3) Each afs_cell is given its own timer. The timer is used to expire the cell record after a period of unuse if not otherwise pinned and can also be used for other maintenance tasks if necessary (of which there are currently none as DNS refresh is triggered by filesystem operations). (4) The afs_cell manager work item (cell->manager) is no longer given a ref on the cell when queued; rather, the manager must be deleted. This does away with the need to deal with the consequences of losing a race to queue cell->manager. Clean up of extra queuing is deferred to the destroyer. (5) The cell destroyer work item makes sure the cell timer is removed and that the normal cell work is cancelled before farming the actual destruction off to RCU. (6) When a network namespace is destroyed or the kafs module is unloaded, it's now a simple matter of marking the namespace as dead then just waking up all the cell work items. They will then remove and destroy themselves once all remaining activity counts and/or a ref counts are dropped. This makes sure that all server records are dropped first. (7) The cell record state set is reduced to just four states: SETTING_UP, ACTIVE, REMOVING and DEAD. The record persists in the active state even when it's not being used until the time comes to remove it rather than downgrading it to an inactive state from whence it can be restored. This means that the cell still appears in /proc and /afs when not in use until it switches to the REMOVING state - at which point it is removed. Note that the REMOVING state is included so that someone wanting to resurrect the cell record is forced to wait whilst the cell is torn down in that state. Once it's in the DEAD state, it has been removed from net->cells tree and is no longer findable and can be replaced. Signed-off-by: David Howells <dhowells@redhat.com> cc: Marc Dionne <marc.dionne@auristor.com> cc: linux-afs@lists.infradead.org cc: linux-fsdevel@vger.kernel.org Link: https://lore.kernel.org/r/20250224234154.2014840-16-dhowells@redhat.com/ # v1 Link: https://lore.kernel.org/r/20250310094206.801057-12-dhowells@redhat.com/ # v4
2025-02-24 16:06:03 +00:00
EM(afs_cell_trace_put_final, "PUT final ") \
EM(afs_cell_trace_put_server, "PUT server") \
EM(afs_cell_trace_put_vol, "PUT vol ") \
afs: Simplify cell record handling Simplify afs_cell record handling to avoid very occasional races that cause module removal to hang (it waits for all cell records to be removed). There are two things that particularly contribute to the difficulty: firstly, the code tries to pass a ref on the cell to the cell's maintenance work item (which gets awkward if the work item is already queued); and, secondly, there's an overall cell manager that tries to use just one timer for the entire cell collection (to avoid having loads of timers). However, both of these are probably unnecessarily restrictive. To simplify this, the following changes are made: (1) The cell record collection manager is removed. Each cell record manages itself individually. (2) Each afs_cell is given a second work item (cell->destroyer) that is queued when its refcount reaches zero. This is not done in the context of the putting thread as it might be in an inconvenient place to sleep. (3) Each afs_cell is given its own timer. The timer is used to expire the cell record after a period of unuse if not otherwise pinned and can also be used for other maintenance tasks if necessary (of which there are currently none as DNS refresh is triggered by filesystem operations). (4) The afs_cell manager work item (cell->manager) is no longer given a ref on the cell when queued; rather, the manager must be deleted. This does away with the need to deal with the consequences of losing a race to queue cell->manager. Clean up of extra queuing is deferred to the destroyer. (5) The cell destroyer work item makes sure the cell timer is removed and that the normal cell work is cancelled before farming the actual destruction off to RCU. (6) When a network namespace is destroyed or the kafs module is unloaded, it's now a simple matter of marking the namespace as dead then just waking up all the cell work items. They will then remove and destroy themselves once all remaining activity counts and/or a ref counts are dropped. This makes sure that all server records are dropped first. (7) The cell record state set is reduced to just four states: SETTING_UP, ACTIVE, REMOVING and DEAD. The record persists in the active state even when it's not being used until the time comes to remove it rather than downgrading it to an inactive state from whence it can be restored. This means that the cell still appears in /proc and /afs when not in use until it switches to the REMOVING state - at which point it is removed. Note that the REMOVING state is included so that someone wanting to resurrect the cell record is forced to wait whilst the cell is torn down in that state. Once it's in the DEAD state, it has been removed from net->cells tree and is no longer findable and can be replaced. Signed-off-by: David Howells <dhowells@redhat.com> cc: Marc Dionne <marc.dionne@auristor.com> cc: linux-afs@lists.infradead.org cc: linux-fsdevel@vger.kernel.org Link: https://lore.kernel.org/r/20250224234154.2014840-16-dhowells@redhat.com/ # v1 Link: https://lore.kernel.org/r/20250310094206.801057-12-dhowells@redhat.com/ # v4
2025-02-24 16:06:03 +00:00
EM(afs_cell_trace_queue_again, "QUE again ") \
EM(afs_cell_trace_queue_dns, "QUE dns ") \
EM(afs_cell_trace_queue_new, "QUE new ") \
EM(afs_cell_trace_queue_purge, "QUE purge ") \
EM(afs_cell_trace_manage, "MANAGE ") \
EM(afs_cell_trace_managed, "MANAGED ") \
EM(afs_cell_trace_see_source, "SEE source") \
afs: Simplify cell record handling Simplify afs_cell record handling to avoid very occasional races that cause module removal to hang (it waits for all cell records to be removed). There are two things that particularly contribute to the difficulty: firstly, the code tries to pass a ref on the cell to the cell's maintenance work item (which gets awkward if the work item is already queued); and, secondly, there's an overall cell manager that tries to use just one timer for the entire cell collection (to avoid having loads of timers). However, both of these are probably unnecessarily restrictive. To simplify this, the following changes are made: (1) The cell record collection manager is removed. Each cell record manages itself individually. (2) Each afs_cell is given a second work item (cell->destroyer) that is queued when its refcount reaches zero. This is not done in the context of the putting thread as it might be in an inconvenient place to sleep. (3) Each afs_cell is given its own timer. The timer is used to expire the cell record after a period of unuse if not otherwise pinned and can also be used for other maintenance tasks if necessary (of which there are currently none as DNS refresh is triggered by filesystem operations). (4) The afs_cell manager work item (cell->manager) is no longer given a ref on the cell when queued; rather, the manager must be deleted. This does away with the need to deal with the consequences of losing a race to queue cell->manager. Clean up of extra queuing is deferred to the destroyer. (5) The cell destroyer work item makes sure the cell timer is removed and that the normal cell work is cancelled before farming the actual destruction off to RCU. (6) When a network namespace is destroyed or the kafs module is unloaded, it's now a simple matter of marking the namespace as dead then just waking up all the cell work items. They will then remove and destroy themselves once all remaining activity counts and/or a ref counts are dropped. This makes sure that all server records are dropped first. (7) The cell record state set is reduced to just four states: SETTING_UP, ACTIVE, REMOVING and DEAD. The record persists in the active state even when it's not being used until the time comes to remove it rather than downgrading it to an inactive state from whence it can be restored. This means that the cell still appears in /proc and /afs when not in use until it switches to the REMOVING state - at which point it is removed. Note that the REMOVING state is included so that someone wanting to resurrect the cell record is forced to wait whilst the cell is torn down in that state. Once it's in the DEAD state, it has been removed from net->cells tree and is no longer findable and can be replaced. Signed-off-by: David Howells <dhowells@redhat.com> cc: Marc Dionne <marc.dionne@auristor.com> cc: linux-afs@lists.infradead.org cc: linux-fsdevel@vger.kernel.org Link: https://lore.kernel.org/r/20250224234154.2014840-16-dhowells@redhat.com/ # v1 Link: https://lore.kernel.org/r/20250310094206.801057-12-dhowells@redhat.com/ # v4
2025-02-24 16:06:03 +00:00
EM(afs_cell_trace_see_mgmt_timer, "SEE mtimer") \
EM(afs_cell_trace_unuse_alias, "UNU alias ") \
EM(afs_cell_trace_unuse_check_alias, "UNU chk-al") \
EM(afs_cell_trace_unuse_delete, "UNU delete") \
afs: Change dynroot to create contents on demand Change the AFS dynamic root to do things differently: (1) Rather than having the creation of cell records create inodes and dentries for cell mountpoints, create them on demand during lookup. This simplifies cell management and locking as we no longer have to create these objects in advance *and* on speculative lookup by the user for a cell that isn't precreated. (2) Rather than using the libfs dentry-based readdir (the dentries now no longer exist until accessed from (1)), have readdir generate the contents by reading the list of cells. The @cell symlinks get pushed in positions 2 and 3 if rootcell has been configured. (3) Make the @cell symlink dentries persist for the life of the superblock or until reclaimed, but make cell mountpoints disappear immediately if unused. It's not perfect as someone doing an "ls -l /afs" may create a whole bunch of dentries which will be garbage collected immediately. But any dentry that gets automounted will be pinned by the mount, so it shouldn't be too bad. (4) Allocate the inode numbers for the cell mountpoints from an IDR to prevent duplicates appearing in the event it cycles round. The number allocated from the IDR is doubled to provide two inode numbers - one for the normal cell name (RO) and one for the dotted cell name (RW). Signed-off-by: David Howells <dhowells@redhat.com> cc: Marc Dionne <marc.dionne@auristor.com> cc: linux-afs@lists.infradead.org cc: linux-fsdevel@vger.kernel.org Link: https://lore.kernel.org/r/20250224234154.2014840-8-dhowells@redhat.com/ # v1 Link: https://lore.kernel.org/r/20250310094206.801057-4-dhowells@redhat.com/ # v4
2025-02-24 09:52:58 +00:00
EM(afs_cell_trace_unuse_dynroot_mntpt, "UNU dyn-mp") \
EM(afs_cell_trace_unuse_fc, "UNU fc ") \
afs: Change dynroot to create contents on demand Change the AFS dynamic root to do things differently: (1) Rather than having the creation of cell records create inodes and dentries for cell mountpoints, create them on demand during lookup. This simplifies cell management and locking as we no longer have to create these objects in advance *and* on speculative lookup by the user for a cell that isn't precreated. (2) Rather than using the libfs dentry-based readdir (the dentries now no longer exist until accessed from (1)), have readdir generate the contents by reading the list of cells. The @cell symlinks get pushed in positions 2 and 3 if rootcell has been configured. (3) Make the @cell symlink dentries persist for the life of the superblock or until reclaimed, but make cell mountpoints disappear immediately if unused. It's not perfect as someone doing an "ls -l /afs" may create a whole bunch of dentries which will be garbage collected immediately. But any dentry that gets automounted will be pinned by the mount, so it shouldn't be too bad. (4) Allocate the inode numbers for the cell mountpoints from an IDR to prevent duplicates appearing in the event it cycles round. The number allocated from the IDR is doubled to provide two inode numbers - one for the normal cell name (RO) and one for the dotted cell name (RW). Signed-off-by: David Howells <dhowells@redhat.com> cc: Marc Dionne <marc.dionne@auristor.com> cc: linux-afs@lists.infradead.org cc: linux-fsdevel@vger.kernel.org Link: https://lore.kernel.org/r/20250224234154.2014840-8-dhowells@redhat.com/ # v1 Link: https://lore.kernel.org/r/20250310094206.801057-4-dhowells@redhat.com/ # v4
2025-02-24 09:52:58 +00:00
EM(afs_cell_trace_unuse_lookup_dynroot, "UNU lu-dyn") \
EM(afs_cell_trace_unuse_lookup_error, "UNU lu-err") \
EM(afs_cell_trace_unuse_mntpt, "UNU mntpt ") \
EM(afs_cell_trace_unuse_no_pin, "UNU no-pin") \
EM(afs_cell_trace_unuse_parse, "UNU parse ") \
EM(afs_cell_trace_unuse_pin, "UNU pin ") \
EM(afs_cell_trace_unuse_sbi, "UNU sbi ") \
EM(afs_cell_trace_unuse_ws, "UNU ws ") \
EM(afs_cell_trace_use_alias, "USE alias ") \
EM(afs_cell_trace_use_check_alias, "USE chk-al") \
EM(afs_cell_trace_use_fc, "USE fc ") \
EM(afs_cell_trace_use_fc_alias, "USE fc-al ") \
EM(afs_cell_trace_use_lookup_add, "USE lu-add") \
EM(afs_cell_trace_use_lookup_canonical, "USE lu-can") \
EM(afs_cell_trace_use_lookup_dynroot, "USE lu-dyn") \
EM(afs_cell_trace_use_lookup_mntpt, "USE lu-mpt") \
EM(afs_cell_trace_use_lookup_mount, "USE lu-mnt") \
EM(afs_cell_trace_use_lookup_ws, "USE lu-ws ") \
EM(afs_cell_trace_use_mntpt, "USE mntpt ") \
EM(afs_cell_trace_use_pin, "USE pin ") \
EM(afs_cell_trace_use_probe, "USE probe ") \
EM(afs_cell_trace_use_sbi, "USE sbi ") \
E_(afs_cell_trace_wait, "WAIT ")
#define afs_alist_traces \
EM(afs_alist_trace_alloc, "ALLOC ") \
EM(afs_alist_trace_get_estate, "GET estate") \
EM(afs_alist_trace_get_vlgetcaps, "GET vgtcap") \
EM(afs_alist_trace_get_vlprobe, "GET vprobe") \
EM(afs_alist_trace_get_vlrotate_set, "GET vl-rot") \
EM(afs_alist_trace_put_estate, "PUT estate") \
EM(afs_alist_trace_put_getaddru, "PUT GtAdrU") \
EM(afs_alist_trace_put_parse_empty, "PUT p-empt") \
EM(afs_alist_trace_put_parse_error, "PUT p-err ") \
afs: Fix afs_server ref accounting The current way that afs_server refs are accounted and cleaned up sometimes cause rmmod to hang when it is waiting for cell records to be removed. The problem is that the cell cleanup might occasionally happen before the server cleanup and then there's nothing that causes the cell to garbage-collect the remaining servers as they become inactive. Partially fix this by: (1) Give each afs_server record its own management timer that rather than relying on the cell manager's central timer to drive each individual cell's maintenance work item to garbage collect servers. This timer is set when afs_unuse_server() reduces a server's activity count to zero and will schedule the server's destroyer work item upon firing. (2) Give each afs_server record its own destroyer work item that removes the record from the cell's database, shuts down the timer, cancels any pending work for itself, sends an RPC to the server to cancel outstanding callbacks. This change, in combination with the timer, obviates the need to try and coordinate so closely between the cell record and a bunch of other server records to try and tear everything down in a coordinated fashion. With this, the cell record is pinned until the server RCU is complete and namespace/module removal will wait until all the cell records are removed. (3) Now that incoming calls are mapped to servers (and thus cells) using data attached to an rxrpc_peer, the UUID-to-server mapping tree is moved from the namespace to the cell (cell->fs_servers). This means there can no longer be duplicates therein - and that allows the mapping tree to be simpler as there doesn't need to be a chain of same-UUID servers that are in different cells. (4) The lock protecting the UUID mapping tree is switched to an rw_semaphore on the cell rather than a seqlock on the namespace as it's now only used during mounting in contexts in which we're allowed to sleep. (5) When it comes time for a cell that is being removed to purge its set of servers, it just needs to iterate over them and wake them up. Once a server becomes inactive, its destroyer work item will observe the state of the cell and immediately remove that record. (6) When a server record is removed, it is marked AFS_SERVER_FL_EXPIRED to prevent reattempts at removal. The record will be dispatched to RCU for destruction once its refcount reaches 0. (7) The AFS_SERVER_FL_UNCREATED/CREATING flags are used to synchronise simultaneous creation attempts. If one attempt fails, it will abandon the attempt and allow another to try again. Note that the record can't just be abandoned when dead as it's bound into a server list attached to a volume and only subject to replacement if the server list obtained for the volume from the VLDB changes. Signed-off-by: David Howells <dhowells@redhat.com> cc: Marc Dionne <marc.dionne@auristor.com> cc: linux-afs@lists.infradead.org cc: linux-fsdevel@vger.kernel.org Link: https://lore.kernel.org/r/20250224234154.2014840-15-dhowells@redhat.com/ # v1 Link: https://lore.kernel.org/r/20250310094206.801057-11-dhowells@redhat.com/ # v4
2025-02-24 16:51:36 +00:00
EM(afs_alist_trace_put_server_create, "PUT sv-crt") \
EM(afs_alist_trace_put_server_oom, "PUT sv-oom") \
EM(afs_alist_trace_put_server_update, "PUT sv-upd") \
EM(afs_alist_trace_put_vlgetcaps, "PUT vgtcap") \
EM(afs_alist_trace_put_vlprobe, "PUT vprobe") \
EM(afs_alist_trace_put_vlrotate_end, "PUT vr-end") \
EM(afs_alist_trace_put_vlrotate_fail, "PUT vr-fai") \
EM(afs_alist_trace_put_vlrotate_next, "PUT vr-nxt") \
EM(afs_alist_trace_put_vlrotate_restart,"PUT vr-rst") \
EM(afs_alist_trace_put_vlserver, "PUT vlsrvr") \
EM(afs_alist_trace_put_vlserver_old, "PUT vs-old") \
E_(afs_alist_trace_free, "FREE ")
#define afs_estate_traces \
EM(afs_estate_trace_alloc_probe, "ALLOC prob") \
EM(afs_estate_trace_alloc_server, "ALLOC srvr") \
afs: Fix fileserver rotation Fix the fileserver rotation so that it doesn't use RTT as the basis for deciding which server and address to use as this doesn't necessarily give a good indication of the best path. Instead, use the configurable preference list in conjunction with whatever probes have succeeded at the time of looking. To this end, make the following changes: (1) Keep an array of "server states" to track what addresses we've tried on each server and move the waitqueue entries there that we'll need for probing. (2) Each afs_server_state struct is made to pin the corresponding server's endpoint state rather than the afs_operation struct carrying a pin on the server we're currently looking at. (3) Drop the server list preference; we now always rescan the server list. (4) afs_wait_for_probes() now uses the server state list to guide it in what it waits for (and to provide the waitqueue entries) and returns an indication of whether we'd got a response, run out of responsive addresses or the endpoint state had been superseded and we need to restart the iteration. (5) Call afs_get_address_preferences*() occasionally to refresh the preference values. (6) When picking a server, scan the addresses of the servers for which we have as-yet untested communications, looking for the highest priority one and use that instead of trying all the addresses for a particular server in ascending-RTT order. (7) When a Busy or Offline state is seen across all available servers, do a short sleep. (8) If we detect that we accessed a future RO volume version whilst it is undergoing replication, reissue the op against the older version until at least half of the servers are replicated. (9) Whilst RO replication is ongoing, increase the frequency of Volume Location server checks for that volume to every ten minutes instead of hourly. Also add a tracepoint to track progress through the rotation algorithm. Signed-off-by: David Howells <dhowells@redhat.com> cc: Marc Dionne <marc.dionne@auristor.com> cc: linux-afs@lists.infradead.org
2023-10-18 09:24:01 +01:00
EM(afs_estate_trace_get_server_state, "GET srv-st") \
EM(afs_estate_trace_get_getcaps, "GET getcap") \
EM(afs_estate_trace_put_getcaps, "PUT getcap") \
EM(afs_estate_trace_put_probe, "PUT probe ") \
EM(afs_estate_trace_put_server, "PUT server") \
afs: Fix fileserver rotation Fix the fileserver rotation so that it doesn't use RTT as the basis for deciding which server and address to use as this doesn't necessarily give a good indication of the best path. Instead, use the configurable preference list in conjunction with whatever probes have succeeded at the time of looking. To this end, make the following changes: (1) Keep an array of "server states" to track what addresses we've tried on each server and move the waitqueue entries there that we'll need for probing. (2) Each afs_server_state struct is made to pin the corresponding server's endpoint state rather than the afs_operation struct carrying a pin on the server we're currently looking at. (3) Drop the server list preference; we now always rescan the server list. (4) afs_wait_for_probes() now uses the server state list to guide it in what it waits for (and to provide the waitqueue entries) and returns an indication of whether we'd got a response, run out of responsive addresses or the endpoint state had been superseded and we need to restart the iteration. (5) Call afs_get_address_preferences*() occasionally to refresh the preference values. (6) When picking a server, scan the addresses of the servers for which we have as-yet untested communications, looking for the highest priority one and use that instead of trying all the addresses for a particular server in ascending-RTT order. (7) When a Busy or Offline state is seen across all available servers, do a short sleep. (8) If we detect that we accessed a future RO volume version whilst it is undergoing replication, reissue the op against the older version until at least half of the servers are replicated. (9) Whilst RO replication is ongoing, increase the frequency of Volume Location server checks for that volume to every ten minutes instead of hourly. Also add a tracepoint to track progress through the rotation algorithm. Signed-off-by: David Howells <dhowells@redhat.com> cc: Marc Dionne <marc.dionne@auristor.com> cc: linux-afs@lists.infradead.org
2023-10-18 09:24:01 +01:00
EM(afs_estate_trace_put_server_state, "PUT srv-st") \
E_(afs_estate_trace_free, "FREE ")
#define afs_fs_operations \
EM(afs_FS_FetchData, "FS.FetchData") \
EM(afs_FS_FetchStatus, "FS.FetchStatus") \
EM(afs_FS_StoreData, "FS.StoreData") \
EM(afs_FS_StoreStatus, "FS.StoreStatus") \
EM(afs_FS_RemoveFile, "FS.RemoveFile") \
EM(afs_FS_CreateFile, "FS.CreateFile") \
EM(afs_FS_Rename, "FS.Rename") \
EM(afs_FS_Symlink, "FS.Symlink") \
EM(afs_FS_Link, "FS.Link") \
EM(afs_FS_MakeDir, "FS.MakeDir") \
EM(afs_FS_RemoveDir, "FS.RemoveDir") \
EM(afs_FS_GetVolumeInfo, "FS.GetVolumeInfo") \
EM(afs_FS_GetVolumeStatus, "FS.GetVolumeStatus") \
EM(afs_FS_GetRootVolume, "FS.GetRootVolume") \
EM(afs_FS_SetLock, "FS.SetLock") \
EM(afs_FS_ExtendLock, "FS.ExtendLock") \
EM(afs_FS_ReleaseLock, "FS.ReleaseLock") \
EM(afs_FS_Lookup, "FS.Lookup") \
EM(afs_FS_InlineBulkStatus, "FS.InlineBulkStatus") \
EM(afs_FS_FetchData64, "FS.FetchData64") \
EM(afs_FS_StoreData64, "FS.StoreData64") \
EM(afs_FS_GiveUpAllCallBacks, "FS.GiveUpAllCallBacks") \
EM(afs_FS_GetCapabilities, "FS.GetCapabilities") \
EM(yfs_FS_FetchACL, "YFS.FetchACL") \
EM(yfs_FS_FetchStatus, "YFS.FetchStatus") \
EM(yfs_FS_StoreACL, "YFS.StoreACL") \
EM(yfs_FS_StoreStatus, "YFS.StoreStatus") \
EM(yfs_FS_RemoveFile, "YFS.RemoveFile") \
EM(yfs_FS_CreateFile, "YFS.CreateFile") \
EM(yfs_FS_Rename, "YFS.Rename") \
EM(yfs_FS_Symlink, "YFS.Symlink") \
EM(yfs_FS_Link, "YFS.Link") \
EM(yfs_FS_MakeDir, "YFS.MakeDir") \
EM(yfs_FS_RemoveDir, "YFS.RemoveDir") \
EM(yfs_FS_GetVolumeStatus, "YFS.GetVolumeStatus") \
EM(yfs_FS_SetVolumeStatus, "YFS.SetVolumeStatus") \
EM(yfs_FS_SetLock, "YFS.SetLock") \
EM(yfs_FS_ExtendLock, "YFS.ExtendLock") \
EM(yfs_FS_ReleaseLock, "YFS.ReleaseLock") \
EM(yfs_FS_Lookup, "YFS.Lookup") \
EM(yfs_FS_FlushCPS, "YFS.FlushCPS") \
EM(yfs_FS_FetchOpaqueACL, "YFS.FetchOpaqueACL") \
EM(yfs_FS_WhoAmI, "YFS.WhoAmI") \
EM(yfs_FS_RemoveACL, "YFS.RemoveACL") \
EM(yfs_FS_RemoveFile2, "YFS.RemoveFile2") \
EM(yfs_FS_StoreOpaqueACL2, "YFS.StoreOpaqueACL2") \
EM(yfs_FS_InlineBulkStatus, "YFS.InlineBulkStatus") \
EM(yfs_FS_FetchData64, "YFS.FetchData64") \
EM(yfs_FS_StoreData64, "YFS.StoreData64") \
E_(yfs_FS_UpdateSymlink, "YFS.UpdateSymlink")
#define afs_vl_operations \
EM(afs_VL_GetEntryByNameU, "VL.GetEntryByNameU") \
EM(afs_VL_GetAddrsU, "VL.GetAddrsU") \
EM(afs_YFSVL_GetEndpoints, "YFSVL.GetEndpoints") \
EM(afs_YFSVL_GetCellName, "YFSVL.GetCellName") \
E_(afs_VL_GetCapabilities, "VL.GetCapabilities")
afs: Fix tracepoint string placement with built-in AFS To quote Alexey[1]: I was adding custom tracepoint to the kernel, grabbed full F34 kernel .config, disabled modules and booted whole shebang as VM kernel. Then did perf record -a -e ... It crashed: general protection fault, probably for non-canonical address 0x435f5346592e4243: 0000 [#1] SMP PTI CPU: 1 PID: 842 Comm: cat Not tainted 5.12.6+ #26 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.14.0-1.fc33 04/01/2014 RIP: 0010:t_show+0x22/0xd0 Then reproducer was narrowed to # cat /sys/kernel/tracing/printk_formats Original F34 kernel with modules didn't crash. So I started to disable options and after disabling AFS everything started working again. The root cause is that AFS was placing char arrays content into a section full of _pointers_ to strings with predictable consequences. Non canonical address 435f5346592e4243 is "CB.YFS_" which came from CM_NAME macro. Steps to reproduce: CONFIG_AFS=y CONFIG_TRACING=y # cat /sys/kernel/tracing/printk_formats Fix this by the following means: (1) Add enum->string translation tables in the event header with the AFS and YFS cache/callback manager operations listed by RPC operation ID. (2) Modify the afs_cb_call tracepoint to print the string from the translation table rather than using the string at the afs_call name pointer. (3) Switch translation table depending on the service we're being accessed as (AFS or YFS) in the tracepoint print clause. Will this cause problems to userspace utilities? Note that the symbolic representation of the YFS service ID isn't available to this header, so I've put it in as a number. I'm not sure if this is the best way to do this. (4) Remove the name wrangling (CM_NAME) macro and put the names directly into the afs_call_type structs in cmservice.c. Fixes: 8e8d7f13b6d5a9 ("afs: Add some tracepoints") Reported-by: Alexey Dobriyan (SK hynix) <adobriyan@gmail.com> Signed-off-by: David Howells <dhowells@redhat.com> Reviewed-by: Steven Rostedt (VMware) <rostedt@goodmis.org> Reviewed-by: Marc Dionne <marc.dionne@auristor.com> cc: Andrew Morton <akpm@linux-foundation.org> cc: linux-afs@lists.infradead.org Link: https://lore.kernel.org/r/YLAXfvZ+rObEOdc%2F@localhost.localdomain/ [1] Link: https://lore.kernel.org/r/643721.1623754699@warthog.procyon.org.uk/ Link: https://lore.kernel.org/r/162430903582.2896199.6098150063997983353.stgit@warthog.procyon.org.uk/ # v1 Link: https://lore.kernel.org/r/162609463957.3133237.15916579353149746363.stgit@warthog.procyon.org.uk/ # v1 (repost) Link: https://lore.kernel.org/r/162610726860.3408253.445207609466288531.stgit@warthog.procyon.org.uk/ # v2
2021-06-15 11:57:26 +01:00
#define afs_cm_operations \
EM(afs_CB_CallBack, "CB.CallBack") \
EM(afs_CB_InitCallBackState, "CB.InitCallBackState") \
EM(afs_CB_Probe, "CB.Probe") \
EM(afs_CB_GetLock, "CB.GetLock") \
EM(afs_CB_GetCE, "CB.GetCE") \
EM(afs_CB_GetXStatsVersion, "CB.GetXStatsVersion") \
EM(afs_CB_GetXStats, "CB.GetXStats") \
EM(afs_CB_InitCallBackState3, "CB.InitCallBackState3") \
E_(afs_CB_ProbeUuid, "CB.ProbeUuid")
#define yfs_cm_operations \
EM(yfs_CB_Probe, "YFSCB.Probe") \
EM(yfs_CB_GetLock, "YFSCB.GetLock") \
EM(yfs_CB_XStatsVersion, "YFSCB.XStatsVersion") \
EM(yfs_CB_GetXStats, "YFSCB.GetXStats") \
EM(yfs_CB_InitCallBackState3, "YFSCB.InitCallBackState3") \
EM(yfs_CB_ProbeUuid, "YFSCB.ProbeUuid") \
EM(yfs_CB_GetServerPrefs, "YFSCB.GetServerPrefs") \
EM(yfs_CB_GetCellServDV, "YFSCB.GetCellServDV") \
EM(yfs_CB_GetLocalCell, "YFSCB.GetLocalCell") \
EM(yfs_CB_GetCacheConfig, "YFSCB.GetCacheConfig") \
EM(yfs_CB_GetCellByNum, "YFSCB.GetCellByNum") \
EM(yfs_CB_TellMeAboutYourself, "YFSCB.TellMeAboutYourself") \
E_(yfs_CB_CallBack, "YFSCB.CallBack")
#define afs_cb_promise_traces \
EM(afs_cb_promise_clear_cb_break, "CLEAR cb-break") \
EM(afs_cb_promise_clear_rmdir, "CLEAR rmdir") \
EM(afs_cb_promise_clear_rotate_server, "CLEAR rot-srv") \
EM(afs_cb_promise_clear_server_change, "CLEAR srv-chg") \
EM(afs_cb_promise_clear_vol_init_cb, "CLEAR vol-init-cb") \
EM(afs_cb_promise_set_apply_cb, "SET apply-cb") \
EM(afs_cb_promise_set_new_inode, "SET new-inode") \
E_(afs_cb_promise_set_new_symlink, "SET new-symlink")
#define afs_vnode_invalid_traces \
EM(afs_vnode_invalid_trace_cb_ro_snapshot, "cb-ro-snapshot") \
EM(afs_vnode_invalid_trace_cb_scrub, "cb-scrub") \
EM(afs_vnode_invalid_trace_cb_v_break, "cb-v-break") \
EM(afs_vnode_invalid_trace_expired, "expired") \
EM(afs_vnode_invalid_trace_no_cb_promise, "no-cb-promise") \
EM(afs_vnode_invalid_trace_vol_expired, "vol-expired") \
EM(afs_vnode_invalid_trace_zap_data, "zap-data") \
E_(afs_vnode_valid_trace, "valid")
#define afs_dir_invalid_traces \
EM(afs_dir_invalid_edit_add_bad_size, "edit-add-bad-size") \
EM(afs_dir_invalid_edit_add_no_slots, "edit-add-no-slots") \
EM(afs_dir_invalid_edit_add_too_many_blocks, "edit-add-too-many-blocks") \
EM(afs_dir_invalid_edit_get_block, "edit-get-block") \
EM(afs_dir_invalid_edit_mkdir, "edit-mkdir") \
EM(afs_dir_invalid_edit_rem_bad_size, "edit-rem-bad-size") \
EM(afs_dir_invalid_edit_rem_wrong_name, "edit-rem-wrong_name") \
EM(afs_dir_invalid_edit_upd_bad_size, "edit-upd-bad-size") \
EM(afs_dir_invalid_edit_upd_no_dd, "edit-upd-no-dotdot") \
EM(afs_dir_invalid_dv_mismatch, "dv-mismatch") \
EM(afs_dir_invalid_inval_folio, "inv-folio") \
EM(afs_dir_invalid_iter_stale, "iter-stale") \
EM(afs_dir_invalid_reclaimed_folio, "reclaimed-folio") \
EM(afs_dir_invalid_release_folio, "rel-folio") \
EM(afs_dir_invalid_remote, "remote") \
E_(afs_dir_invalid_subdir_removed, "subdir-removed")
#define afs_edit_dir_ops \
EM(afs_edit_dir_create, "create") \
EM(afs_edit_dir_create_error, "c_fail") \
EM(afs_edit_dir_create_inval, "c_invl") \
EM(afs_edit_dir_create_nospc, "c_nspc") \
EM(afs_edit_dir_delete, "delete") \
EM(afs_edit_dir_delete_error, "d_err ") \
EM(afs_edit_dir_delete_inval, "d_invl") \
2024-10-23 11:40:10 +01:00
EM(afs_edit_dir_delete_noent, "d_nent") \
EM(afs_edit_dir_mkdir, "mk_ent") \
2024-10-23 11:40:10 +01:00
EM(afs_edit_dir_update_dd, "u_ddot") \
EM(afs_edit_dir_update_error, "u_fail") \
EM(afs_edit_dir_update_inval, "u_invl") \
E_(afs_edit_dir_update_nodd, "u_nodd")
#define afs_edit_dir_reasons \
EM(afs_edit_dir_for_create, "Create") \
EM(afs_edit_dir_for_link, "Link ") \
EM(afs_edit_dir_for_mkdir, "MkDir ") \
EM(afs_edit_dir_for_rename_0, "Renam0") \
EM(afs_edit_dir_for_rename_1, "Renam1") \
EM(afs_edit_dir_for_rename_2, "Renam2") \
2024-10-23 11:40:10 +01:00
EM(afs_edit_dir_for_rename_sub, "RnmSub") \
EM(afs_edit_dir_for_rmdir, "RmDir ") \
EM(afs_edit_dir_for_silly_0, "S_Ren0") \
EM(afs_edit_dir_for_silly_1, "S_Ren1") \
EM(afs_edit_dir_for_symlink, "Symlnk") \
E_(afs_edit_dir_for_unlink, "Unlink")
#define afs_eproto_causes \
EM(afs_eproto_bad_status, "BadStatus") \
EM(afs_eproto_cb_count, "CbCount") \
EM(afs_eproto_cb_fid_count, "CbFidCount") \
EM(afs_eproto_cellname_len, "CellNameLen") \
EM(afs_eproto_file_type, "FileTYpe") \
EM(afs_eproto_ibulkst_cb_count, "IBS.CbCount") \
EM(afs_eproto_ibulkst_count, "IBS.FidCount") \
EM(afs_eproto_motd_len, "MotdLen") \
EM(afs_eproto_offline_msg_len, "OfflineMsgLen") \
EM(afs_eproto_volname_len, "VolNameLen") \
EM(afs_eproto_yvl_fsendpt4_len, "YVL.FsEnd4Len") \
EM(afs_eproto_yvl_fsendpt6_len, "YVL.FsEnd6Len") \
EM(afs_eproto_yvl_fsendpt_num, "YVL.FsEndCount") \
EM(afs_eproto_yvl_fsendpt_type, "YVL.FsEndType") \
EM(afs_eproto_yvl_vlendpt4_len, "YVL.VlEnd4Len") \
EM(afs_eproto_yvl_vlendpt6_len, "YVL.VlEnd6Len") \
E_(afs_eproto_yvl_vlendpt_type, "YVL.VlEndType")
#define afs_io_errors \
EM(afs_io_error_cm_reply, "CM_REPLY") \
EM(afs_io_error_extract, "EXTRACT") \
EM(afs_io_error_fs_probe_fail, "FS_PROBE_FAIL") \
EM(afs_io_error_vl_lookup_fail, "VL_LOOKUP_FAIL") \
E_(afs_io_error_vl_probe_fail, "VL_PROBE_FAIL")
#define afs_file_errors \
EM(afs_file_error_dir_bad_magic, "DIR_BAD_MAGIC") \
EM(afs_file_error_dir_big, "DIR_BIG") \
EM(afs_file_error_dir_missing_page, "DIR_MISSING_PAGE") \
EM(afs_file_error_dir_name_too_long, "DIR_NAME_TOO_LONG") \
EM(afs_file_error_dir_over_end, "DIR_ENT_OVER_END") \
EM(afs_file_error_dir_small, "DIR_SMALL") \
EM(afs_file_error_dir_unmarked_ext, "DIR_UNMARKED_EXT") \
EM(afs_file_error_symlink_big, "SYM_BIG") \
EM(afs_file_error_mntpt, "MNTPT_READ_FAILED") \
E_(afs_file_error_writeback_fail, "WRITEBACK_FAILED")
#define afs_flock_types \
EM(F_RDLCK, "RDLCK") \
EM(F_WRLCK, "WRLCK") \
E_(F_UNLCK, "UNLCK")
#define afs_flock_states \
EM(AFS_VNODE_LOCK_NONE, "NONE") \
EM(AFS_VNODE_LOCK_WAITING_FOR_CB, "WAIT_FOR_CB") \
EM(AFS_VNODE_LOCK_SETTING, "SETTING") \
EM(AFS_VNODE_LOCK_GRANTED, "GRANTED") \
EM(AFS_VNODE_LOCK_EXTENDING, "EXTENDING") \
EM(AFS_VNODE_LOCK_NEED_UNLOCK, "NEED_UNLOCK") \
EM(AFS_VNODE_LOCK_UNLOCKING, "UNLOCKING") \
E_(AFS_VNODE_LOCK_DELETED, "DELETED")
#define afs_flock_events \
EM(afs_flock_acquired, "Acquired") \
EM(afs_flock_callback_break, "Callback") \
EM(afs_flock_defer_unlock, "D-Unlock") \
EM(afs_flock_extend_fail, "Ext_Fail") \
EM(afs_flock_fail_other, "ErrOther") \
EM(afs_flock_fail_perm, "ErrPerm ") \
EM(afs_flock_no_lockers, "NoLocker") \
EM(afs_flock_release_fail, "Rel_Fail") \
EM(afs_flock_silly_delete, "SillyDel") \
EM(afs_flock_timestamp, "Timestmp") \
EM(afs_flock_try_to_lock, "TryToLck") \
EM(afs_flock_vfs_lock, "VFSLock ") \
EM(afs_flock_vfs_locking, "VFSLking") \
EM(afs_flock_waited, "Waited ") \
EM(afs_flock_waiting, "Waiting ") \
EM(afs_flock_work_extending, "Extendng") \
EM(afs_flock_work_retry, "Retry ") \
EM(afs_flock_work_unlocking, "Unlcking") \
E_(afs_flock_would_block, "EWOULDBL")
#define afs_flock_operations \
EM(afs_flock_op_copy_lock, "COPY ") \
EM(afs_flock_op_flock, "->flock ") \
EM(afs_flock_op_grant, "GRANT ") \
EM(afs_flock_op_lock, "->lock ") \
EM(afs_flock_op_release_lock, "RELEASE ") \
EM(afs_flock_op_return_ok, "<-OK ") \
EM(afs_flock_op_return_edeadlk, "<-EDEADL") \
EM(afs_flock_op_return_eagain, "<-EAGAIN") \
EM(afs_flock_op_return_error, "<-ERROR ") \
EM(afs_flock_op_set_lock, "SET ") \
EM(afs_flock_op_unlock, "UNLOCK ") \
E_(afs_flock_op_wake, "WAKE ")
#define afs_cb_break_reasons \
EM(afs_cb_break_no_break, "no-break") \
EM(afs_cb_break_for_callback, "break-cb") \
afs: Parse the VolSync record in the reply of a number of RPC ops A number of fileserver RPC operations return a VolSync record as part of their reply that gives some information about the state of the volume being accessed, including: (1) A volume Creation timestamp. For an RW volume, this is the time at which the volume was created; if it changes, the RW volume was presumably restored from a backup and all cached data should be scrubbed as Data Version numbers could regress on the files in the volume. For an RO volume, this is the time it was last snapshotted from the RW volume. It is expected to advance each time this happens; if it regresses, cached data should be scrubbed. (2) A volume Update timestamp (Auristor only). For an RW volume, this is updated any time any change is made to a volume or its contents. If it regresses, all cached data must be scrubbed. For an RO volume, this is a copy of the RW volume's Update timestamp at the point of snapshotting. It can be used as a version number when checking to see if a callback on a RO volume was due to a snapshot. If it regresses, all cached data must be scrubbed. but this is currently not made use of by the in-kernel afs filesystem. Make the afs filesystem use this by: (1) Add an update time field to the afs_volsync struct and use a value of TIME64_MIN in both that and the creation time to indicate that they are unset. (2) Add creation and update time fields to the afs_volume struct and use this to track the two timestamps. (3) Add a volsync_lock mutex to the afs_volume struct to control modification access for when we detect a change in these values. (3) Add a 'pre-op volsync' struct to the afs_operation struct to record the state of the volume tracking before the op. (4) Add a new counter, cb_scrub, to the afs_volume struct to count events that require all data to be scrubbed. A copy is placed in the afs_vnode struct (inode) and if they no longer match, a scrub takes place. (5) When the result of an operation is being parsed, parse the VolSync data too, if it is provided. Note that the two timestamps are handled separately, since they don't work in quite the same way. - If the afs_volume tracking is unset, just set it and do nothing else. - If the result timestamps are the same as the ones in afs_volume, do nothing. - If the timestamps regress, increment cb_scrub if not already done so. - If the creation timestamp on a RW volume changes, increment cb_scrub if not already done so. - If the creation timestamp on a RO volume advances, update the server list and see if the current server has been excluded, if so reissue the op. Once over half of the replication sites have been updated, increment cb_ro_snapshot to indicate updates may be required and switch over to excluding unupdated replication sites. - If the creation timestamp on a Backup volume advances, just increment cb_ro_snapshot to trigger updates. Signed-off-by: David Howells <dhowells@redhat.com> cc: Marc Dionne <marc.dionne@auristor.com> cc: linux-afs@lists.infradead.org
2023-11-05 16:11:07 +00:00
EM(afs_cb_break_for_creation_regress, "creation-regress") \
EM(afs_cb_break_for_deleted, "break-del") \
afs: Try to avoid taking RCU read lock when checking vnode validity Try to avoid taking the RCU read lock when checking the validity of a vnode's callback state. The only thing it's needed for is to pin the parent volume's server list whilst we search it to find the record of the server we're currently using to see if it has been reinitialised (ie. it sent us a CB.InitCallBackState* RPC). Do this by the following means: (1) Keep an additional per-cell counter (fs_s_break) that's incremented each time any of the fileservers in the cell reinitialises. Since the new counter can be accessed without RCU from the vnode, we can check that first - and only if it differs, get the RCU read lock and check the volume's server list. (2) Replace afs_get_s_break_rcu() with afs_check_server_good() which now indicates whether the callback promise is still expected to be present on the server. This does the checks as described in (1). (3) Restructure afs_check_validity() to take account of the change in (2). We can also get rid of the valid variable and just use the need_clear variable with the addition of the afs_cb_break_no_promise reason. (4) afs_check_validity() probably shouldn't be altering vnode->cb_v_break and vnode->cb_s_break when it doesn't have cb_lock exclusively locked. Move the change to vnode->cb_v_break to __afs_break_callback(). Delegate the change to vnode->cb_s_break to afs_select_fileserver() and set vnode->cb_fs_s_break there also. (5) afs_validate() no longer needs to get the RCU read lock around its call to afs_check_validity() - and can skip the call entirely if we don't have a promise. Signed-off-by: David Howells <dhowells@redhat.com> Tested-by: Markus Suvanto <markus.suvanto@gmail.com> cc: linux-afs@lists.infradead.org Link: https://lore.kernel.org/r/163111669583.283156.1397603105683094563.stgit@warthog.procyon.org.uk/
2021-09-02 21:51:01 +01:00
EM(afs_cb_break_for_s_reinit, "s-reinit") \
EM(afs_cb_break_for_unlink, "break-unlink") \
afs: Parse the VolSync record in the reply of a number of RPC ops A number of fileserver RPC operations return a VolSync record as part of their reply that gives some information about the state of the volume being accessed, including: (1) A volume Creation timestamp. For an RW volume, this is the time at which the volume was created; if it changes, the RW volume was presumably restored from a backup and all cached data should be scrubbed as Data Version numbers could regress on the files in the volume. For an RO volume, this is the time it was last snapshotted from the RW volume. It is expected to advance each time this happens; if it regresses, cached data should be scrubbed. (2) A volume Update timestamp (Auristor only). For an RW volume, this is updated any time any change is made to a volume or its contents. If it regresses, all cached data must be scrubbed. For an RO volume, this is a copy of the RW volume's Update timestamp at the point of snapshotting. It can be used as a version number when checking to see if a callback on a RO volume was due to a snapshot. If it regresses, all cached data must be scrubbed. but this is currently not made use of by the in-kernel afs filesystem. Make the afs filesystem use this by: (1) Add an update time field to the afs_volsync struct and use a value of TIME64_MIN in both that and the creation time to indicate that they are unset. (2) Add creation and update time fields to the afs_volume struct and use this to track the two timestamps. (3) Add a volsync_lock mutex to the afs_volume struct to control modification access for when we detect a change in these values. (3) Add a 'pre-op volsync' struct to the afs_operation struct to record the state of the volume tracking before the op. (4) Add a new counter, cb_scrub, to the afs_volume struct to count events that require all data to be scrubbed. A copy is placed in the afs_vnode struct (inode) and if they no longer match, a scrub takes place. (5) When the result of an operation is being parsed, parse the VolSync data too, if it is provided. Note that the two timestamps are handled separately, since they don't work in quite the same way. - If the afs_volume tracking is unset, just set it and do nothing else. - If the result timestamps are the same as the ones in afs_volume, do nothing. - If the timestamps regress, increment cb_scrub if not already done so. - If the creation timestamp on a RW volume changes, increment cb_scrub if not already done so. - If the creation timestamp on a RO volume advances, update the server list and see if the current server has been excluded, if so reissue the op. Once over half of the replication sites have been updated, increment cb_ro_snapshot to indicate updates may be required and switch over to excluding unupdated replication sites. - If the creation timestamp on a Backup volume advances, just increment cb_ro_snapshot to trigger updates. Signed-off-by: David Howells <dhowells@redhat.com> cc: Marc Dionne <marc.dionne@auristor.com> cc: linux-afs@lists.infradead.org
2023-11-05 16:11:07 +00:00
EM(afs_cb_break_for_update_regress, "update-regress") \
EM(afs_cb_break_for_volume_callback, "break-v-cb") \
afs: Parse the VolSync record in the reply of a number of RPC ops A number of fileserver RPC operations return a VolSync record as part of their reply that gives some information about the state of the volume being accessed, including: (1) A volume Creation timestamp. For an RW volume, this is the time at which the volume was created; if it changes, the RW volume was presumably restored from a backup and all cached data should be scrubbed as Data Version numbers could regress on the files in the volume. For an RO volume, this is the time it was last snapshotted from the RW volume. It is expected to advance each time this happens; if it regresses, cached data should be scrubbed. (2) A volume Update timestamp (Auristor only). For an RW volume, this is updated any time any change is made to a volume or its contents. If it regresses, all cached data must be scrubbed. For an RO volume, this is a copy of the RW volume's Update timestamp at the point of snapshotting. It can be used as a version number when checking to see if a callback on a RO volume was due to a snapshot. If it regresses, all cached data must be scrubbed. but this is currently not made use of by the in-kernel afs filesystem. Make the afs filesystem use this by: (1) Add an update time field to the afs_volsync struct and use a value of TIME64_MIN in both that and the creation time to indicate that they are unset. (2) Add creation and update time fields to the afs_volume struct and use this to track the two timestamps. (3) Add a volsync_lock mutex to the afs_volume struct to control modification access for when we detect a change in these values. (3) Add a 'pre-op volsync' struct to the afs_operation struct to record the state of the volume tracking before the op. (4) Add a new counter, cb_scrub, to the afs_volume struct to count events that require all data to be scrubbed. A copy is placed in the afs_vnode struct (inode) and if they no longer match, a scrub takes place. (5) When the result of an operation is being parsed, parse the VolSync data too, if it is provided. Note that the two timestamps are handled separately, since they don't work in quite the same way. - If the afs_volume tracking is unset, just set it and do nothing else. - If the result timestamps are the same as the ones in afs_volume, do nothing. - If the timestamps regress, increment cb_scrub if not already done so. - If the creation timestamp on a RW volume changes, increment cb_scrub if not already done so. - If the creation timestamp on a RO volume advances, update the server list and see if the current server has been excluded, if so reissue the op. Once over half of the replication sites have been updated, increment cb_ro_snapshot to indicate updates may be required and switch over to excluding unupdated replication sites. - If the creation timestamp on a Backup volume advances, just increment cb_ro_snapshot to trigger updates. Signed-off-by: David Howells <dhowells@redhat.com> cc: Marc Dionne <marc.dionne@auristor.com> cc: linux-afs@lists.infradead.org
2023-11-05 16:11:07 +00:00
EM(afs_cb_break_for_vos_release, "break-vos-release") \
E_(afs_cb_break_volume_excluded, "vol-excluded")
afs: Fix fileserver rotation Fix the fileserver rotation so that it doesn't use RTT as the basis for deciding which server and address to use as this doesn't necessarily give a good indication of the best path. Instead, use the configurable preference list in conjunction with whatever probes have succeeded at the time of looking. To this end, make the following changes: (1) Keep an array of "server states" to track what addresses we've tried on each server and move the waitqueue entries there that we'll need for probing. (2) Each afs_server_state struct is made to pin the corresponding server's endpoint state rather than the afs_operation struct carrying a pin on the server we're currently looking at. (3) Drop the server list preference; we now always rescan the server list. (4) afs_wait_for_probes() now uses the server state list to guide it in what it waits for (and to provide the waitqueue entries) and returns an indication of whether we'd got a response, run out of responsive addresses or the endpoint state had been superseded and we need to restart the iteration. (5) Call afs_get_address_preferences*() occasionally to refresh the preference values. (6) When picking a server, scan the addresses of the servers for which we have as-yet untested communications, looking for the highest priority one and use that instead of trying all the addresses for a particular server in ascending-RTT order. (7) When a Busy or Offline state is seen across all available servers, do a short sleep. (8) If we detect that we accessed a future RO volume version whilst it is undergoing replication, reissue the op against the older version until at least half of the servers are replicated. (9) Whilst RO replication is ongoing, increase the frequency of Volume Location server checks for that volume to every ten minutes instead of hourly. Also add a tracepoint to track progress through the rotation algorithm. Signed-off-by: David Howells <dhowells@redhat.com> cc: Marc Dionne <marc.dionne@auristor.com> cc: linux-afs@lists.infradead.org
2023-10-18 09:24:01 +01:00
#define afs_rotate_traces \
EM(afs_rotate_trace_aborted, "Abortd") \
EM(afs_rotate_trace_busy_sleep, "BsySlp") \
EM(afs_rotate_trace_check_vol_status, "VolStt") \
EM(afs_rotate_trace_failed, "Failed") \
EM(afs_rotate_trace_iter, "Iter ") \
EM(afs_rotate_trace_iterate_addr, "ItAddr") \
EM(afs_rotate_trace_next_server, "NextSv") \
EM(afs_rotate_trace_no_more_servers, "NoMore") \
EM(afs_rotate_trace_nomem, "Nomem ") \
EM(afs_rotate_trace_probe_error, "PrbErr") \
EM(afs_rotate_trace_probe_fileserver, "PrbFsv") \
EM(afs_rotate_trace_probe_none, "PrbNon") \
EM(afs_rotate_trace_probe_response, "PrbRsp") \
EM(afs_rotate_trace_probe_superseded, "PrbSup") \
EM(afs_rotate_trace_restart, "Rstart") \
EM(afs_rotate_trace_retry_server, "RtrySv") \
EM(afs_rotate_trace_selected_server, "SlctSv") \
EM(afs_rotate_trace_stale_lock, "StlLck") \
EM(afs_rotate_trace_start, "Start ") \
EM(afs_rotate_trace_stop, "Stop ") \
E_(afs_rotate_trace_stopped, "Stoppd")
/*
* Generate enums for tracing information.
*/
#ifndef __AFS_GENERATE_TRACE_ENUMS_ONCE_ONLY
#define __AFS_GENERATE_TRACE_ENUMS_ONCE_ONLY
#undef EM
#undef E_
#define EM(a, b) a,
#define E_(a, b) a
enum afs_alist_trace { afs_alist_traces } __mode(byte);
enum afs_call_trace { afs_call_traces } __mode(byte);
enum afs_cb_break_reason { afs_cb_break_reasons } __mode(byte);
enum afs_cb_promise_trace { afs_cb_promise_traces } __mode(byte);
enum afs_cell_trace { afs_cell_traces } __mode(byte);
enum afs_dir_invalid_trace { afs_dir_invalid_traces} __mode(byte);
enum afs_edit_dir_op { afs_edit_dir_ops } __mode(byte);
enum afs_edit_dir_reason { afs_edit_dir_reasons } __mode(byte);
enum afs_eproto_cause { afs_eproto_causes } __mode(byte);
enum afs_estate_trace { afs_estate_traces } __mode(byte);
enum afs_file_error { afs_file_errors } __mode(byte);
enum afs_flock_event { afs_flock_events } __mode(byte);
enum afs_flock_operation { afs_flock_operations } __mode(byte);
enum afs_io_error { afs_io_errors } __mode(byte);
afs: Fix fileserver rotation Fix the fileserver rotation so that it doesn't use RTT as the basis for deciding which server and address to use as this doesn't necessarily give a good indication of the best path. Instead, use the configurable preference list in conjunction with whatever probes have succeeded at the time of looking. To this end, make the following changes: (1) Keep an array of "server states" to track what addresses we've tried on each server and move the waitqueue entries there that we'll need for probing. (2) Each afs_server_state struct is made to pin the corresponding server's endpoint state rather than the afs_operation struct carrying a pin on the server we're currently looking at. (3) Drop the server list preference; we now always rescan the server list. (4) afs_wait_for_probes() now uses the server state list to guide it in what it waits for (and to provide the waitqueue entries) and returns an indication of whether we'd got a response, run out of responsive addresses or the endpoint state had been superseded and we need to restart the iteration. (5) Call afs_get_address_preferences*() occasionally to refresh the preference values. (6) When picking a server, scan the addresses of the servers for which we have as-yet untested communications, looking for the highest priority one and use that instead of trying all the addresses for a particular server in ascending-RTT order. (7) When a Busy or Offline state is seen across all available servers, do a short sleep. (8) If we detect that we accessed a future RO volume version whilst it is undergoing replication, reissue the op against the older version until at least half of the servers are replicated. (9) Whilst RO replication is ongoing, increase the frequency of Volume Location server checks for that volume to every ten minutes instead of hourly. Also add a tracepoint to track progress through the rotation algorithm. Signed-off-by: David Howells <dhowells@redhat.com> cc: Marc Dionne <marc.dionne@auristor.com> cc: linux-afs@lists.infradead.org
2023-10-18 09:24:01 +01:00
enum afs_rotate_trace { afs_rotate_traces } __mode(byte);
enum afs_server_trace { afs_server_traces } __mode(byte);
enum afs_vnode_invalid_trace { afs_vnode_invalid_traces} __mode(byte);
enum afs_volume_trace { afs_volume_traces } __mode(byte);
#endif /* end __AFS_GENERATE_TRACE_ENUMS_ONCE_ONLY */
afs: Refcount the afs_call struct A static checker warning occurs in the AFS filesystem: fs/afs/cmservice.c:155 SRXAFSCB_CallBack() error: dereferencing freed memory 'call' due to the reply being sent before we access the server it points to. The act of sending the reply causes the call to be freed if an error occurs (but not if it doesn't). On top of this, the lifetime handling of afs_call structs is fragile because they get passed around through workqueues without any sort of refcounting. Deal with the issues by: (1) Fix the maybe/maybe not nature of the reply sending functions with regards to whether they release the call struct. (2) Refcount the afs_call struct and sort out places that need to get/put references. (3) Pass a ref through the work queue and release (or pass on) that ref in the work function. Care has to be taken because a work queue may already own a ref to the call. (4) Do the cleaning up in the put function only. (5) Simplify module cleanup by always incrementing afs_outstanding_calls whenever a call is allocated. (6) Set the backlog to 0 with kernel_listen() at the beginning of the process of closing the socket to prevent new incoming calls from occurring and to remove the contribution of preallocated calls from afs_outstanding_calls before we wait on it. A tracepoint is also added to monitor the afs_call refcount and lifetime. Reported-by: Dan Carpenter <dan.carpenter@oracle.com> Signed-off-by: David Howells <dhowells@redhat.com> Fixes: 08e0e7c82eea: "[AF_RXRPC]: Make the in-kernel AFS filesystem use AF_RXRPC."
2017-01-05 10:38:36 +00:00
/*
* Export enum symbols via userspace.
*/
#undef EM
#undef E_
#define EM(a, b) TRACE_DEFINE_ENUM(a);
#define E_(a, b) TRACE_DEFINE_ENUM(a);
afs_alist_traces;
afs: Refcount the afs_call struct A static checker warning occurs in the AFS filesystem: fs/afs/cmservice.c:155 SRXAFSCB_CallBack() error: dereferencing freed memory 'call' due to the reply being sent before we access the server it points to. The act of sending the reply causes the call to be freed if an error occurs (but not if it doesn't). On top of this, the lifetime handling of afs_call structs is fragile because they get passed around through workqueues without any sort of refcounting. Deal with the issues by: (1) Fix the maybe/maybe not nature of the reply sending functions with regards to whether they release the call struct. (2) Refcount the afs_call struct and sort out places that need to get/put references. (3) Pass a ref through the work queue and release (or pass on) that ref in the work function. Care has to be taken because a work queue may already own a ref to the call. (4) Do the cleaning up in the put function only. (5) Simplify module cleanup by always incrementing afs_outstanding_calls whenever a call is allocated. (6) Set the backlog to 0 with kernel_listen() at the beginning of the process of closing the socket to prevent new incoming calls from occurring and to remove the contribution of preallocated calls from afs_outstanding_calls before we wait on it. A tracepoint is also added to monitor the afs_call refcount and lifetime. Reported-by: Dan Carpenter <dan.carpenter@oracle.com> Signed-off-by: David Howells <dhowells@redhat.com> Fixes: 08e0e7c82eea: "[AF_RXRPC]: Make the in-kernel AFS filesystem use AF_RXRPC."
2017-01-05 10:38:36 +00:00
afs_call_traces;
afs: Fix fileserver rotation Fix the fileserver rotation so that it doesn't use RTT as the basis for deciding which server and address to use as this doesn't necessarily give a good indication of the best path. Instead, use the configurable preference list in conjunction with whatever probes have succeeded at the time of looking. To this end, make the following changes: (1) Keep an array of "server states" to track what addresses we've tried on each server and move the waitqueue entries there that we'll need for probing. (2) Each afs_server_state struct is made to pin the corresponding server's endpoint state rather than the afs_operation struct carrying a pin on the server we're currently looking at. (3) Drop the server list preference; we now always rescan the server list. (4) afs_wait_for_probes() now uses the server state list to guide it in what it waits for (and to provide the waitqueue entries) and returns an indication of whether we'd got a response, run out of responsive addresses or the endpoint state had been superseded and we need to restart the iteration. (5) Call afs_get_address_preferences*() occasionally to refresh the preference values. (6) When picking a server, scan the addresses of the servers for which we have as-yet untested communications, looking for the highest priority one and use that instead of trying all the addresses for a particular server in ascending-RTT order. (7) When a Busy or Offline state is seen across all available servers, do a short sleep. (8) If we detect that we accessed a future RO volume version whilst it is undergoing replication, reissue the op against the older version until at least half of the servers are replicated. (9) Whilst RO replication is ongoing, increase the frequency of Volume Location server checks for that volume to every ten minutes instead of hourly. Also add a tracepoint to track progress through the rotation algorithm. Signed-off-by: David Howells <dhowells@redhat.com> cc: Marc Dionne <marc.dionne@auristor.com> cc: linux-afs@lists.infradead.org
2023-10-18 09:24:01 +01:00
afs_cb_break_reasons;
afs_cb_promise_traces;
afs_cell_traces;
afs: Fix tracepoint string placement with built-in AFS To quote Alexey[1]: I was adding custom tracepoint to the kernel, grabbed full F34 kernel .config, disabled modules and booted whole shebang as VM kernel. Then did perf record -a -e ... It crashed: general protection fault, probably for non-canonical address 0x435f5346592e4243: 0000 [#1] SMP PTI CPU: 1 PID: 842 Comm: cat Not tainted 5.12.6+ #26 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.14.0-1.fc33 04/01/2014 RIP: 0010:t_show+0x22/0xd0 Then reproducer was narrowed to # cat /sys/kernel/tracing/printk_formats Original F34 kernel with modules didn't crash. So I started to disable options and after disabling AFS everything started working again. The root cause is that AFS was placing char arrays content into a section full of _pointers_ to strings with predictable consequences. Non canonical address 435f5346592e4243 is "CB.YFS_" which came from CM_NAME macro. Steps to reproduce: CONFIG_AFS=y CONFIG_TRACING=y # cat /sys/kernel/tracing/printk_formats Fix this by the following means: (1) Add enum->string translation tables in the event header with the AFS and YFS cache/callback manager operations listed by RPC operation ID. (2) Modify the afs_cb_call tracepoint to print the string from the translation table rather than using the string at the afs_call name pointer. (3) Switch translation table depending on the service we're being accessed as (AFS or YFS) in the tracepoint print clause. Will this cause problems to userspace utilities? Note that the symbolic representation of the YFS service ID isn't available to this header, so I've put it in as a number. I'm not sure if this is the best way to do this. (4) Remove the name wrangling (CM_NAME) macro and put the names directly into the afs_call_type structs in cmservice.c. Fixes: 8e8d7f13b6d5a9 ("afs: Add some tracepoints") Reported-by: Alexey Dobriyan (SK hynix) <adobriyan@gmail.com> Signed-off-by: David Howells <dhowells@redhat.com> Reviewed-by: Steven Rostedt (VMware) <rostedt@goodmis.org> Reviewed-by: Marc Dionne <marc.dionne@auristor.com> cc: Andrew Morton <akpm@linux-foundation.org> cc: linux-afs@lists.infradead.org Link: https://lore.kernel.org/r/YLAXfvZ+rObEOdc%2F@localhost.localdomain/ [1] Link: https://lore.kernel.org/r/643721.1623754699@warthog.procyon.org.uk/ Link: https://lore.kernel.org/r/162430903582.2896199.6098150063997983353.stgit@warthog.procyon.org.uk/ # v1 Link: https://lore.kernel.org/r/162609463957.3133237.15916579353149746363.stgit@warthog.procyon.org.uk/ # v1 (repost) Link: https://lore.kernel.org/r/162610726860.3408253.445207609466288531.stgit@warthog.procyon.org.uk/ # v2
2021-06-15 11:57:26 +01:00
afs_cm_operations;
afs_dir_invalid_traces;
afs_edit_dir_ops;
afs_edit_dir_reasons;
afs_eproto_causes;
afs_estate_traces;
afs_file_errors;
afs_flock_operations;
afs: Fix fileserver rotation Fix the fileserver rotation so that it doesn't use RTT as the basis for deciding which server and address to use as this doesn't necessarily give a good indication of the best path. Instead, use the configurable preference list in conjunction with whatever probes have succeeded at the time of looking. To this end, make the following changes: (1) Keep an array of "server states" to track what addresses we've tried on each server and move the waitqueue entries there that we'll need for probing. (2) Each afs_server_state struct is made to pin the corresponding server's endpoint state rather than the afs_operation struct carrying a pin on the server we're currently looking at. (3) Drop the server list preference; we now always rescan the server list. (4) afs_wait_for_probes() now uses the server state list to guide it in what it waits for (and to provide the waitqueue entries) and returns an indication of whether we'd got a response, run out of responsive addresses or the endpoint state had been superseded and we need to restart the iteration. (5) Call afs_get_address_preferences*() occasionally to refresh the preference values. (6) When picking a server, scan the addresses of the servers for which we have as-yet untested communications, looking for the highest priority one and use that instead of trying all the addresses for a particular server in ascending-RTT order. (7) When a Busy or Offline state is seen across all available servers, do a short sleep. (8) If we detect that we accessed a future RO volume version whilst it is undergoing replication, reissue the op against the older version until at least half of the servers are replicated. (9) Whilst RO replication is ongoing, increase the frequency of Volume Location server checks for that volume to every ten minutes instead of hourly. Also add a tracepoint to track progress through the rotation algorithm. Signed-off-by: David Howells <dhowells@redhat.com> cc: Marc Dionne <marc.dionne@auristor.com> cc: linux-afs@lists.infradead.org
2023-10-18 09:24:01 +01:00
afs_flock_types;
afs_fs_operations;
afs_io_errors;
afs_rotate_traces;
afs_server_traces;
afs_vnode_invalid_traces;
afs: Fix fileserver rotation Fix the fileserver rotation so that it doesn't use RTT as the basis for deciding which server and address to use as this doesn't necessarily give a good indication of the best path. Instead, use the configurable preference list in conjunction with whatever probes have succeeded at the time of looking. To this end, make the following changes: (1) Keep an array of "server states" to track what addresses we've tried on each server and move the waitqueue entries there that we'll need for probing. (2) Each afs_server_state struct is made to pin the corresponding server's endpoint state rather than the afs_operation struct carrying a pin on the server we're currently looking at. (3) Drop the server list preference; we now always rescan the server list. (4) afs_wait_for_probes() now uses the server state list to guide it in what it waits for (and to provide the waitqueue entries) and returns an indication of whether we'd got a response, run out of responsive addresses or the endpoint state had been superseded and we need to restart the iteration. (5) Call afs_get_address_preferences*() occasionally to refresh the preference values. (6) When picking a server, scan the addresses of the servers for which we have as-yet untested communications, looking for the highest priority one and use that instead of trying all the addresses for a particular server in ascending-RTT order. (7) When a Busy or Offline state is seen across all available servers, do a short sleep. (8) If we detect that we accessed a future RO volume version whilst it is undergoing replication, reissue the op against the older version until at least half of the servers are replicated. (9) Whilst RO replication is ongoing, increase the frequency of Volume Location server checks for that volume to every ten minutes instead of hourly. Also add a tracepoint to track progress through the rotation algorithm. Signed-off-by: David Howells <dhowells@redhat.com> cc: Marc Dionne <marc.dionne@auristor.com> cc: linux-afs@lists.infradead.org
2023-10-18 09:24:01 +01:00
afs_vl_operations;
yfs_cm_operations;
afs: Refcount the afs_call struct A static checker warning occurs in the AFS filesystem: fs/afs/cmservice.c:155 SRXAFSCB_CallBack() error: dereferencing freed memory 'call' due to the reply being sent before we access the server it points to. The act of sending the reply causes the call to be freed if an error occurs (but not if it doesn't). On top of this, the lifetime handling of afs_call structs is fragile because they get passed around through workqueues without any sort of refcounting. Deal with the issues by: (1) Fix the maybe/maybe not nature of the reply sending functions with regards to whether they release the call struct. (2) Refcount the afs_call struct and sort out places that need to get/put references. (3) Pass a ref through the work queue and release (or pass on) that ref in the work function. Care has to be taken because a work queue may already own a ref to the call. (4) Do the cleaning up in the put function only. (5) Simplify module cleanup by always incrementing afs_outstanding_calls whenever a call is allocated. (6) Set the backlog to 0 with kernel_listen() at the beginning of the process of closing the socket to prevent new incoming calls from occurring and to remove the contribution of preallocated calls from afs_outstanding_calls before we wait on it. A tracepoint is also added to monitor the afs_call refcount and lifetime. Reported-by: Dan Carpenter <dan.carpenter@oracle.com> Signed-off-by: David Howells <dhowells@redhat.com> Fixes: 08e0e7c82eea: "[AF_RXRPC]: Make the in-kernel AFS filesystem use AF_RXRPC."
2017-01-05 10:38:36 +00:00
/*
* Now redefine the EM() and E_() macros to map the enums to the strings that
* will be printed in the output.
*/
#undef EM
#undef E_
#define EM(a, b) { a, b },
#define E_(a, b) { a, b }
TRACE_EVENT(afs_receive_data,
TP_PROTO(struct afs_call *call, struct iov_iter *iter,
bool want_more, int ret),
TP_ARGS(call, iter, want_more, ret),
TP_STRUCT__entry(
__field(loff_t, remain)
__field(unsigned int, call)
__field(enum afs_call_state, state)
__field(unsigned short, unmarshall)
__field(bool, want_more)
__field(int, ret)
),
TP_fast_assign(
__entry->call = call->debug_id;
__entry->state = call->state;
__entry->unmarshall = call->unmarshall;
__entry->remain = iov_iter_count(iter);
__entry->want_more = want_more;
__entry->ret = ret;
),
TP_printk("c=%08x r=%llu u=%u w=%u s=%u ret=%d",
__entry->call,
__entry->remain,
__entry->unmarshall,
__entry->want_more,
__entry->state,
__entry->ret)
);
TRACE_EVENT(afs_notify_call,
TP_PROTO(struct rxrpc_call *rxcall, struct afs_call *call),
TP_ARGS(rxcall, call),
TP_STRUCT__entry(
__field(unsigned int, call)
__field(enum afs_call_state, state)
__field(unsigned short, unmarshall)
),
TP_fast_assign(
__entry->call = call->debug_id;
__entry->state = call->state;
__entry->unmarshall = call->unmarshall;
),
TP_printk("c=%08x s=%u u=%u",
__entry->call,
__entry->state, __entry->unmarshall)
);
TRACE_EVENT(afs_cb_call,
TP_PROTO(struct afs_call *call),
TP_ARGS(call),
TP_STRUCT__entry(
__field(unsigned int, call)
__field(u32, op)
__field(u16, service_id)
__field(u8, security_ix)
__field(u32, enctype)
),
TP_fast_assign(
__entry->call = call->debug_id;
__entry->op = call->operation_ID;
afs: Fix tracepoint string placement with built-in AFS To quote Alexey[1]: I was adding custom tracepoint to the kernel, grabbed full F34 kernel .config, disabled modules and booted whole shebang as VM kernel. Then did perf record -a -e ... It crashed: general protection fault, probably for non-canonical address 0x435f5346592e4243: 0000 [#1] SMP PTI CPU: 1 PID: 842 Comm: cat Not tainted 5.12.6+ #26 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.14.0-1.fc33 04/01/2014 RIP: 0010:t_show+0x22/0xd0 Then reproducer was narrowed to # cat /sys/kernel/tracing/printk_formats Original F34 kernel with modules didn't crash. So I started to disable options and after disabling AFS everything started working again. The root cause is that AFS was placing char arrays content into a section full of _pointers_ to strings with predictable consequences. Non canonical address 435f5346592e4243 is "CB.YFS_" which came from CM_NAME macro. Steps to reproduce: CONFIG_AFS=y CONFIG_TRACING=y # cat /sys/kernel/tracing/printk_formats Fix this by the following means: (1) Add enum->string translation tables in the event header with the AFS and YFS cache/callback manager operations listed by RPC operation ID. (2) Modify the afs_cb_call tracepoint to print the string from the translation table rather than using the string at the afs_call name pointer. (3) Switch translation table depending on the service we're being accessed as (AFS or YFS) in the tracepoint print clause. Will this cause problems to userspace utilities? Note that the symbolic representation of the YFS service ID isn't available to this header, so I've put it in as a number. I'm not sure if this is the best way to do this. (4) Remove the name wrangling (CM_NAME) macro and put the names directly into the afs_call_type structs in cmservice.c. Fixes: 8e8d7f13b6d5a9 ("afs: Add some tracepoints") Reported-by: Alexey Dobriyan (SK hynix) <adobriyan@gmail.com> Signed-off-by: David Howells <dhowells@redhat.com> Reviewed-by: Steven Rostedt (VMware) <rostedt@goodmis.org> Reviewed-by: Marc Dionne <marc.dionne@auristor.com> cc: Andrew Morton <akpm@linux-foundation.org> cc: linux-afs@lists.infradead.org Link: https://lore.kernel.org/r/YLAXfvZ+rObEOdc%2F@localhost.localdomain/ [1] Link: https://lore.kernel.org/r/643721.1623754699@warthog.procyon.org.uk/ Link: https://lore.kernel.org/r/162430903582.2896199.6098150063997983353.stgit@warthog.procyon.org.uk/ # v1 Link: https://lore.kernel.org/r/162609463957.3133237.15916579353149746363.stgit@warthog.procyon.org.uk/ # v1 (repost) Link: https://lore.kernel.org/r/162610726860.3408253.445207609466288531.stgit@warthog.procyon.org.uk/ # v2
2021-06-15 11:57:26 +01:00
__entry->service_id = call->service_id;
__entry->security_ix = call->security_ix;
__entry->enctype = call->enctype;
),
TP_printk("c=%08x %s sv=%u sx=%u en=%u",
__entry->call,
afs: Fix tracepoint string placement with built-in AFS To quote Alexey[1]: I was adding custom tracepoint to the kernel, grabbed full F34 kernel .config, disabled modules and booted whole shebang as VM kernel. Then did perf record -a -e ... It crashed: general protection fault, probably for non-canonical address 0x435f5346592e4243: 0000 [#1] SMP PTI CPU: 1 PID: 842 Comm: cat Not tainted 5.12.6+ #26 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.14.0-1.fc33 04/01/2014 RIP: 0010:t_show+0x22/0xd0 Then reproducer was narrowed to # cat /sys/kernel/tracing/printk_formats Original F34 kernel with modules didn't crash. So I started to disable options and after disabling AFS everything started working again. The root cause is that AFS was placing char arrays content into a section full of _pointers_ to strings with predictable consequences. Non canonical address 435f5346592e4243 is "CB.YFS_" which came from CM_NAME macro. Steps to reproduce: CONFIG_AFS=y CONFIG_TRACING=y # cat /sys/kernel/tracing/printk_formats Fix this by the following means: (1) Add enum->string translation tables in the event header with the AFS and YFS cache/callback manager operations listed by RPC operation ID. (2) Modify the afs_cb_call tracepoint to print the string from the translation table rather than using the string at the afs_call name pointer. (3) Switch translation table depending on the service we're being accessed as (AFS or YFS) in the tracepoint print clause. Will this cause problems to userspace utilities? Note that the symbolic representation of the YFS service ID isn't available to this header, so I've put it in as a number. I'm not sure if this is the best way to do this. (4) Remove the name wrangling (CM_NAME) macro and put the names directly into the afs_call_type structs in cmservice.c. Fixes: 8e8d7f13b6d5a9 ("afs: Add some tracepoints") Reported-by: Alexey Dobriyan (SK hynix) <adobriyan@gmail.com> Signed-off-by: David Howells <dhowells@redhat.com> Reviewed-by: Steven Rostedt (VMware) <rostedt@goodmis.org> Reviewed-by: Marc Dionne <marc.dionne@auristor.com> cc: Andrew Morton <akpm@linux-foundation.org> cc: linux-afs@lists.infradead.org Link: https://lore.kernel.org/r/YLAXfvZ+rObEOdc%2F@localhost.localdomain/ [1] Link: https://lore.kernel.org/r/643721.1623754699@warthog.procyon.org.uk/ Link: https://lore.kernel.org/r/162430903582.2896199.6098150063997983353.stgit@warthog.procyon.org.uk/ # v1 Link: https://lore.kernel.org/r/162609463957.3133237.15916579353149746363.stgit@warthog.procyon.org.uk/ # v1 (repost) Link: https://lore.kernel.org/r/162610726860.3408253.445207609466288531.stgit@warthog.procyon.org.uk/ # v2
2021-06-15 11:57:26 +01:00
__entry->service_id == 2501 ?
__print_symbolic(__entry->op, yfs_cm_operations) :
__print_symbolic(__entry->op, afs_cm_operations),
__entry->service_id,
__entry->security_ix,
__entry->enctype)
);
afs: Refcount the afs_call struct A static checker warning occurs in the AFS filesystem: fs/afs/cmservice.c:155 SRXAFSCB_CallBack() error: dereferencing freed memory 'call' due to the reply being sent before we access the server it points to. The act of sending the reply causes the call to be freed if an error occurs (but not if it doesn't). On top of this, the lifetime handling of afs_call structs is fragile because they get passed around through workqueues without any sort of refcounting. Deal with the issues by: (1) Fix the maybe/maybe not nature of the reply sending functions with regards to whether they release the call struct. (2) Refcount the afs_call struct and sort out places that need to get/put references. (3) Pass a ref through the work queue and release (or pass on) that ref in the work function. Care has to be taken because a work queue may already own a ref to the call. (4) Do the cleaning up in the put function only. (5) Simplify module cleanup by always incrementing afs_outstanding_calls whenever a call is allocated. (6) Set the backlog to 0 with kernel_listen() at the beginning of the process of closing the socket to prevent new incoming calls from occurring and to remove the contribution of preallocated calls from afs_outstanding_calls before we wait on it. A tracepoint is also added to monitor the afs_call refcount and lifetime. Reported-by: Dan Carpenter <dan.carpenter@oracle.com> Signed-off-by: David Howells <dhowells@redhat.com> Fixes: 08e0e7c82eea: "[AF_RXRPC]: Make the in-kernel AFS filesystem use AF_RXRPC."
2017-01-05 10:38:36 +00:00
TRACE_EVENT(afs_call,
TP_PROTO(unsigned int call_debug_id, enum afs_call_trace op,
int ref, int outstanding, const void *where),
afs: Refcount the afs_call struct A static checker warning occurs in the AFS filesystem: fs/afs/cmservice.c:155 SRXAFSCB_CallBack() error: dereferencing freed memory 'call' due to the reply being sent before we access the server it points to. The act of sending the reply causes the call to be freed if an error occurs (but not if it doesn't). On top of this, the lifetime handling of afs_call structs is fragile because they get passed around through workqueues without any sort of refcounting. Deal with the issues by: (1) Fix the maybe/maybe not nature of the reply sending functions with regards to whether they release the call struct. (2) Refcount the afs_call struct and sort out places that need to get/put references. (3) Pass a ref through the work queue and release (or pass on) that ref in the work function. Care has to be taken because a work queue may already own a ref to the call. (4) Do the cleaning up in the put function only. (5) Simplify module cleanup by always incrementing afs_outstanding_calls whenever a call is allocated. (6) Set the backlog to 0 with kernel_listen() at the beginning of the process of closing the socket to prevent new incoming calls from occurring and to remove the contribution of preallocated calls from afs_outstanding_calls before we wait on it. A tracepoint is also added to monitor the afs_call refcount and lifetime. Reported-by: Dan Carpenter <dan.carpenter@oracle.com> Signed-off-by: David Howells <dhowells@redhat.com> Fixes: 08e0e7c82eea: "[AF_RXRPC]: Make the in-kernel AFS filesystem use AF_RXRPC."
2017-01-05 10:38:36 +00:00
TP_ARGS(call_debug_id, op, ref, outstanding, where),
afs: Refcount the afs_call struct A static checker warning occurs in the AFS filesystem: fs/afs/cmservice.c:155 SRXAFSCB_CallBack() error: dereferencing freed memory 'call' due to the reply being sent before we access the server it points to. The act of sending the reply causes the call to be freed if an error occurs (but not if it doesn't). On top of this, the lifetime handling of afs_call structs is fragile because they get passed around through workqueues without any sort of refcounting. Deal with the issues by: (1) Fix the maybe/maybe not nature of the reply sending functions with regards to whether they release the call struct. (2) Refcount the afs_call struct and sort out places that need to get/put references. (3) Pass a ref through the work queue and release (or pass on) that ref in the work function. Care has to be taken because a work queue may already own a ref to the call. (4) Do the cleaning up in the put function only. (5) Simplify module cleanup by always incrementing afs_outstanding_calls whenever a call is allocated. (6) Set the backlog to 0 with kernel_listen() at the beginning of the process of closing the socket to prevent new incoming calls from occurring and to remove the contribution of preallocated calls from afs_outstanding_calls before we wait on it. A tracepoint is also added to monitor the afs_call refcount and lifetime. Reported-by: Dan Carpenter <dan.carpenter@oracle.com> Signed-off-by: David Howells <dhowells@redhat.com> Fixes: 08e0e7c82eea: "[AF_RXRPC]: Make the in-kernel AFS filesystem use AF_RXRPC."
2017-01-05 10:38:36 +00:00
TP_STRUCT__entry(
__field(unsigned int, call)
__field(int, op)
__field(int, ref)
__field(int, outstanding)
__field(const void *, where)
afs: Refcount the afs_call struct A static checker warning occurs in the AFS filesystem: fs/afs/cmservice.c:155 SRXAFSCB_CallBack() error: dereferencing freed memory 'call' due to the reply being sent before we access the server it points to. The act of sending the reply causes the call to be freed if an error occurs (but not if it doesn't). On top of this, the lifetime handling of afs_call structs is fragile because they get passed around through workqueues without any sort of refcounting. Deal with the issues by: (1) Fix the maybe/maybe not nature of the reply sending functions with regards to whether they release the call struct. (2) Refcount the afs_call struct and sort out places that need to get/put references. (3) Pass a ref through the work queue and release (or pass on) that ref in the work function. Care has to be taken because a work queue may already own a ref to the call. (4) Do the cleaning up in the put function only. (5) Simplify module cleanup by always incrementing afs_outstanding_calls whenever a call is allocated. (6) Set the backlog to 0 with kernel_listen() at the beginning of the process of closing the socket to prevent new incoming calls from occurring and to remove the contribution of preallocated calls from afs_outstanding_calls before we wait on it. A tracepoint is also added to monitor the afs_call refcount and lifetime. Reported-by: Dan Carpenter <dan.carpenter@oracle.com> Signed-off-by: David Howells <dhowells@redhat.com> Fixes: 08e0e7c82eea: "[AF_RXRPC]: Make the in-kernel AFS filesystem use AF_RXRPC."
2017-01-05 10:38:36 +00:00
),
TP_fast_assign(
__entry->call = call_debug_id;
afs: Refcount the afs_call struct A static checker warning occurs in the AFS filesystem: fs/afs/cmservice.c:155 SRXAFSCB_CallBack() error: dereferencing freed memory 'call' due to the reply being sent before we access the server it points to. The act of sending the reply causes the call to be freed if an error occurs (but not if it doesn't). On top of this, the lifetime handling of afs_call structs is fragile because they get passed around through workqueues without any sort of refcounting. Deal with the issues by: (1) Fix the maybe/maybe not nature of the reply sending functions with regards to whether they release the call struct. (2) Refcount the afs_call struct and sort out places that need to get/put references. (3) Pass a ref through the work queue and release (or pass on) that ref in the work function. Care has to be taken because a work queue may already own a ref to the call. (4) Do the cleaning up in the put function only. (5) Simplify module cleanup by always incrementing afs_outstanding_calls whenever a call is allocated. (6) Set the backlog to 0 with kernel_listen() at the beginning of the process of closing the socket to prevent new incoming calls from occurring and to remove the contribution of preallocated calls from afs_outstanding_calls before we wait on it. A tracepoint is also added to monitor the afs_call refcount and lifetime. Reported-by: Dan Carpenter <dan.carpenter@oracle.com> Signed-off-by: David Howells <dhowells@redhat.com> Fixes: 08e0e7c82eea: "[AF_RXRPC]: Make the in-kernel AFS filesystem use AF_RXRPC."
2017-01-05 10:38:36 +00:00
__entry->op = op;
__entry->ref = ref;
afs: Refcount the afs_call struct A static checker warning occurs in the AFS filesystem: fs/afs/cmservice.c:155 SRXAFSCB_CallBack() error: dereferencing freed memory 'call' due to the reply being sent before we access the server it points to. The act of sending the reply causes the call to be freed if an error occurs (but not if it doesn't). On top of this, the lifetime handling of afs_call structs is fragile because they get passed around through workqueues without any sort of refcounting. Deal with the issues by: (1) Fix the maybe/maybe not nature of the reply sending functions with regards to whether they release the call struct. (2) Refcount the afs_call struct and sort out places that need to get/put references. (3) Pass a ref through the work queue and release (or pass on) that ref in the work function. Care has to be taken because a work queue may already own a ref to the call. (4) Do the cleaning up in the put function only. (5) Simplify module cleanup by always incrementing afs_outstanding_calls whenever a call is allocated. (6) Set the backlog to 0 with kernel_listen() at the beginning of the process of closing the socket to prevent new incoming calls from occurring and to remove the contribution of preallocated calls from afs_outstanding_calls before we wait on it. A tracepoint is also added to monitor the afs_call refcount and lifetime. Reported-by: Dan Carpenter <dan.carpenter@oracle.com> Signed-off-by: David Howells <dhowells@redhat.com> Fixes: 08e0e7c82eea: "[AF_RXRPC]: Make the in-kernel AFS filesystem use AF_RXRPC."
2017-01-05 10:38:36 +00:00
__entry->outstanding = outstanding;
__entry->where = where;
),
TP_printk("c=%08x %s r=%d o=%d sp=%pSR",
afs: Refcount the afs_call struct A static checker warning occurs in the AFS filesystem: fs/afs/cmservice.c:155 SRXAFSCB_CallBack() error: dereferencing freed memory 'call' due to the reply being sent before we access the server it points to. The act of sending the reply causes the call to be freed if an error occurs (but not if it doesn't). On top of this, the lifetime handling of afs_call structs is fragile because they get passed around through workqueues without any sort of refcounting. Deal with the issues by: (1) Fix the maybe/maybe not nature of the reply sending functions with regards to whether they release the call struct. (2) Refcount the afs_call struct and sort out places that need to get/put references. (3) Pass a ref through the work queue and release (or pass on) that ref in the work function. Care has to be taken because a work queue may already own a ref to the call. (4) Do the cleaning up in the put function only. (5) Simplify module cleanup by always incrementing afs_outstanding_calls whenever a call is allocated. (6) Set the backlog to 0 with kernel_listen() at the beginning of the process of closing the socket to prevent new incoming calls from occurring and to remove the contribution of preallocated calls from afs_outstanding_calls before we wait on it. A tracepoint is also added to monitor the afs_call refcount and lifetime. Reported-by: Dan Carpenter <dan.carpenter@oracle.com> Signed-off-by: David Howells <dhowells@redhat.com> Fixes: 08e0e7c82eea: "[AF_RXRPC]: Make the in-kernel AFS filesystem use AF_RXRPC."
2017-01-05 10:38:36 +00:00
__entry->call,
__print_symbolic(__entry->op, afs_call_traces),
__entry->ref,
afs: Refcount the afs_call struct A static checker warning occurs in the AFS filesystem: fs/afs/cmservice.c:155 SRXAFSCB_CallBack() error: dereferencing freed memory 'call' due to the reply being sent before we access the server it points to. The act of sending the reply causes the call to be freed if an error occurs (but not if it doesn't). On top of this, the lifetime handling of afs_call structs is fragile because they get passed around through workqueues without any sort of refcounting. Deal with the issues by: (1) Fix the maybe/maybe not nature of the reply sending functions with regards to whether they release the call struct. (2) Refcount the afs_call struct and sort out places that need to get/put references. (3) Pass a ref through the work queue and release (or pass on) that ref in the work function. Care has to be taken because a work queue may already own a ref to the call. (4) Do the cleaning up in the put function only. (5) Simplify module cleanup by always incrementing afs_outstanding_calls whenever a call is allocated. (6) Set the backlog to 0 with kernel_listen() at the beginning of the process of closing the socket to prevent new incoming calls from occurring and to remove the contribution of preallocated calls from afs_outstanding_calls before we wait on it. A tracepoint is also added to monitor the afs_call refcount and lifetime. Reported-by: Dan Carpenter <dan.carpenter@oracle.com> Signed-off-by: David Howells <dhowells@redhat.com> Fixes: 08e0e7c82eea: "[AF_RXRPC]: Make the in-kernel AFS filesystem use AF_RXRPC."
2017-01-05 10:38:36 +00:00
__entry->outstanding,
__entry->where)
);
TRACE_EVENT(afs_make_fs_call,
TP_PROTO(struct afs_call *call, const struct afs_fid *fid),
TP_ARGS(call, fid),
TP_STRUCT__entry(
__field(unsigned int, call)
__field(enum afs_fs_operation, op)
__field_struct(struct afs_fid, fid)
),
TP_fast_assign(
__entry->call = call->debug_id;
__entry->op = call->operation_ID;
if (fid) {
__entry->fid = *fid;
} else {
__entry->fid.vid = 0;
__entry->fid.vnode = 0;
__entry->fid.unique = 0;
}
),
TP_printk("c=%08x V=%llx i=%llx:%x %s",
__entry->call,
__entry->fid.vid,
__entry->fid.vnode,
__entry->fid.unique,
__print_symbolic(__entry->op, afs_fs_operations))
);
TRACE_EVENT(afs_make_fs_calli,
TP_PROTO(struct afs_call *call, const struct afs_fid *fid,
unsigned int i),
TP_ARGS(call, fid, i),
TP_STRUCT__entry(
__field(unsigned int, call)
__field(unsigned int, i)
__field(enum afs_fs_operation, op)
__field_struct(struct afs_fid, fid)
),
TP_fast_assign(
__entry->call = call->debug_id;
__entry->i = i;
__entry->op = call->operation_ID;
if (fid) {
__entry->fid = *fid;
} else {
__entry->fid.vid = 0;
__entry->fid.vnode = 0;
__entry->fid.unique = 0;
}
),
TP_printk("c=%08x V=%llx i=%llx:%x %s i=%u",
__entry->call,
__entry->fid.vid,
__entry->fid.vnode,
__entry->fid.unique,
__print_symbolic(__entry->op, afs_fs_operations),
__entry->i)
);
TRACE_EVENT(afs_make_fs_call1,
TP_PROTO(struct afs_call *call, const struct afs_fid *fid,
afs: Build an abstraction around an "operation" concept Turn the afs_operation struct into the main way that most fileserver operations are managed. Various things are added to the struct, including the following: (1) All the parameters and results of the relevant operations are moved into it, removing corresponding fields from the afs_call struct. afs_call gets a pointer to the op. (2) The target volume is made the main focus of the operation, rather than the target vnode(s), and a bunch of op->vnode->volume are made op->volume instead. (3) Two vnode records are defined (op->file[]) for the vnode(s) involved in most operations. The vnode record (struct afs_vnode_param) contains: - The vnode pointer. - The fid of the vnode to be included in the parameters or that was returned in the reply (eg. FS.MakeDir). - The status and callback information that may be returned in the reply about the vnode. - Callback break and data version tracking for detecting simultaneous third-parth changes. (4) Pointers to dentries to be updated with new inodes. (5) An operations table pointer. The table includes pointers to functions for issuing AFS and YFS-variant RPCs, handling the success and abort of an operation and handling post-I/O-lock local editing of a directory. To make this work, the following function restructuring is made: (A) The rotation loop that issues calls to fileservers that can be found in each function that wants to issue an RPC (such as afs_mkdir()) is extracted out into common code, in a new file called fs_operation.c. (B) The rotation loops, such as the one in afs_mkdir(), are replaced with a much smaller piece of code that allocates an operation, sets the parameters and then calls out to the common code to do the actual work. (C) The code for handling the success and failure of an operation are moved into operation functions (as (5) above) and these are called from the core code at appropriate times. (D) The pseudo inode getting stuff used by the dynamic root code is moved over into dynroot.c. (E) struct afs_iget_data is absorbed into the operation struct and afs_iget() expects to be given an op pointer and a vnode record. (F) Point (E) doesn't work for the root dir of a volume, but we know the FID in advance (it's always vnode 1, unique 1), so a separate inode getter, afs_root_iget(), is provided to special-case that. (G) The inode status init/update functions now also take an op and a vnode record. (H) The RPC marshalling functions now, for the most part, just take an afs_operation struct as their only argument. All the data they need is held there. The result delivery functions write their answers there as well. (I) The call is attached to the operation and then the operation core does the waiting. And then the new operation code is, for the moment, made to just initialise the operation, get the appropriate vnode I/O locks and do the same rotation loop as before. This lays the foundation for the following changes in the future: (*) Overhauling the rotation (again). (*) Support for asynchronous I/O, where the fileserver rotation must be done asynchronously also. Signed-off-by: David Howells <dhowells@redhat.com>
2020-04-10 20:51:51 +01:00
const struct qstr *name),
TP_ARGS(call, fid, name),
TP_STRUCT__entry(
__field(unsigned int, call)
__field(enum afs_fs_operation, op)
__field_struct(struct afs_fid, fid)
__array(char, name, 24)
),
TP_fast_assign(
afs: Build an abstraction around an "operation" concept Turn the afs_operation struct into the main way that most fileserver operations are managed. Various things are added to the struct, including the following: (1) All the parameters and results of the relevant operations are moved into it, removing corresponding fields from the afs_call struct. afs_call gets a pointer to the op. (2) The target volume is made the main focus of the operation, rather than the target vnode(s), and a bunch of op->vnode->volume are made op->volume instead. (3) Two vnode records are defined (op->file[]) for the vnode(s) involved in most operations. The vnode record (struct afs_vnode_param) contains: - The vnode pointer. - The fid of the vnode to be included in the parameters or that was returned in the reply (eg. FS.MakeDir). - The status and callback information that may be returned in the reply about the vnode. - Callback break and data version tracking for detecting simultaneous third-parth changes. (4) Pointers to dentries to be updated with new inodes. (5) An operations table pointer. The table includes pointers to functions for issuing AFS and YFS-variant RPCs, handling the success and abort of an operation and handling post-I/O-lock local editing of a directory. To make this work, the following function restructuring is made: (A) The rotation loop that issues calls to fileservers that can be found in each function that wants to issue an RPC (such as afs_mkdir()) is extracted out into common code, in a new file called fs_operation.c. (B) The rotation loops, such as the one in afs_mkdir(), are replaced with a much smaller piece of code that allocates an operation, sets the parameters and then calls out to the common code to do the actual work. (C) The code for handling the success and failure of an operation are moved into operation functions (as (5) above) and these are called from the core code at appropriate times. (D) The pseudo inode getting stuff used by the dynamic root code is moved over into dynroot.c. (E) struct afs_iget_data is absorbed into the operation struct and afs_iget() expects to be given an op pointer and a vnode record. (F) Point (E) doesn't work for the root dir of a volume, but we know the FID in advance (it's always vnode 1, unique 1), so a separate inode getter, afs_root_iget(), is provided to special-case that. (G) The inode status init/update functions now also take an op and a vnode record. (H) The RPC marshalling functions now, for the most part, just take an afs_operation struct as their only argument. All the data they need is held there. The result delivery functions write their answers there as well. (I) The call is attached to the operation and then the operation core does the waiting. And then the new operation code is, for the moment, made to just initialise the operation, get the appropriate vnode I/O locks and do the same rotation loop as before. This lays the foundation for the following changes in the future: (*) Overhauling the rotation (again). (*) Support for asynchronous I/O, where the fileserver rotation must be done asynchronously also. Signed-off-by: David Howells <dhowells@redhat.com>
2020-04-10 20:51:51 +01:00
unsigned int __len = min_t(unsigned int, name->len, 23);
__entry->call = call->debug_id;
__entry->op = call->operation_ID;
if (fid) {
__entry->fid = *fid;
} else {
__entry->fid.vid = 0;
__entry->fid.vnode = 0;
__entry->fid.unique = 0;
}
afs: Build an abstraction around an "operation" concept Turn the afs_operation struct into the main way that most fileserver operations are managed. Various things are added to the struct, including the following: (1) All the parameters and results of the relevant operations are moved into it, removing corresponding fields from the afs_call struct. afs_call gets a pointer to the op. (2) The target volume is made the main focus of the operation, rather than the target vnode(s), and a bunch of op->vnode->volume are made op->volume instead. (3) Two vnode records are defined (op->file[]) for the vnode(s) involved in most operations. The vnode record (struct afs_vnode_param) contains: - The vnode pointer. - The fid of the vnode to be included in the parameters or that was returned in the reply (eg. FS.MakeDir). - The status and callback information that may be returned in the reply about the vnode. - Callback break and data version tracking for detecting simultaneous third-parth changes. (4) Pointers to dentries to be updated with new inodes. (5) An operations table pointer. The table includes pointers to functions for issuing AFS and YFS-variant RPCs, handling the success and abort of an operation and handling post-I/O-lock local editing of a directory. To make this work, the following function restructuring is made: (A) The rotation loop that issues calls to fileservers that can be found in each function that wants to issue an RPC (such as afs_mkdir()) is extracted out into common code, in a new file called fs_operation.c. (B) The rotation loops, such as the one in afs_mkdir(), are replaced with a much smaller piece of code that allocates an operation, sets the parameters and then calls out to the common code to do the actual work. (C) The code for handling the success and failure of an operation are moved into operation functions (as (5) above) and these are called from the core code at appropriate times. (D) The pseudo inode getting stuff used by the dynamic root code is moved over into dynroot.c. (E) struct afs_iget_data is absorbed into the operation struct and afs_iget() expects to be given an op pointer and a vnode record. (F) Point (E) doesn't work for the root dir of a volume, but we know the FID in advance (it's always vnode 1, unique 1), so a separate inode getter, afs_root_iget(), is provided to special-case that. (G) The inode status init/update functions now also take an op and a vnode record. (H) The RPC marshalling functions now, for the most part, just take an afs_operation struct as their only argument. All the data they need is held there. The result delivery functions write their answers there as well. (I) The call is attached to the operation and then the operation core does the waiting. And then the new operation code is, for the moment, made to just initialise the operation, get the appropriate vnode I/O locks and do the same rotation loop as before. This lays the foundation for the following changes in the future: (*) Overhauling the rotation (again). (*) Support for asynchronous I/O, where the fileserver rotation must be done asynchronously also. Signed-off-by: David Howells <dhowells@redhat.com>
2020-04-10 20:51:51 +01:00
memcpy(__entry->name, name->name, __len);
__entry->name[__len] = 0;
),
TP_printk("c=%08x V=%llx i=%llx:%x %s \"%s\"",
__entry->call,
__entry->fid.vid,
__entry->fid.vnode,
__entry->fid.unique,
__print_symbolic(__entry->op, afs_fs_operations),
__entry->name)
);
TRACE_EVENT(afs_make_fs_call2,
TP_PROTO(struct afs_call *call, const struct afs_fid *fid,
afs: Build an abstraction around an "operation" concept Turn the afs_operation struct into the main way that most fileserver operations are managed. Various things are added to the struct, including the following: (1) All the parameters and results of the relevant operations are moved into it, removing corresponding fields from the afs_call struct. afs_call gets a pointer to the op. (2) The target volume is made the main focus of the operation, rather than the target vnode(s), and a bunch of op->vnode->volume are made op->volume instead. (3) Two vnode records are defined (op->file[]) for the vnode(s) involved in most operations. The vnode record (struct afs_vnode_param) contains: - The vnode pointer. - The fid of the vnode to be included in the parameters or that was returned in the reply (eg. FS.MakeDir). - The status and callback information that may be returned in the reply about the vnode. - Callback break and data version tracking for detecting simultaneous third-parth changes. (4) Pointers to dentries to be updated with new inodes. (5) An operations table pointer. The table includes pointers to functions for issuing AFS and YFS-variant RPCs, handling the success and abort of an operation and handling post-I/O-lock local editing of a directory. To make this work, the following function restructuring is made: (A) The rotation loop that issues calls to fileservers that can be found in each function that wants to issue an RPC (such as afs_mkdir()) is extracted out into common code, in a new file called fs_operation.c. (B) The rotation loops, such as the one in afs_mkdir(), are replaced with a much smaller piece of code that allocates an operation, sets the parameters and then calls out to the common code to do the actual work. (C) The code for handling the success and failure of an operation are moved into operation functions (as (5) above) and these are called from the core code at appropriate times. (D) The pseudo inode getting stuff used by the dynamic root code is moved over into dynroot.c. (E) struct afs_iget_data is absorbed into the operation struct and afs_iget() expects to be given an op pointer and a vnode record. (F) Point (E) doesn't work for the root dir of a volume, but we know the FID in advance (it's always vnode 1, unique 1), so a separate inode getter, afs_root_iget(), is provided to special-case that. (G) The inode status init/update functions now also take an op and a vnode record. (H) The RPC marshalling functions now, for the most part, just take an afs_operation struct as their only argument. All the data they need is held there. The result delivery functions write their answers there as well. (I) The call is attached to the operation and then the operation core does the waiting. And then the new operation code is, for the moment, made to just initialise the operation, get the appropriate vnode I/O locks and do the same rotation loop as before. This lays the foundation for the following changes in the future: (*) Overhauling the rotation (again). (*) Support for asynchronous I/O, where the fileserver rotation must be done asynchronously also. Signed-off-by: David Howells <dhowells@redhat.com>
2020-04-10 20:51:51 +01:00
const struct qstr *name, const struct qstr *name2),
TP_ARGS(call, fid, name, name2),
TP_STRUCT__entry(
__field(unsigned int, call)
__field(enum afs_fs_operation, op)
__field_struct(struct afs_fid, fid)
__array(char, name, 24)
__array(char, name2, 24)
),
TP_fast_assign(
afs: Build an abstraction around an "operation" concept Turn the afs_operation struct into the main way that most fileserver operations are managed. Various things are added to the struct, including the following: (1) All the parameters and results of the relevant operations are moved into it, removing corresponding fields from the afs_call struct. afs_call gets a pointer to the op. (2) The target volume is made the main focus of the operation, rather than the target vnode(s), and a bunch of op->vnode->volume are made op->volume instead. (3) Two vnode records are defined (op->file[]) for the vnode(s) involved in most operations. The vnode record (struct afs_vnode_param) contains: - The vnode pointer. - The fid of the vnode to be included in the parameters or that was returned in the reply (eg. FS.MakeDir). - The status and callback information that may be returned in the reply about the vnode. - Callback break and data version tracking for detecting simultaneous third-parth changes. (4) Pointers to dentries to be updated with new inodes. (5) An operations table pointer. The table includes pointers to functions for issuing AFS and YFS-variant RPCs, handling the success and abort of an operation and handling post-I/O-lock local editing of a directory. To make this work, the following function restructuring is made: (A) The rotation loop that issues calls to fileservers that can be found in each function that wants to issue an RPC (such as afs_mkdir()) is extracted out into common code, in a new file called fs_operation.c. (B) The rotation loops, such as the one in afs_mkdir(), are replaced with a much smaller piece of code that allocates an operation, sets the parameters and then calls out to the common code to do the actual work. (C) The code for handling the success and failure of an operation are moved into operation functions (as (5) above) and these are called from the core code at appropriate times. (D) The pseudo inode getting stuff used by the dynamic root code is moved over into dynroot.c. (E) struct afs_iget_data is absorbed into the operation struct and afs_iget() expects to be given an op pointer and a vnode record. (F) Point (E) doesn't work for the root dir of a volume, but we know the FID in advance (it's always vnode 1, unique 1), so a separate inode getter, afs_root_iget(), is provided to special-case that. (G) The inode status init/update functions now also take an op and a vnode record. (H) The RPC marshalling functions now, for the most part, just take an afs_operation struct as their only argument. All the data they need is held there. The result delivery functions write their answers there as well. (I) The call is attached to the operation and then the operation core does the waiting. And then the new operation code is, for the moment, made to just initialise the operation, get the appropriate vnode I/O locks and do the same rotation loop as before. This lays the foundation for the following changes in the future: (*) Overhauling the rotation (again). (*) Support for asynchronous I/O, where the fileserver rotation must be done asynchronously also. Signed-off-by: David Howells <dhowells@redhat.com>
2020-04-10 20:51:51 +01:00
unsigned int __len = min_t(unsigned int, name->len, 23);
unsigned int __len2 = min_t(unsigned int, name2->len, 23);
__entry->call = call->debug_id;
__entry->op = call->operation_ID;
if (fid) {
__entry->fid = *fid;
} else {
__entry->fid.vid = 0;
__entry->fid.vnode = 0;
__entry->fid.unique = 0;
}
afs: Build an abstraction around an "operation" concept Turn the afs_operation struct into the main way that most fileserver operations are managed. Various things are added to the struct, including the following: (1) All the parameters and results of the relevant operations are moved into it, removing corresponding fields from the afs_call struct. afs_call gets a pointer to the op. (2) The target volume is made the main focus of the operation, rather than the target vnode(s), and a bunch of op->vnode->volume are made op->volume instead. (3) Two vnode records are defined (op->file[]) for the vnode(s) involved in most operations. The vnode record (struct afs_vnode_param) contains: - The vnode pointer. - The fid of the vnode to be included in the parameters or that was returned in the reply (eg. FS.MakeDir). - The status and callback information that may be returned in the reply about the vnode. - Callback break and data version tracking for detecting simultaneous third-parth changes. (4) Pointers to dentries to be updated with new inodes. (5) An operations table pointer. The table includes pointers to functions for issuing AFS and YFS-variant RPCs, handling the success and abort of an operation and handling post-I/O-lock local editing of a directory. To make this work, the following function restructuring is made: (A) The rotation loop that issues calls to fileservers that can be found in each function that wants to issue an RPC (such as afs_mkdir()) is extracted out into common code, in a new file called fs_operation.c. (B) The rotation loops, such as the one in afs_mkdir(), are replaced with a much smaller piece of code that allocates an operation, sets the parameters and then calls out to the common code to do the actual work. (C) The code for handling the success and failure of an operation are moved into operation functions (as (5) above) and these are called from the core code at appropriate times. (D) The pseudo inode getting stuff used by the dynamic root code is moved over into dynroot.c. (E) struct afs_iget_data is absorbed into the operation struct and afs_iget() expects to be given an op pointer and a vnode record. (F) Point (E) doesn't work for the root dir of a volume, but we know the FID in advance (it's always vnode 1, unique 1), so a separate inode getter, afs_root_iget(), is provided to special-case that. (G) The inode status init/update functions now also take an op and a vnode record. (H) The RPC marshalling functions now, for the most part, just take an afs_operation struct as their only argument. All the data they need is held there. The result delivery functions write their answers there as well. (I) The call is attached to the operation and then the operation core does the waiting. And then the new operation code is, for the moment, made to just initialise the operation, get the appropriate vnode I/O locks and do the same rotation loop as before. This lays the foundation for the following changes in the future: (*) Overhauling the rotation (again). (*) Support for asynchronous I/O, where the fileserver rotation must be done asynchronously also. Signed-off-by: David Howells <dhowells@redhat.com>
2020-04-10 20:51:51 +01:00
memcpy(__entry->name, name->name, __len);
__entry->name[__len] = 0;
afs: Build an abstraction around an "operation" concept Turn the afs_operation struct into the main way that most fileserver operations are managed. Various things are added to the struct, including the following: (1) All the parameters and results of the relevant operations are moved into it, removing corresponding fields from the afs_call struct. afs_call gets a pointer to the op. (2) The target volume is made the main focus of the operation, rather than the target vnode(s), and a bunch of op->vnode->volume are made op->volume instead. (3) Two vnode records are defined (op->file[]) for the vnode(s) involved in most operations. The vnode record (struct afs_vnode_param) contains: - The vnode pointer. - The fid of the vnode to be included in the parameters or that was returned in the reply (eg. FS.MakeDir). - The status and callback information that may be returned in the reply about the vnode. - Callback break and data version tracking for detecting simultaneous third-parth changes. (4) Pointers to dentries to be updated with new inodes. (5) An operations table pointer. The table includes pointers to functions for issuing AFS and YFS-variant RPCs, handling the success and abort of an operation and handling post-I/O-lock local editing of a directory. To make this work, the following function restructuring is made: (A) The rotation loop that issues calls to fileservers that can be found in each function that wants to issue an RPC (such as afs_mkdir()) is extracted out into common code, in a new file called fs_operation.c. (B) The rotation loops, such as the one in afs_mkdir(), are replaced with a much smaller piece of code that allocates an operation, sets the parameters and then calls out to the common code to do the actual work. (C) The code for handling the success and failure of an operation are moved into operation functions (as (5) above) and these are called from the core code at appropriate times. (D) The pseudo inode getting stuff used by the dynamic root code is moved over into dynroot.c. (E) struct afs_iget_data is absorbed into the operation struct and afs_iget() expects to be given an op pointer and a vnode record. (F) Point (E) doesn't work for the root dir of a volume, but we know the FID in advance (it's always vnode 1, unique 1), so a separate inode getter, afs_root_iget(), is provided to special-case that. (G) The inode status init/update functions now also take an op and a vnode record. (H) The RPC marshalling functions now, for the most part, just take an afs_operation struct as their only argument. All the data they need is held there. The result delivery functions write their answers there as well. (I) The call is attached to the operation and then the operation core does the waiting. And then the new operation code is, for the moment, made to just initialise the operation, get the appropriate vnode I/O locks and do the same rotation loop as before. This lays the foundation for the following changes in the future: (*) Overhauling the rotation (again). (*) Support for asynchronous I/O, where the fileserver rotation must be done asynchronously also. Signed-off-by: David Howells <dhowells@redhat.com>
2020-04-10 20:51:51 +01:00
memcpy(__entry->name2, name2->name, __len2);
__entry->name2[__len2] = 0;
),
TP_printk("c=%08x V=%llx i=%llx:%x %s \"%s\" \"%s\"",
__entry->call,
__entry->fid.vid,
__entry->fid.vnode,
__entry->fid.unique,
__print_symbolic(__entry->op, afs_fs_operations),
__entry->name,
__entry->name2)
);
TRACE_EVENT(afs_make_vl_call,
TP_PROTO(struct afs_call *call),
TP_ARGS(call),
TP_STRUCT__entry(
__field(unsigned int, call)
__field(enum afs_vl_operation, op)
),
TP_fast_assign(
__entry->call = call->debug_id;
__entry->op = call->operation_ID;
),
TP_printk("c=%08x %s",
__entry->call,
__print_symbolic(__entry->op, afs_vl_operations))
);
TRACE_EVENT(afs_call_done,
TP_PROTO(struct afs_call *call),
TP_ARGS(call),
TP_STRUCT__entry(
__field(unsigned int, call)
__field(struct rxrpc_call *, rx_call)
__field(int, ret)
__field(u32, abort_code)
),
TP_fast_assign(
__entry->call = call->debug_id;
__entry->rx_call = call->rxcall;
__entry->ret = call->error;
__entry->abort_code = call->abort_code;
),
TP_printk(" c=%08x ret=%d ab=%d [%p]",
__entry->call,
__entry->ret,
__entry->abort_code,
__entry->rx_call)
);
afs: Use ITER_XARRAY for writing Use a single ITER_XARRAY iterator to describe the portion of a file to be transmitted to the server rather than generating a series of small ITER_BVEC iterators on the fly. This will make it easier to implement AIO in afs. In theory we could maybe use one giant ITER_BVEC, but that means potentially allocating a huge array of bio_vec structs (max 256 per page) when in fact the pagecache already has a structure listing all the relevant pages (radix_tree/xarray) that can be walked over. Signed-off-by: David Howells <dhowells@redhat.com> Tested-By: Marc Dionne <marc.dionne@auristor.com> cc: linux-afs@lists.infradead.org cc: linux-cachefs@redhat.com cc: linux-fsdevel@vger.kernel.org Link: https://lore.kernel.org/r/153685395197.14766.16289516750731233933.stgit@warthog.procyon.org.uk/ Link: https://lore.kernel.org/r/158861251312.340223.17924900795425422532.stgit@warthog.procyon.org.uk/ # rfc Link: https://lore.kernel.org/r/159465828607.1377938.6903132788463419368.stgit@warthog.procyon.org.uk/ Link: https://lore.kernel.org/r/160588535018.3465195.14509994354240338307.stgit@warthog.procyon.org.uk/ # rfc Link: https://lore.kernel.org/r/161118152415.1232039.6452879415814850025.stgit@warthog.procyon.org.uk/ # rfc Link: https://lore.kernel.org/r/161161048194.2537118.13763612220937637316.stgit@warthog.procyon.org.uk/ # v2 Link: https://lore.kernel.org/r/161340411602.1303470.4661108879482218408.stgit@warthog.procyon.org.uk/ # v3 Link: https://lore.kernel.org/r/161539555629.286939.5241869986617154517.stgit@warthog.procyon.org.uk/ # v4 Link: https://lore.kernel.org/r/161653811456.2770958.7017388543246759245.stgit@warthog.procyon.org.uk/ # v5 Link: https://lore.kernel.org/r/161789095005.6155.6789055030327407928.stgit@warthog.procyon.org.uk/ # v6
2020-02-06 14:22:28 +00:00
TRACE_EVENT(afs_send_data,
TP_PROTO(struct afs_call *call, struct msghdr *msg),
afs: Use ITER_XARRAY for writing Use a single ITER_XARRAY iterator to describe the portion of a file to be transmitted to the server rather than generating a series of small ITER_BVEC iterators on the fly. This will make it easier to implement AIO in afs. In theory we could maybe use one giant ITER_BVEC, but that means potentially allocating a huge array of bio_vec structs (max 256 per page) when in fact the pagecache already has a structure listing all the relevant pages (radix_tree/xarray) that can be walked over. Signed-off-by: David Howells <dhowells@redhat.com> Tested-By: Marc Dionne <marc.dionne@auristor.com> cc: linux-afs@lists.infradead.org cc: linux-cachefs@redhat.com cc: linux-fsdevel@vger.kernel.org Link: https://lore.kernel.org/r/153685395197.14766.16289516750731233933.stgit@warthog.procyon.org.uk/ Link: https://lore.kernel.org/r/158861251312.340223.17924900795425422532.stgit@warthog.procyon.org.uk/ # rfc Link: https://lore.kernel.org/r/159465828607.1377938.6903132788463419368.stgit@warthog.procyon.org.uk/ Link: https://lore.kernel.org/r/160588535018.3465195.14509994354240338307.stgit@warthog.procyon.org.uk/ # rfc Link: https://lore.kernel.org/r/161118152415.1232039.6452879415814850025.stgit@warthog.procyon.org.uk/ # rfc Link: https://lore.kernel.org/r/161161048194.2537118.13763612220937637316.stgit@warthog.procyon.org.uk/ # v2 Link: https://lore.kernel.org/r/161340411602.1303470.4661108879482218408.stgit@warthog.procyon.org.uk/ # v3 Link: https://lore.kernel.org/r/161539555629.286939.5241869986617154517.stgit@warthog.procyon.org.uk/ # v4 Link: https://lore.kernel.org/r/161653811456.2770958.7017388543246759245.stgit@warthog.procyon.org.uk/ # v5 Link: https://lore.kernel.org/r/161789095005.6155.6789055030327407928.stgit@warthog.procyon.org.uk/ # v6
2020-02-06 14:22:28 +00:00
TP_ARGS(call, msg),
TP_STRUCT__entry(
__field(unsigned int, call)
__field(unsigned int, flags)
__field(loff_t, offset)
__field(loff_t, count)
),
TP_fast_assign(
__entry->call = call->debug_id;
__entry->flags = msg->msg_flags;
afs: Use ITER_XARRAY for writing Use a single ITER_XARRAY iterator to describe the portion of a file to be transmitted to the server rather than generating a series of small ITER_BVEC iterators on the fly. This will make it easier to implement AIO in afs. In theory we could maybe use one giant ITER_BVEC, but that means potentially allocating a huge array of bio_vec structs (max 256 per page) when in fact the pagecache already has a structure listing all the relevant pages (radix_tree/xarray) that can be walked over. Signed-off-by: David Howells <dhowells@redhat.com> Tested-By: Marc Dionne <marc.dionne@auristor.com> cc: linux-afs@lists.infradead.org cc: linux-cachefs@redhat.com cc: linux-fsdevel@vger.kernel.org Link: https://lore.kernel.org/r/153685395197.14766.16289516750731233933.stgit@warthog.procyon.org.uk/ Link: https://lore.kernel.org/r/158861251312.340223.17924900795425422532.stgit@warthog.procyon.org.uk/ # rfc Link: https://lore.kernel.org/r/159465828607.1377938.6903132788463419368.stgit@warthog.procyon.org.uk/ Link: https://lore.kernel.org/r/160588535018.3465195.14509994354240338307.stgit@warthog.procyon.org.uk/ # rfc Link: https://lore.kernel.org/r/161118152415.1232039.6452879415814850025.stgit@warthog.procyon.org.uk/ # rfc Link: https://lore.kernel.org/r/161161048194.2537118.13763612220937637316.stgit@warthog.procyon.org.uk/ # v2 Link: https://lore.kernel.org/r/161340411602.1303470.4661108879482218408.stgit@warthog.procyon.org.uk/ # v3 Link: https://lore.kernel.org/r/161539555629.286939.5241869986617154517.stgit@warthog.procyon.org.uk/ # v4 Link: https://lore.kernel.org/r/161653811456.2770958.7017388543246759245.stgit@warthog.procyon.org.uk/ # v5 Link: https://lore.kernel.org/r/161789095005.6155.6789055030327407928.stgit@warthog.procyon.org.uk/ # v6
2020-02-06 14:22:28 +00:00
__entry->offset = msg->msg_iter.xarray_start + msg->msg_iter.iov_offset;
__entry->count = iov_iter_count(&msg->msg_iter);
),
afs: Use ITER_XARRAY for writing Use a single ITER_XARRAY iterator to describe the portion of a file to be transmitted to the server rather than generating a series of small ITER_BVEC iterators on the fly. This will make it easier to implement AIO in afs. In theory we could maybe use one giant ITER_BVEC, but that means potentially allocating a huge array of bio_vec structs (max 256 per page) when in fact the pagecache already has a structure listing all the relevant pages (radix_tree/xarray) that can be walked over. Signed-off-by: David Howells <dhowells@redhat.com> Tested-By: Marc Dionne <marc.dionne@auristor.com> cc: linux-afs@lists.infradead.org cc: linux-cachefs@redhat.com cc: linux-fsdevel@vger.kernel.org Link: https://lore.kernel.org/r/153685395197.14766.16289516750731233933.stgit@warthog.procyon.org.uk/ Link: https://lore.kernel.org/r/158861251312.340223.17924900795425422532.stgit@warthog.procyon.org.uk/ # rfc Link: https://lore.kernel.org/r/159465828607.1377938.6903132788463419368.stgit@warthog.procyon.org.uk/ Link: https://lore.kernel.org/r/160588535018.3465195.14509994354240338307.stgit@warthog.procyon.org.uk/ # rfc Link: https://lore.kernel.org/r/161118152415.1232039.6452879415814850025.stgit@warthog.procyon.org.uk/ # rfc Link: https://lore.kernel.org/r/161161048194.2537118.13763612220937637316.stgit@warthog.procyon.org.uk/ # v2 Link: https://lore.kernel.org/r/161340411602.1303470.4661108879482218408.stgit@warthog.procyon.org.uk/ # v3 Link: https://lore.kernel.org/r/161539555629.286939.5241869986617154517.stgit@warthog.procyon.org.uk/ # v4 Link: https://lore.kernel.org/r/161653811456.2770958.7017388543246759245.stgit@warthog.procyon.org.uk/ # v5 Link: https://lore.kernel.org/r/161789095005.6155.6789055030327407928.stgit@warthog.procyon.org.uk/ # v6
2020-02-06 14:22:28 +00:00
TP_printk(" c=%08x o=%llx n=%llx f=%x",
__entry->call, __entry->offset, __entry->count,
__entry->flags)
);
afs: Use ITER_XARRAY for writing Use a single ITER_XARRAY iterator to describe the portion of a file to be transmitted to the server rather than generating a series of small ITER_BVEC iterators on the fly. This will make it easier to implement AIO in afs. In theory we could maybe use one giant ITER_BVEC, but that means potentially allocating a huge array of bio_vec structs (max 256 per page) when in fact the pagecache already has a structure listing all the relevant pages (radix_tree/xarray) that can be walked over. Signed-off-by: David Howells <dhowells@redhat.com> Tested-By: Marc Dionne <marc.dionne@auristor.com> cc: linux-afs@lists.infradead.org cc: linux-cachefs@redhat.com cc: linux-fsdevel@vger.kernel.org Link: https://lore.kernel.org/r/153685395197.14766.16289516750731233933.stgit@warthog.procyon.org.uk/ Link: https://lore.kernel.org/r/158861251312.340223.17924900795425422532.stgit@warthog.procyon.org.uk/ # rfc Link: https://lore.kernel.org/r/159465828607.1377938.6903132788463419368.stgit@warthog.procyon.org.uk/ Link: https://lore.kernel.org/r/160588535018.3465195.14509994354240338307.stgit@warthog.procyon.org.uk/ # rfc Link: https://lore.kernel.org/r/161118152415.1232039.6452879415814850025.stgit@warthog.procyon.org.uk/ # rfc Link: https://lore.kernel.org/r/161161048194.2537118.13763612220937637316.stgit@warthog.procyon.org.uk/ # v2 Link: https://lore.kernel.org/r/161340411602.1303470.4661108879482218408.stgit@warthog.procyon.org.uk/ # v3 Link: https://lore.kernel.org/r/161539555629.286939.5241869986617154517.stgit@warthog.procyon.org.uk/ # v4 Link: https://lore.kernel.org/r/161653811456.2770958.7017388543246759245.stgit@warthog.procyon.org.uk/ # v5 Link: https://lore.kernel.org/r/161789095005.6155.6789055030327407928.stgit@warthog.procyon.org.uk/ # v6
2020-02-06 14:22:28 +00:00
TRACE_EVENT(afs_sent_data,
TP_PROTO(struct afs_call *call, struct msghdr *msg, int ret),
afs: Use ITER_XARRAY for writing Use a single ITER_XARRAY iterator to describe the portion of a file to be transmitted to the server rather than generating a series of small ITER_BVEC iterators on the fly. This will make it easier to implement AIO in afs. In theory we could maybe use one giant ITER_BVEC, but that means potentially allocating a huge array of bio_vec structs (max 256 per page) when in fact the pagecache already has a structure listing all the relevant pages (radix_tree/xarray) that can be walked over. Signed-off-by: David Howells <dhowells@redhat.com> Tested-By: Marc Dionne <marc.dionne@auristor.com> cc: linux-afs@lists.infradead.org cc: linux-cachefs@redhat.com cc: linux-fsdevel@vger.kernel.org Link: https://lore.kernel.org/r/153685395197.14766.16289516750731233933.stgit@warthog.procyon.org.uk/ Link: https://lore.kernel.org/r/158861251312.340223.17924900795425422532.stgit@warthog.procyon.org.uk/ # rfc Link: https://lore.kernel.org/r/159465828607.1377938.6903132788463419368.stgit@warthog.procyon.org.uk/ Link: https://lore.kernel.org/r/160588535018.3465195.14509994354240338307.stgit@warthog.procyon.org.uk/ # rfc Link: https://lore.kernel.org/r/161118152415.1232039.6452879415814850025.stgit@warthog.procyon.org.uk/ # rfc Link: https://lore.kernel.org/r/161161048194.2537118.13763612220937637316.stgit@warthog.procyon.org.uk/ # v2 Link: https://lore.kernel.org/r/161340411602.1303470.4661108879482218408.stgit@warthog.procyon.org.uk/ # v3 Link: https://lore.kernel.org/r/161539555629.286939.5241869986617154517.stgit@warthog.procyon.org.uk/ # v4 Link: https://lore.kernel.org/r/161653811456.2770958.7017388543246759245.stgit@warthog.procyon.org.uk/ # v5 Link: https://lore.kernel.org/r/161789095005.6155.6789055030327407928.stgit@warthog.procyon.org.uk/ # v6
2020-02-06 14:22:28 +00:00
TP_ARGS(call, msg, ret),
TP_STRUCT__entry(
__field(unsigned int, call)
__field(int, ret)
__field(loff_t, offset)
__field(loff_t, count)
),
TP_fast_assign(
__entry->call = call->debug_id;
__entry->ret = ret;
afs: Use ITER_XARRAY for writing Use a single ITER_XARRAY iterator to describe the portion of a file to be transmitted to the server rather than generating a series of small ITER_BVEC iterators on the fly. This will make it easier to implement AIO in afs. In theory we could maybe use one giant ITER_BVEC, but that means potentially allocating a huge array of bio_vec structs (max 256 per page) when in fact the pagecache already has a structure listing all the relevant pages (radix_tree/xarray) that can be walked over. Signed-off-by: David Howells <dhowells@redhat.com> Tested-By: Marc Dionne <marc.dionne@auristor.com> cc: linux-afs@lists.infradead.org cc: linux-cachefs@redhat.com cc: linux-fsdevel@vger.kernel.org Link: https://lore.kernel.org/r/153685395197.14766.16289516750731233933.stgit@warthog.procyon.org.uk/ Link: https://lore.kernel.org/r/158861251312.340223.17924900795425422532.stgit@warthog.procyon.org.uk/ # rfc Link: https://lore.kernel.org/r/159465828607.1377938.6903132788463419368.stgit@warthog.procyon.org.uk/ Link: https://lore.kernel.org/r/160588535018.3465195.14509994354240338307.stgit@warthog.procyon.org.uk/ # rfc Link: https://lore.kernel.org/r/161118152415.1232039.6452879415814850025.stgit@warthog.procyon.org.uk/ # rfc Link: https://lore.kernel.org/r/161161048194.2537118.13763612220937637316.stgit@warthog.procyon.org.uk/ # v2 Link: https://lore.kernel.org/r/161340411602.1303470.4661108879482218408.stgit@warthog.procyon.org.uk/ # v3 Link: https://lore.kernel.org/r/161539555629.286939.5241869986617154517.stgit@warthog.procyon.org.uk/ # v4 Link: https://lore.kernel.org/r/161653811456.2770958.7017388543246759245.stgit@warthog.procyon.org.uk/ # v5 Link: https://lore.kernel.org/r/161789095005.6155.6789055030327407928.stgit@warthog.procyon.org.uk/ # v6
2020-02-06 14:22:28 +00:00
__entry->offset = msg->msg_iter.xarray_start + msg->msg_iter.iov_offset;
__entry->count = iov_iter_count(&msg->msg_iter);
),
afs: Use ITER_XARRAY for writing Use a single ITER_XARRAY iterator to describe the portion of a file to be transmitted to the server rather than generating a series of small ITER_BVEC iterators on the fly. This will make it easier to implement AIO in afs. In theory we could maybe use one giant ITER_BVEC, but that means potentially allocating a huge array of bio_vec structs (max 256 per page) when in fact the pagecache already has a structure listing all the relevant pages (radix_tree/xarray) that can be walked over. Signed-off-by: David Howells <dhowells@redhat.com> Tested-By: Marc Dionne <marc.dionne@auristor.com> cc: linux-afs@lists.infradead.org cc: linux-cachefs@redhat.com cc: linux-fsdevel@vger.kernel.org Link: https://lore.kernel.org/r/153685395197.14766.16289516750731233933.stgit@warthog.procyon.org.uk/ Link: https://lore.kernel.org/r/158861251312.340223.17924900795425422532.stgit@warthog.procyon.org.uk/ # rfc Link: https://lore.kernel.org/r/159465828607.1377938.6903132788463419368.stgit@warthog.procyon.org.uk/ Link: https://lore.kernel.org/r/160588535018.3465195.14509994354240338307.stgit@warthog.procyon.org.uk/ # rfc Link: https://lore.kernel.org/r/161118152415.1232039.6452879415814850025.stgit@warthog.procyon.org.uk/ # rfc Link: https://lore.kernel.org/r/161161048194.2537118.13763612220937637316.stgit@warthog.procyon.org.uk/ # v2 Link: https://lore.kernel.org/r/161340411602.1303470.4661108879482218408.stgit@warthog.procyon.org.uk/ # v3 Link: https://lore.kernel.org/r/161539555629.286939.5241869986617154517.stgit@warthog.procyon.org.uk/ # v4 Link: https://lore.kernel.org/r/161653811456.2770958.7017388543246759245.stgit@warthog.procyon.org.uk/ # v5 Link: https://lore.kernel.org/r/161789095005.6155.6789055030327407928.stgit@warthog.procyon.org.uk/ # v6
2020-02-06 14:22:28 +00:00
TP_printk(" c=%08x o=%llx n=%llx r=%x",
__entry->call, __entry->offset, __entry->count,
__entry->ret)
);
TRACE_EVENT(afs_dir_check_failed,
afs: Use netfslib for directories In the AFS ecosystem, directories are just a special type of file that is downloaded and parsed locally. Download is done by the same mechanism as ordinary files and the data can be cached. There is one important semantic restriction on directories over files: the client must download the entire directory in one go because, for example, the server could fabricate the contents of the blob on the fly with each download and give a different image each time. So that we can cache the directory download, switch AFS directory support over to using the netfslib single-object API, thereby allowing directory content to be stored in the local cache. To make this work, the following changes are made: (1) A directory's contents are now stored in a folio_queue chain attached to the afs_vnode (inode) struct rather than its associated pagecache, though multipage folios are still used to hold the data. The folio queue is discarded when the directory inode is evicted. This also helps with the phasing out of ITER_XARRAY. (2) Various directory operations are made to use and unuse the cache cookie. (3) The content checking, content dumping and content iteration are now performed with a standard iov_iter iterator over the contents of the folio queue. (4) Iteration and modification must be done with the vnode's validate_lock held. In conjunction with (1), this means that the iteration can be done without the need to lock pages or take extra refs on them, unlike when accessing ->i_pages. (5) Convert to using netfs_read_single() to read data. (6) Provide a ->writepages() to call netfs_writeback_single() to save the data to the cache according to the VM's scheduling whilst holding the validate_lock read-locked as (4). (7) Change local directory image editing functions: (a) Provide a function to get a specific block by number from the folio_queue as we can no longer use the i_pages xarray to locate folios by index. This uses a cursor to remember the current position as we need to iterate through the directory contents. The block is kmapped before being returned. (b) Make the function in (a) extend the directory by an extra folio if we run out of space. (c) Raise the check of the block free space counter, for those blocks that have one, higher in the function to eliminate a call to get a block. (d) Remove the page unlocking and putting done during the editing loops. This is no longer necessary as the folio_queue holds the references and the pages are no longer in the pagecache. (e) Mark the inode dirty and pin the cache usage till writeback at the end of a successful edit. (8) Don't set the large_folios flag on the inode as we do the allocation ourselves rather than the VM doing it automatically. (9) Mark the inode as being a single object that isn't uploaded to the server. (10) Enable caching on directories. (11) Only set the upload key for writeback for regular files. Notes: (*) We keep the ->release_folio(), ->invalidate_folio() and ->migrate_folio() ops as we set the mapping pointer on the folio. Signed-off-by: David Howells <dhowells@redhat.com> Link: https://lore.kernel.org/r/20241216204124.3752367-22-dhowells@redhat.com cc: Marc Dionne <marc.dionne@auristor.com> cc: Jeff Layton <jlayton@kernel.org> cc: linux-afs@lists.infradead.org cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Signed-off-by: Christian Brauner <brauner@kernel.org>
2024-12-16 20:41:11 +00:00
TP_PROTO(struct afs_vnode *vnode, loff_t off),
afs: Use netfslib for directories In the AFS ecosystem, directories are just a special type of file that is downloaded and parsed locally. Download is done by the same mechanism as ordinary files and the data can be cached. There is one important semantic restriction on directories over files: the client must download the entire directory in one go because, for example, the server could fabricate the contents of the blob on the fly with each download and give a different image each time. So that we can cache the directory download, switch AFS directory support over to using the netfslib single-object API, thereby allowing directory content to be stored in the local cache. To make this work, the following changes are made: (1) A directory's contents are now stored in a folio_queue chain attached to the afs_vnode (inode) struct rather than its associated pagecache, though multipage folios are still used to hold the data. The folio queue is discarded when the directory inode is evicted. This also helps with the phasing out of ITER_XARRAY. (2) Various directory operations are made to use and unuse the cache cookie. (3) The content checking, content dumping and content iteration are now performed with a standard iov_iter iterator over the contents of the folio queue. (4) Iteration and modification must be done with the vnode's validate_lock held. In conjunction with (1), this means that the iteration can be done without the need to lock pages or take extra refs on them, unlike when accessing ->i_pages. (5) Convert to using netfs_read_single() to read data. (6) Provide a ->writepages() to call netfs_writeback_single() to save the data to the cache according to the VM's scheduling whilst holding the validate_lock read-locked as (4). (7) Change local directory image editing functions: (a) Provide a function to get a specific block by number from the folio_queue as we can no longer use the i_pages xarray to locate folios by index. This uses a cursor to remember the current position as we need to iterate through the directory contents. The block is kmapped before being returned. (b) Make the function in (a) extend the directory by an extra folio if we run out of space. (c) Raise the check of the block free space counter, for those blocks that have one, higher in the function to eliminate a call to get a block. (d) Remove the page unlocking and putting done during the editing loops. This is no longer necessary as the folio_queue holds the references and the pages are no longer in the pagecache. (e) Mark the inode dirty and pin the cache usage till writeback at the end of a successful edit. (8) Don't set the large_folios flag on the inode as we do the allocation ourselves rather than the VM doing it automatically. (9) Mark the inode as being a single object that isn't uploaded to the server. (10) Enable caching on directories. (11) Only set the upload key for writeback for regular files. Notes: (*) We keep the ->release_folio(), ->invalidate_folio() and ->migrate_folio() ops as we set the mapping pointer on the folio. Signed-off-by: David Howells <dhowells@redhat.com> Link: https://lore.kernel.org/r/20241216204124.3752367-22-dhowells@redhat.com cc: Marc Dionne <marc.dionne@auristor.com> cc: Jeff Layton <jlayton@kernel.org> cc: linux-afs@lists.infradead.org cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Signed-off-by: Christian Brauner <brauner@kernel.org>
2024-12-16 20:41:11 +00:00
TP_ARGS(vnode, off),
TP_STRUCT__entry(
__field(struct afs_vnode *, vnode)
__field(loff_t, off)
__field(loff_t, i_size)
),
TP_fast_assign(
__entry->vnode = vnode;
__entry->off = off;
afs: Use netfslib for directories In the AFS ecosystem, directories are just a special type of file that is downloaded and parsed locally. Download is done by the same mechanism as ordinary files and the data can be cached. There is one important semantic restriction on directories over files: the client must download the entire directory in one go because, for example, the server could fabricate the contents of the blob on the fly with each download and give a different image each time. So that we can cache the directory download, switch AFS directory support over to using the netfslib single-object API, thereby allowing directory content to be stored in the local cache. To make this work, the following changes are made: (1) A directory's contents are now stored in a folio_queue chain attached to the afs_vnode (inode) struct rather than its associated pagecache, though multipage folios are still used to hold the data. The folio queue is discarded when the directory inode is evicted. This also helps with the phasing out of ITER_XARRAY. (2) Various directory operations are made to use and unuse the cache cookie. (3) The content checking, content dumping and content iteration are now performed with a standard iov_iter iterator over the contents of the folio queue. (4) Iteration and modification must be done with the vnode's validate_lock held. In conjunction with (1), this means that the iteration can be done without the need to lock pages or take extra refs on them, unlike when accessing ->i_pages. (5) Convert to using netfs_read_single() to read data. (6) Provide a ->writepages() to call netfs_writeback_single() to save the data to the cache according to the VM's scheduling whilst holding the validate_lock read-locked as (4). (7) Change local directory image editing functions: (a) Provide a function to get a specific block by number from the folio_queue as we can no longer use the i_pages xarray to locate folios by index. This uses a cursor to remember the current position as we need to iterate through the directory contents. The block is kmapped before being returned. (b) Make the function in (a) extend the directory by an extra folio if we run out of space. (c) Raise the check of the block free space counter, for those blocks that have one, higher in the function to eliminate a call to get a block. (d) Remove the page unlocking and putting done during the editing loops. This is no longer necessary as the folio_queue holds the references and the pages are no longer in the pagecache. (e) Mark the inode dirty and pin the cache usage till writeback at the end of a successful edit. (8) Don't set the large_folios flag on the inode as we do the allocation ourselves rather than the VM doing it automatically. (9) Mark the inode as being a single object that isn't uploaded to the server. (10) Enable caching on directories. (11) Only set the upload key for writeback for regular files. Notes: (*) We keep the ->release_folio(), ->invalidate_folio() and ->migrate_folio() ops as we set the mapping pointer on the folio. Signed-off-by: David Howells <dhowells@redhat.com> Link: https://lore.kernel.org/r/20241216204124.3752367-22-dhowells@redhat.com cc: Marc Dionne <marc.dionne@auristor.com> cc: Jeff Layton <jlayton@kernel.org> cc: linux-afs@lists.infradead.org cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Signed-off-by: Christian Brauner <brauner@kernel.org>
2024-12-16 20:41:11 +00:00
__entry->i_size = i_size_read(&vnode->netfs.inode);
),
TP_printk("vn=%p %llx/%llx",
__entry->vnode, __entry->off, __entry->i_size)
);
TRACE_EVENT(afs_call_state,
TP_PROTO(struct afs_call *call,
enum afs_call_state from,
enum afs_call_state to,
int ret, u32 remote_abort),
TP_ARGS(call, from, to, ret, remote_abort),
TP_STRUCT__entry(
__field(unsigned int, call)
__field(enum afs_call_state, from)
__field(enum afs_call_state, to)
__field(int, ret)
__field(u32, abort)
),
TP_fast_assign(
__entry->call = call->debug_id;
__entry->from = from;
__entry->to = to;
__entry->ret = ret;
__entry->abort = remote_abort;
),
TP_printk("c=%08x %u->%u r=%d ab=%d",
__entry->call,
__entry->from, __entry->to,
__entry->ret, __entry->abort)
);
TRACE_EVENT(afs_lookup,
TP_PROTO(struct afs_vnode *dvnode, const struct qstr *name,
struct afs_fid *fid),
TP_ARGS(dvnode, name, fid),
TP_STRUCT__entry(
__field_struct(struct afs_fid, dfid)
__field_struct(struct afs_fid, fid)
__array(char, name, 24)
),
TP_fast_assign(
int __len = min_t(int, name->len, 23);
__entry->dfid = dvnode->fid;
__entry->fid = *fid;
memcpy(__entry->name, name->name, __len);
__entry->name[__len] = 0;
),
TP_printk("d=%llx:%llx:%x \"%s\" f=%llx:%x",
__entry->dfid.vid, __entry->dfid.vnode, __entry->dfid.unique,
__entry->name,
__entry->fid.vnode, __entry->fid.unique)
);
TRACE_EVENT(afs_edit_dir,
TP_PROTO(struct afs_vnode *dvnode,
enum afs_edit_dir_reason why,
enum afs_edit_dir_op op,
unsigned int block,
unsigned int slot,
unsigned int f_vnode,
unsigned int f_unique,
const char *name),
TP_ARGS(dvnode, why, op, block, slot, f_vnode, f_unique, name),
TP_STRUCT__entry(
__field(unsigned int, vnode)
__field(unsigned int, unique)
__field(enum afs_edit_dir_reason, why)
__field(enum afs_edit_dir_op, op)
__field(unsigned int, block)
__field(unsigned short, slot)
__field(unsigned int, f_vnode)
__field(unsigned int, f_unique)
__array(char, name, 24)
),
TP_fast_assign(
int __len = strlen(name);
__len = min(__len, 23);
__entry->vnode = dvnode->fid.vnode;
__entry->unique = dvnode->fid.unique;
__entry->why = why;
__entry->op = op;
__entry->block = block;
__entry->slot = slot;
__entry->f_vnode = f_vnode;
__entry->f_unique = f_unique;
memcpy(__entry->name, name, __len);
__entry->name[__len] = 0;
),
TP_printk("di=%x:%x %s %s %u[%u] fi=%x:%x \"%s\"",
__entry->vnode, __entry->unique,
__print_symbolic(__entry->why, afs_edit_dir_reasons),
__print_symbolic(__entry->op, afs_edit_dir_ops),
__entry->block, __entry->slot,
__entry->f_vnode, __entry->f_unique,
__entry->name)
);
TRACE_EVENT(afs_dir_invalid,
TP_PROTO(const struct afs_vnode *dvnode, enum afs_dir_invalid_trace trace),
TP_ARGS(dvnode, trace),
TP_STRUCT__entry(
__field(unsigned int, vnode)
__field(unsigned int, unique)
__field(enum afs_dir_invalid_trace, trace)
),
TP_fast_assign(
__entry->vnode = dvnode->fid.vnode;
__entry->unique = dvnode->fid.unique;
__entry->trace = trace;
),
TP_printk("di=%x:%x %s",
__entry->vnode, __entry->unique,
__print_symbolic(__entry->trace, afs_dir_invalid_traces))
);
TRACE_EVENT(afs_cb_promise,
TP_PROTO(const struct afs_vnode *vnode, enum afs_cb_promise_trace trace),
TP_ARGS(vnode, trace),
TP_STRUCT__entry(
__field(unsigned int, vnode)
__field(unsigned int, unique)
__field(enum afs_cb_promise_trace, trace)
),
TP_fast_assign(
__entry->vnode = vnode->fid.vnode;
__entry->unique = vnode->fid.unique;
__entry->trace = trace;
),
TP_printk("di=%x:%x %s",
__entry->vnode, __entry->unique,
__print_symbolic(__entry->trace, afs_cb_promise_traces))
);
TRACE_EVENT(afs_vnode_invalid,
TP_PROTO(const struct afs_vnode *vnode, enum afs_vnode_invalid_trace trace),
TP_ARGS(vnode, trace),
TP_STRUCT__entry(
__field(unsigned int, vnode)
__field(unsigned int, unique)
__field(enum afs_vnode_invalid_trace, trace)
),
TP_fast_assign(
__entry->vnode = vnode->fid.vnode;
__entry->unique = vnode->fid.unique;
__entry->trace = trace;
),
TP_printk("di=%x:%x %s",
__entry->vnode, __entry->unique,
__print_symbolic(__entry->trace, afs_vnode_invalid_traces))
);
TRACE_EVENT(afs_set_dv,
TP_PROTO(const struct afs_vnode *dvnode, u64 new_dv),
TP_ARGS(dvnode, new_dv),
TP_STRUCT__entry(
__field(unsigned int, vnode)
__field(unsigned int, unique)
__field(u64, old_dv)
__field(u64, new_dv)
),
TP_fast_assign(
__entry->vnode = dvnode->fid.vnode;
__entry->unique = dvnode->fid.unique;
__entry->old_dv = dvnode->status.data_version;
__entry->new_dv = new_dv;
),
TP_printk("di=%x:%x dv=%llx -> dv=%llx",
__entry->vnode, __entry->unique,
__entry->old_dv, __entry->new_dv)
);
TRACE_EVENT(afs_dv_mismatch,
TP_PROTO(const struct afs_vnode *dvnode, u64 before_dv, int delta, u64 new_dv),
TP_ARGS(dvnode, before_dv, delta, new_dv),
TP_STRUCT__entry(
__field(unsigned int, vnode)
__field(unsigned int, unique)
__field(int, delta)
__field(u64, before_dv)
__field(u64, new_dv)
),
TP_fast_assign(
__entry->vnode = dvnode->fid.vnode;
__entry->unique = dvnode->fid.unique;
__entry->delta = delta;
__entry->before_dv = before_dv;
__entry->new_dv = new_dv;
),
TP_printk("di=%x:%x xdv=%llx+%d dv=%llx",
__entry->vnode, __entry->unique,
__entry->before_dv, __entry->delta, __entry->new_dv)
);
TRACE_EVENT(afs_protocol_error,
TP_PROTO(struct afs_call *call, enum afs_eproto_cause cause),
TP_ARGS(call, cause),
TP_STRUCT__entry(
__field(unsigned int, call)
__field(enum afs_eproto_cause, cause)
),
TP_fast_assign(
__entry->call = call ? call->debug_id : 0;
__entry->cause = cause;
),
TP_printk("c=%08x %s",
__entry->call,
__print_symbolic(__entry->cause, afs_eproto_causes))
);
TRACE_EVENT(afs_io_error,
TP_PROTO(unsigned int call, int error, enum afs_io_error where),
TP_ARGS(call, error, where),
TP_STRUCT__entry(
__field(unsigned int, call)
__field(int, error)
__field(enum afs_io_error, where)
),
TP_fast_assign(
__entry->call = call;
__entry->error = error;
__entry->where = where;
),
TP_printk("c=%08x r=%d %s",
__entry->call, __entry->error,
__print_symbolic(__entry->where, afs_io_errors))
);
TRACE_EVENT(afs_file_error,
TP_PROTO(struct afs_vnode *vnode, int error, enum afs_file_error where),
TP_ARGS(vnode, error, where),
TP_STRUCT__entry(
__field_struct(struct afs_fid, fid)
__field(int, error)
__field(enum afs_file_error, where)
),
TP_fast_assign(
__entry->fid = vnode->fid;
__entry->error = error;
__entry->where = where;
),
TP_printk("%llx:%llx:%x r=%d %s",
__entry->fid.vid, __entry->fid.vnode, __entry->fid.unique,
__entry->error,
__print_symbolic(__entry->where, afs_file_errors))
);
afs: Fix error handling with lookup via FS.InlineBulkStatus When afs does a lookup, it tries to use FS.InlineBulkStatus to preemptively look up a bunch of files in the parent directory and cache this locally, on the basis that we might want to look at them too (for example if someone does an ls on a directory, they may want want to then stat every file listed). FS.InlineBulkStatus can be considered a compound op with the normal abort code applying to the compound as a whole. Each status fetch within the compound is then given its own individual abort code - but assuming no error that prevents the bulk fetch from returning the compound result will be 0, even if all the constituent status fetches failed. At the conclusion of afs_do_lookup(), we should use the abort code from the appropriate status to determine the error to return, if any - but instead it is assumed that we were successful if the op as a whole succeeded and we return an incompletely initialised inode, resulting in ENOENT, no matter the actual reason. In the particular instance reported, a vnode with no permission granted to be accessed is being given a UAEACCES abort code which should be reported as EACCES, but is instead being reported as ENOENT. Fix this by abandoning the inode (which will be cleaned up with the op) if file[1] has an abort code indicated and turn that abort code into an error instead. Whilst we're at it, add a tracepoint so that the abort codes of the individual subrequests of FS.InlineBulkStatus can be logged. At the moment only the container abort code can be 0. Fixes: e49c7b2f6de7 ("afs: Build an abstraction around an "operation" concept") Reported-by: Jeffrey Altman <jaltman@auristor.com> Signed-off-by: David Howells <dhowells@redhat.com> Reviewed-by: Marc Dionne <marc.dionne@auristor.com> cc: linux-afs@lists.infradead.org
2024-01-02 14:02:37 +00:00
TRACE_EVENT(afs_bulkstat_error,
TP_PROTO(struct afs_operation *op, struct afs_fid *fid, unsigned int index, s32 abort),
TP_ARGS(op, fid, index, abort),
TP_STRUCT__entry(
__field_struct(struct afs_fid, fid)
__field(unsigned int, op)
__field(unsigned int, index)
__field(s32, abort)
),
TP_fast_assign(
__entry->op = op->debug_id;
__entry->fid = *fid;
__entry->index = index;
__entry->abort = abort;
),
TP_printk("OP=%08x[%02x] %llx:%llx:%x a=%d",
__entry->op, __entry->index,
__entry->fid.vid, __entry->fid.vnode, __entry->fid.unique,
__entry->abort)
);
TRACE_EVENT(afs_cm_no_server,
TP_PROTO(struct afs_call *call, const struct sockaddr_rxrpc *srx),
TP_ARGS(call, srx),
TP_STRUCT__entry(
__field(unsigned int, call)
__field(unsigned int, op_id)
__field_struct(struct sockaddr_rxrpc, srx)
),
TP_fast_assign(
__entry->call = call->debug_id;
__entry->op_id = call->operation_ID;
memcpy(&__entry->srx, srx, sizeof(__entry->srx));
),
TP_printk("c=%08x op=%u %pISpc",
__entry->call, __entry->op_id, &__entry->srx.transport)
);
TRACE_EVENT(afs_cm_no_server_u,
TP_PROTO(struct afs_call *call, const uuid_t *uuid),
TP_ARGS(call, uuid),
TP_STRUCT__entry(
__field(unsigned int, call)
__field(unsigned int, op_id)
__field_struct(uuid_t, uuid)
),
TP_fast_assign(
__entry->call = call->debug_id;
__entry->op_id = call->operation_ID;
memcpy(&__entry->uuid, uuid, sizeof(__entry->uuid));
),
TP_printk("c=%08x op=%u %pU",
__entry->call, __entry->op_id, &__entry->uuid)
);
TRACE_EVENT(afs_flock_ev,
TP_PROTO(struct afs_vnode *vnode, struct file_lock *fl,
enum afs_flock_event event, int error),
TP_ARGS(vnode, fl, event, error),
TP_STRUCT__entry(
__field_struct(struct afs_fid, fid)
__field(enum afs_flock_event, event)
__field(enum afs_lock_state, state)
__field(int, error)
__field(unsigned int, debug_id)
),
TP_fast_assign(
__entry->fid = vnode->fid;
__entry->event = event;
__entry->state = vnode->lock_state;
__entry->error = error;
__entry->debug_id = fl ? fl->fl_u.afs.debug_id : 0;
),
TP_printk("%llx:%llx:%x %04x %s s=%s e=%d",
__entry->fid.vid, __entry->fid.vnode, __entry->fid.unique,
__entry->debug_id,
__print_symbolic(__entry->event, afs_flock_events),
__print_symbolic(__entry->state, afs_flock_states),
__entry->error)
);
TRACE_EVENT(afs_flock_op,
TP_PROTO(struct afs_vnode *vnode, struct file_lock *fl,
enum afs_flock_operation op),
TP_ARGS(vnode, fl, op),
TP_STRUCT__entry(
__field_struct(struct afs_fid, fid)
__field(loff_t, from)
__field(loff_t, len)
__field(enum afs_flock_operation, op)
__field(unsigned char, type)
__field(unsigned int, flags)
__field(unsigned int, debug_id)
),
TP_fast_assign(
__entry->fid = vnode->fid;
__entry->from = fl->fl_start;
__entry->len = fl->fl_end - fl->fl_start + 1;
__entry->op = op;
__entry->type = fl->c.flc_type;
__entry->flags = fl->c.flc_flags;
__entry->debug_id = fl->fl_u.afs.debug_id;
),
TP_printk("%llx:%llx:%x %04x %s t=%s R=%llx/%llx f=%x",
__entry->fid.vid, __entry->fid.vnode, __entry->fid.unique,
__entry->debug_id,
__print_symbolic(__entry->op, afs_flock_operations),
__print_symbolic(__entry->type, afs_flock_types),
__entry->from, __entry->len, __entry->flags)
);
TRACE_EVENT(afs_reload_dir,
TP_PROTO(struct afs_vnode *vnode),
TP_ARGS(vnode),
TP_STRUCT__entry(
__field_struct(struct afs_fid, fid)
),
TP_fast_assign(
__entry->fid = vnode->fid;
),
TP_printk("%llx:%llx:%x",
__entry->fid.vid, __entry->fid.vnode, __entry->fid.unique)
);
TRACE_EVENT(afs_silly_rename,
TP_PROTO(struct afs_vnode *vnode, bool done),
TP_ARGS(vnode, done),
TP_STRUCT__entry(
__field_struct(struct afs_fid, fid)
__field(bool, done)
),
TP_fast_assign(
__entry->fid = vnode->fid;
__entry->done = done;
),
TP_printk("%llx:%llx:%x done=%u",
__entry->fid.vid, __entry->fid.vnode, __entry->fid.unique,
__entry->done)
);
TRACE_EVENT(afs_get_tree,
TP_PROTO(struct afs_cell *cell, struct afs_volume *volume),
TP_ARGS(cell, volume),
TP_STRUCT__entry(
__field(u64, vid)
__array(char, cell, 24)
__array(char, volume, 24)
),
TP_fast_assign(
int __len;
__entry->vid = volume->vid;
__len = min_t(int, cell->name_len, 23);
memcpy(__entry->cell, cell->name, __len);
__entry->cell[__len] = 0;
__len = min_t(int, volume->name_len, 23);
memcpy(__entry->volume, volume->name, __len);
__entry->volume[__len] = 0;
),
TP_printk("--- MOUNT %s:%s %llx",
__entry->cell, __entry->volume, __entry->vid)
);
afs: Parse the VolSync record in the reply of a number of RPC ops A number of fileserver RPC operations return a VolSync record as part of their reply that gives some information about the state of the volume being accessed, including: (1) A volume Creation timestamp. For an RW volume, this is the time at which the volume was created; if it changes, the RW volume was presumably restored from a backup and all cached data should be scrubbed as Data Version numbers could regress on the files in the volume. For an RO volume, this is the time it was last snapshotted from the RW volume. It is expected to advance each time this happens; if it regresses, cached data should be scrubbed. (2) A volume Update timestamp (Auristor only). For an RW volume, this is updated any time any change is made to a volume or its contents. If it regresses, all cached data must be scrubbed. For an RO volume, this is a copy of the RW volume's Update timestamp at the point of snapshotting. It can be used as a version number when checking to see if a callback on a RO volume was due to a snapshot. If it regresses, all cached data must be scrubbed. but this is currently not made use of by the in-kernel afs filesystem. Make the afs filesystem use this by: (1) Add an update time field to the afs_volsync struct and use a value of TIME64_MIN in both that and the creation time to indicate that they are unset. (2) Add creation and update time fields to the afs_volume struct and use this to track the two timestamps. (3) Add a volsync_lock mutex to the afs_volume struct to control modification access for when we detect a change in these values. (3) Add a 'pre-op volsync' struct to the afs_operation struct to record the state of the volume tracking before the op. (4) Add a new counter, cb_scrub, to the afs_volume struct to count events that require all data to be scrubbed. A copy is placed in the afs_vnode struct (inode) and if they no longer match, a scrub takes place. (5) When the result of an operation is being parsed, parse the VolSync data too, if it is provided. Note that the two timestamps are handled separately, since they don't work in quite the same way. - If the afs_volume tracking is unset, just set it and do nothing else. - If the result timestamps are the same as the ones in afs_volume, do nothing. - If the timestamps regress, increment cb_scrub if not already done so. - If the creation timestamp on a RW volume changes, increment cb_scrub if not already done so. - If the creation timestamp on a RO volume advances, update the server list and see if the current server has been excluded, if so reissue the op. Once over half of the replication sites have been updated, increment cb_ro_snapshot to indicate updates may be required and switch over to excluding unupdated replication sites. - If the creation timestamp on a Backup volume advances, just increment cb_ro_snapshot to trigger updates. Signed-off-by: David Howells <dhowells@redhat.com> cc: Marc Dionne <marc.dionne@auristor.com> cc: linux-afs@lists.infradead.org
2023-11-05 16:11:07 +00:00
TRACE_EVENT(afs_cb_v_break,
TP_PROTO(afs_volid_t vid, unsigned int cb_v_break,
enum afs_cb_break_reason reason),
TP_ARGS(vid, cb_v_break, reason),
TP_STRUCT__entry(
__field(afs_volid_t, vid)
__field(unsigned int, cb_v_break)
__field(enum afs_cb_break_reason, reason)
),
TP_fast_assign(
__entry->vid = vid;
__entry->cb_v_break = cb_v_break;
__entry->reason = reason;
),
TP_printk("%llx vb=%x %s",
__entry->vid,
__entry->cb_v_break,
__print_symbolic(__entry->reason, afs_cb_break_reasons))
);
TRACE_EVENT(afs_cb_break,
TP_PROTO(struct afs_fid *fid, unsigned int cb_break,
enum afs_cb_break_reason reason, bool skipped),
TP_ARGS(fid, cb_break, reason, skipped),
TP_STRUCT__entry(
__field_struct(struct afs_fid, fid)
__field(unsigned int, cb_break)
__field(enum afs_cb_break_reason, reason)
__field(bool, skipped)
),
TP_fast_assign(
__entry->fid = *fid;
__entry->cb_break = cb_break;
__entry->reason = reason;
__entry->skipped = skipped;
),
TP_printk("%llx:%llx:%x b=%x s=%u %s",
__entry->fid.vid, __entry->fid.vnode, __entry->fid.unique,
__entry->cb_break,
__entry->skipped,
__print_symbolic(__entry->reason, afs_cb_break_reasons))
);
TRACE_EVENT(afs_cb_miss,
TP_PROTO(struct afs_fid *fid, enum afs_cb_break_reason reason),
TP_ARGS(fid, reason),
TP_STRUCT__entry(
__field_struct(struct afs_fid, fid)
__field(enum afs_cb_break_reason, reason)
),
TP_fast_assign(
__entry->fid = *fid;
__entry->reason = reason;
),
TP_printk(" %llx:%llx:%x %s",
__entry->fid.vid, __entry->fid.vnode, __entry->fid.unique,
__print_symbolic(__entry->reason, afs_cb_break_reasons))
);
TRACE_EVENT(afs_server,
TP_PROTO(unsigned int server_debug_id, int ref, int active,
enum afs_server_trace reason),
TP_ARGS(server_debug_id, ref, active, reason),
TP_STRUCT__entry(
__field(unsigned int, server)
__field(int, ref)
__field(int, active)
__field(int, reason)
),
TP_fast_assign(
__entry->server = server_debug_id;
__entry->ref = ref;
__entry->active = active;
__entry->reason = reason;
),
TP_printk("s=%08x %s r=%d a=%d",
__entry->server,
__print_symbolic(__entry->reason, afs_server_traces),
__entry->ref,
__entry->active)
);
TRACE_EVENT(afs_volume,
TP_PROTO(unsigned int debug_id, afs_volid_t vid, int ref,
enum afs_volume_trace reason),
TP_ARGS(debug_id, vid, ref, reason),
TP_STRUCT__entry(
__field(unsigned int, debug_id)
__field(afs_volid_t, vid)
__field(int, ref)
__field(enum afs_volume_trace, reason)
),
TP_fast_assign(
__entry->debug_id = debug_id;
__entry->vid = vid;
__entry->ref = ref;
__entry->reason = reason;
),
TP_printk("V=%08x %s vid=%llx r=%d",
__entry->debug_id,
__print_symbolic(__entry->reason, afs_volume_traces),
__entry->vid,
__entry->ref)
);
TRACE_EVENT(afs_cell,
TP_PROTO(unsigned int cell_debug_id, int ref, int active,
enum afs_cell_trace reason),
TP_ARGS(cell_debug_id, ref, active, reason),
TP_STRUCT__entry(
__field(unsigned int, cell)
__field(int, ref)
__field(int, active)
__field(int, reason)
),
TP_fast_assign(
__entry->cell = cell_debug_id;
__entry->ref = ref;
__entry->active = active;
__entry->reason = reason;
),
TP_printk("L=%08x %s r=%d a=%d",
__entry->cell,
__print_symbolic(__entry->reason, afs_cell_traces),
__entry->ref,
__entry->active)
);
TRACE_EVENT(afs_alist,
TP_PROTO(unsigned int alist_debug_id, int ref, enum afs_alist_trace reason),
TP_ARGS(alist_debug_id, ref, reason),
TP_STRUCT__entry(
__field(unsigned int, alist)
__field(int, ref)
__field(int, active)
__field(int, reason)
),
TP_fast_assign(
__entry->alist = alist_debug_id;
__entry->ref = ref;
__entry->reason = reason;
),
TP_printk("AL=%08x %s r=%d",
__entry->alist,
__print_symbolic(__entry->reason, afs_alist_traces),
__entry->ref)
);
TRACE_EVENT(afs_estate,
TP_PROTO(unsigned int server_debug_id, unsigned int estate_debug_id,
int ref, enum afs_estate_trace reason),
TP_ARGS(server_debug_id, estate_debug_id, ref, reason),
TP_STRUCT__entry(
__field(unsigned int, server)
__field(unsigned int, estate)
__field(int, ref)
__field(int, active)
__field(int, reason)
),
TP_fast_assign(
__entry->server = server_debug_id;
__entry->estate = estate_debug_id;
__entry->ref = ref;
__entry->reason = reason;
),
TP_printk("ES=%08x[%x] %s r=%d",
__entry->server,
__entry->estate,
__print_symbolic(__entry->reason, afs_estate_traces),
__entry->ref)
);
TRACE_EVENT(afs_fs_probe,
TP_PROTO(struct afs_server *server, bool tx, struct afs_endpoint_state *estate,
unsigned int addr_index, int error, s32 abort_code, unsigned int rtt_us),
TP_ARGS(server, tx, estate, addr_index, error, abort_code, rtt_us),
TP_STRUCT__entry(
__field(unsigned int, server)
__field(unsigned int, estate)
__field(bool, tx)
__field(u16, addr_index)
__field(short, error)
__field(s32, abort_code)
__field(unsigned int, rtt_us)
__field_struct(struct sockaddr_rxrpc, srx)
),
TP_fast_assign(
struct afs_addr_list *alist = estate->addresses;
__entry->server = server->debug_id;
__entry->estate = estate->probe_seq;
__entry->tx = tx;
__entry->addr_index = addr_index;
__entry->error = error;
__entry->abort_code = abort_code;
__entry->rtt_us = rtt_us;
memcpy(&__entry->srx, rxrpc_kernel_remote_srx(alist->addrs[addr_index].peer),
sizeof(__entry->srx));
),
TP_printk("s=%08x %s pq=%x ax=%u e=%d ac=%d rtt=%d %pISpc",
__entry->server, __entry->tx ? "tx" : "rx", __entry->estate,
__entry->addr_index, __entry->error, __entry->abort_code, __entry->rtt_us,
&__entry->srx.transport)
);
TRACE_EVENT(afs_vl_probe,
TP_PROTO(struct afs_vlserver *server, bool tx, struct afs_addr_list *alist,
unsigned int addr_index, int error, s32 abort_code, unsigned int rtt_us),
TP_ARGS(server, tx, alist, addr_index, error, abort_code, rtt_us),
TP_STRUCT__entry(
__field(unsigned int, server)
__field(bool, tx)
__field(unsigned short, flags)
__field(u16, addr_index)
__field(short, error)
__field(s32, abort_code)
__field(unsigned int, rtt_us)
__field_struct(struct sockaddr_rxrpc, srx)
),
TP_fast_assign(
__entry->server = server->debug_id;
__entry->tx = tx;
__entry->addr_index = addr_index;
__entry->error = error;
__entry->abort_code = abort_code;
__entry->rtt_us = rtt_us;
memcpy(&__entry->srx, rxrpc_kernel_remote_srx(alist->addrs[addr_index].peer),
sizeof(__entry->srx));
),
TP_printk("vl=%08x %s ax=%u e=%d ac=%d rtt=%d %pISpc",
__entry->server, __entry->tx ? "tx" : "rx", __entry->addr_index,
__entry->error, __entry->abort_code, __entry->rtt_us,
&__entry->srx.transport)
);
afs: Fix fileserver rotation Fix the fileserver rotation so that it doesn't use RTT as the basis for deciding which server and address to use as this doesn't necessarily give a good indication of the best path. Instead, use the configurable preference list in conjunction with whatever probes have succeeded at the time of looking. To this end, make the following changes: (1) Keep an array of "server states" to track what addresses we've tried on each server and move the waitqueue entries there that we'll need for probing. (2) Each afs_server_state struct is made to pin the corresponding server's endpoint state rather than the afs_operation struct carrying a pin on the server we're currently looking at. (3) Drop the server list preference; we now always rescan the server list. (4) afs_wait_for_probes() now uses the server state list to guide it in what it waits for (and to provide the waitqueue entries) and returns an indication of whether we'd got a response, run out of responsive addresses or the endpoint state had been superseded and we need to restart the iteration. (5) Call afs_get_address_preferences*() occasionally to refresh the preference values. (6) When picking a server, scan the addresses of the servers for which we have as-yet untested communications, looking for the highest priority one and use that instead of trying all the addresses for a particular server in ascending-RTT order. (7) When a Busy or Offline state is seen across all available servers, do a short sleep. (8) If we detect that we accessed a future RO volume version whilst it is undergoing replication, reissue the op against the older version until at least half of the servers are replicated. (9) Whilst RO replication is ongoing, increase the frequency of Volume Location server checks for that volume to every ten minutes instead of hourly. Also add a tracepoint to track progress through the rotation algorithm. Signed-off-by: David Howells <dhowells@redhat.com> cc: Marc Dionne <marc.dionne@auristor.com> cc: linux-afs@lists.infradead.org
2023-10-18 09:24:01 +01:00
TRACE_EVENT(afs_rotate,
TP_PROTO(struct afs_operation *op, enum afs_rotate_trace reason, unsigned int extra),
TP_ARGS(op, reason, extra),
TP_STRUCT__entry(
__field(unsigned int, op)
__field(unsigned int, flags)
__field(unsigned int, extra)
__field(unsigned short, iteration)
__field(short, server_index)
__field(short, addr_index)
__field(enum afs_rotate_trace, reason)
),
TP_fast_assign(
__entry->op = op->debug_id;
__entry->flags = op->flags;
__entry->iteration = op->nr_iterations;
__entry->server_index = op->server_index;
__entry->addr_index = op->addr_index;
__entry->reason = reason;
__entry->extra = extra;
),
TP_printk("OP=%08x it=%02x %s fl=%x sx=%d ax=%d ext=%d",
__entry->op,
__entry->iteration,
__print_symbolic(__entry->reason, afs_rotate_traces),
__entry->flags,
__entry->server_index,
__entry->addr_index,
__entry->extra)
);
TRACE_EVENT(afs_make_call,
TP_PROTO(struct afs_call *call),
TP_ARGS(call),
TP_STRUCT__entry(
__field(unsigned int, call)
__field(bool, is_vl)
__field(enum afs_fs_operation, op)
__field_struct(struct afs_fid, fid)
__field_struct(struct sockaddr_rxrpc, srx)
),
TP_fast_assign(
__entry->call = call->debug_id;
__entry->op = call->operation_ID;
__entry->fid = call->fid;
memcpy(&__entry->srx, rxrpc_kernel_remote_srx(call->peer),
sizeof(__entry->srx));
__entry->srx.srx_service = call->service_id;
__entry->is_vl = (__entry->srx.srx_service == VL_SERVICE ||
__entry->srx.srx_service == YFS_VL_SERVICE);
),
TP_printk("c=%08x %pISpc+%u %s %llx:%llx:%x",
__entry->call,
&__entry->srx.transport,
__entry->srx.srx_service,
__entry->is_vl ?
__print_symbolic(__entry->op, afs_vl_operations) :
__print_symbolic(__entry->op, afs_fs_operations),
__entry->fid.vid,
__entry->fid.vnode,
__entry->fid.unique)
);
TRACE_EVENT(afs_read_recv,
TP_PROTO(const struct afs_operation *op, const struct afs_call *call),
TP_ARGS(op, call),
TP_STRUCT__entry(
__field(unsigned int, rreq)
__field(unsigned int, sreq)
__field(unsigned int, op)
__field(unsigned int, op_flags)
__field(unsigned int, call)
__field(enum afs_call_state, call_state)
),
TP_fast_assign(
__entry->op = op->debug_id;
__entry->sreq = op->fetch.subreq->debug_index;
__entry->rreq = op->fetch.subreq->rreq->debug_id;
__entry->op_flags = op->flags;
__entry->call = call->debug_id;
__entry->call_state = call->state;
),
TP_printk("R=%08x[%x] OP=%08x c=%08x cs=%x of=%x",
__entry->rreq, __entry->sreq,
__entry->op,
__entry->call, __entry->call_state,
__entry->op_flags)
);
#endif /* _TRACE_AFS_H */
/* This part must be outside protection */
#include <trace/define_trace.h>