linux/arch/powerpc/platforms/pseries/rtas-fadump.c

// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Firmware-Assisted Dump support on POWERVM platform.
*
* Copyright 2011, Mahesh Salgaonkar, IBM Corporation.
* Copyright 2019, Hari Bathini, IBM Corporation.
*/
#define pr_fmt(fmt) "rtas fadump: " fmt
#include <linux/string.h>
#include <linux/memblock.h>
#include <linux/delay.h>
#include <linux/seq_file.h>
#include <linux/crash_dump.h>
#include <linux/of.h>
#include <linux/of_fdt.h>
#include <asm/page.h>
#include <asm/rtas.h>
#include <asm/setup.h>
#include <asm/fadump.h>
#include <asm/fadump-internal.h>
#include "rtas-fadump.h"
static struct rtas_fadump_mem_struct fdm;
static const struct rtas_fadump_mem_struct *fdm_active;
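/*
 * The fadump crash info header is kept immediately after the relocated boot
 * memory, i.e. at boot_mem_dest_addr + boot_memory_size.
 */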
static void rtas_fadump_update_config(struct fw_dump *fadump_conf,
const struct rtas_fadump_mem_struct *fdm)
{
fadump_conf->fadumphdr_addr = (fadump_conf->boot_mem_dest_addr +
fadump_conf->boot_memory_size);
}
/*
 * This function is called in the capture kernel to get the configuration
 * details set up in the first kernel and passed to the firmware.
 */
static void __init rtas_fadump_get_config(struct fw_dump *fadump_conf,
const struct rtas_fadump_mem_struct *fdm)
{
unsigned long base, size, last_end, hole_size;
last_end = 0;
hole_size = 0;
fadump_conf->boot_memory_size = 0;
fadump_conf->boot_mem_regs_cnt = 0;
pr_debug("Boot memory regions:\n");
for (int i = 0; i < be16_to_cpu(fdm->header.dump_num_sections); i++) {
int type = be16_to_cpu(fdm->rgn[i].source_data_type);
u64 addr;
switch (type) {
case RTAS_FADUMP_CPU_STATE_DATA:
addr = be64_to_cpu(fdm->rgn[i].destination_address);
fadump_conf->cpu_state_dest_vaddr = (u64)__va(addr);
/*
* Start address of reserve dump area (permanent reservation) for
* re-registering FADump after dump capture.
*/
fadump_conf->reserve_dump_area_start = addr;
break;
case RTAS_FADUMP_HPTE_REGION:
/* Not processed currently. */
break;
case RTAS_FADUMP_REAL_MODE_REGION:
base = be64_to_cpu(fdm->rgn[i].source_address);
size = be64_to_cpu(fdm->rgn[i].source_len);
pr_debug("\t[%03d] base: 0x%lx, size: 0x%lx\n", i, base, size);
if (!base) {
fadump_conf->boot_mem_dest_addr =
be64_to_cpu(fdm->rgn[i].destination_address);
}
fadump_conf->boot_mem_addr[fadump_conf->boot_mem_regs_cnt] = base;
fadump_conf->boot_mem_sz[fadump_conf->boot_mem_regs_cnt] = size;
fadump_conf->boot_memory_size += size;
hole_size += (base - last_end);
last_end = base + size;
fadump_conf->boot_mem_regs_cnt++;
break;
case RTAS_FADUMP_PARAM_AREA:
fadump_conf->param_area = be64_to_cpu(fdm->rgn[i].destination_address);
break;
default:
pr_warn("Section type %d unsupported on this kernel. Ignoring!\n", type);
break;
}
}
fadump_conf->boot_mem_top = fadump_conf->boot_memory_size + hole_size;
rtas_fadump_update_config(fadump_conf, fdm);
}
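/*
 * Populate the dump memory structure (fdm) with the sections to register
 * with firmware: CPU state data, HPTE region, the boot memory regions and,
 * if set up, the additional parameters area. Returns the address just past
 * the destination of the last boot memory region.
 */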
static u64 rtas_fadump_init_mem_struct(struct fw_dump *fadump_conf)
{
u64 addr = fadump_conf->reserve_dump_area_start;
u16 sec_cnt = 0;
memset(&fdm, 0, sizeof(struct rtas_fadump_mem_struct));
addr = addr & PAGE_MASK;
fdm.header.dump_format_version = cpu_to_be32(0x00000001);
fdm.header.dump_status_flag = 0;
fdm.header.offset_first_dump_section =
cpu_to_be32((u32)offsetof(struct rtas_fadump_mem_struct, rgn));
/*
 * Fields for the disk dump option.
 * We are not using the disk dump option, hence set these fields to 0.
 */
fdm.header.dd_block_size = 0;
fdm.header.dd_block_offset = 0;
fdm.header.dd_num_blocks = 0;
fdm.header.dd_offset_disk_path = 0;
	/* Set to 0 to disable automatic dump-reboot. */
fdm.header.max_time_auto = 0;
/* Kernel dump sections */
/* cpu state data section. */
fdm.rgn[sec_cnt].request_flag = cpu_to_be32(RTAS_FADUMP_REQUEST_FLAG);
fdm.rgn[sec_cnt].source_data_type = cpu_to_be16(RTAS_FADUMP_CPU_STATE_DATA);
fdm.rgn[sec_cnt].source_address = 0;
fdm.rgn[sec_cnt].source_len = cpu_to_be64(fadump_conf->cpu_state_data_size);
fdm.rgn[sec_cnt].destination_address = cpu_to_be64(addr);
addr += fadump_conf->cpu_state_data_size;
sec_cnt++;
/* hpte region section */
fdm.rgn[sec_cnt].request_flag = cpu_to_be32(RTAS_FADUMP_REQUEST_FLAG);
fdm.rgn[sec_cnt].source_data_type = cpu_to_be16(RTAS_FADUMP_HPTE_REGION);
fdm.rgn[sec_cnt].source_address = 0;
fdm.rgn[sec_cnt].source_len = cpu_to_be64(fadump_conf->hpte_region_size);
fdm.rgn[sec_cnt].destination_address = cpu_to_be64(addr);
addr += fadump_conf->hpte_region_size;
sec_cnt++;
/*
* Align boot memory area destination address to page boundary to
* be able to mmap read this area in the vmcore.
*/
addr = PAGE_ALIGN(addr);
/* First boot memory region destination address */
fadump_conf->boot_mem_dest_addr = addr;
for (int i = 0; i < fadump_conf->boot_mem_regs_cnt; i++) {
/* Boot memory regions */
fdm.rgn[sec_cnt].request_flag = cpu_to_be32(RTAS_FADUMP_REQUEST_FLAG);
fdm.rgn[sec_cnt].source_data_type = cpu_to_be16(RTAS_FADUMP_REAL_MODE_REGION);
fdm.rgn[sec_cnt].source_address = cpu_to_be64(fadump_conf->boot_mem_addr[i]);
fdm.rgn[sec_cnt].source_len = cpu_to_be64(fadump_conf->boot_mem_sz[i]);
fdm.rgn[sec_cnt].destination_address = cpu_to_be64(addr);
addr += fadump_conf->boot_mem_sz[i];
sec_cnt++;
}
/* Parameters area */
if (fadump_conf->param_area) {
fdm.rgn[sec_cnt].request_flag = cpu_to_be32(RTAS_FADUMP_REQUEST_FLAG);
fdm.rgn[sec_cnt].source_data_type = cpu_to_be16(RTAS_FADUMP_PARAM_AREA);
fdm.rgn[sec_cnt].source_address = cpu_to_be64(fadump_conf->param_area);
fdm.rgn[sec_cnt].source_len = cpu_to_be64(COMMAND_LINE_SIZE);
fdm.rgn[sec_cnt].destination_address = cpu_to_be64(fadump_conf->param_area);
sec_cnt++;
}
fdm.header.dump_num_sections = cpu_to_be16(sec_cnt);
rtas_fadump_update_config(fadump_conf, &fdm);
return addr;
}
static u64 rtas_fadump_get_bootmem_min(void)
{
return RTAS_FADUMP_MIN_BOOT_MEM;
}
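/*
 * Register the dump memory structure with firmware through the
 * ibm,configure-kernel-dump RTAS call, retrying for as long as the call
 * returns a busy/extended-delay status.
 */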
static int rtas_fadump_register(struct fw_dump *fadump_conf)
{
unsigned int wait_time, fdm_size;
int rc, err = -EIO;
/*
* Platform requires the exact size of the Dump Memory Structure.
* Avoid including any unused rgns in the calculation, as this
* could result in a parameter error (-3) from the platform.
*/
fdm_size = sizeof(struct rtas_fadump_section_header);
fdm_size += be16_to_cpu(fdm.header.dump_num_sections) * sizeof(struct rtas_fadump_section);
/* TODO: Add upper time limit for the delay */
do {
rc = rtas_call(fadump_conf->ibm_configure_kernel_dump, 3, 1,
NULL, FADUMP_REGISTER, &fdm, fdm_size);
wait_time = rtas_busy_delay_time(rc);
if (wait_time)
mdelay(wait_time);
} while (wait_time);
switch (rc) {
case 0:
pr_info("Registration is successful!\n");
fadump_conf->dump_registered = 1;
err = 0;
break;
case -1:
pr_err("Failed to register. Hardware Error(%d).\n", rc);
break;
case -3:
if (!is_fadump_reserved_mem_contiguous())
pr_err("Can't have holes in reserved memory area.\n");
pr_err("Failed to register. Parameter Error(%d).\n", rc);
err = -EINVAL;
break;
case -9:
pr_err("Already registered!\n");
fadump_conf->dump_registered = 1;
err = -EEXIST;
break;
default:
pr_err("Failed to register. Unknown Error(%d).\n", rc);
break;
}
return err;
}
static int rtas_fadump_unregister(struct fw_dump *fadump_conf)
{
unsigned int wait_time;
int rc;
/* TODO: Add upper time limit for the delay */
do {
rc = rtas_call(fadump_conf->ibm_configure_kernel_dump, 3, 1,
NULL, FADUMP_UNREGISTER, &fdm,
sizeof(struct rtas_fadump_mem_struct));
wait_time = rtas_busy_delay_time(rc);
if (wait_time)
mdelay(wait_time);
} while (wait_time);
if (rc) {
pr_err("Failed to un-register - unexpected error(%d).\n", rc);
return -EIO;
}
fadump_conf->dump_registered = 0;
return 0;
}
static int rtas_fadump_invalidate(struct fw_dump *fadump_conf)
{
unsigned int wait_time;
int rc;
/* TODO: Add upper time limit for the delay */
do {
rc = rtas_call(fadump_conf->ibm_configure_kernel_dump, 3, 1,
NULL, FADUMP_INVALIDATE, fdm_active,
sizeof(struct rtas_fadump_mem_struct));
wait_time = rtas_busy_delay_time(rc);
if (wait_time)
mdelay(wait_time);
} while (wait_time);
if (rc) {
pr_err("Failed to invalidate - unexpected error (%d).\n", rc);
return -EIO;
}
fadump_conf->dump_active = 0;
fdm_active = NULL;
return 0;
}
#define RTAS_FADUMP_GPR_MASK 0xffffff0000000000
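/*
 * Extract the GPR number from an 8-byte ASCII register identifier such as
 * "GPR05". Returns the GPR index (0-31) on success, a negative value
 * otherwise.
 */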
static inline int rtas_fadump_gpr_index(u64 id)
{
char str[3];
int i = -1;
if ((id & RTAS_FADUMP_GPR_MASK) == fadump_str_to_u64("GPR")) {
/* get the digits at the end */
id &= ~RTAS_FADUMP_GPR_MASK;
id >>= 24;
str[2] = '\0';
str[1] = id & 0xff;
str[0] = (id >> 8) & 0xff;
if (kstrtoint(str, 10, &i))
i = -EINVAL;
if (i > 31)
i = -1;
}
return i;
}
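/*
 * Store a (register id, value) pair from the CPU state dump into the
 * corresponding field of pt_regs.
 */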
static void __init rtas_fadump_set_regval(struct pt_regs *regs, u64 reg_id, u64 reg_val)
{
int i;
i = rtas_fadump_gpr_index(reg_id);
if (i >= 0)
regs->gpr[i] = (unsigned long)reg_val;
else if (reg_id == fadump_str_to_u64("NIA"))
regs->nip = (unsigned long)reg_val;
else if (reg_id == fadump_str_to_u64("MSR"))
regs->msr = (unsigned long)reg_val;
else if (reg_id == fadump_str_to_u64("CTR"))
regs->ctr = (unsigned long)reg_val;
else if (reg_id == fadump_str_to_u64("LR"))
regs->link = (unsigned long)reg_val;
else if (reg_id == fadump_str_to_u64("XER"))
regs->xer = (unsigned long)reg_val;
else if (reg_id == fadump_str_to_u64("CR"))
regs->ccr = (unsigned long)reg_val;
else if (reg_id == fadump_str_to_u64("DAR"))
regs->dar = (unsigned long)reg_val;
else if (reg_id == fadump_str_to_u64("DSISR"))
regs->dsisr = (unsigned long)reg_val;
}
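/*
 * Walk one CPU's register entries, filling pt_regs, until the "CPUEND"
 * marker is seen. Returns a pointer to the entry following "CPUEND".
 */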
static struct rtas_fadump_reg_entry* __init
rtas_fadump_read_regs(struct rtas_fadump_reg_entry *reg_entry,
struct pt_regs *regs)
{
memset(regs, 0, sizeof(struct pt_regs));
while (be64_to_cpu(reg_entry->reg_id) != fadump_str_to_u64("CPUEND")) {
rtas_fadump_set_regval(regs, be64_to_cpu(reg_entry->reg_id),
be64_to_cpu(reg_entry->reg_value));
reg_entry++;
}
reg_entry++;
return reg_entry;
}
/*
 * Read the CPU state dump data and convert it into ELF notes.
 * The CPU dump starts with the magic number "REGSAVE". NumCpusOffset should
 * be used to access the data, to allow for additional fields to be added
 * without affecting compatibility. Each list of registers for a CPU starts
 * with "CPUSTRT" and ends with "CPUEND". Each register entry is 16 bytes:
 * an 8-byte ASCII identifier followed by an 8-byte register value. The
 * register entries with identifiers "CPUSTRT" and "CPUEND" carry a 4-byte
 * CPU id as part of the register value. For more details refer to the PAPR
 * document.
 *
 * For the crashing CPU alone, the CPU dump data is ignored and the exact
 * state is taken from the fadump crash info structure populated by the
 * first kernel at the time of crash.
 */
static int __init rtas_fadump_build_cpu_notes(struct fw_dump *fadump_conf)
{
struct rtas_fadump_reg_save_area_header *reg_header;
struct fadump_crash_info_header *fdh = NULL;
struct rtas_fadump_reg_entry *reg_entry;
u32 num_cpus, *note_buf;
int i, rc = 0, cpu = 0;
struct pt_regs regs;
void *vaddr;
vaddr = (void *)fadump_conf->cpu_state_dest_vaddr;
reg_header = vaddr;
if (be64_to_cpu(reg_header->magic_number) !=
fadump_str_to_u64("REGSAVE")) {
pr_err("Unable to read register save area.\n");
return -ENOENT;
}
pr_debug("--------CPU State Data------------\n");
pr_debug("Magic Number: %llx\n", be64_to_cpu(reg_header->magic_number));
pr_debug("NumCpuOffset: %x\n", be32_to_cpu(reg_header->num_cpu_offset));
vaddr += be32_to_cpu(reg_header->num_cpu_offset);
num_cpus = be32_to_cpu(*((__be32 *)(vaddr)));
pr_debug("NumCpus : %u\n", num_cpus);
vaddr += sizeof(u32);
reg_entry = (struct rtas_fadump_reg_entry *)vaddr;
rc = fadump_setup_cpu_notes_buf(num_cpus);
if (rc != 0)
return rc;
note_buf = (u32 *)fadump_conf->cpu_notes_buf_vaddr;
if (fadump_conf->fadumphdr_addr)
fdh = __va(fadump_conf->fadumphdr_addr);
for (i = 0; i < num_cpus; i++) {
if (be64_to_cpu(reg_entry->reg_id) !=
fadump_str_to_u64("CPUSTRT")) {
pr_err("Unable to read CPU state data\n");
rc = -ENOENT;
goto error_out;
}
		/* Lower 4 bytes of reg_value contain the logical CPU id */
cpu = (be64_to_cpu(reg_entry->reg_value) &
RTAS_FADUMP_CPU_ID_MASK);
if (fdh && !cpumask_test_cpu(cpu, &fdh->cpu_mask)) {
RTAS_FADUMP_SKIP_TO_NEXT_CPU(reg_entry);
continue;
}
pr_debug("Reading register data for cpu %d...\n", cpu);
if (fdh && fdh->crashing_cpu == cpu) {
regs = fdh->regs;
note_buf = fadump_regs_to_elf_notes(note_buf, &regs);
RTAS_FADUMP_SKIP_TO_NEXT_CPU(reg_entry);
} else {
reg_entry++;
reg_entry = rtas_fadump_read_regs(reg_entry, &regs);
note_buf = fadump_regs_to_elf_notes(note_buf, &regs);
}
}
final_note(note_buf);
pr_debug("Updating elfcore header (%llx) with cpu notes\n", fadump_conf->elfcorehdr_addr);
fadump_update_elfcore_header((char *)fadump_conf->elfcorehdr_addr);
return 0;
error_out:
fadump_free_cpu_notes_buf();
return rc;
}
/*
* Validate and process the dump data stored by the firmware, and update
* the CPU notes of elfcorehdr.
*/
static int __init rtas_fadump_process(struct fw_dump *fadump_conf)
{
if (!fdm_active || !fadump_conf->fadumphdr_addr)
return -EINVAL;
/* Check if the dump data is valid. */
for (int i = 0; i < be16_to_cpu(fdm_active->header.dump_num_sections); i++) {
int type = be16_to_cpu(fdm_active->rgn[i].source_data_type);
int rc = 0;
switch (type) {
case RTAS_FADUMP_CPU_STATE_DATA:
case RTAS_FADUMP_HPTE_REGION:
case RTAS_FADUMP_REAL_MODE_REGION:
if (fdm_active->rgn[i].error_flags != 0) {
pr_err("Dump taken by platform is not valid (%d)\n", i);
rc = -EINVAL;
}
if (fdm_active->rgn[i].bytes_dumped != fdm_active->rgn[i].source_len) {
pr_err("Dump taken by platform is incomplete (%d)\n", i);
rc = -EINVAL;
}
if (rc) {
pr_warn("Region type: %u src addr: 0x%llx dest addr: 0x%llx\n",
be16_to_cpu(fdm_active->rgn[i].source_data_type),
be64_to_cpu(fdm_active->rgn[i].source_address),
be64_to_cpu(fdm_active->rgn[i].destination_address));
return rc;
}
break;
case RTAS_FADUMP_PARAM_AREA:
if (fdm_active->rgn[i].bytes_dumped != fdm_active->rgn[i].source_len ||
fdm_active->rgn[i].error_flags != 0) {
pr_warn("Failed to process additional parameters! Proceeding anyway..\n");
fadump_conf->param_area = 0;
}
break;
default:
/*
* If the first/crashed kernel added a new region type that the
* second/fadump kernel doesn't recognize, skip it and process
* assuming backward compatibility.
*/
pr_warn("Unknown region found: type: %u src addr: 0x%llx dest addr: 0x%llx\n",
be16_to_cpu(fdm_active->rgn[i].source_data_type),
be64_to_cpu(fdm_active->rgn[i].source_address),
be64_to_cpu(fdm_active->rgn[i].destination_address));
break;
}
}
return rtas_fadump_build_cpu_notes(fadump_conf);
}
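/*
 * Show the dump sections via seq_file, using the active structure if a dump
 * is present and the one set up for registration otherwise.
 */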
static void rtas_fadump_region_show(struct fw_dump *fadump_conf,
struct seq_file *m)
{
const struct rtas_fadump_mem_struct *fdm_ptr;
if (fdm_active)
fdm_ptr = fdm_active;
else
fdm_ptr = &fdm;
for (int i = 0; i < be16_to_cpu(fdm_ptr->header.dump_num_sections); i++) {
int type = be16_to_cpu(fdm_ptr->rgn[i].source_data_type);
switch (type) {
case RTAS_FADUMP_CPU_STATE_DATA:
seq_printf(m, "CPU :[%#016llx-%#016llx] %#llx bytes, Dumped: %#llx\n",
be64_to_cpu(fdm_ptr->rgn[i].destination_address),
be64_to_cpu(fdm_ptr->rgn[i].destination_address) +
be64_to_cpu(fdm_ptr->rgn[i].source_len) - 1,
be64_to_cpu(fdm_ptr->rgn[i].source_len),
be64_to_cpu(fdm_ptr->rgn[i].bytes_dumped));
break;
case RTAS_FADUMP_HPTE_REGION:
seq_printf(m, "HPTE:[%#016llx-%#016llx] %#llx bytes, Dumped: %#llx\n",
be64_to_cpu(fdm_ptr->rgn[i].destination_address),
be64_to_cpu(fdm_ptr->rgn[i].destination_address) +
be64_to_cpu(fdm_ptr->rgn[i].source_len) - 1,
be64_to_cpu(fdm_ptr->rgn[i].source_len),
be64_to_cpu(fdm_ptr->rgn[i].bytes_dumped));
break;
case RTAS_FADUMP_REAL_MODE_REGION:
seq_printf(m, "DUMP: Src: %#016llx, Dest: %#016llx, ",
be64_to_cpu(fdm_ptr->rgn[i].source_address),
be64_to_cpu(fdm_ptr->rgn[i].destination_address));
seq_printf(m, "Size: %#llx, Dumped: %#llx bytes\n",
be64_to_cpu(fdm_ptr->rgn[i].source_len),
be64_to_cpu(fdm_ptr->rgn[i].bytes_dumped));
break;
case RTAS_FADUMP_PARAM_AREA:
seq_printf(m, "\n[%#016llx-%#016llx]: cmdline append: '%s'\n",
be64_to_cpu(fdm_ptr->rgn[i].destination_address),
be64_to_cpu(fdm_ptr->rgn[i].destination_address) +
be64_to_cpu(fdm_ptr->rgn[i].source_len) - 1,
(char *)__va(be64_to_cpu(fdm_ptr->rgn[i].destination_address)));
break;
default:
seq_printf(m, "Unknown region type %d : Src: %#016llx, Dest: %#016llx, ",
type, be64_to_cpu(fdm_ptr->rgn[i].source_address),
be64_to_cpu(fdm_ptr->rgn[i].destination_address));
break;
}
}
/* Dump is active. Show preserved area start address. */
if (fdm_active) {
seq_printf(m, "\nMemory above %#016llx is reserved for saving crash dump\n",
fadump_conf->boot_mem_top);
}
}
static void rtas_fadump_trigger(struct fadump_crash_info_header *fdh,
const char *msg)
{
/* Call ibm,os-term rtas call to trigger firmware assisted dump */
rtas_os_term((char *)msg);
}
/* FADUMP_MAX_MEM_REGS or lower */
static int rtas_fadump_max_boot_mem_rgns(void)
{
	/*
	 * Version 1 of the Kernel Assisted Dump Memory Structure (PAPR) supports
	 * 10 sections. With one section each taken for CPU state data and the
	 * HPTE region, 8 sections can be used for boot memory regions.
	 *
	 * If new region types are defined, the maximum number of boot memory
	 * regions decreases proportionally.
	 */
return RTAS_FADUMP_MAX_BOOT_MEM_REGS;
}
static struct fadump_ops rtas_fadump_ops = {
.fadump_init_mem_struct = rtas_fadump_init_mem_struct,
.fadump_get_bootmem_min = rtas_fadump_get_bootmem_min,
.fadump_register = rtas_fadump_register,
.fadump_unregister = rtas_fadump_unregister,
.fadump_invalidate = rtas_fadump_invalidate,
.fadump_process = rtas_fadump_process,
.fadump_region_show = rtas_fadump_region_show,
.fadump_trigger = rtas_fadump_trigger,
.fadump_max_boot_mem_rgns = rtas_fadump_max_boot_mem_rgns,
};
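/*
 * Scan the flat device tree node for RTAS fadump support, an active dump
 * from the previous boot, and the firmware-advertised section sizes.
 */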
void __init rtas_fadump_dt_scan(struct fw_dump *fadump_conf, u64 node)
{
int i, size, num_sections;
const __be32 *sections;
const __be32 *token;
	/*
	 * Check if firmware-assisted dump is supported. If yes, check
	 * whether a dump was initiated on the last reboot.
	 */
token = of_get_flat_dt_prop(node, "ibm,configure-kernel-dump", NULL);
if (!token)
return;
fadump_conf->ibm_configure_kernel_dump = be32_to_cpu(*token);
fadump_conf->ops = &rtas_fadump_ops;
fadump_conf->fadump_supported = 1;
fadump_conf->param_area_supported = 1;
/* Firmware supports 64-bit value for size, align it to pagesize. */
fadump_conf->max_copy_size = ALIGN_DOWN(U64_MAX, PAGE_SIZE);
/*
* The 'ibm,kernel-dump' rtas node is present only if there is
* dump data waiting for us.
*/
fdm_active = of_get_flat_dt_prop(node, "ibm,kernel-dump", NULL);
if (fdm_active) {
pr_info("Firmware-assisted dump is active.\n");
fadump_conf->dump_active = 1;
rtas_fadump_get_config(fadump_conf, (void *)__pa(fdm_active));
}
	/*
	 * Get the sizes required to store dump data for the firmware-provided
	 * dump sections.
	 * For each supported dump section type, there is a 32-bit cell defining
	 * the ID of the section, followed by two 32-bit cells giving the size
	 * of the section in bytes.
	 */
sections = of_get_flat_dt_prop(node, "ibm,configure-kernel-dump-sizes",
&size);
if (!sections)
return;
num_sections = size / (3 * sizeof(u32));
for (i = 0; i < num_sections; i++, sections += 3) {
u32 type = (u32)of_read_number(sections, 1);
switch (type) {
case RTAS_FADUMP_CPU_STATE_DATA:
fadump_conf->cpu_state_data_size =
of_read_ulong(&sections[1], 2);
break;
case RTAS_FADUMP_HPTE_REGION:
fadump_conf->hpte_region_size =
of_read_ulong(&sections[1], 2);
break;
}
}
}