// SPDX-License-Identifier: GPL-2.0-only
/*
 * Stand-alone page-table allocator for hyp stage-1 and guest stage-2.
 * No bombay mix was harmed in the writing of this file.
 *
 * Copyright (C) 2020 Google LLC
 * Author: Will Deacon <will@kernel.org>
 */

#include <linux/bitfield.h>
#include <asm/kvm_pgtable.h>
#include <asm/stage2_pgtable.h>

struct kvm_pgtable_walk_data {
	struct kvm_pgtable_walker	*walker;

	const u64			start;
	u64				addr;
	const u64			end;
};

static bool kvm_pgtable_walk_skip_bbm_tlbi(const struct kvm_pgtable_visit_ctx *ctx)
{
	return unlikely(ctx->flags & KVM_PGTABLE_WALK_SKIP_BBM_TLBI);
}

static bool kvm_pgtable_walk_skip_cmo(const struct kvm_pgtable_visit_ctx *ctx)
{
	return unlikely(ctx->flags & KVM_PGTABLE_WALK_SKIP_CMO);
}

static bool kvm_block_mapping_supported(const struct kvm_pgtable_visit_ctx *ctx, u64 phys)
{
	u64 granule = kvm_granule_size(ctx->level);

	if (!kvm_level_supports_block_mapping(ctx->level))
		return false;

	if (granule > (ctx->end - ctx->addr))
		return false;

	if (!IS_ALIGNED(phys, granule))
		return false;

	return IS_ALIGNED(ctx->addr, granule);
}

static u32 kvm_pgtable_idx(struct kvm_pgtable_walk_data *data, s8 level)
{
	u64 shift = kvm_granule_shift(level);
	u64 mask = BIT(PAGE_SHIFT - 3) - 1;

	return (data->addr >> shift) & mask;
}
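
/*
 * Illustration (a sketch, assuming 4KB pages, i.e. PAGE_SHIFT == 12): each
 * table then holds BIT(PAGE_SHIFT - 3) == 512 entries, so the mask is 0x1ff.
 * For the last level the granule shift equals PAGE_SHIFT, which reduces the
 * index to bits [20:12] of the input address:
 *
 *	idx = (data->addr >> 12) & 0x1ff;
 */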

static u32 kvm_pgd_page_idx(struct kvm_pgtable *pgt, u64 addr)
{
	u64 shift = kvm_granule_shift(pgt->start_level - 1); /* May underflow */
	u64 mask = BIT(pgt->ia_bits) - 1;

	return (addr & mask) >> shift;
}

static u32 kvm_pgd_pages(u32 ia_bits, s8 start_level)
{
	struct kvm_pgtable pgt = {
		.ia_bits	= ia_bits,
		.start_level	= start_level,
	};

	return kvm_pgd_page_idx(&pgt, -1ULL) + 1;
}
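
/*
 * Worked example (a sketch, assuming 4KB pages): for ia_bits == 40 and
 * start_level == 1, kvm_granule_shift(0) == 39, so the index of the highest
 * address (-1ULL) is ((1ULL << 40) - 1) >> 39 == 1 and kvm_pgd_pages()
 * returns 2, i.e. two concatenated pages at the start level.
 */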

static bool kvm_pte_table(kvm_pte_t pte, s8 level)
{
	if (level == KVM_PGTABLE_LAST_LEVEL)
		return false;

	if (!kvm_pte_valid(pte))
		return false;

	return FIELD_GET(KVM_PTE_TYPE, pte) == KVM_PTE_TYPE_TABLE;
}

static kvm_pte_t *kvm_pte_follow(kvm_pte_t pte, struct kvm_pgtable_mm_ops *mm_ops)
{
	return mm_ops->phys_to_virt(kvm_pte_to_phys(pte));
}

static void kvm_clear_pte(kvm_pte_t *ptep)
{
	WRITE_ONCE(*ptep, 0);
}

static kvm_pte_t kvm_init_table_pte(kvm_pte_t *childp, struct kvm_pgtable_mm_ops *mm_ops)
{
	kvm_pte_t pte = kvm_phys_to_pte(mm_ops->virt_to_phys(childp));

	pte |= FIELD_PREP(KVM_PTE_TYPE, KVM_PTE_TYPE_TABLE);
	pte |= KVM_PTE_VALID;
	return pte;
}

static kvm_pte_t kvm_init_valid_leaf_pte(u64 pa, kvm_pte_t attr, s8 level)
{
	kvm_pte_t pte = kvm_phys_to_pte(pa);
	u64 type = (level == KVM_PGTABLE_LAST_LEVEL) ? KVM_PTE_TYPE_PAGE :
						       KVM_PTE_TYPE_BLOCK;

	pte |= attr & (KVM_PTE_LEAF_ATTR_LO | KVM_PTE_LEAF_ATTR_HI);
	pte |= FIELD_PREP(KVM_PTE_TYPE, type);
	pte |= KVM_PTE_VALID;

	return pte;
}

static kvm_pte_t kvm_init_invalid_leaf_owner(u8 owner_id)
{
	return FIELD_PREP(KVM_INVALID_PTE_OWNER_MASK, owner_id);
}

static int kvm_pgtable_visitor_cb(struct kvm_pgtable_walk_data *data,
				  const struct kvm_pgtable_visit_ctx *ctx,
				  enum kvm_pgtable_walk_flags visit)
{
	struct kvm_pgtable_walker *walker = data->walker;

	/* Ensure the appropriate lock is held (e.g. RCU lock for stage-2 MMU) */
	WARN_ON_ONCE(kvm_pgtable_walk_shared(ctx) && !kvm_pgtable_walk_lock_held());
	return walker->cb(ctx, visit);
}

static bool kvm_pgtable_walk_continue(const struct kvm_pgtable_walker *walker,
				      int r)
{
	/*
	 * Visitor callbacks return EAGAIN when the conditions that led to a
	 * fault are no longer reflected in the page tables due to a race to
	 * update a PTE. In the context of a fault handler this is interpreted
	 * as a signal to retry guest execution.
	 *
	 * Ignore the return code altogether for walkers outside a fault handler
	 * (e.g. write protecting a range of memory) and chug along with the
	 * page table walk.
	 */
	if (r == -EAGAIN)
		return !(walker->flags & KVM_PGTABLE_WALK_HANDLE_FAULT);

	return !r;
}
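
/*
 * Sketch of the -EAGAIN contract above (hypothetical visitor; 'new' stands
 * for whatever PTE the caller is trying to install):
 *
 *	static int fault_walker(const struct kvm_pgtable_visit_ctx *ctx,
 *				enum kvm_pgtable_walk_flags visit)
 *	{
 *		if (!stage2_try_set_pte(ctx, new))
 *			return -EAGAIN;
 *		return 0;
 *	}
 *
 * With KVM_PGTABLE_WALK_HANDLE_FAULT set, the -EAGAIN terminates the walk
 * and is propagated so the fault can be retried; without it, the walk simply
 * carries on to the next entry.
 */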

static int __kvm_pgtable_walk(struct kvm_pgtable_walk_data *data,
			      struct kvm_pgtable_mm_ops *mm_ops, kvm_pteref_t pgtable, s8 level);

static inline int __kvm_pgtable_visit(struct kvm_pgtable_walk_data *data,
				      struct kvm_pgtable_mm_ops *mm_ops,
				      kvm_pteref_t pteref, s8 level)
{
	enum kvm_pgtable_walk_flags flags = data->walker->flags;
	kvm_pte_t *ptep = kvm_dereference_pteref(data->walker, pteref);
	struct kvm_pgtable_visit_ctx ctx = {
		.ptep	= ptep,
		.old	= READ_ONCE(*ptep),
		.arg	= data->walker->arg,
		.mm_ops	= mm_ops,
		.start	= data->start,
		.addr	= data->addr,
		.end	= data->end,
		.level	= level,
		.flags	= flags,
	};
	int ret = 0;
	bool reload = false;
	kvm_pteref_t childp;
	bool table = kvm_pte_table(ctx.old, level);

	if (table && (ctx.flags & KVM_PGTABLE_WALK_TABLE_PRE)) {
		ret = kvm_pgtable_visitor_cb(data, &ctx, KVM_PGTABLE_WALK_TABLE_PRE);
		reload = true;
	}

	if (!table && (ctx.flags & KVM_PGTABLE_WALK_LEAF)) {
		ret = kvm_pgtable_visitor_cb(data, &ctx, KVM_PGTABLE_WALK_LEAF);
		reload = true;
	}

	/*
	 * Reload the page table after invoking the walker callback for leaf
	 * entries or after pre-order traversal, to allow the walker to descend
	 * into a newly installed or replaced table.
	 */
	if (reload) {
		ctx.old = READ_ONCE(*ptep);
		table = kvm_pte_table(ctx.old, level);
	}

	if (!kvm_pgtable_walk_continue(data->walker, ret))
		goto out;

	if (!table) {
		data->addr = ALIGN_DOWN(data->addr, kvm_granule_size(level));
		data->addr += kvm_granule_size(level);
		goto out;
	}

	childp = (kvm_pteref_t)kvm_pte_follow(ctx.old, mm_ops);
	ret = __kvm_pgtable_walk(data, mm_ops, childp, level + 1);
	if (!kvm_pgtable_walk_continue(data->walker, ret))
		goto out;

	if (ctx.flags & KVM_PGTABLE_WALK_TABLE_POST)
		ret = kvm_pgtable_visitor_cb(data, &ctx, KVM_PGTABLE_WALK_TABLE_POST);

out:
	if (kvm_pgtable_walk_continue(data->walker, ret))
		return 0;

	return ret;
}

static int __kvm_pgtable_walk(struct kvm_pgtable_walk_data *data,
			      struct kvm_pgtable_mm_ops *mm_ops, kvm_pteref_t pgtable, s8 level)
{
	u32 idx;
	int ret = 0;

	if (WARN_ON_ONCE(level < KVM_PGTABLE_FIRST_LEVEL ||
			 level > KVM_PGTABLE_LAST_LEVEL))
		return -EINVAL;

	for (idx = kvm_pgtable_idx(data, level); idx < PTRS_PER_PTE; ++idx) {
		kvm_pteref_t pteref = &pgtable[idx];

		if (data->addr >= data->end)
			break;

		ret = __kvm_pgtable_visit(data, mm_ops, pteref, level);
		if (ret)
			break;
	}

	return ret;
}

static int _kvm_pgtable_walk(struct kvm_pgtable *pgt, struct kvm_pgtable_walk_data *data)
{
	u32 idx;
	int ret = 0;
	u64 limit = BIT(pgt->ia_bits);

	if (data->addr > limit || data->end > limit)
		return -ERANGE;

	if (!pgt->pgd)
		return -EINVAL;

	for (idx = kvm_pgd_page_idx(pgt, data->addr); data->addr < data->end; ++idx) {
		kvm_pteref_t pteref = &pgt->pgd[idx * PTRS_PER_PTE];

		ret = __kvm_pgtable_walk(data, pgt->mm_ops, pteref, pgt->start_level);
		if (ret)
			break;
	}

	return ret;
}

int kvm_pgtable_walk(struct kvm_pgtable *pgt, u64 addr, u64 size,
		     struct kvm_pgtable_walker *walker)
{
	struct kvm_pgtable_walk_data walk_data = {
		.start	= ALIGN_DOWN(addr, PAGE_SIZE),
		.addr	= ALIGN_DOWN(addr, PAGE_SIZE),
		.end	= PAGE_ALIGN(walk_data.addr + size),
		.walker	= walker,
	};
	int r;

	r = kvm_pgtable_walk_begin(walker);
	if (r)
		return r;

	r = _kvm_pgtable_walk(pgt, &walk_data);
	kvm_pgtable_walk_end(walker);

	return r;
}
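
/*
 * A minimal usage sketch of the walker API; 'my_visitor' and 'my_data' are
 * hypothetical, and leaf_walker/kvm_pgtable_get_leaf below is an in-tree
 * example of the same pattern:
 *
 *	struct kvm_pgtable_walker walker = {
 *		.cb	= my_visitor,
 *		.flags	= KVM_PGTABLE_WALK_LEAF,
 *		.arg	= &my_data,
 *	};
 *
 *	ret = kvm_pgtable_walk(pgt, addr, size, &walker);
 */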

struct leaf_walk_data {
	kvm_pte_t	pte;
	s8		level;
};

static int leaf_walker(const struct kvm_pgtable_visit_ctx *ctx,
		       enum kvm_pgtable_walk_flags visit)
{
	struct leaf_walk_data *data = ctx->arg;

	data->pte   = ctx->old;
	data->level = ctx->level;

	return 0;
}

int kvm_pgtable_get_leaf(struct kvm_pgtable *pgt, u64 addr,
			 kvm_pte_t *ptep, s8 *level)
{
	struct leaf_walk_data data;
	struct kvm_pgtable_walker walker = {
		.cb	= leaf_walker,
		.flags	= KVM_PGTABLE_WALK_LEAF,
		.arg	= &data,
	};
	int ret;

	ret = kvm_pgtable_walk(pgt, ALIGN_DOWN(addr, PAGE_SIZE),
			       PAGE_SIZE, &walker);
	if (!ret) {
		if (ptep)
			*ptep  = data.pte;
		if (level)
			*level = data.level;
	}

	return ret;
}

struct hyp_map_data {
	const u64	phys;
	kvm_pte_t	attr;
};

static int hyp_set_prot_attr(enum kvm_pgtable_prot prot, kvm_pte_t *ptep)
{
	bool device = prot & KVM_PGTABLE_PROT_DEVICE;
	u32 mtype = device ? MT_DEVICE_nGnRE : MT_NORMAL;
	kvm_pte_t attr = FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S1_ATTRIDX, mtype);
	u32 sh = KVM_PTE_LEAF_ATTR_LO_S1_SH_IS;
	u32 ap = (prot & KVM_PGTABLE_PROT_W) ? KVM_PTE_LEAF_ATTR_LO_S1_AP_RW :
					       KVM_PTE_LEAF_ATTR_LO_S1_AP_RO;

	if (!(prot & KVM_PGTABLE_PROT_R))
		return -EINVAL;

	if (prot & KVM_PGTABLE_PROT_X) {
		if (prot & KVM_PGTABLE_PROT_W)
			return -EINVAL;

		if (device)
			return -EINVAL;

		if (system_supports_bti_kernel())
			attr |= KVM_PTE_LEAF_ATTR_HI_S1_GP;
	} else {
		attr |= KVM_PTE_LEAF_ATTR_HI_S1_XN;
	}

	attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S1_AP, ap);
	if (!kvm_lpa2_is_enabled())
		attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S1_SH, sh);
	attr |= KVM_PTE_LEAF_ATTR_LO_S1_AF;
	attr |= prot & KVM_PTE_LEAF_ATTR_HI_SW;
	*ptep = attr;

	return 0;
}
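
/*
 * For example (a sketch of the composition above), a normal-memory RW hyp
 * mapping ends up with MT_NORMAL, AP == RW, XN and AF set, plus the
 * inner-shareable attribute unless LPA2 is in use:
 *
 *	kvm_pte_t attr;
 *	int ret = hyp_set_prot_attr(KVM_PGTABLE_PROT_RW, &attr);
 */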

enum kvm_pgtable_prot kvm_pgtable_hyp_pte_prot(kvm_pte_t pte)
{
	enum kvm_pgtable_prot prot = pte & KVM_PTE_LEAF_ATTR_HI_SW;
	u32 ap;

	if (!kvm_pte_valid(pte))
		return prot;

	if (!(pte & KVM_PTE_LEAF_ATTR_HI_S1_XN))
		prot |= KVM_PGTABLE_PROT_X;

	ap = FIELD_GET(KVM_PTE_LEAF_ATTR_LO_S1_AP, pte);
	if (ap == KVM_PTE_LEAF_ATTR_LO_S1_AP_RO)
		prot |= KVM_PGTABLE_PROT_R;
	else if (ap == KVM_PTE_LEAF_ATTR_LO_S1_AP_RW)
		prot |= KVM_PGTABLE_PROT_RW;

	return prot;
}

static bool hyp_map_walker_try_leaf(const struct kvm_pgtable_visit_ctx *ctx,
				    struct hyp_map_data *data)
{
	u64 phys = data->phys + (ctx->addr - ctx->start);
	kvm_pte_t new;

	if (!kvm_block_mapping_supported(ctx, phys))
		return false;

	new = kvm_init_valid_leaf_pte(phys, data->attr, ctx->level);
	if (ctx->old == new)
		return true;
	if (!kvm_pte_valid(ctx->old))
		ctx->mm_ops->get_page(ctx->ptep);
	else if (WARN_ON((ctx->old ^ new) & ~KVM_PTE_LEAF_ATTR_HI_SW))
		return false;

	smp_store_release(ctx->ptep, new);
	return true;
}

static int hyp_map_walker(const struct kvm_pgtable_visit_ctx *ctx,
			  enum kvm_pgtable_walk_flags visit)
{
	kvm_pte_t *childp, new;
	struct hyp_map_data *data = ctx->arg;
	struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;

	if (hyp_map_walker_try_leaf(ctx, data))
		return 0;

	if (WARN_ON(ctx->level == KVM_PGTABLE_LAST_LEVEL))
		return -EINVAL;

	childp = (kvm_pte_t *)mm_ops->zalloc_page(NULL);
	if (!childp)
		return -ENOMEM;

	new = kvm_init_table_pte(childp, mm_ops);
	mm_ops->get_page(ctx->ptep);
	smp_store_release(ctx->ptep, new);

	return 0;
}

int kvm_pgtable_hyp_map(struct kvm_pgtable *pgt, u64 addr, u64 size, u64 phys,
			enum kvm_pgtable_prot prot)
{
	int ret;
	struct hyp_map_data map_data = {
		.phys	= ALIGN_DOWN(phys, PAGE_SIZE),
	};
	struct kvm_pgtable_walker walker = {
		.cb	= hyp_map_walker,
		.flags	= KVM_PGTABLE_WALK_LEAF,
		.arg	= &map_data,
	};

	ret = hyp_set_prot_attr(prot, &map_data.attr);
	if (ret)
		return ret;

	ret = kvm_pgtable_walk(pgt, addr, size, &walker);
	dsb(ishst);
	isb();
	return ret;
}

static int hyp_unmap_walker(const struct kvm_pgtable_visit_ctx *ctx,
			    enum kvm_pgtable_walk_flags visit)
{
	kvm_pte_t *childp = NULL;
	u64 granule = kvm_granule_size(ctx->level);
	u64 *unmapped = ctx->arg;
	struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;

	if (!kvm_pte_valid(ctx->old))
		return -EINVAL;

	if (kvm_pte_table(ctx->old, ctx->level)) {
		childp = kvm_pte_follow(ctx->old, mm_ops);

		if (mm_ops->page_count(childp) != 1)
			return 0;

		kvm_clear_pte(ctx->ptep);
		dsb(ishst);
		__tlbi_level(vae2is, __TLBI_VADDR(ctx->addr, 0), TLBI_TTL_UNKNOWN);
	} else {
		if (ctx->end - ctx->addr < granule)
			return -EINVAL;

		kvm_clear_pte(ctx->ptep);
		dsb(ishst);
		__tlbi_level(vale2is, __TLBI_VADDR(ctx->addr, 0), ctx->level);
		*unmapped += granule;
	}

	dsb(ish);
	isb();
	mm_ops->put_page(ctx->ptep);

	if (childp)
		mm_ops->put_page(childp);

	return 0;
}

u64 kvm_pgtable_hyp_unmap(struct kvm_pgtable *pgt, u64 addr, u64 size)
{
	u64 unmapped = 0;
	struct kvm_pgtable_walker walker = {
		.cb	= hyp_unmap_walker,
		.arg	= &unmapped,
		.flags	= KVM_PGTABLE_WALK_LEAF | KVM_PGTABLE_WALK_TABLE_POST,
	};

	if (!pgt->mm_ops->page_count)
		return 0;

	kvm_pgtable_walk(pgt, addr, size, &walker);
	return unmapped;
}

int kvm_pgtable_hyp_init(struct kvm_pgtable *pgt, u32 va_bits,
			 struct kvm_pgtable_mm_ops *mm_ops)
{
	s8 start_level = KVM_PGTABLE_LAST_LEVEL + 1 -
			 ARM64_HW_PGTABLE_LEVELS(va_bits);

	if (start_level < KVM_PGTABLE_FIRST_LEVEL ||
	    start_level > KVM_PGTABLE_LAST_LEVEL)
		return -EINVAL;

	pgt->pgd = (kvm_pteref_t)mm_ops->zalloc_page(NULL);
	if (!pgt->pgd)
		return -ENOMEM;

	pgt->ia_bits		= va_bits;
	pgt->start_level	= start_level;
	pgt->mm_ops		= mm_ops;
	pgt->mmu		= NULL;
	pgt->force_pte_cb	= NULL;

	return 0;
}

static int hyp_free_walker(const struct kvm_pgtable_visit_ctx *ctx,
			   enum kvm_pgtable_walk_flags visit)
{
	struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;

	if (!kvm_pte_valid(ctx->old))
		return 0;

	mm_ops->put_page(ctx->ptep);

	if (kvm_pte_table(ctx->old, ctx->level))
		mm_ops->put_page(kvm_pte_follow(ctx->old, mm_ops));

	return 0;
}

void kvm_pgtable_hyp_destroy(struct kvm_pgtable *pgt)
{
	struct kvm_pgtable_walker walker = {
		.cb	= hyp_free_walker,
		.flags	= KVM_PGTABLE_WALK_LEAF | KVM_PGTABLE_WALK_TABLE_POST,
	};

	WARN_ON(kvm_pgtable_walk(pgt, 0, BIT(pgt->ia_bits), &walker));
	pgt->mm_ops->put_page(kvm_dereference_pteref(&walker, pgt->pgd));
	pgt->pgd = NULL;
}

struct stage2_map_data {
	const u64			phys;
	kvm_pte_t			attr;
	u8				owner_id;

	kvm_pte_t			*anchor;
	kvm_pte_t			*childp;

	struct kvm_s2_mmu		*mmu;
	void				*memcache;

	/* Force mappings to page granularity */
	bool				force_pte;

	/* Walk should update owner_id only */
	bool				annotation;
};

u64 kvm_get_vtcr(u64 mmfr0, u64 mmfr1, u32 phys_shift)
{
	u64 vtcr = VTCR_EL2_FLAGS;
	s8 lvls;

	vtcr |= kvm_get_parange(mmfr0) << VTCR_EL2_PS_SHIFT;
	vtcr |= VTCR_EL2_T0SZ(phys_shift);
	/*
	 * Use a minimum 2 level page table to prevent splitting
	 * host PMD huge pages at stage2.
	 */
	lvls = stage2_pgtable_levels(phys_shift);
	if (lvls < 2)
		lvls = 2;

	/*
	 * When LPA2 is enabled, the HW supports an extra level of translation
	 * (for 5 in total) when using 4K pages. It also introduces VTCR_EL2.SL2
	 * as an addition to SL0 to enable encoding this extra start level.
	 * However, since we always use concatenated pages for the first level
	 * lookup, we will never need this extra level and therefore do not need
	 * to touch SL2.
	 */
	vtcr |= VTCR_EL2_LVLS_TO_SL0(lvls);

#ifdef CONFIG_ARM64_HW_AFDBM
	/*
	 * Enable the Hardware Access Flag management, unconditionally
	 * on all CPUs. In systems that have asymmetric support for the feature
	 * this allows KVM to leverage hardware support on the subset of cores
	 * that implement the feature.
	 *
	 * The architecture requires VTCR_EL2.HA to be RES0 (thus ignored by
	 * hardware) on implementations that do not advertise support for the
	 * feature. As such, setting HA unconditionally is safe, unless you
	 * happen to be running on a design that has unadvertised support for
	 * HAFDBS. Here be dragons.
	 */
	if (!cpus_have_final_cap(ARM64_WORKAROUND_AMPERE_AC03_CPU_38))
		vtcr |= VTCR_EL2_HA;
#endif /* CONFIG_ARM64_HW_AFDBM */

	if (kvm_lpa2_is_enabled())
		vtcr |= VTCR_EL2_DS;

	/* Set the vmid bits */
	vtcr |= (get_vmid_bits(mmfr1) == 16) ?
		VTCR_EL2_VS_16BIT :
		VTCR_EL2_VS_8BIT;

	return vtcr;
}

static bool stage2_has_fwb(struct kvm_pgtable *pgt)
{
	if (!cpus_have_final_cap(ARM64_HAS_STAGE2_FWB))
		return false;

	return !(pgt->flags & KVM_PGTABLE_S2_NOFWB);
}

void kvm_tlb_flush_vmid_range(struct kvm_s2_mmu *mmu,
			      phys_addr_t addr, size_t size)
{
	unsigned long pages, inval_pages;

	if (!system_supports_tlb_range()) {
		kvm_call_hyp(__kvm_tlb_flush_vmid, mmu);
		return;
	}

	pages = size >> PAGE_SHIFT;
	while (pages > 0) {
		inval_pages = min(pages, MAX_TLBI_RANGE_PAGES);
		kvm_call_hyp(__kvm_tlb_flush_vmid_range, mmu, addr, inval_pages);

		addr += inval_pages << PAGE_SHIFT;
		pages -= inval_pages;
	}
}

#define KVM_S2_MEMATTR(pgt, attr) PAGE_S2_MEMATTR(attr, stage2_has_fwb(pgt))

static int stage2_set_prot_attr(struct kvm_pgtable *pgt, enum kvm_pgtable_prot prot,
				kvm_pte_t *ptep)
{
	kvm_pte_t attr;
	u32 sh = KVM_PTE_LEAF_ATTR_LO_S2_SH_IS;

	switch (prot & (KVM_PGTABLE_PROT_DEVICE |
			KVM_PGTABLE_PROT_NORMAL_NC)) {
	case KVM_PGTABLE_PROT_DEVICE | KVM_PGTABLE_PROT_NORMAL_NC:
		return -EINVAL;
	case KVM_PGTABLE_PROT_DEVICE:
		if (prot & KVM_PGTABLE_PROT_X)
			return -EINVAL;
		attr = KVM_S2_MEMATTR(pgt, DEVICE_nGnRE);
		break;
	case KVM_PGTABLE_PROT_NORMAL_NC:
		if (prot & KVM_PGTABLE_PROT_X)
			return -EINVAL;
		attr = KVM_S2_MEMATTR(pgt, NORMAL_NC);
		break;
	default:
		attr = KVM_S2_MEMATTR(pgt, NORMAL);
	}

	if (!(prot & KVM_PGTABLE_PROT_X))
		attr |= KVM_PTE_LEAF_ATTR_HI_S2_XN;

	if (prot & KVM_PGTABLE_PROT_R)
		attr |= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R;

	if (prot & KVM_PGTABLE_PROT_W)
		attr |= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W;

KVM: arm64: Use LPA2 page-tables for stage2 and hyp stage1
Implement a simple policy whereby if the HW supports FEAT_LPA2 for the
page size we are using, always use LPA2-style page-tables for stage 2
and hyp stage 1 (assuming an nvhe hyp), regardless of the VMM-requested
IPA size or HW-implemented PA size. When in use we can now support up to
52-bit IPA and PA sizes.
We use the previously created cpu feature to track whether LPA2 is
supported for deciding whether to use the LPA2 or classic pte format.
Note that FEAT_LPA2 brings support for bigger block mappings (512GB with
4KB, 64GB with 16KB). We explicitly don't enable these in the library
because stage2_apply_range() works on batch sizes of the largest used
block mapping, and increasing the size of the batch would lead to soft
lockups. See commit 5994bc9e05c2 ("KVM: arm64: Limit
stage2_apply_range() batch size to largest block").
With the addition of LPA2 support in the hypervisor, the PA size
supported by the HW must be capped with a runtime decision, rather than
simply using a compile-time decision based on PA_BITS. For example, on a
system that advertises a 52-bit PA but does not support FEAT_LPA2, a 4KB
or 16KB kernel compiled with LPA2 support must still limit the PA size
to 48 bits.
Therefore, move the insertion of the PS field into TCR_EL2 out of
__kvm_hyp_init assembly code and instead do it in cpu_prepare_hyp_mode()
where the rest of TCR_EL2 is prepared. This allows us to figure out PS
with kvm_get_parange(), which has the appropriate logic to ensure the
above requirement. (and the PS field of VTCR_EL2 is already populated
this way).
Reviewed-by: Oliver Upton <oliver.upton@linux.dev>
Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/20231127111737.1897081-8-ryan.roberts@arm.com
2023-11-27 11:17:32 +00:00
	if (!kvm_lpa2_is_enabled())
		attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S2_SH, sh);

	attr |= KVM_PTE_LEAF_ATTR_LO_S2_AF;
	attr |= prot & KVM_PTE_LEAF_ATTR_HI_SW;
	*ptep = attr;

	return 0;
}
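/*
 * Illustrative sketch only (not kernel code): the combinations accepted by
 * the switch in stage2_set_prot_attr() above, expressed as a hypothetical
 * predicate. prot_is_valid_for_stage2() does not exist in the kernel; it is
 * here purely to summarise the rules.
 */
static bool prot_is_valid_for_stage2(enum kvm_pgtable_prot prot)
{
	/* Device and Normal-NC are mutually exclusive memory types. */
	if ((prot & KVM_PGTABLE_PROT_DEVICE) &&
	    (prot & KVM_PGTABLE_PROT_NORMAL_NC))
		return false;

	/* Neither Device nor Normal-NC mappings may be executable. */
	if ((prot & (KVM_PGTABLE_PROT_DEVICE | KVM_PGTABLE_PROT_NORMAL_NC)) &&
	    (prot & KVM_PGTABLE_PROT_X))
		return false;

	return true;
}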
enum kvm_pgtable_prot kvm_pgtable_stage2_pte_prot(kvm_pte_t pte)
{
	enum kvm_pgtable_prot prot = pte & KVM_PTE_LEAF_ATTR_HI_SW;

	if (!kvm_pte_valid(pte))
		return prot;

	if (pte & KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R)
		prot |= KVM_PGTABLE_PROT_R;
	if (pte & KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W)
		prot |= KVM_PGTABLE_PROT_W;
	if (!(pte & KVM_PTE_LEAF_ATTR_HI_S2_XN))
		prot |= KVM_PGTABLE_PROT_X;

	return prot;
}
static bool stage2_pte_needs_update(kvm_pte_t old, kvm_pte_t new)
{
	if (!kvm_pte_valid(old) || !kvm_pte_valid(new))
		return true;

	return ((old ^ new) & (~KVM_PTE_LEAF_ATTR_S2_PERMS));
}

static bool stage2_pte_is_counted(kvm_pte_t pte)
{
	/*
	 * The refcount tracks valid entries as well as invalid entries if they
	 * encode ownership of a page to another entity than the page-table
	 * owner, whose id is 0.
	 */
	return !!pte;
}
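/*
 * PTE update protocol: an invalid PTE carrying KVM_INVALID_PTE_LOCKED marks
 * an entry that a concurrent walker is in the middle of changing. Updates
 * made from a shared walk go through cmpxchg() so that a racing walker
 * loses cleanly, while an exclusive walk can use a plain WRITE_ONCE().
 */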
static bool stage2_pte_is_locked(kvm_pte_t pte)
{
	return !kvm_pte_valid(pte) && (pte & KVM_INVALID_PTE_LOCKED);
}

static bool stage2_try_set_pte(const struct kvm_pgtable_visit_ctx *ctx, kvm_pte_t new)
{
	if (!kvm_pgtable_walk_shared(ctx)) {
		WRITE_ONCE(*ctx->ptep, new);
		return true;
	}

	return cmpxchg(ctx->ptep, ctx->old, new) == ctx->old;
}
/**
 * stage2_try_break_pte() - Invalidates a pte according to the
 *			    'break-before-make' requirements of the
 *			    architecture.
 *
 * @ctx: context of the visited pte.
 * @mmu: stage-2 mmu
 *
 * Returns: true if the pte was successfully broken.
 *
 * If the removed pte was valid, performs the necessary serialization and TLB
 * invalidation for the old value. For counted ptes, drops the reference count
 * on the containing table page.
 */
static bool stage2_try_break_pte(const struct kvm_pgtable_visit_ctx *ctx,
				 struct kvm_s2_mmu *mmu)
{
	struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;

	if (stage2_pte_is_locked(ctx->old)) {
		/*
		 * Should never occur if this walker has exclusive access to the
		 * page tables.
		 */
		WARN_ON(!kvm_pgtable_walk_shared(ctx));
		return false;
	}

	if (!stage2_try_set_pte(ctx, KVM_INVALID_PTE_LOCKED))
		return false;

	if (!kvm_pgtable_walk_skip_bbm_tlbi(ctx)) {
		/*
		 * Perform the appropriate TLB invalidation based on the
		 * evicted pte value (if any).
		 */
		if (kvm_pte_table(ctx->old, ctx->level)) {
			u64 size = kvm_granule_size(ctx->level);
			u64 addr = ALIGN_DOWN(ctx->addr, size);

			kvm_tlb_flush_vmid_range(mmu, addr, size);
		} else if (kvm_pte_valid(ctx->old)) {
			kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, mmu,
				     ctx->addr, ctx->level);
		}
	}

	if (stage2_pte_is_counted(ctx->old))
		mm_ops->put_page(ctx->ptep);

	return true;
}

static void stage2_make_pte(const struct kvm_pgtable_visit_ctx *ctx, kvm_pte_t new)
{
	struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;

	WARN_ON(!stage2_pte_is_locked(*ctx->ptep));

	if (stage2_pte_is_counted(new))
		mm_ops->get_page(ctx->ptep);

	smp_store_release(ctx->ptep, new);
}
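/*
 * Sketch of the expected break-before-make calling pattern for a stage-2
 * walker (illustrative only; see stage2_map_walker_try_leaf() and
 * stage2_map_walk_leaf() further down for the real users):
 *
 *	if (!stage2_try_break_pte(ctx, mmu))	// lock the entry, TLBI old value
 *		return -EAGAIN;			// lost a race, retry the fault
 *	// ...optional CMOs on the new mapping...
 *	stage2_make_pte(ctx, new);		// publish with store-release
 */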
static bool stage2_unmap_defer_tlb_flush(struct kvm_pgtable *pgt)
{
	/*
	 * If FEAT_TLBIRANGE is implemented, defer the individual
	 * TLB invalidations until the entire walk is finished, and
	 * then use the range-based TLBI instructions to do the
	 * invalidations. Condition deferred TLB invalidation on the
	 * system supporting FWB as the optimization is entirely
	 * pointless when the unmap walker needs to perform CMOs.
	 */
	return system_supports_tlb_range() && stage2_has_fwb(pgt);
}

static void stage2_unmap_put_pte(const struct kvm_pgtable_visit_ctx *ctx,
				 struct kvm_s2_mmu *mmu,
				 struct kvm_pgtable_mm_ops *mm_ops)
{
	struct kvm_pgtable *pgt = ctx->arg;

	/*
	 * Clear the existing PTE, and perform break-before-make if it was
	 * valid. Depending on the system support, defer the TLB maintenance
	 * for the same until the entire unmap walk is completed.
	 */
	if (kvm_pte_valid(ctx->old)) {
		kvm_clear_pte(ctx->ptep);

		if (kvm_pte_table(ctx->old, ctx->level)) {
			kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, mmu, ctx->addr,
				     TLBI_TTL_UNKNOWN);
		} else if (!stage2_unmap_defer_tlb_flush(pgt)) {
			kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, mmu, ctx->addr,
				     ctx->level);
		}
	}

	mm_ops->put_page(ctx->ptep);
}
static bool stage2_pte_cacheable(struct kvm_pgtable *pgt, kvm_pte_t pte)
{
	u64 memattr = pte & KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR;

	return kvm_pte_valid(pte) && memattr == KVM_S2_MEMATTR(pgt, NORMAL);
}

static bool stage2_pte_executable(kvm_pte_t pte)
{
	return kvm_pte_valid(pte) && !(pte & KVM_PTE_LEAF_ATTR_HI_S2_XN);
}
KVM: arm64: Infer the PA offset from IPA in stage-2 map walker
Until now, the page table walker counted increments to the PA and IPA
of a walk in two separate places. While the PA is incremented as soon as
a leaf PTE is installed in stage2_map_walker_try_leaf(), the IPA is
actually bumped in the generic table walker context. Critically,
__kvm_pgtable_visit() rereads the PTE after the LEAF callback returns
to work out if a table or leaf was installed, and only bumps the IPA for
a leaf PTE.
This arrangement worked fine when we handled faults behind the write lock,
as the walker had exclusive access to the stage-2 page tables. However,
commit 1577cb5823ce ("KVM: arm64: Handle stage-2 faults in parallel")
started handling all stage-2 faults behind the read lock, opening up a
race where a walker could increment the PA but not the IPA of a walk.
Nothing good ensues, as the walker starts mapping with the incorrect
IPA -> PA relationship.
For example, assume that two vCPUs took a data abort on the same IPA.
One observes that dirty logging is disabled, and the other observes that
it is enabled:
  vCPU attempting PMD mapping              vCPU attempting PTE mapping
  ======================================   =====================================
  /* install PMD */
  stage2_make_pte(ctx, leaf);
  data->phys += granule;
                                           /* replace PMD with a table */
                                           stage2_try_break_pte(ctx, data->mmu);
                                           stage2_make_pte(ctx, table);
  /* table is observed */
  ctx.old = READ_ONCE(*ptep);
  table = kvm_pte_table(ctx.old, level);

  /*
   * map walk continues w/o incrementing
   * IPA.
   */
  __kvm_pgtable_walk(..., level + 1);
Bring an end to the whole mess by using the IPA as the single source of
truth for how far along a walk has gotten. Work out the correct PA to
map by calculating the IPA offset from the beginning of the walk and add
that to the starting physical address.
Cc: stable@vger.kernel.org
Fixes: 1577cb5823ce ("KVM: arm64: Handle stage-2 faults in parallel")
Signed-off-by: Oliver Upton <oliver.upton@linux.dev>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/20230421071606.1603916-2-oliver.upton@linux.dev
static u64 stage2_map_walker_phys_addr(const struct kvm_pgtable_visit_ctx *ctx,
				       const struct stage2_map_data *data)
{
	u64 phys = data->phys;

	/* Work out the correct PA based on how far the walk has gotten */
	return phys + (ctx->addr - ctx->start);
}
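/*
 * Worked example (illustrative numbers only): if a walk started at
 * ctx->start == 0x40000000 with data->phys == 0x80000000, then when the
 * walker visits ctx->addr == 0x40200000 the leaf is installed at
 * PA 0x80200000, i.e. data->phys plus the IPA offset into the walk.
 */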
static bool stage2_leaf_mapping_allowed(const struct kvm_pgtable_visit_ctx *ctx,
					struct stage2_map_data *data)
{
	u64 phys = stage2_map_walker_phys_addr(ctx, data);

	if (data->force_pte && ctx->level < KVM_PGTABLE_LAST_LEVEL)
		return false;

	if (data->annotation)
		return true;

	return kvm_block_mapping_supported(ctx, phys);
}
KVM: arm64: Filter out the case of only changing permissions from stage-2 map path
(1) During the runtime of a VM with a number of vCPUs, if several vCPUs
access the same GPA at almost the same time and the stage-2 mapping of
the GPA has not been built yet, they will all take translation faults.
The first vCPU builds the mapping, and the following ones end up
updating the valid leaf PTE. Note that these vCPUs might want different
access permissions (RO, RW, RX, RWX, etc.).
(2) It's inevitable that we will sometimes update an existing valid leaf
PTE in the map path, and we perform break-before-make in this case.
More unnecessary translation faults can then be caused if the
*break stage* of BBM is caught by other vCPUs.
With (1) and (2), something unsatisfactory could happen: vCPU A takes
a translation fault and builds the mapping with RW permissions, vCPU B
then updates the valid leaf PTE with break-before-make and the permissions
are reverted to RO. Besides, the *break stage* of BBM may trigger more
translation faults, so some useless small loops can occur.
We can optimize this: when we need to update a valid leaf PTE in the map
path, filter out the case where the update only changes the access
permissions, and don't update the valid leaf PTE here. Instead, let the
vCPU re-enter the guest; it will exit again if it still wants more
permissions and then go through the relax_perms path without
break-before-make.
Signed-off-by: Yanan Wang <wangyanan55@huawei.com>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/20210114121350.123684-3-wangyanan55@huawei.com
static int stage2_map_walker_try_leaf(const struct kvm_pgtable_visit_ctx *ctx,
				      struct stage2_map_data *data)
{
	kvm_pte_t new;
	u64 phys = stage2_map_walker_phys_addr(ctx, data);
	u64 granule = kvm_granule_size(ctx->level);
	struct kvm_pgtable *pgt = data->mmu->pgt;
	struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;

	if (!stage2_leaf_mapping_allowed(ctx, data))
		return -E2BIG;

	if (!data->annotation)
		new = kvm_init_valid_leaf_pte(phys, data->attr, ctx->level);
	else
		new = kvm_init_invalid_leaf_owner(data->owner_id);

	/*
	 * Skip updating the PTE if we are trying to recreate the exact
	 * same mapping or only change the access permissions. Instead,
	 * the vCPU will exit one more time from guest if still needed
	 * and then go through the path of relaxing permissions.
	 */
	if (!stage2_pte_needs_update(ctx->old, new))
		return -EAGAIN;

	/* If we're only changing software bits, then store them and go! */
	if (!kvm_pgtable_walk_shared(ctx) &&
	    !((ctx->old ^ new) & ~KVM_PTE_LEAF_ATTR_HI_SW)) {
		bool old_is_counted = stage2_pte_is_counted(ctx->old);

		if (old_is_counted != stage2_pte_is_counted(new)) {
			if (old_is_counted)
				mm_ops->put_page(ctx->ptep);
			else
				mm_ops->get_page(ctx->ptep);
		}
		WARN_ON_ONCE(!stage2_try_set_pte(ctx, new));
		return 0;
	}

	if (!stage2_try_break_pte(ctx, data->mmu))
		return -EAGAIN;

	/* Perform CMOs before installation of the guest stage-2 PTE */
	if (!kvm_pgtable_walk_skip_cmo(ctx) && mm_ops->dcache_clean_inval_poc &&
	    stage2_pte_cacheable(pgt, new))
		mm_ops->dcache_clean_inval_poc(kvm_pte_follow(new, mm_ops),
					       granule);

	if (!kvm_pgtable_walk_skip_cmo(ctx) && mm_ops->icache_inval_pou &&
	    stage2_pte_executable(new))
		mm_ops->icache_inval_pou(kvm_pte_follow(new, mm_ops), granule);

	stage2_make_pte(ctx, new);

	return 0;
}
static int stage2_map_walk_table_pre(const struct kvm_pgtable_visit_ctx *ctx,
				     struct stage2_map_data *data)
{
	struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;
	kvm_pte_t *childp = kvm_pte_follow(ctx->old, mm_ops);
	int ret;

	if (!stage2_leaf_mapping_allowed(ctx, data))
		return 0;

	ret = stage2_map_walker_try_leaf(ctx, data);
	if (ret)
		return ret;

	mm_ops->free_unlinked_table(childp, ctx->level);
	return 0;
}
static int stage2_map_walk_leaf(const struct kvm_pgtable_visit_ctx *ctx,
				struct stage2_map_data *data)
{
	struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;
	kvm_pte_t *childp, new;
	int ret;

	ret = stage2_map_walker_try_leaf(ctx, data);
	if (ret != -E2BIG)
		return ret;

	if (WARN_ON(ctx->level == KVM_PGTABLE_LAST_LEVEL))
		return -EINVAL;

	if (!data->memcache)
		return -ENOMEM;

	childp = mm_ops->zalloc_page(data->memcache);
	if (!childp)
		return -ENOMEM;

	if (!stage2_try_break_pte(ctx, data->mmu)) {
		mm_ops->put_page(childp);
		return -EAGAIN;
	}

	/*
	 * If we've run into an existing block mapping then replace it with
	 * a table. Accesses beyond 'end' that fall within the new table
	 * will be mapped lazily.
	 */
	new = kvm_init_table_pte(childp, mm_ops);
	stage2_make_pte(ctx, new);

	return 0;
}
/*
 * The TABLE_PRE callback runs for table entries on the way down, looking
 * for table entries which we could conceivably replace with a block entry
 * for this mapping. If it finds one it replaces the entry and calls
 * kvm_pgtable_mm_ops::free_unlinked_table() to tear down the detached table.
 *
 * Otherwise, the LEAF callback performs the mapping at the existing leaves
 * instead.
 */
static int stage2_map_walker(const struct kvm_pgtable_visit_ctx *ctx,
			     enum kvm_pgtable_walk_flags visit)
{
	struct stage2_map_data *data = ctx->arg;

	switch (visit) {
	case KVM_PGTABLE_WALK_TABLE_PRE:
		return stage2_map_walk_table_pre(ctx, data);
	case KVM_PGTABLE_WALK_LEAF:
		return stage2_map_walk_leaf(ctx, data);
	default:
		return -EINVAL;
	}
}
int kvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size,
			   u64 phys, enum kvm_pgtable_prot prot,
			   void *mc, enum kvm_pgtable_walk_flags flags)
{
	int ret;
	struct stage2_map_data map_data = {
		.phys		= ALIGN_DOWN(phys, PAGE_SIZE),
		.mmu		= pgt->mmu,
		.memcache	= mc,
		.force_pte	= pgt->force_pte_cb && pgt->force_pte_cb(addr, addr + size, prot),
	};
	struct kvm_pgtable_walker walker = {
		.cb		= stage2_map_walker,
		.flags		= flags |
				  KVM_PGTABLE_WALK_TABLE_PRE |
				  KVM_PGTABLE_WALK_LEAF,
		.arg		= &map_data,
	};

	if (WARN_ON((pgt->flags & KVM_PGTABLE_S2_IDMAP) && (addr != phys)))
		return -EINVAL;

	ret = stage2_set_prot_attr(pgt, prot, &map_data.attr);
	if (ret)
		return ret;

	ret = kvm_pgtable_walk(pgt, addr, size, &walker);
	dsb(ishst);
	return ret;
}
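/*
 * Illustrative usage sketch only (not part of the kernel): map a 2MiB region
 * of normal memory as RWX into a guest's stage-2 and then remove write
 * permission from the freshly-installed leaves (e.g. for dirty logging).
 * The function name is hypothetical; pgt, ipa, phys and memcache are assumed
 * to come from the caller, and error handling is trimmed.
 */
static int example_map_and_wrprotect(struct kvm_pgtable *pgt, u64 ipa,
				     u64 phys, void *memcache)
{
	int ret;

	ret = kvm_pgtable_stage2_map(pgt, ipa, SZ_2M, phys,
				     KVM_PGTABLE_PROT_R |
				     KVM_PGTABLE_PROT_W |
				     KVM_PGTABLE_PROT_X,
				     memcache, 0);
	if (ret)
		return ret;

	/* Clear the stage-2 write permission again over the same range. */
	return kvm_pgtable_stage2_wrprotect(pgt, ipa, SZ_2M);
}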
int kvm_pgtable_stage2_set_owner(struct kvm_pgtable *pgt, u64 addr, u64 size,
				 void *mc, u8 owner_id)
{
	int ret;
	struct stage2_map_data map_data = {
		.mmu		= pgt->mmu,
		.memcache	= mc,
		.owner_id	= owner_id,
		.force_pte	= true,
		.annotation	= true,
	};
	struct kvm_pgtable_walker walker = {
		.cb		= stage2_map_walker,
		.flags		= KVM_PGTABLE_WALK_TABLE_PRE |
				  KVM_PGTABLE_WALK_LEAF,
		.arg		= &map_data,
	};

	if (owner_id > KVM_MAX_OWNER_ID)
		return -EINVAL;

	ret = kvm_pgtable_walk(pgt, addr, size, &walker);
	return ret;
}
static int stage2_unmap_walker(const struct kvm_pgtable_visit_ctx *ctx,
			       enum kvm_pgtable_walk_flags visit)
{
	struct kvm_pgtable *pgt = ctx->arg;
	struct kvm_s2_mmu *mmu = pgt->mmu;
	struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;
	kvm_pte_t *childp = NULL;
	bool need_flush = false;

	if (!kvm_pte_valid(ctx->old)) {
		if (stage2_pte_is_counted(ctx->old)) {
			kvm_clear_pte(ctx->ptep);
			mm_ops->put_page(ctx->ptep);
		}
		return 0;
	}

	if (kvm_pte_table(ctx->old, ctx->level)) {
		childp = kvm_pte_follow(ctx->old, mm_ops);

		if (mm_ops->page_count(childp) != 1)
			return 0;
	} else if (stage2_pte_cacheable(pgt, ctx->old)) {
		need_flush = !stage2_has_fwb(pgt);
	}

	/*
	 * This is similar to the map() path in that we unmap the entire
	 * block entry and rely on the remaining portions being faulted
	 * back lazily.
	 */
	stage2_unmap_put_pte(ctx, mmu, mm_ops);

	if (need_flush && mm_ops->dcache_clean_inval_poc)
		mm_ops->dcache_clean_inval_poc(kvm_pte_follow(ctx->old, mm_ops),
					       kvm_granule_size(ctx->level));

	if (childp)
		mm_ops->put_page(childp);

	return 0;
}
int kvm_pgtable_stage2_unmap(struct kvm_pgtable *pgt, u64 addr, u64 size)
{
	int ret;
	struct kvm_pgtable_walker walker = {
		.cb	= stage2_unmap_walker,
		.arg	= pgt,
		.flags	= KVM_PGTABLE_WALK_LEAF | KVM_PGTABLE_WALK_TABLE_POST,
	};

	ret = kvm_pgtable_walk(pgt, addr, size, &walker);
	if (stage2_unmap_defer_tlb_flush(pgt))
		/* Perform the deferred TLB invalidations */
		kvm_tlb_flush_vmid_range(pgt->mmu, addr, size);

	return ret;
}
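/*
 * The helpers below implement attribute-only updates on existing leaves:
 * stage2_attr_walker() applies a set/clear mask to each valid leaf PTE, and
 * the wrappers (wrprotect, mkyoung, relax_perms) express their operation as
 * such a mask pair, while the page-aging walker clears the AF bit directly.
 */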
struct stage2_attr_data {
	kvm_pte_t	attr_set;
	kvm_pte_t	attr_clr;
	kvm_pte_t	pte;
	s8		level;
};

static int stage2_attr_walker(const struct kvm_pgtable_visit_ctx *ctx,
			      enum kvm_pgtable_walk_flags visit)
{
	kvm_pte_t pte = ctx->old;
	struct stage2_attr_data *data = ctx->arg;
	struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;

	if (!kvm_pte_valid(ctx->old))
		return -EAGAIN;

	data->level = ctx->level;
	data->pte = pte;
	pte &= ~data->attr_clr;
	pte |= data->attr_set;

	/*
	 * We may race with the CPU trying to set the access flag here,
	 * but worst-case the access flag update gets lost and will be
	 * set on the next access instead.
	 */
	if (data->pte != pte) {
		/*
		 * Invalidate instruction cache before updating the guest
		 * stage-2 PTE if we are going to add executable permission.
		 */
		if (mm_ops->icache_inval_pou &&
		    stage2_pte_executable(pte) && !stage2_pte_executable(ctx->old))
			mm_ops->icache_inval_pou(kvm_pte_follow(pte, mm_ops),
						 kvm_granule_size(ctx->level));

		if (!stage2_try_set_pte(ctx, pte))
			return -EAGAIN;
	}

	return 0;
}
static int stage2_update_leaf_attrs(struct kvm_pgtable *pgt, u64 addr,
				    u64 size, kvm_pte_t attr_set,
				    kvm_pte_t attr_clr, kvm_pte_t *orig_pte,
				    s8 *level, enum kvm_pgtable_walk_flags flags)
{
	int ret;
	kvm_pte_t attr_mask = KVM_PTE_LEAF_ATTR_LO | KVM_PTE_LEAF_ATTR_HI;
	struct stage2_attr_data data = {
		.attr_set	= attr_set & attr_mask,
		.attr_clr	= attr_clr & attr_mask,
	};
	struct kvm_pgtable_walker walker = {
		.cb		= stage2_attr_walker,
		.arg		= &data,
		.flags		= flags | KVM_PGTABLE_WALK_LEAF,
	};

	ret = kvm_pgtable_walk(pgt, addr, size, &walker);
	if (ret)
		return ret;

	if (orig_pte)
		*orig_pte = data.pte;

	if (level)
		*level = data.level;
	return 0;
}
int kvm_pgtable_stage2_wrprotect(struct kvm_pgtable *pgt, u64 addr, u64 size)
{
	return stage2_update_leaf_attrs(pgt, addr, size, 0,
					KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W,
					NULL, NULL, 0);
}
void kvm_pgtable_stage2_mkyoung(struct kvm_pgtable *pgt, u64 addr,
				enum kvm_pgtable_walk_flags flags)
{
	int ret;

	ret = stage2_update_leaf_attrs(pgt, addr, 1, KVM_PTE_LEAF_ATTR_LO_S2_AF, 0,
				       NULL, NULL, flags);
	if (!ret)
		dsb(ishst);
}
struct stage2_age_data {
	bool	mkold;
	bool	young;
};

static int stage2_age_walker(const struct kvm_pgtable_visit_ctx *ctx,
			     enum kvm_pgtable_walk_flags visit)
{
	kvm_pte_t new = ctx->old & ~KVM_PTE_LEAF_ATTR_LO_S2_AF;
	struct stage2_age_data *data = ctx->arg;

	if (!kvm_pte_valid(ctx->old) || new == ctx->old)
		return 0;

	data->young = true;

	/*
	 * stage2_age_walker() is always called while holding the MMU lock for
	 * write, so this will always succeed. Nonetheless, this deliberately
	 * follows the race detection pattern of the other stage-2 walkers in
	 * case the locking mechanics of the MMU notifiers is ever changed.
	 */
	if (data->mkold && !stage2_try_set_pte(ctx, new))
		return -EAGAIN;

	/*
	 * "But where's the TLBI?!", you scream.
	 * "Over in the core code", I sigh.
	 *
	 * See the '->clear_flush_young()' callback on the KVM mmu notifier.
	 */
	return 0;
}

bool kvm_pgtable_stage2_test_clear_young(struct kvm_pgtable *pgt, u64 addr,
					 u64 size, bool mkold)
{
	struct stage2_age_data data = {
		.mkold		= mkold,
	};
	struct kvm_pgtable_walker walker = {
		.cb		= stage2_age_walker,
		.arg		= &data,
		.flags		= KVM_PGTABLE_WALK_LEAF,
	};

	WARN_ON(kvm_pgtable_walk(pgt, addr, size, &walker));
	return data.young;
}
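/*
 * Sketch (hypothetical caller; the real users live in KVM's generic MMU
 * code): age a range of IPA space, clearing the access flag so that a
 * subsequent guest access marks the pages young again.
 *
 *	bool young = kvm_pgtable_stage2_test_clear_young(pgt, ipa, size,
 *							 true);
 *	// 'young' reports whether any leaf in [ipa, ipa + size) had AF set;
 *	// the TLBI for the cleared AF bits is left to the caller.
 */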
int kvm_pgtable_stage2_relax_perms(struct kvm_pgtable *pgt, u64 addr,
				   enum kvm_pgtable_prot prot, enum kvm_pgtable_walk_flags flags)
{
	int ret;
	s8 level;
	kvm_pte_t set = 0, clr = 0;

	if (prot & KVM_PTE_LEAF_ATTR_HI_SW)
		return -EINVAL;

	if (prot & KVM_PGTABLE_PROT_R)
		set |= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R;

	if (prot & KVM_PGTABLE_PROT_W)
		set |= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W;

	if (prot & KVM_PGTABLE_PROT_X)
		clr |= KVM_PTE_LEAF_ATTR_HI_S2_XN;

	ret = stage2_update_leaf_attrs(pgt, addr, 1, set, clr, NULL, &level, flags);
	if (!ret || ret == -EAGAIN)
		kvm_call_hyp(__kvm_tlb_flush_vmid_ipa_nsh, pgt->mmu, addr, level);
	return ret;
}
static int stage2_flush_walker(const struct kvm_pgtable_visit_ctx *ctx,
			       enum kvm_pgtable_walk_flags visit)
{
	struct kvm_pgtable *pgt = ctx->arg;
	struct kvm_pgtable_mm_ops *mm_ops = pgt->mm_ops;

	if (!stage2_pte_cacheable(pgt, ctx->old))
		return 0;

	if (mm_ops->dcache_clean_inval_poc)
		mm_ops->dcache_clean_inval_poc(kvm_pte_follow(ctx->old, mm_ops),
					       kvm_granule_size(ctx->level));
	return 0;
}

int kvm_pgtable_stage2_flush(struct kvm_pgtable *pgt, u64 addr, u64 size)
{
	struct kvm_pgtable_walker walker = {
		.cb	= stage2_flush_walker,
		.flags	= KVM_PGTABLE_WALK_LEAF,
		.arg	= pgt,
	};

	if (stage2_has_fwb(pgt))
		return 0;

	return kvm_pgtable_walk(pgt, addr, size, &walker);
}
kvm_pte_t *kvm_pgtable_stage2_create_unlinked(struct kvm_pgtable *pgt,
					      u64 phys, s8 level,
					      enum kvm_pgtable_prot prot,
					      void *mc, bool force_pte)
{
	struct stage2_map_data map_data = {
		.phys		= phys,
		.mmu		= pgt->mmu,
		.memcache	= mc,
		.force_pte	= force_pte,
	};
	struct kvm_pgtable_walker walker = {
		.cb		= stage2_map_walker,
		.flags		= KVM_PGTABLE_WALK_LEAF |
				  KVM_PGTABLE_WALK_SKIP_BBM_TLBI |
				  KVM_PGTABLE_WALK_SKIP_CMO,
		.arg		= &map_data,
	};
	/*
	 * The input address (.addr) is irrelevant for walking an
	 * unlinked table. Construct an ambiguous IA range to map
	 * kvm_granule_size(level) worth of memory.
	 */
	struct kvm_pgtable_walk_data data = {
		.walker	= &walker,
		.addr	= 0,
		.end	= kvm_granule_size(level),
	};
	struct kvm_pgtable_mm_ops *mm_ops = pgt->mm_ops;
	kvm_pte_t *pgtable;
	int ret;

	if (!IS_ALIGNED(phys, kvm_granule_size(level)))
		return ERR_PTR(-EINVAL);

	ret = stage2_set_prot_attr(pgt, prot, &map_data.attr);
	if (ret)
		return ERR_PTR(ret);

	pgtable = mm_ops->zalloc_page(mc);
	if (!pgtable)
		return ERR_PTR(-ENOMEM);

	ret = __kvm_pgtable_walk(&data, mm_ops, (kvm_pteref_t)pgtable,
				 level + 1);
	if (ret) {
		kvm_pgtable_stage2_free_unlinked(mm_ops, pgtable, level);
		return ERR_PTR(ret);
	}

	return pgtable;
}
/*
 * Get the number of page-tables needed to replace a block with a
 * fully populated tree up to the PTE entries. Note that @level is
 * interpreted as in "level @level entry".
 */
static int stage2_block_get_nr_page_tables(s8 level)
{
	switch (level) {
	case 1:
		return PTRS_PER_PTE + 1;
	case 2:
		return 1;
	case 3:
		return 0;
	default:
		WARN_ON_ONCE(level < KVM_PGTABLE_MIN_BLOCK_LEVEL ||
			     level > KVM_PGTABLE_LAST_LEVEL);
		return -EINVAL;
	};
}
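/*
 * Worked example with 4KiB granules (PTRS_PER_PTE == 512): replacing a
 * level-1 (1GiB) block with a fully-populated tree needs one level-2 table
 * plus 512 level-3 tables, i.e. PTRS_PER_PTE + 1 = 513 pages, while a
 * level-2 (2MiB) block only needs the single page of level-3 PTEs.
 */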
static int stage2_split_walker(const struct kvm_pgtable_visit_ctx *ctx,
			       enum kvm_pgtable_walk_flags visit)
{
	struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;
	struct kvm_mmu_memory_cache *mc = ctx->arg;
	struct kvm_s2_mmu *mmu;
	kvm_pte_t pte = ctx->old, new, *childp;
	enum kvm_pgtable_prot prot;
	s8 level = ctx->level;
	bool force_pte;
	int nr_pages;
	u64 phys;

	/* No huge-pages exist at the last level */
	if (level == KVM_PGTABLE_LAST_LEVEL)
		return 0;

	/* We only split valid block mappings */
	if (!kvm_pte_valid(pte))
		return 0;

	nr_pages = stage2_block_get_nr_page_tables(level);
	if (nr_pages < 0)
		return nr_pages;

	if (mc->nobjs >= nr_pages) {
		/* Build a tree mapped down to the PTE granularity. */
		force_pte = true;
	} else {
		/*
		 * Don't force PTEs, so create_unlinked() below does
		 * not populate the tree up to the PTE level. The
		 * consequence is that the call will require a single
		 * page of level 2 entries at level 1, or a single
		 * page of PTEs at level 2. If we are at level 1, the
		 * PTEs will be created recursively.
		 */
		force_pte = false;
		nr_pages = 1;
	}

	if (mc->nobjs < nr_pages)
		return -ENOMEM;

	mmu = container_of(mc, struct kvm_s2_mmu, split_page_cache);
	phys = kvm_pte_to_phys(pte);
	prot = kvm_pgtable_stage2_pte_prot(pte);

	childp = kvm_pgtable_stage2_create_unlinked(mmu->pgt, phys,
						    level, prot, mc, force_pte);
	if (IS_ERR(childp))
		return PTR_ERR(childp);

	if (!stage2_try_break_pte(ctx, mmu)) {
		kvm_pgtable_stage2_free_unlinked(mm_ops, childp, level);
		return -EAGAIN;
	}

	/*
	 * Note, the contents of the page table are guaranteed to be made
	 * visible before the new PTE is assigned because stage2_make_pte()
	 * writes the PTE using smp_store_release().
	 */
	new = kvm_init_table_pte(childp, mm_ops);
	stage2_make_pte(ctx, new);
	return 0;
}
int kvm_pgtable_stage2_split(struct kvm_pgtable *pgt, u64 addr, u64 size,
			     struct kvm_mmu_memory_cache *mc)
{
	struct kvm_pgtable_walker walker = {
		.cb	= stage2_split_walker,
		.flags	= KVM_PGTABLE_WALK_LEAF,
		.arg	= mc,
	};
	int ret;

	ret = kvm_pgtable_walk(pgt, addr, size, &walker);
	dsb(ishst);
	return ret;
}
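/*
 * Illustrative usage sketch only (not part of the kernel): pre-fill the
 * per-VM split cache and then break a huge-page range down towards PTE
 * granularity, e.g. when eagerly splitting for dirty logging. The function
 * name is hypothetical; kvm_mmu_topup_memory_cache() is the generic KVM
 * memory-cache helper, assumed available here.
 */
static int example_split_range(struct kvm_pgtable *pgt, struct kvm_s2_mmu *mmu,
			       u64 ipa, u64 size)
{
	struct kvm_mmu_memory_cache *mc = &mmu->split_page_cache;
	int ret;

	/* Worst case for splitting one level-1 block all the way to PTEs. */
	ret = kvm_mmu_topup_memory_cache(mc, stage2_block_get_nr_page_tables(1));
	if (ret)
		return ret;

	return kvm_pgtable_stage2_split(pgt, ipa, size, mc);
}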
KVM: arm64: Move VTCR_EL2 into struct s2_mmu
We currently have a global VTCR_EL2 value for each guest, even
if the guest uses NV. This implies that the guest's own S2 must
fit in the host's. This is odd, for multiple reasons:
- the PARange values and the number of IPA bits don't necessarily
  match: you can have 33 bits of IPA space, and yet you can only
  describe 32 or 36 bits of PARange
- when userspace sets the IPA space, it creates a contract with the
  kernel saying "this is the IPA space I'm prepared to handle".
  At no point does it constrain the guest's own IPA space as
  long as the guest doesn't try to use a [I]PA outside of the
  IPA space set by userspace
- we don't even try to hide the value of ID_AA64MMFR0_EL1.PARange.
And then there is the consequence of the above: if a guest tries
to create a S2 that has for input address something that is larger
than the IPA space defined by the host, we inject a fatal exception.
This is no good. For all intents and purposes, a guest should be
able to have the S2 it really wants, as long as the *output* address
of that S2 isn't outside of the IPA space.
For that, we need to have a per-s2_mmu VTCR_EL2 setting, which
allows us to represent the full PARange. Move the vtcr field into
the s2_mmu structure, which has no impact whatsoever, except for NV.
Note that once we are able to override ID_AA64MMFR0_EL1.PARange
from userspace, we'll also be able to restrict the size of the
shadow S2 that NV uses.
Signed-off-by: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/20231012205108.3937270-1-maz@kernel.org
Signed-off-by: Oliver Upton <oliver.upton@linux.dev>
int __kvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm_s2_mmu *mmu,
			      struct kvm_pgtable_mm_ops *mm_ops,
			      enum kvm_pgtable_stage2_flags flags,
			      kvm_pgtable_force_pte_cb_t force_pte_cb)
{
	size_t pgd_sz;
	u64 vtcr = mmu->vtcr;
	u32 ia_bits = VTCR_EL2_IPA(vtcr);
	u32 sl0 = FIELD_GET(VTCR_EL2_SL0_MASK, vtcr);
	s8 start_level = VTCR_EL2_TGRAN_SL0_BASE - sl0;

	pgd_sz = kvm_pgd_pages(ia_bits, start_level) * PAGE_SIZE;
	pgt->pgd = (kvm_pteref_t)mm_ops->zalloc_pages_exact(pgd_sz);
	if (!pgt->pgd)
		return -ENOMEM;

	pgt->ia_bits		= ia_bits;
	pgt->start_level	= start_level;
	pgt->mm_ops		= mm_ops;
	pgt->mmu		= mmu;
	pgt->flags		= flags;
	pgt->force_pte_cb	= force_pte_cb;

	/* Ensure zeroed PGD pages are visible to the hardware walker */
	dsb(ishst);
	return 0;
}
size_t kvm_pgtable_stage2_pgd_size(u64 vtcr)
{
	u32 ia_bits = VTCR_EL2_IPA(vtcr);
	u32 sl0 = FIELD_GET(VTCR_EL2_SL0_MASK, vtcr);
	s8 start_level = VTCR_EL2_TGRAN_SL0_BASE - sl0;

	return kvm_pgd_pages(ia_bits, start_level) * PAGE_SIZE;
}
static int stage2_free_walker(const struct kvm_pgtable_visit_ctx *ctx,
			      enum kvm_pgtable_walk_flags visit)
{
	struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;

	if (!stage2_pte_is_counted(ctx->old))
		return 0;

	mm_ops->put_page(ctx->ptep);

	if (kvm_pte_table(ctx->old, ctx->level))
		mm_ops->put_page(kvm_pte_follow(ctx->old, mm_ops));

	return 0;
}

void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt)
{
	size_t pgd_sz;
	struct kvm_pgtable_walker walker = {
		.cb	= stage2_free_walker,
		.flags	= KVM_PGTABLE_WALK_LEAF |
			  KVM_PGTABLE_WALK_TABLE_POST,
	};

	WARN_ON(kvm_pgtable_walk(pgt, 0, BIT(pgt->ia_bits), &walker));
	pgd_sz = kvm_pgd_pages(pgt->ia_bits, pgt->start_level) * PAGE_SIZE;
	pgt->mm_ops->free_pages_exact(kvm_dereference_pteref(&walker, pgt->pgd), pgd_sz);
	pgt->pgd = NULL;
}
void kvm_pgtable_stage2_free_unlinked(struct kvm_pgtable_mm_ops *mm_ops, void *pgtable, s8 level)
{
	kvm_pteref_t ptep = (kvm_pteref_t)pgtable;
	struct kvm_pgtable_walker walker = {
		.cb	= stage2_free_walker,
		.flags	= KVM_PGTABLE_WALK_LEAF |
			  KVM_PGTABLE_WALK_TABLE_POST,
	};
	struct kvm_pgtable_walk_data data = {
		.walker	= &walker,

		/*
		 * At this point the IPA really doesn't matter, as the page
		 * table being traversed has already been removed from the stage
		 * 2. Set an appropriate range to cover the entire page table.
		 */
		.addr	= 0,
		.end	= kvm_granule_size(level),
	};

	WARN_ON(__kvm_pgtable_walk(&data, mm_ops, ptep, level + 1));

	WARN_ON(mm_ops->page_count(pgtable) != 1);
	mm_ops->put_page(pgtable);
}