linux/arch/mips/include/asm/barrier.h

186 lines
5.2 KiB
C
Raw Normal View History

/*
* This file is subject to the terms and conditions of the GNU General Public
* License. See the file "COPYING" in the main directory of this archive
* for more details.
*
* Copyright (C) 2006 by Ralf Baechle (ralf@linux-mips.org)
*/
#ifndef __ASM_BARRIER_H
#define __ASM_BARRIER_H
#include <asm/addrspace.h>
MIPS: barrier: Add __SYNC() infrastructure Introduce an asm/sync.h header which provides infrastructure that can be used to generate sync instructions of various types, and for various reasons. For example if we need a sync instruction that provides a full completion barrier but only on systems which have weak memory ordering, we can generate the appropriate assembly code using: __SYNC(full, weak_ordering) When the kernel is configured to run on systems with weak memory ordering (ie. CONFIG_WEAK_ORDERING is selected) we'll emit a sync instruction. When the kernel is configured to run on systems with strong memory ordering (ie. CONFIG_WEAK_ORDERING is not selected) we'll emit nothing. The caller doesn't need to know which happened - it simply says what it needs & when, with no concern for checking the kernel configuration. There are some scenarios in which we may want to emit code only when we *didn't* emit a sync instruction. For example, some Loongson3 CPUs suffer from a bug that requires us to emit a sync instruction prior to each ll instruction (enabled by CONFIG_CPU_LOONGSON3_WORKAROUNDS). In cases where this bug workaround is enabled, it's wasteful to then have more generic code emit another sync instruction to provide barriers we need in general. A __SYNC_ELSE() macro allows for this, providing an extra argument that contains code to be assembled only in cases where the sync instruction was not emitted. For example if we have a scenario in which we generally want to emit a release barrier but for affected Loongson3 configurations upgrade that to a full completion barrier, we can do that like so: __SYNC_ELSE(full, loongson3_war, __SYNC(rl, always)) The assembly generated by these macros can be used either as inline assembly or in assembly source files. Differing types of sync as provided by MIPSr6 are defined, but currently they all generate a full completion barrier except in kernels configured for Cavium Octeon systems. There the wmb sync-type is used, and rmb syncs are omitted, as has been the case since commit 6b07d38aaa52 ("MIPS: Octeon: Use optimized memory barrier primitives."). Using __SYNC() with the wmb or rmb types will abstract away the Octeon specific behavior and allow us to later clean up asm/barrier.h code that currently includes a plethora of #ifdef's. Signed-off-by: Paul Burton <paul.burton@mips.com> Cc: linux-mips@vger.kernel.org Cc: Huacai Chen <chenhc@lemote.com> Cc: Jiaxun Yang <jiaxun.yang@flygoat.com> Cc: linux-kernel@vger.kernel.org
2019-10-01 21:53:07 +00:00
#include <asm/sync.h>
#ifdef CONFIG_CPU_HAS_SYNC
#define __sync() \
__asm__ __volatile__( \
".set push\n\t" \
".set noreorder\n\t" \
".set mips2\n\t" \
"sync\n\t" \
".set pop" \
: /* no output */ \
: /* no input */ \
: "memory")
#else
#define __sync() do { } while(0)
#endif
static inline void rmb(void)
{
asm volatile(__SYNC(rmb, always) ::: "memory");
}
#define rmb rmb
static inline void wmb(void)
{
asm volatile(__SYNC(wmb, always) ::: "memory");
}
#define wmb wmb
#define __fast_iob() \
__asm__ __volatile__( \
".set push\n\t" \
".set noreorder\n\t" \
"lw $0,%0\n\t" \
"nop\n\t" \
".set pop" \
: /* no output */ \
: "m" (*(int *)CKSEG1) \
: "memory")
#ifdef CONFIG_CPU_CAVIUM_OCTEON
# define fast_mb() __sync()
# define fast_iob() do { } while (0)
#else /* ! CONFIG_CPU_CAVIUM_OCTEON */
# define fast_mb() __sync()
# ifdef CONFIG_SGI_IP28
# define fast_iob() \
__asm__ __volatile__( \
".set push\n\t" \
".set noreorder\n\t" \
"lw $0,%0\n\t" \
"sync\n\t" \
"lw $0,%0\n\t" \
".set pop" \
: /* no output */ \
: "m" (*(int *)CKSEG1ADDR(0x1fa00004)) \
: "memory")
# else
# define fast_iob() \
do { \
__sync(); \
__fast_iob(); \
} while (0)
# endif
#endif /* CONFIG_CPU_CAVIUM_OCTEON */
#ifdef CONFIG_CPU_HAS_WB
#include <asm/wbflush.h>
#define mb() wbflush()
#define iob() wbflush()
#else /* !CONFIG_CPU_HAS_WB */
#define mb() fast_mb()
#define iob() fast_iob()
#endif /* !CONFIG_CPU_HAS_WB */
#if defined(CONFIG_WEAK_ORDERING)
# ifdef CONFIG_CPU_CAVIUM_OCTEON
# define __smp_mb() __sync()
# else
# define __smp_mb() __asm__ __volatile__("sync" : : :"memory")
# endif
# define __smp_rmb() rmb()
# define __smp_wmb() wmb()
#else
#define __smp_mb() barrier()
#define __smp_rmb() barrier()
#define __smp_wmb() barrier()
#endif
/*
* When LL/SC does imply order, it must also be a compiler barrier to avoid the
* compiler from reordering where the CPU will not. When it does not imply
* order, the compiler is also free to reorder across the LL/SC loop and
* ordering will be done by smp_llsc_mb() and friends.
*/
#if defined(CONFIG_WEAK_REORDERING_BEYOND_LLSC) && defined(CONFIG_SMP)
#define __WEAK_LLSC_MB " sync \n"
#define smp_llsc_mb() __asm__ __volatile__(__WEAK_LLSC_MB : : :"memory")
#define __LLSC_CLOBBER
#else
#define __WEAK_LLSC_MB " \n"
#define smp_llsc_mb() do { } while (0)
#define __LLSC_CLOBBER "memory"
#endif
#ifdef CONFIG_CPU_CAVIUM_OCTEON
#define smp_mb__before_llsc() smp_wmb()
#define __smp_mb__before_llsc() __smp_wmb()
/* Cause previous writes to become visible on all CPUs as soon as possible */
#define nudge_writes() __asm__ __volatile__(".set push\n\t" \
".set arch=octeon\n\t" \
"syncw\n\t" \
".set pop" : : : "memory")
#else
#define smp_mb__before_llsc() smp_llsc_mb()
#define __smp_mb__before_llsc() smp_llsc_mb()
#define nudge_writes() mb()
#endif
#define __smp_mb__before_atomic() __smp_mb__before_llsc()
#define __smp_mb__after_atomic() smp_llsc_mb()
MIPS: Loongson: Introduce and use loongson_llsc_mb() On the Loongson-2G/2H/3A/3B there is a hardware flaw that ll/sc and lld/scd is very weak ordering. We should add sync instructions "before each ll/lld" and "at the branch-target between ll/sc" to workaround. Otherwise, this flaw will cause deadlock occasionally (e.g. when doing heavy load test with LTP). Below is the explaination of CPU designer: "For Loongson 3 family, when a memory access instruction (load, store, or prefetch)'s executing occurs between the execution of LL and SC, the success or failure of SC is not predictable. Although programmer would not insert memory access instructions between LL and SC, the memory instructions before LL in program-order, may dynamically executed between the execution of LL/SC, so a memory fence (SYNC) is needed before LL/LLD to avoid this situation. Since Loongson-3A R2 (3A2000), we have improved our hardware design to handle this case. But we later deduce a rarely circumstance that some speculatively executed memory instructions due to branch misprediction between LL/SC still fall into the above case, so a memory fence (SYNC) at branch-target (if its target is not between LL/SC) is needed for Loongson 3A1000, 3B1500, 3A2000 and 3A3000. Our processor is continually evolving and we aim to to remove all these workaround-SYNCs around LL/SC for new-come processor." Here is an example: Both cpu1 and cpu2 simutaneously run atomic_add by 1 on same atomic var, this bug cause both 'sc' run by two cpus (in atomic_add) succeed at same time('sc' return 1), and the variable is only *added by 1*, sometimes, which is wrong and unacceptable(it should be added by 2). Why disable fix-loongson3-llsc in compiler? Because compiler fix will cause problems in kernel's __ex_table section. This patch fix all the cases in kernel, but: +. the fix at the end of futex_atomic_cmpxchg_inatomic is for branch-target of 'bne', there other cases which smp_mb__before_llsc() and smp_llsc_mb() fix the ll and branch-target coincidently such as atomic_sub_if_positive/ cmpxchg/xchg, just like this one. +. Loongson 3 does support CONFIG_EDAC_ATOMIC_SCRUB, so no need to touch edac.h +. local_ops and cmpxchg_local should not be affected by this bug since only the owner can write. +. mips_atomic_set for syscall.c is deprecated and rarely used, just let it go Signed-off-by: Huacai Chen <chenhc@lemote.com> Signed-off-by: Huang Pei <huangpei@loongson.cn> [paul.burton@mips.com: - Simplify the addition of -mno-fix-loongson3-llsc to cflags, and add a comment describing why it's there. - Make loongson_llsc_mb() a no-op when CONFIG_CPU_LOONGSON3_WORKAROUNDS=n, rather than a compiler memory barrier. - Add a comment describing the bug & how loongson_llsc_mb() helps in asm/barrier.h.] Signed-off-by: Paul Burton <paul.burton@mips.com> Cc: Ralf Baechle <ralf@linux-mips.org> Cc: ambrosehua@gmail.com Cc: Steven J . Hill <Steven.Hill@cavium.com> Cc: linux-mips@linux-mips.org Cc: Fuxin Zhang <zhangfx@lemote.com> Cc: Zhangjin Wu <wuzhangjin@gmail.com> Cc: Li Xuefeng <lixuefeng@loongson.cn> Cc: Xu Chenghua <xuchenghua@loongson.cn>
2019-01-15 16:04:54 +08:00
/*
* Some Loongson 3 CPUs have a bug wherein execution of a memory access (load,
* store or prefetch) in between an LL & SC can cause the SC instruction to
MIPS: Loongson: Introduce and use loongson_llsc_mb() On the Loongson-2G/2H/3A/3B there is a hardware flaw that ll/sc and lld/scd is very weak ordering. We should add sync instructions "before each ll/lld" and "at the branch-target between ll/sc" to workaround. Otherwise, this flaw will cause deadlock occasionally (e.g. when doing heavy load test with LTP). Below is the explaination of CPU designer: "For Loongson 3 family, when a memory access instruction (load, store, or prefetch)'s executing occurs between the execution of LL and SC, the success or failure of SC is not predictable. Although programmer would not insert memory access instructions between LL and SC, the memory instructions before LL in program-order, may dynamically executed between the execution of LL/SC, so a memory fence (SYNC) is needed before LL/LLD to avoid this situation. Since Loongson-3A R2 (3A2000), we have improved our hardware design to handle this case. But we later deduce a rarely circumstance that some speculatively executed memory instructions due to branch misprediction between LL/SC still fall into the above case, so a memory fence (SYNC) at branch-target (if its target is not between LL/SC) is needed for Loongson 3A1000, 3B1500, 3A2000 and 3A3000. Our processor is continually evolving and we aim to to remove all these workaround-SYNCs around LL/SC for new-come processor." Here is an example: Both cpu1 and cpu2 simutaneously run atomic_add by 1 on same atomic var, this bug cause both 'sc' run by two cpus (in atomic_add) succeed at same time('sc' return 1), and the variable is only *added by 1*, sometimes, which is wrong and unacceptable(it should be added by 2). Why disable fix-loongson3-llsc in compiler? Because compiler fix will cause problems in kernel's __ex_table section. This patch fix all the cases in kernel, but: +. the fix at the end of futex_atomic_cmpxchg_inatomic is for branch-target of 'bne', there other cases which smp_mb__before_llsc() and smp_llsc_mb() fix the ll and branch-target coincidently such as atomic_sub_if_positive/ cmpxchg/xchg, just like this one. +. Loongson 3 does support CONFIG_EDAC_ATOMIC_SCRUB, so no need to touch edac.h +. local_ops and cmpxchg_local should not be affected by this bug since only the owner can write. +. mips_atomic_set for syscall.c is deprecated and rarely used, just let it go Signed-off-by: Huacai Chen <chenhc@lemote.com> Signed-off-by: Huang Pei <huangpei@loongson.cn> [paul.burton@mips.com: - Simplify the addition of -mno-fix-loongson3-llsc to cflags, and add a comment describing why it's there. - Make loongson_llsc_mb() a no-op when CONFIG_CPU_LOONGSON3_WORKAROUNDS=n, rather than a compiler memory barrier. - Add a comment describing the bug & how loongson_llsc_mb() helps in asm/barrier.h.] Signed-off-by: Paul Burton <paul.burton@mips.com> Cc: Ralf Baechle <ralf@linux-mips.org> Cc: ambrosehua@gmail.com Cc: Steven J . Hill <Steven.Hill@cavium.com> Cc: linux-mips@linux-mips.org Cc: Fuxin Zhang <zhangfx@lemote.com> Cc: Zhangjin Wu <wuzhangjin@gmail.com> Cc: Li Xuefeng <lixuefeng@loongson.cn> Cc: Xu Chenghua <xuchenghua@loongson.cn>
2019-01-15 16:04:54 +08:00
* erroneously succeed, breaking atomicity. Whilst it's unusual to write code
* containing such sequences, this bug bites harder than we might otherwise
* expect due to reordering & speculation:
*
* 1) A memory access appearing prior to the LL in program order may actually
* be executed after the LL - this is the reordering case.
MIPS: Loongson: Introduce and use loongson_llsc_mb() On the Loongson-2G/2H/3A/3B there is a hardware flaw that ll/sc and lld/scd is very weak ordering. We should add sync instructions "before each ll/lld" and "at the branch-target between ll/sc" to workaround. Otherwise, this flaw will cause deadlock occasionally (e.g. when doing heavy load test with LTP). Below is the explaination of CPU designer: "For Loongson 3 family, when a memory access instruction (load, store, or prefetch)'s executing occurs between the execution of LL and SC, the success or failure of SC is not predictable. Although programmer would not insert memory access instructions between LL and SC, the memory instructions before LL in program-order, may dynamically executed between the execution of LL/SC, so a memory fence (SYNC) is needed before LL/LLD to avoid this situation. Since Loongson-3A R2 (3A2000), we have improved our hardware design to handle this case. But we later deduce a rarely circumstance that some speculatively executed memory instructions due to branch misprediction between LL/SC still fall into the above case, so a memory fence (SYNC) at branch-target (if its target is not between LL/SC) is needed for Loongson 3A1000, 3B1500, 3A2000 and 3A3000. Our processor is continually evolving and we aim to to remove all these workaround-SYNCs around LL/SC for new-come processor." Here is an example: Both cpu1 and cpu2 simutaneously run atomic_add by 1 on same atomic var, this bug cause both 'sc' run by two cpus (in atomic_add) succeed at same time('sc' return 1), and the variable is only *added by 1*, sometimes, which is wrong and unacceptable(it should be added by 2). Why disable fix-loongson3-llsc in compiler? Because compiler fix will cause problems in kernel's __ex_table section. This patch fix all the cases in kernel, but: +. the fix at the end of futex_atomic_cmpxchg_inatomic is for branch-target of 'bne', there other cases which smp_mb__before_llsc() and smp_llsc_mb() fix the ll and branch-target coincidently such as atomic_sub_if_positive/ cmpxchg/xchg, just like this one. +. Loongson 3 does support CONFIG_EDAC_ATOMIC_SCRUB, so no need to touch edac.h +. local_ops and cmpxchg_local should not be affected by this bug since only the owner can write. +. mips_atomic_set for syscall.c is deprecated and rarely used, just let it go Signed-off-by: Huacai Chen <chenhc@lemote.com> Signed-off-by: Huang Pei <huangpei@loongson.cn> [paul.burton@mips.com: - Simplify the addition of -mno-fix-loongson3-llsc to cflags, and add a comment describing why it's there. - Make loongson_llsc_mb() a no-op when CONFIG_CPU_LOONGSON3_WORKAROUNDS=n, rather than a compiler memory barrier. - Add a comment describing the bug & how loongson_llsc_mb() helps in asm/barrier.h.] Signed-off-by: Paul Burton <paul.burton@mips.com> Cc: Ralf Baechle <ralf@linux-mips.org> Cc: ambrosehua@gmail.com Cc: Steven J . Hill <Steven.Hill@cavium.com> Cc: linux-mips@linux-mips.org Cc: Fuxin Zhang <zhangfx@lemote.com> Cc: Zhangjin Wu <wuzhangjin@gmail.com> Cc: Li Xuefeng <lixuefeng@loongson.cn> Cc: Xu Chenghua <xuchenghua@loongson.cn>
2019-01-15 16:04:54 +08:00
*
* In order to avoid this we need to place a memory barrier (ie. a SYNC
* instruction) prior to every LL instruction, in between it and any earlier
* memory access instructions.
MIPS: Loongson: Introduce and use loongson_llsc_mb() On the Loongson-2G/2H/3A/3B there is a hardware flaw that ll/sc and lld/scd is very weak ordering. We should add sync instructions "before each ll/lld" and "at the branch-target between ll/sc" to workaround. Otherwise, this flaw will cause deadlock occasionally (e.g. when doing heavy load test with LTP). Below is the explaination of CPU designer: "For Loongson 3 family, when a memory access instruction (load, store, or prefetch)'s executing occurs between the execution of LL and SC, the success or failure of SC is not predictable. Although programmer would not insert memory access instructions between LL and SC, the memory instructions before LL in program-order, may dynamically executed between the execution of LL/SC, so a memory fence (SYNC) is needed before LL/LLD to avoid this situation. Since Loongson-3A R2 (3A2000), we have improved our hardware design to handle this case. But we later deduce a rarely circumstance that some speculatively executed memory instructions due to branch misprediction between LL/SC still fall into the above case, so a memory fence (SYNC) at branch-target (if its target is not between LL/SC) is needed for Loongson 3A1000, 3B1500, 3A2000 and 3A3000. Our processor is continually evolving and we aim to to remove all these workaround-SYNCs around LL/SC for new-come processor." Here is an example: Both cpu1 and cpu2 simutaneously run atomic_add by 1 on same atomic var, this bug cause both 'sc' run by two cpus (in atomic_add) succeed at same time('sc' return 1), and the variable is only *added by 1*, sometimes, which is wrong and unacceptable(it should be added by 2). Why disable fix-loongson3-llsc in compiler? Because compiler fix will cause problems in kernel's __ex_table section. This patch fix all the cases in kernel, but: +. the fix at the end of futex_atomic_cmpxchg_inatomic is for branch-target of 'bne', there other cases which smp_mb__before_llsc() and smp_llsc_mb() fix the ll and branch-target coincidently such as atomic_sub_if_positive/ cmpxchg/xchg, just like this one. +. Loongson 3 does support CONFIG_EDAC_ATOMIC_SCRUB, so no need to touch edac.h +. local_ops and cmpxchg_local should not be affected by this bug since only the owner can write. +. mips_atomic_set for syscall.c is deprecated and rarely used, just let it go Signed-off-by: Huacai Chen <chenhc@lemote.com> Signed-off-by: Huang Pei <huangpei@loongson.cn> [paul.burton@mips.com: - Simplify the addition of -mno-fix-loongson3-llsc to cflags, and add a comment describing why it's there. - Make loongson_llsc_mb() a no-op when CONFIG_CPU_LOONGSON3_WORKAROUNDS=n, rather than a compiler memory barrier. - Add a comment describing the bug & how loongson_llsc_mb() helps in asm/barrier.h.] Signed-off-by: Paul Burton <paul.burton@mips.com> Cc: Ralf Baechle <ralf@linux-mips.org> Cc: ambrosehua@gmail.com Cc: Steven J . Hill <Steven.Hill@cavium.com> Cc: linux-mips@linux-mips.org Cc: Fuxin Zhang <zhangfx@lemote.com> Cc: Zhangjin Wu <wuzhangjin@gmail.com> Cc: Li Xuefeng <lixuefeng@loongson.cn> Cc: Xu Chenghua <xuchenghua@loongson.cn>
2019-01-15 16:04:54 +08:00
*
* This reordering case is fixed by 3A R2 CPUs, ie. 3A2000 models and later.
*
* 2) If a conditional branch exists between an LL & SC with a target outside
* of the LL-SC loop, for example an exit upon value mismatch in cmpxchg()
MIPS: Loongson: Introduce and use loongson_llsc_mb() On the Loongson-2G/2H/3A/3B there is a hardware flaw that ll/sc and lld/scd is very weak ordering. We should add sync instructions "before each ll/lld" and "at the branch-target between ll/sc" to workaround. Otherwise, this flaw will cause deadlock occasionally (e.g. when doing heavy load test with LTP). Below is the explaination of CPU designer: "For Loongson 3 family, when a memory access instruction (load, store, or prefetch)'s executing occurs between the execution of LL and SC, the success or failure of SC is not predictable. Although programmer would not insert memory access instructions between LL and SC, the memory instructions before LL in program-order, may dynamically executed between the execution of LL/SC, so a memory fence (SYNC) is needed before LL/LLD to avoid this situation. Since Loongson-3A R2 (3A2000), we have improved our hardware design to handle this case. But we later deduce a rarely circumstance that some speculatively executed memory instructions due to branch misprediction between LL/SC still fall into the above case, so a memory fence (SYNC) at branch-target (if its target is not between LL/SC) is needed for Loongson 3A1000, 3B1500, 3A2000 and 3A3000. Our processor is continually evolving and we aim to to remove all these workaround-SYNCs around LL/SC for new-come processor." Here is an example: Both cpu1 and cpu2 simutaneously run atomic_add by 1 on same atomic var, this bug cause both 'sc' run by two cpus (in atomic_add) succeed at same time('sc' return 1), and the variable is only *added by 1*, sometimes, which is wrong and unacceptable(it should be added by 2). Why disable fix-loongson3-llsc in compiler? Because compiler fix will cause problems in kernel's __ex_table section. This patch fix all the cases in kernel, but: +. the fix at the end of futex_atomic_cmpxchg_inatomic is for branch-target of 'bne', there other cases which smp_mb__before_llsc() and smp_llsc_mb() fix the ll and branch-target coincidently such as atomic_sub_if_positive/ cmpxchg/xchg, just like this one. +. Loongson 3 does support CONFIG_EDAC_ATOMIC_SCRUB, so no need to touch edac.h +. local_ops and cmpxchg_local should not be affected by this bug since only the owner can write. +. mips_atomic_set for syscall.c is deprecated and rarely used, just let it go Signed-off-by: Huacai Chen <chenhc@lemote.com> Signed-off-by: Huang Pei <huangpei@loongson.cn> [paul.burton@mips.com: - Simplify the addition of -mno-fix-loongson3-llsc to cflags, and add a comment describing why it's there. - Make loongson_llsc_mb() a no-op when CONFIG_CPU_LOONGSON3_WORKAROUNDS=n, rather than a compiler memory barrier. - Add a comment describing the bug & how loongson_llsc_mb() helps in asm/barrier.h.] Signed-off-by: Paul Burton <paul.burton@mips.com> Cc: Ralf Baechle <ralf@linux-mips.org> Cc: ambrosehua@gmail.com Cc: Steven J . Hill <Steven.Hill@cavium.com> Cc: linux-mips@linux-mips.org Cc: Fuxin Zhang <zhangfx@lemote.com> Cc: Zhangjin Wu <wuzhangjin@gmail.com> Cc: Li Xuefeng <lixuefeng@loongson.cn> Cc: Xu Chenghua <xuchenghua@loongson.cn>
2019-01-15 16:04:54 +08:00
* or similar, then misprediction of the branch may allow speculative
* execution of memory accesses from outside of the LL-SC loop.
MIPS: Loongson: Introduce and use loongson_llsc_mb() On the Loongson-2G/2H/3A/3B there is a hardware flaw that ll/sc and lld/scd is very weak ordering. We should add sync instructions "before each ll/lld" and "at the branch-target between ll/sc" to workaround. Otherwise, this flaw will cause deadlock occasionally (e.g. when doing heavy load test with LTP). Below is the explaination of CPU designer: "For Loongson 3 family, when a memory access instruction (load, store, or prefetch)'s executing occurs between the execution of LL and SC, the success or failure of SC is not predictable. Although programmer would not insert memory access instructions between LL and SC, the memory instructions before LL in program-order, may dynamically executed between the execution of LL/SC, so a memory fence (SYNC) is needed before LL/LLD to avoid this situation. Since Loongson-3A R2 (3A2000), we have improved our hardware design to handle this case. But we later deduce a rarely circumstance that some speculatively executed memory instructions due to branch misprediction between LL/SC still fall into the above case, so a memory fence (SYNC) at branch-target (if its target is not between LL/SC) is needed for Loongson 3A1000, 3B1500, 3A2000 and 3A3000. Our processor is continually evolving and we aim to to remove all these workaround-SYNCs around LL/SC for new-come processor." Here is an example: Both cpu1 and cpu2 simutaneously run atomic_add by 1 on same atomic var, this bug cause both 'sc' run by two cpus (in atomic_add) succeed at same time('sc' return 1), and the variable is only *added by 1*, sometimes, which is wrong and unacceptable(it should be added by 2). Why disable fix-loongson3-llsc in compiler? Because compiler fix will cause problems in kernel's __ex_table section. This patch fix all the cases in kernel, but: +. the fix at the end of futex_atomic_cmpxchg_inatomic is for branch-target of 'bne', there other cases which smp_mb__before_llsc() and smp_llsc_mb() fix the ll and branch-target coincidently such as atomic_sub_if_positive/ cmpxchg/xchg, just like this one. +. Loongson 3 does support CONFIG_EDAC_ATOMIC_SCRUB, so no need to touch edac.h +. local_ops and cmpxchg_local should not be affected by this bug since only the owner can write. +. mips_atomic_set for syscall.c is deprecated and rarely used, just let it go Signed-off-by: Huacai Chen <chenhc@lemote.com> Signed-off-by: Huang Pei <huangpei@loongson.cn> [paul.burton@mips.com: - Simplify the addition of -mno-fix-loongson3-llsc to cflags, and add a comment describing why it's there. - Make loongson_llsc_mb() a no-op when CONFIG_CPU_LOONGSON3_WORKAROUNDS=n, rather than a compiler memory barrier. - Add a comment describing the bug & how loongson_llsc_mb() helps in asm/barrier.h.] Signed-off-by: Paul Burton <paul.burton@mips.com> Cc: Ralf Baechle <ralf@linux-mips.org> Cc: ambrosehua@gmail.com Cc: Steven J . Hill <Steven.Hill@cavium.com> Cc: linux-mips@linux-mips.org Cc: Fuxin Zhang <zhangfx@lemote.com> Cc: Zhangjin Wu <wuzhangjin@gmail.com> Cc: Li Xuefeng <lixuefeng@loongson.cn> Cc: Xu Chenghua <xuchenghua@loongson.cn>
2019-01-15 16:04:54 +08:00
*
* In order to avoid this we need a memory barrier (ie. a SYNC instruction)
MIPS: Loongson: Introduce and use loongson_llsc_mb() On the Loongson-2G/2H/3A/3B there is a hardware flaw that ll/sc and lld/scd is very weak ordering. We should add sync instructions "before each ll/lld" and "at the branch-target between ll/sc" to workaround. Otherwise, this flaw will cause deadlock occasionally (e.g. when doing heavy load test with LTP). Below is the explaination of CPU designer: "For Loongson 3 family, when a memory access instruction (load, store, or prefetch)'s executing occurs between the execution of LL and SC, the success or failure of SC is not predictable. Although programmer would not insert memory access instructions between LL and SC, the memory instructions before LL in program-order, may dynamically executed between the execution of LL/SC, so a memory fence (SYNC) is needed before LL/LLD to avoid this situation. Since Loongson-3A R2 (3A2000), we have improved our hardware design to handle this case. But we later deduce a rarely circumstance that some speculatively executed memory instructions due to branch misprediction between LL/SC still fall into the above case, so a memory fence (SYNC) at branch-target (if its target is not between LL/SC) is needed for Loongson 3A1000, 3B1500, 3A2000 and 3A3000. Our processor is continually evolving and we aim to to remove all these workaround-SYNCs around LL/SC for new-come processor." Here is an example: Both cpu1 and cpu2 simutaneously run atomic_add by 1 on same atomic var, this bug cause both 'sc' run by two cpus (in atomic_add) succeed at same time('sc' return 1), and the variable is only *added by 1*, sometimes, which is wrong and unacceptable(it should be added by 2). Why disable fix-loongson3-llsc in compiler? Because compiler fix will cause problems in kernel's __ex_table section. This patch fix all the cases in kernel, but: +. the fix at the end of futex_atomic_cmpxchg_inatomic is for branch-target of 'bne', there other cases which smp_mb__before_llsc() and smp_llsc_mb() fix the ll and branch-target coincidently such as atomic_sub_if_positive/ cmpxchg/xchg, just like this one. +. Loongson 3 does support CONFIG_EDAC_ATOMIC_SCRUB, so no need to touch edac.h +. local_ops and cmpxchg_local should not be affected by this bug since only the owner can write. +. mips_atomic_set for syscall.c is deprecated and rarely used, just let it go Signed-off-by: Huacai Chen <chenhc@lemote.com> Signed-off-by: Huang Pei <huangpei@loongson.cn> [paul.burton@mips.com: - Simplify the addition of -mno-fix-loongson3-llsc to cflags, and add a comment describing why it's there. - Make loongson_llsc_mb() a no-op when CONFIG_CPU_LOONGSON3_WORKAROUNDS=n, rather than a compiler memory barrier. - Add a comment describing the bug & how loongson_llsc_mb() helps in asm/barrier.h.] Signed-off-by: Paul Burton <paul.burton@mips.com> Cc: Ralf Baechle <ralf@linux-mips.org> Cc: ambrosehua@gmail.com Cc: Steven J . Hill <Steven.Hill@cavium.com> Cc: linux-mips@linux-mips.org Cc: Fuxin Zhang <zhangfx@lemote.com> Cc: Zhangjin Wu <wuzhangjin@gmail.com> Cc: Li Xuefeng <lixuefeng@loongson.cn> Cc: Xu Chenghua <xuchenghua@loongson.cn>
2019-01-15 16:04:54 +08:00
* at each affected branch target, for which we also use loongson_llsc_mb()
* defined below.
*
* This case affects all current Loongson 3 CPUs.
*
* The above described cases cause an error in the cache coherence protocol;
* such that the Invalidate of a competing LL-SC goes 'missing' and SC
* erroneously observes its core still has Exclusive state and lets the SC
* proceed.
*
* Therefore the error only occurs on SMP systems.
MIPS: Loongson: Introduce and use loongson_llsc_mb() On the Loongson-2G/2H/3A/3B there is a hardware flaw that ll/sc and lld/scd is very weak ordering. We should add sync instructions "before each ll/lld" and "at the branch-target between ll/sc" to workaround. Otherwise, this flaw will cause deadlock occasionally (e.g. when doing heavy load test with LTP). Below is the explaination of CPU designer: "For Loongson 3 family, when a memory access instruction (load, store, or prefetch)'s executing occurs between the execution of LL and SC, the success or failure of SC is not predictable. Although programmer would not insert memory access instructions between LL and SC, the memory instructions before LL in program-order, may dynamically executed between the execution of LL/SC, so a memory fence (SYNC) is needed before LL/LLD to avoid this situation. Since Loongson-3A R2 (3A2000), we have improved our hardware design to handle this case. But we later deduce a rarely circumstance that some speculatively executed memory instructions due to branch misprediction between LL/SC still fall into the above case, so a memory fence (SYNC) at branch-target (if its target is not between LL/SC) is needed for Loongson 3A1000, 3B1500, 3A2000 and 3A3000. Our processor is continually evolving and we aim to to remove all these workaround-SYNCs around LL/SC for new-come processor." Here is an example: Both cpu1 and cpu2 simutaneously run atomic_add by 1 on same atomic var, this bug cause both 'sc' run by two cpus (in atomic_add) succeed at same time('sc' return 1), and the variable is only *added by 1*, sometimes, which is wrong and unacceptable(it should be added by 2). Why disable fix-loongson3-llsc in compiler? Because compiler fix will cause problems in kernel's __ex_table section. This patch fix all the cases in kernel, but: +. the fix at the end of futex_atomic_cmpxchg_inatomic is for branch-target of 'bne', there other cases which smp_mb__before_llsc() and smp_llsc_mb() fix the ll and branch-target coincidently such as atomic_sub_if_positive/ cmpxchg/xchg, just like this one. +. Loongson 3 does support CONFIG_EDAC_ATOMIC_SCRUB, so no need to touch edac.h +. local_ops and cmpxchg_local should not be affected by this bug since only the owner can write. +. mips_atomic_set for syscall.c is deprecated and rarely used, just let it go Signed-off-by: Huacai Chen <chenhc@lemote.com> Signed-off-by: Huang Pei <huangpei@loongson.cn> [paul.burton@mips.com: - Simplify the addition of -mno-fix-loongson3-llsc to cflags, and add a comment describing why it's there. - Make loongson_llsc_mb() a no-op when CONFIG_CPU_LOONGSON3_WORKAROUNDS=n, rather than a compiler memory barrier. - Add a comment describing the bug & how loongson_llsc_mb() helps in asm/barrier.h.] Signed-off-by: Paul Burton <paul.burton@mips.com> Cc: Ralf Baechle <ralf@linux-mips.org> Cc: ambrosehua@gmail.com Cc: Steven J . Hill <Steven.Hill@cavium.com> Cc: linux-mips@linux-mips.org Cc: Fuxin Zhang <zhangfx@lemote.com> Cc: Zhangjin Wu <wuzhangjin@gmail.com> Cc: Li Xuefeng <lixuefeng@loongson.cn> Cc: Xu Chenghua <xuchenghua@loongson.cn>
2019-01-15 16:04:54 +08:00
*/
#ifdef CONFIG_CPU_LOONGSON3_WORKAROUNDS /* Loongson-3's LLSC workaround */
#define loongson_llsc_mb() __asm__ __volatile__("sync" : : :"memory")
MIPS: Loongson: Introduce and use loongson_llsc_mb() On the Loongson-2G/2H/3A/3B there is a hardware flaw that ll/sc and lld/scd is very weak ordering. We should add sync instructions "before each ll/lld" and "at the branch-target between ll/sc" to workaround. Otherwise, this flaw will cause deadlock occasionally (e.g. when doing heavy load test with LTP). Below is the explaination of CPU designer: "For Loongson 3 family, when a memory access instruction (load, store, or prefetch)'s executing occurs between the execution of LL and SC, the success or failure of SC is not predictable. Although programmer would not insert memory access instructions between LL and SC, the memory instructions before LL in program-order, may dynamically executed between the execution of LL/SC, so a memory fence (SYNC) is needed before LL/LLD to avoid this situation. Since Loongson-3A R2 (3A2000), we have improved our hardware design to handle this case. But we later deduce a rarely circumstance that some speculatively executed memory instructions due to branch misprediction between LL/SC still fall into the above case, so a memory fence (SYNC) at branch-target (if its target is not between LL/SC) is needed for Loongson 3A1000, 3B1500, 3A2000 and 3A3000. Our processor is continually evolving and we aim to to remove all these workaround-SYNCs around LL/SC for new-come processor." Here is an example: Both cpu1 and cpu2 simutaneously run atomic_add by 1 on same atomic var, this bug cause both 'sc' run by two cpus (in atomic_add) succeed at same time('sc' return 1), and the variable is only *added by 1*, sometimes, which is wrong and unacceptable(it should be added by 2). Why disable fix-loongson3-llsc in compiler? Because compiler fix will cause problems in kernel's __ex_table section. This patch fix all the cases in kernel, but: +. the fix at the end of futex_atomic_cmpxchg_inatomic is for branch-target of 'bne', there other cases which smp_mb__before_llsc() and smp_llsc_mb() fix the ll and branch-target coincidently such as atomic_sub_if_positive/ cmpxchg/xchg, just like this one. +. Loongson 3 does support CONFIG_EDAC_ATOMIC_SCRUB, so no need to touch edac.h +. local_ops and cmpxchg_local should not be affected by this bug since only the owner can write. +. mips_atomic_set for syscall.c is deprecated and rarely used, just let it go Signed-off-by: Huacai Chen <chenhc@lemote.com> Signed-off-by: Huang Pei <huangpei@loongson.cn> [paul.burton@mips.com: - Simplify the addition of -mno-fix-loongson3-llsc to cflags, and add a comment describing why it's there. - Make loongson_llsc_mb() a no-op when CONFIG_CPU_LOONGSON3_WORKAROUNDS=n, rather than a compiler memory barrier. - Add a comment describing the bug & how loongson_llsc_mb() helps in asm/barrier.h.] Signed-off-by: Paul Burton <paul.burton@mips.com> Cc: Ralf Baechle <ralf@linux-mips.org> Cc: ambrosehua@gmail.com Cc: Steven J . Hill <Steven.Hill@cavium.com> Cc: linux-mips@linux-mips.org Cc: Fuxin Zhang <zhangfx@lemote.com> Cc: Zhangjin Wu <wuzhangjin@gmail.com> Cc: Li Xuefeng <lixuefeng@loongson.cn> Cc: Xu Chenghua <xuchenghua@loongson.cn>
2019-01-15 16:04:54 +08:00
#else
#define loongson_llsc_mb() do { } while (0)
#endif
static inline void sync_ginv(void)
{
MIPS: barrier: Add __SYNC() infrastructure Introduce an asm/sync.h header which provides infrastructure that can be used to generate sync instructions of various types, and for various reasons. For example if we need a sync instruction that provides a full completion barrier but only on systems which have weak memory ordering, we can generate the appropriate assembly code using: __SYNC(full, weak_ordering) When the kernel is configured to run on systems with weak memory ordering (ie. CONFIG_WEAK_ORDERING is selected) we'll emit a sync instruction. When the kernel is configured to run on systems with strong memory ordering (ie. CONFIG_WEAK_ORDERING is not selected) we'll emit nothing. The caller doesn't need to know which happened - it simply says what it needs & when, with no concern for checking the kernel configuration. There are some scenarios in which we may want to emit code only when we *didn't* emit a sync instruction. For example, some Loongson3 CPUs suffer from a bug that requires us to emit a sync instruction prior to each ll instruction (enabled by CONFIG_CPU_LOONGSON3_WORKAROUNDS). In cases where this bug workaround is enabled, it's wasteful to then have more generic code emit another sync instruction to provide barriers we need in general. A __SYNC_ELSE() macro allows for this, providing an extra argument that contains code to be assembled only in cases where the sync instruction was not emitted. For example if we have a scenario in which we generally want to emit a release barrier but for affected Loongson3 configurations upgrade that to a full completion barrier, we can do that like so: __SYNC_ELSE(full, loongson3_war, __SYNC(rl, always)) The assembly generated by these macros can be used either as inline assembly or in assembly source files. Differing types of sync as provided by MIPSr6 are defined, but currently they all generate a full completion barrier except in kernels configured for Cavium Octeon systems. There the wmb sync-type is used, and rmb syncs are omitted, as has been the case since commit 6b07d38aaa52 ("MIPS: Octeon: Use optimized memory barrier primitives."). Using __SYNC() with the wmb or rmb types will abstract away the Octeon specific behavior and allow us to later clean up asm/barrier.h code that currently includes a plethora of #ifdef's. Signed-off-by: Paul Burton <paul.burton@mips.com> Cc: linux-mips@vger.kernel.org Cc: Huacai Chen <chenhc@lemote.com> Cc: Jiaxun Yang <jiaxun.yang@flygoat.com> Cc: linux-kernel@vger.kernel.org
2019-10-01 21:53:07 +00:00
asm volatile("sync\t%0" :: "i"(__SYNC_ginv));
}
#include <asm-generic/barrier.h>
#endif /* __ASM_BARRIER_H */