
Commit cc4c07c

Alexandre Ghiti authored and palmer-dabbelt committed
drivers: perf: Implement perf event mmap support in the SBI backend
We used to unconditionally expose the cycle and instret CSRs to userspace, which gives rise to security concerns.

So now we only allow access to hw counters from userspace through the perf framework, which will handle context switches, per-task events, etc. A sysctl allows reverting the behaviour to the legacy mode so that userspace applications which are not ready for this change do not break.

But the default value is to allow userspace access only through perf: this will break userspace applications which rely on direct access to rdcycle. This choice was made for security reasons [1][2]: most of the applications which use rdcycle can instead use rdtime to count the elapsed time.

[1] https://groups.google.com/a/groups.riscv.org/g/sw-dev/c/REWcwYnzsKE?pli=1
[2] https://www.youtube.com/watch?v=3-c4C_L2PRQ&ab_channel=IEEESymposiumonSecurityandPrivacy

Signed-off-by: Alexandre Ghiti <alexghiti@rivosinc.com>
Reviewed-by: Andrew Jones <ajones@ventanamicro.com>
1 parent 50be342 commit cc4c07c

File tree: 2 files changed, +195 -7 lines


drivers/perf/riscv_pmu.c

+9 -1

@@ -38,7 +38,15 @@ void arch_perf_update_userpage(struct perf_event *event,
 	userpg->cap_user_time_short = 0;
 	userpg->cap_user_rdpmc = riscv_perf_user_access(event);
 
-	userpg->pmc_width = 64;
+#ifdef CONFIG_RISCV_PMU
+	/*
+	 * The counters are 64-bit but the priv spec doesn't mandate all the
+	 * bits to be implemented: that's why, counter width can vary based on
+	 * the cpu vendor.
+	 */
+	if (userpg->cap_user_rdpmc)
+		userpg->pmc_width = to_riscv_pmu(event->pmu)->ctr_get_width(event->hw.idx) + 1;
+#endif
 
 	do {
 		rd = sched_clock_read_begin(&seq);
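
Editor's note: a userspace reader of the mmap'd event page should honour the advertised pmc_width rather than assume 64 implemented bits. Below is a minimal sketch of that masking step, assuming only the perf_event_mmap_page layout from <linux/perf_event.h>; the helper name is hypothetical and not part of this patch.

#include <stdint.h>
#include <linux/perf_event.h>

/*
 * Hypothetical userspace helper: truncate a raw counter read to the width
 * advertised in the mmap'd perf user page, since a vendor may implement
 * fewer than 64 counter bits.
 */
static uint64_t mask_to_pmc_width(const struct perf_event_mmap_page *pc,
                                  uint64_t raw)
{
        if (pc->pmc_width < 64)
                raw &= (1ULL << pc->pmc_width) - 1;
        return raw;
}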

drivers/perf/riscv_pmu_sbi.c

+186 -6
@@ -24,6 +24,14 @@
 #include <asm/sbi.h>
 #include <asm/hwcap.h>
 
+#define SYSCTL_NO_USER_ACCESS	0
+#define SYSCTL_USER_ACCESS	1
+#define SYSCTL_LEGACY		2
+
+#define PERF_EVENT_FLAG_NO_USER_ACCESS	BIT(SYSCTL_NO_USER_ACCESS)
+#define PERF_EVENT_FLAG_USER_ACCESS	BIT(SYSCTL_USER_ACCESS)
+#define PERF_EVENT_FLAG_LEGACY		BIT(SYSCTL_LEGACY)
+
 PMU_FORMAT_ATTR(event, "config:0-47");
 PMU_FORMAT_ATTR(firmware, "config:63");

@@ -43,6 +51,9 @@ static const struct attribute_group *riscv_pmu_attr_groups[] = {
 	NULL,
 };
 
+/* Allow user mode access by default */
+static int sysctl_perf_user_access __read_mostly = SYSCTL_USER_ACCESS;
+
 /*
  * RISC-V doesn't have heterogeneous harts yet. This need to be part of
  * per_cpu in case of harts with different pmu counters
@@ -301,6 +312,11 @@ int riscv_pmu_get_hpm_info(u32 *hw_ctr_width, u32 *num_hw_ctr)
 }
 EXPORT_SYMBOL_GPL(riscv_pmu_get_hpm_info);
 
+static uint8_t pmu_sbi_csr_index(struct perf_event *event)
+{
+	return pmu_ctr_list[event->hw.idx].csr - CSR_CYCLE;
+}
+
 static unsigned long pmu_sbi_get_filter_flags(struct perf_event *event)
 {
 	unsigned long cflags = 0;
@@ -329,18 +345,34 @@ static int pmu_sbi_ctr_get_idx(struct perf_event *event)
 	struct cpu_hw_events *cpuc = this_cpu_ptr(rvpmu->hw_events);
 	struct sbiret ret;
 	int idx;
-	uint64_t cbase = 0;
+	uint64_t cbase = 0, cmask = rvpmu->cmask;
 	unsigned long cflags = 0;
 
 	cflags = pmu_sbi_get_filter_flags(event);
+
+	/*
+	 * In legacy mode, we have to force the fixed counters for those events
+	 * but not in the user access mode as we want to use the other counters
+	 * that support sampling/filtering.
+	 */
+	if (hwc->flags & PERF_EVENT_FLAG_LEGACY) {
+		if (event->attr.config == PERF_COUNT_HW_CPU_CYCLES) {
+			cflags |= SBI_PMU_CFG_FLAG_SKIP_MATCH;
+			cmask = 1;
+		} else if (event->attr.config == PERF_COUNT_HW_INSTRUCTIONS) {
+			cflags |= SBI_PMU_CFG_FLAG_SKIP_MATCH;
+			cmask = 1UL << (CSR_INSTRET - CSR_CYCLE);
+		}
+	}
+
 	/* retrieve the available counter index */
 #if defined(CONFIG_32BIT)
 	ret = sbi_ecall(SBI_EXT_PMU, SBI_EXT_PMU_COUNTER_CFG_MATCH, cbase,
-			rvpmu->cmask, cflags, hwc->event_base, hwc->config,
+			cmask, cflags, hwc->event_base, hwc->config,
 			hwc->config >> 32);
 #else
 	ret = sbi_ecall(SBI_EXT_PMU, SBI_EXT_PMU_COUNTER_CFG_MATCH, cbase,
-			rvpmu->cmask, cflags, hwc->event_base, hwc->config, 0);
+			cmask, cflags, hwc->event_base, hwc->config, 0);
 #endif
 	if (ret.error) {
 		pr_debug("Not able to find a counter for event %lx config %llx\n",
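
Editor's note on the legacy path above: the counter mask handed to SBI_EXT_PMU_COUNTER_CFG_MATCH is narrowed to the single fixed counter, bit 0 for cycle and bit (CSR_INSTRET - CSR_CYCLE) for instret. A small illustrative sketch of the resulting mask values, assuming the standard CSR numbers (CSR_CYCLE = 0xc00, CSR_INSTRET = 0xc02); the variable names are only for illustration.

#define CSR_CYCLE   0xc00
#define CSR_INSTRET 0xc02

/* bit 0 selects the fixed cycle counter */
static const unsigned long cmask_cycles  = 1UL;
/* bit 2, i.e. (CSR_INSTRET - CSR_CYCLE), selects the fixed instret counter */
static const unsigned long cmask_instret = 1UL << (CSR_INSTRET - CSR_CYCLE);
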
@@ -474,6 +506,22 @@ static u64 pmu_sbi_ctr_read(struct perf_event *event)
 	return val;
 }
 
+static void pmu_sbi_set_scounteren(void *arg)
+{
+	struct perf_event *event = (struct perf_event *)arg;
+
+	csr_write(CSR_SCOUNTEREN,
+		  csr_read(CSR_SCOUNTEREN) | (1 << pmu_sbi_csr_index(event)));
+}
+
+static void pmu_sbi_reset_scounteren(void *arg)
+{
+	struct perf_event *event = (struct perf_event *)arg;
+
+	csr_write(CSR_SCOUNTEREN,
+		  csr_read(CSR_SCOUNTEREN) & ~(1 << pmu_sbi_csr_index(event)));
+}
+
 static void pmu_sbi_ctr_start(struct perf_event *event, u64 ival)
 {
 	struct sbiret ret;
@@ -490,13 +538,21 @@ static void pmu_sbi_ctr_start(struct perf_event *event, u64 ival)
 	if (ret.error && (ret.error != SBI_ERR_ALREADY_STARTED))
 		pr_err("Starting counter idx %d failed with error %d\n",
 			hwc->idx, sbi_err_map_linux_errno(ret.error));
+
+	if ((hwc->flags & PERF_EVENT_FLAG_USER_ACCESS) &&
+	    (hwc->flags & PERF_EVENT_FLAG_USER_READ_CNT))
+		pmu_sbi_set_scounteren((void *)event);
 }
 
 static void pmu_sbi_ctr_stop(struct perf_event *event, unsigned long flag)
 {
 	struct sbiret ret;
 	struct hw_perf_event *hwc = &event->hw;
 
+	if ((hwc->flags & PERF_EVENT_FLAG_USER_ACCESS) &&
+	    (hwc->flags & PERF_EVENT_FLAG_USER_READ_CNT))
+		pmu_sbi_reset_scounteren((void *)event);
+
 	ret = sbi_ecall(SBI_EXT_PMU, SBI_EXT_PMU_COUNTER_STOP, hwc->idx, 1, flag, 0, 0, 0);
 	if (ret.error && (ret.error != SBI_ERR_ALREADY_STOPPED) &&
 	    flag != SBI_PMU_STOP_FLAG_RESET)
@@ -704,10 +760,13 @@ static int pmu_sbi_starting_cpu(unsigned int cpu, struct hlist_node *node)
 	struct cpu_hw_events *cpu_hw_evt = this_cpu_ptr(pmu->hw_events);
 
 	/*
-	 * Enable the access for CYCLE, TIME, and INSTRET CSRs from userspace,
-	 * as is necessary to maintain uABI compatibility.
+	 * We keep enabling userspace access to CYCLE, TIME and INSTRET via the
+	 * legacy option but that will be removed in the future.
 	 */
-	csr_write(CSR_SCOUNTEREN, 0x7);
+	if (sysctl_perf_user_access == SYSCTL_LEGACY)
+		csr_write(CSR_SCOUNTEREN, 0x7);
+	else
+		csr_write(CSR_SCOUNTEREN, 0x2);
 
 	/* Stop all the counters so that they can be enabled from perf */
 	pmu_sbi_stop_all(pmu);
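
Editor's note: the scounteren values used in the hunk above follow the CY/TM/IR bit layout of the RISC-V privileged spec; 0x7 keeps cycle, time and instret readable from U-mode (legacy behaviour), while 0x2 leaves only the time CSR exposed. A short reference sketch; the bit positions come from the spec, the macro names are illustrative only.

/* SCOUNTEREN bit positions per the RISC-V privileged spec */
#define SCOUNTEREN_CY (1 << 0)  /* cycle   */
#define SCOUNTEREN_TM (1 << 1)  /* time    */
#define SCOUNTEREN_IR (1 << 2)  /* instret */

/* 0x7 = CY | TM | IR: legacy mode, everything readable from U-mode     */
/* 0x2 = TM only:      default, only rdtime stays directly accessible   */
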
@@ -838,6 +897,121 @@ static void riscv_pmu_destroy(struct riscv_pmu *pmu)
 	cpuhp_state_remove_instance(CPUHP_AP_PERF_RISCV_STARTING, &pmu->node);
 }
 
+static void pmu_sbi_event_init(struct perf_event *event)
+{
+	/*
+	 * The permissions are set at event_init so that we do not depend
+	 * on the sysctl value that can change.
+	 */
+	if (sysctl_perf_user_access == SYSCTL_NO_USER_ACCESS)
+		event->hw.flags |= PERF_EVENT_FLAG_NO_USER_ACCESS;
+	else if (sysctl_perf_user_access == SYSCTL_USER_ACCESS)
+		event->hw.flags |= PERF_EVENT_FLAG_USER_ACCESS;
+	else
+		event->hw.flags |= PERF_EVENT_FLAG_LEGACY;
+}
+
+static void pmu_sbi_event_mapped(struct perf_event *event, struct mm_struct *mm)
+{
+	if (event->hw.flags & PERF_EVENT_FLAG_NO_USER_ACCESS)
+		return;
+
+	if (event->hw.flags & PERF_EVENT_FLAG_LEGACY) {
+		if (event->attr.config != PERF_COUNT_HW_CPU_CYCLES &&
+		    event->attr.config != PERF_COUNT_HW_INSTRUCTIONS) {
+			return;
+		}
+	}
+
+	/*
+	 * The user mmapped the event to directly access it: this is where
+	 * we determine based on sysctl_perf_user_access if we grant userspace
+	 * the direct access to this event. That means that within the same
+	 * task, some events may be directly accessible and some other may not,
+	 * if the user changes the value of sysctl_perf_user_accesss in the
+	 * meantime.
+	 */
+
+	event->hw.flags |= PERF_EVENT_FLAG_USER_READ_CNT;
+
+	/*
+	 * We must enable userspace access *before* advertising in the user page
+	 * that it is possible to do so to avoid any race.
+	 * And we must notify all cpus here because threads that currently run
+	 * on other cpus will try to directly access the counter too without
+	 * calling pmu_sbi_ctr_start.
+	 */
+	if (event->hw.flags & PERF_EVENT_FLAG_USER_ACCESS)
+		on_each_cpu_mask(mm_cpumask(mm),
+				 pmu_sbi_set_scounteren, (void *)event, 1);
+}
+
+static void pmu_sbi_event_unmapped(struct perf_event *event, struct mm_struct *mm)
+{
+	if (event->hw.flags & PERF_EVENT_FLAG_NO_USER_ACCESS)
+		return;
+
+	if (event->hw.flags & PERF_EVENT_FLAG_LEGACY) {
+		if (event->attr.config != PERF_COUNT_HW_CPU_CYCLES &&
+		    event->attr.config != PERF_COUNT_HW_INSTRUCTIONS) {
+			return;
+		}
+	}
+
+	/*
+	 * Here we can directly remove user access since the user does not have
+	 * access to the user page anymore so we avoid the racy window where the
+	 * user could have read cap_user_rdpmc to true right before we disable
+	 * it.
+	 */
+	event->hw.flags &= ~PERF_EVENT_FLAG_USER_READ_CNT;
+
+	if (event->hw.flags & PERF_EVENT_FLAG_USER_ACCESS)
+		on_each_cpu_mask(mm_cpumask(mm),
+				 pmu_sbi_reset_scounteren, (void *)event, 1);
+}
+
+static void riscv_pmu_update_counter_access(void *info)
+{
+	if (sysctl_perf_user_access == SYSCTL_LEGACY)
+		csr_write(CSR_SCOUNTEREN, 0x7);
+	else
+		csr_write(CSR_SCOUNTEREN, 0x2);
+}
+
+static int riscv_pmu_proc_user_access_handler(struct ctl_table *table,
+					      int write, void *buffer,
+					      size_t *lenp, loff_t *ppos)
+{
+	int prev = sysctl_perf_user_access;
+	int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+
+	/*
+	 * Test against the previous value since we clear SCOUNTEREN when
+	 * sysctl_perf_user_access is set to SYSCTL_USER_ACCESS, but we should
+	 * not do that if that was already the case.
+	 */
+	if (ret || !write || prev == sysctl_perf_user_access)
+		return ret;
+
+	on_each_cpu(riscv_pmu_update_counter_access, NULL, 1);
+
+	return 0;
+}
+
+static struct ctl_table sbi_pmu_sysctl_table[] = {
+	{
+		.procname = "perf_user_access",
+		.data = &sysctl_perf_user_access,
+		.maxlen = sizeof(unsigned int),
+		.mode = 0644,
+		.proc_handler = riscv_pmu_proc_user_access_handler,
+		.extra1 = SYSCTL_ZERO,
+		.extra2 = SYSCTL_TWO,
+	},
+	{ }
+};
+
 static int pmu_sbi_device_probe(struct platform_device *pdev)
 {
 	struct riscv_pmu *pmu = NULL;
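
Editor's note: once this table is registered (see the register_sysctl() call in the last hunk below), the knob appears as /proc/sys/kernel/perf_user_access and accepts 0, 1 or 2 for SYSCTL_NO_USER_ACCESS, SYSCTL_USER_ACCESS and SYSCTL_LEGACY respectively. A minimal userspace sketch for switching modes, with error handling mostly omitted; the helper name is hypothetical.

#include <stdio.h>

/* 0 = no user access, 1 = user access through perf only (default), 2 = legacy */
static int set_perf_user_access(int mode)
{
        FILE *f = fopen("/proc/sys/kernel/perf_user_access", "w");

        if (!f)
                return -1;
        fprintf(f, "%d\n", mode);
        fclose(f);
        return 0;
}
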
@@ -881,6 +1055,10 @@ static int pmu_sbi_device_probe(struct platform_device *pdev)
 	pmu->ctr_get_width = pmu_sbi_ctr_get_width;
 	pmu->ctr_clear_idx = pmu_sbi_ctr_clear_idx;
 	pmu->ctr_read = pmu_sbi_ctr_read;
+	pmu->event_init = pmu_sbi_event_init;
+	pmu->event_mapped = pmu_sbi_event_mapped;
+	pmu->event_unmapped = pmu_sbi_event_unmapped;
+	pmu->csr_index = pmu_sbi_csr_index;
 
 	ret = cpuhp_state_add_instance(CPUHP_AP_PERF_RISCV_STARTING, &pmu->node);
 	if (ret)
@@ -894,6 +1072,8 @@ static int pmu_sbi_device_probe(struct platform_device *pdev)
 	if (ret)
 		goto out_unregister;
 
+	register_sysctl("kernel", sbi_pmu_sysctl_table);
+
 	return 0;
 
 out_unregister:
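
Editor's note: putting both sides of the patch together, userspace opens the event, mmaps its user page, and only reads the counter directly when cap_user_rdpmc is set, falling back to read() otherwise. Below is a minimal self-monitoring sketch for a cycle event; the rdcycle inline asm and the assumption that index == 1 maps to the cycle CSR follow from pmu_sbi_csr_index() above, but the program itself is illustrative (error handling omitted) and not part of the commit.

#define _GNU_SOURCE
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>

static uint64_t rdcycle(void)
{
        uint64_t val;

        asm volatile("rdcycle %0" : "=r"(val));
        return val;
}

int main(void)
{
        struct perf_event_attr attr = {
                .type = PERF_TYPE_HARDWARE,
                .config = PERF_COUNT_HW_CPU_CYCLES,
                .size = sizeof(attr),
                .exclude_kernel = 1,
        };
        int fd = syscall(SYS_perf_event_open, &attr, 0, -1, -1, 0);
        struct perf_event_mmap_page *pc =
                mmap(NULL, sysconf(_SC_PAGESIZE), PROT_READ, MAP_SHARED, fd, 0);
        uint64_t count = 0;
        uint32_t seq;

        /* seqlock loop from the perf mmap ABI: retry if the page changed */
        do {
                seq = pc->lock;
                __sync_synchronize();
                if (pc->cap_user_rdpmc && pc->index == 1) {
                        /* index - 1 is the CSR offset from CSR_CYCLE; only the
                         * cycle CSR itself is handled here for simplicity. */
                        count = rdcycle();
                        count <<= 64 - pc->pmc_width;   /* drop unimplemented bits */
                        count >>= 64 - pc->pmc_width;
                        count += pc->offset;
                } else {
                        /* direct access not granted: use the syscall path */
                        read(fd, &count, sizeof(count));
                }
                __sync_synchronize();
        } while (pc->lock != seq);

        printf("cycles: %llu\n", (unsigned long long)count);
        return 0;
}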

0 commit comments
