memlat: Optimize perf event reads when possible
We can skip the locking and other overhead of perf_event_read_value()
when we know in advance that the perf event in question can be read from
the current CPU. This occurs when either the perf event permits reads
from CPUs other than the one it's on, or when the CPU doing the read is
the same CPU that owns the perf event.
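
For a sense of the decision being made, here is a minimal sketch (read_event_count() is an assumed name, not code from this commit): perf_event_read_value() may take locks and IPI the CPU the event lives on, while perf_event_read_local() reads the counter in place.

static u64 read_event_count(struct perf_event *ev, bool any_cpu_readable)
{
	u64 total, enabled, running;

	/* The real code disables IRQs so the current CPU cannot change. */
	if (any_cpu_readable || ev->oncpu == raw_smp_processor_id()) {
		/* Lockless and IPI-free: the counter is readable right here. */
		if (!perf_event_read_local(ev, &total, NULL, NULL))
			return total;
	}

	/* Slow path: may lock and IPI the CPU that owns the event. */
	return perf_event_read_value(ev, &enabled, &running);
}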

Our PMU drivers only set two possible values for `readable_on_cpus`:
CPU_MASK_ALL or nothing. As such, we can simply check for CPU_MASK_ALL
up front to determine whether the perf event allows non-local reads.
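
For context, a PMU driver that wants its events readable from anywhere would mark them along these lines (a hypothetical snippet: my_pmu_event_init() is an invented name, and readable_on_cpus is a field specific to this kernel tree, not upstream perf):

static int my_pmu_event_init(struct perf_event *event)
{
	/* Advertise that any CPU may read this event's counter. */
	cpumask_setall(&event->readable_on_cpus);
	return 0;
}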

We can also narrow the scope of under_scm_call(): since we now know
which CPU we're reading a perf event from, the check can be made
per-CPU, which reduces its false positive rate.
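
Distilled from the scm.c hunks below, the per-CPU tracking reduces to this pattern (a sketch; do_smc() stands in for the real SMC trampoline):

static DEFINE_PER_CPU(atomic_t, scm_call_count);

static int scm_call(u64 x0, u64 x1)
{
	/* Only the calling CPU's counter is held across the call. */
	atomic_t *cnt = per_cpu_ptr(&scm_call_count, raw_smp_processor_id());
	int ret;

	atomic_inc(cnt);
	ret = do_smc(x0, x1); /* stand-in for the SMC trampoline */
	atomic_dec(cnt);

	return ret;
}

/* A slow SCM call on one CPU no longer flags every other CPU as busy. */
bool under_scm_call(int cpu)
{
	return atomic_read(per_cpu_ptr(&scm_call_count, cpu));
}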

Signed-off-by: Sultan Alsawaf <sultan@kerneltoast.com>
kerneltoast authored and utziacre committed Jul 5, 2024
1 parent d012032 commit 50608db
Showing 3 changed files with 56 additions and 18 deletions.
43 changes: 35 additions & 8 deletions drivers/devfreq/arm-memlat-mon.c
@@ -48,6 +48,7 @@ struct event_data {
struct perf_event *pevent;
unsigned long prev_count;
unsigned long last_delta;
bool any_cpu_readable;
};

struct cpu_data {
@@ -165,7 +166,37 @@ static inline void read_event(struct event_data *event)
if (!event->pevent)
return;

total = perf_event_read_value(event->pevent, &enabled, &running);
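/* Fast path: the event can be read locklessly from any CPU. */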
if (event->any_cpu_readable) {
if (perf_event_read_local(event->pevent, &total, NULL, NULL))
return;
} else {
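/*
 * pevent->oncpu is -1 when the event isn't currently scheduled in;
 * read as an unsigned int, that fails the bounds check below.
 */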
unsigned int ev_cpu = READ_ONCE(event->pevent->oncpu);
bool local_read;
int ret;

if (ev_cpu >= nr_cpu_ids)
return;

local_irq_disable();
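/* With IRQs off, the current CPU cannot change during the check and read. */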
if ((local_read = (ev_cpu == raw_smp_processor_id())))
ret = perf_event_read_local(event->pevent, &total, NULL, NULL);
local_irq_enable();

if (!local_read) {
/*
* Some SCM calls take very long (20+ ms), so the perf
* event IPI could lag on the CPU running the SCM call.
*/
if (under_scm_call(ev_cpu))
return;

total = perf_event_read_value(event->pevent, &enabled,
&running);
} else if (ret) {
return;
}
}

ev_count = total - event->prev_count;
event->prev_count = total;
event->last_delta = ev_count;
@@ -226,13 +257,6 @@ static unsigned long get_cnt(struct memlat_hwmon *hw)
struct memlat_cpu_grp *cpu_grp = mon->cpu_grp;
unsigned int cpu;

/*
* Some of SCM call is very heavy(+20ms) so perf IPI could
* be stuck on the CPU which contributes long latency.
*/
if (under_scm_call())
return 0;

for_each_cpu(cpu, &mon->cpus) {
struct cpu_data *cpu_data = to_cpu_data(cpu_grp, cpu);
struct event_data *common_evs = cpu_data->common_evs;
@@ -291,6 +315,7 @@ static struct perf_event_attr *alloc_attr(void)
static int set_event(struct event_data *ev, int cpu, unsigned int event_id,
struct perf_event_attr *attr)
{
static struct cpumask all_cpu_mask = CPU_MASK_ALL;
struct perf_event *pevent;

if (!event_id)
@@ -303,6 +328,8 @@ static int set_event(struct event_data *ev, int cpu, unsigned int event_id,

ev->pevent = pevent;
perf_event_enable(pevent);
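/*
 * Cache whether this event can be read from any CPU; the PMU drivers
 * in this tree only ever set readable_on_cpus to CPU_MASK_ALL or
 * leave it empty.
 */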
ev->any_cpu_readable =
cpumask_equal(&pevent->readable_on_cpus, &all_cpu_mask);

return 0;
}
21 changes: 12 additions & 9 deletions drivers/soc/qcom/scm.c
@@ -28,7 +28,7 @@
#define SCM_INTERRUPTED 1
#define SCM_V2_EBUSY -12

static atomic_t scm_call_count = ATOMIC_INIT(0);
static DEFINE_PER_CPU(atomic_t, scm_call_count);
static DEFINE_MUTEX(scm_lock);

/*
@@ -147,11 +147,12 @@ static int ___scm_call_armv8_64(u64 x0, u64 x1, u64 x2, u64 x3, u64 x4, u64 x5,
static int __scm_call_armv8_64(u64 x0, u64 x1, u64 x2, u64 x3, u64 x4, u64 x5,
u64 *ret1, u64 *ret2, u64 *ret3)
{
atomic_t *cnt = per_cpu_ptr(&scm_call_count, raw_smp_processor_id());
int ret;

atomic_inc(&scm_call_count);
atomic_inc(cnt);
ret = ___scm_call_armv8_64(x0, x1, x2, x3, x4, x5, ret1, ret2, ret3);
atomic_dec(&scm_call_count);
atomic_dec(cnt);

return ret;
}
@@ -209,11 +210,12 @@ static int ___scm_call_armv8_32(u32 w0, u32 w1, u32 w2, u32 w3, u32 w4, u32 w5,
static int __scm_call_armv8_32(u32 w0, u32 w1, u32 w2, u32 w3, u32 w4, u32 w5,
u64 *ret1, u64 *ret2, u64 *ret3)
{
atomic_t *cnt = per_cpu_ptr(&scm_call_count, raw_smp_processor_id());
int ret;

atomic_inc(&scm_call_count);
atomic_inc(cnt);
ret = ___scm_call_armv8_32(w0, w1, w2, w3, w4, w5, ret1, ret2, ret3);
atomic_dec(&scm_call_count);
atomic_dec(cnt);

return ret;
}
@@ -271,11 +273,12 @@ static int ___scm_call_armv8_32(u32 w0, u32 w1, u32 w2, u32 w3, u32 w4, u32 w5,
static int __scm_call_armv8_32(u32 w0, u32 w1, u32 w2, u32 w3, u32 w4, u32 w5,
u64 *ret1, u64 *ret2, u64 *ret3)
{
atomic_t *cnt = per_cpu_ptr(&scm_call_count, raw_smp_processor_id());
int ret;

atomic_inc(&scm_call_count);
atomic_inc(cnt);
ret = ___scm_call_armv8_32(w0, w1, w2, w3, w4, w5, ret1, ret2, ret3);
atomic_dec(&scm_call_count);
atomic_dec(cnt);

return ret;
}
@@ -773,7 +776,7 @@ early_initcall(scm_mem_protection_init);

#endif

bool under_scm_call(void)
bool under_scm_call(int cpu)
{
return atomic_read(&scm_call_count);
return atomic_read(per_cpu_ptr(&scm_call_count, cpu));
}
10 changes: 9 additions & 1 deletion include/soc/qcom/scm.h
@@ -108,7 +108,7 @@ extern int scm_get_feat_version(u32 feat);
extern bool is_scm_armv8(void);

extern struct mutex scm_lmh_lock;
extern bool under_scm_call(void);
extern bool under_scm_call(int cpu);

#else

@@ -167,7 +167,15 @@ static inline bool scm_is_secure_device(void)
return false;
}

static inline int scm_enable_mem_protection(void)
{
return 0;
}

extern bool under_scm_call(int cpu)
{
return false;
}
