diff --git a/overlay/gpu-perf.c b/overlay/gpu-perf.c
index ef170909..653148b8 100644
--- a/overlay/gpu-perf.c
+++ b/overlay/gpu-perf.c
@@ -229,12 +229,21 @@ static int wait_begin(struct gpu_perf *gp, const void *event)
 {
 	const struct sample_event *sample = event;
 	struct gpu_perf_comm *comm;
+	struct gpu_perf_wait *wait;
 
 	comm = lookup_comm(gp, sample->pid);
 	if (comm == NULL)
 		return 0;
 
-	comm->wait_begin = sample->time;
+	wait = malloc(sizeof(*wait)); /* released by wait_end() on a seqno match */
+	if (wait == NULL)
+		return 0; /* allocation failed: this wait goes unrecorded */
+
+	wait->seqno = sample->raw[3]; /* wait_end() matches on the same raw[3] */
+	wait->time = sample->time;
+	wait->next = comm->wait; /* push onto the head of comm's list */
+	comm->wait = wait;
+
 	return 0;
 }
 
@@ -242,12 +251,22 @@ static int wait_end(struct gpu_perf *gp, const void *event)
 {
 	const struct sample_event *sample = event;
 	struct gpu_perf_comm *comm;
+	struct gpu_perf_wait *wait, **prev; /* prev: the link that points at wait */
 
 	comm = lookup_comm(gp, sample->pid);
 	if (comm == NULL)
 		return 0;
 
-	comm->wait_time += sample->time - comm->wait_begin;
+	for (prev = &comm->wait; (wait = *prev) != NULL; prev = &wait->next) {
+		if (wait->seqno != sample->raw[3])
+			continue; /* recorded for a different seqno */
+
+		comm->wait_time += sample->time - wait->time; /* time since its wait_begin() */
+		*prev = wait->next; /* unlink before freeing */
+		free(wait);
+		return 1; /* matched a recorded wait; all other paths return 0 */
+	}
+
 	return 0;
 }
 
diff --git a/overlay/gpu-perf.h b/overlay/gpu-perf.h
index 5c3e242e..476bbaab 100644
--- a/overlay/gpu-perf.h
+++ b/overlay/gpu-perf.h
@@ -14,6 +14,11 @@ struct gpu_perf {
 	int flip_complete;
 	struct gpu_perf_comm {
 		struct gpu_perf_comm *next;
+		struct gpu_perf_wait { /* one wait recorded by wait_begin() */
+			struct gpu_perf_wait *next; /* next node in this comm's list */
+			uint32_t seqno; /* sample->raw[3] of the wait_begin event */
+			uint64_t time; /* sample->time of the wait_begin event */
+		} *wait; /* list head: wait_begin() pushes, wait_end() unlinks */
 		char name[256];
 		pid_t pid;
 		int nr_requests[4];
diff --git a/overlay/overlay.c b/overlay/overlay.c
index b41dfc44..bd327ab0 100644
--- a/overlay/overlay.c
+++ b/overlay/overlay.c
@@ -266,10 +266,21 @@ static void show_gpu_perf(struct overlay_context *ctx, struct overlay_gpu_perf *
 			need_comma = true;
 		}
 		if (comm->wait_time) {
-			if (comm->wait_time > 100) {
+			buf[0] = '\0';
+			if (comm->wait_time > 1000*1000) {
+				sprintf(buf, "%s %.1f ms waiting",
+					need_comma ? "," : "",
+					comm->wait_time / (1000*1000.));
+			} else if (comm->wait_time > 100) {
 				sprintf(buf, "%s %.1f us waiting",
 					need_comma ? "," : "",
 					comm->wait_time / 1000.);
+			} else {
+				sprintf(buf, "%s %.0f ns waiting",
+					need_comma ? "," : "",
+					(double)comm->wait_time);
+			}
+			if (buf[0] != '\0') {
 				cairo_show_text(ctx->cr, buf);
 				need_comma = true;
 			}