Mirror of https://github.com/tiagovignatti/intel-gpu-tools.git (synced 2025-06-27 17:56:15 +00:00)
benchmarks/gem_latency: Split the nop/work/latency measurement
Split the distinct phases (generate interrupts, busywork, measure latency) into separate batches for finer control.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
This commit is contained in:
parent e37a4c8092
commit 646cab4c0c
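The pattern the patch moves to is worth spelling out before the diff: each phase (nop, workload, latency) gets its own drm_i915_gem_execbuffer2 that is filled in once during setup and then replayed unchanged from the producer loop, so the hot path is reduced to execbuffer ioctls. The following is a minimal sketch of that idea, not code from the patch: it assumes an already-open i915 DRM fd and an existing batch-buffer handle, shows only the nop case, and collapses error handling into a single check; dispatch_init() and dispatch_exec() are hypothetical helper names.

#include <assert.h>
#include <stdint.h>
#include <string.h>
#include <sys/ioctl.h>
#include <drm/i915_drm.h> /* kernel uAPI header; may live under <libdrm/> on some setups */

/* One pre-built dispatch per phase: the exec list and execbuf are set up
 * once, outside the timed loop, and reused verbatim afterwards. */
struct dispatch {
	struct drm_i915_gem_exec_object2 exec[1];
	struct drm_i915_gem_execbuffer2 execbuf;
};

/* Point the execbuf at a single batch handle (hypothetical helper). */
static void dispatch_init(struct dispatch *d, uint32_t batch)
{
	memset(d, 0, sizeof(*d));
	d->exec[0].handle = batch;
	d->execbuf.buffers_ptr = (uintptr_t)d->exec;
	d->execbuf.buffer_count = 1;
	d->execbuf.flags = I915_EXEC_BLT;
}

/* Replaying the dispatch is now a single ioctl; each call produces one
 * request (and hence one interrupt), which is all the nop phase needs. */
static void dispatch_exec(int fd, struct dispatch *d)
{
	int ret = ioctl(fd, DRM_IOCTL_I915_GEM_EXECBUFFER2, &d->execbuf);
	assert(ret == 0);
	(void)ret;
}

In the patch itself, struct producer carries three such blocks (nop_dispatch, workload_dispatch and latency_dispatch, the latter two with relocation entries as well), and the loop body in producer() becomes a fixed sequence of gem_execbuf() calls on them.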
--- a/benchmarks/gem_latency.c
+++ b/benchmarks/gem_latency.c
@@ -57,9 +57,20 @@ struct consumer {
 struct producer {
 	pthread_t thread;
 	uint32_t ctx;
-	uint32_t nop_handle;
-	struct drm_i915_gem_exec_object2 exec[2];
-	struct drm_i915_gem_relocation_entry reloc[3];
+	struct {
+		struct drm_i915_gem_exec_object2 exec[1];
+		struct drm_i915_gem_execbuffer2 execbuf;
+	} nop_dispatch;
+	struct {
+		struct drm_i915_gem_exec_object2 exec[2];
+		struct drm_i915_gem_relocation_entry reloc[2];
+		struct drm_i915_gem_execbuffer2 execbuf;
+	} workload_dispatch;
+	struct {
+		struct drm_i915_gem_exec_object2 exec[1];
+		struct drm_i915_gem_relocation_entry reloc[1];
+		struct drm_i915_gem_execbuffer2 execbuf;
+	} latency_dispatch;
 
 	pthread_mutex_t lock;
 	pthread_cond_t p_cond, c_cond;
@@ -84,78 +95,114 @@ struct producer {
 
 #define BCS_TIMESTAMP (0x22000 + 0x358)
 
-static void setup_workload(struct producer *p, int gen, uint32_t scratch)
+static uint32_t create_workload(int gen, uint32_t scratch)
 {
 	const int has_64bit_reloc = gen >= 8;
+	uint32_t handle = gem_create(fd, 4096);
+	uint32_t buf[80];
+	int i = 0;
+
+	/* XY_SRC_COPY */
+	buf[i++] = COPY_BLT_CMD | BLT_WRITE_ALPHA | BLT_WRITE_RGB;
+	if (has_64bit_reloc)
+		buf[i-1] += 2;
+	buf[i++] = 0xcc << 16 | 1 << 25 | 1 << 24 | (4*WIDTH);
+	buf[i++] = 0;
+	buf[i++] = HEIGHT << 16 | WIDTH;
+	buf[i++] = 0;
+	if (has_64bit_reloc)
+		buf[i++] = 0;
+	buf[i++] = 0;
+	buf[i++] = 4096;
+	buf[i++] = 0;
+	if (has_64bit_reloc)
+		buf[i++] = 0;
+	buf[i++] = MI_BATCH_BUFFER_END;
+
+	gem_write(fd, handle, 0, buf, i*sizeof(buf[0]));
+	return handle;
+}
+
+static void setup_workload(struct producer *p, int gen,
+			   uint32_t scratch,
+			   uint32_t batch)
+{
+	struct drm_i915_gem_execbuffer2 *eb;
+	const int has_64bit_reloc = gen >= 8;
+
+	p->workload_dispatch.exec[0].handle = scratch;
+	p->workload_dispatch.exec[1].relocation_count = 2;
+	p->workload_dispatch.exec[1].relocs_ptr = (uintptr_t)p->workload_dispatch.reloc;
+	p->workload_dispatch.exec[1].handle = batch;
+
+	p->workload_dispatch.reloc[0].offset = 4 * sizeof(uint32_t);
+	p->workload_dispatch.reloc[0].delta = 0;
+	p->workload_dispatch.reloc[0].target_handle = scratch;
+	p->workload_dispatch.reloc[0].read_domains = I915_GEM_DOMAIN_RENDER;
+	p->workload_dispatch.reloc[0].write_domain = I915_GEM_DOMAIN_RENDER;
+	p->workload_dispatch.reloc[0].presumed_offset = 0;
+
+	p->workload_dispatch.reloc[1].offset = 7 * sizeof(uint32_t);
+	if (has_64bit_reloc)
+		p->workload_dispatch.reloc[1].offset += sizeof(uint32_t);
+	p->workload_dispatch.reloc[1].delta = 0;
+	p->workload_dispatch.reloc[1].target_handle = scratch;
+	p->workload_dispatch.reloc[1].read_domains = I915_GEM_DOMAIN_RENDER;
+	p->workload_dispatch.reloc[1].write_domain = 0;
+	p->workload_dispatch.reloc[1].presumed_offset = 0;
+
+	eb = memset(&p->workload_dispatch.execbuf, 0, sizeof(*eb));
+	eb->buffers_ptr = (uintptr_t)p->workload_dispatch.exec;
+	eb->buffer_count = 2;
+	eb->flags = I915_EXEC_BLT | LOCAL_EXEC_NO_RELOC;
+	eb->rsvd1 = p->ctx;
+}
+
+static void setup_latency(struct producer *p, int gen)
+{
+	struct drm_i915_gem_execbuffer2 *eb;
+	const int has_64bit_reloc = gen >= 8;
+	uint32_t handle;
 	uint32_t *map;
 	int i = 0;
 
-	p->exec[0].handle = scratch;
-	p->exec[1].relocation_count = 3;
-	p->exec[1].relocs_ptr = (uintptr_t)p->reloc;
-	p->exec[1].handle = gem_create(fd, 4096);
+	handle = gem_create(fd, 4096);
 	if (gem_has_llc(fd))
-		map = gem_mmap__cpu(fd, p->exec[1].handle, 0, 4096, PROT_WRITE);
+		map = gem_mmap__cpu(fd, handle, 0, 4096, PROT_WRITE);
 	else
-		map = gem_mmap__gtt(fd, p->exec[1].handle, 4096, PROT_WRITE);
+		map = gem_mmap__gtt(fd, handle, 4096, PROT_WRITE);
 
-	/* XY_SRC_COPY */
-	map[i++] = COPY_BLT_CMD | BLT_WRITE_ALPHA | BLT_WRITE_RGB;
-	if (has_64bit_reloc)
-		map[i-1] += 2;
-	map[i++] = 0xcc << 16 | 1 << 25 | 1 << 24 | (4*WIDTH);
-	map[i++] = 0;
-	map[i++] = HEIGHT << 16 | WIDTH;
-	p->reloc[0].offset = i * sizeof(uint32_t);
-	p->reloc[0].delta = 0;
-	p->reloc[0].target_handle = scratch;
-	p->reloc[0].read_domains = I915_GEM_DOMAIN_RENDER;
-	p->reloc[0].write_domain = I915_GEM_DOMAIN_RENDER;
-	p->reloc[0].presumed_offset = 0;
-	map[i++] = 0;
-	if (has_64bit_reloc)
-		map[i++] = 0;
-	map[i++] = 0;
-	map[i++] = 4096;
-	p->reloc[1].offset = i * sizeof(uint32_t);
-	p->reloc[1].delta = 0;
-	p->reloc[1].target_handle = scratch;
-	p->reloc[1].read_domains = I915_GEM_DOMAIN_RENDER;
-	p->reloc[1].write_domain = 0;
-	p->reloc[1].presumed_offset = 0;
-	map[i++] = 0;
-	if (has_64bit_reloc)
-		map[i++] = 0;
-
-	/* MI_FLUSH_DW */
-	map[i++] = 0x26 << 23 | 1;
-	if (has_64bit_reloc)
-		map[i-1]++;
-	map[i++] = 0;
-	map[i++] = 0;
-	if (has_64bit_reloc)
-		map[i++] = 0;
+	p->latency_dispatch.exec[0].relocation_count = 1;
+	p->latency_dispatch.exec[0].relocs_ptr =
+		(uintptr_t)p->latency_dispatch.reloc;
+	p->latency_dispatch.exec[0].handle = handle;
 
 	/* MI_STORE_REG_MEM */
 	map[i++] = 0x24 << 23 | 1;
 	if (has_64bit_reloc)
 		map[i-1]++;
 	map[i++] = BCS_TIMESTAMP;
-	p->reloc[2].offset = i * sizeof(uint32_t);
-	p->reloc[2].delta = 4000;
-	p->reloc[2].target_handle = p->exec[1].handle;
-	p->reloc[2].read_domains = I915_GEM_DOMAIN_INSTRUCTION;
-	p->reloc[2].write_domain = 0; /* We lie! */
-	p->reloc[2].presumed_offset = 0;
+	p->latency_dispatch.reloc[0].offset = i * sizeof(uint32_t);
+	p->latency_dispatch.reloc[0].delta = 4000;
+	p->latency_dispatch.reloc[0].target_handle = handle;
+	p->latency_dispatch.reloc[0].read_domains = I915_GEM_DOMAIN_INSTRUCTION;
+	p->latency_dispatch.reloc[0].write_domain = 0; /* We lie! */
+	p->latency_dispatch.reloc[0].presumed_offset = 0;
 	p->last_timestamp = &map[1000];
 	map[i++] = 4000;
 	if (has_64bit_reloc)
 		map[i++] = 0;
 
 	map[i++] = MI_BATCH_BUFFER_END;
+
+	eb = memset(&p->latency_dispatch.execbuf, 0, sizeof(*eb));
+	eb->buffers_ptr = (uintptr_t)p->latency_dispatch.exec;
+	eb->buffer_count = 1;
+	eb->flags = I915_EXEC_BLT | LOCAL_EXEC_NO_RELOC;
+	eb->rsvd1 = p->ctx;
 }
 
-static uint32_t setup_nop(void)
+static uint32_t create_nop(void)
 {
 	uint32_t buf = MI_BATCH_BUFFER_END;
 	uint32_t handle;
@@ -166,34 +213,31 @@ static uint32_t setup_nop(void)
 	return handle;
 }
 
+static void setup_nop(struct producer *p, uint32_t batch)
+{
+	struct drm_i915_gem_execbuffer2 *eb;
+
+	p->nop_dispatch.exec[0].handle = batch;
+
+	eb = memset(&p->nop_dispatch.execbuf, 0, sizeof(*eb));
+	eb->buffers_ptr = (uintptr_t)p->nop_dispatch.exec;
+	eb->buffer_count = 1;
+	eb->flags = I915_EXEC_BLT | LOCAL_EXEC_NO_RELOC;
+	eb->rsvd1 = p->ctx;
+}
+
 #define READ(x) *(volatile uint32_t *)((volatile char *)igt_global_mmio + x)
 static void measure_latency(struct producer *p, igt_stats_t *stats)
 {
-	gem_sync(fd, p->exec[1].handle);
+	gem_sync(fd, p->latency_dispatch.exec[0].handle);
 	igt_stats_push(stats, READ(BCS_TIMESTAMP) - *p->last_timestamp);
 }
 
 static void *producer(void *arg)
 {
 	struct producer *p = arg;
-	struct drm_i915_gem_execbuffer2 nop, workload;
-	struct drm_i915_gem_exec_object2 exec;
 	int n;
 
-	memset(&exec, 0, sizeof(exec));
-	exec.handle = p->nop_handle;
-	memset(&nop, 0, sizeof(nop));
-	nop.buffers_ptr = (uintptr_t)&exec;
-	nop.buffer_count = 1;
-	nop.flags = I915_EXEC_BLT | LOCAL_EXEC_NO_RELOC;
-	nop.rsvd1 = p->ctx;
-
-	memset(&workload, 0, sizeof(workload));
-	workload.buffers_ptr = (uintptr_t)p->exec;
-	workload.buffer_count = 2;
-	workload.flags = I915_EXEC_BLT | LOCAL_EXEC_NO_RELOC;
-	workload.rsvd1 = p->ctx;
-
 	while (!done) {
 		uint32_t start = READ(BCS_TIMESTAMP);
 		int batches;
@@ -206,7 +250,7 @@ static void *producer(void *arg)
 		 */
 		batches = p->nop;
 		while (batches--)
-			gem_execbuf(fd, &nop);
+			gem_execbuf(fd, &p->nop_dispatch.execbuf);
 
 		/* Control the amount of work we do, similar to submitting
 		 * empty buffers above, except this time we will load the
@@ -215,7 +259,12 @@
 		 */
 		batches = p->workload;
 		while (batches--)
-			gem_execbuf(fd, &workload);
+			gem_execbuf(fd, &p->workload_dispatch.execbuf);
 
+		/* Finally, execute a batch that just reads the current
+		 * TIMESTAMP so we can measure the latency.
+		 */
+		gem_execbuf(fd, &p->latency_dispatch.execbuf);
+
 		/* Wake all the associated clients to wait upon our batch */
 		pthread_mutex_lock(&p->lock);
@@ -288,7 +337,9 @@ static int run(int seconds,
 {
 	struct producer *p;
 	igt_stats_t latency, throughput;
-	uint32_t scratch, batch;
+	uint32_t nop_batch;
+	uint32_t workload_batch;
+	uint32_t scratch;
 	int gen, n, m;
 	int complete;
 	int nrun;
@@ -305,16 +356,19 @@ static int run(int seconds,
 
 	intel_register_access_init(intel_get_pci_device(), false);
 
-	batch = setup_nop();
 	scratch = gem_create(fd, 4*WIDTH*HEIGHT);
+	nop_batch = create_nop();
+	workload_batch = create_workload(gen, scratch);
 
 	p = calloc(nproducers, sizeof(*p));
 	for (n = 0; n < nproducers; n++) {
-		p[n].nop_handle = batch;
-		setup_workload(&p[n], gen, scratch);
 		if (flags & CONTEXT)
 			p[n].ctx = gem_context_create(fd);
 
+		setup_nop(&p[n], nop_batch);
+		setup_workload(&p[n], gen, scratch, workload_batch);
+		setup_latency(&p[n], gen);
+
 		pthread_mutex_init(&p[n].lock, NULL);
 		pthread_cond_init(&p[n].p_cond, NULL);
 		pthread_cond_init(&p[n].c_cond, NULL);
@@ -374,7 +428,7 @@ int main(int argc, char **argv)
 	int producers = 1;
 	int consumers = 0;
 	int nop = 0;
-	int workload = 1;
+	int workload = 0;
 	unsigned flags = 0;
 	int c;
 
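One note on interpreting the measurement, not stated in the commit itself: BCS_TIMESTAMP is a free-running 32-bit counter in the blitter engine's register range, so the value pushed into the stats is a raw tick count rather than a time unit, and the unsigned 32-bit subtraction in measure_latency() stays correct across counter wrap-around. A trivial sketch of that arithmetic, with a hypothetical helper name:

#include <stdint.h>

/* Difference between a live BCS_TIMESTAMP read ("now") and the value the
 * latency batch stored when the GPU reached it ("then"). Unsigned 32-bit
 * arithmetic keeps the result valid across wrap-around, as long as the real
 * delta is below 2^32 ticks; converting ticks to nanoseconds would need the
 * platform-specific timestamp period, which is why the benchmark reports
 * raw ticks. */
static uint32_t latency_ticks(uint32_t now, uint32_t then)
{
	return now - then;
}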