igt/gem_render_tiled_blits: Speed up by using the GPU to detile

Avoid reading back the contents of each bo through the slow GTT mapping
when comparing them against the expected results. It is much faster, on
LLC platforms at least, to detile using the GPU and then copy the result
to system memory for the comparison.

Before:

IVB: time sudo ./gem_render_tiled_blits
IGT-Version: 1.6-ge46ff3f (x86_64) (Linux: 3.15.0-rc3+ x86_64)
Using 3072 1MiB buffers
Verifying initialisation...
Cyclic blits, forward...
Cyclic blits, backward...
Random blits...

real	6m26.005s
user	6m19.234s
sys	0m2.414s

PNV: time sudo ./gem_render_tiled_blits
IGT-Version: 1.6-g8556f8a (i686) (Linux: 3.15.0-rc2+ i686)
Using 768 1MiB buffers
Verifying initialisation...
Cyclic blits, forward...
Cyclic blits, backward...
Random blits...

real	1m45.431s
user	1m34.960s
sys	0m4.624s

Using pread:

IVB: time sudo ./gem_render_tiled_blits
IGT-Version: 1.6-ge46ff3f (x86_64) (Linux: 3.15.0-rc3+ x86_64)
Using 3072 1MiB buffers
Verifying initialisation...
Cyclic blits, forward...
Cyclic blits, backward...
Random blits...

real	0m14.717s
user	0m3.699s
sys	0m3.192s

Using snoop:

IVB: time sudo ./gem_render_tiled_blits
IGT-Version: 1.6-ge46ff3f (x86_64) (Linux: 3.15.0-rc3+ x86_64)
Using 3072 1MiB buffers
Using a snoop linear buffer for comparisons
Verifying initialisation...
Cyclic blits, forward...
Cyclic blits, backward...
Random blits...

real	0m13.774s
user	0m3.900s
sys	0m2.089s

PNV: time sudo ./gem_render_tiled_blits
IGT-Version: 1.6-g8556f8a (i686) (Linux: 3.15.0-rc2+ i686)
Using 768 1MiB buffers
Using a snoop linear buffer for comparisons
Verifying initialisation...
Cyclic blits, forward...
Cyclic blits, backward...
Random blits...

real	0m20.831s
user	0m4.384s
sys	0m5.032s

So roughly 5-30x faster depending on platform (about 5x on PNV, 26-28x on IVB).

Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=78244
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
This commit is contained in:
Chris Wilson 2014-05-08 11:56:56 +01:00
parent e46ff3f8c2
commit 66d5f092d4

View File

@ -60,15 +60,30 @@
#define SIZE (HEIGHT*STRIDE)
static igt_render_copyfunc_t render_copy;
static drm_intel_bo *linear;
static uint32_t data[WIDTH*HEIGHT];
static int snoop;
static void
check_bo(drm_intel_bo *bo, uint32_t val)
check_bo(struct intel_batchbuffer *batch, struct igt_buf *buf, uint32_t val)
{
struct igt_buf tmp;
uint32_t *ptr;
int i;
do_or_die(drm_intel_gem_bo_map_gtt(bo));
ptr = bo->virtual;
tmp.bo = linear;
tmp.stride = STRIDE;
tmp.tiling = I915_TILING_NONE;
tmp.size = SIZE;
render_copy(batch, NULL, buf, 0, 0, WIDTH, HEIGHT, &tmp, 0, 0);
if (snoop) {
do_or_die(dri_bo_map(linear, 0));
ptr = linear->virtual;
} else {
do_or_die(drm_intel_bo_get_subdata(linear, 0, sizeof(data), data));
ptr = data;
}
for (i = 0; i < WIDTH*HEIGHT; i++) {
if (ptr[i] != val) {
fprintf(stderr, "Expected 0x%08x, found 0x%08x "
@ -78,7 +93,8 @@ check_bo(drm_intel_bo *bo, uint32_t val)
}
val++;
}
drm_intel_gem_bo_unmap_gtt(bo);
if (ptr != data)
dri_bo_unmap(linear);
}
int main(int argc, char **argv)
@ -89,22 +105,30 @@ int main(int argc, char **argv)
struct igt_buf *buf;
uint32_t start = 0;
int i, j, fd, count;
uint32_t devid;
igt_simple_init();
igt_skip_on_simulation();
fd = drm_open_any();
devid = intel_get_drm_devid(fd);
render_copy = igt_get_render_copyfunc(intel_get_drm_devid(fd));
render_copy = igt_get_render_copyfunc(devid);
if (render_copy == NULL) {
printf("no render-copy function, doing nothing\n");
return 77;
}
snoop = 1;
if (IS_GEN2(devid)) /* chipset only handles cached -> uncached */
snoop = 0;
if (IS_BROADWATER(devid) || IS_CRESTLINE(devid)) /* snafu */
snoop = 0;
bufmgr = drm_intel_bufmgr_gem_init(fd, 4096);
drm_intel_bufmgr_gem_set_vma_cache_size(bufmgr, 32);
batch = intel_batchbuffer_alloc(bufmgr, intel_get_drm_devid(fd));
batch = intel_batchbuffer_alloc(bufmgr, devid);
count = 0;
if (argc > 1)
@ -123,6 +147,12 @@ int main(int argc, char **argv)
printf("Using %d 1MiB buffers\n", count);
linear = drm_intel_bo_alloc(bufmgr, "linear", WIDTH*HEIGHT*4, 0);
if (snoop) {
gem_set_caching(fd, linear->handle, 1);
printf("Using a snoop linear buffer for comparisons\n");
}
buf = malloc(sizeof(*buf)*count);
start_val = malloc(sizeof(*start_val)*count);
@ -149,7 +179,7 @@ int main(int argc, char **argv)
printf("Verifying initialisation...\n");
for (i = 0; i < count; i++)
check_bo(buf[i].bo, start_val[i]);
check_bo(batch, &buf[i], start_val[i]);
printf("Cyclic blits, forward...\n");
for (i = 0; i < count * 4; i++) {
@ -160,7 +190,7 @@ int main(int argc, char **argv)
start_val[dst] = start_val[src];
}
for (i = 0; i < count; i++)
check_bo(buf[i].bo, start_val[i]);
check_bo(batch, &buf[i], start_val[i]);
printf("Cyclic blits, backward...\n");
for (i = 0; i < count * 4; i++) {
@ -171,7 +201,7 @@ int main(int argc, char **argv)
start_val[dst] = start_val[src];
}
for (i = 0; i < count; i++)
check_bo(buf[i].bo, start_val[i]);
check_bo(batch, &buf[i], start_val[i]);
printf("Random blits...\n");
for (i = 0; i < count * 4; i++) {
@ -185,7 +215,7 @@ int main(int argc, char **argv)
start_val[dst] = start_val[src];
}
for (i = 0; i < count; i++)
check_bo(buf[i].bo, start_val[i]);
check_bo(batch, &buf[i], start_val[i]);
return 0;
}