diff --git a/tests/Makefile.am b/tests/Makefile.am
index 0c6452d9..ce2f8915 100644
--- a/tests/Makefile.am
+++ b/tests/Makefile.am
@@ -1,3 +1,7 @@
+bin_PROGRAMS = \
+	gem_stress \
+	$(NULL)
+
 TESTS = getversion \
 	getclient \
 	getstats \
diff --git a/tests/gem_stress.c b/tests/gem_stress.c
new file mode 100644
index 00000000..a5155c9d
--- /dev/null
+++ b/tests/gem_stress.c
@@ -0,0 +1,561 @@
+/*
+ * Copyright © 2011 Daniel Vetter
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Daniel Vetter
+ *
+ * Partially based upon gem_tiled_fence_blits.c
+ */
+
+/** @file gem_stress.c
+ *
+ * This is a general gem coherency test. It's designed to eventually replicate
+ * any possible sequence of access patterns. It works by copying a set of tiles
+ * between two sets of backing buffer objects, randomly permuting the assigned
+ * position on each copy operation.
+ *
+ * The copy operations are done in tiny portions (to reduce any race windows
+ * for corruption, hence increasing the chances of observing one) and are
+ * constantly switched between all means of copying (fenced blitter, unfenced
+ * render, mmap, pwrite/pread).
+ *
+ * After every complete move of a set, the tiling parameters of the buffers are
+ * randomly changed to simulate the effects of libdrm caching.
+ *
+ * Buffers are 1 MiB in size to nicely fit into fences on gen2/3. A few are
+ * further split up to test relaxed fencing. Using this to push the average
+ * working set size over the available gtt space forces objects to be mapped
+ * as unfenceable (and as a side-effect tests gtt map/unmap coherency).
+ *
+ * In short: designed for maximum evilness.
+ */
+
+#include <stdlib.h>
+#include <sys/ioctl.h>
+#include <stdio.h>
+#include <string.h>
+#include <assert.h>
+#include <fcntl.h>
+#include <inttypes.h>
+#include <errno.h>
+#include <sys/stat.h>
+#include <sys/time.h>
+#include "drm.h"
+#include "i915_drm.h"
+#include "drmtest.h"
+#include "intel_bufmgr.h"
+#include "intel_batchbuffer.h"
+#include "intel_gpu_tools.h"
+
+/** TODO:
+ * - beat on relaxed fencing (i.e. mappable/fenceable tracking in the kernel)
+ * - render copy (to check fence tracking and cache coherency management by the
+ *   kernel)
+ * - multi-threading: probably just a wrapper script to launch multiple
+ *   instances + an option to accordingly reduce the working set
+ * - gen6 inter-ring coherency (needs render copy first)
+ */
+
+static uint64_t gem_aperture_size(int fd)
+{
+	struct drm_i915_gem_get_aperture aperture;
+
+	aperture.aper_size = 256*1024*1024;
+	(void)drmIoctl(fd, DRM_IOCTL_I915_GEM_GET_APERTURE, &aperture);
+	return aperture.aper_size;
+}
+
+struct scratch_buf {
+	drm_intel_bo *bo;
+	uint32_t stride;
+	uint32_t tiling;
+	uint32_t *data;
+};
+
+#define NO_HW 0
+
+static drm_intel_bufmgr *bufmgr;
+struct intel_batchbuffer *batch;
+static int drm_fd;
+static int devid;
+static int num_fences;
+
+drm_intel_bo *busy_bo;
+
+#define MAX_BUFS 4096
+#define SCRATCH_BUF_SIZE (1024*1024)
+#define TILE_SIZE 16
+#define TILES_PER_BUF (SCRATCH_BUF_SIZE / (TILE_SIZE*TILE_SIZE*sizeof(uint32_t)))
+
+static struct scratch_buf buffers[2][MAX_BUFS];
+/* tile i is at logical position tile_permutation[i] */
+static unsigned tile_permutation[MAX_BUFS*TILES_PER_BUF];
+static unsigned num_buffers = 0;
+static unsigned current_set = 0;
+static unsigned target_set = 0;
+
+static int fence_storm = 0;
+
+static void tile2xy(struct scratch_buf *buf, unsigned tile, unsigned *x, unsigned *y)
+{
+	assert(tile < TILES_PER_BUF);
+	*x = (tile*TILE_SIZE) % (buf->stride/sizeof(uint32_t));
+	*y = ((tile*TILE_SIZE) / (buf->stride/sizeof(uint32_t))) * TILE_SIZE;
+}
+
+/* All this gem trashing wastes too much cpu time, so give the gpu something to
+ * do to increase the chances for races. TODO: Should be autotuned. */
+static void keep_gpu_busy(void)
+{
+	uint32_t src_pitch, dst_pitch, cmd_bits;
+
+	src_pitch = 4096;
+	dst_pitch = 4096;
+	cmd_bits = 0;
+
+	if (IS_965(devid)) {
+		src_pitch /= 4;
+		cmd_bits |= XY_SRC_COPY_BLT_SRC_TILED;
+	}
+
+	if (IS_965(devid)) {
+		dst_pitch /= 4;
+		cmd_bits |= XY_SRC_COPY_BLT_DST_TILED;
+	}
+
+	/* copy lower half to upper half */
+	BEGIN_BATCH(8);
+	OUT_BATCH(XY_SRC_COPY_BLT_CMD |
+		  XY_SRC_COPY_BLT_WRITE_ALPHA |
+		  XY_SRC_COPY_BLT_WRITE_RGB |
+		  cmd_bits);
+	OUT_BATCH((3 << 24) | /* 32 bits */
+		  (0xcc << 16) | /* copy ROP */
+		  dst_pitch);
+	OUT_BATCH(128 << 16 | 0);
+	OUT_BATCH(256 << 16 | 1024);
+	OUT_RELOC(busy_bo, I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER, 0);
+	OUT_BATCH(0 << 16 | 0);
+	OUT_BATCH(src_pitch);
+	OUT_RELOC(busy_bo, I915_GEM_DOMAIN_RENDER, 0, 0);
+	ADVANCE_BATCH();
+}
+
+static unsigned int copyfunc_seq = 0;
+static void (*copyfunc)(struct scratch_buf *src, unsigned src_x, unsigned src_y,
+			struct scratch_buf *dst, unsigned dst_x, unsigned dst_y);
+
+/* stride, x, y in units of uint32_t! */
+static void cpucpy2d(uint32_t *src, unsigned src_stride, unsigned src_x, unsigned src_y,
+		     uint32_t *dst, unsigned dst_stride, unsigned dst_x, unsigned dst_y)
+{
+	int i, j;
+
+	for (i = 0; i < TILE_SIZE; i++) {
+		for (j = 0; j < TILE_SIZE; j++) {
+			unsigned dst_ofs = dst_x + j + dst_stride * (dst_y + i);
+			unsigned src_ofs = src_x + j + src_stride * (src_y + i);
+			dst[dst_ofs] = src[src_ofs];
+		}
+	}
+}
+
+static void next_copyfunc(void);
+
+static void cpu_copyfunc(struct scratch_buf *src, unsigned src_x, unsigned src_y,
+			 struct scratch_buf *dst, unsigned dst_x, unsigned dst_y)
+{
+	cpucpy2d(src->data, src->stride/sizeof(uint32_t), src_x, src_y,
+		 dst->data, dst->stride/sizeof(uint32_t), dst_x, dst_y);
+}
+
+static void prw_copyfunc(struct scratch_buf *src, unsigned src_x, unsigned src_y,
+			 struct scratch_buf *dst, unsigned dst_x, unsigned dst_y)
+{
+	uint32_t tmp_tile[TILE_SIZE*TILE_SIZE];
+
+	/* TODO: use pwrite/pread here if dst/src is untiled */
+	cpucpy2d(src->data, src->stride/sizeof(uint32_t), src_x, src_y,
+		 tmp_tile, TILE_SIZE, 0, 0);
+	cpucpy2d(tmp_tile, TILE_SIZE, 0, 0,
+		 dst->data, dst->stride/sizeof(uint32_t), dst_x, dst_y);
+}
+
+static void blitter_copyfunc(struct scratch_buf *src, unsigned src_x, unsigned src_y,
+			     struct scratch_buf *dst, unsigned dst_x, unsigned dst_y)
+{
+	static unsigned keep_gpu_busy_counter = 0;
+	uint32_t src_pitch, dst_pitch, cmd_bits;
+	src_pitch = src->stride;
+	dst_pitch = dst->stride;
+	cmd_bits = 0;
+
+	/* check both edges of the fence usage */
+	if (keep_gpu_busy_counter & 1 && !fence_storm)
+		keep_gpu_busy();
+
+	if (IS_965(devid)) {
+		src_pitch /= 4;
+		cmd_bits |= XY_SRC_COPY_BLT_SRC_TILED;
+	}
+
+	if (IS_965(devid)) {
+		dst_pitch /= 4;
+		cmd_bits |= XY_SRC_COPY_BLT_DST_TILED;
+	}
+
+	BEGIN_BATCH(8);
+	OUT_BATCH(XY_SRC_COPY_BLT_CMD |
+		  XY_SRC_COPY_BLT_WRITE_ALPHA |
+		  XY_SRC_COPY_BLT_WRITE_RGB |
+		  cmd_bits);
+	OUT_BATCH((3 << 24) | /* 32 bits */
+		  (0xcc << 16) | /* copy ROP */
+		  dst_pitch);
+	OUT_BATCH(dst_y << 16 | dst_x);
+	OUT_BATCH((dst_y+TILE_SIZE) << 16 | (dst_x+TILE_SIZE));
+	OUT_RELOC(dst->bo, I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER, 0);
+	OUT_BATCH(src_y << 16 | src_x);
+	OUT_BATCH(src_pitch);
+	OUT_RELOC(src->bo, I915_GEM_DOMAIN_RENDER, 0, 0);
+	ADVANCE_BATCH();
+
+	if (!(keep_gpu_busy_counter & 1) && !fence_storm)
+		keep_gpu_busy();
+
+	keep_gpu_busy_counter++;
+
+	if (src->tiling)
+		fence_storm--;
+	if (dst->tiling)
+		fence_storm--;
+
+	if (fence_storm <= 0) {
+		fence_storm = 0;
+		intel_batchbuffer_flush(batch);
+	}
+}
+
+static void next_copyfunc(void)
+{
+	if (fence_storm)
+		return;
+
+	if (copyfunc_seq % 61 == 0) {
+		fence_storm = num_fences;
+		copyfunc = blitter_copyfunc;
+	} else if (copyfunc_seq % 17 == 0)
+		copyfunc = cpu_copyfunc;
+	else if (copyfunc_seq % 19 == 0)
+		copyfunc = prw_copyfunc;
+	else
+		copyfunc = blitter_copyfunc;
+
+	copyfunc_seq++;
+}
+
+static void fan_out(void)
+{
+	uint32_t tmp_tile[TILE_SIZE*TILE_SIZE];
+	uint32_t seq = 0;
+	int i, j, k;
+	unsigned x, y;
+
+	for (i = 0; i < num_buffers; i++) {
+		for (j = 0; j < TILES_PER_BUF; j++) {
+			tile2xy(&buffers[current_set][i], j, &x, &y);
+			for (k = 0; k < TILE_SIZE*TILE_SIZE; k++)
+				tmp_tile[k] = seq++;
+
+			cpucpy2d(tmp_tile, TILE_SIZE, 0, 0,
+				 buffers[current_set][i].data,
+				 buffers[current_set][i].stride / sizeof(uint32_t),
+				 x, y);
+		}
+	}
+
+	for (i = 0; i < num_buffers*TILES_PER_BUF; i++)
+		tile_permutation[i] = i;
+}
+
+static void fan_in_and_check(void)
+{
+	uint32_t tmp_tile[TILE_SIZE*TILE_SIZE];
+	unsigned tile, buf_idx, x, y;
+	int i, j;
+
+	for (i = 0; i < num_buffers*TILES_PER_BUF; i++) {
+		tile = tile_permutation[i];
+		buf_idx = tile / TILES_PER_BUF;
+		tile %= TILES_PER_BUF;
+
+		tile2xy(&buffers[current_set][buf_idx], tile, &x, &y);
+
+		cpucpy2d(buffers[current_set][buf_idx].data,
+			 buffers[current_set][buf_idx].stride / sizeof(uint32_t),
+			 x, y,
+			 tmp_tile, TILE_SIZE, 0, 0);
+		for (j = 0; j < TILE_SIZE*TILE_SIZE; j++) {
+			unsigned expect = i*TILE_SIZE*TILE_SIZE + j;
+			if (tmp_tile[j] != expect) {
+				printf("mismatch at tile %i pos %i, read %u, expected %u\n",
+				       i, j, tmp_tile[j], expect);
+				exit(1);
+			}
+		}
+	}
+}
+
+static void init_buffer(struct scratch_buf *buf)
+{
+	buf->bo = drm_intel_bo_alloc(bufmgr, "tiled bo", SCRATCH_BUF_SIZE, 4096);
+	assert(buf->bo);
+	buf->tiling = I915_TILING_NONE;
+	buf->stride = 8192;
+	/* XXX for development of the shuffling */
+#if NO_HW
+	buf->data = malloc(SCRATCH_BUF_SIZE);
+#else
+	drm_intel_gem_bo_map_gtt(buf->bo);
+	buf->data = buf->bo->virtual;
+#endif
+}
+
+static void permute_array(void *array, unsigned size,
+			  void (*exchange_func)(void *array, unsigned i, unsigned j))
+{
+	int i;
+	long int l;
+
+	for (i = size - 1; i > 1; i--) {
+		l = random();
+		l %= i+1; /* yes, not perfectly uniform, who cares */
+		exchange_func(array, i, l);
+	}
+}
+
+static void exchange_buf(void *array, unsigned i, unsigned j)
+{
+	struct scratch_buf *buf_arr, tmp;
+	buf_arr = array;
+
+	memcpy(&tmp, &buf_arr[i], sizeof(struct scratch_buf));
+	memcpy(&buf_arr[i], &buf_arr[j], sizeof(struct scratch_buf));
+	memcpy(&buf_arr[j], &tmp, sizeof(struct scratch_buf));
+}
+
+/* libdrm is too clever and prevents us from changing the tiling of buffers
+ * already used in relocations. */
+static void set_tiling(drm_intel_bo *bo, unsigned *tiling, unsigned stride)
+{
+	struct drm_i915_gem_set_tiling set_tiling;
+	int ret;
+
+	memset(&set_tiling, 0, sizeof(set_tiling));
+	do {
+		/* set_tiling is slightly broken and overwrites the
+		 * input on the error path, so we have to open code
+		 * drmIoctl.
+		 */
+ */ + set_tiling.handle = bo->handle; + set_tiling.tiling_mode = *tiling; + set_tiling.stride = stride; + + ret = ioctl(drm_fd, + DRM_IOCTL_I915_GEM_SET_TILING, + &set_tiling); + } while (ret == -1 && (errno == EINTR || errno == EAGAIN)); + assert(ret != -1); + + *tiling = set_tiling.tiling_mode; +} + +static void init_set(unsigned set) +{ + long int r; + int i; + + permute_array(buffers[set], num_buffers, exchange_buf); + + for (i = 0; i < num_buffers; i++) { + r = random(); + if ((r & 3) != 0) + continue; + r >>= 2; + + if ((r & 3) != 0) + buffers[set][i].tiling = I915_TILING_X; + else + buffers[set][i].tiling = I915_TILING_NONE; + r >>= 2; + + if (buffers[set][i].tiling == I915_TILING_NONE) { + /* min 64 byte stride */ + r %= 8; + buffers[set][i].stride = 64 * (1 << r); + } else if (IS_GEN2(devid)) { + /* min 128 byte stride */ + r %= 7; + buffers[set][i].stride = 128 * (1 << r); + } else { + /* min 512 byte stride */ + r %= 5; + buffers[set][i].stride = 512 * (1 << r); + } + assert(buffers[set][i].stride <= 8192); + + set_tiling(buffers[set][i].bo, + &buffers[set][i].tiling, + buffers[set][i].stride); + } +} + +static void exchange_uint(void *array, unsigned i, unsigned j) +{ + unsigned *i_arr = array; + unsigned i_tmp; + + i_tmp = i_arr[i]; + i_arr[i] = i_arr[j]; + i_arr[j] = i_tmp; +} + +static void copy_tiles(unsigned *permutation) +{ + unsigned src_tile, src_buf_idx, src_x, src_y; + unsigned dst_tile, dst_buf_idx, dst_x, dst_y; + struct scratch_buf *src_buf, *dst_buf; + int i, idx; + for (i = 0; i < num_buffers*TILES_PER_BUF; i++) { + /* tile_permutation is independant of current_permutation, so + * abuse it to randomize the order of the src bos */ + idx = tile_permutation[i]; + src_buf_idx = idx / TILES_PER_BUF; + src_tile = idx % TILES_PER_BUF; + src_buf = &buffers[current_set][src_buf_idx]; + + tile2xy(src_buf, src_tile, &src_x, &src_y); + + dst_buf_idx = permutation[idx] / TILES_PER_BUF; + dst_tile = permutation[idx] % TILES_PER_BUF; + dst_buf = &buffers[target_set][dst_buf_idx]; + + tile2xy(dst_buf, dst_tile, &dst_x, &dst_y); + +#if NO_HW + cpucpy2d(src_buf->data, + src_buf->stride / sizeof(uint32_t), + src_x, src_y, + dst_buf->data, + dst_buf->stride / sizeof(uint32_t), + dst_x, dst_y); +#else + next_copyfunc(); + + copyfunc(src_buf, src_x, src_y, dst_buf, dst_x, dst_y); +#endif + } + + intel_batchbuffer_flush(batch); +} + +static int get_num_fences(void) +{ + drm_i915_getparam_t gp; + int ret, val; + + gp.param = I915_PARAM_NUM_FENCES_AVAIL; + gp.value = &val; + ret = drmIoctl(drm_fd, DRM_IOCTL_I915_GETPARAM, &gp); + assert (ret == 0); + + printf ("total %d fences\n", val); + assert(val > 4); + + return val - 2; +} + +int main(int argc, char **argv) +{ + int i, j; + unsigned *current_permutation, *tmp_permutation; + + drm_fd = drm_open_any(); + num_buffers = 2 * gem_aperture_size(drm_fd) / (1024*1024) / 3; + num_buffers /= 2; + printf("Using %d 1MiB buffers\n", num_buffers); + + bufmgr = drm_intel_bufmgr_gem_init(drm_fd, 4096); + drm_intel_bufmgr_gem_enable_reuse(bufmgr); + devid = intel_get_drm_devid(drm_fd); + num_fences = get_num_fences(); + batch = intel_batchbuffer_alloc(bufmgr, devid); + busy_bo = drm_intel_bo_alloc(bufmgr, "tiled bo", SCRATCH_BUF_SIZE, 4096); + + for (i = 0; i < num_buffers; i++) { + init_buffer(&buffers[0][i]); + init_buffer(&buffers[1][i]); + } + current_set = 0; + + current_permutation = malloc(MAX_BUFS*TILES_PER_BUF*sizeof(uint32_t)); + tmp_permutation = malloc(MAX_BUFS*TILES_PER_BUF*sizeof(uint32_t)); + assert(current_permutation); + 
+	assert(tmp_permutation);
+
+	/* just in case it helps reproducibility */
+	srandom(0xdeadbeef);
+
+	fan_out();
+
+	for (i = 0; i < 512; i++) {
+		if (i % 64 == 63) {
+			fan_in_and_check();
+			printf("everything correct after %i rounds\n", i + 1);
+		}
+
+		target_set = (current_set + 1) & 1;
+		init_set(target_set);
+
+		for (j = 0; j < num_buffers*TILES_PER_BUF; j++)
+			current_permutation[j] = j;
+		permute_array(current_permutation, num_buffers*TILES_PER_BUF, exchange_uint);
+
+		copy_tiles(current_permutation);
+
+		memcpy(tmp_permutation, tile_permutation, sizeof(tile_permutation));
+
+		/* accumulate the permutations */
+		for (j = 0; j < num_buffers*TILES_PER_BUF; j++)
+			tile_permutation[j] = current_permutation[tmp_permutation[j]];
+
+		current_set = target_set;
+	}
+
+	fan_in_and_check();
+
+	intel_batchbuffer_free(batch);
+	drm_intel_bufmgr_destroy(bufmgr);
+
+	close(drm_fd);
+
+	return 0;
+}