/*
 * Copyright © 2011 Daniel Vetter
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 * Authors:
 *    Daniel Vetter
 *
 * Partially based upon gem_tiled_fence_blits.c
 */

/** @file gem_stress.c
 *
 * This is a general gem coherency test. It's designed to eventually replicate
 * any possible sequence of access patterns. It works by copying a set of tiles
 * between two sets of backing buffer objects, randomly permuting the assigned
 * position on each copy operation.
 *
 * The copy operations are done in tiny portions (to reduce any race windows
 * for corruptions, hence increasing the chances of observing one) and are
 * constantly switched between all means to copy stuff (fenced blitter,
 * unfenced render, mmap, pwrite/read).
 *
 * After every complete move of a set, the tiling parameters of the buffers are
 * randomly changed to simulate the effects of libdrm caching.
 *
 * Buffers are 1MB big to nicely fit into fences on gen2/3. A few are further
 * split up to test relaxed fencing. Using this to push the average working set
 * size over the available gtt space forces objects to be mapped as unfenceable
 * (and, as a side-effect, tests gtt map/unmap coherency).
 *
 * In short: designed for maximum evilness.
 */

#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <assert.h>
#include <fcntl.h>
#include <inttypes.h>
#include <errno.h>
#include <unistd.h>
#include <getopt.h>
#include <sys/stat.h>
#include <sys/time.h>
#include <sys/ioctl.h>

#include "drm.h"
#include "i915_drm.h"
#include "drmtest.h"
#include "intel_bufmgr.h"
#include "intel_batchbuffer.h"
#include "intel_gpu_tools.h"
#include "i915_reg.h"
#include "i915_3d.h"

#define CMD_POLY_STIPPLE_OFFSET	0x7906

/** TODO:
 * - beat on relaxed fencing (i.e. mappable/fenceable tracking in the kernel)
 * - render copy (to check fence tracking and cache coherency management by the
 *   kernel)
 * - multi-threading: probably just a wrapper script to launch multiple
 *   instances + an option to accordingly reduce the working set
 * - gen6 inter-ring coherency (needs render copy, first)
 * - variable buffer size
 * - add an option to fork a second process that randomly sends signals to the
 *   first one (to check consistency of the kernel recovery paths)
 */

static uint64_t gem_aperture_size(int fd)
{
	struct drm_i915_gem_get_aperture aperture;

	aperture.aper_size = 256*1024*1024;
	(void)drmIoctl(fd, DRM_IOCTL_I915_GEM_GET_APERTURE, &aperture);
	return aperture.aper_size;
}

struct scratch_buf {
	drm_intel_bo *bo;
	uint32_t stride;
	uint32_t tiling;
	uint32_t *data;
	unsigned num_tiles;
};

static drm_intel_bufmgr *bufmgr;
struct intel_batchbuffer *batch;
static int drm_fd;
static int devid;
static int num_fences;

drm_intel_bo *busy_bo;

static struct {
	unsigned scratch_buf_size;
	unsigned num_buffers;
	int trace_tile;
	int no_hw;
	int gpu_busy_load;
} options;

#define MAX_BUFS		4096
#define SCRATCH_BUF_SIZE	1024*1024
#define BUSY_BUF_SIZE		(256*4096)
#define TILE_SIZE		16
#define TILE_BYTES		(TILE_SIZE*TILE_SIZE*sizeof(uint32_t))

static struct scratch_buf buffers[2][MAX_BUFS];
/* tile i is at logical position tile_permutation[i] */
static unsigned *tile_permutation;
static unsigned num_buffers = 0;
static unsigned current_set = 0;
static unsigned target_set = 0;
static unsigned num_total_tiles = 0;

#define TILES_PER_BUF (num_total_tiles / num_buffers)

static int fence_storm = 0;
static int gpu_busy_load = 10;

static void tile2xy(struct scratch_buf *buf, unsigned tile, unsigned *x, unsigned *y)
{
	assert(tile < buf->num_tiles);
	*x = (tile*TILE_SIZE) % (buf->stride/sizeof(uint32_t));
	*y = ((tile*TILE_SIZE) / (buf->stride/sizeof(uint32_t))) * TILE_SIZE;
}

/* All this gem trashing wastes too much cpu time, so give the gpu something to
 * do to increase the chances for races. */
static void keep_gpu_busy(void)
{
	uint32_t src_pitch, dst_pitch, cmd_bits;
	int tmp;

	src_pitch = 4096;
	dst_pitch = 4096;
	cmd_bits = 0;

#if 0 /* busy_bo is untiled */
	if (IS_965(devid)) {
		src_pitch /= 4;
		cmd_bits |= XY_SRC_COPY_BLT_SRC_TILED;
	}

	if (IS_965(devid)) {
		dst_pitch /= 4;
		cmd_bits |= XY_SRC_COPY_BLT_DST_TILED;
	}
#endif

	tmp = 1 << gpu_busy_load;
	assert(tmp <= 1024);

	/* copy lower half to upper half */
	BEGIN_BATCH(8);
	OUT_BATCH(XY_SRC_COPY_BLT_CMD |
		  XY_SRC_COPY_BLT_WRITE_ALPHA |
		  XY_SRC_COPY_BLT_WRITE_RGB |
		  cmd_bits);
	OUT_BATCH((3 << 24) | /* 32 bits */
		  (0xcc << 16) | /* copy ROP */
		  dst_pitch);
	OUT_BATCH(128 << 16 | 0);
	OUT_BATCH(256 << 16 | tmp);
	OUT_RELOC(busy_bo, I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER, 0);
	OUT_BATCH(0 << 16 | 0);
	OUT_BATCH(src_pitch);
	OUT_RELOC(busy_bo, I915_GEM_DOMAIN_RENDER, 0, 0);
	ADVANCE_BATCH();
}

static unsigned int copyfunc_seq = 0;
static void (*copyfunc)(struct scratch_buf *src, unsigned src_x, unsigned src_y,
			struct scratch_buf *dst, unsigned dst_x, unsigned dst_y,
			unsigned logical_tile_no);
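/*
 * Data integrity scheme: fan_out() fills every 32-bit word of every tile with
 * its global sequence number, i.e. word (i, j) of logical tile t holds
 * t*TILE_SIZE*TILE_SIZE + i*TILE_SIZE + j. cpucpy2d() below recomputes that
 * expected value on every read-back, so a corruption introduced by any of the
 * copy paths is caught the next time the tile passes through the cpu.
 */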
/* stride, x, y in units of uint32_t! */
static void cpucpy2d(uint32_t *src, unsigned src_stride, unsigned src_x, unsigned src_y,
		     uint32_t *dst, unsigned dst_stride, unsigned dst_x, unsigned dst_y,
		     unsigned logical_tile_no)
{
	int i, j;
	int failed = 0;

	for (i = 0; i < TILE_SIZE; i++) {
		for (j = 0; j < TILE_SIZE; j++) {
			unsigned dst_ofs = dst_x + j + dst_stride * (dst_y + i);
			unsigned src_ofs = src_x + j + src_stride * (src_y + i);
			unsigned expect = logical_tile_no*TILE_SIZE*TILE_SIZE
			    + i*TILE_SIZE + j;
			uint32_t tmp = src[src_ofs];

			if (tmp != expect) {
				printf("mismatch at tile %i pos %i, read %i, expected %i, diff %i\n",
				       logical_tile_no, i*TILE_SIZE + j, tmp,
				       expect, (int) tmp - expect);
				if (options.trace_tile >= 0)
					exit(1);
				failed = 1;
			}
			dst[dst_ofs] = tmp;
		}
	}

	if (failed)
		exit(1);
}

static void cpu_copyfunc(struct scratch_buf *src, unsigned src_x, unsigned src_y,
			 struct scratch_buf *dst, unsigned dst_x, unsigned dst_y,
			 unsigned logical_tile_no)
{
	cpucpy2d(src->data, src->stride/sizeof(uint32_t), src_x, src_y,
		 dst->data, dst->stride/sizeof(uint32_t), dst_x, dst_y,
		 logical_tile_no);
}

static void prw_copyfunc(struct scratch_buf *src, unsigned src_x, unsigned src_y,
			 struct scratch_buf *dst, unsigned dst_x, unsigned dst_y,
			 unsigned logical_tile_no)
{
	uint32_t tmp_tile[TILE_SIZE*TILE_SIZE];
	int i;

	if (src->tiling == I915_TILING_NONE) {
		for (i = 0; i < TILE_SIZE; i++) {
			unsigned ofs = src_x*sizeof(uint32_t) + src->stride*(src_y + i);
			drm_intel_bo_get_subdata(src->bo, ofs,
						 TILE_SIZE*sizeof(uint32_t),
						 tmp_tile + TILE_SIZE*i);
		}
	} else {
		cpucpy2d(src->data, src->stride/sizeof(uint32_t), src_x, src_y,
			 tmp_tile, TILE_SIZE, 0, 0, logical_tile_no);
	}

	if (dst->tiling == I915_TILING_NONE) {
		for (i = 0; i < TILE_SIZE; i++) {
			unsigned ofs = dst_x*sizeof(uint32_t) + dst->stride*(dst_y + i);
			drm_intel_bo_subdata(dst->bo, ofs,
					     TILE_SIZE*sizeof(uint32_t),
					     tmp_tile + TILE_SIZE*i);
		}
	} else {
		cpucpy2d(tmp_tile, TILE_SIZE, 0, 0,
			 dst->data, dst->stride/sizeof(uint32_t), dst_x, dst_y,
			 logical_tile_no);
	}
}

static void blitter_copyfunc(struct scratch_buf *src, unsigned src_x, unsigned src_y,
			     struct scratch_buf *dst, unsigned dst_x, unsigned dst_y,
			     unsigned logical_tile_no)
{
	uint32_t src_pitch, dst_pitch, cmd_bits;
	static unsigned keep_gpu_busy_counter = 0;

	src_pitch = src->stride;
	dst_pitch = dst->stride;
	cmd_bits = 0;

	/* check both edges of the fence usage */
	if (keep_gpu_busy_counter & 1 && !fence_storm)
		keep_gpu_busy();

	if (IS_965(devid) && src->tiling) {
		src_pitch /= 4;
		cmd_bits |= XY_SRC_COPY_BLT_SRC_TILED;
	}

	if (IS_965(devid) && dst->tiling) {
		dst_pitch /= 4;
		cmd_bits |= XY_SRC_COPY_BLT_DST_TILED;
	}

	BEGIN_BATCH(8);
	OUT_BATCH(XY_SRC_COPY_BLT_CMD |
		  XY_SRC_COPY_BLT_WRITE_ALPHA |
		  XY_SRC_COPY_BLT_WRITE_RGB |
		  cmd_bits);
	OUT_BATCH((3 << 24) | /* 32 bits */
		  (0xcc << 16) | /* copy ROP */
		  dst_pitch);
	OUT_BATCH(dst_y << 16 | dst_x);
	OUT_BATCH((dst_y+TILE_SIZE) << 16 | (dst_x+TILE_SIZE));
	OUT_RELOC_FENCED(dst->bo, I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER, 0);
	OUT_BATCH(src_y << 16 | src_x);
	OUT_BATCH(src_pitch);
	OUT_RELOC_FENCED(src->bo, I915_GEM_DOMAIN_RENDER, 0, 0);
	ADVANCE_BATCH();

	if (!(keep_gpu_busy_counter & 1) && !fence_storm)
		keep_gpu_busy();

	keep_gpu_busy_counter++;

	if (src->tiling)
		fence_storm--;
	if (dst->tiling)
		fence_storm--;

	if (fence_storm <= 1) {
		fence_storm = 0;
		intel_batchbuffer_flush(batch);
	}
}

static unsigned buf_width(struct scratch_buf *buf)
{
	return buf->stride/sizeof(uint32_t);
}

static unsigned buf_height(struct scratch_buf *buf)
{
	return options.scratch_buf_size/buf->stride;
}
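/*
 * Vertex data is emitted directly into the batch as raw dwords; the union in
 * the helpers below reinterprets the float's bit pattern as a uint32_t
 * without resorting to pointer casts.
 */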
static void emit_vertex(float f)
{
	union { float f; uint32_t ui; } u;
	u.f = f;
	OUT_BATCH(u.ui);
}

static void emit_vertex_normalized(float f, float total)
{
	union { float f; uint32_t ui; } u;
	u.f = f / total;
	OUT_BATCH(u.ui);
}
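/*
 * Copy one tile with the gen3 render engine: emit the (invariant) 3D pipeline
 * state, bind the source as a texture and the destination as the colour
 * buffer, load a minimal fragment shader that just samples the texture, then
 * draw a textured RECTLIST covering the destination tile.
 */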
static void gen3_render_copyfunc(struct scratch_buf *src, unsigned src_x, unsigned src_y,
				 struct scratch_buf *dst, unsigned dst_x, unsigned dst_y,
				 unsigned logical_tile_no)
{
	uint32_t src_pitch, dst_pitch, cmd_bits;
	static unsigned keep_gpu_busy_counter = 0;

	src_pitch = src->stride;
	dst_pitch = dst->stride;
	cmd_bits = 0;

	/* check both edges of the fence usage */
	if (keep_gpu_busy_counter & 1 && !fence_storm)
		keep_gpu_busy();

	/* invariant state */
	{
		OUT_BATCH(_3DSTATE_AA_CMD |
			  AA_LINE_ECAAR_WIDTH_ENABLE |
			  AA_LINE_ECAAR_WIDTH_1_0 |
			  AA_LINE_REGION_WIDTH_ENABLE |
			  AA_LINE_REGION_WIDTH_1_0);
		OUT_BATCH(_3DSTATE_INDEPENDENT_ALPHA_BLEND_CMD |
			  IAB_MODIFY_ENABLE | IAB_MODIFY_FUNC |
			  (BLENDFUNC_ADD << IAB_FUNC_SHIFT) |
			  IAB_MODIFY_SRC_FACTOR |
			  (BLENDFACT_ONE << IAB_SRC_FACTOR_SHIFT) |
			  IAB_MODIFY_DST_FACTOR |
			  (BLENDFACT_ZERO << IAB_DST_FACTOR_SHIFT));
		OUT_BATCH(_3DSTATE_DFLT_DIFFUSE_CMD);
		OUT_BATCH(0);
		OUT_BATCH(_3DSTATE_DFLT_SPEC_CMD);
		OUT_BATCH(0);
		OUT_BATCH(_3DSTATE_DFLT_Z_CMD);
		OUT_BATCH(0);
		OUT_BATCH(_3DSTATE_COORD_SET_BINDINGS |
			  CSB_TCB(0, 0) | CSB_TCB(1, 1) |
			  CSB_TCB(2, 2) | CSB_TCB(3, 3) |
			  CSB_TCB(4, 4) | CSB_TCB(5, 5) |
			  CSB_TCB(6, 6) | CSB_TCB(7, 7));
		OUT_BATCH(_3DSTATE_RASTER_RULES_CMD |
			  ENABLE_POINT_RASTER_RULE |
			  OGL_POINT_RASTER_RULE |
			  ENABLE_LINE_STRIP_PROVOKE_VRTX |
			  ENABLE_TRI_FAN_PROVOKE_VRTX |
			  LINE_STRIP_PROVOKE_VRTX(1) |
			  TRI_FAN_PROVOKE_VRTX(2) |
			  ENABLE_TEXKILL_3D_4D |
			  TEXKILL_4D);
		OUT_BATCH(_3DSTATE_MODES_4_CMD |
			  ENABLE_LOGIC_OP_FUNC |
			  LOGIC_OP_FUNC(LOGICOP_COPY) |
			  ENABLE_STENCIL_WRITE_MASK |
			  STENCIL_WRITE_MASK(0xff) |
			  ENABLE_STENCIL_TEST_MASK |
			  STENCIL_TEST_MASK(0xff));
		OUT_BATCH(_3DSTATE_LOAD_STATE_IMMEDIATE_1 |
			  I1_LOAD_S(3) | I1_LOAD_S(4) | I1_LOAD_S(5) | 2);
		OUT_BATCH(0x00000000); /* Disable texture coordinate wrap-shortest */
		OUT_BATCH((1 << S4_POINT_WIDTH_SHIFT) |
			  S4_LINE_WIDTH_ONE |
			  S4_CULLMODE_NONE |
			  S4_VFMT_XY);
		OUT_BATCH(0x00000000); /* Stencil. */
		OUT_BATCH(_3DSTATE_SCISSOR_ENABLE_CMD |
			  DISABLE_SCISSOR_RECT);
		OUT_BATCH(_3DSTATE_SCISSOR_RECT_0_CMD);
		OUT_BATCH(0);
		OUT_BATCH(0);
		OUT_BATCH(_3DSTATE_DEPTH_SUBRECT_DISABLE);
		OUT_BATCH(_3DSTATE_LOAD_INDIRECT | 0); /* disable indirect state */
		OUT_BATCH(0);
		OUT_BATCH(_3DSTATE_STIPPLE);
		OUT_BATCH(0x00000000);
		OUT_BATCH(_3DSTATE_BACKFACE_STENCIL_OPS | BFO_ENABLE_STENCIL_TWO_SIDE | 0);
	}

	/* sampler state */
	{
#define TEX_COUNT 1
		uint32_t tiling_bits = 0;
		if (src->tiling != I915_TILING_NONE)
			tiling_bits = MS3_TILED_SURFACE;
		if (src->tiling == I915_TILING_Y)
			tiling_bits |= MS3_TILE_WALK;

		OUT_BATCH(_3DSTATE_MAP_STATE | (3 * TEX_COUNT));
		OUT_BATCH((1 << TEX_COUNT) - 1);
		OUT_RELOC(src->bo, I915_GEM_DOMAIN_SAMPLER, 0, 0);
		OUT_BATCH(MAPSURF_32BIT | MT_32BIT_ARGB8888 |
			  tiling_bits |
			  (buf_height(src) - 1) << MS3_HEIGHT_SHIFT |
			  (buf_width(src) - 1) << MS3_WIDTH_SHIFT);
		OUT_BATCH((src->stride/4-1) << MS4_PITCH_SHIFT);

		OUT_BATCH(_3DSTATE_SAMPLER_STATE | (3 * TEX_COUNT));
		OUT_BATCH((1 << TEX_COUNT) - 1);
		OUT_BATCH(MIPFILTER_NONE << SS2_MIP_FILTER_SHIFT |
			  FILTER_NEAREST << SS2_MAG_FILTER_SHIFT |
			  FILTER_NEAREST << SS2_MIN_FILTER_SHIFT);
		OUT_BATCH(SS3_NORMALIZED_COORDS |
			  TEXCOORDMODE_WRAP << SS3_TCX_ADDR_MODE_SHIFT |
			  TEXCOORDMODE_WRAP << SS3_TCY_ADDR_MODE_SHIFT |
			  0 << SS3_TEXTUREMAP_INDEX_SHIFT);
		OUT_BATCH(0x00000000);
	}

	/* render target state */
	{
		uint32_t tiling_bits = 0;
		if (dst->tiling != I915_TILING_NONE)
			tiling_bits = BUF_3D_TILED_SURFACE;
		if (dst->tiling == I915_TILING_Y)
			tiling_bits |= BUF_3D_TILE_WALK_Y;

		OUT_BATCH(_3DSTATE_BUF_INFO_CMD);
		OUT_BATCH(BUF_3D_ID_COLOR_BACK | tiling_bits |
			  BUF_3D_PITCH(dst->stride));
		OUT_RELOC(dst->bo, I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER, 0);

		OUT_BATCH(_3DSTATE_DST_BUF_VARS_CMD);
		OUT_BATCH(COLR_BUF_ARGB8888 |
			  DSTORG_HORT_BIAS(0x8) |
			  DSTORG_VERT_BIAS(0x8));

		/* draw rect is unconditional */
		OUT_BATCH(_3DSTATE_DRAW_RECT_CMD);
		OUT_BATCH(0x00000000);
		OUT_BATCH(0x00000000); /* ymin, xmin */
		OUT_BATCH(DRAW_YMAX(buf_height(dst) - 1) |
			  DRAW_XMAX(buf_width(dst) - 1));
		/* yorig, xorig (relate to color buffer?) */
		OUT_BATCH(0x00000000);
	}

	/* texfmt */
	{
		uint32_t ss2 = ~0;
		ss2 &= ~S2_TEXCOORD_FMT(0, TEXCOORDFMT_NOT_PRESENT);
		ss2 |= S2_TEXCOORD_FMT(0, TEXCOORDFMT_2D);
		OUT_BATCH(_3DSTATE_LOAD_STATE_IMMEDIATE_1 |
			  I1_LOAD_S(2) | I1_LOAD_S(6) | 1);
		OUT_BATCH(ss2);
		OUT_BATCH(S6_CBUF_BLEND_ENABLE | S6_COLOR_WRITE_ENABLE |
			  BLENDFUNC_ADD << S6_CBUF_BLEND_FUNC_SHIFT |
			  BLENDFACT_ONE << S6_CBUF_SRC_BLEND_FACT_SHIFT |
			  BLENDFACT_ZERO << S6_CBUF_DST_BLEND_FACT_SHIFT);

		OUT_BATCH(_3DSTATE_LOAD_STATE_IMMEDIATE_1 |
			  I1_LOAD_S(0) | I1_LOAD_S(1) | 1);
		OUT_BATCH(0); /* no vbo */
		OUT_BATCH((4 << S1_VERTEX_WIDTH_SHIFT) |
			  (4 << S1_VERTEX_PITCH_SHIFT));
	}

	/* fragment shader */
	{
		OUT_BATCH(_3DSTATE_PIXEL_SHADER_PROGRAM | (1 + 3*3 - 2));
		/* decl FS_T0 */
		OUT_BATCH(D0_DCL |
			  REG_TYPE(FS_T0) << D0_TYPE_SHIFT |
			  REG_NR(FS_T0) << D0_NR_SHIFT |
			  ((REG_TYPE(FS_T0) != REG_TYPE_S) ? D0_CHANNEL_ALL : 0));
		OUT_BATCH(0);
		OUT_BATCH(0);
		/* decl FS_S0 */
		OUT_BATCH(D0_DCL |
			  (REG_TYPE(FS_S0) << D0_TYPE_SHIFT) |
			  (REG_NR(FS_S0) << D0_NR_SHIFT) |
			  ((REG_TYPE(FS_S0) != REG_TYPE_S) ? D0_CHANNEL_ALL : 0));
		OUT_BATCH(0);
		OUT_BATCH(0);
		/* texld(FS_OC, FS_S0, FS_T0) */
		OUT_BATCH(T0_TEXLD |
			  (REG_TYPE(FS_OC) << T0_DEST_TYPE_SHIFT) |
			  (REG_NR(FS_OC) << T0_DEST_NR_SHIFT) |
			  (REG_NR(FS_S0) << T0_SAMPLER_NR_SHIFT));
		OUT_BATCH((REG_TYPE(FS_T0) << T1_ADDRESS_REG_TYPE_SHIFT) |
			  (REG_NR(FS_T0) << T1_ADDRESS_REG_NR_SHIFT));
		OUT_BATCH(0);
	}

	OUT_BATCH(PRIM3D_RECTLIST | (3*4 - 1));
	emit_vertex(dst_x + TILE_SIZE);
	emit_vertex(dst_y + TILE_SIZE);
	emit_vertex_normalized(src_x + TILE_SIZE, buf_width(src));
	emit_vertex_normalized(src_y + TILE_SIZE, buf_height(src));

	emit_vertex(dst_x);
	emit_vertex(dst_y + TILE_SIZE);
	emit_vertex_normalized(src_x, buf_width(src));
	emit_vertex_normalized(src_y + TILE_SIZE, buf_height(src));

	emit_vertex(dst_x);
	emit_vertex(dst_y);
	emit_vertex_normalized(src_x, buf_width(src));
	emit_vertex_normalized(src_y, buf_height(src));

	if (!(keep_gpu_busy_counter & 1) && !fence_storm)
		keep_gpu_busy();

	keep_gpu_busy_counter++;

	intel_batchbuffer_flush(batch);
}

static void render_copyfunc(struct scratch_buf *src, unsigned src_x, unsigned src_y,
			    struct scratch_buf *dst, unsigned dst_x, unsigned dst_y,
			    unsigned logical_tile_no)
{
	if (IS_GEN3(devid))
		gen3_render_copyfunc(src, src_x, src_y,
				     dst, dst_x, dst_y,
				     logical_tile_no);
	else
		blitter_copyfunc(src, src_x, src_y,
				 dst, dst_x, dst_y,
				 logical_tile_no);
}

static void next_copyfunc(int tile)
{
	if (fence_storm) {
		if (tile == options.trace_tile)
			printf(" using fence storm\n");
		return;
	}

	if (copyfunc_seq % 61 == 0) {
		if (tile == options.trace_tile)
			printf(" using fence storm\n");
		fence_storm = num_fences;
		copyfunc = blitter_copyfunc;
	} else if (copyfunc_seq % 17 == 0) {
		if (tile == options.trace_tile)
			printf(" using cpu\n");
		copyfunc = cpu_copyfunc;
	} else if (copyfunc_seq % 19 == 0) {
		if (tile == options.trace_tile)
			printf(" using prw\n");
		copyfunc = prw_copyfunc;
	} else if (copyfunc_seq % 3 == 0) {
		if (tile == options.trace_tile)
			printf(" using render\n");
		copyfunc = render_copyfunc;
	} else {
		if (tile == options.trace_tile)
			printf(" using blitter\n");
		copyfunc = blitter_copyfunc;
	}

	copyfunc_seq++;
}

static void fan_out(void)
{
	uint32_t tmp_tile[TILE_SIZE*TILE_SIZE];
	uint32_t seq = 0;
	int i, k;
	unsigned tile, buf_idx, x, y;

	for (i = 0; i < num_total_tiles; i++) {
		tile = i;
		buf_idx = tile / TILES_PER_BUF;
		tile %= TILES_PER_BUF;

		tile2xy(&buffers[current_set][buf_idx], tile, &x, &y);

		for (k = 0; k < TILE_SIZE*TILE_SIZE; k++)
			tmp_tile[k] = seq++;

		cpucpy2d(tmp_tile, TILE_SIZE, 0, 0,
			 buffers[current_set][buf_idx].data,
			 buffers[current_set][buf_idx].stride / sizeof(uint32_t),
			 x, y, i);
	}

	for (i = 0; i < num_total_tiles; i++)
		tile_permutation[i] = i;
}

static void fan_in_and_check(void)
{
	uint32_t tmp_tile[TILE_SIZE*TILE_SIZE];
	unsigned tile, buf_idx, x, y;
	int i;

	for (i = 0; i < num_total_tiles; i++) {
		tile = tile_permutation[i];

		buf_idx = tile / TILES_PER_BUF;
		tile %= TILES_PER_BUF;

		tile2xy(&buffers[current_set][buf_idx], tile, &x, &y);

		cpucpy2d(buffers[current_set][buf_idx].data,
			 buffers[current_set][buf_idx].stride / sizeof(uint32_t),
			 x, y,
			 tmp_tile, TILE_SIZE, 0, 0,
			 i);
	}
}

static void init_buffer(struct scratch_buf *buf, unsigned size)
{
	buf->bo = drm_intel_bo_alloc(bufmgr, "tiled bo", size, 4096);
	assert(buf->bo);
	buf->tiling = I915_TILING_NONE;
	buf->stride = 8192;

	if (options.no_hw)
		buf->data = malloc(size);
	else {
		drm_intel_gem_bo_map_gtt(buf->bo);
		buf->data = buf->bo->virtual;
	}

	buf->num_tiles = size / TILE_BYTES;
}
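/*
 * Fisher-Yates style shuffle: walk the array from the end and swap each
 * element with a randomly chosen earlier one. random() % (i+1) carries a
 * slight modulo bias, which the inline comment below already shrugs off.
 */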
static void permute_array(void *array, unsigned size,
			  void (*exchange_func)(void *array, unsigned i, unsigned j))
{
	int i;
	long int l;

	for (i = size - 1; i > 1; i--) {
		l = random();
		l %= i+1; /* yes, not perfectly uniform, who cares */
		exchange_func(array, i, l);
	}
}

static void exchange_buf(void *array, unsigned i, unsigned j)
{
	struct scratch_buf *buf_arr, tmp;
	buf_arr = array;

	memcpy(&tmp, &buf_arr[i], sizeof(struct scratch_buf));
	memcpy(&buf_arr[i], &buf_arr[j], sizeof(struct scratch_buf));
	memcpy(&buf_arr[j], &tmp, sizeof(struct scratch_buf));
}

/* libdrm is too clever and prevents us from changing the tiling of buffers
 * already used in relocations. */
static void set_tiling(drm_intel_bo *bo, unsigned *tiling, unsigned stride)
{
	struct drm_i915_gem_set_tiling set_tiling;
	int ret;

	memset(&set_tiling, 0, sizeof(set_tiling));
	do {
		/* set_tiling is slightly broken and overwrites the
		 * input on the error path, so we have to open code
		 * drmIoctl. */
		set_tiling.handle = bo->handle;
		set_tiling.tiling_mode = *tiling;
		set_tiling.stride = *tiling ? stride : 0;

		ret = ioctl(drm_fd, DRM_IOCTL_I915_GEM_SET_TILING, &set_tiling);
	} while (ret == -1 && (errno == EINTR || errno == EAGAIN));
	assert(ret != -1);

	*tiling = set_tiling.tiling_mode;
}

static void init_set(unsigned set)
{
	long int r;
	int i;

	permute_array(buffers[set], num_buffers, exchange_buf);

	if (current_set == 1 && options.gpu_busy_load == 0) {
		gpu_busy_load++;
		if (gpu_busy_load > 10)
			gpu_busy_load = 6;
	}

	for (i = 0; i < num_buffers; i++) {
		r = random();
		if ((r & 3) != 0)
			continue;
		r >>= 2;

		if ((r & 3) != 0)
			buffers[set][i].tiling = I915_TILING_X;
		else
			buffers[set][i].tiling = I915_TILING_NONE;
		r >>= 2;

		if (buffers[set][i].tiling == I915_TILING_NONE) {
			/* min 64 byte stride */
			r %= 8;
			buffers[set][i].stride = 64 * (1 << r);
		} else if (IS_GEN2(devid)) {
			/* min 128 byte stride */
			r %= 7;
			buffers[set][i].stride = 128 * (1 << r);
		} else {
			/* min 512 byte stride */
			r %= 5;
			buffers[set][i].stride = 512 * (1 << r);
		}

		assert(buffers[set][i].stride <= 8192);

		set_tiling(buffers[set][i].bo,
			   &buffers[set][i].tiling,
			   buffers[set][i].stride);

		if (i == options.trace_tile/TILES_PER_BUF)
			printf("changing buffer %i containing tile %i: tiling %i, stride %i\n",
			       i, options.trace_tile,
			       buffers[set][i].tiling, buffers[set][i].stride);
	}
}

static void exchange_uint(void *array, unsigned i, unsigned j)
{
	unsigned *i_arr = array;
	unsigned i_tmp;

	i_tmp = i_arr[i];
	i_arr[i] = i_arr[j];
	i_arr[j] = i_tmp;
}

static void copy_tiles(unsigned *permutation)
{
	unsigned src_tile, src_buf_idx, src_x, src_y;
	unsigned dst_tile, dst_buf_idx, dst_x, dst_y;
	struct scratch_buf *src_buf, *dst_buf;
	int i, idx;

	for (i = 0; i < num_total_tiles; i++) {
		/* tile_permutation is independent of current_permutation, so
		 * abuse it to randomize the order of the src bos */
		idx = tile_permutation[i];
		src_buf_idx = idx / TILES_PER_BUF;
		src_tile = idx % TILES_PER_BUF;
		src_buf = &buffers[current_set][src_buf_idx];

		tile2xy(src_buf, src_tile, &src_x, &src_y);

		dst_buf_idx = permutation[idx] / TILES_PER_BUF;
		dst_tile = permutation[idx] % TILES_PER_BUF;
		dst_buf = &buffers[target_set][dst_buf_idx];

		tile2xy(dst_buf, dst_tile, &dst_x, &dst_y);

		if (options.trace_tile == i)
			printf("copying tile %i from %i (%i, %i) to %i (%i, %i)", i,
			       tile_permutation[i], src_buf_idx, src_tile,
			       permutation[idx], dst_buf_idx, dst_tile);

		if (options.no_hw) {
			cpucpy2d(src_buf->data,
				 src_buf->stride / sizeof(uint32_t),
				 src_x, src_y,
				 dst_buf->data,
				 dst_buf->stride / sizeof(uint32_t),
				 dst_x, dst_y,
				 i);
		} else {
			next_copyfunc(i);

			copyfunc(src_buf, src_x, src_y, dst_buf, dst_x, dst_y, i);
		}
	}

	intel_batchbuffer_flush(batch);
}
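/*
 * Query how many fence registers the kernel exposes via
 * I915_PARAM_NUM_FENCES_AVAIL and hold two of them back, presumably as
 * headroom for other users, so the fence storm in next_copyfunc() never
 * claims every register at once.
 */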
static int get_num_fences(void)
{
	drm_i915_getparam_t gp;
	int ret, val;

	gp.param = I915_PARAM_NUM_FENCES_AVAIL;
	gp.value = &val;
	ret = drmIoctl(drm_fd, DRM_IOCTL_I915_GETPARAM, &gp);
	assert(ret == 0);

	printf("total %d fences\n", val);
	assert(val > 4);

	return val - 2;
}

static void parse_options(int argc, char **argv)
{
	int c, tmp;
	int option_index = 0;
	static struct option long_options[] = {
		{"no-hw", 0, 0, 'd'},
		{"buf-size", 1, 0, 's'},
		{"gpu-busy-load", 1, 0, 'g'},
		{"buffer-count", 1, 0, 'c'},
		{"trace-tile", 1, 0, 't'}
	};

	options.scratch_buf_size = 256*4096;
	options.no_hw = 0;
	options.gpu_busy_load = 0;
	options.num_buffers = 0;
	options.trace_tile = -1;

	while((c = getopt_long(argc, argv, "ns:g:c:t:",
			       long_options, &option_index)) != -1) {
		switch(c) {
		case 'd':
			options.no_hw = 1;
			printf("no-hw debug mode\n");
			break;
		case 's':
			tmp = atoi(optarg);
			if (tmp < TILE_SIZE*8192)
				printf("scratch buffer size needs to be at least %i\n",
				       TILE_SIZE*8192);
			else if (tmp & (tmp - 1)) {
				printf("scratch buffer size needs to be a power-of-two\n");
			} else {
				printf("fixed scratch buffer size to %u\n", tmp);
				options.scratch_buf_size = tmp;
			}
			break;
		case 'g':
			tmp = atoi(optarg);
			if (tmp < 0 || tmp > 10)
				printf("gpu busy load needs to be between 0 and 10\n");
			else {
				printf("gpu busy load factor set to %i\n", tmp);
				gpu_busy_load = options.gpu_busy_load = tmp;
			}
			break;
		case 'c':
			options.num_buffers = atoi(optarg);
			printf("buffer count set to %i\n", options.num_buffers);
			break;
		case 't':
			options.trace_tile = atoi(optarg);
			printf("tracing tile %i\n", options.trace_tile);
			break;
		default:
			printf("unknown command option\n");
			break;
		}
	}

	if (optind < argc)
		printf("unknown command options\n");
}
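/*
 * Default working-set sizing: when no buffer count is given, use roughly two
 * thirds of min(aperture size, 256MB) in total, split evenly between the two
 * sets of scratch_buf_size-sized buffers.
 */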
static void init(void)
{
	int i;
	unsigned tmp;

	drm_fd = drm_open_any();

	if (options.num_buffers == 0) {
		tmp = gem_aperture_size(drm_fd);
		tmp = tmp > 256*(1024*1024) ? 256*(1024*1024) : tmp;
		num_buffers = 2 * tmp / options.scratch_buf_size / 3;
		num_buffers /= 2;
		printf("using %u buffers\n", num_buffers);
	} else
		num_buffers = options.num_buffers;

	bufmgr = drm_intel_bufmgr_gem_init(drm_fd, 4096);
	drm_intel_bufmgr_gem_enable_reuse(bufmgr);
	drm_intel_bufmgr_gem_enable_fenced_relocs(bufmgr);
	devid = intel_get_drm_devid(drm_fd);
	num_fences = get_num_fences();
	batch = intel_batchbuffer_alloc(bufmgr, devid);

	busy_bo = drm_intel_bo_alloc(bufmgr, "tiled bo", BUSY_BUF_SIZE, 4096);

	for (i = 0; i < num_buffers; i++) {
		init_buffer(&buffers[0][i], options.scratch_buf_size);
		init_buffer(&buffers[1][i], options.scratch_buf_size);

		num_total_tiles += buffers[0][i].num_tiles;
	}
	current_set = 0;

	/* just in case it helps reproducibility */
	srandom(0xdeadbeef);
}

int main(int argc, char **argv)
{
	int i, j;
	unsigned *current_permutation, *tmp_permutation;

	parse_options(argc, argv);

	init();

	tile_permutation = malloc(num_total_tiles*sizeof(uint32_t));
	current_permutation = malloc(num_total_tiles*sizeof(uint32_t));
	tmp_permutation = malloc(num_total_tiles*sizeof(uint32_t));
	assert(tile_permutation);
	assert(current_permutation);
	assert(tmp_permutation);

	fan_out();

	for (i = 0; i < 512; i++) {
		printf("round %i\n", i);
		if (i % 64 == 63) {
			fan_in_and_check();
			printf("everything correct after %i rounds\n", i + 1);
		}

		target_set = (current_set + 1) & 1;
		init_set(target_set);

		for (j = 0; j < num_total_tiles; j++)
			current_permutation[j] = j;
		permute_array(current_permutation, num_total_tiles, exchange_uint);

		copy_tiles(current_permutation);

		memcpy(tmp_permutation, tile_permutation,
		       sizeof(unsigned)*num_total_tiles);

		/* accumulate the permutations */
		for (j = 0; j < num_total_tiles; j++)
			tile_permutation[j] = current_permutation[tmp_permutation[j]];

		current_set = target_set;
	}

	fan_in_and_check();

	intel_batchbuffer_free(batch);
	drm_intel_bufmgr_destroy(bufmgr);

	close(drm_fd);

	return 0;
}
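/*
 * Example invocations (binary name assumed from the file name; only the long
 * options are shown since they map directly onto the handlers in
 * parse_options() above):
 *
 *   gem_stress                                      # auto-sized buffer count
 *   gem_stress --buffer-count 64 --gpu-busy-load 5
 *   gem_stress --no-hw                              # cpu copies only, to debug the test itself
 *   gem_stress --trace-tile 17                      # log every operation on logical tile 17
 */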