lib: Add a GPU error detector

If we listen to the uevents from the kernel, we can detect when the GPU
hangs. This requires us to fork a helper process to do so and send a
signal back to the parent.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
This commit is contained in:
Chris Wilson 2016-03-22 11:33:41 +00:00
parent eb572106b4
commit 756f3e0cb7
9 changed files with 102 additions and 8 deletions

View File

@ -3,7 +3,7 @@ include Makefile.sources
AM_CPPFLAGS = -I$(top_srcdir) -I$(top_srcdir)/lib
AM_CFLAGS = $(DRM_CFLAGS) $(CWARNFLAGS) $(CAIRO_CFLAGS) $(LIBUNWIND_CFLAGS)
LDADD = $(top_builddir)/lib/libintel_tools.la $(DRM_LIBS) $(PCIACCESS_LIBS) $(CAIRO_LIBS) $(LIBUNWIND_LIBS) $(TIMER_LIBS) -lm
LDADD = $(top_builddir)/lib/libintel_tools.la
benchmarks_LTLIBRARIES = gem_exec_tracer.la
gem_exec_tracer_la_LDFLAGS = -module -avoid-version -no-undefined

View File

@ -15,4 +15,4 @@ AM_CFLAGS = \
$(LIBUNWIND_CFLAGS) \
$(CWARNFLAGS)
LDADD = $(top_builddir)/lib/libintel_tools.la $(DRM_LIBS) $(PCIACCESS_LIBS) $(CAIRO_LIBS) $(LIBUNWIND_LIBS) $(TIMER_LIBS)
LDADD = $(top_builddir)/lib/libintel_tools.la

View File

@ -4,4 +4,4 @@ bin_PROGRAMS = \
AM_CPPFLAGS = -I$(top_srcdir) -I$(top_srcdir)/lib
AM_CFLAGS = $(DRM_CFLAGS) $(PCIACCESS_CFLAGS) $(CWARNFLAGS) $(CAIRO_CFLAGS) $(LIBUNWIND_CFLAGS)
LDADD = $(top_builddir)/lib/libintel_tools.la $(DRM_LIBS) $(PCIACCESS_LIBS) $(CAIRO_LIBS) $(LIBUNWIND_LIBS) $(TIMER_LIBS)
LDADD = $(top_builddir)/lib/libintel_tools.la

View File

@ -15,12 +15,20 @@ if HAVE_VC4
endif
AM_CPPFLAGS = -I$(top_srcdir)
AM_CFLAGS = $(DRM_CFLAGS) $(CWARNFLAGS) $(LIBUNWIND_CFLAGS) $(DEBUG_CFLAGS) \
AM_CFLAGS = $(CWARNFLAGS) $(DRM_CFLAGS) $(PCIACCESS_CFLAGS) $(LIBUNWIND_CFLAGS) $(DEBUG_CFLAGS) \
-DIGT_SRCDIR=\""$(abs_top_srcdir)/tests"\" \
-DIGT_DATADIR=\""$(pkgdatadir)"\" \
-DIGT_LOG_DOMAIN=\""$(subst _,-,$*)"\" \
-pthread
LDADD = $(CAIRO_LIBS) $(LIBUNWIND_LIBS) $(TIMER_LIBS) -lm
AM_CFLAGS += $(CAIRO_CFLAGS)
libintel_tools_la_LIBADD = \
$(DRM_LIBS) \
$(PCIACCESS_LIBS) \
$(CAIRO_LIBS) \
$(LIBUDEV_LIBS) \
$(LIBUNWIND_LIBS) \
$(TIMER_LIBS) \
-lm

View File

@ -42,6 +42,7 @@
#include <stdlib.h>
#include <time.h>
#include <unistd.h>
#include <sys/poll.h>
#include <sys/wait.h>
#include <sys/time.h>
#include <sys/types.h>
@ -359,6 +360,85 @@ void igt_stop_signal_helper(void)
sig_stat = 0;
}
#if HAVE_UDEV
#include <libudev.h>
/*
 * State of the hang-detector helper process: handed to igt_fork_helper() by
 * igt_fork_hang_detector() and to igt_stop_helper() by
 * igt_stop_hang_detector().
 */
static struct igt_helper_process hang_detector;
/*
 * Body of the hang-detector helper process; never returns.
 *
 * Opens a udev monitor on the "kernel" netlink source, filtered to the "drm"
 * subsystem. Whenever an event arrives for the device whose device number
 * equals @rdev and whose "ERROR" property parses to 1, SIGRTMAX is sent to
 * @pid.
 *
 * The process exits (status 0, as before) when:
 *  - the monitor cannot be created or enabled,
 *  - poll() fails or reports an error condition on the monitor fd, or
 *  - @pid can no longer be signalled (the parent has died).
 *
 * @pid:  process to notify
 * @rdev: device number of the device to watch
 */
static void __attribute__((noreturn))
hang_detector_process(pid_t pid, dev_t rdev)
{
	/* Upper bound, in milliseconds, on how long a dead parent goes unnoticed. */
	enum { PARENT_CHECK_INTERVAL_MS = 2000 };
	struct udev *udev = udev_new();
	struct udev_monitor *mon = NULL;
	struct pollfd pfd;
	int ret;

	if (udev != NULL)
		mon = udev_monitor_new_from_netlink(udev, "kernel");
	/* Without a working monitor there is nothing to wait for. */
	if (mon == NULL)
		exit(0);

	/*
	 * A failed filter only means more events are delivered; the device
	 * number comparison below still discards the irrelevant ones.
	 */
	udev_monitor_filter_add_match_subsystem_devtype(mon, "drm", NULL);
	if (udev_monitor_enable_receiving(mon) < 0)
		exit(0);

	pfd.fd = udev_monitor_get_fd(mon);
	pfd.events = POLLIN;
	pfd.revents = 0;

	/*
	 * Use a finite timeout so that the parent liveness check runs
	 * periodically even when no uevent ever arrives.
	 */
	while ((ret = poll(&pfd, 1, PARENT_CHECK_INTERVAL_MS)) >= 0) {
		struct udev_device *dev;
		dev_t devnum;

		if (kill(pid, 0)) /* Parent has died, so must we. */
			break;

		if (ret == 0) /* Timeout: nothing to read, just re-check. */
			continue;

		/* POLLERR/POLLHUP/POLLNVAL: the monitor is unusable. */
		if (!(pfd.revents & POLLIN))
			break;

		/*
		 * A NULL device means this particular message could not be
		 * turned into a device; keep listening for the next one.
		 */
		dev = udev_monitor_receive_device(mon);
		if (dev == NULL)
			continue;

		devnum = udev_device_get_devnum(dev);
		if (memcmp(&rdev, &devnum, sizeof(dev_t)) == 0) {
			const char *str;

			str = udev_device_get_property_value(dev, "ERROR");
			if (str && atoi(str) == 1)
				kill(pid, SIGRTMAX);
		}

		udev_device_unref(dev);
	}

	exit(0);
}
static void sig_abort(int sig)
{
igt_assert(!"GPU hung");
}
/**
 * igt_fork_hang_detector:
 * @fd: open file descriptor of the device to watch
 *
 * Installs sig_abort() as the SIGRTMAX handler and starts the hang-detector
 * helper, which is given the parent pid and the device number (st_rdev) of
 * @fd. Does nothing when igt_only_list_subtests() is true. The fstat() of
 * @fd is asserted to succeed.
 */
void igt_fork_hang_detector(int fd)
{
struct stat st;
if (igt_only_list_subtests())
return;
igt_assert(fstat(fd, &st) == 0);
/* Install the handler before the helper exists, so no notification can arrive unhandled. */
signal(SIGRTMAX, sig_abort);
/* NOTE(review): the statement after igt_fork_helper() presumably runs only in the forked helper (hence getppid()) — confirm against the macro. */
igt_fork_helper(&hang_detector)
hang_detector_process(getppid(), st.st_rdev);
}
/**
 * igt_stop_hang_detector:
 *
 * Stops the helper process tracked in hang_detector via igt_stop_helper().
 * Nothing is stopped when igt_only_list_subtests() is true, mirroring
 * igt_fork_hang_detector(), which starts nothing in that case.
 */
void igt_stop_hang_detector(void)
{
	if (!igt_only_list_subtests()) {
		igt_stop_helper(&hang_detector);
	}
}
#else
/*
 * Fallback for builds where HAVE_UDEV is not set: no hang detector is
 * started.
 *
 * @fd: unused
 */
void igt_fork_hang_detector(int fd)
{
	/* The result is deliberately ignored: either answer leads to a no-op. */
	(void)igt_only_list_subtests();
}
/*
 * Fallback for builds where HAVE_UDEV is not set: no helper was started, so
 * there is nothing to stop.
 */
void igt_stop_hang_detector(void)
{
}
#endif
/**
* igt_check_boolean_env_var:
* @env_var: environment variable name

View File

@ -40,6 +40,9 @@ extern int num_trash_bos;
void igt_fork_signal_helper(void);
void igt_stop_signal_helper(void);
/* Start the helper that watches the device behind @fd for GPU error uevents (a no-op when built without udev support). */
void igt_fork_hang_detector(int fd);
/* Stop the helper started by igt_fork_hang_detector(). */
void igt_stop_hang_detector(void);
/* NOTE(review): iteration state for the igt_sigiter helpers, which are not visible in this hunk — confirm semantics against its users. */
struct igt_sigiter {
/* presumably the index of the current pass — TODO confirm */
unsigned pass;
};

View File

@ -56,9 +56,8 @@ AM_CFLAGS = $(DRM_CFLAGS) $(CWARNFLAGS) $(DEBUG_CFLAGS)\
$(LIBUNWIND_CFLAGS) \
$(NULL)
LDADD = ../lib/libintel_tools.la $(PCIACCESS_LIBS) $(DRM_LIBS) $(LIBUNWIND_LIBS) $(TIMER_LIBS)
LDADD = ../lib/libintel_tools.la $(GLIB_LIBS)
LDADD += $(CAIRO_LIBS) $(LIBUDEV_LIBS) $(GLIB_LIBS) -lm
AM_CFLAGS += $(CAIRO_CFLAGS) $(LIBUDEV_CFLAGS) $(GLIB_CFLAGS)
AM_LDFLAGS = -Wl,--as-needed

View File

@ -368,6 +368,8 @@ igt_main
igt_fixture
fd = drm_open_driver_master(DRIVER_INTEL);
igt_fork_hang_detector(fd);
for (const struct mode *m = modes; m->name; m++)
igt_subtest_f("%s", *m->name ? m->name : "basic")
whisper(fd, -1, m->flags);
@ -382,6 +384,8 @@ igt_main
whisper(fd, e->exec_id | e->flags, m->flags);
}
igt_stop_hang_detector();
igt_fixture
close(fd);
}

View File

@ -4,7 +4,7 @@ SUBDIRS = null_state_gen registers
AM_CPPFLAGS = -I$(top_srcdir) -I$(top_srcdir)/lib
AM_CFLAGS = $(DEBUG_CFLAGS) $(DRM_CFLAGS) $(PCIACCESS_CFLAGS) $(CWARNFLAGS) $(CAIRO_CFLAGS) $(LIBUNWIND_CFLAGS) -DPKGDATADIR=\"$(pkgdatadir)\"
LDADD = $(top_builddir)/lib/libintel_tools.la $(DRM_LIBS) $(PCIACCESS_LIBS) $(CAIRO_LIBS) $(LIBUDEV_LIBS) $(LIBUNWIND_LIBS) $(TIMER_LIBS) -lm
LDADD = $(top_builddir)/lib/libintel_tools.la
AM_LDFLAGS = -Wl,--as-needed