mirror of
				https://github.com/tiagovignatti/intel-gpu-tools.git
				synced 2025-11-04 03:58:27 +00:00 
			
		
		
		
	lib: Add a GPU error detector
If we listen to the uevents from the kernel, we can detect when the GPU hangs. Doing so requires us to fork a helper process that listens for those events and sends a signal back to the parent when a hang is reported. Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
This commit is contained in:
		
							parent
							
								
									eb572106b4
								
							
						
					
					
						commit
						756f3e0cb7
					
				@ -3,7 +3,7 @@ include Makefile.sources
 | 
			
		||||
 | 
			
		||||
AM_CPPFLAGS = -I$(top_srcdir) -I$(top_srcdir)/lib
 | 
			
		||||
AM_CFLAGS = $(DRM_CFLAGS) $(CWARNFLAGS) $(CAIRO_CFLAGS) $(LIBUNWIND_CFLAGS)
 | 
			
		||||
LDADD = $(top_builddir)/lib/libintel_tools.la $(DRM_LIBS) $(PCIACCESS_LIBS) $(CAIRO_LIBS) $(LIBUNWIND_LIBS) $(TIMER_LIBS) -lm
 | 
			
		||||
LDADD = $(top_builddir)/lib/libintel_tools.la
 | 
			
		||||
 | 
			
		||||
benchmarks_LTLIBRARIES = gem_exec_tracer.la
 | 
			
		||||
gem_exec_tracer_la_LDFLAGS = -module -avoid-version -no-undefined
 | 
			
		||||
 | 
			
		||||
@ -15,4 +15,4 @@ AM_CFLAGS = 			\
 | 
			
		||||
	$(LIBUNWIND_CFLAGS)	\
 | 
			
		||||
	$(CWARNFLAGS)
 | 
			
		||||
 | 
			
		||||
LDADD = $(top_builddir)/lib/libintel_tools.la $(DRM_LIBS) $(PCIACCESS_LIBS) $(CAIRO_LIBS) $(LIBUNWIND_LIBS) $(TIMER_LIBS)
 | 
			
		||||
LDADD = $(top_builddir)/lib/libintel_tools.la
 | 
			
		||||
 | 
			
		||||
@ -4,4 +4,4 @@ bin_PROGRAMS = 				\
 | 
			
		||||
 | 
			
		||||
AM_CPPFLAGS = -I$(top_srcdir) -I$(top_srcdir)/lib
 | 
			
		||||
AM_CFLAGS = $(DRM_CFLAGS) $(PCIACCESS_CFLAGS) $(CWARNFLAGS) $(CAIRO_CFLAGS) $(LIBUNWIND_CFLAGS)
 | 
			
		||||
LDADD = $(top_builddir)/lib/libintel_tools.la $(DRM_LIBS) $(PCIACCESS_LIBS) $(CAIRO_LIBS) $(LIBUNWIND_LIBS) $(TIMER_LIBS)
 | 
			
		||||
LDADD = $(top_builddir)/lib/libintel_tools.la
 | 
			
		||||
 | 
			
		||||
@ -15,12 +15,20 @@ if HAVE_VC4
 | 
			
		||||
endif
 | 
			
		||||
 | 
			
		||||
AM_CPPFLAGS = -I$(top_srcdir)
 | 
			
		||||
AM_CFLAGS = $(DRM_CFLAGS) $(CWARNFLAGS) $(LIBUNWIND_CFLAGS) $(DEBUG_CFLAGS) \
 | 
			
		||||
AM_CFLAGS = $(CWARNFLAGS) $(DRM_CFLAGS) $(PCIACCESS_CFLAGS) $(LIBUNWIND_CFLAGS) $(DEBUG_CFLAGS) \
 | 
			
		||||
	    -DIGT_SRCDIR=\""$(abs_top_srcdir)/tests"\" \
 | 
			
		||||
	    -DIGT_DATADIR=\""$(pkgdatadir)"\" \
 | 
			
		||||
	    -DIGT_LOG_DOMAIN=\""$(subst _,-,$*)"\" \
 | 
			
		||||
	    -pthread
 | 
			
		||||
 | 
			
		||||
LDADD = $(CAIRO_LIBS) $(LIBUNWIND_LIBS) $(TIMER_LIBS) -lm
 | 
			
		||||
AM_CFLAGS += $(CAIRO_CFLAGS)
 | 
			
		||||
 | 
			
		||||
libintel_tools_la_LIBADD = \
 | 
			
		||||
	$(DRM_LIBS) \
 | 
			
		||||
	$(PCIACCESS_LIBS) \
 | 
			
		||||
	$(CAIRO_LIBS) \
 | 
			
		||||
	$(LIBUDEV_LIBS) \
 | 
			
		||||
	$(LIBUNWIND_LIBS) \
 | 
			
		||||
	$(TIMER_LIBS) \
 | 
			
		||||
	-lm
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@ -42,6 +42,7 @@
 | 
			
		||||
#include <stdlib.h>
 | 
			
		||||
#include <time.h>
 | 
			
		||||
#include <unistd.h>
 | 
			
		||||
#include <sys/poll.h>
 | 
			
		||||
#include <sys/wait.h>
 | 
			
		||||
#include <sys/time.h>
 | 
			
		||||
#include <sys/types.h>
 | 
			
		||||
@ -359,6 +360,85 @@ void igt_stop_signal_helper(void)
 | 
			
		||||
	sig_stat = 0;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
#if HAVE_UDEV
 | 
			
		||||
#include <libudev.h>
 | 
			
		||||
 | 
			
		||||
static struct igt_helper_process hang_detector;
 | 
			
		||||
/*
 * hang_detector_process - main loop of the forked GPU hang detector helper.
 * @pid:  process to signal when an error is reported; the helper exits once
 *        this process no longer exists
 * @rdev: device number of the DRM node being watched
 *
 * Opens a udev monitor on the "kernel" netlink source, filtered to the "drm"
 * subsystem. For every received device whose devnum equals @rdev and whose
 * "ERROR" property parses as 1, SIGRTMAX is sent to @pid.
 *
 * The loop wakes up periodically even without any uevent so that the death
 * of @pid is noticed promptly instead of only when the next drm event
 * happens to arrive. Never returns; always ends in exit(0).
 */
static void __attribute__((noreturn))
hang_detector_process(pid_t pid, dev_t rdev)
{
	/* How often to wake up just to check that @pid still exists. */
	const int liveness_interval_ms = 2000;
	struct udev_monitor *mon =
		udev_monitor_new_from_netlink(udev_new(), "kernel");
	struct pollfd pfd;

	udev_monitor_filter_add_match_subsystem_devtype(mon, "drm", NULL);
	udev_monitor_enable_receiving(mon);

	pfd.fd = udev_monitor_get_fd(mon);
	pfd.events = POLLIN;
	pfd.revents = 0;

	/* poll() == 0 is a plain timeout; only a poll() failure ends the loop. */
	while (poll(&pfd, 1, liveness_interval_ms) >= 0) {
		struct udev_device *dev;
		dev_t devnum;

		if (kill(pid, 0)) /* Parent has died, so must we. */
			break;

		/* The monitor socket itself failed: no further events can arrive. */
		if (pfd.revents & (POLLERR | POLLHUP | POLLNVAL))
			break;

		/* Timeout: nothing to read, go back to waiting. */
		if (!(pfd.revents & POLLIN))
			continue;

		dev = udev_monitor_receive_device(mon);
		if (dev == NULL) /* Nothing usable received; keep listening. */
			continue;

		devnum = udev_device_get_devnum(dev);
		if (memcmp(&rdev, &devnum, sizeof(dev_t)) == 0) {
			const char *str;

			str = udev_device_get_property_value(dev, "ERROR");
			/* strtol() is well defined for out-of-range text, atoi() is not. */
			if (str && strtol(str, NULL, 10) == 1)
				kill(pid, SIGRTMAX);
		}

		udev_device_unref(dev);
	}

	exit(0);
}
 | 
			
		||||
 | 
			
		||||
/*
 * Signal handler installed for SIGRTMAX by igt_fork_hang_detector().
 * Trips an always-false assertion whose text names the cause.
 */
static void sig_abort(int sig)
{
	(void)sig; /* the handler reacts the same way whatever the number */

	igt_assert(!"GPU hung");
}
 | 
			
		||||
 | 
			
		||||
void igt_fork_hang_detector(int fd)
 | 
			
		||||
{
 | 
			
		||||
	struct stat st;
 | 
			
		||||
 | 
			
		||||
	if (igt_only_list_subtests())
 | 
			
		||||
		return;
 | 
			
		||||
 | 
			
		||||
	igt_assert(fstat(fd, &st) == 0);
 | 
			
		||||
 | 
			
		||||
	signal(SIGRTMAX, sig_abort);
 | 
			
		||||
	igt_fork_helper(&hang_detector)
 | 
			
		||||
		hang_detector_process(getppid(), st.st_rdev);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
void igt_stop_hang_detector(void)
 | 
			
		||||
{
 | 
			
		||||
	if (igt_only_list_subtests())
 | 
			
		||||
		return;
 | 
			
		||||
 | 
			
		||||
	igt_stop_helper(&hang_detector);
 | 
			
		||||
}
 | 
			
		||||
#else
 | 
			
		||||
/*
 * Variant built when HAVE_UDEV is not set: no helper is forked and @fd is
 * not used.
 */
void igt_fork_hang_detector(int fd)
{
	(void)fd;

	if (igt_only_list_subtests())
		return;
}
 | 
			
		||||
 | 
			
		||||
/*
 * Variant built when HAVE_UDEV is not set: no helper was started, so there
 * is nothing to stop.
 */
void igt_stop_hang_detector(void)
{
}
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
/**
 | 
			
		||||
 * igt_check_boolean_env_var:
 | 
			
		||||
 * @env_var: environment variable name
 | 
			
		||||
 | 
			
		||||
@ -40,6 +40,9 @@ extern int num_trash_bos;
 | 
			
		||||
void igt_fork_signal_helper(void);
 | 
			
		||||
void igt_stop_signal_helper(void);
 | 
			
		||||
 | 
			
		||||
void igt_fork_hang_detector(int fd);
 | 
			
		||||
void igt_stop_hang_detector(void);
 | 
			
		||||
 | 
			
		||||
struct igt_sigiter {
 | 
			
		||||
	unsigned pass;
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
@ -56,9 +56,8 @@ AM_CFLAGS = $(DRM_CFLAGS) $(CWARNFLAGS) $(DEBUG_CFLAGS)\
 | 
			
		||||
	$(LIBUNWIND_CFLAGS) \
 | 
			
		||||
	$(NULL)
 | 
			
		||||
 | 
			
		||||
LDADD = ../lib/libintel_tools.la $(PCIACCESS_LIBS) $(DRM_LIBS) $(LIBUNWIND_LIBS) $(TIMER_LIBS)
 | 
			
		||||
LDADD = ../lib/libintel_tools.la $(GLIB_LIBS)
 | 
			
		||||
 | 
			
		||||
LDADD += $(CAIRO_LIBS) $(LIBUDEV_LIBS) $(GLIB_LIBS) -lm
 | 
			
		||||
AM_CFLAGS += $(CAIRO_CFLAGS) $(LIBUDEV_CFLAGS) $(GLIB_CFLAGS)
 | 
			
		||||
AM_LDFLAGS = -Wl,--as-needed
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@ -368,6 +368,8 @@ igt_main
 | 
			
		||||
	igt_fixture
 | 
			
		||||
		fd = drm_open_driver_master(DRIVER_INTEL);
 | 
			
		||||
 | 
			
		||||
	igt_fork_hang_detector(fd);
 | 
			
		||||
 | 
			
		||||
	for (const struct mode *m = modes; m->name; m++)
 | 
			
		||||
		igt_subtest_f("%s", *m->name ? m->name : "basic")
 | 
			
		||||
			whisper(fd, -1, m->flags);
 | 
			
		||||
@ -382,6 +384,8 @@ igt_main
 | 
			
		||||
				whisper(fd, e->exec_id | e->flags, m->flags);
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	igt_stop_hang_detector();
 | 
			
		||||
 | 
			
		||||
	igt_fixture
 | 
			
		||||
		close(fd);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
@ -4,7 +4,7 @@ SUBDIRS = null_state_gen registers
 | 
			
		||||
 | 
			
		||||
AM_CPPFLAGS = -I$(top_srcdir) -I$(top_srcdir)/lib
 | 
			
		||||
AM_CFLAGS = $(DEBUG_CFLAGS) $(DRM_CFLAGS) $(PCIACCESS_CFLAGS) $(CWARNFLAGS) $(CAIRO_CFLAGS) $(LIBUNWIND_CFLAGS) -DPKGDATADIR=\"$(pkgdatadir)\"
 | 
			
		||||
LDADD = $(top_builddir)/lib/libintel_tools.la $(DRM_LIBS) $(PCIACCESS_LIBS) $(CAIRO_LIBS) $(LIBUDEV_LIBS) $(LIBUNWIND_LIBS) $(TIMER_LIBS) -lm
 | 
			
		||||
LDADD = $(top_builddir)/lib/libintel_tools.la
 | 
			
		||||
AM_LDFLAGS = -Wl,--as-needed
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user