intel_l3_parity: Support a daemonic mode

v2: Add a comment explaining the dangers of directly accessing the DFT
register (Daniel)

Signed-off-by: Ben Widawsky <ben@bwidawsk.net>
This commit is contained in:
Ben Widawsky 2013-09-10 14:21:23 -07:00
parent bfa7a5906d
commit 799aeb6d00
4 changed files with 186 additions and 5 deletions

View File

@ -39,7 +39,7 @@ dist_bin_SCRIPTS = intel_gpu_abrt
AM_CPPFLAGS = -I$(top_srcdir) -I$(top_srcdir)/lib
AM_CFLAGS = $(DRM_CFLAGS) $(PCIACCESS_CFLAGS) $(CWARNFLAGS) $(CAIRO_CFLAGS)
LDADD = $(top_builddir)/lib/libintel_tools.la $(DRM_LIBS) $(PCIACCESS_LIBS) $(CAIRO_LIBS)
LDADD = $(top_builddir)/lib/libintel_tools.la $(DRM_LIBS) $(PCIACCESS_LIBS) $(CAIRO_LIBS) $(LIBUDEV_LIBS)
intel_dump_decode_SOURCES = \
intel_dump_decode.c
@ -50,3 +50,7 @@ intel_error_decode_SOURCES = \
intel_bios_reader_SOURCES = \
intel_bios_reader.c \
intel_bios.h
intel_l3_parity_SOURCES = \
intel_l3_parity.c \
intel_l3_udev_listener.c

View File

@ -37,6 +37,14 @@
#include "intel_chipset.h"
#include "intel_gpu_tools.h"
#include "drmtest.h"
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif
#if HAVE_UDEV
#include <libudev.h>
#include <syslog.h>
#endif
#include "intel_l3_parity.h"
static unsigned int devid;
/* L3 size is always a function of banks. The number of banks cannot be
@ -157,7 +165,8 @@ static void usage(const char *name)
" -r, --row=[row] The row to act upon (default 0)\n"
" -b, --bank=[bank] The bank to act upon (default 0)\n"
" -s, --subbank=[subbank] The subbank to act upon (default 0)\n"
" -w, --slice=[slice] Which slice to act on (default: -1 [all])"
" -w, --slice=[slice] Which slice to act on (default: -1 [all])\n"
" , --daemon Run the listener (-L) as a daemon\n"
" ACTIONS (only 1 may be specified at a time):\n"
" -h, --help Display this help\n"
" -H, --hw-info Display the current L3 properties\n"
@ -166,7 +175,8 @@ static void usage(const char *name)
" -e, --enable Enable row, bank, subbank (undo -d)\n"
" -d, --disable=<row,bank,subbank> Disable row, bank, subbank (inline arguments are deprecated. Please use -r, -b, -s instead\n"
" -i, --inject [HSW only] Cause hardware to inject a row errors\n"
" -u, --uninject [HSW only] Turn off hardware error injectection (undo -i)\n",
" -u, --uninject [HSW only] Turn off hardware error injectection (undo -i)\n"
" -L, --listen Listen for uevent errors\n",
name);
}
@ -179,6 +189,7 @@ int main(int argc, char *argv[])
int fd[REAL_MAX_SLICES] = {0}, ret, i;
int action = '0';
int drm_fd = drm_open_any();
int daemonize = 0;
devid = intel_get_drm_devid(drm_fd);
if (intel_gen(devid) < 7 || IS_VALLEYVIEW(devid))
@ -202,11 +213,18 @@ int main(int argc, char *argv[])
assert(lseek(fd[i], 0, SEEK_SET) == 0);
}
/* NB: It is potentially unsafe to read this register if the kernel is
* actively using this register range, or we're running multiple
* instances of this tool. Since neither of those cases should occur
* (and the tool should be root only) we can safely ignore this for
* now. Just be aware of this if for some reason a hang is reported
* when using this tool.
*/
dft = intel_register_read(0xb038);
while (1) {
int c, option_index = 0;
static struct option long_options[] = {
struct option long_options[] = {
{ "help", no_argument, 0, 'h' },
{ "list", no_argument, 0, 'l' },
{ "clear-all", no_argument, 0, 'a' },
@ -215,18 +233,23 @@ int main(int argc, char *argv[])
{ "inject", no_argument, 0, 'i' },
{ "uninject", no_argument, 0, 'u' },
{ "hw-info", no_argument, 0, 'H' },
{ "listen", no_argument, 0, 'L' },
{ "row", required_argument, 0, 'r' },
{ "bank", required_argument, 0, 'b' },
{ "subbank", required_argument, 0, 's' },
{ "slice", required_argument, 0, 'w' },
{ "daemon", no_argument, &daemonize, 1 },
{0, 0, 0, 0}
};
c = getopt_long(argc, argv, "hHr:b:s:w:aled::iu", long_options,
c = getopt_long(argc, argv, "hHr:b:s:w:aled::iuL", long_options,
&option_index);
if (c == -1)
break;
if (c == 0)
continue;
switch (c) {
case '?':
case 'h':
@ -274,6 +297,7 @@ int main(int argc, char *argv[])
case 'a':
case 'l':
case 'e':
case 'L':
if (action != '0') {
fprintf(stderr, "Only one action may be specified\n");
exit(EXIT_FAILURE);
@ -299,6 +323,20 @@ int main(int argc, char *argv[])
printf("warning: overwriting existing injections. This is very dangerous.\n");
}
/* Daemon doesn't work like the other commands */
if (action == 'L') {
struct l3_parity par;
struct l3_location loc;
if (daemonize) {
assert(daemon(0, 0) == 0);
openlog(argv[0], LOG_CONS | LOG_PID, LOG_USER);
}
memset(&par, 0, sizeof(par));
assert(l3_uevent_setup(&par) == 0);
assert(l3_listen(&par, daemonize == 1, &loc) == 0);
exit(EXIT_SUCCESS);
}
if (action == 'l')
decode_dft(dft);

31
tools/intel_l3_parity.h Normal file
View File

@ -0,0 +1,31 @@
#ifndef INTEL_L3_PARITY_H_
#define INTEL_L3_PARITY_H_
#include <stdbool.h>
#include <stdint.h>
#include <sys/select.h>
/*
 * udev listener state, filled in by l3_uevent_setup() and consumed by
 * l3_listen().
 */
struct l3_parity {
	struct udev *udev;			/* udev context owning the monitor */
	struct udev_monitor *uevent_monitor;	/* monitor receiving drm uevents */
	int fd;					/* file descriptor of uevent_monitor */
	fd_set fdset;				/* select() read set holding fd */
};

/* Location of an L3 parity error, as reported in the uevent properties. */
struct l3_location {
	uint8_t slice;
	uint16_t row;
	uint8_t bank;
	uint8_t subbank;
};

#if HAVE_UDEV
/*
 * Sets up a udev monitor for drm uevents and stores it in par.
 * Returns 0 on success, a negative value on failure.
 */
int l3_uevent_setup(struct l3_parity *par);
/*
 * Listens (blocks) for an L3 parity event and stores the location of the
 * error in loc. Returns 0 once an event has been reported, or a value <= 0
 * if waiting for / receiving the event failed. When daemon is true, events
 * are logged through syslog and the call keeps listening instead of returning
 * after the first event.
 */
int l3_listen(struct l3_parity *par, bool daemon, struct l3_location *loc);
#else
/*
 * Without udev support both entry points fail unconditionally. The macro
 * arities must mirror the prototypes above so that callers compile in either
 * configuration.
 */
#define l3_uevent_setup(par) (-1)
#define l3_listen(par, daemon, loc) (-1)
#endif
/* Nothing is torn down at the moment; available in both configurations. */
#define l3_uevent_teardown(par) {}
#endif

View File

@ -0,0 +1,108 @@
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif
#if HAVE_UDEV
#include <libudev.h>
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <assert.h>
#include <syslog.h>
#include "i915_drm.h"
#include "intel_l3_parity.h"
#ifndef I915_L3_PARITY_UEVENT
#define I915_L3_PARITY_UEVENT "L3_PARITY_ERROR"
#endif
/*
 * Creates a udev monitor on the "udev" netlink source, restricted to the
 * "drm" subsystem / "drm_minor" devtype, enables it and stores the resulting
 * handles in par (udev context, monitor, monitor fd and a read fd_set that
 * contains that fd).
 *
 * Returns 0 on success. On failure a negative value is returned, par is left
 * untouched and every udev reference taken here is dropped again.
 */
int l3_uevent_setup(struct l3_parity *par)
{
	struct udev *udev;
	struct udev_monitor *uevent_monitor;
	int fd, ret = -1;

	udev = udev_new();
	if (!udev)
		return -1;

	uevent_monitor = udev_monitor_new_from_netlink(udev, "udev");
	if (!uevent_monitor)
		goto err_udev;

	ret = udev_monitor_filter_add_match_subsystem_devtype(uevent_monitor, "drm", "drm_minor");
	if (ret < 0)
		goto err_monitor;

	ret = udev_monitor_enable_receiving(uevent_monitor);
	if (ret < 0)
		goto err_monitor;

	fd = udev_monitor_get_fd(uevent_monitor);
	/* FD_SET() is undefined for descriptors outside [0, FD_SETSIZE). */
	if (fd < 0 || fd >= FD_SETSIZE) {
		ret = -1;
		goto err_monitor;
	}

	FD_ZERO(&par->fdset);
	FD_SET(fd, &par->fdset);
	par->udev = udev;
	par->fd = fd;
	par->uevent_monitor = uevent_monitor;

	return 0;

err_monitor:
	/* The monitor holds its own reference; release it before the context. */
	udev_monitor_unref(uevent_monitor);
err_udev:
	udev_unref(udev);
	return ret;
}
int l3_listen(struct l3_parity *par, bool daemon, struct l3_location *loc)
{
struct udev_device *udev_dev;
const char *parity_status;
char *err_msg;
int ret;
again:
ret = select(par->fd + 1, &par->fdset, NULL, NULL, NULL);
/* Number of bits set is returned, must be >= 1 */
if (ret <= 0) {
return ret;
}
assert(FD_ISSET(par->fd, &par->fdset));
udev_dev = udev_monitor_receive_device(par->uevent_monitor);
if (!udev_dev)
return -1;
parity_status = udev_device_get_property_value(udev_dev, I915_L3_PARITY_UEVENT);
if (strncmp(parity_status, "1", 1))
goto again;
loc->slice = atoi(udev_device_get_property_value(udev_dev, "SLICE"));
loc->row = atoi(udev_device_get_property_value(udev_dev, "ROW"));
loc->bank = atoi(udev_device_get_property_value(udev_dev, "BANK"));
loc->subbank = atoi(udev_device_get_property_value(udev_dev, "SUBBANK"));
udev_device_unref(udev_dev);
asprintf(&err_msg, "Parity error detected on: %d,%d,%d,%d. "
"Try to run intel_l3_parity -r %d -b %d -s %d -w %d -d",
loc->slice, loc->row, loc->bank, loc->subbank,
loc->row, loc->bank, loc->subbank, loc->slice);
if (daemon) {
syslog(LOG_INFO, "%s\n", err_msg);
goto again;
}
fprintf(stderr, "%s\n", err_msg);
free(err_msg);
return 0;
}
#endif