From c6ed1e1af9356cdce1eaa652061dd6e4eb32d283 Mon Sep 17 00:00:00 2001
From: Junliang Li
Date: Thu, 13 Feb 2014 10:39:53 +0800
Subject: [PATCH 23/32] add abrt support for rasdaemon

Adds abrt as another error mechanism for the rasdaemon.
This patch does:
1) read ras event (mc,mce and aer)
2) setup a abrt-server unix socket
3) write messages following the ABRT server protocol, set event
   info into backtrace zone.
4) commit report.

For now, it depends on ABRT to limit flood reports.

Signed-off-by: Junliang Li
Signed-off-by: Mauro Carvalho Chehab
---
 Makefile.am       |    5 +-
 configure.ac      |    9 +
 ras-aer-handler.c |    6 +
 ras-events.h      |    3 +
 ras-mc-handler.c  |    7 +
 ras-mce-handler.c |    6 +
 ras-report.c      |  365 +++++++++++++++++++++++++++++++++++++++++++++++
 ras-report.h      |   39 +++++
 8 files changed, 439 insertions(+), 1 deletions(-)
 create mode 100644 ras-report.c
 create mode 100644 ras-report.h

diff --git a/Makefile.am b/Makefile.am
index 473ce98..c1668b4 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -17,10 +17,13 @@ if WITH_MCE
 			mce-intel-dunnington.c mce-intel-tulsa.c \
 			mce-intel-sb.c mce-intel-ivb.c
 endif
+if WITH_ABRT_REPORT
+   rasdaemon_SOURCES += ras-report.c
+endif
 rasdaemon_LDADD = -lpthread $(SQLITE3_LIBS) libtrace/libtrace.a
 
 include_HEADERS = config.h ras-events.h ras-logger.h ras-mc-handler.h \
-		  ras-aer-handler.h ras-mce-handler.h ras-record.h bitfield.h
+		  ras-aer-handler.h ras-mce-handler.h ras-record.h bitfield.h ras-report.h
 
 # This rule can't be called with more than one Makefile job (like make -j8)
 # I can't figure out a way to fix that
diff --git a/configure.ac b/configure.ac
index 4fe6ef2..0ea962e 100644
--- a/configure.ac
+++ b/configure.ac
@@ -53,6 +53,15 @@ AS_IF([test "x$enable_mce" = "xyes"], [
 ])
 AM_CONDITIONAL([WITH_MCE], [test x$enable_mce = xyes])
 
+AC_ARG_ENABLE([abrt_report],
+    AS_HELP_STRING([--enable-abrt-report], [enable report event to ABRT (currently experimental)]))
+
+AS_IF([test "x$enable_abrt_report" = "xyes"], [
+  AC_DEFINE(HAVE_ABRT_REPORT,1,"have report event to ABRT")
+  AC_SUBST([WITH_ABRT_REPORT])
+])
+AM_CONDITIONAL([WITH_ABRT_REPORT], [test x$enable_abrt_report = xyes])
+
 test "$sysconfdir" = '${prefix}/etc' && sysconfdir=/etc
 
 CFLAGS="$CFLAGS -Wall -Wmissing-prototypes -Wstrict-prototypes"
diff --git a/ras-aer-handler.c b/ras-aer-handler.c
index e5abaca..50526af 100644
--- a/ras-aer-handler.c
+++ b/ras-aer-handler.c
@@ -24,6 +24,7 @@
 #include "ras-record.h"
 #include "ras-logger.h"
 #include "bitfield.h"
+#include "ras-report.h"
 
 static const char *aer_errors[32] = {
 	/* Correctable errors */
@@ -115,5 +116,10 @@ int ras_aer_event_handler(struct trace_seq *s,
 	ras_store_aer_event(ras, &ev);
 #endif
 
+#ifdef HAVE_ABRT_REPORT
+	/* Report event to ABRT */
+	ras_report_aer_event(ras, &ev);
+#endif
+
 	return 0;
 }
diff --git a/ras-events.h b/ras-events.h
index 554a95e..64e045a 100644
--- a/ras-events.h
+++ b/ras-events.h
@@ -47,6 +47,9 @@ struct ras_events {
 
 	/* For the mce handler */
 	struct mce_priv	*mce_priv;
+
+	/* For ABRT socket*/
+	int socketfd;
 };
 
 struct pthread_data {
diff --git a/ras-mc-handler.c b/ras-mc-handler.c
index 5c24f65..ffb3805 100644
--- a/ras-mc-handler.c
+++ b/ras-mc-handler.c
@@ -23,6 +23,7 @@
 #include "ras-mc-handler.h"
 #include "ras-record.h"
 #include "ras-logger.h"
+#include "ras-report.h"
 
 int ras_mc_event_handler(struct trace_seq *s,
 			 struct pevent_record *record,
@@ -189,6 +190,12 @@ int ras_mc_event_handler(struct trace_seq *s,
 
 	/* Insert data into the SGBD */
 	ras_store_mc_event(ras, &ev);
+
+#ifdef HAVE_ABRT_REPORT
+	/* Report event to ABRT */
+	ras_report_mc_event(ras, &ev);
+#endif
+
 	return 0;
 
 parse_error:
diff --git a/ras-mce-handler.c b/ras-mce-handler.c
index 59e8d05..1431049 100644
--- a/ras-mce-handler.c
+++ b/ras-mce-handler.c
@@ -26,6 +26,7 @@
 #include "ras-mce-handler.h"
 #include "ras-record.h"
 #include "ras-logger.h"
+#include "ras-report.h"
 
 /*
  * The code below were adapted from Andi Kleen/Intel/SuSe mcelog code,
@@ -401,5 +402,10 @@ int ras_mce_event_handler(struct trace_seq *s,
 	ras_store_mce_record(ras, &e);
 #endif
 
+#ifdef HAVE_ABRT_REPORT
+	/* Report event to ABRT */
+	ras_report_mce_event(ras, &e);
+#endif
+
 	return 0;
 }
diff --git a/ras-report.c b/ras-report.c
new file mode 100644
index 0000000..d3e4a79
--- /dev/null
+++ b/ras-report.c
@@ -0,0 +1,365 @@
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <sys/types.h>
+#include <sys/utsname.h>
+#include <sys/socket.h>
+#include <sys/un.h>
+
+#include "ras-report.h"
+
+/*
+ * Write exactly len bytes to sockfd, retrying after short writes and EINTR.
+ * Returns 0 once everything is written, -1 on any other write failure.
+ */
+static int write_all(int sockfd, const char *data, size_t len){
+	ssize_t rc;
+
+	while(len > 0){
+		rc = write(sockfd, data, len);
+		if(rc < 0){
+			if(errno == EINTR){
+				continue;
+			}
+			return -1;
+		}
+		if(rc == 0){
+			return -1;
+		}
+
+		data += rc;
+		len -= (size_t)rc;
+	}
+
+	return 0;
+}
+
+/*
+ * Send one protocol item: the string followed by its NUL terminator.
+ * Returns 0 on success, -1 on failure.
+ */
+static int write_item(int sockfd, const char *item){
+	return write_all(sockfd, item, strlen(item) + 1);
+}
+
+/*
+ * Open a stream socket connected to ABRT_SOCKET.
+ * Returns the connected descriptor, or -1 on failure.
+ */
+static int setup_report_socket(void){
+	int sockfd = -1;
+	int rc = -1;
+	struct sockaddr_un addr;
+
+	sockfd = socket(AF_UNIX, SOCK_STREAM, 0);
+	if(sockfd < 0){
+		return -1;
+	}
+
+	memset(&addr, 0, sizeof(struct sockaddr_un));
+	addr.sun_family = AF_UNIX;
+	/* addr is zeroed above, so a copy of at most size - 1 bytes stays terminated */
+	strncpy(addr.sun_path, ABRT_SOCKET, sizeof(addr.sun_path) - 1);
+
+	rc = connect(sockfd, (struct sockaddr *)&addr, sizeof(struct sockaddr_un));
+	if(rc < 0){
+		/* do not leak the descriptor when the connection fails */
+		close(sockfd);
+		return -1;
+	}
+
+	return sockfd;
+}
+
+/*
+ * Send the part common to every report: the request line followed by the
+ * PID, EXECUTABLE and BASENAME items.
+ * Returns 0 on success, -1 on failure.
+ */
+static int commit_report_basic(int sockfd){
+	char buf[INPUT_BUFFER_SIZE];
+	struct utsname un;
+
+	if(sockfd < 0){
+		return -1;
+	}
+
+	memset(&un, 0, sizeof(struct utsname));
+	if(uname(&un) < 0){
+		return -1;
+	}
+
+	/*
+	 * ABRT server protocol
+	 */
+	if(write_item(sockfd, "PUT / HTTP/1.1\r\n\r\n") < 0){
+		return -1;
+	}
+
+	snprintf(buf, sizeof(buf), "PID=%d", (int)getpid());
+	if(write_item(sockfd, buf) < 0){
+		return -1;
+	}
+
+	snprintf(buf, sizeof(buf), "EXECUTABLE=/boot/vmlinuz-%s", un.release);
+	if(write_item(sockfd, buf) < 0){
+		return -1;
+	}
+
+	if(write_item(sockfd, "BASENAME=rasdaemon") < 0){
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * add "DONE" string to finish message.
+ */
+static int commit_report_done(int sockfd){
+	static const char done[] = "DONE";
+
+	if(sockfd < 0){
+		return -1;
+	}
+
+	/*
+	 * NOTE(review): only the four characters are sent, without the NUL
+	 * terminator the other items carry - confirm against the ABRT server.
+	 */
+	return write_all(sockfd, done, strlen(done));
+}
+
+/*
+ * Format a memory controller event as the BACKTRACE item in buf.
+ * Returns 0 on success, -1 on bad arguments or if size is too small.
+ */
+static int set_mc_event_backtrace(char *buf, size_t size, struct ras_mc_event *ev){
+	int len;
+
+	if(!buf || !ev)
+		return -1;
+
+	/* index and layer fields are printed as numbers, not as raw characters */
+	len = snprintf(buf, size, "BACKTRACE= "
+			"timestamp=%s\n"
+			"error_count=%d\n"
+			"error_type=%s\n"
+			"msg=%s\n"
+			"label=%s\n"
+			"mc_index=%d\n"
+			"top_layer=%d\n"
+			"middle_layer=%d\n"
+			"lower_layer=%d\n"
+			"address=%llu\n"
+			"grain=%llu\n"
+			"syndrome=%llu\n"
+			"driver_detail=%s\n",
+			ev->timestamp,
+			ev->error_count,
+			ev->error_type,
+			ev->msg,
+			ev->label,
+			ev->mc_index,
+			ev->top_layer,
+			ev->middle_layer,
+			ev->lower_layer,
+			ev->address,
+			ev->grain,
+			ev->syndrome,
+			ev->driver_detail);
+	if(len < 0 || (size_t)len >= size)
+		return -1;
+
+	return 0;
+}
+
+/*
+ * Format a machine check event as the BACKTRACE item in buf.
+ * Returns 0 on success, -1 on bad arguments or if size is too small.
+ */
+static int set_mce_event_backtrace(char *buf, size_t size, struct mce_event *ev){
+	int len;
+
+	if(!buf || !ev)
+		return -1;
+
+	len = snprintf(buf, size, "BACKTRACE="
+			"timestamp=%s\n"
+			"bank_name=%s\n"
+			"error_msg=%s\n"
+			"mcgstatus_msg=%s\n"
+			"mcistatus_msg=%s\n"
+			"mcastatus_msg=%s\n"
+			"user_action=%s\n"
+			"mc_location=%s\n"
+			"mcgcap=%lu\n"
+			"mcgstatus=%lu\n"
+			"status=%lu\n"
+			"addr=%lu\n"
+			"misc=%lu\n"
+			"ip=%lu\n"
+			"tsc=%lu\n"
+			"walltime=%lu\n"
+			"cpu=%u\n"
+			"cpuid=%u\n"
+			"apicid=%u\n"
+			"socketid=%u\n"
+			"cs=%d\n"
+			"bank=%d\n"
+			"cpuvendor=%d\n",
+			ev->timestamp,
+			ev->bank_name,
+			ev->error_msg,
+			ev->mcgstatus_msg,
+			ev->mcistatus_msg,
+			ev->mcastatus_msg,
+			ev->user_action,
+			ev->mc_location,
+			ev->mcgcap,
+			ev->mcgstatus,
+			ev->status,
+			ev->addr,
+			ev->misc,
+			ev->ip,
+			ev->tsc,
+			ev->walltime,
+			ev->cpu,
+			ev->cpuid,
+			ev->apicid,
+			ev->socketid,
+			ev->cs,
+			ev->bank,
+			ev->cpuvendor);
+	if(len < 0 || (size_t)len >= size)
+		return -1;
+
+	return 0;
+}
+
+/*
+ * Format a PCIe AER event as the BACKTRACE item in buf.
+ * Returns 0 on success, -1 on bad arguments or if size is too small.
+ */
+static int set_aer_event_backtrace(char *buf, size_t size, struct ras_aer_event *ev){
+	int len;
+
+	if(!buf || !ev)
+		return -1;
+
+	len = snprintf(buf, size, "BACKTRACE="
+			"timestamp=%s\n"
+			"error_type=%s\n"
+			"dev_name=%s\n"
+			"msg=%s\n",
+			ev->timestamp,
+			ev->error_type,
+			ev->dev_name,
+			ev->msg);
+	if(len < 0 || (size_t)len >= size)
+		return -1;
+
+	return 0;
+}
+
+/*
+ * Build the BACKTRACE item for an event of the given type and send it,
+ * NUL terminator included.
+ * Returns 0 on success, -1 on failure.
+ */
+static int commit_report_backtrace(int sockfd, int type, void *ev){
+	char *buf;
+	int rc = -1;
+
+	if(sockfd < 0 || !ev){
+		return -1;
+	}
+
+	/* MAX_BACKTRACE_SIZE is too big for the stack, use the heap */
+	buf = malloc(MAX_BACKTRACE_SIZE);
+	if(!buf){
+		return -1;
+	}
+
+	switch(type){
+	case MC_EVENT:
+		rc = set_mc_event_backtrace(buf, MAX_BACKTRACE_SIZE, (struct ras_mc_event *)ev);
+		break;
+	case AER_EVENT:
+		rc = set_aer_event_backtrace(buf, MAX_BACKTRACE_SIZE, (struct ras_aer_event *)ev);
+		break;
+	case MCE_EVENT:
+		rc = set_mce_event_backtrace(buf, MAX_BACKTRACE_SIZE, (struct mce_event *)ev);
+		break;
+	default:
+		rc = -1;
+		break;
+	}
+
+	if(rc == 0){
+		rc = write_item(sockfd, buf);
+	}
+
+	free(buf);
+
+	return rc;
+}
+
+/*
+ * Send one complete report over a fresh connection: the common items,
+ * BACKTRACE, the given ANALYZER and REASON items, then "DONE".
+ * Returns 0 on success, -1 on failure.
+ */
+static int report_event(int type, void *ev, const char *analyzer, const char *reason){
+	int sockfd;
+	int rc = -1;
+
+	sockfd = setup_report_socket();
+	if(sockfd < 0){
+		return -1;
+	}
+
+	if(commit_report_basic(sockfd) < 0){
+		goto out;
+	}
+
+	if(commit_report_backtrace(sockfd, type, ev) < 0){
+		goto out;
+	}
+
+	if(write_item(sockfd, analyzer) < 0){
+		goto out;
+	}
+
+	if(write_item(sockfd, reason) < 0){
+		goto out;
+	}
+
+	rc = commit_report_done(sockfd);
+
+out:
+	/* every descriptor from setup_report_socket(), 0 included, must be closed */
+	close(sockfd);
+
+	return rc;
+}
+
+/* Report a memory controller event to ABRT. Returns 0 on success, -1 on failure. */
+int ras_report_mc_event(struct ras_events *ras, struct ras_mc_event *ev){
+	return report_event(MC_EVENT, ev, "ANALYZER=rasdaemon-mc",
+			    "REASON=EDAC driver report problem");
+}
+
+/* Report a PCIe AER event to ABRT. Returns 0 on success, -1 on failure. */
+int ras_report_aer_event(struct ras_events *ras, struct ras_aer_event *ev){
+	return report_event(AER_EVENT, ev, "ANALYZER=rasdaemon-aer",
+			    "REASON=PCIe AER driver report problem");
+}
+
+/* Report a machine check event to ABRT. Returns 0 on success, -1 on failure. */
+int ras_report_mce_event(struct ras_events *ras, struct mce_event *ev){
+	return report_event(MCE_EVENT, ev, "ANALYZER=rasdaemon-mce",
+			    "REASON=Machine Check driver report problem");
+}
diff --git a/ras-report.h b/ras-report.h
new file mode 100644
index 0000000..7920cdf
--- /dev/null
+++ b/ras-report.h
@@ -0,0 +1,39 @@
+#ifndef __RAS_REPORT_H
+#define __RAS_REPORT_H
+
+#include "ras-record.h"
+#include "ras-events.h"
+#include "ras-mc-handler.h"
+#include "ras-mce-handler.h"
+#include "ras-aer-handler.h"
+
+/* Maximal length of backtrace. */
+#define MAX_BACKTRACE_SIZE (1024*1024)
+/* Amount of data received from one client for a message before reporting error. */
+#define MAX_MESSAGE_SIZE (4*MAX_BACKTRACE_SIZE)
+/* Maximal number of characters read from socket at once. */
+#define INPUT_BUFFER_SIZE (8*1024)
+/* ABRT socket file */
+#define ABRT_SOCKET "/var/run/abrt/abrt.socket"
+
+enum {
+	MC_EVENT,
+	MCE_EVENT,
+	AER_EVENT
+};
+
+#ifdef HAVE_ABRT_REPORT
+
+int ras_report_mc_event(struct ras_events *ras, struct ras_mc_event *ev);
+int ras_report_aer_event(struct ras_events *ras, struct ras_aer_event *ev);
+int ras_report_mce_event(struct ras_events *ras, struct mce_event *ev);
+
+#else
+
+static inline int ras_report_mc_event(struct ras_events *ras, struct ras_mc_event *ev) { return 0; };
+static inline int ras_report_aer_event(struct ras_events *ras, struct ras_aer_event *ev) { return 0; };
+static inline int ras_report_mce_event(struct ras_events *ras, struct mce_event *ev) { return 0; };
+
+#endif
+
+#endif
-- 
1.7.1